extern int atomic_cmpset_int(volatile u_int *_dst, u_int _old, u_int _new);
extern long atomic_cmpset_long(volatile u_long *_dst, u_long _exp, u_long _src);
extern u_int atomic_fetchadd_int(volatile u_int *_p, u_int _v);
+extern u_long atomic_fetchadd_long(volatile u_long *_p, u_long _v);
#else
return (_v);
}
+/*
+ * Atomically add _v to *_p and return the value *_p held before the
+ * addition (xadd exchanges the operands before adding, so the register
+ * operand ends up holding the old memory contents).
+ *
+ * This variant uses the 32-bit xaddl form; presumably it is the build
+ * where u_long is 32 bits wide -- confirm against the enclosing header.
+ * MPLOCKED is defined elsewhere; presumably it supplies the lock prefix.
+ */
+static __inline u_long
+atomic_fetchadd_long(volatile u_long *_p, u_long _v)
+{
+ __asm __volatile(MPLOCKED "xaddl %0,%1; " \
+ : "+r" (_v), "=m" (*_p) \
+ : "m" (*_p) \
+ : "memory");
+ return (_v);
+}
+
#endif /* KLD_MODULE */
#if defined(KLD_MODULE)
*
* We now have to use a locked bus cycle due to LWKT_RESCHED/WAKEUP
* signalling by other cpus.
- *
- * NOTE: need_lwkt_resched() sets RQF_WAKEUP but clear_lwkt_resched() does
- * not clear it. Only the scheduler will clear RQF_WAKEUP.
*/
#define need_lwkt_resched() \
- atomic_set_int(&mycpu->gd_reqflags, RQF_AST_LWKT_RESCHED | RQF_WAKEUP)
+ atomic_set_int(&mycpu->gd_reqflags, RQF_AST_LWKT_RESCHED)
#define need_user_resched() \
atomic_set_int(&mycpu->gd_reqflags, RQF_AST_USER_RESCHED)
#define need_proftick() \
extern int atomic_cmpset_int(volatile u_int *_dst, u_int _old, u_int _new);
extern long atomic_cmpset_long(volatile u_long *_dst, u_long _exp, u_long _src);
extern u_int atomic_fetchadd_int(volatile u_int *_p, u_int _v);
+extern u_long atomic_fetchadd_long(volatile u_long *_p, u_long _v);
#else
return (_v);
}
+/*
+ * Atomically add _v to *_p and return the value *_p held before the
+ * addition (xadd exchanges the operands before adding, so the register
+ * operand ends up holding the old memory contents).
+ *
+ * This variant uses the 64-bit xaddq form; presumably it is the build
+ * where u_long is 64 bits wide -- confirm against the enclosing header.
+ * MPLOCKED is defined elsewhere; presumably it supplies the lock prefix.
+ */
+static __inline u_long
+atomic_fetchadd_long(volatile u_long *_p, u_long _v)
+{
+ __asm __volatile(MPLOCKED "xaddq %0,%1; " \
+ : "+r" (_v), "=m" (*_p) \
+ : "m" (*_p) \
+ : "memory");
+ return (_v);
+}
+
#endif /* KLD_MODULE */
#if defined(KLD_MODULE)
* We do not have to use a locked bus cycle but we do have to use an
* atomic instruction because an interrupt on the local cpu can modify
* the gd_reqflags field.
- *
- * NOTE: need_lwkt_resched() sets RQF_WAKEUP but clear_lwkt_resched() does
- * not clear it. Only the scheduler will clear RQF_WAKEUP.
*/
#define need_lwkt_resched() \
- atomic_set_int(&mycpu->gd_reqflags, RQF_AST_LWKT_RESCHED | RQF_WAKEUP)
+ atomic_set_int(&mycpu->gd_reqflags, RQF_AST_LWKT_RESCHED)
#define need_user_resched() \
atomic_set_int(&mycpu->gd_reqflags, RQF_AST_USER_RESCHED)
#define need_proftick() \
vm_page_wakeup(m);
for (k = 0; k < i + j; k += AGP_PAGE_SIZE)
AGP_UNBIND_PAGE(dev, offset + k);
- lwkt_gettoken(&vm_token);
+ vm_object_hold(mem->am_obj);
for (k = 0; k <= i; k += PAGE_SIZE) {
- m = vm_page_lookup(mem->am_obj,
- OFF_TO_IDX(k));
+ m = vm_page_lookup_busy_wait(
+ mem->am_obj, OFF_TO_IDX(k),
+ FALSE, "agppg");
vm_page_unwire(m, 0);
+ vm_page_wakeup(m);
}
- lwkt_reltoken(&vm_token);
+ vm_object_drop(mem->am_obj);
lockmgr(&sc->as_lock, LK_RELEASE);
return error;
}
*/
for (i = 0; i < mem->am_size; i += AGP_PAGE_SIZE)
AGP_UNBIND_PAGE(dev, mem->am_offset + i);
- lwkt_gettoken(&vm_token);
+ vm_object_hold(mem->am_obj);
for (i = 0; i < mem->am_size; i += PAGE_SIZE) {
- m = vm_page_lookup(mem->am_obj, atop(i));
+ m = vm_page_lookup_busy_wait(mem->am_obj, atop(i),
+ FALSE, "agppg");
vm_page_unwire(m, 0);
+ vm_page_wakeup(m);
}
- lwkt_reltoken(&vm_token);
+ vm_object_drop(mem->am_obj);
agp_flush_cache();
AGP_FLUSH_TLB(dev);
* Unwire the page which we wired in alloc_memory.
*/
vm_page_t m;
- lwkt_gettoken(&vm_token);
- m = vm_page_lookup(mem->am_obj, 0);
+
+ vm_object_hold(mem->am_obj);
+ m = vm_page_lookup_busy_wait(mem->am_obj, 0,
+ FALSE, "agppg");
+ vm_object_drop(mem->am_obj);
vm_page_unwire(m, 0);
- lwkt_reltoken(&vm_token);
+ vm_page_wakeup(m);
} else {
contigfree(sc->argb_cursor, mem->am_size, M_AGP);
sc->argb_cursor = NULL;
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/spinlock.h>
-#include <sys/spinlock2.h>
+#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <bus/pci/pcivar.h>
#define PCI_COMMAND_REGISTER PCIR_COMMAND
/* Mutex used in the shared code */
-#define E1000_MUTEX struct spinlock
-#define E1000_MUTEX_INIT(spin) spin_init(spin)
-#define E1000_MUTEX_DESTROY(spin) spin_uninit(spin)
-#define E1000_MUTEX_LOCK(spin) spin_lock(spin)
-#define E1000_MUTEX_TRYLOCK(spin) spin_trylock(spin)
-#define E1000_MUTEX_UNLOCK(spin) spin_unlock(spin)
+#define E1000_MUTEX struct lock
+#define E1000_MUTEX_INIT(spin) lockinit(spin, "emtx", 0, 0)
+#define E1000_MUTEX_DESTROY(spin) lockuninit(spin)
+#define E1000_MUTEX_LOCK(spin) lockmgr(spin, LK_EXCLUSIVE)
+#define E1000_MUTEX_TRYLOCK(spin) (lockmgr(spin, LK_EXCLUSIVE | LK_NOWAIT) == 0)
+#define E1000_MUTEX_UNLOCK(spin) lockmgr(spin, LK_RELEASE)
typedef uint64_t u64;
typedef uint32_t u32;
int if_flags;
int max_frame_size;
int min_frame_size;
- struct spinlock core_spin;
- struct spinlock tx_spin;
- struct spinlock rx_spin;
+ struct lock core_spin;
+ struct lock tx_spin;
+ struct lock rx_spin;
int em_insert_vlan_header;
/* Task for FAST handling */
} DESC_ARRAY, *PDESC_ARRAY;
+/*
+ * Core/TX/RX lock wrappers for the softc, built on lockinit()/lockmgr().
+ * The *_LOCK macros request the lock with LK_EXCLUSIVE and the *_UNLOCK
+ * macros release it with LK_RELEASE.  EM_TX_TRYLOCK adds LK_NOWAIT and
+ * evaluates nonzero only when lockmgr() returns 0.  The _name argument of
+ * the *_LOCK_INIT macros is accepted but not used by the expansion.
+ */
#define EM_CORE_LOCK_INIT(_sc, _name) \
- spin_init(&(_sc)->core_spin)
+ lockinit(&(_sc)->core_spin, "emcore", 0, 0)
#define EM_TX_LOCK_INIT(_sc, _name) \
- spin_init(&(_sc)->tx_spin)
+ lockinit(&(_sc)->tx_spin, "emtx", 0, 0)
#define EM_RX_LOCK_INIT(_sc, _name) \
- spin_init(&(_sc)->rx_spin)
-#define EM_CORE_LOCK_DESTROY(_sc) spin_uninit(&(_sc)->core_spin)
-#define EM_TX_LOCK_DESTROY(_sc) spin_uninit(&(_sc)->tx_spin)
-#define EM_RX_LOCK_DESTROY(_sc) spin_uninit(&(_sc)->rx_spin)
-#define EM_CORE_LOCK(_sc) spin_lock(&(_sc)->core_spin)
-#define EM_TX_LOCK(_sc) spin_lock(&(_sc)->tx_spin)
-#define EM_TX_TRYLOCK(_sc) spin_trylock(&(_sc)->tx_spin)
-#define EM_RX_LOCK(_sc) spin_lock(&(_sc)->rx_spin)
-#define EM_CORE_UNLOCK(_sc) spin_unlock(&(_sc)->core_spin)
-#define EM_TX_UNLOCK(_sc) spin_unlock(&(_sc)->tx_spin)
-#define EM_RX_UNLOCK(_sc) spin_unlock(&(_sc)->rx_spin)
+ lockinit(&(_sc)->rx_spin, "emrx", 0, 0)
+#define EM_CORE_LOCK_DESTROY(_sc) lockuninit(&(_sc)->core_spin)
+#define EM_TX_LOCK_DESTROY(_sc) lockuninit(&(_sc)->tx_spin)
+#define EM_RX_LOCK_DESTROY(_sc) lockuninit(&(_sc)->rx_spin)
+#define EM_CORE_LOCK(_sc) lockmgr(&(_sc)->core_spin, LK_EXCLUSIVE)
+#define EM_TX_LOCK(_sc) lockmgr(&(_sc)->tx_spin, LK_EXCLUSIVE)
+#define EM_TX_TRYLOCK(_sc) (lockmgr(&(_sc)->tx_spin, LK_EXCLUSIVE | LK_NOWAIT) == 0)
+#define EM_RX_LOCK(_sc) lockmgr(&(_sc)->rx_spin, LK_EXCLUSIVE)
+#define EM_CORE_UNLOCK(_sc) lockmgr(&(_sc)->core_spin, LK_RELEASE)
+#define EM_TX_UNLOCK(_sc) lockmgr(&(_sc)->tx_spin, LK_RELEASE)
+#define EM_RX_UNLOCK(_sc) lockmgr(&(_sc)->rx_spin, LK_RELEASE)
#define EM_CORE_LOCK_ASSERT(_sc)
#define EM_TX_LOCK_ASSERT(_sc)
if (uap->flags & OMAP_INHERIT)
flags |= MAP_INHERIT;
- lwkt_gettoken(&vm_token);
error = kern_mmap(curproc->p_vmspace, uap->addr, uap->len,
prot, flags, uap->fd, uap->pos,
&uap->sysmsg_resultp);
- lwkt_reltoken(&vm_token);
return (error);
}
*/
map->hint = entry;
ostart = entry->start;
- obj = entry->object.vm_object;
- for( lobj = tobj = obj; tobj; tobj = tobj->backing_object)
- lobj = tobj;
+ /*
+ * Find the bottom-most object, leaving the base object
+ * and the bottom-most object held (but only one hold
+ * if they happen to be the same).
+ */
+ obj = entry->object.vm_object;
+ vm_object_hold(obj);
+
+ lobj = obj;
+ while (lobj && (tobj = lobj->backing_object) != NULL) {
+ KKASSERT(tobj != obj);
+ vm_object_hold(tobj);
+ if (tobj == lobj->backing_object) {
+ if (lobj != obj) {
+ vm_object_lock_swap();
+ vm_object_drop(lobj);
+ }
+ lobj = tobj;
+ } else {
+ vm_object_drop(tobj);
+ }
+ }
if (lobj) {
off = IDX_TO_OFF(lobj->size);
name = "[stack]";
}
+ if (lobj != obj)
+ vm_object_drop(lobj);
+ vm_object_drop(obj);
+
/*
* We cannot safely hold the map locked while accessing
* userspace as a VM fault might recurse the locked map.
flags |= MAP_NOSYNC;
}
- lwkt_gettoken(&vm_token);
- lwkt_gettoken(&vmspace_token);
+ lwkt_gettoken(&curproc->p_vmspace->vm_map.token);
if (linux_flags & LINUX_MAP_GROWSDOWN) {
flags |= MAP_STACK;
error = kern_mmap(curproc->p_vmspace, addr, len,
prot, flags, fd, pos, &new);
- lwkt_reltoken(&vmspace_token);
- lwkt_reltoken(&vm_token);
+ lwkt_reltoken(&curproc->p_vmspace->vm_map.token);
if (error == 0)
*res = new;
count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
vm_map_lock(map);
object = vp->v_object;
- vm_object_reference(object);
+ vm_object_hold(object);
+ vm_object_reference_locked(object);
text_end = virtual_offset + a_out->a_text;
error = vm_map_insert(map, &count, object,
VM_MAPTYPE_NORMAL,
VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT);
+
if (error) {
+ vm_object_drop(object);
vm_map_unlock(map);
vm_map_entry_release(count);
return (error);
}
data_end = text_end + a_out->a_data;
if (a_out->a_data) {
- vm_object_reference(object);
+ vm_object_reference_locked(object);
error = vm_map_insert(map, &count, object,
file_offset + a_out->a_text,
text_end, data_end,
VM_PROT_ALL, VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_PREFAULT);
if (error) {
+ vm_object_drop(object);
vm_map_unlock(map);
vm_map_entry_release(count);
return (error);
}
}
+ vm_object_drop(object);
if (bss_size) {
error = vm_map_insert(map, &count, NULL, 0,
object = vp->v_object;
error = 0;
+ vm_object_hold(object);
+
/*
* It's necessary to fail if the filsz + offset taken from the
* header is greater than the actual file pager object's size.
*/
if ((off_t)filsz + offset > vp->v_filesize || filsz > memsz) {
uprintf("elf_load_section: truncated ELF file\n");
+ vm_object_drop(object);
return (ENOEXEC);
}
map_len = round_page(offset+filsz) - file_addr;
if (map_len != 0) {
- vm_object_reference(object);
+ vm_object_reference_locked(object);
/* cow flags: don't dump readonly sections in core */
cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
vm_map_entry_release(count);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(object);
+ vm_object_drop(object);
return (EINVAL);
}
/* we can stop now if we've covered it all */
if (memsz == filsz) {
+ vm_object_drop(object);
return (0);
}
}
vm_map_unlock(&vmspace->vm_map);
vm_map_entry_release(count);
if (rv != KERN_SUCCESS) {
+ vm_object_drop(object);
return (EINVAL);
}
}
vm_page_unhold(m);
}
if (error) {
+ vm_object_drop(object);
return (error);
}
}
+ vm_object_drop(object);
/*
* set it to the specified protection
*/
- vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot,
- FALSE);
+ vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len,
+ prot, FALSE);
return (error);
}
for (entry = map->header.next; error == 0 && entry != &map->header;
entry = entry->next) {
vm_object_t obj;
+ vm_object_t lobj;
+ vm_object_t tobj;
/*
* Don't dump inaccessible mappings, deal with legacy
if ((obj = entry->object.vm_object) == NULL)
continue;
- /* Find the deepest backing object. */
- while (obj->backing_object != NULL)
- obj = obj->backing_object;
-
- /* Ignore memory-mapped devices and such things. */
- if (obj->type != OBJT_DEFAULT &&
- obj->type != OBJT_SWAP &&
- obj->type != OBJT_VNODE)
- continue;
+ /*
+ * Find the bottom-most object, leaving the base object
+ * and the bottom-most object held (but only one hold
+ * if they happen to be the same).
+ */
+ vm_object_hold(obj);
+
+ lobj = obj;
+ while (lobj && (tobj = lobj->backing_object) != NULL) {
+ KKASSERT(tobj != obj);
+ vm_object_hold(tobj);
+ if (tobj == lobj->backing_object) {
+ if (lobj != obj) {
+ vm_object_lock_swap();
+ vm_object_drop(lobj);
+ }
+ lobj = tobj;
+ } else {
+ vm_object_drop(tobj);
+ }
+ }
- error = (*func)(entry, closure);
+ /*
+ * The callback only applies to default, swap, or vnode
+ * objects. Other types of objects such as memory-mapped
+ * devices are ignored.
+ */
+ if (lobj->type == OBJT_DEFAULT || lobj->type == OBJT_SWAP ||
+ lobj->type == OBJT_VNODE) {
+ error = (*func)(entry, closure);
+ }
+ if (lobj != obj)
+ vm_object_drop(lobj);
+ vm_object_drop(obj);
}
return (error);
}
*/
if ((long)sysinit % 8 != 0) {
kprintf("Fixing sysinit value...\n");
- sysinit = (long)sysinit + 4;
+ sysinit = (void *)((long)(intptr_t)sysinit + 4);
}
#endif
sysinit_end = SET_LIMIT(sysinit_set);
ru->ru_ixrss += pgtok(vm->vm_tsize);
ru->ru_idrss += pgtok(vm->vm_dsize);
ru->ru_isrss += pgtok(vm->vm_ssize);
- rss = pgtok(vmspace_resident_count(vm));
- if (ru->ru_maxrss < rss)
- ru->ru_maxrss = rss;
+ if (lwkt_trytoken(&vm->vm_map.token)) {
+ rss = pgtok(vmspace_resident_count(vm));
+ if (ru->ru_maxrss < rss)
+ ru->ru_maxrss = rss;
+ lwkt_reltoken(&vm->vm_map.token);
+ }
}
}
}
#endif
return(-1);
}
+
+/*
+ * Busy-wait for roughly ns nanoseconds, timed against the TSC.
+ *
+ * The deadline is obtained from tsc_get_target() and polled with
+ * tsc_test_target().  cpu_pause() runs before every poll, so it is
+ * always issued at least once.
+ *
+ * NOTE: expected to return immediately when the TSC is not supported
+ * (that is the behavior of tsc_test_target(), defined elsewhere).
+ */
+void
+tsc_delay(int ns)
+{
+	int64_t target = tsc_get_target(ns);
+
+	do {
+		cpu_pause();
+	} while (tsc_test_target(target) == 0);
+}
/*
* execve() system call.
- *
- * MPALMOSTSAFE
*/
int
sys_execve(struct execve_args *uap)
bzero(&args, sizeof(args));
- get_mplock();
error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
if (error == 0) {
error = exec_copyin_args(&args, uap->fname, PATH_USERSPACE,
exit1(W_EXITCODE(0, SIGABRT));
/* NOTREACHED */
}
- rel_mplock();
/*
* The syscall result is returned in registers to the new program.
if (pageno >= object->size)
return (EIO);
+ vm_object_hold(object);
m = vm_page_grab(object, pageno, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
-
- lwkt_gettoken(&vm_token);
while ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
ma = m;
vm_page_protect(m, VM_PROT_NONE);
vnode_pager_freepage(m);
}
- lwkt_reltoken(&vm_token);
return EIO;
}
}
- vm_page_hold(m); /* requires vm_token to be held */
+ vm_page_hold(m);
vm_page_wakeup(m); /* unbusy the page */
- lwkt_reltoken(&vm_token);
+ vm_object_drop(object);
*plwb = lwbuf_alloc(m, *plwb);
*pdata = (void *)lwbuf_kva(*plwb);
#include <sys/globaldata.h>
#ifdef _KERNEL
#include <sys/systm.h>
+#include <sys/sysref.h>
+#include <sys/sysref2.h>
#else
#include <string.h>
{
struct session *sess;
struct pgrp *pgrp;
+ struct vmspace *vm;
pgrp = p->p_pgrp;
sess = pgrp ? pgrp->pg_session : NULL;
kp->kp_nice = p->p_nice;
kp->kp_swtime = p->p_swtime;
- if (p->p_vmspace) {
- kp->kp_vm_map_size = p->p_vmspace->vm_map.size;
- kp->kp_vm_rssize = vmspace_resident_count(p->p_vmspace);
- kp->kp_vm_prssize = vmspace_president_count(p->p_vmspace);
- kp->kp_vm_swrss = p->p_vmspace->vm_swrss;
- kp->kp_vm_tsize = p->p_vmspace->vm_tsize;
- kp->kp_vm_dsize = p->p_vmspace->vm_dsize;
- kp->kp_vm_ssize = p->p_vmspace->vm_ssize;
+ if ((vm = p->p_vmspace) != NULL) {
+#ifdef _KERNEL
+ sysref_get(&vm->vm_sysref);
+ lwkt_gettoken(&vm->vm_map.token);
+#endif
+ kp->kp_vm_map_size = vm->vm_map.size;
+ kp->kp_vm_rssize = vmspace_resident_count(vm);
+ kp->kp_vm_prssize = vmspace_president_count(vm);
+ kp->kp_vm_swrss = vm->vm_swrss;
+ kp->kp_vm_tsize = vm->vm_tsize;
+ kp->kp_vm_dsize = vm->vm_dsize;
+ kp->kp_vm_ssize = vm->vm_ssize;
+#ifdef _KERNEL
+ lwkt_reltoken(&vm->vm_map.token);
+ sysref_put(&vm->vm_sysref);
+#endif
}
if (p->p_ucred && jailed(p->p_ucred))
}
#endif
- /*
- * So sue me, I'm too tired.
- */
- if (spin_trylock(&lkp->lk_spinlock) == FALSE) {
- if (flags & LK_NOSPINWAIT)
- return(EBUSY);
- spin_lock(&lkp->lk_spinlock);
- }
+ spin_lock(&lkp->lk_spinlock);
extflags = (flags | lkp->lk_flags) & LK_EXTFLG_MASK;
td = curthread;
SYSCTL_INT(_kern, OID_AUTO, zone_gen_alloc, CTLFLAG_RD, &ZoneGenAlloc, 0, "");
SYSCTL_INT(_kern, OID_AUTO, zone_cache, CTLFLAG_RW, &ZoneRelsThresh, 0, "");
+/*
+ * Report the amount of memory subsystem caches may size themselves
+ * against: the physical memory described by vmstats.v_page_count,
+ * capped at the size of the KVM memory space (KvaSize).
+ *
+ * The result is expressed in megabytes.
+ */
+size_t
+kmem_lim_size(void)
+{
+	size_t avail = (size_t)vmstats.v_page_count * PAGE_SIZE;
+	size_t bytes = (avail > KvaSize) ? (size_t)KvaSize : avail;
+
+	return (bytes / (1024 * 1024));
+}
+
static void
kmeminit(void *dummy)
{
int usesize;
int i;
- limsize = (size_t)vmstats.v_page_count * PAGE_SIZE;
- if (limsize > KvaSize)
- limsize = KvaSize;
+ limsize = kmem_lim_size();
+ usesize = (int)(limsize * 1024); /* convert to KB */
- usesize = (int)(limsize / 1024); /* convert to KB */
+ /*
+ * If the machine has a large KVM space and more than 8G of ram,
+ * double the zone release threshold to reduce SMP invalidations.
+ * If more than 16G of ram, do it again.
+ *
+ * The BIOS eats a little ram so add some slop. We want 8G worth of
+ * memory sticks to trigger the first adjustment.
+ */
+ if (ZoneRelsThresh == ZONE_RELS_THRESH) {
+ if (limsize >= 7 * 1024)
+ ZoneRelsThresh *= 2;
+ if (limsize >= 15 * 1024)
+ ZoneRelsThresh *= 2;
+ }
+ /*
+ * Calculate the zone size. This typically calculates to
+ * ZALLOC_MAX_ZONE_SIZE
+ */
ZoneSize = ZALLOC_MIN_ZONE_SIZE;
while (ZoneSize < ZALLOC_MAX_ZONE_SIZE && (ZoneSize << 1) < usesize)
ZoneSize <<= 1;
if (vmstats.v_page_count == 0)
panic("malloc_init not allowed before vm init");
- limsize = (size_t)vmstats.v_page_count * PAGE_SIZE;
- if (limsize > KvaSize)
- limsize = KvaSize;
+ limsize = kmem_lim_size() * (1024 * 1024);
type->ks_limit = limsize / 10;
type->ks_next = kmemstatistics;
* Interrupt code which has preempted other code is not allowed to
* use PQ_CACHE pages. However, if an interrupt thread is run
* non-preemptively or blocks and then runs non-preemptively, then
- * it is free to use PQ_CACHE pages.
+ * it is free to use PQ_CACHE pages. <--- may not apply any longer XXX
*/
static void *
kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags)
vm_size_t i;
vm_offset_t addr;
int count, vmflags, base_vmflags;
- vm_page_t mp[ZALLOC_MAX_ZONE_SIZE / PAGE_SIZE];
+ vm_page_t mbase = NULL;
+ vm_page_t m;
thread_t td;
size = round_page(size);
addr = vm_map_min(&kernel_map);
- /*
- * Reserve properly aligned space from kernel_map. RNOWAIT allocations
- * cannot block.
- */
- if (flags & M_RNOWAIT) {
- if (lwkt_trytoken(&vm_token) == 0)
- return(NULL);
- } else {
- lwkt_gettoken(&vm_token);
- }
count = vm_map_entry_reserve(MAP_RESERVE_COUNT);
crit_enter();
vm_map_lock(&kernel_map);
panic("kmem_slab_alloc(): kernel_map ran out of space!");
vm_map_entry_release(count);
crit_exit();
- lwkt_reltoken(&vm_token);
return(NULL);
}
/*
* kernel_object maps 1:1 to kernel_map.
*/
- vm_object_reference(&kernel_object);
+ vm_object_hold(&kernel_object);
+ vm_object_reference_locked(&kernel_object);
vm_map_insert(&kernel_map, &count,
&kernel_object, addr, addr, addr + size,
VM_MAPTYPE_NORMAL,
VM_PROT_ALL, VM_PROT_ALL,
0);
+ vm_object_drop(&kernel_object);
+ vm_map_set_wired_quick(&kernel_map, addr, size, &count);
+ vm_map_unlock(&kernel_map);
td = curthread;
flags, ((int **)&size)[-1]);
}
-
/*
- * Allocate the pages. Do not mess with the PG_ZERO flag yet.
+ * Allocate the pages. Do not mess with the PG_ZERO flag or map
+ * them yet. VM_ALLOC_NORMAL can only be set if we are not preempting.
+ *
+ * VM_ALLOC_SYSTEM is automatically set if we are preempting and
+ * M_WAITOK was specified as an alternative (i.e. M_USE_RESERVE is
+ * implied in this case), though I'm not sure if we really need to
+ * do that.
*/
- for (i = 0; i < size; i += PAGE_SIZE) {
- vm_page_t m;
-
- /*
- * VM_ALLOC_NORMAL can only be set if we are not preempting.
- *
- * VM_ALLOC_SYSTEM is automatically set if we are preempting and
- * M_WAITOK was specified as an alternative (i.e. M_USE_RESERVE is
- * implied in this case), though I'm not sure if we really need to
- * do that.
- */
- vmflags = base_vmflags;
- if (flags & M_WAITOK) {
- if (td->td_preempted)
- vmflags |= VM_ALLOC_SYSTEM;
- else
- vmflags |= VM_ALLOC_NORMAL;
- }
+ vmflags = base_vmflags;
+ if (flags & M_WAITOK) {
+ if (td->td_preempted)
+ vmflags |= VM_ALLOC_SYSTEM;
+ else
+ vmflags |= VM_ALLOC_NORMAL;
+ }
+ vm_object_hold(&kernel_object);
+ for (i = 0; i < size; i += PAGE_SIZE) {
m = vm_page_alloc(&kernel_object, OFF_TO_IDX(addr + i), vmflags);
- if (i / PAGE_SIZE < NELEM(mp))
- mp[i / PAGE_SIZE] = m;
+ if (i == 0)
+ mbase = m;
/*
* If the allocation failed we either return NULL or we retry.
if (m == NULL) {
if (flags & M_WAITOK) {
if (td->td_preempted) {
- vm_map_unlock(&kernel_map);
lwkt_switch();
- vm_map_lock(&kernel_map);
} else {
- vm_map_unlock(&kernel_map);
vm_wait(0);
- vm_map_lock(&kernel_map);
}
i -= PAGE_SIZE; /* retry */
continue;
}
+ break;
+ }
+ }
- /*
- * We were unable to recover, cleanup and return NULL
- *
- * (vm_token already held)
- */
- while (i != 0) {
- i -= PAGE_SIZE;
- m = vm_page_lookup(&kernel_object, OFF_TO_IDX(addr + i));
- /* page should already be busy */
- vm_page_free(m);
- }
- vm_map_delete(&kernel_map, addr, addr + size, &count);
- vm_map_unlock(&kernel_map);
- vm_map_entry_release(count);
- crit_exit();
- lwkt_reltoken(&vm_token);
- return(NULL);
+ /*
+ * Check and deal with an allocation failure
+ */
+ if (i != size) {
+ while (i != 0) {
+ i -= PAGE_SIZE;
+ m = vm_page_lookup(&kernel_object, OFF_TO_IDX(addr + i));
+ /* page should already be busy */
+ vm_page_free(m);
}
+ vm_map_lock(&kernel_map);
+ vm_map_delete(&kernel_map, addr, addr + size, &count);
+ vm_map_unlock(&kernel_map);
+ vm_object_drop(&kernel_object);
+
+ vm_map_entry_release(count);
+ crit_exit();
+ return(NULL);
}
/*
* Success!
*
- * Mark the map entry as non-pageable using a routine that allows us to
- * populate the underlying pages.
- *
- * The pages were busied by the allocations above.
+ * NOTE: The VM pages are still busied. mbase points to the first one
+ * but we have to iterate via vm_page_next()
*/
- vm_map_set_wired_quick(&kernel_map, addr, size, &count);
+ vm_object_drop(&kernel_object);
crit_exit();
/*
* Enter the pages into the pmap and deal with PG_ZERO and M_ZERO.
*/
- for (i = 0; i < size; i += PAGE_SIZE) {
- vm_page_t m;
+ m = mbase;
+ i = 0;
- if (i / PAGE_SIZE < NELEM(mp))
- m = mp[i / PAGE_SIZE];
- else
- m = vm_page_lookup(&kernel_object, OFF_TO_IDX(addr + i));
+ while (i < size) {
+ /*
+ * page should already be busy
+ */
m->valid = VM_PAGE_BITS_ALL;
- /* page should already be busy */
vm_page_wire(m);
- pmap_enter(&kernel_pmap, addr + i, m, VM_PROT_ALL, 1);
+ pmap_enter(&kernel_pmap, addr + i, m, VM_PROT_ALL | VM_PROT_NOSYNC, 1);
if ((m->flags & PG_ZERO) == 0 && (flags & M_ZERO))
bzero((char *)addr + i, PAGE_SIZE);
vm_page_flag_clear(m, PG_ZERO);
KKASSERT(m->flags & (PG_WRITEABLE | PG_MAPPED));
vm_page_flag_set(m, PG_REFERENCED);
vm_page_wakeup(m);
+
+ i += PAGE_SIZE;
+ vm_object_hold(&kernel_object);
+ m = vm_page_next(m);
+ vm_object_drop(&kernel_object);
}
- vm_map_unlock(&kernel_map);
+ smp_invltlb();
vm_map_entry_release(count);
- lwkt_reltoken(&vm_token);
return((void *)addr);
}
kmem_slab_free(void *ptr, vm_size_t size)
{
crit_enter();
- lwkt_gettoken(&vm_token);
vm_map_remove(&kernel_map, (vm_offset_t)ptr, (vm_offset_t)ptr + size);
- lwkt_reltoken(&vm_token);
crit_exit();
}
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
- *
- * $DragonFly: src/sys/kern/kern_spinlock.c,v 1.16 2008/09/11 01:11:42 y0netan1 Exp $
+ */
+/*
+ * The spinlock code utilizes two counters to form a virtual FIFO, allowing
+ * a spinlock to allocate a slot and then only issue memory read operations
+ * until it is handed the lock (if it is not the next owner for the lock).
*/
#include <sys/param.h>
#endif
#include <sys/priv.h>
#include <machine/atomic.h>
+#include <machine/cpu.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/clock.h>
#include <sys/spinlock2.h>
#include <sys/ktr.h>
-#define BACKOFF_INITIAL 1
-#define BACKOFF_LIMIT 256
-
#ifdef SMP
+struct indefinite_info { /* per-wait state handed to spin_indefinite_check() */
+ sysclock_t base; /* sys_cputimer count at the start of the current interval */
+ int secs; /* 0 until the first check; then bumped each time more than freq counts elapse */
+};
+
/*
* Kernal Trace
*/
KTR_INFO_MASTER(spin);
KTR_INFO(KTR_SPIN_CONTENTION, spin, beg, 0, SPIN_STRING, SPIN_ARG_SIZE);
KTR_INFO(KTR_SPIN_CONTENTION, spin, end, 1, SPIN_STRING, SPIN_ARG_SIZE);
-KTR_INFO(KTR_SPIN_CONTENTION, spin, backoff, 2,
- "spin=%p bo1=%d thr=%p bo=%d",
- ((2 * sizeof(void *)) + (2 * sizeof(int))));
-KTR_INFO(KTR_SPIN_CONTENTION, spin, bofail, 3, SPIN_STRING, SPIN_ARG_SIZE);
-
-#define logspin(name, mtx, type) \
- KTR_LOG(spin_ ## name, mtx, type)
-#define logspin_backoff(mtx, bo1, thr, bo) \
- KTR_LOG(spin_backoff, mtx, bo1, thr, bo)
+#define logspin(name, spin, type) \
+ KTR_LOG(spin_ ## name, spin, type)
#ifdef INVARIANTS
static int spin_lock_test_mode;
#endif
+struct spinlock pmap_spin = SPINLOCK_INITIALIZER(pmap_spin);
static int64_t spinlocks_contested1;
SYSCTL_QUAD(_debug, OID_AUTO, spinlocks_contested1, CTLFLAG_RD,
&spinlocks_contested2, 0,
"Serious spinlock contention count");
-static int spinlocks_backoff_limit = BACKOFF_LIMIT;
-SYSCTL_INT(_debug, OID_AUTO, spinlocks_bolim, CTLFLAG_RW,
- &spinlocks_backoff_limit, 0,
- "Contested spinlock backoff limit");
+static int spinlocks_hardloops = 40;
+SYSCTL_INT(_debug, OID_AUTO, spinlocks_hardloops, CTLFLAG_RW,
+ &spinlocks_hardloops, 0,
+ "Hard loops waiting for spinlock");
#define SPINLOCK_NUM_POOL (1024)
static struct spinlock pool_spinlocks[SPINLOCK_NUM_POOL];
-struct exponential_backoff {
- int backoff;
- int nsec;
- struct spinlock *mtx;
- sysclock_t base;
-};
-static int exponential_backoff(struct exponential_backoff *bo);
-
-static __inline
-void
-exponential_init(struct exponential_backoff *bo, struct spinlock *mtx)
-{
- bo->backoff = BACKOFF_INITIAL;
- bo->nsec = 0;
- bo->mtx = mtx;
- bo->base = 0; /* silence gcc */
-}
+static int spin_indefinite_check(struct spinlock *spin,
+ struct indefinite_info *info);
/*
* We contested due to another exclusive lock holder. We lose.
+ *
+ * We have to unwind the attempt and may acquire the spinlock
+ * anyway while doing so. countb was incremented on our behalf.
*/
int
-spin_trylock_wr_contested2(globaldata_t gd)
+spin_trylock_contested(struct spinlock *spin)
{
- ++spinlocks_contested1;
+ globaldata_t gd = mycpu;
+
+ /*++spinlocks_contested1;*/
--gd->gd_spinlocks_wr;
--gd->gd_curthread->td_critcount;
return (FALSE);
}
/*
- * We were either contested due to another exclusive lock holder,
- * or due to the presence of shared locks
+ * The spin_lock() inline was unable to acquire the lock.
*
- * NOTE: If value indicates an exclusively held mutex, no shared bits
- * would have been set and we can throw away value.
+ * atomic_swap_int() is the absolute fastest spinlock instruction, at
+ * least on multi-socket systems. All instructions seem to be about
+ * the same on single-socket multi-core systems.
*/
void
-spin_lock_wr_contested2(struct spinlock *mtx)
+spin_lock_contested(struct spinlock *spin)
{
- struct exponential_backoff backoff;
- int value;
+ int i;
- /*
- * Wait until we can gain exclusive access vs another exclusive
- * holder.
- */
- ++spinlocks_contested1;
- exponential_init(&backoff, mtx);
-
- logspin(beg, mtx, 'w');
- do {
- if (exponential_backoff(&backoff))
- break;
- value = atomic_swap_int(&mtx->lock, SPINLOCK_EXCLUSIVE);
- } while (value & SPINLOCK_EXCLUSIVE);
- logspin(end, mtx, 'w');
+ /*
+ * Fast phase: keep swapping 1 into counta until the swap returns 0,
+ * pausing after each failed attempt.  Every failed attempt that
+ * stays in this loop bumps countb and i.
+ */
+ i = 0;
+ while (atomic_swap_int(&spin->counta, 1)) {
+ cpu_pause();
+ /*
+ * Slow phase, entered once i reaches spinlocks_hardloops:
+ * continue spinning, but call spin_indefinite_check() on
+ * every 128th increment of i.  A nonzero return abandons
+ * the wait, and the function then returns even though the
+ * last swap did not succeed.
+ */
+ if (i == spinlocks_hardloops) {
+ struct indefinite_info info = { 0, 0 };
+
+ logspin(beg, spin, 'w');
+ while (atomic_swap_int(&spin->counta, 1)) {
+ cpu_pause();
+ ++spin->countb;
+ if ((++i & 0x7F) == 0x7F) {
+ if (spin_indefinite_check(spin, &info))
+ break;
+ }
+ }
+ logspin(end, spin, 'w');
+ return;
+ }
+ ++spin->countb;
+ ++i;
+ }
}
static __inline int
return (i);
}
-struct spinlock *
-spin_pool_lock(void *chan)
+void
+_spin_pool_lock(void *chan)
{
struct spinlock *sp;
sp = &pool_spinlocks[_spin_pool_hash(chan)];
spin_lock(sp);
-
- return (sp);
}
void
-spin_pool_unlock(void *chan)
+_spin_pool_unlock(void *chan)
{
struct spinlock *sp;
spin_unlock(sp);
}
-/*
- * Handle exponential backoff and indefinite waits.
- *
- * If the system is handling a panic we hand the spinlock over to the caller
- * after 1 second. After 10 seconds we attempt to print a debugger
- * backtrace. We also run pending interrupts in order to allow a console
- * break into DDB.
- */
+
static
int
-exponential_backoff(struct exponential_backoff *bo)
+spin_indefinite_check(struct spinlock *spin, struct indefinite_info *info)
{
sysclock_t count;
- int backoff;
-
-#ifdef _RDTSC_SUPPORTED_
- if (cpu_feature & CPUID_TSC) {
- backoff =
- (((u_long)rdtsc() ^ (((u_long)curthread) >> 5)) &
- (bo->backoff - 1)) + BACKOFF_INITIAL;
- } else
-#endif
- backoff = bo->backoff;
- logspin_backoff(bo->mtx, bo->backoff, curthread, backoff);
- /*
- * Quick backoff
- */
- for (; backoff; --backoff)
- cpu_pause();
- if (bo->backoff < spinlocks_backoff_limit) {
- bo->backoff <<= 1;
- return (FALSE);
- } else {
- bo->backoff = BACKOFF_INITIAL;
- }
-
- logspin(bofail, bo->mtx, 'u');
-
- /*
- * Indefinite
- */
- ++spinlocks_contested2;
cpu_spinlock_contested();
- if (bo->nsec == 0) {
- bo->base = sys_cputimer->count();
- bo->nsec = 1;
- }
count = sys_cputimer->count();
- if (count - bo->base > sys_cputimer->freq) {
- kprintf("spin_lock: %p, indefinite wait!\n", bo->mtx);
+ if (info->secs == 0) {
+ info->base = count;
+ ++info->secs;
+ } else if (count - info->base > sys_cputimer->freq) {
+ kprintf("spin_lock: %p, indefinite wait (%d secs)!\n",
+ spin, info->secs);
+ info->base = count;
+ ++info->secs;
if (panicstr)
return (TRUE);
#if defined(INVARIANTS)
return (TRUE);
}
#endif
- ++bo->nsec;
#if defined(INVARIANTS)
- if (bo->nsec == 11)
+ if (info->secs == 11)
print_backtrace(-1);
#endif
- if (bo->nsec == 60)
- panic("spin_lock: %p, indefinite wait!\n", bo->mtx);
- bo->base = count;
+ if (info->secs == 60)
+ panic("spin_lock: %p, indefinite wait!\n", spin);
}
return (FALSE);
}
static int
sysctl_spin_lock_test(SYSCTL_HANDLER_ARGS)
{
- struct spinlock mtx;
+ struct spinlock spin;
int error;
int value = 0;
int i;
* Indefinite wait test
*/
if (value == 1) {
- spin_init(&mtx);
- spin_lock(&mtx); /* force an indefinite wait */
+ spin_init(&spin);
+ spin_lock(&spin); /* force an indefinite wait */
spin_lock_test_mode = 1;
- spin_lock(&mtx);
- spin_unlock(&mtx); /* Clean up the spinlock count */
- spin_unlock(&mtx);
+ spin_lock(&spin);
+ spin_unlock(&spin); /* Clean up the spinlock count */
+ spin_unlock(&spin);
spin_lock_test_mode = 0;
}
if (value == 2) {
globaldata_t gd = mycpu;
- spin_init(&mtx);
+ spin_init(&spin);
for (i = spin_test_count; i > 0; --i) {
- spin_lock_quick(gd, &mtx);
- spin_unlock_quick(gd, &mtx);
+ spin_lock_quick(gd, &spin);
+ spin_unlock_quick(gd, &spin);
}
}
* tsleep/wakeup hash table parameters. Try to find the sweet spot for
* like addresses being slept on.
*/
-#define TABLESIZE 1024
-#define LOOKUP(x) (((intptr_t)(x) >> 6) & (TABLESIZE - 1))
+#define TABLESIZE 4001
+#define LOOKUP(x) (((u_int)(uintptr_t)(x)) % TABLESIZE)
static cpumask_t slpque_cpumasks[TABLESIZE];
if (td->td_flags & TDF_TSLEEPQ) {
id = LOOKUP(td->td_wchan);
TAILQ_REMOVE(&gd->gd_tsleep_hash[id], td, td_sleepq);
- if (TAILQ_FIRST(&gd->gd_tsleep_hash[id]) == NULL)
- atomic_clear_cpumask(&slpque_cpumasks[id], gd->gd_cpumask);
+ if (TAILQ_FIRST(&gd->gd_tsleep_hash[id]) == NULL) {
+ atomic_clear_cpumask(&slpque_cpumasks[id],
+ gd->gd_cpumask);
+ }
} else {
td->td_flags |= TDF_TSLEEPQ;
}
* Otherwise the physical page we sleep on my not match the page
* being woken up.
*/
- lwkt_gettoken(&vm_token);
m = vm_fault_page_quick((vm_offset_t)uap->ptr,
VM_PROT_READ|VM_PROT_WRITE, &error);
if (m == NULL) {
/*vm_page_dirty(m); we don't actually dirty the page */
vm_page_unhold(m);
done:
- lwkt_reltoken(&vm_token);
return(error);
}
static void
umtx_sleep_page_action_cow(vm_page_t m, vm_page_action_t action)
{
- lwkt_gettoken(&vm_token);
wakeup_domain(action->data, PDOMAIN_UMTX);
- lwkt_reltoken(&vm_token);
}
/*
cpu_mfence();
if ((vm_offset_t)uap->ptr & (sizeof(int) - 1))
return (EFAULT);
- lwkt_gettoken(&vm_token);
m = vm_fault_page_quick((vm_offset_t)uap->ptr, VM_PROT_READ, &error);
if (m == NULL) {
error = EFAULT;
vm_page_unhold(m);
error = 0;
done:
- lwkt_reltoken(&vm_token);
return(error);
}
xio->xio_error = 0;
if ((n = PAGE_SIZE - xio->xio_offset) > kbytes)
n = kbytes;
- lwkt_gettoken(&vm_token);
- crit_enter();
for (i = 0; n && i < XIO_INTERNAL_PAGES; ++i) {
if ((paddr = pmap_kextract(addr)) == 0)
break;
n = PAGE_SIZE;
addr += PAGE_SIZE;
}
- crit_exit();
- lwkt_reltoken(&vm_token);
xio->xio_npages = i;
/*
xio->xio_pages = xio->xio_internal_pages;
xio->xio_npages = npages;
xio->xio_error = 0;
- lwkt_gettoken(&vm_token);
- crit_enter();
for (i = 0; i < npages; ++i) {
vm_page_hold(mbase[i]);
xio->xio_pages[i] = mbase[i];
}
- crit_exit();
- lwkt_reltoken(&vm_token);
return(0);
}
int i;
vm_page_t m;
- lwkt_gettoken(&vm_token);
- crit_enter();
for (i = 0; i < xio->xio_npages; ++i) {
m = xio->xio_pages[i];
if (xio->xio_flags & XIOF_WRITE)
vm_page_dirty(m);
vm_page_unhold(m);
}
- crit_exit();
- lwkt_reltoken(&vm_token);
xio->xio_offset = 0;
xio->xio_npages = 0;
xio->xio_bytes = 0;
error = ENOMEM;
goto out;
}
- vm_object_reference(ef->object);
+ vm_object_hold(ef->object);
+ vm_object_reference_locked(ef->object);
ef->address = (caddr_t)vm_map_min(&kernel_map);
error = vm_map_find(&kernel_map, ef->object, 0,
(vm_offset_t *)&ef->address,
1, VM_MAPTYPE_NORMAL,
VM_PROT_ALL, VM_PROT_ALL,
0);
+ vm_object_drop(ef->object);
if (error) {
vm_object_deallocate(ef->object);
kfree(ef, M_LINKER);
error = ENOMEM;
goto out;
}
- vm_object_reference(ef->object);
+ vm_object_hold(ef->object);
+ vm_object_reference_locked(ef->object);
ef->address = (caddr_t) vm_map_min(&kernel_map);
ef->bytes = 0;
round_page(mapsize), PAGE_SIZE,
TRUE, VM_MAPTYPE_NORMAL,
VM_PROT_ALL, VM_PROT_ALL, FALSE);
+ vm_object_drop(ef->object);
if (error) {
vm_object_deallocate(ef->object);
ef->object = NULL;
static __int64_t ipiq_avoided; /* interlock with target avoids cpu ipi */
static __int64_t ipiq_passive; /* passive IPI messages */
static __int64_t ipiq_cscount; /* number of cpu synchronizations */
-static int ipiq_optimized = 1; /* XXX temporary sysctl */
static int ipiq_debug; /* set to 1 for debug */
#ifdef PANIC_DEBUG
static int panic_ipiq_cpu = -1;
"Number of passive IPI messages sent");
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_cscount, CTLFLAG_RW, &ipiq_cscount, 0,
"Number of cpu synchronizations");
-SYSCTL_INT(_lwkt, OID_AUTO, ipiq_optimized, CTLFLAG_RW, &ipiq_optimized, 0,
- "");
SYSCTL_INT(_lwkt, OID_AUTO, ipiq_debug, CTLFLAG_RW, &ipiq_debug, 0,
"");
#ifdef PANIC_DEBUG
++ipiq_fifofull;
DEBUG_PUSH_INFO("send_ipiq3");
while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
- if (atomic_poll_acquire_int(&ip->ip_npoll) || ipiq_optimized == 0) {
+ if (atomic_poll_acquire_int(&target->gd_npoll)) {
logipiq(cpu_send, func, arg1, arg2, gd, target);
cpu_send_ipiq(target->gd_cpuid);
}
* Queue the new message
*/
windex = ip->ip_windex & MAXCPUFIFO_MASK;
- ip->ip_func[windex] = func;
- ip->ip_arg1[windex] = arg1;
- ip->ip_arg2[windex] = arg2;
+ ip->ip_info[windex].func = func;
+ ip->ip_info[windex].arg1 = arg1;
+ ip->ip_info[windex].arg2 = arg2;
cpu_sfence();
++ip->ip_windex;
+ atomic_set_cpumask(&target->gd_ipimask, gd->gd_cpumask);
/*
* signal the target cpu that there is work pending.
*/
- if (atomic_poll_acquire_int(&ip->ip_npoll) || ipiq_optimized == 0) {
+ if (atomic_poll_acquire_int(&target->gd_npoll)) {
logipiq(cpu_send, func, arg1, arg2, gd, target);
cpu_send_ipiq(target->gd_cpuid);
} else {
++ipiq_fifofull;
DEBUG_PUSH_INFO("send_ipiq3_passive");
while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
- if (atomic_poll_acquire_int(&ip->ip_npoll) || ipiq_optimized == 0) {
+ if (atomic_poll_acquire_int(&target->gd_npoll)) {
logipiq(cpu_send, func, arg1, arg2, gd, target);
cpu_send_ipiq(target->gd_cpuid);
}
* Queue the new message
*/
windex = ip->ip_windex & MAXCPUFIFO_MASK;
- ip->ip_func[windex] = func;
- ip->ip_arg1[windex] = arg1;
- ip->ip_arg2[windex] = arg2;
+ ip->ip_info[windex].func = func;
+ ip->ip_info[windex].arg1 = arg1;
+ ip->ip_info[windex].arg2 = arg2;
cpu_sfence();
++ip->ip_windex;
+ atomic_set_cpumask(&target->gd_ipimask, gd->gd_cpumask);
--gd->gd_intr_nesting_level;
/*
return(ENOENT);
}
windex = ip->ip_windex & MAXCPUFIFO_MASK;
- ip->ip_func[windex] = func;
- ip->ip_arg1[windex] = arg1;
- ip->ip_arg2[windex] = arg2;
+ ip->ip_info[windex].func = func;
+ ip->ip_info[windex].arg1 = arg1;
+ ip->ip_info[windex].arg2 = arg2;
cpu_sfence();
++ip->ip_windex;
+ atomic_set_cpumask(&target->gd_ipimask, gd->gd_cpumask);
/*
* This isn't a passive IPI, we still have to signal the target cpu.
*/
- if (atomic_poll_acquire_int(&ip->ip_npoll) || ipiq_optimized == 0) {
+ if (atomic_poll_acquire_int(&target->gd_npoll)) {
logipiq(cpu_send, func, arg1, arg2, gd, target);
cpu_send_ipiq(target->gd_cpuid);
} else {
* Called from IPI interrupt (like a fast interrupt), which has placed
* us in a critical section. The MP lock may or may not be held.
* May also be called from doreti or splz, or be reentrantly called
- * indirectly through the ip_func[] we run.
+ * indirectly through the ip_info[].func we run.
*
* There are two versions, one where no interrupt frame is available (when
* called from the send code and from splz, and one where an interrupt
globaldata_t gd = mycpu;
globaldata_t sgd;
lwkt_ipiq_t ip;
+ cpumask_t mask;
int n;
++gd->gd_processing_ipiq;
again:
- for (n = 0; n < ncpus; ++n) {
+ cpu_lfence();
+ mask = gd->gd_ipimask;
+ atomic_clear_cpumask(&gd->gd_ipimask, mask);
+ while (mask) {
+ n = BSFCPUMASK(mask);
if (n != gd->gd_cpuid) {
sgd = globaldata_find(n);
ip = sgd->gd_ipiq;
;
}
}
+ mask &= ~CPUMASK(n);
}
if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL)) {
if (gd->gd_curthread->td_cscount == 0)
goto again;
/* need_ipiq(); do not reflag */
}
+
+ /*
+ * Interlock to allow more IPI interrupts. Recheck ipimask after
+ * releasing gd_npoll.
+ */
+ if (gd->gd_ipimask)
+ goto again;
+ atomic_poll_release_int(&gd->gd_npoll);
+ cpu_mfence();
+ if (gd->gd_ipimask)
+ goto again;
--gd->gd_processing_ipiq;
}
globaldata_t gd = mycpu;
globaldata_t sgd;
lwkt_ipiq_t ip;
+ cpumask_t mask;
int n;
again:
- for (n = 0; n < ncpus; ++n) {
+ cpu_lfence();
+ mask = gd->gd_ipimask;
+ atomic_clear_cpumask(&gd->gd_ipimask, mask);
+ while (mask) {
+ n = BSFCPUMASK(mask);
if (n != gd->gd_cpuid) {
sgd = globaldata_find(n);
ip = sgd->gd_ipiq;
;
}
}
+ mask &= ~CPUMASK(n);
}
if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, frame)) {
/* need_ipiq(); do not reflag */
}
}
+
+ /*
+ * Interlock to allow more IPI interrupts. Recheck ipimask after
+ * releasing gd_npoll.
+ */
+ if (gd->gd_ipimask)
+ goto again;
+ atomic_poll_release_int(&gd->gd_npoll);
+ cpu_mfence();
+ if (gd->gd_ipimask)
+ goto again;
}
#if 0
#endif
/*
+ * Clear the originating core from our ipimask, we will process all
+ * incoming messages.
+ *
* Obtain the current write index, which is modified by a remote cpu.
* Issue a load fence to prevent speculative reads of e.g. data written
* by the other cpu prior to it updating the index.
while (wi - (ri = ip->ip_rindex) > 0) {
ri &= MAXCPUFIFO_MASK;
cpu_lfence();
- copy_func = ip->ip_func[ri];
- copy_arg1 = ip->ip_arg1[ri];
- copy_arg2 = ip->ip_arg2[ri];
+ copy_func = ip->ip_info[ri].func;
+ copy_arg1 = ip->ip_info[ri].arg1;
+ copy_arg2 = ip->ip_info[ri].arg2;
cpu_mfence();
++ip->ip_rindex;
KKASSERT((ip->ip_rindex & MAXCPUFIFO_MASK) ==
--mygd->gd_intr_nesting_level;
/*
- * If the queue is empty release ip_npoll to enable the other cpu to
- * send us an IPI interrupt again.
- *
- * Return non-zero if there is still more in the queue. Note that we
- * must re-check the indexes after potentially releasing ip_npoll. The
- * caller must loop or otherwise ensure that a loop will occur prior to
- * blocking.
+ * Return non-zero if there is still more in the queue.
*/
- if (ip->ip_rindex == ip->ip_windex)
- atomic_poll_release_int(&ip->ip_npoll);
cpu_lfence();
return (ip->ip_rindex != ip->ip_windex);
}
lwkt_cpusync_interlock(lwkt_cpusync_t cs)
{
#ifdef SMP
+#if 0
+ const char *smsg = "SMPSYNL";
+#endif
globaldata_t gd = mycpu;
cpumask_t mask;
++gd->gd_curthread->td_cscount;
lwkt_send_ipiq_mask(mask, (ipifunc1_t)lwkt_cpusync_remote1, cs);
logipiq2(sync_start, mask);
+#if 0
+ if (gd->gd_curthread->td_wmesg == NULL)
+ gd->gd_curthread->td_wmesg = smsg;
+#endif
while (cs->cs_mack != mask) {
lwkt_process_ipiq();
cpu_pause();
}
+#if 0
+ if (gd->gd_curthread->td_wmesg == smsg)
+ gd->gd_curthread->td_wmesg = NULL;
+#endif
DEBUG_POP_INFO();
}
#else
{
globaldata_t gd = mycpu;
#ifdef SMP
+#if 0
+ const char *smsg = "SMPSYNU";
+#endif
cpumask_t mask;
/*
cs->cs_func(cs->cs_data);
if (mask) {
DEBUG_PUSH_INFO("cpusync_deinterlock");
+#if 0
+ if (gd->gd_curthread->td_wmesg == NULL)
+ gd->gd_curthread->td_wmesg = smsg;
+#endif
while (cs->cs_mack != mask) {
lwkt_process_ipiq();
cpu_pause();
}
+#if 0
+ if (gd->gd_curthread->td_wmesg == smsg)
+ gd->gd_curthread->td_wmesg = NULL;
+#endif
DEBUG_POP_INFO();
/*
* cpusyncq ipis may be left queued without the RQF flag set due to
ip = &gd->gd_cpusyncq;
wi = ip->ip_windex & MAXCPUFIFO_MASK;
- ip->ip_func[wi] = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2;
- ip->ip_arg1[wi] = cs;
- ip->ip_arg2[wi] = 0;
+ ip->ip_info[wi].func = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2;
+ ip->ip_info[wi].arg1 = cs;
+ ip->ip_info[wi].arg2 = 0;
cpu_sfence();
++ip->ip_windex;
if (ipiq_debug && (ip->ip_windex & 0xFFFFFF) == 0) {
/*
- * Copyright (c) 2003-2010 The DragonFly Project. All rights reserved.
+ * Copyright (c) 2003-2011 The DragonFly Project. All rights reserved.
*
* This code is derived from software contributed to The DragonFly Project
* by Matthew Dillon <dillon@backplane.com>
static void lwkt_setcpu_remote(void *arg);
#endif
static void lwkt_fairq_accumulate(globaldata_t gd, thread_t td);
+static int lwkt_fairq_tick(globaldata_t gd, thread_t td);
extern void cpu_heavy_restore(void);
extern void cpu_lwkt_restore(void);
SYSCTL_QUAD(_lwkt, OID_AUTO, token_contention_count, CTLFLAG_RW,
&token_contention_count, 0, "spinning due to token contention");
#endif
-static int fairq_enable = 1;
+static int fairq_enable = 0;
SYSCTL_INT(_lwkt, OID_AUTO, fairq_enable, CTLFLAG_RW,
&fairq_enable, 0, "Turn on fairq priority accumulators");
+static int fairq_bypass = 1;
+SYSCTL_INT(_lwkt, OID_AUTO, fairq_bypass, CTLFLAG_RW,
+ &fairq_bypass, 0, "Allow fairq to bypass td on token failure");
+extern int lwkt_sched_debug;
+int lwkt_sched_debug = 0;
+SYSCTL_INT(_lwkt, OID_AUTO, sched_debug, CTLFLAG_RW,
+ &lwkt_sched_debug, 0, "Scheduler debug");
static int lwkt_spin_loops = 10;
SYSCTL_INT(_lwkt, OID_AUTO, spin_loops, CTLFLAG_RW,
- &lwkt_spin_loops, 0, "");
-static int lwkt_spin_delay = 1;
-SYSCTL_INT(_lwkt, OID_AUTO, spin_delay, CTLFLAG_RW,
- &lwkt_spin_delay, 0, "Scheduler spin delay in microseconds 0=auto");
-static int lwkt_spin_method = 1;
-SYSCTL_INT(_lwkt, OID_AUTO, spin_method, CTLFLAG_RW,
- &lwkt_spin_method, 0, "LWKT scheduler behavior when contended");
+ &lwkt_spin_loops, 0, "Scheduler spin loops until sorted decon");
+static int lwkt_spin_reseq = 0;
+SYSCTL_INT(_lwkt, OID_AUTO, spin_reseq, CTLFLAG_RW,
+ &lwkt_spin_reseq, 0, "Scheduler resequencer enable");
+static int lwkt_spin_monitor = 0;
+SYSCTL_INT(_lwkt, OID_AUTO, spin_monitor, CTLFLAG_RW,
+ &lwkt_spin_monitor, 0, "Scheduler uses monitor/mwait");
static int lwkt_spin_fatal = 0; /* disabled */
SYSCTL_INT(_lwkt, OID_AUTO, spin_fatal, CTLFLAG_RW,
&lwkt_spin_fatal, 0, "LWKT scheduler spin loops till fatal panic");
td->td_flags &= ~TDF_RUNQ;
TAILQ_REMOVE(&gd->gd_tdrunq, td, td_threadq);
+
gd->gd_fairq_total_pri -= td->td_pri;
if (TAILQ_FIRST(&gd->gd_tdrunq) == NULL)
atomic_clear_int(&gd->gd_reqflags, RQF_RUNNING);
+
+ /*td->td_fairq_lticks = ticks;*/
}
}
TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
atomic_set_int(&gd->gd_reqflags, RQF_RUNNING);
} else {
- while (xtd && xtd->td_pri > td->td_pri)
+ while (xtd && xtd->td_pri >= td->td_pri)
xtd = TAILQ_NEXT(xtd, td_threadq);
if (xtd)
TAILQ_INSERT_BEFORE(xtd, td, td_threadq);
TAILQ_INSERT_TAIL(&gd->gd_tdrunq, td, td_threadq);
}
gd->gd_fairq_total_pri += td->td_pri;
+
+ /*
+ * The thread might have been dequeued for a while, bump its
+ * fairq.
+ */
+ if (td->td_fairq_lticks != ticks) {
+ td->td_fairq_lticks = ticks;
+ lwkt_fairq_accumulate(gd, td);
+ }
}
}
thread_t td = gd->gd_curthread;
thread_t ntd;
thread_t xtd;
- int spinning = lwkt_spin_loops; /* loops before HLTing */
- int reqflags;
- int cseq;
- int oseq;
- int fatal_count;
+ int spinning = 0;
KKASSERT(gd->gd_processing_ipiq == 0);
}
/*
+ * Update the fairq accumulator if we are switching away in a
+ * different tick.
+ */
+ lwkt_fairq_tick(gd, td);
+
+ /*
* Implement round-robin fairq with priority insertion. The priority
* insertion is handled by _lwkt_enqueue()
*
*/
for (;;) {
/*
- * Clear RQF_AST_LWKT_RESCHED (we handle the reschedule request)
- * and set RQF_WAKEUP (prevent unnecessary IPIs from being
- * received).
+ * We have already docked the current thread. If we get stuck in a
+ * scheduler switching loop we do not want to dock it over and over
+ * again. Reset lticks.
*/
- for (;;) {
- reqflags = gd->gd_reqflags;
- if (atomic_cmpset_int(&gd->gd_reqflags, reqflags,
- (reqflags & ~RQF_AST_LWKT_RESCHED) |
- RQF_WAKEUP)) {
- break;
- }
- }
+ if (td != &gd->gd_idlethread)
+ td->td_fairq_lticks = ticks;
+
+ clear_lwkt_resched();
/*
* Hotpath - pull the head of the run queue and attempt to schedule
if (ntd == NULL) {
/*
- * Runq is empty, switch to idle and clear RQF_WAKEUP
- * to allow it to halt.
+ * Runq is empty, switch to idle to allow it to halt.
*/
ntd = &gd->gd_idlethread;
#ifdef SMP
#endif
cpu_time.cp_msg[0] = 0;
cpu_time.cp_stallpc = 0;
- atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP);
goto haveidle;
}
+ break;
+#if 0
if (ntd->td_fairq_accum >= 0)
break;
lwkt_fairq_accumulate(gd, ntd);
TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
TAILQ_INSERT_TAIL(&gd->gd_tdrunq, ntd, td_threadq);
+#endif
}
/*
- * Hotpath - schedule ntd. Leaves RQF_WAKEUP set to prevent
- * unwanted decontention IPIs.
+ * Hotpath - schedule ntd.
*
* NOTE: For UP there is no mplock and lwkt_getalltokens()
* always succeeds.
*/
- if (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd))
+ if (TD_TOKS_NOT_HELD(ntd) ||
+ lwkt_getalltokens(ntd, (spinning >= lwkt_spin_loops)))
+ {
goto havethread;
+ }
/*
* Coldpath (SMP only since tokens always succeed on UP)
*
* We had some contention on the thread we wanted to schedule.
* What we do now is try to find a thread that we can schedule
- * in its stead until decontention reschedules on our cpu.
+ * in its stead.
*
* The coldpath scan does NOT rearrange threads in the run list
- * and it also ignores the accumulator.
+ * and it also ignores the accumulator. We locate the thread with
+ * the highest accumulator value (positive or negative), then the
+ * next highest, and so forth. This isn't the most efficient but
+ * will theoretically try to schedule one thread per pass which
+ * is not horrible.
*
- * We do not immediately schedule a user priority thread, instead
- * we record it in xtd and continue looking for kernel threads.
- * A cpu can only have one user priority thread (normally) so just
- * record the first one.
+ * If the accumulator for the selected thread happens to be negative
+ * the timer interrupt will come along and ask for another reschedule
+ * within 1 tick.
*
* NOTE: This scan will also include threads whos fairq's were
* accumulated in the first loop.
*/
+#ifdef INVARIANTS
++token_contention_count;
+#endif
+
+ if (fairq_bypass)
+ goto skip;
+
+ need_lwkt_resched();
+ xtd = NULL;
+ while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
+#if 0
+ if (ntd->td_fairq_accum < 0)
+ continue;
+ if (xtd == NULL || ntd->td_pri > xtd->td_pri)
+ xtd = ntd;
+#endif
+ if (TD_TOKS_NOT_HELD(ntd) ||
+ lwkt_getalltokens(ntd, (spinning >= lwkt_spin_loops))) {
+ goto havethread;
+ }
+ }
+#if 0
+ if (xtd) {
+ if (TD_TOKS_NOT_HELD(xtd) ||
+ lwkt_getalltokens(xtd, (spinning >= lwkt_spin_loops)))
+ {
+ ntd = xtd;
+ goto havethread;
+ }
+ }
+#endif
+
+#if 0
+ if (fairq_bypass)
+ goto skip;
+
xtd = NULL;
while ((ntd = TAILQ_NEXT(ntd, td_threadq)) != NULL) {
/*
- * Try to switch to this thread. If the thread is running at
- * user priority we clear WAKEUP to allow decontention IPIs
- * (since this thread is simply running until the one we wanted
- * decontends), and we make sure that LWKT_RESCHED is not set.
- *
- * Otherwise for kernel threads we leave WAKEUP set to avoid
- * unnecessary decontention IPIs.
+ * Try to switch to this thread. Kernel threads have priority
+ * over user threads in this case.
*/
if (ntd->td_pri < TDPRI_KERN_LPSCHED) {
if (xtd == NULL)
continue;
}
- /*
- * Do not let the fairq get too negative. Even though we are
- * ignoring it atm once the scheduler decontends a very negative
- * thread will get moved to the end of the queue.
- */
- if (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd)) {
- if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd))
- ntd->td_fairq_accum = -TDFAIRQ_MAX(gd);
+ if (TD_TOKS_NOT_HELD(ntd) ||
+ lwkt_getalltokens(ntd, (spinning >= lwkt_spin_loops)))
+ {
goto havethread;
}
-
- /*
- * Well fubar, this thread is contended as well, loop
- */
- /* */
+ /* thread contested, try another */
}
/*
* We exhausted the run list but we may have recorded a user
- * thread to try. We have three choices based on
- * lwkt.decontention_method.
- *
- * (0) Atomically clear RQF_WAKEUP in order to receive decontention
- * IPIs (to interrupt the user process) and test
- * RQF_AST_LWKT_RESCHED at the same time.
- *
- * This results in significant decontention IPI traffic but may
- * be more responsive.
- *
- * (1) Leave RQF_WAKEUP set so we do not receive a decontention IPI.
- * An automatic LWKT reschedule will occur on the next hardclock
- * (typically 100hz).
- *
- * This results in no decontention IPI traffic but may be less
- * responsive. This is the default.
- *
- * (2) Refuse to schedule the user process at this time.
- *
- * This is highly experimental and should not be used under
- * normal circumstances. This can cause a user process to
- * get starved out in situations where kernel threads are
- * fighting each other for tokens.
+ * thread to try.
*/
if (xtd) {
ntd = xtd;
-
- switch(lwkt_spin_method) {
- case 0:
- for (;;) {
- reqflags = gd->gd_reqflags;
- if (atomic_cmpset_int(&gd->gd_reqflags,
- reqflags,
- reqflags & ~RQF_WAKEUP)) {
- break;
- }
- }
- break;
- case 1:
- reqflags = gd->gd_reqflags;
- break;
- default:
- goto skip;
- break;
- }
- if ((reqflags & RQF_AST_LWKT_RESCHED) == 0 &&
- (TD_TOKS_NOT_HELD(ntd) || lwkt_getalltokens(ntd))
+ if ((gd->gd_reqflags & RQF_AST_LWKT_RESCHED) == 0 &&
+ (TD_TOKS_NOT_HELD(ntd) ||
+ lwkt_getalltokens(ntd, (spinning >= lwkt_spin_loops)))
) {
- if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd))
- ntd->td_fairq_accum = -TDFAIRQ_MAX(gd);
goto havethread;
}
-
-skip:
- /*
- * Make sure RQF_WAKEUP is set if we failed to schedule the
- * user thread to prevent the idle thread from halting.
- */
- atomic_set_int(&gd->gd_reqflags, RQF_WAKEUP);
}
+#endif
+skip:
/*
* We exhausted the run list, meaning that all runnable threads
- * are contended.
+ * are contested.
*/
cpu_pause();
ntd = &gd->gd_idlethread;
#endif
/*
- * Ok, we might want to spin a few times as some tokens are held for
- * very short periods of time and IPI overhead is 1uS or worse
- * (meaning it is usually better to spin). Regardless we have to
- * call splz_check() to be sure to service any interrupts blocked
- * by our critical section, otherwise we could livelock e.g. IPIs.
- *
- * The IPI mechanic is really a last resort. In nearly all other
- * cases RQF_WAKEUP is left set to prevent decontention IPIs.
+ * We are going to have to retry but if the current thread is not
+ * on the runq we instead switch through the idle thread to get away
+ * from the current thread. We have to flag for lwkt reschedule
+ * to prevent the idle thread from halting.
*
- * When we decide not to spin we clear RQF_WAKEUP and switch to
- * the idle thread. Clearing RQF_WEAKEUP allows the idle thread
- * to halt and decontended tokens will issue an IPI to us. The
- * idle thread will check for pending reschedules already set
- * (RQF_AST_LWKT_RESCHED) before actually halting so we don't have
- * to here.
- *
- * Also, if TDF_RUNQ is not set the current thread is trying to
- * deschedule, possibly in an atomic fashion. We cannot afford to
- * stay here.
+ * NOTE: A non-zero spinning is passed to lwkt_getalltokens() to
+ * instruct it to deal with the potential for deadlocks by
+ * ordering the tokens by address.
*/
- if (spinning <= 0 || (td->td_flags & TDF_RUNQ) == 0) {
- atomic_clear_int(&gd->gd_reqflags, RQF_WAKEUP);
+ if ((td->td_flags & TDF_RUNQ) == 0) {
+ need_lwkt_resched();
goto haveidle;
}
- --spinning;
-
- /*
- * When spinning a delay is required both to avoid livelocks from
- * token order reversals (a thread may be trying to acquire multiple
- * tokens), and also to reduce cpu cache management traffic.
- *
- * In order to scale to a large number of CPUs we use a time slot
- * resequencer to force contending cpus into non-contending
- * time-slots. The scheduler may still contend with the lock holder
- * but will not (generally) contend with all the other cpus trying
- * trying to get the same token.
- *
- * The resequencer uses a FIFO counter mechanic. The owner of the
- * rindex at the head of the FIFO is allowed to pull itself off
- * the FIFO and fetchadd is used to enter into the FIFO. This bit
- * of code is VERY cache friendly and forces all spinning schedulers
- * into their own time slots.
- *
- * This code has been tested to 48-cpus and caps the cache
- * contention load at ~1uS intervals regardless of the number of
- * cpus. Scaling beyond 64 cpus might require additional smarts
- * (such as separate FIFOs for specific token cases).
- *
- * WARNING! We can't call splz_check() or anything else here as
- * it could cause a deadlock.
- */
#if defined(INVARIANTS) && defined(__amd64__)
if ((read_rflags() & PSL_I) == 0) {
cpu_enable_intr();
panic("lwkt_switch() called with interrupts disabled");
}
#endif
- cseq = atomic_fetchadd_int(&lwkt_cseq_windex, 1);
- fatal_count = lwkt_spin_fatal;
- while ((oseq = lwkt_cseq_rindex) != cseq) {
- cpu_ccfence();
-#if !defined(_KERNEL_VIRTUAL)
- if (cpu_mi_feature & CPU_MI_MONITOR) {
- cpu_mmw_pause_int(&lwkt_cseq_rindex, oseq);
- } else
+
+ /*
+ * Number of iterations so far. After a certain point we switch to
+ * a sorted-address/monitor/mwait version of lwkt_getalltokens()
+ */
+ if (spinning < 0x7FFFFFFF)
+ ++spinning;
+
+#ifdef SMP
+ /*
+ * lwkt_getalltokens() failed in sorted token mode, we can use
+ * monitor/mwait in this case.
+ */
+ if (spinning >= lwkt_spin_loops &&
+ (cpu_mi_feature & CPU_MI_MONITOR) &&
+ lwkt_spin_monitor)
+ {
+ cpu_mmw_pause_int(&gd->gd_reqflags,
+ (gd->gd_reqflags | RQF_SPINNING) &
+ ~RQF_IDLECHECK_WK_MASK);
+ }
+#endif
+
+ /*
+ * We already checked that td is still scheduled so this should be
+ * safe.
+ */
+ splz_check();
+
+ /*
+ * This experimental resequencer is used as a fall-back to reduce
+ * hw cache line contention by placing each core's scheduler into a
+ * time-domain-multiplexed slot.
+ *
+ * The resequencer is disabled by default. Its functionality has
+ * largely been superseded by the token algorithm which limits races
+ * to a subset of cores.
+ *
+ * The resequencer algorithm tends to break down when more than
+ * 20 cores are contending. What appears to happen is that new
+ * tokens can be obtained out of address-sorted order by new cores
+ * while existing cores languish in long delays between retries and
+ * wind up being starved-out of the token acquisition.
+ */
+ if (lwkt_spin_reseq && spinning >= lwkt_spin_reseq) {
+ int cseq = atomic_fetchadd_int(&lwkt_cseq_windex, 1);
+ int oseq;
+
+ while ((oseq = lwkt_cseq_rindex) != cseq) {
+ cpu_ccfence();
+#if 1
+ if (cpu_mi_feature & CPU_MI_MONITOR) {
+ cpu_mmw_pause_int(&lwkt_cseq_rindex, oseq);
+ } else {
+#endif
+ cpu_pause();
+ cpu_lfence();
+#if 1
+ }
#endif
- {
- DELAY(1);
- cpu_lfence();
}
- if (fatal_count && --fatal_count == 0)
- panic("lwkt_switch: fatal spin wait");
+ DELAY(1);
+ atomic_add_int(&lwkt_cseq_rindex, 1);
}
- cseq = lwkt_spin_delay; /* don't trust the system operator */
- cpu_ccfence();
- if (cseq < 1)
- cseq = 1;
- if (cseq > 1000)
- cseq = 1000;
- DELAY(cseq);
- atomic_add_int(&lwkt_cseq_rindex, 1);
- splz_check(); /* ok, we already checked that td is still scheduled */
/* highest level for(;;) loop */
}
havethread:
/*
+ * The thread may have been sitting in the runq for a while, be sure
+ * to reset td_fairq_lticks to avoid an improper scheduling tick against
+ * the thread if it gets dequeued again quickly.
+ *
* We must always decrement td_fairq_accum on non-idle threads just
* in case a thread never gets a tick due to being in a continuous
* critical section. The page-zeroing code does this, for example.
- *
+ */
+ /* ntd->td_fairq_lticks = ticks; */
+ --ntd->td_fairq_accum;
+ if (ntd->td_fairq_accum < -TDFAIRQ_MAX(gd))
+ ntd->td_fairq_accum = -TDFAIRQ_MAX(gd);
+
+ /*
* If the thread we came up with is a higher or equal priority verses
* the thread at the head of the queue we move our thread to the
* front. This way we can always check the front of the queue.
* Clear gd_idle_repeat when doing a normal switch to a non-idle
* thread.
*/
- ++gd->gd_cnt.v_swtch;
- --ntd->td_fairq_accum;
ntd->td_wmesg = NULL;
+ ++gd->gd_cnt.v_swtch;
xtd = TAILQ_FIRST(&gd->gd_tdrunq);
if (ntd != xtd && ntd->td_pri >= xtd->td_pri) {
TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
lwkt_switch_return(td->td_switch(ntd));
/* ntd invalid, td_switch() can return a different thread_t */
}
+
+#if 1
+ /*
+ * catch-all
+ */
+ splz_check();
+#endif
+
/* NOTE: current cpu may have changed after switch */
crit_exit_quick(td);
}
*/
KASSERT(ntd->td_critcount, ("BADCRIT0 %d", ntd->td_pri));
+ td = gd->gd_curthread;
if (preempt_enable == 0) {
+ if (ntd->td_pri > td->td_pri)
+ need_lwkt_resched();
++preempt_miss;
return;
}
-
- td = gd->gd_curthread;
if (ntd->td_pri <= td->td_pri) {
++preempt_miss;
return;
KKASSERT(ntd->td_preempted && (td->td_flags & TDF_PREEMPT_DONE));
ntd->td_preempted = NULL;
td->td_flags &= ~(TDF_PREEMPT_LOCK|TDF_PREEMPT_DONE);
+#if 1
+ /*
+ * catch-all
+ */
+ splz_check();
+#endif
}
/*
}
/*
- * Give the thread a little fair share scheduler bump if it
- * has been asleep for a while. This is primarily to avoid
- * a degenerate case for interrupt threads where accumulator
- * crosses into negative territory unnecessarily.
+ * If we are in a different tick give the thread a cycle advantage.
+ * This is primarily to avoid a degenerate case for interrupt threads
+ * where accumulator crosses into negative territory unnecessarily.
*/
- if (ntd->td_fairq_lticks != ticks) {
- ntd->td_fairq_lticks = ticks;
- ntd->td_fairq_accum += gd->gd_fairq_total_pri;
- if (ntd->td_fairq_accum > TDFAIRQ_MAX(gd))
- ntd->td_fairq_accum = TDFAIRQ_MAX(gd);
- }
+ if (ntd->td_fairq_lticks != ticks)
+ lwkt_fairq_accumulate(gd, ntd);
}
}
if (fairq_enable) {
while (td) {
gd = td->td_gd;
- if (td != &gd->gd_idlethread) {
- td->td_fairq_accum -= gd->gd_fairq_total_pri;
- if (td->td_fairq_accum < -TDFAIRQ_MAX(gd))
- td->td_fairq_accum = -TDFAIRQ_MAX(gd);
- if (td->td_fairq_accum < 0)
- need_lwkt_resched();
- td->td_fairq_lticks = ticks;
- }
+ lwkt_fairq_tick(gd, td);
+ if (td->td_fairq_accum < 0)
+ need_lwkt_resched();
td = td->td_preempted;
}
}
td->td_fairq_accum = TDFAIRQ_MAX(td->td_gd);
}
+static int /* dock td's fairq accumulator at most once per tick; returns TRUE if docked */
+lwkt_fairq_tick(globaldata_t gd, thread_t td)
+{
+ if (td->td_fairq_lticks != ticks && td != &gd->gd_idlethread) { /* skip if already handled this tick, or if td is gd's idle thread */
+ td->td_fairq_lticks = ticks; /* remember the tick we docked in */
+ td->td_fairq_accum -= gd->gd_fairq_total_pri; /* charge by gd's summed runq priority */
+ if (td->td_fairq_accum < -TDFAIRQ_MAX(gd)) /* clamp at the negative limit */
+ td->td_fairq_accum = -TDFAIRQ_MAX(gd);
+ return TRUE; /* accumulator was adjusted */
+ }
+ return FALSE; /* nothing changed */
+}
+
/*
* Migrate the current thread to the specified cpu.
*
#include <machine/stdarg.h>
#include <machine/smp.h>
+extern int lwkt_sched_debug;
+
#ifndef LWKT_NUM_POOL_TOKENS
-#define LWKT_NUM_POOL_TOKENS 1024 /* power of 2 */
+#define LWKT_NUM_POOL_TOKENS 4001 /* prime number */
#endif
-#define LWKT_MASK_POOL_TOKENS (LWKT_NUM_POOL_TOKENS - 1)
static lwkt_token pool_tokens[LWKT_NUM_POOL_TOKENS];
struct lwkt_token vnode_token = LWKT_TOKEN_INITIALIZER(vnode_token);
struct lwkt_token vmobj_token = LWKT_TOKEN_INITIALIZER(vmobj_token);
-static int lwkt_token_ipi_dispatch = 4;
-SYSCTL_INT(_lwkt, OID_AUTO, token_ipi_dispatch, CTLFLAG_RW,
- &lwkt_token_ipi_dispatch, 0, "Number of IPIs to dispatch on token release");
+static int lwkt_token_spin = 5;
+SYSCTL_INT(_lwkt, OID_AUTO, token_spin, CTLFLAG_RW,
+ &lwkt_token_spin, 0, "Decontention spin loops");
+static int lwkt_token_delay = 0;
+SYSCTL_INT(_lwkt, OID_AUTO, token_delay, CTLFLAG_RW,
+ &lwkt_token_delay, 0, "Decontention spin delay in ns");
/*
* The collision count is bumped every time the LWKT scheduler fails
SYSCTL_LONG(_lwkt, OID_AUTO, vnode_collisions, CTLFLAG_RW,
&vnode_token.t_collisions, 0, "Collision counter of vnode_token");
+static int _lwkt_getalltokens_sorted(thread_t td);
+
#ifdef SMP
/*
* Acquire the initial mplock
#endif
/*
- * Return a pool token given an address
+ * Return a pool token given an address. Use a prime number to reduce
+ * overlaps.
+ *
+ * The address is cast to u_int and reduced modulo LWKT_NUM_POOL_TOKENS,
+ * so the resulting index always falls within pool_tokens[]. The same
+ * address always maps to the same pool token; different addresses may
+ * share one.
*/
static __inline
lwkt_token_t
_lwkt_token_pool_lookup(void *ptr)
{
- int i;
+ u_int i;
- i = ((int)(intptr_t)ptr >> 2) ^ ((int)(intptr_t)ptr >> 12);
- return(&pool_tokens[i & LWKT_MASK_POOL_TOKENS]);
+ i = (u_int)(uintptr_t)ptr % LWKT_NUM_POOL_TOKENS;
+ return(&pool_tokens[i]);
}
/*
ref->tr_owner = td;
}
-#ifdef SMP
-/*
- * Force a LWKT reschedule on the target cpu when a requested token
- * becomes available.
- */
static
+int /* TRUE if tok->t_ref was set to ref, FALSE if every attempt failed */
+_lwkt_trytoken_spin(lwkt_token_t tok, lwkt_tokref_t ref)
+{
+ int n;
+
+ for (n = 0; n < lwkt_token_spin; ++n) { /* attempt count comes from the lwkt.token_spin sysctl */
+ if (tok->t_ref == NULL && /* plain read first; the atomic cmpset is only tried when the token looks free */
+ atomic_cmpset_ptr(&tok->t_ref, NULL, ref)) {
+ return TRUE;
+ }
+ if (lwkt_token_delay) { /* lwkt.token_delay sysctl (described as ns); zero selects the pause path */
+ tsc_delay(lwkt_token_delay);
+ } else {
+ cpu_lfence();
+ cpu_pause();
+ }
+ }
+ return FALSE;
+}
+
+static __inline
void
-lwkt_reltoken_mask_remote(void *arg, int arg2, struct intrframe *frame)
+_lwkt_reltoken_spin(lwkt_token_t tok)
{
- need_lwkt_resched();
+ tok->t_ref = NULL; /* release: the acquire paths test for a NULL t_ref before attempting their cmpset */
}
-#endif
+#if 0
/*
- * This bit of code sends a LWKT reschedule request to whatever other cpus
- * had contended on the token being released. We could wake up all the cpus
- * but generally speaking if there is a lot of contention we really only want
- * to wake up a subset of cpus to avoid aggregating O(N^2) IPIs. The current
- * cpuid is used as a basis to select which other cpus to wake up.
- *
- * For the selected cpus we can avoid issuing the actual IPI if the target
- * cpu's RQF_WAKEUP is already set. In this case simply setting the
- * reschedule flag RQF_AST_LWKT_RESCHED will be sufficient.
+ * Helper function used by lwkt_getalltokens[_sorted]().
*
- * lwkt.token_ipi_dispatch specifies the maximum number of IPIs to dispatch
- * on a token release.
+ * Our attempt to acquire the token has failed. To reduce cache coherency
+ * bandwidth we set our cpu bit in t_collmask then wait for a reasonable
+ * period of time for a hand-off from the current token owner.
*/
-static __inline
-void
-_lwkt_reltoken_mask(lwkt_token_t tok)
+static
+int
+_lwkt_trytoken_spin(lwkt_token_t tok, lwkt_tokref_t ref)
{
-#ifdef SMP
- globaldata_t ngd;
+ globaldata_t gd = mycpu;
cpumask_t mask;
- cpumask_t tmpmask;
- cpumask_t wumask; /* wakeup mask */
- cpumask_t remask; /* clear mask */
- int wucount; /* wakeup count */
- int cpuid;
- int reqflags;
+ int n;
/*
- * Mask of contending cpus we want to wake up.
+ * Add our cpu to the collision mask and wait for the token to be
+ * handed off to us.
*/
- mask = tok->t_collmask;
- cpu_ccfence();
- if (mask == 0)
- return;
+ crit_enter();
+ atomic_set_cpumask(&tok->t_collmask, gd->gd_cpumask);
+ for (n = 0; n < lwkt_token_spin; ++n) {
+ /*
+ * Token was released before we set our collision bit.
+ */
+ if (tok->t_ref == NULL &&
+ atomic_cmpset_ptr(&tok->t_ref, NULL, ref)) {
+ KKASSERT((tok->t_collmask & gd->gd_cpumask) != 0);
+ atomic_clear_cpumask(&tok->t_collmask, gd->gd_cpumask);
+ crit_exit();
+ return TRUE;
+ }
- /*
- * Degenerate case - IPI to all contending cpus
- */
- wucount = lwkt_token_ipi_dispatch;
- if (wucount <= 0 || wucount >= ncpus) {
- wucount = 0;
- wumask = mask;
- remask = mask;
- } else {
- wumask = 0;
- remask = 0;
+ /*
+ * Token was handed-off to us.
+ */
+ if (tok->t_ref == &gd->gd_handoff) {
+ KKASSERT((tok->t_collmask & gd->gd_cpumask) == 0);
+ tok->t_ref = ref;
+ crit_exit();
+ return TRUE;
+ }
+ if (lwkt_token_delay)
+ tsc_delay(lwkt_token_delay);
+ else
+ cpu_pause();
}
/*
- * Calculate which cpus to IPI. These cpus are potentially in a
- * HLT state waiting for token contention to go away.
- *
- * Ask the cpu LWKT scheduler to reschedule by setting
- * RQF_AST_LWKT_RESCHEDULE. Signal the cpu if RQF_WAKEUP is not
- * set (otherwise it has already been signalled or will check the
- * flag very soon anyway). Both bits must be adjusted atomically
- * all in one go to avoid races.
- *
- * The collision mask is cleared for all cpus we set the resched
- * flag for, but we only IPI the ones that need signalling.
+ * We failed, attempt to clear our bit in the cpumask. We may race
+ * someone handing-off to us. If someone other than us cleared our
+ * cpu bit a handoff is incoming and we must wait for it.
*/
- while (wucount && mask) {
- tmpmask = mask & ~(CPUMASK(mycpu->gd_cpuid) - 1);
- if (tmpmask)
- cpuid = BSFCPUMASK(tmpmask);
- else
- cpuid = BSFCPUMASK(mask);
- ngd = globaldata_find(cpuid);
- for (;;) {
- reqflags = ngd->gd_reqflags;
- if (atomic_cmpset_int(&ngd->gd_reqflags, reqflags,
- reqflags |
- (RQF_WAKEUP |
- RQF_AST_LWKT_RESCHED))) {
- break;
+ for (;;) {
+ mask = tok->t_collmask;
+ cpu_ccfence();
+ if (mask & gd->gd_cpumask) {
+ if (atomic_cmpset_cpumask(&tok->t_collmask,
+ mask,
+ mask & ~gd->gd_cpumask)) {
+ crit_exit();
+ return FALSE;
}
+ continue;
}
- if ((reqflags & RQF_WAKEUP) == 0) {
- wumask |= CPUMASK(cpuid);
- --wucount;
+ if (tok->t_ref != &gd->gd_handoff) {
+ cpu_pause();
+ continue;
}
- remask |= CPUMASK(cpuid);
- mask &= ~CPUMASK(cpuid);
+ tok->t_ref = ref;
+ crit_exit();
+ return TRUE;
+ }
+}
+
+/*
+ * Release token with hand-off
+ */
+static __inline
+void
+_lwkt_reltoken_spin(lwkt_token_t tok)
+{
+ globaldata_t xgd;
+ cpumask_t sidemask;
+ cpumask_t mask;
+ int cpuid;
+
+ if (tok->t_collmask == 0) {
+ tok->t_ref = NULL;
+ return;
}
- if (remask) {
- atomic_clear_cpumask(&tok->t_collmask, remask);
- lwkt_send_ipiq3_mask(wumask, lwkt_reltoken_mask_remote,
- NULL, 0);
+
+ crit_enter();
+ sidemask = ~(mycpu->gd_cpumask - 1); /* high bits >= xcpu */
+ for (;;) {
+ mask = tok->t_collmask;
+ cpu_ccfence();
+ if (mask == 0) {
+ tok->t_ref = NULL;
+ break;
+ }
+ if (mask & sidemask)
+ cpuid = BSFCPUMASK(mask & sidemask);
+ else
+ cpuid = BSFCPUMASK(mask);
+ xgd = globaldata_find(cpuid);
+ if (atomic_cmpset_cpumask(&tok->t_collmask, mask,
+ mask & ~CPUMASK(cpuid))) {
+ tok->t_ref = &xgd->gd_handoff;
+ break;
+ }
}
-#endif
+ crit_exit();
}
+#endif
+
+
/*
* Obtain all the tokens required by the specified thread on the current
* cpu, return 0 on failure and non-zero on success. If a failure occurs
* lwkt_getalltokens is called by the LWKT scheduler to acquire all
* tokens that the thread had acquired prior to going to sleep.
*
- * We always clear the collision mask on token aquision.
+ * If spinning is non-zero this function acquires the tokens in a particular
+ * order to deal with potential deadlocks. We simply use address order for
+ * this case.
*
* Called from a critical section.
*/
int
-lwkt_getalltokens(thread_t td)
+lwkt_getalltokens(thread_t td, int spinning)
{
lwkt_tokref_t scan;
lwkt_tokref_t ref;
lwkt_token_t tok;
+ if (spinning)
+ return(_lwkt_getalltokens_sorted(td));
+
/*
* Acquire tokens in forward order, assign or validate tok->t_ref.
*/
*/
ref = tok->t_ref;
if (ref == NULL) {
- if (atomic_cmpset_ptr(&tok->t_ref, NULL, scan))
- {
- if (tok->t_collmask & td->td_gd->gd_cpumask) {
- atomic_clear_cpumask(&tok->t_collmask,
- td->td_gd->gd_cpumask);
- }
+ if (atomic_cmpset_ptr(&tok->t_ref, NULL,scan))
break;
- }
continue;
}
if (ref >= &td->td_toks_base && ref < td->td_toks_stop)
break;
-#ifdef SMP
/*
- * Otherwise we failed to acquire all the tokens.
- * Undo and return. We have to try once more after
- * setting cpumask to cover possible races against
- * the checking of t_collmask.
+ * Try hard to acquire this token before giving up
+ * and releasing the whole lot.
*/
- atomic_set_cpumask(&tok->t_collmask,
- td->td_gd->gd_cpumask);
- if (atomic_cmpset_ptr(&tok->t_ref, NULL, scan)) {
- if (tok->t_collmask & td->td_gd->gd_cpumask) {
- atomic_clear_cpumask(&tok->t_collmask,
- td->td_gd->gd_cpumask);
- }
+ if (_lwkt_trytoken_spin(tok, scan))
break;
- }
-#endif
+ if (lwkt_sched_debug)
+ kprintf("toka %p %s\n", tok, tok->t_desc);
+
+ /*
+ * Otherwise we failed to acquire all the tokens.
+ * Release whatever we did get.
+ */
td->td_wmesg = tok->t_desc;
atomic_add_long(&tok->t_collisions, 1);
lwkt_relalltokens(td);
+
return(FALSE);
}
+
}
return (TRUE);
}
* Release all tokens owned by the specified thread on the current cpu.
*
* This code is really simple. Even in cases where we own all the tokens
- * note that t_ref may not match the scan for recursively held tokens,
- * or for the case where a lwkt_getalltokens() failed.
+ * note that t_ref may not match the scan for recursively held tokens which
+ * are held deeper in the stack, or for the case where a lwkt_getalltokens()
+ * failed.
*
- * The scheduler is responsible for maintaining the MP lock count, so
- * we don't need to deal with tr_flags here.
+ * Tokens are released in reverse order to reduce chasing race failures.
*
* Called from a critical section.
*/
lwkt_tokref_t scan;
lwkt_token_t tok;
- for (scan = &td->td_toks_base; scan < td->td_toks_stop; ++scan) {
+ for (scan = td->td_toks_stop - 1; scan >= &td->td_toks_base; --scan) {
+ /*for (scan = &td->td_toks_base; scan < td->td_toks_stop; ++scan) {*/
tok = scan->tr_tok;
- if (tok->t_ref == scan) {
- tok->t_ref = NULL;
- _lwkt_reltoken_mask(tok);
+ if (tok->t_ref == scan)
+ _lwkt_reltoken_spin(tok);
+ }
+}
+
+/*
+ * This is the decontention version of lwkt_getalltokens(). The tokens are
+ * acquired in address-sorted order to deal with any deadlocks. Ultimately
+ * token failures will spin into the scheduler and get here.
+ *
+ * In addition, to reduce hardware cache coherency contention monitor/mwait
+ * is interlocked with gd->gd_reqflags and RQF_SPINNING. Other cores which
+ * release a contended token will clear RQF_SPINNING and cause the mwait
+ * to resume. Any interrupt will also generally set RQF_* flags and cause
+ * mwait to resume (or be a NOP in the first place).
+ *
+ * This code is required to set up RQF_SPINNING in case of failure. The
+ * caller may call monitor/mwait on gd->gd_reqflags on failure. We do NOT
+ * want to call mwait here, and doubly so while we are holding tokens.
+ *
+ * NOTE(review): no RQF_SPINNING manipulation is present in the function
+ * body as written; the only reference is compiled out with #if 0 just
+ * before the final return, and the gd local is commented out. Confirm
+ * where the flag is actually maintained before relying on the two
+ * paragraphs above.
+ *
+ * Returns TRUE when every token referenced between td_toks_base and
+ * td_toks_stop was acquired or was found to be held by a tokref in that
+ * same range. On the first token that cannot be obtained, td_wmesg is
+ * set to that token's t_desc, its t_collisions counter is incremented,
+ * the earlier sorted entries whose t_ref still matches are released in
+ * reverse order, and FALSE is returned.
+ *
+ * Called from critical section
+ */
+static
+int
+_lwkt_getalltokens_sorted(thread_t td)
+{
+ /*globaldata_t gd = td->td_gd;*/
+ lwkt_tokref_t sort_array[LWKT_MAXTOKENS];
+ lwkt_tokref_t scan;
+ lwkt_tokref_t ref;
+ lwkt_token_t tok;
+ int i;
+ int j;
+ int n;
+
+ /*
+ * Sort the token array. Yah yah, I know this isn't fun.
+ *
+ * NOTE: Recursively acquired tokens are ordered the same as in the
+ * td_toks_array so we can always get the earliest one first.
+ */
+ i = 0;
+ scan = &td->td_toks_base;
+ while (scan < td->td_toks_stop) {
+ for (j = 0; j < i; ++j) {
+ if (scan->tr_tok < sort_array[j]->tr_tok)
+ break;
}
+ if (j != i) {
+ bcopy(sort_array + j, sort_array + j + 1,
+ (i - j) * sizeof(lwkt_tokref_t));
+ }
+ sort_array[j] = scan;
+ ++scan;
+ ++i;
}
+ n = i;
+
+ /*
+ * Acquire tokens in forward order, assign or validate tok->t_ref.
+ */
+ for (i = 0; i < n; ++i) {
+ scan = sort_array[i];
+ tok = scan->tr_tok;
+ for (;;) {
+ /*
+ * Try to acquire the token if we do not already have
+ * it.
+ *
+ * NOTE: If atomic_cmpset_ptr() fails we have to
+ * loop and try again. It just means we
+ * lost a cpu race.
+ */
+ ref = tok->t_ref;
+ if (ref == NULL) {
+ if (atomic_cmpset_ptr(&tok->t_ref, NULL, scan))
+ break;
+ continue;
+ }
+
+ /*
+ * Someone holds the token.
+ *
+ * Test if ref is already recursively held by this
+ * thread. We cannot safely dereference tok->t_ref
+ * (it might belong to another thread and is thus
+ * unstable), but we don't have to. We can simply
+ * range-check it.
+ */
+ if (ref >= &td->td_toks_base && ref < td->td_toks_stop)
+ break;
+
+ /*
+ * Try hard to acquire this token before giving up
+ * and releasing the whole lot.
+ */
+ if (_lwkt_trytoken_spin(tok, scan))
+ break;
+ if (lwkt_sched_debug)
+ kprintf("tokb %p %s\n", tok, tok->t_desc);
+
+ /*
+ * Tokens are released in reverse order to reduce
+ * chasing race failures.
+ */
+ td->td_wmesg = tok->t_desc;
+ atomic_add_long(&tok->t_collisions, 1);
+
+ for (j = i - 1; j >= 0; --j) {
+ /*for (j = 0; j < i; ++j) {*/
+ scan = sort_array[j];
+ tok = scan->tr_tok;
+ if (tok->t_ref == scan)
+ _lwkt_reltoken_spin(tok);
+ }
+ return (FALSE);
+ }
+ }
+
+ /*
+ * We were successful, there is no need for another core to signal
+ * us.
+ */
+#if 0
+ atomic_clear_int(&gd->gd_reqflags, RQF_SPINNING);
+#endif
+ return (TRUE);
}
/*
return(TRUE);
/*
+ * Spin generously. This is preferable to just switching
+ * away unconditionally.
+ */
+ if (_lwkt_trytoken_spin(tok, nref))
+ return(TRUE);
+
+ /*
* Otherwise we failed, and it is not ok to attempt to
* acquire a token in a hard code section.
*/
* return tr_tok->t_ref should be assigned to this specific
* ref.
*/
-#ifdef SMP
-#if 0
- /*
- * (DISABLED ATM) - Do not set t_collmask on a token
- * acquisition failure, the scheduler will spin at least
- * once and deal with hlt/spin semantics.
- */
- atomic_set_cpumask(&tok->t_collmask, td->td_gd->gd_cpumask);
- if (atomic_cmpset_ptr(&tok->t_ref, NULL, ref)) {
- atomic_clear_cpumask(&tok->t_collmask,
- td->td_gd->gd_cpumask);
- return;
- }
-#endif
-#endif
td->td_wmesg = tok->t_desc;
atomic_add_long(&tok->t_collisions, 1);
logtoken(fail, ref);
* return tr_tok->t_ref should be assigned to this specific
* ref.
*/
-#ifdef SMP
-#if 0
- /*
- * (DISABLED ATM) - Do not set t_collmask on a token
- * acquisition failure, the scheduler will spin at least
- * once and deal with hlt/spin semantics.
- */
- atomic_set_cpumask(&tok->t_collmask, td->td_gd->gd_cpumask);
- if (atomic_cmpset_ptr(&tok->t_ref, NULL, ref)) {
- atomic_clear_cpumask(&tok->t_collmask,
- td->td_gd->gd_cpumask);
- goto success;
- }
-#endif
-#endif
td->td_wmesg = tok->t_desc;
atomic_add_long(&tok->t_collisions, 1);
logtoken(fail, ref);
logtoken(succ, ref);
KKASSERT(tok->t_ref == ref);
}
-#ifdef SMP
-#if 0
-success:
-#endif
-#endif
crit_enter_hard_gd(td->td_gd);
}
* return tr_tok->t_ref should be assigned to this specific
* ref.
*/
-#ifdef SMP
-#if 0
- /*
- * (DISABLED ATM) - Do not set t_collmask on a token
- * acquisition failure, the scheduler will spin at least
- * once and deal with hlt/spin semantics.
- */
- atomic_set_cpumask(&tok->t_collmask, td->td_gd->gd_cpumask);
- if (atomic_cmpset_ptr(&tok->t_ref, NULL, ref)) {
- atomic_clear_cpumask(&tok->t_collmask,
- td->td_gd->gd_cpumask);
- goto success;
- }
-#endif
-#endif
td->td_wmesg = tok->t_desc;
atomic_add_long(&tok->t_collisions, 1);
logtoken(fail, ref);
logtoken(succ, ref);
KKASSERT(tok->t_ref == ref);
}
-#ifdef SMP
-#if 0
-success:
-#endif
-#endif
return(tok);
}
*
* NOTE: The mplock is a token also so sequencing is a bit complex.
*/
- if (tok->t_ref == ref) {
- tok->t_ref = NULL;
- _lwkt_reltoken_mask(tok);
- }
+ if (tok->t_ref == ref)
+ _lwkt_reltoken_spin(tok);
cpu_sfence();
cpu_ccfence();
td->td_toks_stop = ref;
CTLFLAG_RW, &pipe_bkmem_alloc, 0, "pipe buffer from kmem");
#endif
+/*
+ * Auto-size pipe cache to reduce kmem allocations and frees.
+ *
+ * pipe_maxbig and pipe_maxcache are each scaled only while they still
+ * hold their compiled-in values (LIMITBIGPIPES and PIPEQ_MAX_CACHE
+ * respectively): doubled once when kmem_lim_size() reports at least
+ * 7 * 1024, and doubled a second time when it reports at least 15 * 1024.
+ * A value that no longer equals its compiled-in constant is left alone.
+ *
+ * NOTE(review): the local name suggests kmem_lim_size() returns
+ * megabytes -- confirm against its definition.
+ *
+ * The dummy argument is unused; the routine is registered through the
+ * SYSINIT() that follows the function.
+ */
+static
+void
+pipeinit(void *dummy)
+{
+ size_t mbytes = kmem_lim_size();
+
+ if (pipe_maxbig == LIMITBIGPIPES) {
+ if (mbytes >= 7 * 1024)
+ pipe_maxbig *= 2;
+ if (mbytes >= 15 * 1024)
+ pipe_maxbig *= 2;
+ }
+ if (pipe_maxcache == PIPEQ_MAX_CACHE) {
+ if (mbytes >= 7 * 1024)
+ pipe_maxcache *= 2;
+ if (mbytes >= 15 * 1024)
+ pipe_maxcache *= 2;
+ }
+}
+SYSINIT(kmem, SI_BOOT2_MACHDEP, SI_ORDER_ANY, pipeinit, NULL)
+
static void pipeclose (struct pipe *cpipe);
static void pipe_free_kmem (struct pipe *cpipe);
static int pipe_create (struct pipe **cpipep);
0);
if (!rv) {
- vm_object_reference (object);
+ vm_object_reference XXX (object);
rv = vm_map_wire (&kernel_map, kva, kva + PAGE_SIZE, 0);
if (!rv) {
tmap = map;
rv = vm_map_lookup (&tmap, pageno, VM_PROT_WRITE, &out_entry,
&object, &pindex, &out_prot, &wired);
- if (rv != KERN_SUCCESS) {
+ if (rv != KERN_SUCCESS)
return EINVAL;
- }
/*
* Okay, we've got the page. Let's release tmap.
*/
-
vm_map_lookup_done (tmap, out_entry, 0);
/*
* Fault the page in...
*/
-
rv = vm_fault(map, pageno, VM_PROT_WRITE|VM_PROT_READ, FALSE);
if (rv != KERN_SUCCESS)
return EFAULT;
VM_PROT_ALL, VM_PROT_ALL,
0);
if (!rv) {
- vm_object_reference (object);
+ vm_object_reference XXX (object);
rv = vm_map_wire (&kernel_map, kva, kva + PAGE_SIZE, 0);
if (!rv) {
}
shm_handle = shmseg->shm_internal;
- vm_object_reference(shm_handle->shm_object);
+ vm_object_hold(shm_handle->shm_object);
+ vm_object_reference_locked(shm_handle->shm_object);
rv = vm_map_find(&p->p_vmspace->vm_map,
shm_handle->shm_object, 0,
&attach_va,
VM_MAPTYPE_NORMAL,
prot, prot,
0);
+ vm_object_drop(shm_handle->shm_object);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(shm_handle->shm_object);
error = ENOMEM;
pctcpu = (lp->lwp_pctcpu * 10000 + FSCALE / 2) >> FSHIFT;
- if (pick->p_stat == SIDL || pick->p_stat == SZOMB)
+ if (pick->p_stat == SIDL || pick->p_stat == SZOMB) {
vmsz = 0;
- else
+ } else {
+ lwkt_gettoken(&pick->p_vmspace->vm_map.token);
vmsz = pgtok(vmspace_resident_count(pick->p_vmspace));
+ lwkt_reltoken(&pick->p_vmspace->vm_map.token);
+ }
crit_exit();
m = sf_buf_page(sf);
if (sf_buf_free(sf)) {
/* sf invalid now */
+ vm_page_busy_wait(m, FALSE, "sockpgf");
vm_page_unwire(m, 0);
+ vm_page_wakeup(m);
if (m->wire_count == 0 && m->object == NULL)
vm_page_try_to_free(m);
}
* interrupt can free the page) through to the
* vm_page_wire() call.
*/
- lwkt_gettoken(&vm_token);
- pg = vm_page_lookup(obj, pindex);
+ vm_object_hold(obj);
+ pg = vm_page_lookup_busy_try(obj, pindex, TRUE, &error);
+ if (error) {
+ vm_page_sleep_busy(pg, TRUE, "sfpbsy");
+ vm_object_drop(obj);
+ goto retry_lookup;
+ }
if (pg == NULL) {
pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
if (pg == NULL) {
vm_wait(0);
- lwkt_reltoken(&vm_token);
+ vm_object_drop(obj);
goto retry_lookup;
}
- vm_page_wire(pg);
- vm_page_wakeup(pg);
- } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
- lwkt_reltoken(&vm_token);
- goto retry_lookup;
- } else {
- vm_page_wire(pg);
}
- lwkt_reltoken(&vm_token);
+ vm_page_wire(pg);
+ vm_object_drop(obj);
/*
* If page is not valid for what we need, initiate I/O
* completes.
*/
vm_page_io_start(pg);
+ vm_page_wakeup(pg);
/*
* Get the page from backing store.
td->td_ucred);
vn_unlock(vp);
vm_page_flag_clear(pg, PG_ZERO);
+ vm_page_busy_wait(pg, FALSE, "sockpg");
vm_page_io_finish(pg);
if (error) {
- crit_enter();
vm_page_unwire(pg, 0);
+ vm_page_wakeup(pg);
vm_page_try_to_free(pg);
- crit_exit();
ssb_unlock(&so->so_snd);
goto done;
}
* but this wait can be interrupted.
*/
if ((sf = sf_buf_alloc(pg)) == NULL) {
- crit_enter();
vm_page_unwire(pg, 0);
+ vm_page_wakeup(pg);
vm_page_try_to_free(pg);
- crit_exit();
ssb_unlock(&so->so_snd);
error = EINTR;
goto done;
}
+ vm_page_wakeup(pg);
/*
* Get an mbuf header and set it up as having external storage.
*/
bogus_offset = kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
+ vm_object_hold(&kernel_object);
bogus_page = vm_page_alloc(&kernel_object,
(bogus_offset >> PAGE_SHIFT),
VM_ALLOC_NORMAL);
+ vm_object_drop(&kernel_object);
vmstats.v_wire_count++;
}
/*
* Set valid & dirty.
- *
- * WARNING! vfs_dirty_one_page() assumes vm_token is held for now.
*/
- lwkt_gettoken(&vm_token);
for (i = 0; i < bp->b_xio.xio_npages; i++) {
m = bp->b_xio.xio_pages[i];
vfs_dirty_one_page(bp, i, m);
}
- lwkt_reltoken(&vm_token);
bqrelse(bp);
}
resid = bp->b_bufsize;
foff = bp->b_loffset;
- lwkt_gettoken(&vm_token);
for (i = 0; i < bp->b_xio.xio_npages; i++) {
m = bp->b_xio.xio_pages[i];
vm_page_flag_clear(m, PG_ZERO);
obj = vp->v_object;
poff = OFF_TO_IDX(bp->b_loffset);
+ vm_object_hold(obj);
for (j = i; j < bp->b_xio.xio_npages; j++) {
vm_page_t mtmp;
}
}
bp->b_flags &= ~B_HASBOGUS;
+ vm_object_drop(obj);
if ((bp->b_flags & B_INVAL) == 0) {
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
}
if (bp->b_flags & (B_INVAL | B_RELBUF))
vfs_vmio_release(bp);
- lwkt_reltoken(&vm_token);
} else {
/*
* Rundown for non-VMIO buffers.
int i;
vm_page_t m;
- lwkt_gettoken(&vm_token);
for (i = 0; i < bp->b_xio.xio_npages; i++) {
m = bp->b_xio.xio_pages[i];
bp->b_xio.xio_pages[i] = NULL;
+ vm_page_busy_wait(m, FALSE, "vmiopg");
+
/*
* The VFS is telling us this is not a meta-data buffer
* even if it is backed by a block device.
*/
if ((m->flags & PG_BUSY) || (m->busy != 0)) {
vm_page_protect(m, VM_PROT_NONE);
+ vm_page_wakeup(m);
continue;
}
#if 0
if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
m->hold_count == 0) {
- vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
} else
* being cached for long periods of time.
*/
if (bp->b_flags & B_DIRECT) {
+ vm_page_wakeup(m);
vm_page_try_to_free(m);
} else if ((bp->b_flags & B_NOTMETA) ||
vm_page_count_severe()) {
m->act_count = bp->b_act_count;
+ vm_page_wakeup(m);
vm_page_try_to_cache(m);
} else {
m->act_count = bp->b_act_count;
+ vm_page_wakeup(m);
}
+ } else {
+ vm_page_wakeup(m);
}
}
- lwkt_reltoken(&vm_token);
pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
bp->b_xio.xio_npages);
vm_object_t obj;
vm_offset_t toff, tinc, size;
vm_page_t m;
+ int res = 1;
if (findblk(vp, loffset, FINDBLK_TEST))
return 1;
if (size > vp->v_mount->mnt_stat.f_iosize)
size = vp->v_mount->mnt_stat.f_iosize;
+ vm_object_hold(obj);
for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
- lwkt_gettoken(&vm_token);
m = vm_page_lookup(obj, OFF_TO_IDX(loffset + toff));
- lwkt_reltoken(&vm_token);
- if (m == NULL)
- return 0;
+ if (m == NULL) {
+ res = 0;
+ break;
+ }
tinc = size;
if (tinc > PAGE_SIZE - ((toff + loffset) & PAGE_MASK))
tinc = PAGE_SIZE - ((toff + loffset) & PAGE_MASK);
if (vm_page_is_valid(m,
- (vm_offset_t) ((toff + loffset) & PAGE_MASK), tinc) == 0)
- return 0;
+ (vm_offset_t) ((toff + loffset) & PAGE_MASK), tinc) == 0) {
+ res = 0;
+ break;
+ }
}
- return 1;
+ vm_object_drop(obj);
+ return (res);
}
/*
m = bp->b_xio.xio_pages[i];
KASSERT(m != bogus_page,
("allocbuf: bogus page found"));
- while (vm_page_sleep_busy(m, TRUE, "biodep"))
- ;
-
+ vm_page_busy_wait(m, TRUE, "biodep");
bp->b_xio.xio_pages[i] = NULL;
vm_page_unwire(m, 0);
+ vm_page_wakeup(m);
}
pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) +
(desiredpages << PAGE_SHIFT), (bp->b_xio.xio_npages - desiredpages));
vp = bp->b_vp;
obj = vp->v_object;
- lwkt_gettoken(&vm_token);
+ vm_object_hold(obj);
while (bp->b_xio.xio_npages < desiredpages) {
vm_page_t m;
vm_pindex_t pi;
+ int error;
- pi = OFF_TO_IDX(bp->b_loffset) + bp->b_xio.xio_npages;
- if ((m = vm_page_lookup(obj, pi)) == NULL) {
+ pi = OFF_TO_IDX(bp->b_loffset) +
+ bp->b_xio.xio_npages;
+
+ /*
+ * Blocking on m->busy might lead to a
+ * deadlock:
+ *
+ * vm_fault->getpages->cluster_read->allocbuf
+ */
+ m = vm_page_lookup_busy_try(obj, pi, FALSE,
+ &error);
+ if (error) {
+ vm_page_sleep_busy(m, FALSE, "pgtblk");
+ continue;
+ }
+ if (m == NULL) {
/*
* note: must allocate system pages
* since blocking here could intefere
}
/*
- * We found a page. If we have to sleep on it,
- * retry because it might have gotten freed out
- * from under us.
- *
- * We can only test PG_BUSY here. Blocking on
- * m->busy might lead to a deadlock:
- *
- * vm_fault->getpages->cluster_read->allocbuf
- *
+ * We found a page and were able to busy it.
*/
-
- if (vm_page_sleep_busy(m, FALSE, "pgtblk"))
- continue;
vm_page_flag_clear(m, PG_ZERO);
vm_page_wire(m);
+ vm_page_wakeup(m);
bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m;
++bp->b_xio.xio_npages;
if (bp->b_act_count < m->act_count)
bp->b_act_count = m->act_count;
}
- lwkt_reltoken(&vm_token);
+ vm_object_drop(obj);
/*
* Step 2. We've loaded the pages into the buffer,
bp->b_flags |= B_CACHE;
}
- lwkt_gettoken(&vm_token);
+ vm_object_hold(obj);
for (i = 0; i < bp->b_xio.xio_npages; i++) {
int bogusflag = 0;
int resid;
* already changed correctly (see bdwrite()), so we
* only need to do this here in the read case.
*/
+ vm_page_busy_wait(m, FALSE, "bpdpgw");
if (cmd == BUF_CMD_READ && !bogusflag && resid > 0) {
vfs_clean_one_page(bp, i, m);
}
panic("biodone: page busy < 0");
}
vm_page_io_finish(m);
+ vm_page_wakeup(m);
vm_object_pip_wakeup(obj);
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
iosize -= resid;
}
bp->b_flags &= ~B_HASBOGUS;
- lwkt_reltoken(&vm_token);
+ vm_object_drop(obj);
}
/*
runningbufwakeup(bp);
- lwkt_gettoken(&vm_token);
if (bp->b_flags & B_VMIO) {
struct vnode *vp = bp->b_vp;
vm_object_t obj;
obj = vp->v_object;
+ vm_object_hold(obj);
for (i = 0; i < bp->b_xio.xio_npages; i++) {
vm_page_t m = bp->b_xio.xio_pages[i];
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
bp->b_xio.xio_pages, bp->b_xio.xio_npages);
}
+ vm_page_busy_wait(m, FALSE, "bpdpgw");
vm_object_pip_wakeup(obj);
vm_page_flag_clear(m, PG_ZERO);
vm_page_io_finish(m);
+ vm_page_wakeup(m);
}
bp->b_flags &= ~B_HASBOGUS;
+ vm_object_drop(obj);
}
- lwkt_reltoken(&vm_token);
}
/*
if (bp->b_flags & B_VMIO) {
vm_object_t obj;
- lwkt_gettoken(&vm_token);
-
obj = vp->v_object;
KASSERT(bp->b_loffset != NOOFFSET,
("vfs_busy_pages: no buffer offset"));
/*
- * Loop until none of the pages are busy.
+ * Busy all the pages. We have to busy them all at once
+ * to avoid deadlocks.
*/
retry:
for (i = 0; i < bp->b_xio.xio_npages; i++) {
vm_page_t m = bp->b_xio.xio_pages[i];
- if (vm_page_sleep_busy(m, FALSE, "vbpage"))
+ if (vm_page_busy_try(m, FALSE)) {
+ vm_page_sleep_busy(m, FALSE, "vbpage");
+ while (--i >= 0)
+ vm_page_wakeup(bp->b_xio.xio_pages[i]);
goto retry;
+ }
}
/*
*/
vm_page_protect(m, VM_PROT_NONE);
}
+ vm_page_wakeup(m);
}
if (bogus) {
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
bp->b_xio.xio_pages, bp->b_xio.xio_npages);
}
- lwkt_reltoken(&vm_token);
}
/*
KASSERT(bp->b_loffset != NOOFFSET,
("vfs_clean_pages: no buffer offset"));
- /*
- * vm_token must be held for vfs_clean_one_page() calls.
- */
- lwkt_gettoken(&vm_token);
for (i = 0; i < bp->b_xio.xio_npages; i++) {
m = bp->b_xio.xio_pages[i];
vfs_clean_one_page(bp, i, m);
}
- lwkt_reltoken(&vm_token);
}
/*
* This routine is typically called after a read completes (dirty should
* be zero in that case as we are not called on bogus-replace pages),
* or before a write is initiated.
- *
- * NOTE: vm_token must be held by the caller, and vm_page_set_validclean()
- * currently assumes the vm_token is held.
*/
static void
vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m)
*
* WARNING! vm_page_set_validclean() currently assumes vm_token
* is held. The page might not be busied (bdwrite() case).
+ * XXX remove this comment once we've validated that this
+ * is no longer an issue.
*/
vm_page_set_validclean(m, soff & PAGE_MASK, eoff - soff);
}
* could intefere with paging I/O, no matter which
* process we are.
*/
+ vm_object_hold(&kernel_object);
p = bio_page_alloc(&kernel_object, pg >> PAGE_SHIFT,
(vm_pindex_t)((to - pg) >> PAGE_SHIFT));
+ vm_object_drop(&kernel_object);
if (p) {
vm_page_wire(p);
p->valid = VM_PAGE_BITS_ALL;
{
vm_page_t p;
+ ASSERT_LWKT_TOKEN_HELD(vm_object_token(obj));
+
/*
* Try a normal allocation, allow use of system reserve.
*/
- lwkt_gettoken(&vm_token);
p = vm_page_alloc(obj, pg, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM);
- if (p) {
- lwkt_reltoken(&vm_token);
+ if (p)
return(p);
- }
/*
* The normal allocation failed and we clearly have a page
* page now exists.
*/
if (vm_page_lookup(obj, pg)) {
- lwkt_reltoken(&vm_token);
return(NULL);
}
++lowmempgfails;
vm_wait(hz);
}
- lwkt_reltoken(&vm_token);
return(p);
}
index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
newnpages = index;
- lwkt_gettoken(&vm_token);
for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
p = bp->b_xio.xio_pages[index];
if (p && (index < bp->b_xio.xio_npages)) {
}
bp->b_xio.xio_pages[index] = NULL;
pmap_kremove(pg);
- vm_page_busy(p);
+ vm_page_busy_wait(p, FALSE, "vmhldpg");
vm_page_unwire(p, 0);
vm_page_free(p);
}
}
bp->b_xio.xio_npages = newnpages;
- lwkt_reltoken(&vm_token);
}
/*
*/
if (!TAILQ_EMPTY(&ncp->nc_list))
vhold(vp);
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
ncp->nc_vp = vp;
TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
if (ncp->nc_exlocks)
vhold(vp);
ncp->nc_error = ENOTCONN;
if ((vp = ncp->nc_vp) != NULL) {
atomic_add_int(&numcache, -1);
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
ncp->nc_vp = NULL;
TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
/*
* Any vp associated with an ncp with children is
struct namecache *next;
restart:
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
ncp = TAILQ_FIRST(&vp->v_namecache);
if (ncp)
_cache_hold(ncp);
/* loop entered with ncp held and vp spin-locked */
if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
_cache_hold(next);
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
_cache_lock(ncp);
if (ncp->nc_vp != vp) {
kprintf("Warning: cache_inval_vp: race-A detected on "
_cache_inval(ncp, flags);
_cache_put(ncp); /* also releases reference */
ncp = next;
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
if (ncp && ncp->nc_vp != vp) {
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
kprintf("Warning: cache_inval_vp: race-B detected on "
"%s\n", ncp->nc_name);
_cache_drop(ncp);
goto restart;
}
}
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}
struct namecache *ncp;
struct namecache *next;
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
ncp = TAILQ_FIRST(&vp->v_namecache);
if (ncp)
_cache_hold(ncp);
/* loop entered with ncp held */
if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
_cache_hold(next);
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
if (_cache_lock_nonblock(ncp)) {
_cache_drop(ncp);
if (next)
_cache_inval(ncp, 0);
_cache_put(ncp); /* also releases reference */
ncp = next;
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
if (ncp && ncp->nc_vp != vp) {
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
kprintf("Warning: cache_inval_vp: race-B detected on "
"%s\n", ncp->nc_name);
_cache_drop(ncp);
goto done;
}
}
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
done:
return(TAILQ_FIRST(&vp->v_namecache) != NULL);
}
* Handle the makeit == 0 degenerate case
*/
if (makeit == 0) {
- spin_lock(&dvp->v_spinlock);
+ spin_lock(&dvp->v_spin);
nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
if (nch->ncp)
cache_hold(nch);
- spin_unlock(&dvp->v_spinlock);
+ spin_unlock(&dvp->v_spin);
}
/*
/*
* Break out if we successfully acquire a working ncp.
*/
- spin_lock(&dvp->v_spinlock);
+ spin_lock(&dvp->v_spin);
nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
if (nch->ncp) {
cache_hold(nch);
- spin_unlock(&dvp->v_spinlock);
+ spin_unlock(&dvp->v_spin);
break;
}
- spin_unlock(&dvp->v_spinlock);
+ spin_unlock(&dvp->v_spin);
/*
* If dvp is the root of its filesystem it should already
break;
}
vn_unlock(pvp);
- spin_lock(&pvp->v_spinlock);
+ spin_lock(&pvp->v_spin);
if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
_cache_hold(nch.ncp);
- spin_unlock(&pvp->v_spinlock);
+ spin_unlock(&pvp->v_spin);
vrele(pvp);
break;
}
- spin_unlock(&pvp->v_spinlock);
+ spin_unlock(&pvp->v_spin);
if (pvp->v_flag & VROOT) {
nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
error = cache_resolve_mp(nch.mount);
if ((vn = p->p_textvp) == NULL)
return (EINVAL);
}
- spin_lock(&vn->v_spinlock);
+ spin_lock(&vn->v_spin);
TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
if (ncp->nc_nlen)
break;
}
if (ncp == NULL) {
- spin_unlock(&vn->v_spinlock);
+ spin_unlock(&vn->v_spin);
return (EINVAL);
}
_cache_hold(ncp);
- spin_unlock(&vn->v_spinlock);
+ spin_unlock(&vn->v_spin);
atomic_add_int(&numfullpathcalls, -1);
nch.ncp = ncp;;
cluster_append(&bp->b_bio1, tbp);
for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
vm_page_t m;
+
m = tbp->b_xio.xio_pages[j];
+ vm_page_busy_wait(m, FALSE, "clurpg");
vm_page_io_start(m);
+ vm_page_wakeup(m);
vm_object_pip_add(m->object, 1);
if ((bp->b_xio.xio_npages == 0) ||
(bp->b_xio.xio_pages[bp->b_xio.xio_npages-1] != m)) {
for (j = 0; j < tbp->b_xio.xio_npages; ++j) {
m = tbp->b_xio.xio_pages[j];
+ vm_page_busy_wait(m, FALSE, "clurpg");
vm_page_io_start(m);
+ vm_page_wakeup(m);
vm_object_pip_add(m->object, 1);
if ((bp->b_xio.xio_npages == 0) ||
(bp->b_xio.xio_pages[bp->b_xio.xio_npages - 1] != m)) {
struct nchandle nch;
nch.mount = vp->v_mount;
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
TAILQ_FOREACH(nch.ncp, &vp->v_namecache, nc_vnode) {
if ((nch.ncp->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0)
break;
}
if (nch.ncp) {
cache_hold(&nch);
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
jrecord_write_path(jrec, JLEAF_PATH_REF, nch.ncp);
cache_drop(&nch);
} else {
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
}
}
struct nchandle nch;
nch.mount = vp->v_mount;
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
TAILQ_FOREACH(nch.ncp, &vp->v_namecache, nc_vnode) {
if (nch.ncp == notncp)
continue;
}
if (nch.ncp) {
cache_hold(&nch);
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
jrecord_write_path(jrec, JLEAF_PATH_REF, nch.ncp);
cache_drop(&nch);
} else {
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
}
}
* This routine is only valid if the vnode is already either VFREE or
* VCACHED, or if it can become VFREE or VCACHED via vnode_terminate().
*
- * WARNING! This functions is typically called with v_spinlock held.
+ * WARNING! This function is typically called with v_spin held.
*
* MPSAFE
*/
* An auxiliary reference DOES NOT move a vnode out of the VFREE state
* once it has entered it.
*
- * WARNING! vhold() and vhold_interlocked() must not acquire v_spinlock.
+ * WARNING! vhold() and vhold_interlocked() must not acquire v_spin.
* The spinlock may or may not already be held by the caller.
* vdrop() will clean up the free list state.
*
* Remove an auxiliary reference from the vnode.
*
* vdrop needs to check for a VCACHE->VFREE transition to catch cases
- * where a vnode is held past its reclamation. We use v_spinlock to
+ * where a vnode is held past its reclamation. We use v_spin to
* interlock VCACHED -> !VCACHED transitions.
*
* MPSAFE
vdrop(struct vnode *vp)
{
KKASSERT(vp->v_sysref.refcnt != 0 && vp->v_auxrefs > 0);
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
atomic_subtract_int(&vp->v_auxrefs, 1);
if ((vp->v_flag & VCACHED) && vshouldfree(vp)) {
_vclrflags(vp, VCACHED);
__vfree(vp);
}
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
}
/*
if (vp->v_mount)
VOP_INACTIVE(vp);
}
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
KKASSERT((vp->v_flag & (VFREE|VCACHED)) == 0);
if (vshouldfree(vp))
__vfree(vp);
else
_vsetflags(vp, VCACHED); /* inactive but not yet free*/
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
vx_unlock(vp);
}
RB_INIT(&vp->v_rbclean_tree);
RB_INIT(&vp->v_rbdirty_tree);
RB_INIT(&vp->v_rbhash_tree);
+ spin_init(&vp->v_spin);
return(TRUE);
}
{
if (lockcountnb(&vp->v_lock))
return(EBUSY);
- return(lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT | LK_NOSPINWAIT));
+ return(lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT));
}
void
* We are allowed to reactivate the vnode while we hold
* the VX lock, assuming it can be reactivated.
*/
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
if (vp->v_flag & VFREE) {
__vbusy(vp);
sysref_activate(&vp->v_sysref);
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
sysref_put(&vp->v_sysref);
} else if (vp->v_flag & VCACHED) {
_vclrflags(vp, VCACHED);
sysref_activate(&vp->v_sysref);
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
sysref_put(&vp->v_sysref);
} else {
if (sysref_isinactive(&vp->v_sysref)) {
kprintf("Warning vp %p reactivation race\n",
vp);
}
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
}
_vclrflags(vp, VINACTIVE);
error = 0;
void
vx_put(struct vnode *vp)
{
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
if ((vp->v_flag & VCACHED) && vshouldfree(vp)) {
_vclrflags(vp, VCACHED);
__vfree(vp);
}
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
lockmgr(&vp->v_lock, LK_RELEASE);
sysref_put(&vp->v_sysref);
}
* Cycle if we can't.
*
* We use a bad hack in vx_lock_nonblock() which avoids
- * the lock order reversal between vfs_spin and v_spinlock.
+ * the lock order reversal between vfs_spin and v_spin.
* This is very fragile code and I don't want to use
* vhold here.
*/
{
struct namecache *ncp;
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
if (!TAILQ_EMPTY(&ncp->nc_list)) {
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
return(0);
}
}
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
return(1);
}
/*
* Debugging only
*/
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
filename = TAILQ_FIRST(&vp->v_namecache) ?
TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
/*
* Make sure no buffers were instantiated while we were trying
/*
* If the vnode has an object, destroy it.
*/
- lwkt_gettoken(&vmobj_token);
- object = vp->v_object;
+ while ((object = vp->v_object) != NULL) {
+ vm_object_hold(object);
+ if (object == vp->v_object)
+ break;
+ vm_object_drop(object);
+ }
+
if (object != NULL) {
/*
* Use vm_object_lock() rather than vm_object_hold to avoid
* creating an extra (self-)hold on the object.
- *
- * NOTE: vm_object_terminate() eats the object lock.
*/
- vm_object_lock(object);
- KKASSERT(object == vp->v_object);
if (object->ref_count == 0) {
- if ((object->flags & OBJ_DEAD) == 0) {
- /* eats object lock */
+ if ((object->flags & OBJ_DEAD) == 0)
vm_object_terminate(object);
- } else {
- vm_object_unlock(object);
- }
+ vm_object_drop(object);
vclrflags(vp, VOBJBUF);
} else {
vm_pager_deallocate(object);
vclrflags(vp, VOBJBUF);
- vm_object_unlock(object);
+ vm_object_drop(object);
}
}
- lwkt_reltoken(&vmobj_token);
KKASSERT((vp->v_flag & VOBJBUF) == 0);
/*
vm_object_t object;
int error = 0;
- lwkt_gettoken(&vmobj_token);
retry:
- if ((object = vp->v_object) == NULL) {
+ while ((object = vp->v_object) != NULL) {
+ vm_object_hold(object);
+ if (object == vp->v_object)
+ break;
+ vm_object_drop(object);
+ }
+
+ if (object == NULL) {
object = vnode_pager_alloc(vp, filesize, 0, 0, blksize, boff);
+
/*
* Dereference the reference we just created. This assumes
* that the object is associated with the vp.
*/
+ vm_object_hold(object);
object->ref_count--;
vrele(vp);
} else {
if (vp->v_object == object)
vm_object_dead_sleep(object, "vodead");
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ vm_object_drop(object);
goto retry;
}
}
KASSERT(vp->v_object != NULL, ("vinitvmio: NULL object"));
vsetflags(vp, VOBJBUF);
- lwkt_reltoken(&vmobj_token);
+ vm_object_drop(object);
return (error);
}
/*
* Debugging only
*/
- spin_lock(&vp->v_spinlock);
+ spin_lock(&vp->v_spin);
filename = TAILQ_FIRST(&vp->v_namecache) ?
TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
- spin_unlock(&vp->v_spinlock);
+ spin_unlock(&vp->v_spin);
/*
* Make sure no buffers were instantiated while we were trying
*/
if ((object = vp->v_object) == NULL)
return;
- if (length == vp->v_filesize)
+ vm_object_hold(object);
+ if (length == vp->v_filesize) {
+ vm_object_drop(object);
return;
+ }
/*
* Calculate the size of the VM object, coverage includes
* invalidated.
*/
pi = OFF_TO_IDX(length + PAGE_MASK);
- lwkt_gettoken(&vm_token);
while (pi < nobjsize) {
- do {
- m = vm_page_lookup(object, pi);
- } while (m && vm_page_sleep_busy(m, TRUE, "vsetsz"));
+ m = vm_page_lookup_busy_wait(object, pi, FALSE, "vmpg");
if (m) {
- vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_wakeup(m);
}
++pi;
}
- lwkt_reltoken(&vm_token);
} else {
/*
* File has expanded.
*/
vp->v_filesize = length;
}
+ vm_object_drop(object);
}
* critical section.
*
* NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
- * However, there are cases where the idlethread will be entered with
- * the possibility that no IPI will occur and in such cases
- * lwkt_switch() sets RQF_WAKEUP. We usually check
- * RQF_IDLECHECK_WK_MASK.
*
* NOTE: cpu_idle_hlt again defaults to 2 (use ACPI sleep states). Set to
* 1 to just use hlt and for debugging purposes.
/*
* Manages physical address maps.
*
- * In most cases the vm_token must be held when manipulating a user pmap
- * or elements within a vm_page, and the kvm_token must be held when
- * manipulating the kernel pmap. Operations on user pmaps may require
- * additional synchronization.
- *
- * In some cases the caller may hold the required tokens to prevent pmap
- * functions from blocking on those same tokens. This typically only works
- * for lookup-style operations.
+ * In most cases we hold page table pages busy in order to manipulate them.
*/
/*
* PMAP_DEBUG - see platform/pc32/include/pmap.h
#include <sys/msgbuf.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
+#include <sys/thread.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/user.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>
+#include <sys/spinlock2.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
* The kernel's pmap is statically allocated so we don't have to use
* pmap_create, which is unlikely to work correctly at this part of
* the boot sequence (XXX and which no longer exists).
+ *
+ * The kernel_pmap's pm_pteobj is used only for locking and not
+ * for mmu pages.
*/
kernel_pmap.pm_pdir = (pd_entry_t *)(KERNBASE + (u_int)IdlePTD);
kernel_pmap.pm_count = 1;
kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
+ kernel_pmap.pm_pteobj = &kernel_object;
TAILQ_INIT(&kernel_pmap.pm_pvlist);
+ TAILQ_INIT(&kernel_pmap.pm_pvlist_free);
+ spin_init(&kernel_pmap.pm_spin);
+ lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok");
nkpt = NKPT;
/*
* This routine works like vm_page_lookup() but also blocks as long as the
* page is busy. This routine does not busy the page it returns.
*
- * The caller must hold vm_token.
+ * The caller must hold the object.
*/
static vm_page_t
pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
{
vm_page_t m;
- ASSERT_LWKT_TOKEN_HELD(&vm_token);
- do {
- m = vm_page_lookup(object, pindex);
- } while (m && vm_page_sleep_busy(m, FALSE, "pplookp"));
+ ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
+ m = vm_page_lookup_busy_wait(object, pindex, FALSE, "pplookp");
return(m);
}
* Wait until we can busy the page ourselves. We cannot have
* any active flushes if we block.
*/
- if (m->flags & PG_BUSY) {
- while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
- ;
- }
+ vm_page_busy_wait(m, FALSE, "pmuwpt");
KASSERT(m->queue == PQ_NONE,
("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m));
* the current one, when clearing a page directory
* entry.
*/
- vm_page_busy(m);
pmap_inval_interlock(info, pmap, -1);
KKASSERT(pmap->pm_pdir[m->pindex]);
pmap->pm_pdir[m->pindex] = 0;
vm_page_unhold(m);
--m->wire_count;
KKASSERT(m->wire_count == 0);
- --vmstats.v_wire_count;
+ atomic_add_int(&vmstats.v_wire_count, -1);
vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
vm_page_flash(m);
vm_page_free_zero(m);
} else {
KKASSERT(m->hold_count > 1);
vm_page_unhold(m);
+ vm_page_wakeup(m);
return 0;
}
}
{
unsigned ptepindex;
+ ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj));
+
if (va >= UPT_MIN_ADDRESS)
return 0;
(pmap->pm_ptphint->pindex == ptepindex)) {
mpte = pmap->pm_ptphint;
} else {
- mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
+ mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
pmap->pm_ptphint = mpte;
+ vm_page_wakeup(mpte);
}
}
pmap->pm_cached = 0;
pmap->pm_ptphint = NULL;
TAILQ_INIT(&pmap->pm_pvlist);
+ TAILQ_INIT(&pmap->pm_pvlist_free);
+ spin_init(&pmap->pm_spin);
+ lwkt_token_init(&pmap->pm_token, "pmap_tok");
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}
ptdpg = vm_page_grab(pmap->pm_pteobj, PTDPTDI,
VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
pmap->pm_pdirm = ptdpg;
- vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY);
+ vm_page_flag_clear(ptdpg, PG_MAPPED);
+ vm_page_wire(ptdpg);
ptdpg->valid = VM_PAGE_BITS_ALL;
- ptdpg->wire_count = 1;
- ++vmstats.v_wire_count;
pmap_kenter((vm_offset_t)pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
+ vm_page_wakeup(ptdpg);
}
if ((ptdpg->flags & PG_ZERO) == 0)
bzero(pmap->pm_pdir, PAGE_SIZE);
pmap->pm_cached = 0;
pmap->pm_ptphint = NULL;
TAILQ_INIT(&pmap->pm_pvlist);
+ TAILQ_INIT(&pmap->pm_pvlist_free);
+ spin_init(&pmap->pm_spin);
+ lwkt_token_init(&pmap->pm_token, "pmap_tok");
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
pmap->pm_stats.resident_count = 1;
}
vm_page_t p;
KKASSERT(pmap->pm_active == 0);
- lwkt_gettoken(&vm_token);
if ((p = pmap->pm_pdirm) != NULL) {
KKASSERT(pmap->pm_pdir != NULL);
pmap_kremove((vm_offset_t)pmap->pm_pdir);
+ vm_page_busy_wait(p, FALSE, "pgpun");
p->wire_count--;
- vmstats.v_wire_count--;
- KKASSERT((p->flags & PG_BUSY) == 0);
- vm_page_busy(p);
+ atomic_add_int(&vmstats.v_wire_count, -1);
vm_page_free_zero(p);
pmap->pm_pdirm = NULL;
}
- lwkt_reltoken(&vm_token);
if (pmap->pm_pdir) {
kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pdir, PAGE_SIZE);
pmap->pm_pdir = NULL;
void
pmap_pinit2(struct pmap *pmap)
{
- lwkt_gettoken(&vm_token);
+ /*
+ * XXX copies current process, does not fill in MPPTDI
+ */
+ spin_lock(&pmap_spin);
TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
- /* XXX copies current process, does not fill in MPPTDI */
bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
- lwkt_reltoken(&vm_token);
+ spin_unlock(&pmap_spin);
}
/*
* page-table pages. Those pages are zero now, and
* might as well be placed directly into the zero queue.
*/
- if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
+ if (vm_page_busy_try(p, FALSE)) {
+ vm_page_sleep_busy(p, FALSE, "pmaprl");
return 0;
-
- vm_page_busy(p);
+ }
/*
* Remove the page table page from the processes address space.
vm_page_wakeup(p);
} else {
p->wire_count--;
- vmstats.v_wire_count--;
+ atomic_add_int(&vmstats.v_wire_count, -1);
vm_page_free_zero(p);
}
return 1;
}
if (m->wire_count == 0)
- vmstats.v_wire_count++;
+ atomic_add_int(&vmstats.v_wire_count, 1);
m->wire_count++;
pmap_zero_page(ptepa);
}
}
-
m->valid = VM_PAGE_BITS_ALL;
vm_page_flag_clear(m, PG_ZERO);
- }
- else {
+ } else {
KKASSERT((m->flags & PG_ZERO) == 0);
}
vm_offset_t ptepa;
vm_page_t m;
+ ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj));
+
/*
* Calculate pagetable page index
*/
(pmap->pm_ptphint->pindex == ptepindex)) {
m = pmap->pm_ptphint;
} else {
- m = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
+ m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
pmap->pm_ptphint = m;
+ vm_page_wakeup(m);
}
m->hold_count++;
return m;
* Called when a pmap initialized by pmap_pinit is being released.
* Should only be called if the map contains no valid mappings.
*
- * No requirements.
+ * Caller must hold pmap->pm_token
*/
static int pmap_release_callback(struct vm_page *p, void *data);
info.pmap = pmap;
info.object = object;
- vm_object_hold(object);
- lwkt_gettoken(&vm_token);
+
+ spin_lock(&pmap_spin);
TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
+ spin_unlock(&pmap_spin);
+ vm_object_hold(object);
do {
info.error = 0;
info.mpte = NULL;
info.error = 1;
}
} while (info.error);
- pmap->pm_cached = 0;
- lwkt_reltoken(&vm_token);
vm_object_drop(object);
+
+ pmap->pm_cached = 0;
}
/*
vm_page_t nkpg;
pd_entry_t newpdir;
- lwkt_gettoken(&vm_token);
+ vm_object_hold(kptobj);
if (kernel_vm_end == 0) {
kernel_vm_end = KERNBASE;
nkpt = 0;
/*
* This update must be interlocked with pmap_pinit2.
*/
+ spin_lock(&pmap_spin);
TAILQ_FOREACH(pmap, &pmap_list, pm_pmnode) {
*pmap_pde(pmap, kernel_vm_end) = newpdir;
}
+ spin_unlock(&pmap_spin);
kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
~(PAGE_SIZE * NPTEPG - 1);
}
- lwkt_reltoken(&vm_token);
+ vm_object_drop(kptobj);
}
/*
warningdone++;
}
- for(i = 0; i < vm_page_array_size; i++) {
+ for (i = 0; i < vm_page_array_size; i++) {
m = &vm_page_array[i];
- if (m->wire_count || m->hold_count || m->busy ||
- (m->flags & PG_BUSY)) {
+ if (m->wire_count || m->hold_count)
continue;
+ if (vm_page_busy_try(m, TRUE) == 0) {
+ if (m->wire_count == 0 && m->hold_count == 0) {
+ pmap_remove_all(m);
+ }
+ vm_page_wakeup(m);
}
- pmap_remove_all(m);
}
lwkt_reltoken(&vm_token);
}
test_m_maps_pv(m, pv);
TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
m->md.pv_list_count--;
- m->object->agg_pv_list_count--;
+ atomic_add_int(&m->object->agg_pv_list_count, -1);
if (TAILQ_EMPTY(&m->md.pv_list))
vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
++pmap->pm_generation;
+ vm_object_hold(pmap->pm_pteobj);
rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem, info);
+ vm_object_drop(pmap->pm_pteobj);
free_pv_entry(pv);
+
return rtval;
}
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
++pmap->pm_generation;
m->md.pv_list_count++;
- m->object->agg_pv_list_count++;
+ atomic_add_int(&m->object->agg_pv_list_count, 1);
}
/*
if (pmap == NULL)
return;
+ vm_object_hold(pmap->pm_pteobj);
lwkt_gettoken(&vm_token);
if (pmap->pm_stats.resident_count == 0) {
lwkt_reltoken(&vm_token);
+ vm_object_drop(pmap->pm_pteobj);
return;
}
pmap_remove_page(pmap, sva, &info);
pmap_inval_done(&info);
lwkt_reltoken(&vm_token);
+ vm_object_drop(pmap->pm_pteobj);
return;
}
}
pmap_inval_done(&info);
lwkt_reltoken(&vm_token);
+ vm_object_drop(pmap->pm_pteobj);
}
/*
if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
return;
- lwkt_gettoken(&vm_token);
pmap_inval_init(&info);
while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0);
TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
++pv->pv_pmap->pm_generation;
m->md.pv_list_count--;
- m->object->agg_pv_list_count--;
+ atomic_add_int(&m->object->agg_pv_list_count, -1);
if (TAILQ_EMPTY(&m->md.pv_list))
vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+ vm_object_hold(pv->pv_pmap->pm_pteobj);
pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info);
+ vm_object_drop(pv->pv_pmap->pm_pteobj);
free_pv_entry(pv);
}
KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
pmap_inval_done(&info);
- lwkt_reltoken(&vm_token);
}
/*
print_backtrace(-1);
}
+ vm_object_hold(pmap->pm_pteobj);
lwkt_gettoken(&vm_token);
/*
else
mpte = NULL;
- pmap_inval_init(&info);
+ if ((prot & VM_PROT_NOSYNC) == 0)
+ pmap_inval_init(&info);
pte = pmap_pte(pmap, va);
/*
* to update the pte.
*/
if ((origpte & ~(PG_M|PG_A)) != newpte) {
- pmap_inval_interlock(&info, pmap, va);
+ if (prot & VM_PROT_NOSYNC)
+ cpu_invlpg((void *)va);
+ else
+ pmap_inval_interlock(&info, pmap, va);
ptbase_assert(pmap);
KKASSERT(*pte == 0 ||
(*pte & PG_FRAME) == (newpte & PG_FRAME));
*pte = newpte | PG_A;
- pmap_inval_deinterlock(&info, pmap);
+ if ((prot & VM_PROT_NOSYNC) == 0)
+ pmap_inval_deinterlock(&info, pmap);
if (newpte & PG_RW)
vm_page_flag_set(m, PG_WRITEABLE);
}
KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED));
- pmap_inval_done(&info);
+ if ((prot & VM_PROT_NOSYNC) == 0)
+ pmap_inval_done(&info);
lwkt_reltoken(&vm_token);
+ vm_object_drop(pmap->pm_pteobj);
}
/*
vm_offset_t ptepa;
pmap_inval_info info;
+ vm_object_hold(pmap->pm_pteobj);
lwkt_gettoken(&vm_token);
pmap_inval_init(&info);
(pmap->pm_ptphint->pindex == ptepindex)) {
mpte = pmap->pm_ptphint;
} else {
- mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
+ mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
pmap->pm_ptphint = mpte;
+ vm_page_wakeup(mpte);
}
if (mpte)
mpte->hold_count++;
KKASSERT(((*pte ^ pa) & PG_FRAME) == 0);
pmap_inval_done(&info);
lwkt_reltoken(&vm_token);
+ vm_object_drop(pmap->pm_pteobj);
return;
}
/* pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */
pmap_inval_done(&info);
lwkt_reltoken(&vm_token);
+ vm_object_drop(pmap->pm_pteobj);
}
/*
info.pmap = pmap;
vm_object_hold(object);
- lwkt_gettoken(&vm_token);
vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
pmap_object_init_pt_callback, &info);
- lwkt_reltoken(&vm_token);
vm_object_drop(object);
}
vmstats.v_free_count < vmstats.v_free_reserved) {
return(-1);
}
+ if (vm_page_busy_try(p, TRUE))
+ return 0;
if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
- (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
- vm_page_busy(p);
+ (p->flags & PG_FICTITIOUS) == 0) {
if ((p->queue - p->pc) == PQ_CACHE)
vm_page_deactivate(p);
rel_index = p->pindex - info->start_pindex;
pmap_enter_quick(info->pmap,
info->addr + i386_ptob(rel_index), p);
- vm_page_wakeup(p);
}
+ vm_page_wakeup(p);
return(0);
}
else
iscurrentpmap = 0;
+ if (pmap->pm_pteobj)
+ vm_object_hold(pmap->pm_pteobj);
lwkt_gettoken(&vm_token);
pmap_inval_init(&info);
+
for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
if (pv->pv_va >= eva || pv->pv_va < sva) {
npv = TAILQ_NEXT(pv, pv_plist);
save_generation = ++pmap->pm_generation;
m->md.pv_list_count--;
- m->object->agg_pv_list_count--;
+ atomic_add_int(&m->object->agg_pv_list_count, -1);
TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
if (TAILQ_EMPTY(&m->md.pv_list))
vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
}
pmap_inval_done(&info);
lwkt_reltoken(&vm_token);
+ if (pmap->pm_pteobj)
+ vm_object_drop(pmap->pm_pteobj);
}
/*
* is necessary that 0 only be returned when there are truly no
* reference bits set.
*
- * XXX: The exact number of bits to check and clear is a matter that
- * should be tested and standardized at some point in the future for
- * optimal aging of shared pages.
- *
* No requirements.
*/
int
*
* Only called with new VM spaces.
* The process must have only a single thread.
+ * The process must hold the vmspace->vm_map.token for oldvm and newvm
* No other requirements.
*/
void
struct pmap *pmap = &vm->vm_pmap;
if (pmap->pm_active & CPUMASK_LOCK) {
- DEBUG_PUSH_INFO("pmap_interlock_wait");
crit_enter();
+ DEBUG_PUSH_INFO("pmap_interlock_wait");
while (pmap->pm_active & CPUMASK_LOCK) {
cpu_ccfence();
lwkt_process_ipiq();
}
- crit_exit();
DEBUG_POP_INFO();
+ crit_exit();
}
}
#ifndef _SYS_QUEUE_H_
#include <sys/queue.h>
#endif
+#ifndef _SYS_SPINLOCK_H_
+#include <sys/spinlock.h>
+#endif
+#ifndef _SYS_THREAD_H_
+#include <sys/thread.h>
+#endif
#ifndef _MACHINE_TYPES_H_
#include <machine/types.h>
#endif
struct vm_object *pm_pteobj; /* Container for pte's */
TAILQ_ENTRY(pmap) pm_pmnode; /* list of pmaps */
TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */
+ TAILQ_HEAD(,pv_entry) pm_pvlist_free; /* free mappings */
int pm_count; /* reference count */
cpumask_t pm_active; /* active on cpus */
cpumask_t pm_cached; /* cached on cpus */
struct pmap_statistics pm_stats; /* pmap statistics */
struct vm_page *pm_ptphint; /* pmap ptp hint */
int pm_generation; /* detect pvlist deletions */
+ struct spinlock pm_spin;
+ struct lwkt_token pm_token;
};
#define pmap_resident_count(pmap) (pmap)->pm_stats.resident_count
extern vm_offset_t clean_sva;
extern char *ptvmmap; /* poor name! */
+void pmap_release(struct pmap *pmap);
void pmap_interlock_wait (struct vmspace *);
void pmap_bootstrap (vm_paddr_t, vm_paddr_t);
void *pmap_mapdev (vm_paddr_t, vm_size_t);
#ifndef _SYS_QUEUE_H_
#include <sys/queue.h>
#endif
+#ifndef _SYS_SPINLOCK_H_
+#include <sys/spinlock.h>
+#endif
+#ifndef _SYS_THREAD_H_
+#include <sys/thread.h>
+#endif
#ifndef _MACHINE_TYPES_H_
#include <machine/types.h>
#endif
static __inline void
pte_store(pt_entry_t *ptep, pt_entry_t pte)
{
-
*ptep = pte;
}
struct md_page {
int pv_list_count;
+ int pv_generation;
TAILQ_HEAD(,pv_entry) pv_list;
};
struct vm_object *pm_pteobj; /* Container for pte's */
TAILQ_ENTRY(pmap) pm_pmnode; /* list of pmaps */
TAILQ_HEAD(,pv_entry) pm_pvlist; /* list of mappings in pmap */
+ TAILQ_HEAD(,pv_entry) pm_pvlist_free; /* free mappings */
int pm_count; /* reference count */
cpumask_t pm_active; /* active on cpus */
int pm_filler02; /* (filler sync w/vkernel) */
struct pmap_statistics pm_stats; /* pmap statistics */
struct vm_page *pm_ptphint; /* pmap ptp hint */
int pm_generation; /* detect pvlist deletions */
+ int pm_hold;
+ struct spinlock pm_spin;
+ struct lwkt_token pm_token;
};
#define CPUMASK_LOCK CPUMASK(SMP_MAXCPU)
TAILQ_ENTRY(pv_entry) pv_list;
TAILQ_ENTRY(pv_entry) pv_plist;
struct vm_page *pv_ptem; /* VM page for pte */
+ u_int pv_hold; /* hold on destruction count */
} *pv_entry_t;
#ifdef _KERNEL
extern vm_offset_t clean_sva;
extern char *ptvmmap; /* poor name! */
+void pmap_release(struct pmap *pmap);
void pmap_interlock_wait (struct vmspace *);
void pmap_bootstrap (vm_paddr_t *);
void *pmap_mapdev (vm_paddr_t, vm_size_t);
#include <sys/user.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>
+#include <sys/spinlock2.h>
+#include <vm/vm_page2.h>
#include <machine/cputypes.h>
#include <machine/md_var.h>
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;
+static int pmap_yield_count = 64;
+SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
+ &pmap_yield_count, 0, "Yield during init_pt/release");
+
#define DISABLE_PSE
static pv_entry_t get_pv_entry (void);
static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
-static int _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
+static int pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
pmap_inval_info_t info);
-static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t, pmap_inval_info_t);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
static unsigned pdir4mb;
pt_entry_t *
vtopte(vm_offset_t va)
{
- uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
+ NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
return (PTmap + ((va >> PAGE_SHIFT) & mask));
}
pd_entry_t *
vtopde(vm_offset_t va)
{
- uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
+ uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
+ NPML4EPGSHIFT)) - 1);
return (PDmap + ((va >> PDRSHIFT) & mask));
}
* The kernel's pmap is statically allocated so we don't have to use
* pmap_create, which is unlikely to work correctly at this part of
* the boot sequence (XXX and which no longer exists).
+ *
+ * The kernel_pmap's pm_pteobj is used only for locking and not
+ * for mmu pages.
*/
kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
kernel_pmap.pm_count = 1;
kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
+ kernel_pmap.pm_pteobj = &kernel_object;
TAILQ_INIT(&kernel_pmap.pm_pvlist);
+ TAILQ_INIT(&kernel_pmap.pm_pvlist_free);
+ kernel_pmap.pm_hold = 0;
+ spin_init(&kernel_pmap.pm_spin);
+ lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok");
/*
* Reserve some special page table entries/VA space for temporary
/*
* Extract the physical page address associated with the map/VA pair.
*
- * The caller must hold vm_token if non-blocking operation is desired.
+ * The caller must hold pmap->pm_token if non-blocking operation is desired.
*/
vm_paddr_t
pmap_extract(pmap_t pmap, vm_offset_t va)
pt_entry_t *pte;
pd_entry_t pde, *pdep;
- lwkt_gettoken(&vm_token);
+ lwkt_gettoken(&pmap->pm_token);
rtval = 0;
pdep = pmap_pde(pmap, va);
if (pdep != NULL) {
}
}
}
- lwkt_reltoken(&vm_token);
+ lwkt_reltoken(&pmap->pm_token);
return rtval;
}
* This routine works like vm_page_lookup() but also blocks as long as the
* page is busy. This routine does not busy the page it returns.
*
- * Unless the caller is managing objects whos pages are in a known state,
- * the call should be made with both vm_token held and the governing object
- * and its token held so the page's object association remains valid on
- * return.
+ * The call should be made with the governing object held so the page's
+ * object association remains valid on return.
*
* This function can block!
*/
{
vm_page_t m;
- do {
- m = vm_page_lookup(object, pindex);
- } while (m && vm_page_sleep_busy(m, FALSE, "pplookp"));
+ ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
+ m = vm_page_lookup_busy_wait(object, pindex, FALSE, "pplookp");
return(m);
}
***************************************************/
/*
- * This routine unholds page table pages, and if the hold count
- * drops to zero, then it decrements the wire count.
+ * After removing a page table entry, this routine is used to
+ * conditionally free the page, and manage the hold/wire counts.
*/
static __inline
int
-pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
- pmap_inval_info_t info)
+pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte,
+ pmap_inval_info_t info)
{
- KKASSERT(m->hold_count > 0);
- if (m->hold_count > 1) {
- vm_page_unhold(m);
- return 0;
- } else {
- return _pmap_unwire_pte_hold(pmap, va, m, info);
- }
+ if (mpte)
+ return (pmap_unwire_pte_hold(pmap, va, mpte, info));
+ return 0;
}
-static
+/*
+ * This routine reduces the wire_count on a page. If the wire_count
+ * would drop to zero we remove the PT, PD, or PDP from its parent page
+ * table. Under normal operation this only occurs with PT pages.
+ */
+static __inline
int
-_pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
- pmap_inval_info_t info)
+pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
+ pmap_inval_info_t info)
{
+ if (!vm_page_unwire_quick(m))
+ return 0;
+
/*
* Wait until we can busy the page ourselves. We cannot have
* any active flushes if we block. We own one hold count on the
* page so it cannot be freed out from under us.
*/
- if (m->flags & PG_BUSY) {
- while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
- ;
- }
+ vm_page_busy_wait(m, FALSE, "pmuwpt");
KASSERT(m->queue == PQ_NONE,
("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m));
/*
- * This case can occur if new references were acquired while
- * we were blocked.
+ * New references may have bumped the wire_count while we were
+ * blocked; try to unwire quickly again (e.g. 2->1).
*/
- if (m->hold_count > 1) {
- KKASSERT(m->hold_count > 1);
- vm_page_unhold(m);
+ if (vm_page_unwire_quick(m) == 0) {
+ vm_page_wakeup(m);
return 0;
}
/*
* Unmap the page table page
*/
- KKASSERT(m->hold_count == 1);
- vm_page_busy(m);
+ KKASSERT(m->wire_count == 1);
pmap_inval_interlock(info, pmap, -1);
if (m->pindex >= (NUPDE + NUPDPE)) {
/* PDP page */
pml4_entry_t *pml4;
pml4 = pmap_pml4e(pmap, va);
+ KKASSERT(*pml4);
*pml4 = 0;
} else if (m->pindex >= NUPDE) {
/* PD page */
pdp_entry_t *pdp;
pdp = pmap_pdpe(pmap, va);
+ KKASSERT(*pdp);
*pdp = 0;
} else {
/* PT page */
pd_entry_t *pd;
pd = pmap_pde(pmap, va);
+ KKASSERT(*pd);
*pd = 0;
}
}
/*
- * This was our last hold, the page had better be unwired
- * after we decrement wire_count.
- *
- * FUTURE NOTE: shared page directory page could result in
- * multiple wire counts.
+ * This was our wiring.
*/
- vm_page_unhold(m);
- --m->wire_count;
+ KKASSERT(m->flags & PG_UNMANAGED);
+ vm_page_unwire(m, 0);
KKASSERT(m->wire_count == 0);
- --vmstats.v_wire_count;
vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
vm_page_flash(m);
vm_page_free_zero(m);
}
/*
- * After removing a page table entry, this routine is used to
- * conditionally free the page, and manage the hold/wire counts.
- */
-static
-int
-pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte,
- pmap_inval_info_t info)
-{
- vm_pindex_t ptepindex;
-
- if (va >= VM_MAX_USER_ADDRESS)
- return 0;
-
- if (mpte == NULL) {
- ptepindex = pmap_pde_pindex(va);
-#if JGHINT
- if (pmap->pm_ptphint &&
- (pmap->pm_ptphint->pindex == ptepindex)) {
- mpte = pmap->pm_ptphint;
- } else {
-#endif
- mpte = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
- pmap->pm_ptphint = mpte;
-#if JGHINT
- }
-#endif
- }
- return pmap_unwire_pte_hold(pmap, va, mpte, info);
-}
-
-/*
* Initialize pmap0/vmspace0. This pmap is not added to pmap_list because
* it, and IdlePTD, represents the template used to update all other pmaps.
*
pmap->pm_active = 0;
pmap->pm_ptphint = NULL;
TAILQ_INIT(&pmap->pm_pvlist);
+ TAILQ_INIT(&pmap->pm_pvlist_free);
+ pmap->pm_hold = 0;
+ spin_init(&pmap->pm_spin);
+ lwkt_token_init(&pmap->pm_token, "pmap_tok");
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
}
void
pmap_pinit(struct pmap *pmap)
{
- vm_page_t ptdpg;
+ vm_page_t pml4pg;
/*
* No need to allocate page table space yet but we do need a valid
/*
* Allocate an object for the ptes
*/
- if (pmap->pm_pteobj == NULL)
- pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, NUPDE + NUPDPE + PML4PML4I + 1);
+ if (pmap->pm_pteobj == NULL) {
+ pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT,
+ NUPDE + NUPDPE + PML4PML4I + 1);
+ }
/*
* Allocate the page directory page, unless we already have
* one cached. If we used the cached page the wire_count will
* already be set appropriately.
*/
- if ((ptdpg = pmap->pm_pdirm) == NULL) {
- ptdpg = vm_page_grab(pmap->pm_pteobj,
- NUPDE + NUPDPE + PML4PML4I,
- VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
- pmap->pm_pdirm = ptdpg;
- vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY);
- ptdpg->valid = VM_PAGE_BITS_ALL;
- if (ptdpg->wire_count == 0)
- ++vmstats.v_wire_count;
- ptdpg->wire_count = 1;
- pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg));
- }
- if ((ptdpg->flags & PG_ZERO) == 0)
+ if ((pml4pg = pmap->pm_pdirm) == NULL) {
+ pml4pg = vm_page_grab(pmap->pm_pteobj,
+ NUPDE + NUPDPE + PML4PML4I,
+ VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+ pmap->pm_pdirm = pml4pg;
+ vm_page_unmanage(pml4pg);
+ vm_page_flag_clear(pml4pg, PG_MAPPED);
+ pml4pg->valid = VM_PAGE_BITS_ALL;
+ vm_page_wire(pml4pg);
+ vm_page_wakeup(pml4pg);
+ pmap_kenter((vm_offset_t)pmap->pm_pml4,
+ VM_PAGE_TO_PHYS(pml4pg));
+ }
+ if ((pml4pg->flags & PG_ZERO) == 0)
bzero(pmap->pm_pml4, PAGE_SIZE);
#ifdef PMAP_DEBUG
else
- pmap_page_assertzero(VM_PAGE_TO_PHYS(ptdpg));
+ pmap_page_assertzero(VM_PAGE_TO_PHYS(pml4pg));
#endif
+ vm_page_flag_clear(pml4pg, PG_ZERO);
pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
pmap->pm_pml4[DMPML4I] = DMPDPphys | PG_RW | PG_V | PG_U;
/* install self-referential address mapping entry */
- pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(ptdpg) | PG_V | PG_RW | PG_A | PG_M;
+ pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) |
+ PG_V | PG_RW | PG_A | PG_M;
pmap->pm_count = 1;
pmap->pm_active = 0;
pmap->pm_ptphint = NULL;
TAILQ_INIT(&pmap->pm_pvlist);
+ TAILQ_INIT(&pmap->pm_pvlist_free);
+ pmap->pm_hold = 0;
+ spin_init(&pmap->pm_spin);
+ lwkt_token_init(&pmap->pm_token, "pmap_tok");
bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
pmap->pm_stats.resident_count = 1;
}
vm_page_t p;
KKASSERT(pmap->pm_active == 0);
- lwkt_gettoken(&vm_token);
if ((p = pmap->pm_pdirm) != NULL) {
KKASSERT(pmap->pm_pml4 != NULL);
KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
pmap_kremove((vm_offset_t)pmap->pm_pml4);
- p->wire_count--;
- vmstats.v_wire_count--;
- KKASSERT((p->flags & PG_BUSY) == 0);
- vm_page_busy(p);
+ vm_page_busy_wait(p, FALSE, "pgpun");
+ KKASSERT(p->flags & PG_UNMANAGED);
+ vm_page_unwire(p, 0);
vm_page_free_zero(p);
pmap->pm_pdirm = NULL;
}
vm_object_deallocate(pmap->pm_pteobj);
pmap->pm_pteobj = NULL;
}
- lwkt_reltoken(&vm_token);
}
/*
void
pmap_pinit2(struct pmap *pmap)
{
- lwkt_gettoken(&vm_token);
+ /*
+ * XXX copies current process, does not fill in MPPTDI
+ */
+ spin_lock(&pmap_spin);
TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
- /* XXX copies current process, does not fill in MPPTDI */
- lwkt_reltoken(&vm_token);
+ spin_unlock(&pmap_spin);
}
/*
* page-table pages. Those pages are zero now, and
* might as well be placed directly into the zero queue.
*/
- if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
+ if (vm_page_busy_try(p, FALSE)) {
+ vm_page_sleep_busy(p, FALSE, "pmaprl");
return 0;
-
- vm_page_busy(p);
+ }
/*
* Remove the page table page from the processes address space.
} else if (p->pindex >= (NUPDE + NUPDPE)) {
/*
* Remove a PDP page from the PML4. We do not maintain
- * hold counts on the PML4 page.
+ * wire counts on the PML4 page.
*/
pml4_entry_t *pml4;
vm_page_t m4;
int idx;
- m4 = vm_page_lookup(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I);
+ m4 = vm_page_lookup(pmap->pm_pteobj,
+ NUPDE + NUPDPE + PML4PML4I);
KKASSERT(m4 != NULL);
pml4 = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m4));
idx = (p->pindex - (NUPDE + NUPDPE)) % NPML4EPG;
pml4[idx] = 0;
} else if (p->pindex >= NUPDE) {
/*
- * Remove a PD page from the PDP and drop the hold count
- * on the PDP. The PDP is left cached in the pmap if
- * the hold count drops to 0 so the wire count remains
- * intact.
+ * Remove a PD page from the PDP and drop the wire count
+ * on the PDP. The PDP has a wire_count just from being
+ * mapped so the wire_count should never drop to 0 here.
*/
vm_page_t m3;
pdp_entry_t *pdp;
idx = (p->pindex - NUPDE) % NPDPEPG;
KKASSERT(pdp[idx] != 0);
pdp[idx] = 0;
- m3->hold_count--;
+ if (vm_page_unwire_quick(m3))
+ panic("pmap_release_free_page: m3 wire_count 1->0");
} else {
/*
- * Remove a PT page from the PD and drop the hold count
- * on the PD. The PD is left cached in the pmap if
- * the hold count drops to 0 so the wire count remains
- * intact.
+ * Remove a PT page from the PD and drop the wire count
+ * on the PD. The PD has a wire_count just from being
+ * mapped so the wire_count should never drop to 0 here.
*/
vm_page_t m2;
pd_entry_t *pd;
pd = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m2));
idx = p->pindex % NPDEPG;
pd[idx] = 0;
- m2->hold_count--;
+ if (vm_page_unwire_quick(m2))
+ panic("pmap_release_free_page: m2 wire_count 1->0");
}
/*
- * One fewer mappings in the pmap. p's hold count had better
- * be zero.
+ * p's wire_count should be transitioning from 1 to 0 here.
*/
+ KKASSERT(p->wire_count == 1);
+ KKASSERT(p->flags & PG_UNMANAGED);
KKASSERT(pmap->pm_stats.resident_count > 0);
+ vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
--pmap->pm_stats.resident_count;
- if (p->hold_count)
- panic("pmap_release: freeing held page table page");
if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
pmap->pm_ptphint = NULL;
vm_page_flag_set(p, PG_ZERO);
vm_page_wakeup(p);
} else {
- p->wire_count--;
+ vm_page_unwire(p, 0);
KKASSERT(p->wire_count == 0);
- vmstats.v_wire_count--;
/* JG eventually revert to using vm_page_free_zero() */
vm_page_free(p);
}
/*
* This routine is called when various levels in the page table need to
* be populated. This routine cannot fail.
+ *
+ * We return a page wired for the caller. If we had to map the page into
+ * a parent page table it will receive an additional wire_count. For example,
+ * an empty page table directory which is still mapped into its pdp will
+ * retain a wire_count of 1.
*/
static
vm_page_t
* don't want to zero-out a raced page as this would desynchronize
* the pv_entry's for the related pte's and cause pmap_remove_all()
* to panic.
+ *
+ * Page table pages are unmanaged (do not use the normal PQ_s)
*/
if (m->valid == 0) {
+ vm_page_unmanage(m);
if ((m->flags & PG_ZERO) == 0) {
pmap_zero_page(VM_PAGE_TO_PHYS(m));
}
("_pmap_allocpte: %p->queue != PQ_NONE", m));
/*
- * Increment the hold count for the page we will be returning to
+ * Increment the wire_count for the page we will be returning to
* the caller.
*/
- m->hold_count++;
- if (m->wire_count++ == 0)
- vmstats.v_wire_count++;
+ vm_page_wire(m);
/*
* Map the pagetable page into the process address space, if
*/
if (ptepindex >= (NUPDE + NUPDPE)) {
/*
- * Wire up a new PDP page in the PML4
+ * Wire up a new PDP page in the PML4.
+ *
+ * (m) is busied so we cannot race another thread trying
+ * to map the PDP entry in the PML4.
*/
vm_pindex_t pml4index;
pml4_entry_t *pml4;
pml4index = ptepindex - (NUPDE + NUPDPE);
pml4 = &pmap->pm_pml4[pml4index];
- if (*pml4 & PG_V) {
- if (--m->wire_count == 0)
- --vmstats.v_wire_count;
- vm_page_wakeup(m);
- return(m);
+ if ((*pml4 & PG_V) == 0) {
+ *pml4 = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
+ PG_A | PG_M);
+ ++pmap->pm_stats.resident_count;
+ vm_page_wire_quick(m); /* wire for mapping */
}
- *pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
+ /* return (m) wired for the caller */
} else if (ptepindex >= NUPDE) {
/*
* Wire up a new PD page in the PDP
pdpindex = ptepindex - NUPDE;
pml4index = pdpindex >> NPML4EPGSHIFT;
+ /*
+ * Once mapped the PDP is not unmapped during normal operation
+ * so we only need to handle races in the unmapped case.
+ *
+ * Mapping a PD into the PDP requires an additional wiring
+ * of the PDP.
+ */
pml4 = &pmap->pm_pml4[pml4index];
if ((*pml4 & PG_V) == 0) {
- /*
- * Have to allocate a new PDP page, recurse.
- * This always succeeds. Returned page will
- * be held.
- */
pdppg = _pmap_allocpte(pmap,
NUPDE + NUPDPE + pml4index);
+ /* pdppg wired for the map and also wired for return */
} else {
- /*
- * Add a held reference to the PDP page.
- */
pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
- pdppg->hold_count++;
+ vm_page_wire_quick(pdppg);
}
+ /* we have an extra ref on pdppg now for our use */
/*
- * Now find the pdp_entry and map the PDP. If the PDP
- * has already been mapped unwind and return the
- * already-mapped PDP held.
+ * Now find the PD entry in the PDP and map it.
*
- * pdppg is left held (hold_count is incremented for
- * each PD in the PDP).
+ * (m) is busied so we cannot race another thread trying
+ * to map the PD entry in the PDP.
+ *
+ * If the PD entry is already mapped we have to drop one
+ * wire count on the pdppg that we had bumped above.
*/
pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
- if (*pdp & PG_V) {
- vm_page_unhold(pdppg);
- if (--m->wire_count == 0)
- --vmstats.v_wire_count;
- vm_page_wakeup(m);
- return(m);
+
+ if ((*pdp & PG_V) == 0) {
+ *pdp = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
+ PG_A | PG_M);
+ vm_page_wire_quick(m); /* wire for mapping */
+ ++pmap->pm_stats.resident_count;
+ /* eat extra pdppg wiring for mapping */
+ } else {
+ if (vm_page_unwire_quick(pdppg))
+ panic("pmap_allocpte: unwire case 1");
}
- *pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
+ /* return (m) wired for the caller */
} else {
/*
* Wire up the new PT page in the PD
pml4_entry_t *pml4;
pdp_entry_t *pdp;
pd_entry_t *pd;
+ vm_page_t pdppg;
vm_page_t pdpg;
pdpindex = ptepindex >> NPDPEPGSHIFT;
pml4index = pdpindex >> NPML4EPGSHIFT;
/*
- * Locate the PDP page in the PML4, then the PD page in
- * the PDP. If either does not exist we simply recurse
- * to allocate them.
+ * Locate the PDP page in the PML4
*
- * We can just recurse on the PD page as it will recurse
- * on the PDP if necessary.
+ * Once mapped the PDP is not unmapped during normal operation
+ * so we only need to handle races in the unmapped case.
*/
pml4 = &pmap->pm_pml4[pml4index];
if ((*pml4 & PG_V) == 0) {
+ pdppg = _pmap_allocpte(pmap, NUPDE + pdpindex);
+ } else {
+ pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
+ vm_page_wire_quick(pdppg);
+ }
+ /* we have an extra ref on pdppg now for our use */
+
+ /*
+ * Locate the PD page in the PDP
+ *
+ * Once mapped the PDP is not unmapped during normal operation
+ * so we only need to handle races in the unmapped case.
+ *
+ * The extra reference on pdppg can be dropped in either case: it
+ * is not needed if *pdp is already mapped, and if it was not mapped
+ * the _pmap_allocpte() call handled that case for us.
+ */
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
+ pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
+
+ if ((*pdp & PG_V) == 0) {
pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex);
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
} else {
- pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
- pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
- if ((*pdp & PG_V) == 0) {
- pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex);
- } else {
- pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
- pdpg->hold_count++;
- }
+ pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
+ vm_page_wire_quick(pdpg);
}
+ vm_page_unwire_quick(pdppg);
+ /* we have an extra ref on pdpg now for our use */
/*
- * Now fill in the pte in the PD. If the pte already exists
- * (again, if we raced the grab), unhold pdpg and unwire
- * m, returning a held m.
+ * Locate the PT page in the PD.
*
- * pdpg is left held (hold_count is incremented for
- * each PT in the PD).
+ * (m) is busied so we cannot race another thread trying
+ * to map the PT page in the PD.
*/
pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
- if (*pd != 0) {
- vm_page_unhold(pdpg);
- if (--m->wire_count == 0)
- --vmstats.v_wire_count;
- vm_page_wakeup(m);
- return(m);
+ if ((*pd & PG_V) == 0) {
+ *pd = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
+ PG_A | PG_M);
+ ++pmap->pm_stats.resident_count;
+ vm_page_wire_quick(m); /* wire for mapping */
+ /* eat extra pdpg wiring for mapping */
+ } else {
+ if (vm_page_unwire_quick(pdpg))
+ panic("pmap_allocpte: unwire case 2");
}
- *pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
+ /* return (m) wired for the caller */
}
/*
* valid bits, mapped flag, unbusy, and we're done.
*/
pmap->pm_ptphint = m;
- ++pmap->pm_stats.resident_count;
#if 0
m->valid = VM_PAGE_BITS_ALL;
pd_entry_t *pd;
vm_page_t m;
+ ASSERT_LWKT_TOKEN_HELD(vm_object_token(pmap->pm_pteobj));
+
/*
* Calculate pagetable page index
*/
/*
* If the page table page is mapped, we just increment the
- * hold count, and activate it.
+ * wire count, and activate it.
*/
if (pd != NULL && (*pd & PG_V) != 0) {
- /* YYY hint is used here on i386 */
- m = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
+ m = pmap_page_lookup(pmap->pm_pteobj, ptepindex);
pmap->pm_ptphint = m;
- m->hold_count++;
+ vm_page_wire_quick(m);
+ vm_page_wakeup(m);
return m;
}
/*
* Release any resources held by the given physical map.
* Called when a pmap initialized by pmap_pinit is being released.
* Should only be called if the map contains no valid mappings.
+ *
+ * Caller must hold pmap->pm_token
*/
static int pmap_release_callback(struct vm_page *p, void *data);
+static __inline
+void
+pmap_auto_yield(struct rb_vm_page_scan_info *info)
+{
+ if (++info->desired >= pmap_yield_count) {
+ info->desired = 0;
+ lwkt_yield();
+ }
+}
+
void
pmap_release(struct pmap *pmap)
{
info.pmap = pmap;
info.object = object;
- vm_object_hold(object);
- lwkt_gettoken(&vm_token);
+
+ spin_lock(&pmap_spin);
TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
+ spin_unlock(&pmap_spin);
+ info.desired = 0;
+ vm_object_hold(object);
do {
info.error = 0;
info.mpte = NULL;
info.error = 1;
}
} while (info.error);
- lwkt_reltoken(&vm_token);
vm_object_drop(object);
+
+ while (pmap->pm_hold)
+ tsleep(pmap, 0, "pmapx", 1);
}
static
}
if (!pmap_release_free_page(info->pmap, p)) {
info->error = 1;
+ pmap_auto_yield(info);
return(-1);
}
if (info->object->generation != info->limit) {
info->error = 1;
+ pmap_auto_yield(info);
return(-1);
}
return(0);
pdp_entry_t newpdp;
int update_kernel_vm_end;
- lwkt_gettoken(&vm_token);
+ vm_object_hold(kptobj);
/*
* bootstrap kernel_vm_end on first real VM use
if (update_kernel_vm_end && kernel_vm_end < kstart)
kernel_vm_end = kstart;
- lwkt_reltoken(&vm_token);
+ vm_object_drop(kptobj);
}
/*
if (pmap == NULL)
return;
- lwkt_gettoken(&vm_token);
+ lwkt_gettoken(&pmap->pm_token);
count = --pmap->pm_count;
if (count == 0) {
- pmap_release(pmap);
+ pmap_release(pmap); /* eats pm_token */
panic("destroying a pmap is not yet implemented");
}
- lwkt_reltoken(&vm_token);
+ lwkt_reltoken(&pmap->pm_token);
}
/*
pmap_reference(pmap_t pmap)
{
if (pmap != NULL) {
- lwkt_gettoken(&vm_token);
+ lwkt_gettoken(&pmap->pm_token);
pmap->pm_count++;
- lwkt_reltoken(&vm_token);
+ lwkt_reltoken(&pmap->pm_token);
}
}
void
free_pv_entry(pv_entry_t pv)
{
- pv_entry_count--;
+ atomic_add_int(&pv_entry_count, -1);
KKASSERT(pv_entry_count >= 0);
zfree(pvzone, pv);
}
pv_entry_t
get_pv_entry(void)
{
- pv_entry_count++;
+ atomic_add_int(&pv_entry_count, 1);
if (pv_entry_high_water &&
(pv_entry_count > pv_entry_high_water) &&
(pmap_pagedaemon_waken == 0)) {
if (pmap_pagedaemon_waken == 0)
return;
- lwkt_gettoken(&vm_token);
+ pmap_pagedaemon_waken = 0;
if (warningdone < 5) {
kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
warningdone++;
}
- for(i = 0; i < vm_page_array_size; i++) {
+ for (i = 0; i < vm_page_array_size; i++) {
m = &vm_page_array[i];
- if (m->wire_count || m->hold_count || m->busy ||
- (m->flags & PG_BUSY))
+ if (m->wire_count || m->hold_count)
continue;
- pmap_remove_all(m);
+ if (vm_page_busy_try(m, TRUE) == 0) {
+ if (m->wire_count == 0 && m->hold_count == 0) {
+ pmap_remove_all(m);
+ }
+ vm_page_wakeup(m);
+ }
}
- pmap_pagedaemon_waken = 0;
- lwkt_reltoken(&vm_token);
}
* Otherwise we must search the list for the entry. In either case we
* free the now unused entry.
*
- * Caller must hold vm_token
+ * Caller must hold pmap->pm_token
*/
static
int
pmap_remove_entry(struct pmap *pmap, vm_page_t m,
- vm_offset_t va, pmap_inval_info_t info)
+ vm_offset_t va, pmap_inval_info_t info)
{
pv_entry_t pv;
int rtval;
+ spin_lock(&pmap_spin);
if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
if (pmap == pv->pv_pmap && va == pv->pv_va)
KKASSERT(pv);
TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ m->md.pv_generation++;
m->md.pv_list_count--;
- m->object->agg_pv_list_count--;
+ vm_page_spin_lock(m);
+ if (m->object)
+ atomic_add_int(&m->object->agg_pv_list_count, -1);
+ vm_page_spin_unlock(m);
KKASSERT(m->md.pv_list_count >= 0);
if (TAILQ_EMPTY(&m->md.pv_list))
vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
++pmap->pm_generation;
+ spin_unlock(&pmap_spin);
+
rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem, info);
free_pv_entry(pv);
/*
* Create a pv entry for page at pa for (pmap, va).
*
- * Caller must hold vm_token
+ * Caller must hold pmap token
*/
static
void
pv->pv_pmap = pmap;
pv->pv_ptem = mpte;
+ spin_lock(&pmap_spin);
TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
- ++pmap->pm_generation;
+ m->md.pv_generation++;
m->md.pv_list_count++;
- m->object->agg_pv_list_count++;
+ vm_page_spin_lock(m);
+ if (m->object)
+ atomic_add_int(&m->object->agg_pv_list_count, 1);
+ vm_page_spin_unlock(m);
+ pmap->pm_generation++;
+ spin_unlock(&pmap_spin);
}
/*
* pmap_remove_pte: do the things to unmap a page in a process
*
- * Caller must hold vm_token
+ * Caller must hold pmap token
*/
static
int
pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va,
- pmap_inval_info_t info)
+ pmap_inval_info_t info)
{
pt_entry_t oldpte;
vm_page_t m;
+ ASSERT_LWKT_TOKEN_HELD(&pmap->pm_token);
+
pmap_inval_interlock(info, pmap, va);
oldpte = pte_load_clear(ptq);
pmap_inval_deinterlock(info, pmap);
if (oldpte & PG_A)
vm_page_flag_set(m, PG_REFERENCED);
return pmap_remove_entry(pmap, m, va, info);
- } else {
+ }
+/*
+ else {
return pmap_unuse_pt(pmap, va, NULL, info);
}
+*/
return 0;
}
* This function may not be called from an interrupt if the pmap is
* not kernel_pmap.
*
- * Caller must hold vm_token
+ * Caller must hold pmap->pm_token
*/
static
void
{
pt_entry_t *pte;
+ ASSERT_LWKT_TOKEN_HELD(&pmap->pm_token);
+
pte = pmap_pte(pmap, va);
if (pte == NULL)
return;
if (pmap == NULL)
return;
- lwkt_gettoken(&vm_token);
+ vm_object_hold(pmap->pm_pteobj);
+ lwkt_gettoken(&pmap->pm_token);
if (pmap->pm_stats.resident_count == 0) {
- lwkt_reltoken(&vm_token);
+ lwkt_reltoken(&pmap->pm_token);
+ vm_object_drop(pmap->pm_pteobj);
return;
}
if (pde && (*pde & PG_PS) == 0) {
pmap_remove_page(pmap, sva, &info);
pmap_inval_done(&info);
- lwkt_reltoken(&vm_token);
+ lwkt_reltoken(&pmap->pm_token);
+ vm_object_drop(pmap->pm_pteobj);
return;
}
}
}
}
pmap_inval_done(&info);
- lwkt_reltoken(&vm_token);
+ lwkt_reltoken(&pmap->pm_token);
+ vm_object_drop(pmap->pm_pteobj);
}
/*
- * pmap_remove_all:
+ * Removes this physical page from all physical maps in which it resides.
+ * Reflects back modify bits to the pager.
*
- * Removes this physical page from all physical maps in which it resides.
- * Reflects back modify bits to the pager.
- *
- * This routine may not be called from an interrupt.
+ * This routine may not be called from an interrupt.
*/
-
static
void
pmap_remove_all(vm_page_t m)
struct pmap_inval_info info;
pt_entry_t *pte, tpte;
pv_entry_t pv;
+ struct pmap *pmap;
if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
return;
- lwkt_gettoken(&vm_token);
pmap_inval_init(&info);
+ spin_lock(&pmap_spin);
while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
+ /*
+ * We have to be holding the pmap token to interlock
+ * the pte destruction and pv removal. XXX need hold on
+ * pmap.
+ */
+ pmap = pv->pv_pmap;
+ spin_unlock(&pmap_spin);
+ lwkt_gettoken(&pmap->pm_token); /* XXX hold race */
+ spin_lock(&pmap_spin);
+ if (pv != TAILQ_FIRST(&m->md.pv_list)) {
+ spin_unlock(&pmap_spin);
+ lwkt_reltoken(&pmap->pm_token);
+ spin_lock(&pmap_spin);
+ continue;
+ }
+
+ /*
+ * Remove the pv
+ */
+ TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+ TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
+ m->md.pv_generation++;
+ m->md.pv_list_count--;
+ vm_page_spin_lock(m);
+ if (m->object)
+ atomic_add_int(&m->object->agg_pv_list_count, -1);
+ vm_page_spin_unlock(m);
+ KKASSERT(m->md.pv_list_count >= 0);
+ ++pv->pv_pmap->pm_generation;
+ spin_unlock(&pmap_spin);
+
+ /*
+ * pv is now isolated
+ */
KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0);
--pv->pv_pmap->pm_stats.resident_count;
if (tpte & PG_M) {
#if defined(PMAP_DIAGNOSTIC)
if (pmap_nw_modified(tpte)) {
- kprintf(
- "pmap_remove_all: modified page not writable: va: 0x%lx, pte: 0x%lx\n",
- pv->pv_va, tpte);
+ kprintf("pmap_remove_all: modified page not "
+ "writable: va: 0x%lx, pte: 0x%lx\n",
+ pv->pv_va, tpte);
}
#endif
if (pmap_track_modified(pv->pv_va))
- vm_page_dirty(m);
+ vm_page_dirty(m); /* XXX races(m) */
}
- TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
- TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
- ++pv->pv_pmap->pm_generation;
- m->md.pv_list_count--;
- m->object->agg_pv_list_count--;
- KKASSERT(m->md.pv_list_count >= 0);
+
+ spin_lock(&pmap_spin);
if (TAILQ_EMPTY(&m->md.pv_list))
vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+ spin_unlock(&pmap_spin);
+
pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info);
+ lwkt_reltoken(&pv->pv_pmap->pm_token);
+
free_pv_entry(pv);
+ spin_lock(&pmap_spin);
}
+ spin_unlock(&pmap_spin);
KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
pmap_inval_done(&info);
- lwkt_reltoken(&vm_token);
}
/*
if (prot & VM_PROT_WRITE)
return;
- lwkt_gettoken(&vm_token);
+ lwkt_gettoken(&pmap->pm_token);
pmap_inval_init(&info);
for (; sva < eva; sva = va_next) {
-
pml4e = pmap_pml4e(pmap, sva);
if ((*pml4e & PG_V) == 0) {
va_next = (sva + NBPML4) & ~PML4MASK;
}
}
pmap_inval_done(&info);
- lwkt_reltoken(&vm_token);
+ lwkt_reltoken(&pmap->pm_token);
}
/*
#endif
}
- lwkt_gettoken(&vm_token);
+ vm_object_hold(pmap->pm_pteobj);
+ lwkt_gettoken(&pmap->pm_token);
/*
* In the case that a page table page is not
else
mpte = NULL;
- pmap_inval_init(&info);
+ if ((prot & VM_PROT_NOSYNC) == 0)
+ pmap_inval_init(&info);
pde = pmap_pde(pmap, va);
if (pde != NULL && (*pde & PG_V) != 0) {
if ((*pde & PG_PS) != 0)
panic("pmap_enter: attempted pmap_enter on 2MB page");
pte = pmap_pde_to_pte(pde, va);
- } else
+ } else {
panic("pmap_enter: invalid page directory va=%#lx", va);
+ }
KKASSERT(pte != NULL);
pa = VM_PAGE_TO_PHYS(m);
* bits below.
*/
if (mpte)
- mpte->hold_count--;
+ vm_page_unwire_quick(mpte);
/*
* We might be turning off write access to the page,
* Enter on the PV list if part of our managed memory. Note that we
* raise IPL while manipulating pv_table since pmap_enter can be
* called at interrupt time.
+ *
+ * The new mapping covers mpte's new wiring count so we don't
+ * unwire it.
*/
if (pmap_initialized &&
(m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
* to update the pte.
*/
if ((origpte & ~(PG_M|PG_A)) != newpte) {
- pmap_inval_interlock(&info, pmap, va);
+ if ((prot & VM_PROT_NOSYNC) == 0)
+ pmap_inval_interlock(&info, pmap, va);
*pte = newpte | PG_A;
- pmap_inval_deinterlock(&info, pmap);
+ if (prot & VM_PROT_NOSYNC)
+ cpu_invlpg((void *)va);
+ else
+ pmap_inval_deinterlock(&info, pmap);
if (newpte & PG_RW)
vm_page_flag_set(m, PG_WRITEABLE);
}
KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED));
- pmap_inval_done(&info);
- lwkt_reltoken(&vm_token);
+ if ((prot & VM_PROT_NOSYNC) == 0)
+ pmap_inval_done(&info);
+ lwkt_reltoken(&pmap->pm_token);
+ vm_object_drop(pmap->pm_pteobj);
}
/*
pt_entry_t *pte;
vm_paddr_t pa;
vm_page_t mpte;
- vm_pindex_t ptepindex;
- pd_entry_t *ptepa;
pmap_inval_info info;
- lwkt_gettoken(&vm_token);
+ lwkt_gettoken(&pmap->pm_token);
+ vm_object_hold(pmap->pm_pteobj);
pmap_inval_init(&info);
if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
- kprintf("Warning: pmap_enter_quick called on UVA with kernel_pmap\n");
+ kprintf("Warning: pmap_enter_quick called on UVA with "
+ "kernel_pmap\n");
#ifdef DDB
db_print_backtrace();
#endif
}
if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
- kprintf("Warning: pmap_enter_quick called on KVA without kernel_pmap\n");
+ kprintf("Warning: pmap_enter_quick called on KVA without "
+ "kernel_pmap\n");
#ifdef DDB
db_print_backtrace();
#endif
/*
* Calculate the page table page (mpte), allocating it if necessary.
*
- * A held page table page (mpte), or NULL, is passed onto the
+ * A wired page table page (mpte), or NULL, is passed onto the
* section following.
*/
if (va < VM_MAX_USER_ADDRESS) {
- /*
- * Calculate pagetable page index
- */
- ptepindex = pmap_pde_pindex(va);
-
- do {
- /*
- * Get the page directory entry
- */
- ptepa = pmap_pde(pmap, va);
-
- /*
- * If the page table page is mapped, we just increment
- * the hold count, and activate it.
- */
- if (ptepa && (*ptepa & PG_V) != 0) {
- if (*ptepa & PG_PS)
- panic("pmap_enter_quick: unexpected mapping into 2MB page");
-// if (pmap->pm_ptphint &&
-// (pmap->pm_ptphint->pindex == ptepindex)) {
-// mpte = pmap->pm_ptphint;
-// } else {
- mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
- pmap->pm_ptphint = mpte;
-// }
- if (mpte)
- mpte->hold_count++;
- } else {
- mpte = _pmap_allocpte(pmap, ptepindex);
- }
- } while (mpte == NULL);
+ mpte = pmap_allocpte(pmap, va);
} else {
mpte = NULL;
/* this code path is not yet used */
*/
pte = vtopte(va);
if (*pte & PG_V) {
- if (mpte)
- pmap_unwire_pte_hold(pmap, va, mpte, &info);
pa = VM_PAGE_TO_PHYS(m);
KKASSERT(((*pte ^ pa) & PG_FRAME) == 0);
pmap_inval_done(&info);
- lwkt_reltoken(&vm_token);
+ if (mpte)
+ pmap_unwire_pte_hold(pmap, va, mpte, &info);
+ vm_object_drop(pmap->pm_pteobj);
+ lwkt_reltoken(&pmap->pm_token);
return;
}
/*
- * Enter on the PV list if part of our managed memory
+ * Enter on the PV list if part of our managed memory.
+ *
+ * The new mapping covers mpte's new wiring count so we don't
+ * unwire it.
*/
if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
pmap_insert_entry(pmap, va, mpte, m);
*pte = pa | PG_V | PG_U | PG_MANAGED;
/* pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */
pmap_inval_done(&info);
- lwkt_reltoken(&vm_token);
+ vm_object_drop(pmap->pm_pteobj);
+ lwkt_reltoken(&pmap->pm_token);
}
/*
info.mpte = NULL;
info.addr = addr;
info.pmap = pmap;
+ info.desired = 0;
vm_object_hold(object);
- lwkt_gettoken(&vm_token);
vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
pmap_object_init_pt_callback, &info);
- lwkt_reltoken(&vm_token);
vm_object_drop(object);
}
{
struct rb_vm_page_scan_info *info = data;
vm_pindex_t rel_index;
+
/*
* don't allow an madvise to blow away our really
* free pages allocating pv entries.
vmstats.v_free_count < vmstats.v_free_reserved) {
return(-1);
}
+ if (vm_page_busy_try(p, TRUE))
+ return 0;
if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
- (p->busy == 0) && (p->flags & (PG_BUSY | PG_FICTITIOUS)) == 0) {
- vm_page_busy(p);
+ (p->flags & PG_FICTITIOUS) == 0) {
if ((p->queue - p->pc) == PQ_CACHE)
vm_page_deactivate(p);
rel_index = p->pindex - info->start_pindex;
pmap_enter_quick(info->pmap,
info->addr + x86_64_ptob(rel_index), p);
- vm_page_wakeup(p);
}
+ vm_page_wakeup(p);
+ pmap_auto_yield(info);
return(0);
}
pd_entry_t *pde;
int ret;
- lwkt_gettoken(&vm_token);
+ lwkt_gettoken(&pmap->pm_token);
pde = pmap_pde(pmap, addr);
if (pde == NULL || *pde == 0) {
ret = 0;
pte = vtopte(addr);
ret = (*pte) ? 0 : 1;
}
- lwkt_reltoken(&vm_token);
+ lwkt_reltoken(&pmap->pm_token);
return(ret);
}
if (pmap == NULL)
return;
- lwkt_gettoken(&vm_token);
+ lwkt_gettoken(&pmap->pm_token);
pte = pmap_pte(pmap, va);
if (wired && !pmap_pte_w(pte))
else
atomic_clear_long_nonlocked(pte, PG_W);
#endif
- lwkt_reltoken(&vm_token);
+ lwkt_reltoken(&pmap->pm_token);
}