From d2d8515bfb3ec54e781aeef674516e513bfbb8cb Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Thu, 27 Oct 2011 23:50:51 -0700
Subject: [PATCH] kernel - More many-cores SMP work

* Add lwkt_yield() calls in a few critical places which can hog the cpu
  on large many-cores boxes during periods of very heavy contention.
  This allows other kernel threads on the same cpu to run and reduces
  symptoms such as high ping times under certain load conditions.

* Run the callout kernel threads at the same priority as other kernel
  threads so cpu-hogging operations run from callouts can yield to
  other kernel threads (e.g. yield to the netisr threads).

* Change the vm_page_alloc() API to catch situations where the
  allocation races an insertion due to potentially blocking when
  dealing with PQ_CACHE pages.  VM_ALLOC_NULL_OK allows vm_page_alloc()
  to return NULL in this case (otherwise it will panic).

* Change vm_page_insert() to return TRUE if the insertion succeeded and
  FALSE if it didn't due to a race against another thread.

* Change the meaning of the cpuid argument to lwkt_alloc_thread() and
  lwkt_create().  A cpuid of -1 will cause the kernel to choose a cpu
  to run the thread on (instead of choosing the current cpu).
  Eventually this specification will allow dynamic migration (but not
  at the moment).

  Adjust lwp_fork() to specify the current cpu, which is required for
  the initial LWKT calls made while setting up the forked thread.

  Numerous kernel threads will now be spread around available cpus for
  now: devfs core threads, NFS socket threads, etc.  Interrupt threads
  are still fixed on cpu 0 awaiting additional work from Sephe.  Put
  the emergency interrupt thread on the last cpu.

* Change the vm_page_grab() API.  When VM_ALLOC_ZERO is specified the
  vm_page_grab() code will automatically set an invalid page valid and
  zero it (using the PG_ZERO optimization if possible).  Pages which
  are already valid are not zero'd.  This simplifies several use cases.

* Change vm_fault_page() to enter the page into the pmap while the
  vm_map is still locked, instead of after unlocking it.  For now
  anyhow.

* Minor change to ensure that a deterministic value is stored in
  *freebuf in vn_fullpath().

* Minor debugging features added to help track down an x86-64
  seg-fault issue.
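For reviewers, the following caller-side sketch summarizes the contract
changes to vm_page_grab() and vm_page_alloc().  It is illustrative only
and not part of the diff; "obj", "idx" and "m" are placeholder names,
and object locking / page busying details are omitted.

	vm_page_t m;

	/* Old idiom: vm_page_grab() could hand back a non-zero'd page */
	m = vm_page_grab(obj, idx,
			 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
	if ((m->flags & PG_ZERO) == 0)
		vm_page_zero_fill(m);

	/*
	 * New idiom: an invalid page is zero'd and marked valid by
	 * vm_page_grab() itself (using the PG_ZERO optimization when
	 * possible); already-valid pages are left untouched.
	 */
	m = vm_page_grab(obj, idx,
			 VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);

	/*
	 * vm_page_alloc() callers that can tolerate losing a race against
	 * a concurrent insertion at (obj, idx) pass VM_ALLOC_NULL_OK and
	 * retry their lookup; without the flag the collision still panics.
	 */
	m = vm_page_alloc(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_NULL_OK);
	if (m == NULL) {
		vm_wait(0);
		/* re-lookup (obj, idx) and retry */
	}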
--- sys/dev/agp/agp.c | 15 ++-- sys/dev/agp/agp_i810.c | 7 +- sys/kern/kern_fork.c | 11 ++- sys/kern/kern_intr.c | 6 +- sys/kern/kern_kthread.c | 2 + sys/kern/kern_synch.c | 3 + sys/kern/kern_timeout.c | 6 +- sys/kern/lwkt_thread.c | 28 +++--- sys/kern/subr_disk.c | 2 +- sys/kern/uipc_syscalls.c | 3 +- sys/kern/vfs_cache.c | 1 + sys/platform/pc32/i386/pmap.c | 38 ++------ sys/platform/pc64/x86_64/trap.c | 34 +++++++- sys/platform/vkernel/platform/pmap.c | 15 +--- sys/platform/vkernel64/platform/pmap.c | 23 ++--- sys/sys/kthread.h | 2 - sys/vfs/devfs/devfs_core.c | 2 +- sys/vfs/nfs/nfs_vfsops.c | 4 +- sys/vm/device_pager.c | 5 +- sys/vm/swap_pager.c | 9 ++ sys/vm/vm_contig.c | 6 +- sys/vm/vm_fault.c | 61 +++++++++---- sys/vm/vm_kern.c | 7 +- sys/vm/vm_map.c | 5 +- sys/vm/vm_object.c | 12 ++- sys/vm/vm_page.c | 115 +++++++++++++++++-------- sys/vm/vm_page.h | 15 ++-- sys/vm/vm_pageout.c | 5 ++ sys/vm/vm_swapcache.c | 2 +- 29 files changed, 262 insertions(+), 182 deletions(-) diff --git a/sys/dev/agp/agp.c b/sys/dev/agp/agp.c index 77aec71f02..c4074cddbd 100644 --- a/sys/dev/agp/agp.c +++ b/sys/dev/agp/agp.c @@ -533,16 +533,15 @@ agp_generic_bind_memory(device_t dev, struct agp_memory *mem, */ for (i = 0; i < mem->am_size; i += PAGE_SIZE) { /* - * Find a page from the object and wire it - * down. This page will be mapped using one or more - * entries in the GATT (assuming that PAGE_SIZE >= - * AGP_PAGE_SIZE. If this is the first call to bind, - * the pages will be allocated and zeroed. + * Find a page from the object and wire it down. This page + * will be mapped using one or more entries in the GATT + * (assuming that PAGE_SIZE >= AGP_PAGE_SIZE. If this is + * the first call to bind, the pages will be allocated + * and zeroed. */ m = vm_page_grab(mem->am_obj, OFF_TO_IDX(i), - VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); - if ((m->flags & PG_ZERO) == 0) - vm_page_zero_fill(m); + VM_ALLOC_NORMAL | VM_ALLOC_ZERO | + VM_ALLOC_RETRY); AGP_DPF("found page pa=%#x\n", VM_PAGE_TO_PHYS(m)); vm_page_wire(m); diff --git a/sys/dev/agp/agp_i810.c b/sys/dev/agp/agp_i810.c index 7e457da1d9..9c9aa16856 100644 --- a/sys/dev/agp/agp_i810.c +++ b/sys/dev/agp/agp_i810.c @@ -969,10 +969,9 @@ agp_i810_alloc_memory(device_t dev, int type, vm_size_t size) */ vm_page_t m; - m = vm_page_grab(mem->am_obj, 0, - VM_ALLOC_NORMAL|VM_ALLOC_ZERO|VM_ALLOC_RETRY); - if ((m->flags & PG_ZERO) == 0) - vm_page_zero_fill(m); + m = vm_page_grab(mem->am_obj, 0, VM_ALLOC_NORMAL | + VM_ALLOC_ZERO | + VM_ALLOC_RETRY); vm_page_wire(m); mem->am_physical = VM_PAGE_TO_PHYS(m); vm_page_wakeup(m); diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 4a8a16ce10..cc0edff7a3 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -607,6 +607,7 @@ done: static struct lwp * lwp_fork(struct lwp *origlp, struct proc *destproc, int flags) { + globaldata_t gd = mycpu; struct lwp *lp; struct thread *td; @@ -627,13 +628,16 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags) * scheduler specific data. */ crit_enter(); - lp->lwp_cpbase = mycpu->gd_schedclock.time - - mycpu->gd_schedclock.periodic; + lp->lwp_cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic; destproc->p_usched->heuristic_forking(origlp, lp); crit_exit(); lp->lwp_cpumask &= usched_mastermask; - td = lwkt_alloc_thread(NULL, LWKT_THREAD_STACK, -1, 0); + /* + * Assign the thread to the current cpu to begin with so we + * can manipulate it. 
+ */ + td = lwkt_alloc_thread(NULL, LWKT_THREAD_STACK, gd->gd_cpuid, 0); lp->lwp_thread = td; td->td_proc = destproc; td->td_lwp = lp; @@ -661,7 +665,6 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags) destproc->p_lasttid = lp->lwp_tid; destproc->p_nthreads++; - return (lp); } diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c index 8a1383f841..101bc88a27 100644 --- a/sys/kern/kern_intr.c +++ b/sys/kern/kern_intr.c @@ -254,7 +254,7 @@ register_int(int intr, inthand2_t *handler, void *arg, const char *name, */ if (emergency_intr_thread.td_kstack == NULL) { lwkt_create(ithread_emergency, NULL, NULL, &emergency_intr_thread, - TDF_STOPREQ | TDF_INTTHREAD, -1, "ithread emerg"); + TDF_STOPREQ | TDF_INTTHREAD, ncpus - 1, "ithread emerg"); systimer_init_periodic_nq(&emergency_intr_timer, emergency_intr_timer_callback, &emergency_intr_thread, (emergency_intr_enable ? emergency_intr_freq : 1)); @@ -265,11 +265,13 @@ register_int(int intr, inthand2_t *handler, void *arg, const char *name, /* * Create an interrupt thread if necessary, leave it in an unscheduled * state. + * + * Put it on cpu 0 for now, other work is pending related to this. */ if (info->i_state == ISTATE_NOTHREAD) { info->i_state = ISTATE_NORMAL; lwkt_create(ithread_handler, (void *)(intptr_t)intr, NULL, - &info->i_thread, TDF_STOPREQ | TDF_INTTHREAD, -1, + &info->i_thread, TDF_STOPREQ | TDF_INTTHREAD, 0, "ithread %d", intr); if (intr >= FIRST_SOFTINT) lwkt_setpri(&info->i_thread, TDPRI_SOFT_NORM); diff --git a/sys/kern/kern_kthread.c b/sys/kern/kern_kthread.c index 3f731b6af2..a3b6e6c0ed 100644 --- a/sys/kern/kern_kthread.c +++ b/sys/kern/kern_kthread.c @@ -104,6 +104,7 @@ kthread_create_cpu(void (*func)(void *), void *arg, return 0; } +#if 0 /* * Same as kthread_create() but you can specify a custom stack size. */ @@ -126,6 +127,7 @@ kthread_create_stk(void (*func)(void *), void *arg, lwkt_schedule(td); return 0; } +#endif /* * Destroy an LWKT thread. Warning! This function is not called when diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index 90a1d06f68..30e4c07fb6 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -233,6 +233,7 @@ schedcpu_stats(struct proc *p, void *data __unused) } } lwkt_reltoken(&p->p_token); + lwkt_yield(); PRELE(p); return(0); } @@ -289,6 +290,7 @@ schedcpu_resource(struct proc *p, void *data __unused) break; } lwkt_reltoken(&p->p_token); + lwkt_yield(); PRELE(p); return(0); } @@ -1216,6 +1218,7 @@ loadav_count_runnable(struct lwp *lp, void *data) default: break; } + lwkt_yield(); return(0); } diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c index 33379da852..5ca8f2df73 100644 --- a/sys/kern/kern_timeout.c +++ b/sys/kern/kern_timeout.c @@ -254,7 +254,11 @@ softclock_handler(void *arg) int mpsafe = 1; #endif - lwkt_setpri_self(TDPRI_SOFT_NORM); + /* + * Run the callout thread at the same priority as other kernel + * threads so it can be round-robined. 
+ */ + /*lwkt_setpri_self(TDPRI_SOFT_NORM);*/ sc = arg; crit_enter(); diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index 69251f29c0..e3dd7705ce 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -377,6 +377,7 @@ lwkt_gdinit(struct globaldata *gd) thread_t lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags) { + static int cpu_rotator; globaldata_t gd = mycpu; void *stack; @@ -416,10 +417,12 @@ lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags) stack = (void *)kmem_alloc_stack(&kernel_map, stksize); flags |= TDF_ALLOCATED_STACK; } - if (cpu < 0) - lwkt_init_thread(td, stack, stksize, flags, gd); - else - lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu)); + if (cpu < 0) { + cpu = ++cpu_rotator; + cpu_ccfence(); + cpu %= ncpus; + } + lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu)); return(td); } @@ -1181,11 +1184,12 @@ lwkt_passive_release(struct thread *td) /* - * This implements a normal yield. This routine is virtually a nop if - * there is nothing to yield to but it will always run any pending interrupts - * if called from a critical section. + * This implements a LWKT yield, allowing a kernel thread to yield to other + * kernel threads at the same or higher priority. This function can be + * called in a tight loop and will typically only yield once per tick. * - * This yield is designed for kernel threads without a user context. + * Most kernel threads run at the same priority in order to allow equal + * sharing. * * (self contained on a per cpu basis) */ @@ -1450,11 +1454,11 @@ lwkt_deschedule(thread_t td) void lwkt_setpri(thread_t td, int pri) { - KKASSERT(td->td_gd == mycpu); if (td->td_pri != pri) { KKASSERT(pri >= 0); crit_enter(); if (td->td_flags & TDF_RUNQ) { + KKASSERT(td->td_gd == mycpu); _lwkt_dequeue(td); td->td_pri = pri; _lwkt_enqueue(td); @@ -1640,9 +1644,9 @@ lwkt_preempted_proc(void) * Create a kernel process/thread/whatever. It shares it's address space * with proc0 - ie: kernel only. * - * NOTE! By default new threads are created with the MP lock held. A - * thread which does not require the MP lock should release it by calling - * rel_mplock() at the start of the new thread. + * If the cpu is not specified one will be selected. In the future + * specifying a cpu of -1 will enable kernel thread migration between + * cpus. 
*/ int lwkt_create(void (*func)(void *), void *arg, struct thread **tdp, diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c index ea6c6a3525..16dfc1dc25 100644 --- a/sys/kern/subr_disk.c +++ b/sys/kern/subr_disk.c @@ -1395,7 +1395,7 @@ disk_init(void) lwkt_gettoken(&disklist_token); lwkt_create(disk_msg_core, /*args*/NULL, &td_core, NULL, - 0, 0, "disk_msg_core"); + 0, -1, "disk_msg_core"); tsleep(td_core, 0, "diskcore", 0); lwkt_reltoken(&disklist_token); } diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index cae66a2c65..561233b2f6 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -1611,7 +1611,8 @@ retry_lookup: goto retry_lookup; } if (pg == NULL) { - pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); + pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL | + VM_ALLOC_NULL_OK); if (pg == NULL) { vm_wait(0); vm_object_drop(obj); diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 8dd32b3c80..5b92fe5087 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -3312,6 +3312,7 @@ vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, char **freebuf, struct nchandle nch; int error; + *freebuf = NULL; atomic_add_int(&numfullpathcalls, 1); if (disablefullpath) return (ENODEV); diff --git a/sys/platform/pc32/i386/pmap.c b/sys/platform/pc32/i386/pmap.c index 45e3a85978..a06fc78377 100644 --- a/sys/platform/pc32/i386/pmap.c +++ b/sys/platform/pc32/i386/pmap.c @@ -1197,21 +1197,15 @@ pmap_pinit(struct pmap *pmap) */ if ((ptdpg = pmap->pm_pdirm) == NULL) { ptdpg = vm_page_grab(pmap->pm_pteobj, PTDPTDI, - VM_ALLOC_NORMAL | VM_ALLOC_RETRY); + VM_ALLOC_NORMAL | VM_ALLOC_RETRY | + VM_ALLOC_ZERO); pmap->pm_pdirm = ptdpg; vm_page_flag_clear(ptdpg, PG_MAPPED); vm_page_wire(ptdpg); - ptdpg->valid = VM_PAGE_BITS_ALL; + KKASSERT(ptdpg->valid == VM_PAGE_BITS_ALL); pmap_kenter((vm_offset_t)pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); vm_page_wakeup(ptdpg); } - if ((ptdpg->flags & PG_ZERO) == 0) - bzero(pmap->pm_pdir, PAGE_SIZE); -#ifdef PMAP_DEBUG - else - pmap_page_assertzero(VM_PAGE_TO_PHYS(ptdpg)); -#endif - pmap->pm_pdir[MPPTDI] = PTD[MPPTDI]; /* install self-referential address mapping entry */ @@ -1357,10 +1351,11 @@ _pmap_allocpte(pmap_t pmap, unsigned ptepindex) vm_page_t m; /* - * Find or fabricate a new pagetable page + * Find or fabricate a new pagetable page. Setting VM_ALLOC_ZERO + * will zero any new page and mark it valid. */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, - VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); + VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); @@ -1405,27 +1400,6 @@ _pmap_allocpte(pmap_t pmap, unsigned ptepindex) * Set the page table hint */ pmap->pm_ptphint = m; - - /* - * Try to use the new mapping, but if we cannot, then - * do it with the routine that maps the page explicitly. 
- */ - if (m->valid == 0) { - if ((m->flags & PG_ZERO) == 0) { - if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) == - (((unsigned) PTDpde) & PG_FRAME)) { - pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex); - bzero((caddr_t) pteva, PAGE_SIZE); - } else { - pmap_zero_page(ptepa); - } - } - m->valid = VM_PAGE_BITS_ALL; - vm_page_flag_clear(m, PG_ZERO); - } else { - KKASSERT((m->flags & PG_ZERO) == 0); - } - vm_page_flag_set(m, PG_MAPPED); vm_page_wakeup(m); diff --git a/sys/platform/pc64/x86_64/trap.c b/sys/platform/pc64/x86_64/trap.c index a0e0b878ea..863005d7f0 100644 --- a/sys/platform/pc64/x86_64/trap.c +++ b/sys/platform/pc64/x86_64/trap.c @@ -79,6 +79,7 @@ #include #include #include +#include #include #include #include @@ -151,6 +152,9 @@ SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW, static int ddb_on_seg_fault = 0; SYSCTL_INT(_machdep, OID_AUTO, ddb_on_seg_fault, CTLFLAG_RW, &ddb_on_seg_fault, 0, "Go to DDB on user seg-fault"); +static int freeze_on_seg_fault = 0; +SYSCTL_INT(_machdep, OID_AUTO, freeze_on_seg_fault, CTLFLAG_RW, + &freeze_on_seg_fault, 0, "Go to DDB on user seg-fault"); #endif static int panic_on_nmi = 1; SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW, @@ -162,6 +166,15 @@ static int slow_release; SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW, &slow_release, 0, "Passive Release was nonoptimal"); +/* + * System call debugging records the worst-case system call + * overhead (inclusive of blocking), but may be inaccurate. + */ +/*#define SYSCALL_DEBUG*/ +#ifdef SYSCALL_DEBUG +uint64_t SysCallsWorstCase[SYS_MAXSYSCALL]; +#endif + /* * Passively intercepts the thread switch function to increase * the thread priority from a user priority to a kernel priority, reducing @@ -490,8 +503,12 @@ trap(struct trapframe *frame) case T_PAGEFLT: /* page fault */ i = trap_pfault(frame, TRUE); - if (frame->tf_rip == 0) + if (frame->tf_rip == 0) { kprintf("T_PAGEFLT: Warning %%rip == 0!\n"); + while (freeze_on_seg_fault) { + tsleep(p, 0, "freeze", hz * 20); + } + } if (i == -1) goto out; if (i == 0) @@ -883,14 +900,18 @@ nogo: */ p = td->td_proc; if (td->td_lwp->lwp_vkernel == NULL) { - if (bootverbose) + if (bootverbose || freeze_on_seg_fault || ddb_on_seg_fault) { kprintf("seg-fault ft=%04x ff=%04x addr=%p rip=%p " "pid=%d p_comm=%s\n", ftype, fault_flags, (void *)frame->tf_addr, (void *)frame->tf_rip, p->p_pid, p->p_comm); + } #ifdef DDB + while (freeze_on_seg_fault) { + tsleep(p, 0, "freeze", hz * 20); + } if (ddb_on_seg_fault) Debugger("ddb_on_seg_fault"); #endif @@ -1185,7 +1206,16 @@ syscall2(struct trapframe *frame) * NOTE: All system calls run MPSAFE now. The system call itself * is responsible for getting the MP lock. 
*/ +#ifdef SYSCALL_DEBUG + uint64_t tscval = rdtsc(); +#endif error = (*callp->sy_call)(&args); +#ifdef SYSCALL_DEBUG + tscval = rdtsc() - tscval; + tscval = tscval * 1000000 / tsc_frequency; + if (SysCallsWorstCase[code] < tscval) + SysCallsWorstCase[code] = tscval; +#endif out: /* diff --git a/sys/platform/vkernel/platform/pmap.c b/sys/platform/vkernel/platform/pmap.c index 1f19d6e402..8bfe86c6de 100644 --- a/sys/platform/vkernel/platform/pmap.c +++ b/sys/platform/vkernel/platform/pmap.c @@ -220,21 +220,17 @@ pmap_pinit(struct pmap *pmap) * allocate the page directory page */ ptdpg = vm_page_grab(pmap->pm_pteobj, pmap->pm_pdindex, - VM_ALLOC_NORMAL | VM_ALLOC_RETRY); + VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO); ptdpg->wire_count = 1; atomic_add_int(&vmstats.v_wire_count, 1); /* not usually mapped */ - ptdpg->valid = VM_PAGE_BITS_ALL; vm_page_flag_clear(ptdpg, PG_MAPPED); vm_page_wakeup(ptdpg); pmap_kenter((vm_offset_t)pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg)); pmap->pm_pdirpte = KernelPTA[(vm_offset_t)pmap->pm_pdir >> PAGE_SHIFT]; - if ((ptdpg->flags & PG_ZERO) == 0) - bzero(pmap->pm_pdir, PAGE_SIZE); - vm_page_flag_clear(ptdpg, PG_ZERO); pmap->pm_count = 1; pmap->pm_active = 0; @@ -1146,15 +1142,6 @@ _pmap_allocpte(pmap_t pmap, unsigned ptepindex) */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); - - if (m->valid == 0) { - if ((m->flags & PG_ZERO) == 0) - pmap_zero_page(VM_PAGE_TO_PHYS(m)); - m->valid = VM_PAGE_BITS_ALL; - vm_page_flag_clear(m, PG_ZERO); - } else { - KKASSERT((m->flags & PG_ZERO) == 0); - } vm_page_flag_set(m, PG_MAPPED); KASSERT(m->queue == PQ_NONE, diff --git a/sys/platform/vkernel64/platform/pmap.c b/sys/platform/vkernel64/platform/pmap.c index 4a7b41d53f..499b553885 100644 --- a/sys/platform/vkernel64/platform/pmap.c +++ b/sys/platform/vkernel64/platform/pmap.c @@ -1084,21 +1084,18 @@ pmap_pinit(struct pmap *pmap) * already be set appropriately. */ if ((ptdpg = pmap->pm_pdirm) == NULL) { - ptdpg = vm_page_grab(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I, - VM_ALLOC_NORMAL | VM_ALLOC_RETRY); + ptdpg = vm_page_grab(pmap->pm_pteobj, + NUPDE + NUPDPE + PML4PML4I, + VM_ALLOC_NORMAL | VM_ALLOC_RETRY | + VM_ALLOC_ZERO); pmap->pm_pdirm = ptdpg; vm_page_flag_clear(ptdpg, PG_MAPPED); - ptdpg->valid = VM_PAGE_BITS_ALL; if (ptdpg->wire_count == 0) atomic_add_int(&vmstats.v_wire_count, 1); ptdpg->wire_count = 1; vm_page_wakeup(ptdpg); pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg)); } - if ((ptdpg->flags & PG_ZERO) == 0) - bzero(pmap->pm_pml4, PAGE_SIZE); - vm_page_flag_clear(ptdpg, PG_ZERO); - pmap->pm_count = 1; pmap->pm_active = 0; pmap->pm_ptphint = NULL; @@ -1270,17 +1267,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex) * races by checking m->valid. */ m = vm_page_grab(pmap->pm_pteobj, ptepindex, - VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); - - if (m->valid == 0) { - if ((m->flags & PG_ZERO) == 0) { - pmap_zero_page(VM_PAGE_TO_PHYS(m)); - } - m->valid = VM_PAGE_BITS_ALL; - vm_page_flag_clear(m, PG_ZERO); - } else { - KKASSERT((m->flags & PG_ZERO) == 0); - } + VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY); KASSERT(m->queue == PQ_NONE, ("_pmap_allocpte: %p->queue != PQ_NONE", m)); diff --git a/sys/sys/kthread.h b/sys/sys/kthread.h index 3690c15258..3b6d6d9d50 100644 --- a/sys/sys/kthread.h +++ b/sys/sys/kthread.h @@ -63,8 +63,6 @@ int kthread_create (void (*)(void *), void *, struct thread **, const char *, ...) 
__printflike(4, 5); int kthread_create_cpu (void (*)(void *), void *, struct thread **, int, const char *, ...) __printflike(5, 6); -int kthread_create_stk (void (*)(void *), void *, struct thread **, - int, const char *, ...) __printflike(5, 6); void kthread_exit (void) __dead2; #endif /* _KERNEL */ diff --git a/sys/vfs/devfs/devfs_core.c b/sys/vfs/devfs/devfs_core.c index f5ddfe5406..8fd99928fa 100644 --- a/sys/vfs/devfs/devfs_core.c +++ b/sys/vfs/devfs/devfs_core.c @@ -2500,7 +2500,7 @@ devfs_init(void) lockmgr(&devfs_lock, LK_EXCLUSIVE); lwkt_create(devfs_msg_core, /*args*/NULL, &td_core, NULL, - 0, 0, "devfs_msg_core"); + 0, -1, "devfs_msg_core"); while (devfs_run == 0) lksleep(td_core, &devfs_lock, 0, "devfsc", 0); lockmgr(&devfs_lock, LK_RELEASE); diff --git a/sys/vfs/nfs/nfs_vfsops.c b/sys/vfs/nfs/nfs_vfsops.c index 4ebf91da8e..bbdbdbbd6b 100644 --- a/sys/vfs/nfs/nfs_vfsops.c +++ b/sys/vfs/nfs/nfs_vfsops.c @@ -1157,8 +1157,8 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, txcpu = 1; break; default: - rxcpu = 1; - txcpu = 2; + rxcpu = -1; + txcpu = -1; break; } #else diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c index e3900bc533..bb64a1afd2 100644 --- a/sys/vm/device_pager.c +++ b/sys/vm/device_pager.c @@ -214,7 +214,10 @@ dev_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess) page, pageq); vm_object_hold(object); vm_page_free(*mpp); - vm_page_insert(page, object, offset); + if (vm_page_insert(page, object, offset) == FALSE) { + panic("dev_pager_getpage: page (%p,%ld) exists", + object, offset); + } vm_object_drop(object); } mtx_unlock(&dev_pager_mtx); diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 7d02dd2c01..da6e7f734e 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -640,6 +640,7 @@ swap_pager_condfree_callback(struct swblock *swap, void *data) --info->endi; if ((int)info->begi < 0 || (int)info->endi < 0) return(-1); + lwkt_yield(); return(0); } @@ -1225,6 +1226,10 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess) vm_object_drop(object); return(VM_PAGER_OK); } else if (m == NULL) { + /* + * Use VM_ALLOC_QUICK to avoid blocking on cache + * page reuse. + */ m = vm_page_alloc(object, mreq->pindex + 1, VM_ALLOC_QUICK); if (m == NULL) { @@ -1273,6 +1278,10 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess) if (error) { break; } else if (m == NULL) { + /* + * Use VM_ALLOC_QUICK to avoid blocking on cache + * page reuse. + */ m = vm_page_alloc(object, mreq->pindex + i, VM_ALLOC_QUICK); if (m == NULL) diff --git a/sys/vm/vm_contig.c b/sys/vm/vm_contig.c index 313f0887a5..811abc60b4 100644 --- a/sys/vm/vm_contig.c +++ b/sys/vm/vm_contig.c @@ -489,7 +489,11 @@ vm_contig_pg_kmap(int start, u_long size, vm_map_t map, int flags) tmp_addr = addr; for (i = start; i < (start + size / PAGE_SIZE); i++) { vm_page_t m = &pga[i]; - vm_page_insert(m, &kernel_object, OFF_TO_IDX(tmp_addr)); + if (vm_page_insert(m, &kernel_object, OFF_TO_IDX(tmp_addr)) == + FALSE) { + panic("vm_contig_pg_kmap: page already exists @%p", + (void *)(intptr_t)tmp_addr); + } if ((flags & M_ZERO) && !(m->flags & PG_ZERO)) pmap_zero_page(VM_PAGE_TO_PHYS(m)); m->flags = 0; diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index bc12340468..1ff19aaa38 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -656,9 +656,18 @@ RetryFault: goto done; } + /* + * Update the pmap. We really only have to do this if a COW + * occured to replace the read-only page with the new page. 
For + * now just do it unconditionally. XXX + */ + pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired); + vm_page_flag_set(fs.m, PG_REFERENCED); + /* * On success vm_fault_object() does not unlock or deallocate, and fs.m - * will contain a busied page. + * will contain a busied page. So we must unlock here after having + * messed with the pmap. */ unlock_things(&fs); @@ -673,14 +682,6 @@ RetryFault: if (fault_type & VM_PROT_WRITE) vm_page_dirty(fs.m); - /* - * Update the pmap. We really only have to do this if a COW - * occured to replace the read-only page with the new page. For - * now just do it unconditionally. XXX - */ - pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired); - vm_page_flag_set(fs.m, PG_REFERENCED); - /* * Unbusy the page by activating it. It remains held and will not * be reclaimed. @@ -807,8 +808,8 @@ RetryFault: } /* - * On success vm_fault_object() does not unlock or deallocate, and fs.m - * will contain a busied page. + * On success vm_fault_object() does not unlock or deallocate, so we + * do it here. Note that the returned fs.m will be busied. */ unlock_things(&fs); @@ -1023,6 +1024,9 @@ vm_fault_object(struct faultstate *fs, for (;;) { /* + * The entire backing chain from first_object to object + * inclusive is chainlocked. + * * If the object is dead, we stop here */ if (fs->object->flags & OBJ_DEAD) { @@ -1153,13 +1157,17 @@ vm_fault_object(struct faultstate *fs, /* * Allocate a new page for this object/offset pair. + * + * It is possible for the allocation to race, so + * handle the case. */ fs->m = NULL; if (!vm_page_count_severe()) { fs->m = vm_page_alloc(fs->object, pindex, ((fs->vp || fs->object->backing_object) ? - VM_ALLOC_NORMAL : - VM_ALLOC_NORMAL | VM_ALLOC_ZERO)); + VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL : + VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL | + VM_ALLOC_ZERO)); } if (fs->m == NULL) { vm_object_pip_wakeup(fs->first_object); @@ -1189,8 +1197,8 @@ readrest: * pager has it, and potentially fault in additional pages * at the same time. * - * We are NOT in splvm here and if TRYPAGER is true then - * fs.m will be non-NULL and will be PG_BUSY for us. + * If TRYPAGER is true then fs.m will be non-NULL and busied + * for us. */ if (TRYPAGER(fs)) { int rv; @@ -1870,7 +1878,8 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, */ do { dst_m = vm_page_alloc(dst_object, - OFF_TO_IDX(dst_offset), VM_ALLOC_NORMAL); + OFF_TO_IDX(dst_offset), + VM_ALLOC_NORMAL); if (dst_m == NULL) { vm_wait(0); } @@ -1999,7 +2008,8 @@ vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead, i = 0; while (tpindex < pindex) { - rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM); + rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM | + VM_ALLOC_NULL_OK); if (rtm == NULL) { for (j = 0; j < i; j++) { vm_page_free(marray[j]); @@ -2037,7 +2047,8 @@ vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead, while (tpindex < endpindex) { if (vm_page_lookup(object, tpindex)) break; - rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM); + rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM | + VM_ALLOC_NULL_OK); if (rtm == NULL) break; marray[i] = rtm; @@ -2081,6 +2092,7 @@ vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead, * vm_map_entry via the normal fault code. Do NOT call this * shortcut unless the normal fault code has run on this entry. * + * The related map must be locked. * No other requirements. 
*/ static int vm_prefault_pages = 8; @@ -2159,6 +2171,13 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot) int allocated = 0; int error; + /* + * This can eat a lot of time on a heavily contended + * machine so yield on the tick if needed. + */ + if ((i & 7) == 7) + lwkt_yield(); + /* * Calculate the page to pre-fault, stopping the scan in * each direction separately if the limit is reached. @@ -2237,7 +2256,11 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot) * NOTE: Allocated from base object */ m = vm_page_alloc(object, index, - VM_ALLOC_NORMAL | VM_ALLOC_ZERO); + VM_ALLOC_NORMAL | + VM_ALLOC_ZERO | + VM_ALLOC_NULL_OK); + if (m == NULL) + break; if ((m->flags & PG_ZERO) == 0) { vm_page_zero_fill(m); diff --git a/sys/vm/vm_kern.c b/sys/vm/vm_kern.c index a0e616c6ac..5384b85eff 100644 --- a/sys/vm/vm_kern.c +++ b/sys/vm/vm_kern.c @@ -223,11 +223,8 @@ kmem_alloc3(vm_map_t map, vm_size_t size, int kmflags) vm_page_t mem; mem = vm_page_grab(&kernel_object, OFF_TO_IDX(addr + i), - VM_ALLOC_ZERO | VM_ALLOC_NORMAL | VM_ALLOC_RETRY); - if ((mem->flags & PG_ZERO) == 0) - vm_page_zero_fill(mem); - mem->valid = VM_PAGE_BITS_ALL; - vm_page_flag_clear(mem, PG_ZERO); + VM_ALLOC_FORCE_ZERO | VM_ALLOC_NORMAL | + VM_ALLOC_RETRY); vm_page_wakeup(mem); } vm_object_drop(&kernel_object); diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c index b1240c8c18..437de341be 100644 --- a/sys/vm/vm_map.c +++ b/sys/vm/vm_map.c @@ -3109,8 +3109,11 @@ vm_map_split(vm_map_entry_t entry) * Copies the contents of the source entry to the destination * entry. The entries *must* be aligned properly. * - * The vm_map must be exclusively locked. + * The vm_maps must be exclusively locked. * The vm_map's token must be held. + * + * Because the maps are locked no faults can be in progress during the + * operation. */ static void vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map, diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index 99cabb954f..7487d6f6e9 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -431,14 +431,11 @@ vm_object_chain_release(vm_object_t object) } /* - * This releases the entire chain starting with object and recursing - * through backing_object until stopobj is encountered. stopobj is - * not released. The caller will typically release stopobj manually - * before making this call (as the deepest object is the most likely - * to collide with other threads). + * This releases the entire chain of objects from first_object to and + * including stopobj, flowing through object->backing_object. * - * object and stopobj must be held by the caller. This code looks a - * bit odd but has been optimized fairly heavily. + * We release stopobj first as an optimization as this object is most + * likely to be shared across multiple processes. */ void vm_object_chain_release_all(vm_object_t first_object, vm_object_t stopobj) @@ -868,6 +865,7 @@ vm_object_terminate_callback(vm_page_t p, void *data __unused) vm_page_remove(p); vm_page_wakeup(p); } + lwkt_yield(); return(0); } diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 2e5c1872bb..87e8e6c422 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -755,8 +755,11 @@ vm_page_unhold(vm_page_t m) * This routine may not block. * This routine must be called with the vm_object held. * This routine must be called with a critical section held. + * + * This routine returns TRUE if the page was inserted into the object + * successfully, and FALSE if the page already exists in the object. 
*/ -void +int vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) { ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); @@ -764,7 +767,6 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) panic("vm_page_insert: already inserted"); object->generation++; - object->resident_page_count++; /* * Record the object/offset pair in this page and add the @@ -775,7 +777,13 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) vm_page_spin_lock(m); m->object = object; m->pindex = pindex; - vm_page_rb_tree_RB_INSERT(&object->rb_memq, m); + if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) { + m->object = NULL; + m->pindex = 0; + vm_page_spin_unlock(m); + return FALSE; + } + object->resident_page_count++; /* atomic_add_int(&object->agg_pv_list_count, m->md.pv_list_count); */ vm_page_spin_unlock(m); @@ -790,6 +798,7 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex) * Checks for a swap assignment and sets PG_SWAPPED if appropriate. */ swap_pager_page_inserted(m); + return TRUE; } /* @@ -990,7 +999,10 @@ vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex) ASSERT_LWKT_TOKEN_HELD(vm_object_token(m->object)); vm_page_remove(m); } - vm_page_insert(m, new_object, new_pindex); + if (vm_page_insert(m, new_object, new_pindex) == FALSE) { + panic("vm_page_rename: target exists (%p,%ld)", + new_object, new_pindex); + } if (m->queue - m->pc == PQ_CACHE) vm_page_deactivate(m); vm_page_dirty(m); @@ -1236,17 +1248,20 @@ vm_page_select_free(u_short pg_color, boolean_t prefer_zero) * Allocate and return a memory cell associated with this VM object/offset * pair. If object is NULL an unassociated page will be allocated. * - * page_req classes: + * The returned page will be busied and removed from its queues. This + * routine can block and may return NULL if a race occurs and the page + * is found to already exist at the specified (object, pindex). * * VM_ALLOC_NORMAL allow use of cache pages, nominal free drain * VM_ALLOC_QUICK like normal but cannot use cache * VM_ALLOC_SYSTEM greater free drain * VM_ALLOC_INTERRUPT allow free list to be completely drained - * VM_ALLOC_ZERO advisory request for pre-zero'd page - * - * The object must be locked if not NULL + * VM_ALLOC_ZERO advisory request for pre-zero'd page only + * VM_ALLOC_FORCE_ZERO advisory request for pre-zero'd page only + * VM_ALLOC_NULL_OK ok to return NULL on insertion collision + * (see vm_page_grab()) + * The object must be held if not NULL * This routine may not block - * The returned page will be marked PG_BUSY * * Additional special handling is required when called from an interrupt * (VM_ALLOC_INTERRUPT). We are not allowed to mess with the page cache @@ -1265,8 +1280,6 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req) if (object) { pg_color = mycpu->gd_cpuid + (pindex & ~ncpus_fit_mask) + (object->pg_color & ~ncpus_fit_mask); - KASSERT(vm_page_lookup(object, pindex) == NULL, - ("vm_page_alloc: page already allocated")); } else { pg_color = mycpu->gd_cpuid + (pindex & ~ncpus_fit_mask); } @@ -1276,8 +1289,6 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req) */ if (object) { pg_color = object->pg_color + pindex; - KASSERT(vm_page_lookup(object, pindex) == NULL, - ("vm_page_alloc: page already allocated")); } else { pg_color = pindex; } @@ -1302,7 +1313,7 @@ loop: /* * The free queue has sufficient free pages to take one out. 
*/ - if (page_req & VM_ALLOC_ZERO) + if (page_req & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) m = vm_page_select_free(pg_color, TRUE); else m = vm_page_select_free(pg_color, FALSE); @@ -1328,7 +1339,7 @@ loop: */ if (m != NULL) { KASSERT(m->dirty == 0, - ("Found dirty cache page %p", m)); + ("Found dirty cache page %p", m)); vm_page_protect(m, VM_PROT_NONE); vm_page_free(m); goto loop; @@ -1354,26 +1365,25 @@ loop: } /* - * Good page found. The page has already been busied for us. - * * v_free_count can race so loop if we don't find the expected * page. */ if (m == NULL) goto loop; - KASSERT(m->dirty == 0, - ("vm_page_alloc: free/cache page %p was dirty", m)); /* - * NOTE: page has already been removed from its queue and busied. + * Good page found. The page has already been busied for us and + * removed from its queues. */ + KASSERT(m->dirty == 0, + ("vm_page_alloc: free/cache page %p was dirty", m)); KKASSERT(m->queue == PQ_NONE); /* - * Initialize structure. Only the PG_ZERO flag is inherited. Set - * the page PG_BUSY + * Initialize the structure, inheriting some flags but clearing + * all the rest. The page has already been busied for us. */ - vm_page_flag_clear(m, ~(PG_ZERO | PG_BUSY)); + vm_page_flag_clear(m, ~(PG_ZERO | PG_BUSY | PG_SBUSY)); KKASSERT(m->wire_count == 0); KKASSERT(m->busy == 0); m->act_count = 0; @@ -1389,10 +1399,18 @@ loop: * NOTE: If no object an unassociated page is allocated, m->pindex * can be used by the caller for any purpose. */ - if (object) - vm_page_insert(m, object, pindex); - else + if (object) { + if (vm_page_insert(m, object, pindex) == FALSE) { + kprintf("PAGE RACE (%p:%d,%ld)\n", + object, object->type, pindex); + vm_page_free(m); + m = NULL; + if ((page_req & VM_ALLOC_NULL_OK) == 0) + panic("PAGE RACE"); + } + } else { m->pindex = pindex; + } /* * Don't wakeup too often - wakeup the pageout daemon when @@ -2142,24 +2160,27 @@ vm_page_io_finish(vm_page_t m) /* * Grab a page, blocking if it is busy and allocating a page if necessary. - * A busy page is returned or NULL. + * A busy page is returned or NULL. The page may or may not be valid and + * might not be on a queue (the caller is responsible for the disposition of + * the page). + * + * If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the + * page will be zero'd and marked valid. * - * The page is not removed from its queues. XXX? + * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked + * valid even if it already exists. * - * If VM_ALLOC_RETRY is specified VM_ALLOC_NORMAL must also be specified. - * If VM_ALLOC_RETRY is not specified + * If VM_ALLOC_RETRY is specified this routine will never return NULL. Also + * note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified. * * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is * always returned if we had blocked. - * This routine will never return NULL if VM_ALLOC_RETRY is set. + * * This routine may not be called from an interrupt. - * The returned page may not be entirely valid. * - * This routine may be called from mainline code without spl protection and - * be guarenteed a busied page associated with the object at the specified - * index. + * PG_ZERO is *ALWAYS* cleared by this routine. * - * No requirements. + * No other requirements. 
*/ vm_page_t vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) @@ -2178,6 +2199,7 @@ vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) m = NULL; break; } + /* retry */ } else if (m == NULL) { m = vm_page_alloc(object, pindex, allocflags & ~VM_ALLOC_RETRY); @@ -2185,12 +2207,31 @@ vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags) break; vm_wait(0); if ((allocflags & VM_ALLOC_RETRY) == 0) - break; + goto failed; } else { /* m found */ break; } } + + /* + * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid. + * + * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set + * valid even if already valid. + */ + if (m->valid == 0) { + if (allocflags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) { + if ((m->flags & PG_ZERO) == 0) + pmap_zero_page(VM_PAGE_TO_PHYS(m)); + m->valid = VM_PAGE_BITS_ALL; + } + } else if (allocflags & VM_ALLOC_FORCE_ZERO) { + pmap_zero_page(VM_PAGE_TO_PHYS(m)); + m->valid = VM_PAGE_BITS_ALL; + } + vm_page_flag_clear(m, PG_ZERO); +failed: vm_object_drop(object); return(m); } diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 1b894ec151..56f905cf7b 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -444,6 +444,8 @@ vm_page_flash(vm_page_t m) #define VM_ALLOC_INTERRUPT 0x04 /* ok to exhaust entire free list */ #define VM_ALLOC_ZERO 0x08 /* req pre-zero'd memory if avail */ #define VM_ALLOC_QUICK 0x10 /* like NORMAL but do not use cache */ +#define VM_ALLOC_FORCE_ZERO 0x20 /* zero page even if already valid */ +#define VM_ALLOC_NULL_OK 0x40 /* ok to return NULL on collision */ #define VM_ALLOC_RETRY 0x80 /* indefinite block (vm_page_grab()) */ void vm_page_queue_spin_lock(vm_page_t); @@ -468,7 +470,7 @@ int vm_page_try_to_free (vm_page_t); void vm_page_dontneed (vm_page_t); void vm_page_deactivate (vm_page_t); void vm_page_deactivate_locked (vm_page_t); -void vm_page_insert (vm_page_t, struct vm_object *, vm_pindex_t); +int vm_page_insert (vm_page_t, struct vm_object *, vm_pindex_t); vm_page_t vm_page_lookup (struct vm_object *, vm_pindex_t); vm_page_t VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *, vm_pindex_t, int, const char * VM_PAGE_DEBUG_ARGS); @@ -580,10 +582,10 @@ vm_page_copy(vm_page_t src_m, vm_page_t dest_m) /* * Free a page. The page must be marked BUSY. * - * The clearing of PG_ZERO is a temporary safety until the code can be - * reviewed to determine that PG_ZERO is being properly cleared on - * write faults or maps. PG_ZERO was previously cleared in - * vm_page_alloc(). + * Always clear PG_ZERO when freeing a page, which ensures the flag is not + * set unless we are absolutely certain the page is zerod. This is + * particularly important when the vm_page_alloc*() code moves pages from + * PQ_CACHE to PQ_FREE. */ static __inline void vm_page_free(vm_page_t m) @@ -593,7 +595,8 @@ vm_page_free(vm_page_t m) } /* - * Free a page to the zerod-pages queue + * Free a page to the zerod-pages queue. The caller must ensure that the + * page has been zerod. */ static __inline void vm_page_free_zero(vm_page_t m) diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index cd8b0cd9c5..28cad92064 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -845,6 +845,8 @@ vm_pageout_scan(int pass) vm_page_and_queue_spin_unlock(m); KKASSERT(m->queue == PQ_INACTIVE); + lwkt_yield(); + /* * The page has been successfully busied and is now no * longer spinlocked. 
The queue is no longer spinlocked @@ -1220,6 +1222,7 @@ vm_pageout_scan(int pass) continue; } vm_page_and_queue_spin_unlock(m); + lwkt_yield(); /* * The page has been successfully busied and the page and @@ -1370,6 +1373,7 @@ vm_pageout_scan(int pass) } vm_page_spin_unlock(m); pagedaemon_wakeup(); + lwkt_yield(); /* * Page has been successfully busied and it and its queue @@ -1511,6 +1515,7 @@ vm_pageout_scan_callback(struct proc *p, void *data) info->bigproc = p; info->bigsize = size; } + lwkt_yield(); return(0); } diff --git a/sys/vm/vm_swapcache.c b/sys/vm/vm_swapcache.c index ee7bfc8d74..c52d0e8f7d 100644 --- a/sys/vm/vm_swapcache.c +++ b/sys/vm/vm_swapcache.c @@ -621,9 +621,9 @@ vm_swapcache_cleaning(vm_object_t marker) lwkt_gettoken(&vmobj_token); while ((object = TAILQ_NEXT(object, object_list)) != NULL) { + lwkt_yield(); if (--count <= 0) break; - vm_object_hold(object); /* -- 2.41.0