kernel - More many-cores SMP work
authorMatthew Dillon <dillon@apollo.backplane.com>
Fri, 28 Oct 2011 06:50:51 +0000 (23:50 -0700)
committerMatthew Dillon <dillon@apollo.backplane.com>
Fri, 28 Oct 2011 06:50:51 +0000 (23:50 -0700)
* Add lwkt_yield() calls in a few critical places which can hog the cpu
  on large many-core boxes during periods of very heavy contention.  This
  allows other kernel threads on the same cpu to run and reduces symptoms
  of e.g. high ping times under certain load conditions.

* Run the callout kernel threads at the same priority as other kernel
  threads so cpu-hogging operations run from callouts can yield to
  other kernel threads (e.g. yield to the netisr threads).

* Change the vm_page_alloc() API to catch situations where the allocation
  races an insertion due to potentially blocking when dealing with
  PQ_CACHE pages.  VM_ALLOC_NULL_OK allows vm_page_alloc() to return NULL
  in this case (otherwise it will panic).

* Change vm_page_insert() to return TRUE if the insertion succeeded and
  FALSE if it didn't due to a race against another thread.

* Change the meaning of the cpuid argument to lwkt_alloc_thread() and
  lwkt_create().  A cpuid of -1 will cause the kernel to choose a cpu
  to run the thread on (instead of choosing the current cpu).

  Eventually this specification will allow dynamic migration (but not at
  the moment).

  Adjust lwp_fork() to specify the current cpu, required for initial
  LWKT calls when setting the forked thread up.

  Numerous kernel threads will now be spread around available cpus for
  now.  devfs core threads, NFS socket threads, etc.

  Interrupt threads are still fixed on cpu 0 awaiting additional work from
  Sephe.

  Put the emergency interrupt thread on the last cpu.

* Change the vm_page_grab() API.  When VM_ALLOC_ZERO is specified the
  vm_page_grab() code will automatically set an invalid page valid and
  zero it (using the PG_ZERO optimization if possible).  Pages which are
  already valid are not zero'd.

  This simplifies several use cases.

* Change vm_fault_page() to enter the page into the pmap while the vm_map
  is still locked, instead of after unlocking it.  For now anyhow.

* Minor change to ensure that a deterministic value is stored in *freebuf
  in vn_fullpath().

* Minor debugging features added to help track down an x86-64 seg-fault
  issue.

29 files changed:
sys/dev/agp/agp.c
sys/dev/agp/agp_i810.c
sys/kern/kern_fork.c
sys/kern/kern_intr.c
sys/kern/kern_kthread.c
sys/kern/kern_synch.c
sys/kern/kern_timeout.c
sys/kern/lwkt_thread.c
sys/kern/subr_disk.c
sys/kern/uipc_syscalls.c
sys/kern/vfs_cache.c
sys/platform/pc32/i386/pmap.c
sys/platform/pc64/x86_64/trap.c
sys/platform/vkernel/platform/pmap.c
sys/platform/vkernel64/platform/pmap.c
sys/sys/kthread.h
sys/vfs/devfs/devfs_core.c
sys/vfs/nfs/nfs_vfsops.c
sys/vm/device_pager.c
sys/vm/swap_pager.c
sys/vm/vm_contig.c
sys/vm/vm_fault.c
sys/vm/vm_kern.c
sys/vm/vm_map.c
sys/vm/vm_object.c
sys/vm/vm_page.c
sys/vm/vm_page.h
sys/vm/vm_pageout.c
sys/vm/vm_swapcache.c

index 77aec71..c4074cd 100644 (file)
@@ -533,16 +533,15 @@ agp_generic_bind_memory(device_t dev, struct agp_memory *mem,
         */
        for (i = 0; i < mem->am_size; i += PAGE_SIZE) {
                /*
-                * Find a page from the object and wire it
-                * down. This page will be mapped using one or more
-                * entries in the GATT (assuming that PAGE_SIZE >=
-                * AGP_PAGE_SIZE. If this is the first call to bind,
-                * the pages will be allocated and zeroed.
+                * Find a page from the object and wire it down. This page
+                * will be mapped using one or more entries in the GATT
+                * (assuming that PAGE_SIZE >= AGP_PAGE_SIZE. If this is
+                * the first call to bind, the pages will be allocated
+                * and zeroed.
                 */
                m = vm_page_grab(mem->am_obj, OFF_TO_IDX(i),
-                        VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
-               if ((m->flags & PG_ZERO) == 0)
-                       vm_page_zero_fill(m);
+                                VM_ALLOC_NORMAL | VM_ALLOC_ZERO |
+                                VM_ALLOC_RETRY);
                AGP_DPF("found page pa=%#x\n", VM_PAGE_TO_PHYS(m));
                vm_page_wire(m);
 
index 7e457da..9c9aa16 100644 (file)
@@ -969,10 +969,9 @@ agp_i810_alloc_memory(device_t dev, int type, vm_size_t size)
                         */
                        vm_page_t m;
        
-                       m = vm_page_grab(mem->am_obj, 0, 
-                                        VM_ALLOC_NORMAL|VM_ALLOC_ZERO|VM_ALLOC_RETRY);
-                       if ((m->flags & PG_ZERO) == 0)
-                               vm_page_zero_fill(m);
+                       m = vm_page_grab(mem->am_obj, 0, VM_ALLOC_NORMAL |
+                                                        VM_ALLOC_ZERO |
+                                                        VM_ALLOC_RETRY);
                        vm_page_wire(m);
                        mem->am_physical = VM_PAGE_TO_PHYS(m);
                        vm_page_wakeup(m);
index 4a8a16c..cc0edff 100644 (file)
@@ -607,6 +607,7 @@ done:
 static struct lwp *
 lwp_fork(struct lwp *origlp, struct proc *destproc, int flags)
 {
+       globaldata_t gd = mycpu;
        struct lwp *lp;
        struct thread *td;
 
@@ -627,13 +628,16 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags)
         * scheduler specific data.
         */
        crit_enter();
-       lp->lwp_cpbase = mycpu->gd_schedclock.time -
-                       mycpu->gd_schedclock.periodic;
+       lp->lwp_cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
        destproc->p_usched->heuristic_forking(origlp, lp);
        crit_exit();
        lp->lwp_cpumask &= usched_mastermask;
 
-       td = lwkt_alloc_thread(NULL, LWKT_THREAD_STACK, -1, 0);
+       /*
+        * Assign the thread to the current cpu to begin with so we
+        * can manipulate it.
+        */
+       td = lwkt_alloc_thread(NULL, LWKT_THREAD_STACK, gd->gd_cpuid, 0);
        lp->lwp_thread = td;
        td->td_proc = destproc;
        td->td_lwp = lp;
@@ -661,7 +665,6 @@ lwp_fork(struct lwp *origlp, struct proc *destproc, int flags)
        destproc->p_lasttid = lp->lwp_tid;
        destproc->p_nthreads++;
 
-
        return (lp);
 }
 
index 8a1383f..101bc88 100644 (file)
@@ -254,7 +254,7 @@ register_int(int intr, inthand2_t *handler, void *arg, const char *name,
      */
     if (emergency_intr_thread.td_kstack == NULL) {
        lwkt_create(ithread_emergency, NULL, NULL, &emergency_intr_thread,
-                   TDF_STOPREQ | TDF_INTTHREAD, -1, "ithread emerg");
+                   TDF_STOPREQ | TDF_INTTHREAD, ncpus - 1, "ithread emerg");
        systimer_init_periodic_nq(&emergency_intr_timer,
                    emergency_intr_timer_callback, &emergency_intr_thread, 
                    (emergency_intr_enable ? emergency_intr_freq : 1));
@@ -265,11 +265,13 @@ register_int(int intr, inthand2_t *handler, void *arg, const char *name,
     /*
      * Create an interrupt thread if necessary, leave it in an unscheduled
      * state.
+     *
+     * Put it on cpu 0 for now, other work is pending related to this.
      */
     if (info->i_state == ISTATE_NOTHREAD) {
        info->i_state = ISTATE_NORMAL;
        lwkt_create(ithread_handler, (void *)(intptr_t)intr, NULL,
-                   &info->i_thread, TDF_STOPREQ | TDF_INTTHREAD, -1,
+                   &info->i_thread, TDF_STOPREQ | TDF_INTTHREAD, 0,
                    "ithread %d", intr);
        if (intr >= FIRST_SOFTINT)
            lwkt_setpri(&info->i_thread, TDPRI_SOFT_NORM);
index 3f731b6..a3b6e6c 100644 (file)
@@ -104,6 +104,7 @@ kthread_create_cpu(void (*func)(void *), void *arg,
     return 0;
 }
 
+#if 0
 /*
  * Same as kthread_create() but you can specify a custom stack size.
  */
@@ -126,6 +127,7 @@ kthread_create_stk(void (*func)(void *), void *arg,
     lwkt_schedule(td);
     return 0;
 }
+#endif
 
 /*
  * Destroy an LWKT thread.   Warning!  This function is not called when
index 90a1d06..30e4c07 100644 (file)
@@ -233,6 +233,7 @@ schedcpu_stats(struct proc *p, void *data __unused)
                }
        }
        lwkt_reltoken(&p->p_token);
+       lwkt_yield();
        PRELE(p);
        return(0);
 }
@@ -289,6 +290,7 @@ schedcpu_resource(struct proc *p, void *data __unused)
                break;
        }
        lwkt_reltoken(&p->p_token);
+       lwkt_yield();
        PRELE(p);
        return(0);
 }
@@ -1216,6 +1218,7 @@ loadav_count_runnable(struct lwp *lp, void *data)
        default:
                break;
        }
+       lwkt_yield();
        return(0);
 }
 
index 33379da..5ca8f2d 100644 (file)
@@ -254,7 +254,11 @@ softclock_handler(void *arg)
        int mpsafe = 1;
 #endif
 
-       lwkt_setpri_self(TDPRI_SOFT_NORM);
+       /*
+        * Run the callout thread at the same priority as other kernel
+        * threads so it can be round-robined.
+        */
+       /*lwkt_setpri_self(TDPRI_SOFT_NORM);*/
 
        sc = arg;
        crit_enter();
index 69251f2..e3dd770 100644 (file)
@@ -377,6 +377,7 @@ lwkt_gdinit(struct globaldata *gd)
 thread_t
 lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
 {
+    static int cpu_rotator;
     globaldata_t gd = mycpu;
     void *stack;
 
@@ -416,10 +417,12 @@ lwkt_alloc_thread(struct thread *td, int stksize, int cpu, int flags)
        stack = (void *)kmem_alloc_stack(&kernel_map, stksize);
        flags |= TDF_ALLOCATED_STACK;
     }
-    if (cpu < 0)
-       lwkt_init_thread(td, stack, stksize, flags, gd);
-    else
-       lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
+    if (cpu < 0) {
+       cpu = ++cpu_rotator;
+       cpu_ccfence();
+       cpu %= ncpus;
+    }
+    lwkt_init_thread(td, stack, stksize, flags, globaldata_find(cpu));
     return(td);
 }
 
@@ -1181,11 +1184,12 @@ lwkt_passive_release(struct thread *td)
 
 
 /*
- * This implements a normal yield.  This routine is virtually a nop if
- * there is nothing to yield to but it will always run any pending interrupts
- * if called from a critical section.
+ * This implements a LWKT yield, allowing a kernel thread to yield to other
+ * kernel threads at the same or higher priority.  This function can be
+ * called in a tight loop and will typically only yield once per tick.
  *
- * This yield is designed for kernel threads without a user context.
+ * Most kernel threads run at the same priority in order to allow equal
+ * sharing.
  *
  * (self contained on a per cpu basis)
  */
@@ -1450,11 +1454,11 @@ lwkt_deschedule(thread_t td)
 void
 lwkt_setpri(thread_t td, int pri)
 {
-    KKASSERT(td->td_gd == mycpu);
     if (td->td_pri != pri) {
        KKASSERT(pri >= 0);
        crit_enter();
        if (td->td_flags & TDF_RUNQ) {
+           KKASSERT(td->td_gd == mycpu);
            _lwkt_dequeue(td);
            td->td_pri = pri;
            _lwkt_enqueue(td);
@@ -1640,9 +1644,9 @@ lwkt_preempted_proc(void)
  * Create a kernel process/thread/whatever.  It shares it's address space
  * with proc0 - ie: kernel only.
  *
- * NOTE!  By default new threads are created with the MP lock held.  A 
- * thread which does not require the MP lock should release it by calling
- * rel_mplock() at the start of the new thread.
+ * If the cpu is not specified one will be selected.  In the future
+ * specifying a cpu of -1 will enable kernel thread migration between
+ * cpus.
  */
 int
 lwkt_create(void (*func)(void *), void *arg, struct thread **tdp,
index ea6c6a3..16dfc1d 100644 (file)
@@ -1395,7 +1395,7 @@ disk_init(void)
 
        lwkt_gettoken(&disklist_token);
        lwkt_create(disk_msg_core, /*args*/NULL, &td_core, NULL,
-                   0, 0, "disk_msg_core");
+                   0, -1, "disk_msg_core");
        tsleep(td_core, 0, "diskcore", 0);
        lwkt_reltoken(&disklist_token);
 }
index cae66a2..561233b 100644 (file)
@@ -1611,7 +1611,8 @@ retry_lookup:
                        goto retry_lookup;
                }
                if (pg == NULL) {
-                       pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
+                       pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL |
+                                                       VM_ALLOC_NULL_OK);
                        if (pg == NULL) {
                                vm_wait(0);
                                vm_object_drop(obj);
index 8dd32b3..5b92fe5 100644 (file)
@@ -3312,6 +3312,7 @@ vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, char **freebuf,
        struct nchandle nch;
        int error;
 
+       *freebuf = NULL;
        atomic_add_int(&numfullpathcalls, 1);
        if (disablefullpath)
                return (ENODEV);
index 45e3a85..a06fc78 100644 (file)
@@ -1197,21 +1197,15 @@ pmap_pinit(struct pmap *pmap)
         */
        if ((ptdpg = pmap->pm_pdirm) == NULL) {
                ptdpg = vm_page_grab(pmap->pm_pteobj, PTDPTDI,
-                                    VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+                                    VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
+                                    VM_ALLOC_ZERO);
                pmap->pm_pdirm = ptdpg;
                vm_page_flag_clear(ptdpg, PG_MAPPED);
                vm_page_wire(ptdpg);
-               ptdpg->valid = VM_PAGE_BITS_ALL;
+               KKASSERT(ptdpg->valid == VM_PAGE_BITS_ALL);
                pmap_kenter((vm_offset_t)pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
                vm_page_wakeup(ptdpg);
        }
-       if ((ptdpg->flags & PG_ZERO) == 0)
-               bzero(pmap->pm_pdir, PAGE_SIZE);
-#ifdef PMAP_DEBUG
-       else
-               pmap_page_assertzero(VM_PAGE_TO_PHYS(ptdpg));
-#endif
-
        pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
 
        /* install self-referential address mapping entry */
@@ -1357,10 +1351,11 @@ _pmap_allocpte(pmap_t pmap, unsigned ptepindex)
        vm_page_t m;
 
        /*
-        * Find or fabricate a new pagetable page
+        * Find or fabricate a new pagetable page.  Setting VM_ALLOC_ZERO
+        * will zero any new page and mark it valid.
         */
        m = vm_page_grab(pmap->pm_pteobj, ptepindex,
-                       VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
+                        VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
 
        KASSERT(m->queue == PQ_NONE,
                ("_pmap_allocpte: %p->queue != PQ_NONE", m));
@@ -1405,27 +1400,6 @@ _pmap_allocpte(pmap_t pmap, unsigned ptepindex)
         * Set the page table hint
         */
        pmap->pm_ptphint = m;
-
-       /*
-        * Try to use the new mapping, but if we cannot, then
-        * do it with the routine that maps the page explicitly.
-        */
-       if (m->valid == 0) {
-               if ((m->flags & PG_ZERO) == 0) {
-                       if ((((unsigned)pmap->pm_pdir[PTDPTDI]) & PG_FRAME) ==
-                               (((unsigned) PTDpde) & PG_FRAME)) {
-                               pteva = UPT_MIN_ADDRESS + i386_ptob(ptepindex);
-                               bzero((caddr_t) pteva, PAGE_SIZE);
-                       } else {
-                               pmap_zero_page(ptepa);
-                       }
-               }
-               m->valid = VM_PAGE_BITS_ALL;
-               vm_page_flag_clear(m, PG_ZERO);
-       } else {
-               KKASSERT((m->flags & PG_ZERO) == 0);
-       }
-
        vm_page_flag_set(m, PG_MAPPED);
        vm_page_wakeup(m);
 
index a0e0b87..863005d 100644 (file)
@@ -79,6 +79,7 @@
 #include <machine/pcb.h>
 #include <machine/smp.h>
 #include <machine/thread.h>
+#include <machine/clock.h>
 #include <machine/vmparam.h>
 #include <machine/md_var.h>
 #include <machine_base/isa/isa_intr.h>
@@ -151,6 +152,9 @@ SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
 static int ddb_on_seg_fault = 0;
 SYSCTL_INT(_machdep, OID_AUTO, ddb_on_seg_fault, CTLFLAG_RW,
        &ddb_on_seg_fault, 0, "Go to DDB on user seg-fault");
+static int freeze_on_seg_fault = 0;
+SYSCTL_INT(_machdep, OID_AUTO, freeze_on_seg_fault, CTLFLAG_RW,
+       &freeze_on_seg_fault, 0, "Go to DDB on user seg-fault");
 #endif
 static int panic_on_nmi = 1;
 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
@@ -163,6 +167,15 @@ SYSCTL_INT(_machdep, OID_AUTO, slow_release, CTLFLAG_RW,
        &slow_release, 0, "Passive Release was nonoptimal");
 
 /*
+ * System call debugging records the worst-case system call
+ * overhead (inclusive of blocking), but may be inaccurate.
+ */
+/*#define SYSCALL_DEBUG*/
+#ifdef SYSCALL_DEBUG
+uint64_t SysCallsWorstCase[SYS_MAXSYSCALL];
+#endif
+
+/*
  * Passively intercepts the thread switch function to increase
  * the thread priority from a user priority to a kernel priority, reducing
  * syscall and trap overhead for the case where no switch occurs.
@@ -490,8 +503,12 @@ trap(struct trapframe *frame)
 
                case T_PAGEFLT:         /* page fault */
                        i = trap_pfault(frame, TRUE);
-                       if (frame->tf_rip == 0)
+                       if (frame->tf_rip == 0) {
                                kprintf("T_PAGEFLT: Warning %%rip == 0!\n");
+                               while (freeze_on_seg_fault) {
+                                       tsleep(p, 0, "freeze", hz * 20);
+                               }
+                       }
                        if (i == -1)
                                goto out;
                        if (i == 0)
@@ -883,14 +900,18 @@ nogo:
         */
        p = td->td_proc;
        if (td->td_lwp->lwp_vkernel == NULL) {
-               if (bootverbose)
+               if (bootverbose || freeze_on_seg_fault || ddb_on_seg_fault) {
                        kprintf("seg-fault ft=%04x ff=%04x addr=%p rip=%p "
                            "pid=%d p_comm=%s\n",
                            ftype, fault_flags,
                            (void *)frame->tf_addr,
                            (void *)frame->tf_rip,
                            p->p_pid, p->p_comm);
+               }
 #ifdef DDB
+               while (freeze_on_seg_fault) {
+                       tsleep(p, 0, "freeze", hz * 20);
+               }
                if (ddb_on_seg_fault)
                        Debugger("ddb_on_seg_fault");
 #endif
@@ -1185,7 +1206,16 @@ syscall2(struct trapframe *frame)
         * NOTE: All system calls run MPSAFE now.  The system call itself
         *       is responsible for getting the MP lock.
         */
+#ifdef SYSCALL_DEBUG
+       uint64_t tscval = rdtsc();
+#endif
        error = (*callp->sy_call)(&args);
+#ifdef SYSCALL_DEBUG
+       tscval = rdtsc() - tscval;
+       tscval = tscval * 1000000 / tsc_frequency;
+       if (SysCallsWorstCase[code] < tscval)
+               SysCallsWorstCase[code] = tscval;
+#endif
 
 out:
        /*
index 1f19d6e..8bfe86c 100644 (file)
@@ -220,21 +220,17 @@ pmap_pinit(struct pmap *pmap)
         * allocate the page directory page
         */
        ptdpg = vm_page_grab(pmap->pm_pteobj, pmap->pm_pdindex,
-                            VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+                            VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO);
 
        ptdpg->wire_count = 1;
        atomic_add_int(&vmstats.v_wire_count, 1);
 
        /* not usually mapped */
-       ptdpg->valid = VM_PAGE_BITS_ALL;
        vm_page_flag_clear(ptdpg, PG_MAPPED);
        vm_page_wakeup(ptdpg);
 
        pmap_kenter((vm_offset_t)pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
        pmap->pm_pdirpte = KernelPTA[(vm_offset_t)pmap->pm_pdir >> PAGE_SHIFT];
-       if ((ptdpg->flags & PG_ZERO) == 0)
-               bzero(pmap->pm_pdir, PAGE_SIZE);
-       vm_page_flag_clear(ptdpg, PG_ZERO);
 
        pmap->pm_count = 1;
        pmap->pm_active = 0;
@@ -1146,15 +1142,6 @@ _pmap_allocpte(pmap_t pmap, unsigned ptepindex)
         */
        m = vm_page_grab(pmap->pm_pteobj, ptepindex,
                         VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
-
-       if (m->valid == 0) {
-               if ((m->flags & PG_ZERO) == 0)
-                       pmap_zero_page(VM_PAGE_TO_PHYS(m));
-               m->valid = VM_PAGE_BITS_ALL;
-               vm_page_flag_clear(m, PG_ZERO);
-       } else {
-               KKASSERT((m->flags & PG_ZERO) == 0);
-       }
        vm_page_flag_set(m, PG_MAPPED);
 
        KASSERT(m->queue == PQ_NONE,
index 4a7b41d..499b553 100644 (file)
@@ -1084,21 +1084,18 @@ pmap_pinit(struct pmap *pmap)
         * already be set appropriately.
         */
        if ((ptdpg = pmap->pm_pdirm) == NULL) {
-               ptdpg = vm_page_grab(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I,
-                                    VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+               ptdpg = vm_page_grab(pmap->pm_pteobj,
+                                    NUPDE + NUPDPE + PML4PML4I,
+                                    VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
+                                    VM_ALLOC_ZERO);
                pmap->pm_pdirm = ptdpg;
                vm_page_flag_clear(ptdpg, PG_MAPPED);
-               ptdpg->valid = VM_PAGE_BITS_ALL;
                if (ptdpg->wire_count == 0)
                        atomic_add_int(&vmstats.v_wire_count, 1);
                ptdpg->wire_count = 1;
                vm_page_wakeup(ptdpg);
                pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg));
        }
-       if ((ptdpg->flags & PG_ZERO) == 0)
-               bzero(pmap->pm_pml4, PAGE_SIZE);
-       vm_page_flag_clear(ptdpg, PG_ZERO);
-
        pmap->pm_count = 1;
        pmap->pm_active = 0;
        pmap->pm_ptphint = NULL;
@@ -1270,17 +1267,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
         * races by checking m->valid.
         */
        m = vm_page_grab(pmap->pm_pteobj, ptepindex,
-                       VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
-
-       if (m->valid == 0) {
-               if ((m->flags & PG_ZERO) == 0) {
-                       pmap_zero_page(VM_PAGE_TO_PHYS(m));
-               }
-               m->valid = VM_PAGE_BITS_ALL;
-               vm_page_flag_clear(m, PG_ZERO);
-       } else {
-               KKASSERT((m->flags & PG_ZERO) == 0);
-       }
+                        VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
 
        KASSERT(m->queue == PQ_NONE,
                ("_pmap_allocpte: %p->queue != PQ_NONE", m));
index 3690c15..3b6d6d9 100644 (file)
@@ -63,8 +63,6 @@ int   kthread_create (void (*)(void *), void *, struct thread **,
            const char *, ...) __printflike(4, 5);
 int    kthread_create_cpu (void (*)(void *), void *, struct thread **,
            int, const char *, ...) __printflike(5, 6);
-int    kthread_create_stk (void (*)(void *), void *, struct thread **,
-           int, const char *, ...) __printflike(5, 6);
 void   kthread_exit (void) __dead2;
 #endif /* _KERNEL */
 
index f5ddfe5..8fd9992 100644 (file)
@@ -2500,7 +2500,7 @@ devfs_init(void)
 
        lockmgr(&devfs_lock, LK_EXCLUSIVE);
        lwkt_create(devfs_msg_core, /*args*/NULL, &td_core, NULL,
-                   0, 0, "devfs_msg_core");
+                   0, -1, "devfs_msg_core");
        while (devfs_run == 0)
                lksleep(td_core, &devfs_lock, 0, "devfsc", 0);
        lockmgr(&devfs_lock, LK_RELEASE);
index 4ebf91d..bbdbdbb 100644 (file)
@@ -1157,8 +1157,8 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam,
                txcpu = 1;
                break;
        default:
-               rxcpu = 1;
-               txcpu = 2;
+               rxcpu = -1;
+               txcpu = -1;
                break;
        }
 #else
index e3900bc..bb64a1a 100644 (file)
@@ -214,7 +214,10 @@ dev_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
                                  page, pageq);
                vm_object_hold(object);
                vm_page_free(*mpp);
-               vm_page_insert(page, object, offset);
+               if (vm_page_insert(page, object, offset) == FALSE) {
+                       panic("dev_pager_getpage: page (%p,%ld) exists",
+                             object, offset);
+               }
                vm_object_drop(object);
        }
        mtx_unlock(&dev_pager_mtx);
index 7d02dd2..da6e7f7 100644 (file)
@@ -640,6 +640,7 @@ swap_pager_condfree_callback(struct swblock *swap, void *data)
        --info->endi;
        if ((int)info->begi < 0 || (int)info->endi < 0)
                return(-1);
+       lwkt_yield();
        return(0);
 }
 
@@ -1225,6 +1226,10 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
                        vm_object_drop(object);
                        return(VM_PAGER_OK);
                } else if (m == NULL) {
+                       /*
+                        * Use VM_ALLOC_QUICK to avoid blocking on cache
+                        * page reuse.
+                        */
                        m = vm_page_alloc(object, mreq->pindex + 1,
                                          VM_ALLOC_QUICK);
                        if (m == NULL) {
@@ -1273,6 +1278,10 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
                if (error) {
                        break;
                } else if (m == NULL) {
+                       /*
+                        * Use VM_ALLOC_QUICK to avoid blocking on cache
+                        * page reuse.
+                        */
                        m = vm_page_alloc(object, mreq->pindex + i,
                                          VM_ALLOC_QUICK);
                        if (m == NULL)
index 313f088..811abc6 100644 (file)
@@ -489,7 +489,11 @@ vm_contig_pg_kmap(int start, u_long size, vm_map_t map, int flags)
        tmp_addr = addr;
        for (i = start; i < (start + size / PAGE_SIZE); i++) {
                vm_page_t m = &pga[i];
-               vm_page_insert(m, &kernel_object, OFF_TO_IDX(tmp_addr));
+               if (vm_page_insert(m, &kernel_object, OFF_TO_IDX(tmp_addr)) ==
+                   FALSE) {
+                       panic("vm_contig_pg_kmap: page already exists @%p",
+                             (void *)(intptr_t)tmp_addr);
+               }
                if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
                        pmap_zero_page(VM_PAGE_TO_PHYS(m));
                m->flags = 0;
index bc12340..1ff19aa 100644 (file)
@@ -657,8 +657,17 @@ RetryFault:
        }
 
        /*
+        * Update the pmap.  We really only have to do this if a COW
+        * occured to replace the read-only page with the new page.  For
+        * now just do it unconditionally. XXX
+        */
+       pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired);
+       vm_page_flag_set(fs.m, PG_REFERENCED);
+
+       /*
         * On success vm_fault_object() does not unlock or deallocate, and fs.m
-        * will contain a busied page.
+        * will contain a busied page.  So we must unlock here after having
+        * messed with the pmap.
         */
        unlock_things(&fs);
 
@@ -674,14 +683,6 @@ RetryFault:
                vm_page_dirty(fs.m);
 
        /*
-        * Update the pmap.  We really only have to do this if a COW
-        * occured to replace the read-only page with the new page.  For
-        * now just do it unconditionally. XXX
-        */
-       pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired);
-       vm_page_flag_set(fs.m, PG_REFERENCED);
-
-       /*
         * Unbusy the page by activating it.  It remains held and will not
         * be reclaimed.
         */
@@ -807,8 +808,8 @@ RetryFault:
        }
 
        /*
-        * On success vm_fault_object() does not unlock or deallocate, and fs.m
-        * will contain a busied page.
+        * On success vm_fault_object() does not unlock or deallocate, so we
+        * do it here.  Note that the returned fs.m will be busied.
         */
        unlock_things(&fs);
 
@@ -1023,6 +1024,9 @@ vm_fault_object(struct faultstate *fs,
 
        for (;;) {
                /*
+                * The entire backing chain from first_object to object
+                * inclusive is chainlocked.
+                *
                 * If the object is dead, we stop here
                 */
                if (fs->object->flags & OBJ_DEAD) {
@@ -1153,13 +1157,17 @@ vm_fault_object(struct faultstate *fs,
 
                        /*
                         * Allocate a new page for this object/offset pair.
+                        *
+                        * It is possible for the allocation to race, so
+                        * handle the case.
                         */
                        fs->m = NULL;
                        if (!vm_page_count_severe()) {
                                fs->m = vm_page_alloc(fs->object, pindex,
                                    ((fs->vp || fs->object->backing_object) ?
-                                       VM_ALLOC_NORMAL :
-                                       VM_ALLOC_NORMAL | VM_ALLOC_ZERO));
+                                       VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL :
+                                       VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL |
+                                       VM_ALLOC_ZERO));
                        }
                        if (fs->m == NULL) {
                                vm_object_pip_wakeup(fs->first_object);
@@ -1189,8 +1197,8 @@ readrest:
                 * pager has it, and potentially fault in additional pages
                 * at the same time.
                 *
-                * We are NOT in splvm here and if TRYPAGER is true then
-                * fs.m will be non-NULL and will be PG_BUSY for us.
+                * If TRYPAGER is true then fs.m will be non-NULL and busied
+                * for us.
                 */
                if (TRYPAGER(fs)) {
                        int rv;
@@ -1870,7 +1878,8 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
                 */
                do {
                        dst_m = vm_page_alloc(dst_object,
-                               OFF_TO_IDX(dst_offset), VM_ALLOC_NORMAL);
+                                             OFF_TO_IDX(dst_offset),
+                                             VM_ALLOC_NORMAL);
                        if (dst_m == NULL) {
                                vm_wait(0);
                        }
@@ -1999,7 +2008,8 @@ vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead,
 
                i = 0;
                while (tpindex < pindex) {
-                       rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM);
+                       rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM |
+                                                            VM_ALLOC_NULL_OK);
                        if (rtm == NULL) {
                                for (j = 0; j < i; j++) {
                                        vm_page_free(marray[j]);
@@ -2037,7 +2047,8 @@ vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead,
        while (tpindex < endpindex) {
                if (vm_page_lookup(object, tpindex))
                        break;
-               rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM);
+               rtm = vm_page_alloc(object, tpindex, VM_ALLOC_SYSTEM |
+                                                    VM_ALLOC_NULL_OK);
                if (rtm == NULL)
                        break;
                marray[i] = rtm;
@@ -2081,6 +2092,7 @@ vm_fault_additional_pages(vm_page_t m, int rbehind, int rahead,
  *       vm_map_entry via the normal fault code.  Do NOT call this
  *       shortcut unless the normal fault code has run on this entry.
  *
+ * The related map must be locked.
  * No other requirements.
  */
 static int vm_prefault_pages = 8;
@@ -2160,6 +2172,13 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot)
                int error;
 
                /*
+                * This can eat a lot of time on a heavily contended
+                * machine so yield on the tick if needed.
+                */
+               if ((i & 7) == 7)
+                       lwkt_yield();
+
+               /*
                 * Calculate the page to pre-fault, stopping the scan in
                 * each direction separately if the limit is reached.
                 */
@@ -2237,7 +2256,11 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot)
                                 * NOTE: Allocated from base object
                                 */
                                m = vm_page_alloc(object, index,
-                                                 VM_ALLOC_NORMAL | VM_ALLOC_ZERO);
+                                                 VM_ALLOC_NORMAL |
+                                                 VM_ALLOC_ZERO |
+                                                 VM_ALLOC_NULL_OK);
+                               if (m == NULL)
+                                       break;
 
                                if ((m->flags & PG_ZERO) == 0) {
                                        vm_page_zero_fill(m);
index a0e616c..5384b85 100644 (file)
@@ -223,11 +223,8 @@ kmem_alloc3(vm_map_t map, vm_size_t size, int kmflags)
                vm_page_t mem;
 
                mem = vm_page_grab(&kernel_object, OFF_TO_IDX(addr + i),
-                           VM_ALLOC_ZERO | VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
-               if ((mem->flags & PG_ZERO) == 0)
-                       vm_page_zero_fill(mem);
-               mem->valid = VM_PAGE_BITS_ALL;
-               vm_page_flag_clear(mem, PG_ZERO);
+                                  VM_ALLOC_FORCE_ZERO | VM_ALLOC_NORMAL |
+                                  VM_ALLOC_RETRY);
                vm_page_wakeup(mem);
        }
        vm_object_drop(&kernel_object);
index b1240c8..437de34 100644 (file)
@@ -3109,8 +3109,11 @@ vm_map_split(vm_map_entry_t entry)
  * Copies the contents of the source entry to the destination
  * entry.  The entries *must* be aligned properly.
  *
- * The vm_map must be exclusively locked.
+ * The vm_maps must be exclusively locked.
  * The vm_map's token must be held.
+ *
+ * Because the maps are locked no faults can be in progress during the
+ * operation.
  */
 static void
 vm_map_copy_entry(vm_map_t src_map, vm_map_t dst_map,
index 99cabb9..7487d6f 100644 (file)
@@ -431,14 +431,11 @@ vm_object_chain_release(vm_object_t object)
 }
 
 /*
- * This releases the entire chain starting with object and recursing
- * through backing_object until stopobj is encountered.  stopobj is
- * not released.  The caller will typically release stopobj manually
- * before making this call (as the deepest object is the most likely
- * to collide with other threads).
+ * This releases the entire chain of objects from first_object to and
+ * including stopobj, flowing through object->backing_object.
  *
- * object and stopobj must be held by the caller.  This code looks a
- * bit odd but has been optimized fairly heavily.
+ * We release stopobj first as an optimization as this object is most
+ * likely to be shared across multiple processes.
  */
 void
 vm_object_chain_release_all(vm_object_t first_object, vm_object_t stopobj)
@@ -868,6 +865,7 @@ vm_object_terminate_callback(vm_page_t p, void *data __unused)
                vm_page_remove(p);
                vm_page_wakeup(p);
        }
+       lwkt_yield();
        return(0);
 }
 
index 2e5c187..87e8e6c 100644 (file)
@@ -755,8 +755,11 @@ vm_page_unhold(vm_page_t m)
  * This routine may not block.
  * This routine must be called with the vm_object held.
  * This routine must be called with a critical section held.
+ *
+ * This routine returns TRUE if the page was inserted into the object
+ * successfully, and FALSE if the page already exists in the object.
  */
-void
+int
 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 {
        ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
@@ -764,7 +767,6 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
                panic("vm_page_insert: already inserted");
 
        object->generation++;
-       object->resident_page_count++;
 
        /*
         * Record the object/offset pair in this page and add the
@@ -775,7 +777,13 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
        vm_page_spin_lock(m);
        m->object = object;
        m->pindex = pindex;
-       vm_page_rb_tree_RB_INSERT(&object->rb_memq, m);
+       if (vm_page_rb_tree_RB_INSERT(&object->rb_memq, m)) {
+               m->object = NULL;
+               m->pindex = 0;
+               vm_page_spin_unlock(m);
+               return FALSE;
+       }
+       object->resident_page_count++;
        /* atomic_add_int(&object->agg_pv_list_count, m->md.pv_list_count); */
        vm_page_spin_unlock(m);
 
@@ -790,6 +798,7 @@ vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
         * Checks for a swap assignment and sets PG_SWAPPED if appropriate.
         */
        swap_pager_page_inserted(m);
+       return TRUE;
 }
 
 /*
@@ -990,7 +999,10 @@ vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
                ASSERT_LWKT_TOKEN_HELD(vm_object_token(m->object));
                vm_page_remove(m);
        }
-       vm_page_insert(m, new_object, new_pindex);
+       if (vm_page_insert(m, new_object, new_pindex) == FALSE) {
+               panic("vm_page_rename: target exists (%p,%ld)",
+                     new_object, new_pindex);
+       }
        if (m->queue - m->pc == PQ_CACHE)
                vm_page_deactivate(m);
        vm_page_dirty(m);
@@ -1236,17 +1248,20 @@ vm_page_select_free(u_short pg_color, boolean_t prefer_zero)
  * Allocate and return a memory cell associated with this VM object/offset
  * pair.  If object is NULL an unassociated page will be allocated.
  *
- *     page_req classes:
+ * The returned page will be busied and removed from its queues.  This
+ * routine can block and may return NULL if a race occurs and the page
+ * is found to already exist at the specified (object, pindex).
  *
  *     VM_ALLOC_NORMAL         allow use of cache pages, nominal free drain
  *     VM_ALLOC_QUICK          like normal but cannot use cache
  *     VM_ALLOC_SYSTEM         greater free drain
  *     VM_ALLOC_INTERRUPT      allow free list to be completely drained
- *     VM_ALLOC_ZERO           advisory request for pre-zero'd page
- *
- * The object must be locked if not NULL
+ *     VM_ALLOC_ZERO           advisory request for pre-zero'd page only
+ *     VM_ALLOC_FORCE_ZERO     advisory request for pre-zero'd page only
+ *     VM_ALLOC_NULL_OK        ok to return NULL on insertion collision
+ *                             (see vm_page_grab())
+ * The object must be held if not NULL
  * This routine may not block
- * The returned page will be marked PG_BUSY
  *
  * Additional special handling is required when called from an interrupt
  * (VM_ALLOC_INTERRUPT).  We are not allowed to mess with the page cache
@@ -1265,8 +1280,6 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
        if (object) {
                pg_color = mycpu->gd_cpuid + (pindex & ~ncpus_fit_mask) +
                           (object->pg_color & ~ncpus_fit_mask);
-               KASSERT(vm_page_lookup(object, pindex) == NULL,
-                       ("vm_page_alloc: page already allocated"));
        } else {
                pg_color = mycpu->gd_cpuid + (pindex & ~ncpus_fit_mask);
        }
@@ -1276,8 +1289,6 @@ vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
         */
        if (object) {
                pg_color = object->pg_color + pindex;
-               KASSERT(vm_page_lookup(object, pindex) == NULL,
-                       ("vm_page_alloc: page already allocated"));
        } else {
                pg_color = pindex;
        }
@@ -1302,7 +1313,7 @@ loop:
                /*
                 * The free queue has sufficient free pages to take one out.
                 */
-               if (page_req & VM_ALLOC_ZERO)
+               if (page_req & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO))
                        m = vm_page_select_free(pg_color, TRUE);
                else
                        m = vm_page_select_free(pg_color, FALSE);
@@ -1328,7 +1339,7 @@ loop:
                 */
                if (m != NULL) {
                        KASSERT(m->dirty == 0,
-                           ("Found dirty cache page %p", m));
+                               ("Found dirty cache page %p", m));
                        vm_page_protect(m, VM_PROT_NONE);
                        vm_page_free(m);
                        goto loop;
@@ -1354,26 +1365,25 @@ loop:
        }
 
        /*
-        * Good page found.  The page has already been busied for us.
-        *
         * v_free_count can race so loop if we don't find the expected
         * page.
         */
        if (m == NULL)
                goto loop;
-       KASSERT(m->dirty == 0, 
-               ("vm_page_alloc: free/cache page %p was dirty", m));
 
        /*
-        * NOTE: page has already been removed from its queue and busied.
+        * Good page found.  The page has already been busied for us and
+        * removed from its queues.
         */
+       KASSERT(m->dirty == 0,
+               ("vm_page_alloc: free/cache page %p was dirty", m));
        KKASSERT(m->queue == PQ_NONE);
 
        /*
-        * Initialize structure.  Only the PG_ZERO flag is inherited.  Set
-        * the page PG_BUSY
+        * Initialize the structure, inheriting some flags but clearing
+        * all the rest.  The page has already been busied for us.
         */
-       vm_page_flag_clear(m, ~(PG_ZERO | PG_BUSY));
+       vm_page_flag_clear(m, ~(PG_ZERO | PG_BUSY | PG_SBUSY));
        KKASSERT(m->wire_count == 0);
        KKASSERT(m->busy == 0);
        m->act_count = 0;
@@ -1389,10 +1399,18 @@ loop:
         * NOTE: If no object an unassociated page is allocated, m->pindex
         *       can be used by the caller for any purpose.
         */
-       if (object)
-               vm_page_insert(m, object, pindex);
-       else
+       if (object) {
+               if (vm_page_insert(m, object, pindex) == FALSE) {
+                       kprintf("PAGE RACE (%p:%d,%ld)\n",
+                               object, object->type, pindex);
+                       vm_page_free(m);
+                       m = NULL;
+                       if ((page_req & VM_ALLOC_NULL_OK) == 0)
+                               panic("PAGE RACE");
+               }
+       } else {
                m->pindex = pindex;
+       }
 
        /*
         * Don't wakeup too often - wakeup the pageout daemon when
@@ -2142,24 +2160,27 @@ vm_page_io_finish(vm_page_t m)
 
 /*
  * Grab a page, blocking if it is busy and allocating a page if necessary.
- * A busy page is returned or NULL.
+ * A busy page is returned or NULL.  The page may or may not be valid and
+ * might not be on a queue (the caller is responsible for the disposition of
+ * the page).
+ *
+ * If VM_ALLOC_ZERO is specified and the grab must allocate a new page, the
+ * page will be zero'd and marked valid.
  *
- * The page is not removed from its queues. XXX?
+ * If VM_ALLOC_FORCE_ZERO is specified the page will be zero'd and marked
+ * valid even if it already exists.
  *
- * If VM_ALLOC_RETRY is specified VM_ALLOC_NORMAL must also be specified.
- * If VM_ALLOC_RETRY is not specified
+ * If VM_ALLOC_RETRY is specified this routine will never return NULL.  Also
+ * note that VM_ALLOC_NORMAL must be specified if VM_ALLOC_RETRY is specified.
  *
  * This routine may block, but if VM_ALLOC_RETRY is not set then NULL is
  * always returned if we had blocked.  
- * This routine will never return NULL if VM_ALLOC_RETRY is set.
+ *
  * This routine may not be called from an interrupt.
- * The returned page may not be entirely valid.
  *
- * This routine may be called from mainline code without spl protection and
- * be guarenteed a busied page associated with the object at the specified
- * index.
+ * PG_ZERO is *ALWAYS* cleared by this routine.
  *
- * No requirements.
+ * No other requirements.
  */
 vm_page_t
 vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
@@ -2178,6 +2199,7 @@ vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
                                m = NULL;
                                break;
                        }
+                       /* retry */
                } else if (m == NULL) {
                        m = vm_page_alloc(object, pindex,
                                          allocflags & ~VM_ALLOC_RETRY);
@@ -2185,12 +2207,31 @@ vm_page_grab(vm_object_t object, vm_pindex_t pindex, int allocflags)
                                break;
                        vm_wait(0);
                        if ((allocflags & VM_ALLOC_RETRY) == 0)
-                               break;
+                               goto failed;
                } else {
                        /* m found */
                        break;
                }
        }
+
+       /*
+        * If VM_ALLOC_ZERO an invalid page will be zero'd and set valid.
+        *
+        * If VM_ALLOC_FORCE_ZERO the page is unconditionally zero'd and set
+        * valid even if already valid.
+        */
+       if (m->valid == 0) {
+               if (allocflags & (VM_ALLOC_ZERO | VM_ALLOC_FORCE_ZERO)) {
+                       if ((m->flags & PG_ZERO) == 0)
+                               pmap_zero_page(VM_PAGE_TO_PHYS(m));
+                       m->valid = VM_PAGE_BITS_ALL;
+               }
+       } else if (allocflags & VM_ALLOC_FORCE_ZERO) {
+               pmap_zero_page(VM_PAGE_TO_PHYS(m));
+               m->valid = VM_PAGE_BITS_ALL;
+       }
+       vm_page_flag_clear(m, PG_ZERO);
+failed:
        vm_object_drop(object);
        return(m);
 }
index 1b894ec..56f905c 100644 (file)
@@ -444,6 +444,8 @@ vm_page_flash(vm_page_t m)
 #define VM_ALLOC_INTERRUPT     0x04    /* ok to exhaust entire free list */
 #define        VM_ALLOC_ZERO           0x08    /* req pre-zero'd memory if avail */
 #define        VM_ALLOC_QUICK          0x10    /* like NORMAL but do not use cache */
+#define VM_ALLOC_FORCE_ZERO    0x20    /* zero page even if already valid */
+#define VM_ALLOC_NULL_OK       0x40    /* ok to return NULL on collision */
 #define        VM_ALLOC_RETRY          0x80    /* indefinite block (vm_page_grab()) */
 
 void vm_page_queue_spin_lock(vm_page_t);
@@ -468,7 +470,7 @@ int vm_page_try_to_free (vm_page_t);
 void vm_page_dontneed (vm_page_t);
 void vm_page_deactivate (vm_page_t);
 void vm_page_deactivate_locked (vm_page_t);
-void vm_page_insert (vm_page_t, struct vm_object *, vm_pindex_t);
+int vm_page_insert (vm_page_t, struct vm_object *, vm_pindex_t);
 vm_page_t vm_page_lookup (struct vm_object *, vm_pindex_t);
 vm_page_t VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *, vm_pindex_t,
                                int, const char * VM_PAGE_DEBUG_ARGS);
@@ -580,10 +582,10 @@ vm_page_copy(vm_page_t src_m, vm_page_t dest_m)
 /*
  * Free a page.  The page must be marked BUSY.
  *
- * The clearing of PG_ZERO is a temporary safety until the code can be
- * reviewed to determine that PG_ZERO is being properly cleared on
- * write faults or maps.  PG_ZERO was previously cleared in 
- * vm_page_alloc().
+ * Always clear PG_ZERO when freeing a page, which ensures the flag is not
+ * set unless we are absolutely certain the page is zerod.  This is
+ * particularly important when the vm_page_alloc*() code moves pages from
+ * PQ_CACHE to PQ_FREE.
  */
 static __inline void
 vm_page_free(vm_page_t m)
@@ -593,7 +595,8 @@ vm_page_free(vm_page_t m)
 }
 
 /*
- * Free a page to the zerod-pages queue
+ * Free a page to the zerod-pages queue.  The caller must ensure that the
+ * page has been zerod.
  */
 static __inline void
 vm_page_free_zero(vm_page_t m)
index cd8b0cd..28cad92 100644 (file)
@@ -845,6 +845,8 @@ vm_pageout_scan(int pass)
                vm_page_and_queue_spin_unlock(m);
                KKASSERT(m->queue == PQ_INACTIVE);
 
+               lwkt_yield();
+
                /*
                 * The page has been successfully busied and is now no
                 * longer spinlocked.  The queue is no longer spinlocked
@@ -1220,6 +1222,7 @@ vm_pageout_scan(int pass)
                        continue;
                }
                vm_page_and_queue_spin_unlock(m);
+               lwkt_yield();
 
                /*
                 * The page has been successfully busied and the page and
@@ -1370,6 +1373,7 @@ vm_pageout_scan(int pass)
                }
                vm_page_spin_unlock(m);
                pagedaemon_wakeup();
+               lwkt_yield();
 
                /*
                 * Page has been successfully busied and it and its queue
@@ -1511,6 +1515,7 @@ vm_pageout_scan_callback(struct proc *p, void *data)
                info->bigproc = p;
                info->bigsize = size;
        }
+       lwkt_yield();
        return(0);
 }
 
index ee7bfc8..c52d0e8 100644 (file)
@@ -621,9 +621,9 @@ vm_swapcache_cleaning(vm_object_t marker)
        lwkt_gettoken(&vmobj_token);
 
        while ((object = TAILQ_NEXT(object, object_list)) != NULL) {
+               lwkt_yield();
                if (--count <= 0)
                        break;
-
                vm_object_hold(object);
 
                /*