kernel - Numerous VM MPSAFE fixes
author Matthew Dillon <dillon@apollo.backplane.com>
Thu, 29 Sep 2011 21:46:42 +0000 (14:46 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Thu, 29 Sep 2011 21:46:42 +0000 (14:46 -0700)
* Remove most critical sections from the VM subsystem; they are no longer
  applicable (vm_token covers the access).
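
  Illustrative sketch only (not an exact hunk): the typical conversion in
  this commit replaces a crit_enter()/crit_exit() bracket with the VM
  token, e.g.:

        /* before: relied on a critical section for interrupt races */
        crit_enter();
        /* ... touch VM structures ... */
        crit_exit();

        /* after: serialize on the VM token instead */
        lwkt_gettoken(&vm_token);
        /* ... touch VM structures ... */
        lwkt_reltoken(&vm_token);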

* _pmap_allocpte() for x86-64 - Conditionalize the zeroing of the vm_page
  after the grab.  The grab can race other threads and result in a page
  which had already been zero'd AND populated with pte's, so we can't just
  zero it.

  Use m->valid to determine if the page is actually newly allocated or not.

  NOTE: The 32-bit code already properly zeros the page by detecting whether
  the pte has already been entered or not.  The 64-bit code couldn't do this
  neatly, so we used another method.
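
  A trimmed sketch of the new logic (the real code is in the pmap.c hunk
  below, including the PMAP_DEBUG assertions that are omitted here):

        m = vm_page_grab(pmap->pm_pteobj, ptepindex,
                         VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
        if (m->valid == 0) {
                /* newly allocated page, safe to zero if not pre-zeroed */
                if ((m->flags & PG_ZERO) == 0)
                        pmap_zero_page(VM_PAGE_TO_PHYS(m));
                m->valid = VM_PAGE_BITS_ALL;
                vm_page_flag_clear(m, PG_ZERO);
        }
        /* else the grab raced another thread; leave the contents alone */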

* Hold the pmap vm_object in pmap_release() and pmap_object_init_pt() for
  the x86-64 pmap code.  This prevents the related loops from blocking on
  the pmap vm_object when freeing VM pages, which the code does not expect.
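
  Sketch of the hold bracket, as it appears in the pmap_release() hunk
  below:

        vm_object_hold(object);
        lwkt_gettoken(&vm_token);
        /* ... scan and free the pte pages without blocking on the object ... */
        lwkt_reltoken(&vm_token);
        vm_object_drop(object);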

* pmap_copy() for x86-64 needs the vm_token; critical sections are no
  longer sufficient.

* Assert that PG_MANAGED is set when clearing pte's out of a pmap via the
  PV entries.  The pte's must exist in this case and it's a critical panic
  if they don't.

* pmap_replacevm() for x86-64 - Adjust newvm->vm_sysref prior to assigning
  it to p->p_vmspace to handle any potential MP races with other sysrefs
  on the vmspace.
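
  Trimmed sketch of the new ordering (the full pmap_replacevm() hunk is
  below):

        if (oldvm != newvm) {
                if (adjrefs)
                        sysref_get(&newvm->vm_sysref); /* ref before exposure */
                p->p_vmspace = newvm;
                lp = RB_ROOT(&p->p_lwp_tree);
                pmap_setlwpvm(lp, newvm);
                if (adjrefs)
                        sysref_put(&oldvm->vm_sysref);
        }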

* faultin() needs p->p_token, not proc_token.

* swapout_procs_callback() needs p->p_token.

* Deallocate the VM object associated with a vm_page after freeing the
  page instead of before.  This fixes a potential
  use-after-refs-transition-to-0 case if an MP race occurs.
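
  Schematic sketch of the ordering only; the exact call site is in the VM
  code touched below.  The object pointer has to be cached first, since
  m->object is no longer usable once the page is freed:

        object = m->object;             /* cache before the free */
        vm_page_free(m);                /* free the page first */
        vm_object_deallocate(object);   /* then drop the object ref */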

sys/platform/pc64/x86_64/pmap.c
sys/vm/device_pager.c
sys/vm/phys_pager.c
sys/vm/swap_pager.c
sys/vm/vm_contig.c
sys/vm/vm_glue.c
sys/vm/vm_map.c
sys/vm/vm_object.c
sys/vm/vm_pageout.c
sys/vm/vm_swapcache.c

index 3ab87c9..1f2a59f 100644
@@ -255,8 +255,6 @@ pmap_kmem_choose(vm_offset_t addr)
  *     This eliminates many course-grained invltlb calls.  Note that many of
  *     the pv list scans are across different pmaps and it is very wasteful
  *     to do an entire invltlb when checking a single mapping.
- *
- *     Should only be called while in a critical section.
  */
 static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);
 
@@ -1106,8 +1104,11 @@ pmap_qremove(vm_offset_t va, int count)
  * page is busy.  This routine does not busy the page it returns.
  *
  * Unless the caller is managing objects whos pages are in a known state,
- * the call should be made with a critical section held so the page's object
- * association remains valid on return.
+ * the call should be made with both the vm_token and the governing
+ * object's token held so the page's object association remains valid
+ * on return.
+ *
+ * This function can block!
  */
 static
 vm_page_t
@@ -1347,7 +1348,8 @@ pmap_pinit(struct pmap *pmap)
         * already be set appropriately.
         */
        if ((ptdpg = pmap->pm_pdirm) == NULL) {
-               ptdpg = vm_page_grab(pmap->pm_pteobj, NUPDE + NUPDPE + PML4PML4I,
+               ptdpg = vm_page_grab(pmap->pm_pteobj,
+                                    NUPDE + NUPDPE + PML4PML4I,
                                     VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
                pmap->pm_pdirm = ptdpg;
                vm_page_flag_clear(ptdpg, PG_MAPPED | PG_BUSY);
@@ -1423,12 +1425,10 @@ pmap_puninit(pmap_t pmap)
 void
 pmap_pinit2(struct pmap *pmap)
 {
-       crit_enter();
        lwkt_gettoken(&vm_token);
        TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
        /* XXX copies current process, does not fill in MPPTDI */
        lwkt_reltoken(&vm_token);
-       crit_exit();
 }
 
 /*
@@ -1560,12 +1560,30 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
         */
        m = vm_page_grab(pmap->pm_pteobj, ptepindex,
                         VM_ALLOC_NORMAL | VM_ALLOC_ZERO | VM_ALLOC_RETRY);
-       if ((m->flags & PG_ZERO) == 0) {
-               pmap_zero_page(VM_PAGE_TO_PHYS(m));
+
+       /*
+        * The grab may have blocked and raced another thread populating
+        * the same page table.  m->valid will be 0 on a newly allocated page
+        * so use this to determine if we have to zero it out or not.  We
+        * don't want to zero-out a raced page as this would desynchronize
+        * the pv_entry's for the related pte's and cause pmap_remove_all()
+        * to panic.
+        */
+       if (m->valid == 0) {
+               if ((m->flags & PG_ZERO) == 0) {
+                       pmap_zero_page(VM_PAGE_TO_PHYS(m));
+               }
+#ifdef PMAP_DEBUG
+               else {
+                       pmap_page_assertzero(VM_PAGE_TO_PHYS(m));
+               }
+#endif
+               m->valid = VM_PAGE_BITS_ALL;
+               vm_page_flag_clear(m, PG_ZERO);
        }
 #ifdef PMAP_DEBUG
        else {
-               pmap_page_assertzero(VM_PAGE_TO_PHYS(m));
+               KKASSERT((m->flags & PG_ZERO) == 0);
        }
 #endif
 
@@ -1579,8 +1597,6 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
        m->hold_count++;
        if (m->wire_count++ == 0)
                vmstats.v_wire_count++;
-       m->valid = VM_PAGE_BITS_ALL;
-       vm_page_flag_clear(m, PG_ZERO);
 
        /*
         * Map the pagetable page into the process address space, if
@@ -1803,13 +1819,11 @@ pmap_release(struct pmap *pmap)
        
        info.pmap = pmap;
        info.object = object;
-       crit_enter();
+       vm_object_hold(object);
        lwkt_gettoken(&vm_token);
        TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
-       crit_exit();
 
        do {
-               crit_enter();
                info.error = 0;
                info.mpte = NULL;
                info.limit = object->generation;
@@ -1820,9 +1834,9 @@ pmap_release(struct pmap *pmap)
                        if (!pmap_release_free_page(pmap, info.mpte))
                                info.error = 1;
                }
-               crit_exit();
        } while (info.error);
        lwkt_reltoken(&vm_token);
+       vm_object_drop(object);
 }
 
 static
@@ -1863,7 +1877,6 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
        pdp_entry_t newpdp;
        int update_kernel_vm_end;
 
-       crit_enter();
        lwkt_gettoken(&vm_token);
 
        /*
@@ -1969,7 +1982,6 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
                kernel_vm_end = kstart;
 
        lwkt_reltoken(&vm_token);
-       crit_exit();
 }
 
 /*
@@ -2074,10 +2086,13 @@ pmap_collect(void)
        
 
 /*
- * If it is the first entry on the list, it is actually
- * in the header and we must copy the following entry up
- * to the header.  Otherwise we must search the list for
- * the entry.  In either case we free the now unused entry.
+ * If it is the first entry on the list, it is actually in the header and
+ * we must copy the following entry up to the header.
+ *
+ * Otherwise we must search the list for the entry.  In either case we
+ * free the now unused entry.
+ *
+ * Caller must hold vm_token
  */
 static
 int
@@ -2087,7 +2102,6 @@ pmap_remove_entry(struct pmap *pmap, vm_page_t m,
        pv_entry_t pv;
        int rtval;
 
-       crit_enter();
        if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
                TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                        if (pmap == pv->pv_pmap && va == pv->pv_va) 
@@ -2114,13 +2128,13 @@ pmap_remove_entry(struct pmap *pmap, vm_page_t m,
        rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem, info);
        free_pv_entry(pv);
 
-       crit_exit();
        return rtval;
 }
 
 /*
- * Create a pv entry for page at pa for
- * (pmap, va).
+ * Create a pv entry for page at pa for (pmap, va).
+ *
+ * Caller must hold vm_token
  */
 static
 void
@@ -2128,7 +2142,6 @@ pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
 {
        pv_entry_t pv;
 
-       crit_enter();
        pv = get_pv_entry();
        pv->pv_va = va;
        pv->pv_pmap = pmap;
@@ -2139,12 +2152,12 @@ pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
        ++pmap->pm_generation;
        m->md.pv_list_count++;
        m->object->agg_pv_list_count++;
-
-       crit_exit();
 }
 
 /*
  * pmap_remove_pte: do the things to unmap a page in a process
+ *
+ * Caller must hold vm_token
  */
 static
 int
@@ -2192,12 +2205,12 @@ pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va,
 }
 
 /*
- * pmap_remove_page:
+ * Remove a single page from a process address space.
  *
- *     Remove a single page from a process address space.
+ * This function may not be called from an interrupt if the pmap is
+ * not kernel_pmap.
  *
- *     This function may not be called from an interrupt if the pmap is
- *     not kernel_pmap.
+ * Caller must hold vm_token
  */
 static
 void
@@ -2214,15 +2227,12 @@ pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info)
 }
 
 /*
- * pmap_remove:
- *
- *     Remove the given range of addresses from the specified map.
+ * Remove the given range of addresses from the specified map.
  *
- *     It is assumed that the start and end are properly
- *     rounded to the page size.
+ * It is assumed that the start and end are properly rounded to the page size.
  *
- *     This function may not be called from an interrupt if the pmap is
- *     not kernel_pmap.
+ * This function may not be called from an interrupt if the pmap is not
+ * kernel_pmap.
  */
 void
 pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
@@ -2350,7 +2360,6 @@ pmap_remove_all(vm_page_t m)
 
        lwkt_gettoken(&vm_token);
        pmap_inval_init(&info);
-       crit_enter();
        while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
                KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0);
                --pv->pv_pmap->pm_stats.resident_count;
@@ -2358,6 +2367,7 @@ pmap_remove_all(vm_page_t m)
                pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
                pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va);
                tpte = pte_load_clear(pte);
+               KKASSERT(tpte & PG_MANAGED);
                if (tpte & PG_W)
                        pv->pv_pmap->pm_stats.wired_count--;
                pmap_inval_deinterlock(&info, pv->pv_pmap);
@@ -2389,7 +2399,6 @@ pmap_remove_all(vm_page_t m)
                pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info);
                free_pv_entry(pv);
        }
-       crit_exit();
        KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
        pmap_inval_done(&info);
        lwkt_reltoken(&vm_token);
@@ -2892,8 +2901,8 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
         * Use a red-black scan to traverse the requested range and load
         * any valid pages found into the pmap.
         *
-        * We cannot safely scan the object's memq unless we are in a
-        * critical section since interrupts can remove pages from objects.
+        * We cannot safely scan the object's memq without holding the
+        * object token.
         */
        info.start_pindex = pindex;
        info.end_pindex = pindex + psize - 1;
@@ -2902,12 +2911,12 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
        info.addr = addr;
        info.pmap = pmap;
 
-       crit_enter();
+       vm_object_hold(object);
        lwkt_gettoken(&vm_token);
        vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
                                pmap_object_init_pt_callback, &info);
        lwkt_reltoken(&vm_token);
-       crit_exit();
+       vm_object_drop(object);
 }
 
 static
@@ -3010,11 +3019,10 @@ pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
 
 
 /*
- *     Copy the range specified by src_addr/len
- *     from the source map to the range dst_addr/len
- *     in the destination map.
+ * Copy the range specified by src_addr/len from the source map to
+ * the range dst_addr/len in the destination map.
  *
- *     This routine is only advisory and need not do anything.
+ * This routine is only advisory and need not do anything.
  */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, 
@@ -3049,11 +3057,10 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
        pmap_inval_add(&info, src_pmap, -1);
 
        /*
-        * critical section protection is required to maintain the page/object
-        * association, interrupts can free pages and remove them from 
-        * their objects.
+        * The vm_token is required to maintain the page/object
+        * associations.
         */
-       crit_enter();
+       lwkt_gettoken(&vm_token);
        for (addr = src_addr; addr < end_addr; addr = pdnxt) {
                pt_entry_t *src_pte, *dst_pte;
                vm_page_t dstmpte, srcmpte;
@@ -3158,7 +3165,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
                }
        }
 failed:
-       crit_exit();
+       lwkt_reltoken(&vm_token);
        pmap_inval_done(&info);
 #endif
 }      
@@ -3267,13 +3274,11 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
                return FALSE;
 
-       crit_enter();
        lwkt_gettoken(&vm_token);
 
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                if (pv->pv_pmap == pmap) {
                        lwkt_reltoken(&vm_token);
-                       crit_exit();
                        return TRUE;
                }
                loops++;
@@ -3281,7 +3286,6 @@ pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
                        break;
        }
        lwkt_reltoken(&vm_token);
-       crit_exit();
        return (FALSE);
 }
 
@@ -3336,6 +3340,7 @@ pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
                        continue;
                }
                tpte = pte_load_clear(pte);
+               KKASSERT(tpte & PG_MANAGED);
 
                m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 
@@ -3380,9 +3385,10 @@ pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 }
 
 /*
- * pmap_testbit tests bits in pte's
- * note that the testbit/clearbit routines are inline,
- * and a lot of things compile-time evaluate.
+ * pmap_testbit tests bits in pte's.  Note that the testbit/clearbit
+ * routines are inline, and a lot of things compile-time evaluate.
+ *
+ * Caller must hold vm_token
  */
 static
 boolean_t
@@ -3397,8 +3403,6 @@ pmap_testbit(vm_page_t m, int bit)
        if (TAILQ_FIRST(&m->md.pv_list) == NULL)
                return FALSE;
 
-       crit_enter();
-
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                /*
                 * if the bit being tested is the modified bit, then
@@ -3417,12 +3421,9 @@ pmap_testbit(vm_page_t m, int bit)
                }
 #endif
                pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
-               if (*pte & bit) {
-                       crit_exit();
+               if (*pte & bit)
                        return TRUE;
-               }
        }
-       crit_exit();
        return (FALSE);
 }
 
@@ -3562,20 +3563,15 @@ pmap_ts_referenced(vm_page_t m)
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
                return (rtval);
 
-       crit_enter();
        lwkt_gettoken(&vm_token);
 
        if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
-
                pvf = pv;
-
                do {
                        pvn = TAILQ_NEXT(pv, pv_list);
 
-                       crit_enter();
                        TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
                        TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
-                       crit_exit();
 
                        if (!pmap_track_modified(pv->pv_va))
                                continue;
@@ -3596,7 +3592,6 @@ pmap_ts_referenced(vm_page_t m)
                } while ((pv = pvn) != NULL && pv != pvf);
        }
        lwkt_reltoken(&vm_token);
-       crit_exit();
 
        return (rtval);
 }
@@ -3810,6 +3805,8 @@ done:
  *
  * The vmspace for all lwps associated with the process will be adjusted
  * and cr3 will be reloaded if any lwp is the current lwp.
+ *
+ * Caller must hold vmspace_token
  */
 void
 pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
@@ -3817,25 +3814,27 @@ pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs)
        struct vmspace *oldvm;
        struct lwp *lp;
 
-       crit_enter();
        oldvm = p->p_vmspace;
        if (oldvm != newvm) {
+               if (adjrefs)
+                       sysref_get(&newvm->vm_sysref);
                p->p_vmspace = newvm;
                KKASSERT(p->p_nthreads == 1);
                lp = RB_ROOT(&p->p_lwp_tree);
                pmap_setlwpvm(lp, newvm);
-               if (adjrefs) {
-                       sysref_get(&newvm->vm_sysref);
+               if (adjrefs)
                        sysref_put(&oldvm->vm_sysref);
-               }
        }
-       crit_exit();
 }
 
 /*
  * Set the vmspace for a LWP.  The vmspace is almost universally set the
  * same as the process vmspace, but virtual kernels need to swap out contexts
  * on a per-lwp basis.
+ *
+ * Caller does not necessarily hold vmspace_token.  Caller must control
+ * the lwp (typically by being in the context of the lwp).  We use a critical
+ * section to protect against statclock and hardclock (statistics collection).
  */
 void
 pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
@@ -3843,10 +3842,10 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
        struct vmspace *oldvm;
        struct pmap *pmap;
 
-       crit_enter();
        oldvm = lp->lwp_vmspace;
 
        if (oldvm != newvm) {
+               crit_enter();
                lp->lwp_vmspace = newvm;
                if (curthread->td_lwp == lp) {
                        pmap = vmspace_pmap(newvm);
@@ -3870,8 +3869,8 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
                        pmap->pm_active &= ~(cpumask_t)1;
 #endif
                }
+               crit_exit();
        }
-       crit_exit();
 }
 
 #ifdef SMP
index 88831ab..8b3590a 100644
@@ -212,10 +212,8 @@ dev_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
                                  page, pageq);
                lwkt_gettoken(&vm_token);
                vm_object_hold(object);
-               crit_enter();
                vm_page_free(*mpp);
                vm_page_insert(page, object, offset);
-               crit_exit();
                vm_object_drop(object);
                lwkt_reltoken(&vm_token);
        }
index 4656384..a82180d 100644
@@ -82,7 +82,6 @@ phys_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
        vm_page_t m = *mpp;
 
        lwkt_gettoken(&vm_token);
-       crit_enter();
        if ((m->flags & PG_ZERO) == 0)
                vm_page_zero_fill(m);
        vm_page_flag_set(m, PG_ZERO);
@@ -90,7 +89,6 @@ phys_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
        vm_page_unmanage(m);
        m->valid = VM_PAGE_BITS_ALL;
        m->dirty = 0;
-       crit_exit();
        lwkt_reltoken(&vm_token);
 
        return (VM_PAGER_OK);
index 520e160..6005ce7 100644
@@ -455,9 +455,7 @@ swap_pager_dealloc(vm_object_t object)
         * associated with vm_page_t's for this object.  We do not care
         * if paging is still in progress on some objects.
         */
-       crit_enter();
        swp_pager_meta_free_all(object);
-       crit_exit();
        lwkt_reltoken(&vm_token);
 }
 
@@ -556,11 +554,9 @@ swp_pager_freeswapspace(vm_object_t object, swblk_t blk, int npages)
 void
 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
 {
-       crit_enter();
        lwkt_gettoken(&vm_token);
        swp_pager_meta_free(object, start, size);
        lwkt_reltoken(&vm_token);
-       crit_exit();
 }
 
 /*
@@ -569,11 +565,9 @@ swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_pindex_t size)
 void
 swap_pager_freespace_all(vm_object_t object)
 {
-       crit_enter();
        lwkt_gettoken(&vm_token);
        swp_pager_meta_free_all(object);
        lwkt_reltoken(&vm_token);
-       crit_exit();
 }
 
 /*
@@ -660,12 +654,10 @@ void
 swap_pager_page_inserted(vm_page_t m)
 {
        if (m->object->swblock_count) {
-               crit_enter();
                lwkt_gettoken(&vm_token);
                if (swp_pager_meta_ctl(m->object, m->pindex, 0) != SWAPBLK_NONE)
                        vm_page_flag_set(m, PG_SWAPPED);
                lwkt_reltoken(&vm_token);
-               crit_exit();
        }
 }
 
@@ -687,7 +679,6 @@ swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
        swblk_t blk = SWAPBLK_NONE;
        vm_pindex_t beg = start;        /* save start index */
 
-       crit_enter();
        lwkt_gettoken(&vm_token);
        while (size) {
                if (n == 0) {
@@ -700,7 +691,6 @@ swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
                                        swp_pager_meta_free(object, beg,
                                                            start - beg);
                                        lwkt_reltoken(&vm_token);
-                                       crit_exit();
                                        return(-1);
                                }
                        }
@@ -713,7 +703,6 @@ swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
        }
        swp_pager_meta_free(object, start, n);
        lwkt_reltoken(&vm_token);
-       crit_exit();
        return(0);
 }
 
@@ -752,7 +741,6 @@ swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
        vm_pindex_t i;
 
        ASSERT_LWKT_TOKEN_HELD(&vm_token);
-       crit_enter();
 
        /*
         * transfer source to destination.
@@ -806,7 +794,6 @@ swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
                if (srcobject->type == OBJT_SWAP)
                        srcobject->type = OBJT_DEFAULT;
        }
-       crit_exit();
 }
 
 /*
@@ -833,17 +820,14 @@ swap_pager_haspage(vm_object_t object, vm_pindex_t pindex)
         * do we have good backing store at the requested index ?
         */
 
-       crit_enter();
        lwkt_gettoken(&vm_token);
        blk0 = swp_pager_meta_ctl(object, pindex, 0);
 
        if (blk0 == SWAPBLK_NONE) {
                lwkt_reltoken(&vm_token);
-               crit_exit();
                return (FALSE);
        }
        lwkt_reltoken(&vm_token);
-       crit_exit();
        return (TRUE);
 }
 
@@ -871,13 +855,11 @@ void
 swap_pager_unswapped(vm_page_t m)
 {
        if (m->flags & PG_SWAPPED) {
-               crit_enter();
                lwkt_gettoken(&vm_token);
                KKASSERT(m->flags & PG_SWAPPED);
                swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
                vm_page_flag_clear(m, PG_SWAPPED);
                lwkt_reltoken(&vm_token);
-               crit_exit();
        }
 }
 
@@ -951,11 +933,9 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
                 * FREE PAGE(s) - destroy underlying swap that is no longer
                 *                needed.
                 */
-               crit_enter();
                lwkt_gettoken(&vm_token);
                swp_pager_meta_free(object, start, count);
                lwkt_reltoken(&vm_token);
-               crit_exit();
                bp->b_resid = 0;
                biodone(bio);
                return;
@@ -980,7 +960,6 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
        /*
         * Execute read or write
         */
-       crit_enter();
        lwkt_gettoken(&vm_token);
        while (count > 0) {
                swblk_t blk;
@@ -1067,7 +1046,6 @@ swap_pager_strategy(vm_object_t object, struct bio *bio)
                data += PAGE_SIZE;
        }
        lwkt_reltoken(&vm_token);
-       crit_exit();
 
        /*
         *  Flush out last buffer
@@ -1234,12 +1212,10 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
        if (mreq->valid == VM_PAGE_BITS_ALL) {
                if (swap_burst_read == 0 || mreq->pindex + 1 >= object->size)
                        return(VM_PAGER_OK);
-               crit_enter();
                lwkt_gettoken(&vm_token);
                blk = swp_pager_meta_ctl(object, mreq->pindex + 1, 0);
                if (blk == SWAPBLK_NONE) {
                        lwkt_reltoken(&vm_token);
-                       crit_exit();
                        return(VM_PAGER_OK);
                }
                m = vm_page_lookup(object, mreq->pindex + 1);
@@ -1248,13 +1224,11 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
                                          VM_ALLOC_QUICK);
                        if (m == NULL) {
                                lwkt_reltoken(&vm_token);
-                               crit_exit();
                                return(VM_PAGER_OK);
                        }
                } else {
                        if ((m->flags & PG_BUSY) || m->busy || m->valid) {
                                lwkt_reltoken(&vm_token);
-                               crit_exit();
                                return(VM_PAGER_OK);
                        }
                        vm_page_unqueue_nowakeup(m);
@@ -1263,7 +1237,6 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
                mreq = m;
                raonly = 1;
                lwkt_reltoken(&vm_token);
-               crit_exit();
        } else {
                raonly = 0;
        }
@@ -1277,7 +1250,6 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
         * Note that blk and iblk can be SWAPBLK_NONE but the loop is
         * set up such that the case(s) are handled implicitly.
         */
-       crit_enter();
        lwkt_gettoken(&vm_token);
        blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
        marray[0] = mreq;
@@ -1310,7 +1282,6 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
                vm_page_flag_set(marray[i - 1], PG_RAM);
 
        lwkt_reltoken(&vm_token);
-       crit_exit();
 
        /*
         * If mreq is the requested page and we have nothing to do return
@@ -1392,7 +1363,6 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
        /*
         * Read-ahead includes originally requested page case.
         */
-       crit_enter();
        lwkt_gettoken(&vm_token);
        while ((mreq->flags & PG_SWAPINPROG) != 0) {
                vm_page_flag_set(mreq, PG_WANTED | PG_REFERENCED);
@@ -1407,7 +1377,6 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
                }
        }
        lwkt_reltoken(&vm_token);
-       crit_exit();
 
        /*
         * mreq is left bussied after completion, but all the other pages
@@ -1506,7 +1475,6 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                 * count is too low, we may not be able to make the adjustment
                 * at this time.
                 */
-               crit_enter();
                lwkt_gettoken(&vm_token);
                n -= nsw_wcount_async_max;
                if (nsw_wcount_async + n >= 0) {
@@ -1515,7 +1483,6 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                        wakeup(&nsw_wcount_async);
                }
                lwkt_reltoken(&vm_token);
-               crit_exit();
        }
 
        /*
@@ -1539,7 +1506,6 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                n = min(BLIST_MAX_ALLOC, count - i);
                n = min(n, nsw_cluster_max);
 
-               crit_enter();
                lwkt_gettoken(&vm_token);
 
                /*
@@ -1558,7 +1524,6 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                        for (j = 0; j < n; ++j)
                                rtvals[i+j] = VM_PAGER_FAIL;
                        lwkt_reltoken(&vm_token);
-                       crit_exit();
                        continue;
                }
 
@@ -1606,7 +1571,6 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                mycpu->gd_cnt.v_swappgsout += bp->b_xio.xio_npages;
 
                lwkt_reltoken(&vm_token);
-               crit_exit();
 
                bp->b_dirtyoff = 0;             /* req'd for NFS */
                bp->b_dirtyend = bp->b_bcount;  /* req'd for NFS */
@@ -1704,7 +1668,6 @@ swp_pager_async_iodone(struct bio *bio)
         */
        if (bp->b_xio.xio_npages)
                object = bp->b_xio.xio_pages[0]->object;
-       crit_enter();
        lwkt_gettoken(&vm_token);
 
        /*
@@ -1901,7 +1864,6 @@ swp_pager_async_iodone(struct bio *bio)
        bp->b_cmd = BUF_CMD_DONE;
        relpbuf(bp, nswptr);
        lwkt_reltoken(&vm_token);
-       crit_exit();
 }
 
 /*
index f077a12..cf9a3ab 100644
@@ -233,7 +233,6 @@ vm_contig_pg_alloc(unsigned long size, vm_paddr_t low, vm_paddr_t high,
                panic("vm_contig_pg_alloc: boundary must be a power of 2");
 
        start = 0;
-       crit_enter();
 
        /*
         * Three passes (0, 1, 2).  Each pass scans the VM page list for
@@ -280,9 +279,6 @@ again:
                        vm_contig_pg_flush(PQ_INACTIVE, 
                                            vmstats.v_inactive_count);
 
-                       crit_exit(); /* give interrupts a chance */
-                       crit_enter();
-
                        /*
                         * Best effort flush of active pages.
                         *
@@ -303,8 +299,6 @@ again:
                         * to succeed, reset to 0 for the next iteration.
                         */
                        start = 0;
-                       crit_exit(); /* give interrupts a chance */
-                       crit_enter();
                        continue;       /* next pass */
                }
                start = i;
@@ -358,14 +352,12 @@ again:
                /*
                 * Our job is done, return the index page of vm_page_array.
                 */
-               crit_exit();
                return (start); /* aka &pga[start] */
        }
 
        /*
         * Failed.
         */
-       crit_exit();
        return (-1);
 }
 
@@ -419,7 +411,6 @@ vm_contig_pg_kmap(int start, u_long size, vm_map_t map, int flags)
        if (size == 0)
                panic("vm_contig_pg_kmap: size must not be 0");
 
-       crit_enter();
        lwkt_gettoken(&vm_token);
 
        /*
@@ -439,7 +430,6 @@ vm_contig_pg_kmap(int start, u_long size, vm_map_t map, int flags)
                vm_map_unlock(map);
                vm_map_entry_release(count);
                lwkt_reltoken(&vm_token);
-               crit_exit();
                return (0);
        }
 
@@ -471,7 +461,6 @@ vm_contig_pg_kmap(int start, u_long size, vm_map_t map, int flags)
        vm_object_drop(&kernel_object);
 
        lwkt_reltoken(&vm_token);
-       crit_exit();
        return (addr);
 }
 
index 8660530..1cdbf83 100644
@@ -326,16 +326,14 @@ faultin(struct proc *p)
                 * The process is waiting in the kernel to return to user
                 * mode but cannot until P_SWAPPEDOUT gets cleared.
                 */
-               crit_enter();
-               lwkt_gettoken(&proc_token);
+               lwkt_gettoken(&p->p_token);
                p->p_flag &= ~(P_SWAPPEDOUT | P_SWAPWAIT);
 #ifdef INVARIANTS
                if (swap_debug)
                        kprintf("swapping in %d (%s)\n", p->p_pid, p->p_comm);
 #endif
                wakeup(p);
-               lwkt_reltoken(&proc_token);
-               crit_exit();
+               lwkt_reltoken(&p->p_token);
        }
 }
 
@@ -529,26 +527,33 @@ swapout_procs_callback(struct proc *p, void *data)
        if (!swappable(p))
                return(0);
 
+       lwkt_gettoken(&p->p_token);
        vm = p->p_vmspace;
 
        /*
         * We only consider active processes.
         */
-       if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
+       if (p->p_stat != SACTIVE && p->p_stat != SSTOP) {
+               lwkt_reltoken(&p->p_token);
                return(0);
+       }
 
        FOREACH_LWP_IN_PROC(lp, p) {
                /*
                 * do not swap out a realtime process
                 */
-               if (RTP_PRIO_IS_REALTIME(lp->lwp_rtprio.type))
+               if (RTP_PRIO_IS_REALTIME(lp->lwp_rtprio.type)) {
+                       lwkt_reltoken(&p->p_token);
                        return(0);
+               }
 
                /*
                 * Guarentee swap_idle_threshold time in memory
                 */
-               if (lp->lwp_slptime < swap_idle_threshold1)
+               if (lp->lwp_slptime < swap_idle_threshold1) {
+                       lwkt_reltoken(&p->p_token);
                        return(0);
+               }
 
                /*
                 * If the system is under memory stress, or if we
@@ -558,6 +563,7 @@ swapout_procs_callback(struct proc *p, void *data)
                if (((action & VM_SWAP_NORMAL) == 0) &&
                    (((action & VM_SWAP_IDLE) == 0) ||
                     (lp->lwp_slptime < swap_idle_threshold2))) {
+                       lwkt_reltoken(&p->p_token);
                        return(0);
                }
 
@@ -581,12 +587,13 @@ swapout_procs_callback(struct proc *p, void *data)
         * cleanup our reference
         */
        sysref_put(&vm->vm_sysref);
+       lwkt_reltoken(&p->p_token);
 
        return(0);
 }
 
 /*
- * The caller must hold proc_token and vmspace_token.
+ * The caller must hold proc_token and vmspace_token and p->p_token
  */
 static void
 swapout(struct proc *p)
index 3dd41c1..a657c65 100644
@@ -603,6 +603,8 @@ vm_map_entry_reserve(int count)
        /*
         * Make sure we have enough structures in gd_vme_base to handle
         * the reservation request.
+        *
+        * The critical section protects access to the per-cpu gd.
         */
        crit_enter();
        while (gd->gd_vme_avail < count) {
index ae9aee5..ee49c2d 100644
@@ -835,7 +835,6 @@ vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
         * Interlock other major object operations.  This allows us to 
         * temporarily clear OBJ_WRITEABLE and OBJ_MIGHTBEDIRTY.
         */
-       crit_enter();
        vm_object_set_flag(object, OBJ_CLEANING);
 
        /*
@@ -887,7 +886,6 @@ vm_object_page_clean(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
        } while (info.error || curgeneration != object->generation);
 
        vm_object_clear_flag(object, OBJ_CLEANING);
-       crit_exit();
        vm_object_drop(object);
 }
 
@@ -1108,7 +1106,6 @@ vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
         * spl protection needed to prevent races between the lookup,
         * an interrupt unbusy/free, and our protect call.
         */
-       crit_enter();
        lwkt_gettoken(&vm_token);
        for (idx = start; idx < end; idx++) {
                p = vm_page_lookup(object, idx);
@@ -1117,7 +1114,6 @@ vm_object_pmap_copy_1(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
                vm_page_protect(p, VM_PROT_READ);
        }
        lwkt_reltoken(&vm_token);
-       crit_exit();
 }
 
 /*
@@ -1139,14 +1135,12 @@ vm_object_pmap_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end)
        info.start_pindex = start;
        info.end_pindex = end - 1;
 
-       crit_enter();
        lwkt_gettoken(&vm_token);
        vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
                                vm_object_pmap_remove_callback, &info);
        if (start == 0 && end == object->size)
                vm_object_clear_flag(object, OBJ_WRITEABLE);
        lwkt_reltoken(&vm_token);
-       crit_exit();
 }
 
 /*
@@ -1218,7 +1212,6 @@ shadowlookup:
                 * lookup, an interrupt unbusy/free, and our busy check.
                 */
 
-               crit_enter();
                m = vm_page_lookup(tobject, tpindex);
 
                if (m == NULL) {
@@ -1231,7 +1224,6 @@ shadowlookup:
                        /*
                         * next object
                         */
-                       crit_exit();
                        if (tobject->backing_object == NULL)
                                continue;
                        tpindex += OFF_TO_IDX(tobject->backing_object_offset);
@@ -1251,16 +1243,13 @@ shadowlookup:
                    (m->flags & PG_UNMANAGED) ||
                    m->valid != VM_PAGE_BITS_ALL
                ) {
-                       crit_exit();
                        continue;
                }
 
                if (vm_page_sleep_busy(m, TRUE, "madvpo")) {
-                       crit_exit();
                        goto relookup;
                }
                vm_page_busy(m);
-               crit_exit();
 
                /*
                 * Theoretically once a page is known not to be busy, an
@@ -1384,8 +1373,6 @@ vm_object_backing_scan(vm_object_t object, int op)
        struct rb_vm_page_scan_info info;
        vm_object_t backing_object;
 
-       crit_enter();
-
        backing_object = object->backing_object;
        info.backing_offset_index = OFF_TO_IDX(object->backing_object_offset);
 
@@ -1404,7 +1391,6 @@ vm_object_backing_scan(vm_object_t object, int op)
                 * shadow test may succeed! XXX
                 */
                if (backing_object->type != OBJT_DEFAULT) {
-                       crit_exit();
                        return(0);
                }
        }
@@ -1428,7 +1414,7 @@ vm_object_backing_scan(vm_object_t object, int op)
                                        vm_object_backing_scan_callback,
                                        &info);
        } while (info.error < 0);
-       crit_exit();
+
        return(info.error);
 }
 
@@ -1680,7 +1666,6 @@ vm_object_collapse(vm_object_t object)
                        /*
                         * Move the pager from backing_object to object.
                         */
-
                        if (backing_object->type == OBJT_SWAP) {
                                vm_object_pip_add(backing_object, 1);
 
@@ -1847,7 +1832,6 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
        /*
         * Indicate that paging is occuring on the object
         */
-       crit_enter();
        vm_object_pip_add(object, 1);
 
        /*
@@ -1889,7 +1873,6 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end,
         * Cleanup
         */
        vm_object_pip_wakeup(object);
-       crit_exit();
        lwkt_reltoken(&vm_token);
 }
 
index 589aad8..e5e1af2 100644
@@ -519,11 +519,9 @@ vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object,
                        remove_mode = 1;
 
                /*
-                * scan the objects entire memory queue.  spl protection is
-                * required to avoid an interrupt unbusy/free race against
-                * our busy check.
+                * scan the object's entire memory queue.  We hold the
+                * object's token, so the scan should not race anything.
                 */
-               crit_enter();
                info.limit = remove_mode;
                info.map = map;
                info.desired = desired;
@@ -531,7 +529,6 @@ vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object,
                                vm_pageout_object_deactivate_pages_callback,
                                &info
                );
-               crit_exit();
                tmp = object->backing_object;
                vm_object_drop(object);
                object = tmp;
@@ -678,27 +675,23 @@ vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t desired)
 #endif
 
 /*
- * Don't try to be fancy - being fancy can lead to vnode deadlocks.   We
- * only do it for OBJT_DEFAULT and OBJT_SWAP objects which we know can
- * be trivially freed.
+ * Called when the pageout scan wants to free a page.  We no longer
+ * try to cycle the vm_object here with a reference & dealloc, which can
+ * cause a non-trivial object collapse in a critical path.
  *
- * The caller must hold vm_token.
+ * It is unclear why we cycled the ref_count in the past, perhaps to
+ * try to optimize shadow chain collapses, but I don't quite see why it
+ * would be necessary.  An OBJ_DEAD object should terminate any and all
+ * vm_pages synchronously and not have to be kick-started.
  *
- * WARNING: vm_object_reference() can block.
+ * The caller must hold vm_token.
  */
 static void
 vm_pageout_page_free(vm_page_t m) 
 {
-       vm_object_t object = m->object;
-       int type = object->type;
-
        vm_page_busy(m);
-       if (type == OBJT_SWAP || type == OBJT_DEFAULT)
-               vm_object_reference(object);
        vm_page_protect(m, VM_PROT_NONE);
        vm_page_free(m);
-       if (type == OBJT_SWAP || type == OBJT_DEFAULT)
-               vm_object_deallocate(object);
 }
 
 /*
@@ -783,7 +776,6 @@ vm_pageout_scan(int pass)
         * check, leaving us on the wrong queue or checking the wrong
         * page.
         */
-       crit_enter();
 rescan0:
        vpfailed = NULL;
        maxscan = vmstats.v_inactive_count;
@@ -793,12 +785,6 @@ rescan0:
         ) {
                mycpu->gd_cnt.v_pdpages++;
 
-               /*
-                * Give interrupts a chance
-                */
-               crit_exit();
-               crit_enter();
-
                /*
                 * It's easier for some of the conditions below to just loop
                 * and catch queue changes here rather then check everywhere
@@ -1100,12 +1086,6 @@ rescan0:
        while ((m != NULL) && (pcount-- > 0) &&
               (inactive_shortage > 0 || active_shortage > 0)
        ) {
-               /*
-                * Give interrupts a chance.
-                */
-               crit_exit();
-               crit_enter();
-
                /*
                 * If the page was ripped out from under us, just stop.
                 */
@@ -1252,8 +1232,6 @@ rescan0:
                mycpu->gd_cnt.v_dfree++;
        }
 
-       crit_exit();
-
 #if !defined(NO_SWAPPING)
        /*
         * Idle process swapout -- run once per second.
@@ -1402,8 +1380,6 @@ vm_pageout_page_stats(void)
        if (page_shortage <= 0)
                return;
 
-       crit_enter();
-
        pcount = vmstats.v_active_count;
        fullintervalcount += vm_pageout_stats_interval;
        if (fullintervalcount < vm_pageout_full_stats_interval) {
@@ -1472,7 +1448,6 @@ vm_pageout_page_stats(void)
 
                m = next;
        }
-       crit_exit();
 }
 
 /*
index f291645..e6d9b9a 100644
@@ -172,7 +172,6 @@ vm_swapcached_thread(void)
        EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
                              NULL, SHUTDOWN_PRI_SECOND);
        lwkt_gettoken(&vm_token);
-       crit_enter();
 
        /*
         * Initialize our marker for the inactive scan (SWAPC_WRITING)
@@ -266,7 +265,6 @@ vm_swapcached_thread(void)
         * Cleanup (NOT REACHED)
         */
        TAILQ_REMOVE(INACTIVE_LIST, &page_marker, pageq);
-       crit_exit();
        lwkt_reltoken(&vm_token);
 
        lwkt_gettoken(&vmobj_token);