kernel - reformulate some of the pmap code to adhere to the new rules
author     Matthew Dillon <dillon@apollo.backplane.com>
           Tue, 8 Nov 2011 19:44:38 +0000 (11:44 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Tue, 8 Nov 2011 19:44:38 +0000 (11:44 -0800)
* hold_count can now be adjusted asynchronously, so it can no longer be used
  to count the number of ptes in a page table page.  Use wire_count instead
  (see the sketch following this list).

* On i386 there is no extra wire_count for the page table page mapping itself,
  so the pmap_release code should never find a page table page other than the
  page directory page itself.

* Reformulate the *ptep checks in x86-64's pmap_scan() code to avoid races.

* Document why busy pages can sometimes be found in the PQ_FREE queues.

* Document that, aside from the occasional busy page, pages found in PQ_FREE
  should be ready to go.

* It is possible for a wired page to wind up on the inactive queue; the
  pageout daemon has to deal with this case by dequeueing it (see the second
  sketch below).
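
The wire_count discipline from the first bullet reduces to the following
sketch.  It is illustrative only: the vm_page calls are the ones used in the
hunks below, but the helper name ptp_drop_ref() is hypothetical; the real
code lives in pmap_unwire_pte()/_pmap_unwire_pte().

  /*
   * Sketch: drop one reference on a page table page.  Every pte installed
   * in the page accounts for one wiring (added with vm_page_wire_quick());
   * removing a pte drops one wiring.  When the count drops to its last
   * reference, the page table page itself is unmapped and freed.
   */
  static int
  ptp_drop_ref(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
  {
          KKASSERT(m->wire_count > 0);
          if (m->wire_count > 1) {
                  /* Other ptes still reference this page table page. */
                  if (vm_page_unwire_quick(m))
                          panic("ptp_drop_ref: insufficient wire_count");
                  return 0;
          }
          /* Last reference: unmap, unwire, and free the page. */
          return _pmap_unwire_pte(pmap, m, info);
  }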

sys/platform/pc32/i386/pmap.c
sys/platform/pc64/include/pmap.h
sys/platform/pc64/x86_64/pmap.c
sys/vm/vm_page.c
sys/vm/vm_pageout.c
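
The interaction between the busy-page and wired-page bullets, condensed from
the vm_page.c and vm_pageout.c hunks further down (shown here only to make
the flow easier to follow, not as the exact committed code):

  /*
   * vm_page_select_free(): a PQ_FREE page that cannot be busied is
   * deactivated so the allocation can retry.  If the competing thread
   * wires the page, it may remain on the inactive queue.
   */
  if (vm_page_busy_try(m, TRUE)) {
          _vm_page_deactivate_locked(m, 0);
          vm_page_spin_unlock(m);
  }

  /*
   * vm_pageout_scan_inactive(): the pageout daemon later finds the wired
   * page on the inactive queue and simply unqueues it.
   */
  if (m->wire_count) {
          vm_page_unqueue_nowakeup(m);
          vm_page_wakeup(m);
          continue;
  }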

diff --git a/sys/platform/pc32/i386/pmap.c b/sys/platform/pc32/i386/pmap.c
index edddba3..8a8c95f 100644
@@ -83,6 +83,7 @@
 #include <sys/thread2.h>
 #include <sys/sysref2.h>
 #include <sys/spinlock2.h>
+#include <vm/vm_page2.h>
 
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
@@ -1027,14 +1028,14 @@ pmap_dispose_proc(struct proc *p)
  ***************************************************/
 
 /*
- * This routine unholds page table pages, and if the hold count
- * drops to zero, then it decrements the wire count.
+ * This routine unwires page table pages, removing and freeing the page
+ * table page when the wire count drops to 0.
  *
  * The caller must hold vm_token.
  * This function can block.
  */
 static int 
-_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info) 
+_pmap_unwire_pte(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
 {
        /* 
         * Wait until we can busy the page ourselves.  We cannot have
@@ -1042,9 +1043,9 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
         */
        vm_page_busy_wait(m, FALSE, "pmuwpt");
        KASSERT(m->queue == PQ_NONE,
-               ("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m));
+               ("_pmap_unwire_pte: %p->queue != PQ_NONE", m));
 
-       if (m->hold_count == 1) {
+       if (m->wire_count == 1) {
                /*
                 * Unmap the page table page.
                 *
@@ -1071,17 +1072,15 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
                 * FUTURE NOTE: shared page directory page could result in
                 * multiple wire counts.
                 */
-               vm_page_unhold(m);
-               --m->wire_count;
-               KKASSERT(m->wire_count == 0);
-               atomic_add_int(&vmstats.v_wire_count, -1);
+               vm_page_unwire(m, 0);
                vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
                vm_page_flash(m);
                vm_page_free_zero(m);
                return 1;
        } else {
-               KKASSERT(m->hold_count > 1);
-               vm_page_unhold(m);
+               KKASSERT(m->wire_count > 1);
+               if (vm_page_unwire_quick(m))
+                       panic("pmap_unwire_pte: Insufficient wire_count");
                vm_page_wakeup(m);
                return 0;
        }
@@ -1092,14 +1091,15 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
  * This function can block.
  */
 static PMAP_INLINE int
-pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
+pmap_unwire_pte(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
 {
-       KKASSERT(m->hold_count > 0);
-       if (m->hold_count > 1) {
-               vm_page_unhold(m);
+       KKASSERT(m->wire_count > 0);
+       if (m->wire_count > 1) {
+               if (vm_page_unwire_quick(m))
+                       panic("pmap_unwire_pte: Insufficient wire_count");
                return 0;
        } else {
-               return _pmap_unwire_pte_hold(pmap, m, info);
+               return _pmap_unwire_pte(pmap, m, info);
        }
 }
 
@@ -1133,7 +1133,7 @@ pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte,
                }
        }
 
-       return pmap_unwire_pte_hold(pmap, mpte, info);
+       return pmap_unwire_pte(pmap, mpte, info);
 }
 
 /*
@@ -1241,8 +1241,7 @@ pmap_puninit(pmap_t pmap)
                KKASSERT(pmap->pm_pdir != NULL);
                pmap_kremove((vm_offset_t)pmap->pm_pdir);
                vm_page_busy_wait(p, FALSE, "pgpun");
-               p->wire_count--;
-               atomic_add_int(&vmstats.v_wire_count, -1);
+               vm_page_unwire(p, 0);
                vm_page_free_zero(p);
                pmap->pm_pdirm = NULL;
        }
@@ -1311,8 +1310,8 @@ pmap_release_free_page(struct pmap *pmap, vm_page_t p)
        --pmap->pm_stats.resident_count;
        pmap->pm_cached = 0;
 
-       if (p->hold_count)  {
-               panic("pmap_release: freeing held page table page");
+       if (p->wire_count != 1)  {
+               panic("pmap_release: freeing wired page table page");
        }
        if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
                pmap->pm_ptphint = NULL;
@@ -1331,8 +1330,9 @@ pmap_release_free_page(struct pmap *pmap, vm_page_t p)
                vm_page_flag_set(p, PG_ZERO);
                vm_page_wakeup(p);
        } else {
-               p->wire_count--;
-               atomic_add_int(&vmstats.v_wire_count, -1);
+               panic("pmap_release: page should already be gone %p", p);
+               /*vm_page_flag_clear(p, PG_MAPPED); should already be clear */
+               vm_page_unwire(p, 0);
                vm_page_free_zero(p);
        }
        return 1;
@@ -1360,15 +1360,15 @@ _pmap_allocpte(pmap_t pmap, unsigned ptepindex)
                ("_pmap_allocpte: %p->queue != PQ_NONE", m));
 
        /*
-        * Increment the hold count for the page we will be returning to
+        * Increment the wire count for the page we will be returning to
         * the caller.
         */
-       m->hold_count++;
+       vm_page_wire(m);
 
        /*
         * It is possible that someone else got in and mapped by the page
         * directory page while we were blocked, if so just unbusy and
-        * return the held page.
+        * return the wired page.
         */
        if ((ptepa = pmap->pm_pdir[ptepindex]) != 0) {
                KKASSERT((ptepa & PG_FRAME) == VM_PAGE_TO_PHYS(m));
@@ -1376,11 +1376,6 @@ _pmap_allocpte(pmap_t pmap, unsigned ptepindex)
                return(m);
        }
 
-       if (m->wire_count == 0)
-               atomic_add_int(&vmstats.v_wire_count, 1);
-       m->wire_count++;
-
-
        /*
         * Map the pagetable page into the process address space, if
         * it isn't already there.
@@ -1442,7 +1437,7 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va)
 
        /*
         * If the page table page is mapped, we just increment the
-        * hold count, and activate it.
+        * wire count, and activate it.
         */
        if (ptepa) {
                /*
@@ -1457,7 +1452,7 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va)
                        pmap->pm_ptphint = m;
                        vm_page_wakeup(m);
                }
-               m->hold_count++;
+               vm_page_wire_quick(m);
                return m;
        }
        /*
@@ -1819,7 +1814,7 @@ pmap_remove_pte(struct pmap *pmap, unsigned *ptq, vm_offset_t va,
        if (oldpte & PG_W)
                pmap->pm_stats.wired_count -= 1;
        pmap_inval_deinterlock(info, pmap);
-       KKASSERT(oldpte);
+       KKASSERT(oldpte & PG_V);
        /*
         * Machines that don't support invlpg, also don't support
         * PG_G.  XXX PG_G is disabled for SMP so don't worry about
@@ -1863,7 +1858,7 @@ pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info)
        unsigned *ptq;
 
        /*
-        * if there is no pte for this address, just skip it!!!  Otherwise
+        * If there is no pte for this address, just skip it!!!  Otherwise
         * get a local va for mappings for this pmap and remove the entry.
         */
        if (*pmap_pde(pmap, va) != 0) {
@@ -2249,8 +2244,10 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
                 * wiring count above and may need to adjust the wiring
                 * bits below.
                 */
-               if (mpte)
-                       mpte->hold_count--;
+               if (mpte) {
+                       if (vm_page_unwire_quick(mpte))
+                               panic("pmap_enter: Insufficient wire_count");
+               }
 
                /*
                 * We might be turning off write access to the page,
@@ -2410,7 +2407,7 @@ pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
 
                        /*
                         * If the page table page is mapped, we just increment
-                        * the hold count, and activate it.
+                        * the wire count, and activate it.
                         */
                        if (ptepa) {
                                if (ptepa & PG_PS)
@@ -2424,7 +2421,7 @@ pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
                                        vm_page_wakeup(mpte);
                                }
                                if (mpte)
-                                       mpte->hold_count++;
+                                       vm_page_wire_quick(mpte);
                        } else {
                                mpte = _pmap_allocpte(pmap, ptepindex);
                        }
@@ -2442,7 +2439,7 @@ pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
        pte = (unsigned *)vtopte(va);
        if (*pte & PG_V) {
                if (mpte)
-                       pmap_unwire_pte_hold(pmap, mpte, &info);
+                       pmap_unwire_pte(pmap, mpte, &info);
                pa = VM_PAGE_TO_PHYS(m);
                KKASSERT(((*pte ^ pa) & PG_FRAME) == 0);
                pmap_inval_done(&info);
diff --git a/sys/platform/pc64/include/pmap.h b/sys/platform/pc64/include/pmap.h
index 563809e..e475195 100644
@@ -55,7 +55,7 @@
 
 /*
  * Size of Kernel address space.  This is the number of page table pages
- * (2MB each) to use for the kernel.  256 pages == 512 Megabyte.
+ * (2GB each) to use for the kernel.  256 pages == 512 Gigabytes.
  * This **MUST** be a multiple of 4 (eg: 252, 256, 260, etc).
  */
 #ifndef KVA_PAGES
diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c
index 2ef1346..f9ade37 100644
@@ -1,6 +1,4 @@
 /*
- * (MPSAFE)
- *
  * Copyright (c) 1991 Regents of the University of California.
  * Copyright (c) 1994 John S. Dyson
  * Copyright (c) 1994 David Greenman
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- *     from:   @(#)pmap.c      7.7 (Berkeley)  5/12/91
- * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
  */
-
 /*
- *     Manages physical address maps.
- *
- *     In addition to hardware address maps, this
- *     module is called upon to provide software-use-only
- *     maps which may or may not be stored in the same
- *     form as hardware maps.  These pseudo-maps are
- *     used to store intermediate results from copy
- *     operations to and from address spaces.
- *
- *     Since the information managed by this module is
- *     also stored by the logical address mapping module,
- *     this module may throw away valid virtual-to-physical
- *     mappings at almost any time.  However, invalidations
- *     of virtual-to-physical mappings must be done as
- *     requested.
- *
- *     In order to cope with hardware architectures which
- *     make virtual-to-physical map invalidates expensive,
- *     this module may delay invalidate or reduced protection
- *     operations until such time as they are actually
- *     necessary.  This module is given full information as
- *     to which processors are currently using which maps,
- *     and to when physical maps must be made correct.
+ * Manage physical address maps for x86-64 systems.
  */
 
 #if JG
@@ -2658,7 +2630,6 @@ fast_skip:
                        continue;
                }
 
-retry_pt_pv:
                /*
                 * PT cache
                 */
@@ -2731,51 +2702,18 @@ kernel_skip:
 
                while (sva < va_next) {
                        /*
-                        * NOTE: It is possible for an empty ptep to
-                        *       race a pv removal by some other means
-                        *       (typically via the vm_page_t).
+                        * Acquire the related pte_pv, if any.  If *ptep == 0
+                        * the related pte_pv should not exist, but if *ptep
+                        * is not zero the pte_pv may or may not exist (e.g.
+                        * will not exist for an unmanaged page).
                         *
-                        *       For now debug the situation by locking
-                        *       the potentially conflicting pv and
-                        *       re-checking.
+                        * However a multitude of races are possible here.
+                        *
+                        * In addition, the (pt_pv, pte_pv) lock order is
+                        * backwards, so we have to be careful in acquiring
+                        * a properly locked pte_pv.
                         */
                        lwkt_yield();
-                       if (*ptep == 0) {
-                               pte_pv = pv_find(pmap, pmap_pte_pindex(sva));
-                               if (pte_pv == NULL) {
-                                       sva += PAGE_SIZE;
-                                       ++ptep;
-                                       continue;
-                               }
-
-                               /*
-                                * Possible race against another thread
-                                * removing the pte.
-                                */
-                               kprintf("pmap_scan: caught race func %p", func);
-                               if (pt_pv) {
-                                       pv_put(pt_pv);   /* must be non-NULL */
-                                       pt_pv = NULL;
-                               }
-                               pv_lock(pte_pv);        /* safe to block now */
-                               if (pte_pv->pv_pmap) {
-                                       kprintf(", uh oh, pte_pv still good\n");
-                                       panic("unexpected non-NULL pte_pv %p",
-                                             pte_pv);
-                               } else {
-                                       kprintf(", resolved ok\n");
-                               }
-                               pv_put(pte_pv);
-                               pte_pv = NULL;
-                               goto retry_pt_pv;
-                       }
-
-                       /*
-                        * We need a locked pte_pv as well but just like the
-                        * above, the lock order is all wrong so if we
-                        * cannot acquire it non-blocking we will have to
-                        * undo a bunch of stuff and retry.
-                        */
                        if (pt_pv) {
                                pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva),
                                                    &error);
@@ -2802,19 +2740,25 @@ kernel_skip:
                        }
 
                        /*
-                        * *ptep can get ripped out while we were blocked.
+                        * Ok, if *ptep == 0 we had better NOT have a pte_pv.
                         */
                        if (*ptep == 0) {
                                if (pte_pv) {
-                                       pv_put(pte_pv);
-                                       pte_pv = NULL;
+                                       kprintf("Unexpected non-NULL pte_pv "
+                                               "%p pt_pv %p *ptep = %016lx\n",
+                                               pte_pv, pt_pv, *ptep);
+                                       panic("Unexpected non-NULL pte_pv");
                                }
+                               sva += PAGE_SIZE;
+                               ++ptep;
                                continue;
                        }
 
                        /*
-                        * Ready for the callback.  The locked pte_pv (if
-                        * not NULL) is consumed by the callback.
+                        * Ready for the callback.  The locked pte_pv (if any)
+                        * is consumed by the callback.  pte_pv will exist if
+                        * the page is managed, and will not exist if it
+                        * isn't.
                         */
                        if (pte_pv) {
                                KASSERT((*ptep & (PG_MANAGED|PG_V)) ==
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index dcb3faa..731b842 100644
@@ -1226,12 +1226,35 @@ vm_page_select_free(u_short pg_color, boolean_t prefer_zero)
                if (m == NULL)
                        break;
                if (vm_page_busy_try(m, TRUE)) {
+                       /*
+                        * Various mechanisms such as a pmap_collect can
+                        * result in a busy page on the free queue.  We
+                        * have to move the page out of the way so we can
+                        * retry the allocation.  If the other thread is not
+                        * allocating the page then m->valid will remain 0 and
+                        * the pageout daemon will free the page later on.
+                        *
+                        * Since we could not busy the page, however, we
+                        * cannot make assumptions as to whether the page
+                        * will be allocated by the other thread or not,
+                        * so all we can do is deactivate it to move it out
+                        * of the way.  In particular, if the other thread
+                        * wires the page it may wind up on the inactive
+                        * queue and the pageout daemon will have to deal
+                        * with that case too.
+                        */
                        _vm_page_deactivate_locked(m, 0);
                        vm_page_spin_unlock(m);
 #ifdef INVARIANTS
                         kprintf("Warning: busy page %p found in cache\n", m);
 #endif
                } else {
+                       /*
+                        * Theoretically if we are able to busy the page
+                        * atomic with the queue removal (using the vm_page
+                        * lock) nobody else should be able to mess with the
+                        * page before us.
+                        */
                        KKASSERT((m->flags & PG_UNMANAGED) == 0);
                        KKASSERT(m->hold_count == 0);
                        KKASSERT(m->wire_count == 0);
@@ -1714,13 +1737,7 @@ vm_page_free_fromq_fast(void)
                                 */
                                _vm_page_deactivate_locked(m, 0);
                                vm_page_spin_unlock(m);
-                       } else if ((m->flags & PG_ZERO) == 0) {
-                               /*
-                                * The page is not PG_ZERO'd so return it.
-                                */
-                               vm_page_spin_unlock(m);
-                               break;
-                       } else {
+                       } else if (m->flags & PG_ZERO) {
                                /*
                                 * The page is PG_ZERO, requeue it and loop
                                 */
@@ -1734,6 +1751,15 @@ vm_page_free_fromq_fast(void)
                                } else {
                                        vm_page_spin_unlock(m);
                                }
+                       } else {
+                               /*
+                                * The page is not PG_ZERO'd so return it.
+                                */
+                               vm_page_spin_unlock(m);
+                               KKASSERT((m->flags & PG_UNMANAGED) == 0);
+                               KKASSERT(m->hold_count == 0);
+                               KKASSERT(m->wire_count == 0);
+                               break;
                        }
                        m = NULL;
                }
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index b4e819c..6178152 100644
@@ -847,6 +847,22 @@ vm_pageout_scan_inactive(int pass, int q, int inactive_shortage,
                 * either.
                 */
 
+               /*
+                * It is possible for a page to be busied ad-hoc (e.g. the
+                * pmap_collect() code) and wired and race against the
+                * allocation of a new page.  vm_page_alloc() may be forced
+                * to deactivate the wired page in which case it winds up
+                * on the inactive queue and must be handled here.  We
+                * correct the problem simply by unqueuing the page.
+                */
+               if (m->wire_count) {
+                       vm_page_unqueue_nowakeup(m);
+                       vm_page_wakeup(m);
+                       kprintf("WARNING: pagedaemon: wired page on "
+                               "inactive queue %p\n", m);
+                       continue;
+               }
+
                /*
                 * A held page may be undergoing I/O, so skip it.
                 */