kernel - Preliminary vm_page hash lookup (2), cleanups, page wiring
author Matthew Dillon <dillon@apollo.backplane.com>
Sat, 23 Mar 2019 22:29:14 +0000 (15:29 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Wed, 27 Mar 2019 03:32:47 +0000 (20:32 -0700)
* Correct a bug in vm.fault_quick operation.  Soft-busied pages cannot
  be safely wired or unwired.  This fixes a wire/unwire race-related
  panic (see the toy sketch after this list).

* Optimize vm_page_unhold() so that it normally does not have to obtain
  any spin-locks at all, since related pages are almost never in the
  PQ_HOLD VM page queue.  This leaves open a minor race condition where
  pages with a hold_count of 0 can accumulate in PQ_HOLD.

* Add vm_page_scan_hold() to the pageout daemon.  It unconditionally
  scans PQ_HOLD very slowly to remove any pages whose hold_count is 0.

* REFACTOR PAGE WIRING.  Wiring vm_pages no longer removes them from
  whatever paging queue they are on.  Instead, such pages are removed
  from their queue only when necessary (typically in the pageout code).

* Remove unused PV_FLAG_VMOBJECT.

* Fix missing atomic-op in pc64/x86_64/efirt.c

* Do not use m->md.pv_list for pagetable pages.  It is now only used for
  terminal pages.

* Properly initialize pv_flags to 0 when a pv_entry is allocated.

* Add debugging to detect managed pmap_enter()s without an object.

* Conditionalize the setting of PG_MAPPED and PG_WRITEABLE in the
  pmap code to avoid unnecessary cpu cache mastership changes.

* Move assertions in vm_pageout.c that could trigger improperly due
  to a race.
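
A toy userland sketch of the rule behind the first bullet (illustration
only; the types, bit values, and helper names below are made up and
merely mirror the reordered vm_fault.c success path shown in the diff):
wiring needs the page hard-busied, so wiring is only touched on the
hard-busy branch, and the soft-busy-only completion never sees a wire
fault.

/* Toy model only: types and bit values are illustrative. */
#include <assert.h>
#include <stdio.h>

#define PBUSY_LOCKED    0x80000000u     /* hard-busy (illustrative bit) */

struct toy_page {
        unsigned busy_count;            /* hard-busy bit + soft-busy count */
        int      wire_count;
};

static void
toy_page_wire(struct toy_page *m)
{
        /* the invariant the fix restores: wiring requires the hard-busy */
        assert(m->busy_count & PBUSY_LOCKED);
        ++m->wire_count;
}

/* condensed shape of the reordered success path in vm_fault.c */
static void
toy_fault_success(struct toy_page *m, int msoftonly, int wire_fault)
{
        if (msoftonly) {
                /* soft-busied only: the quick path refused wire faults */
                assert(wire_fault == 0);
                --m->busy_count;                /* vm_page_sbusy_drop() */
        } else {
                if (wire_fault)
                        toy_page_wire(m);       /* safe: we hold the hard-busy */
                m->busy_count &= ~PBUSY_LOCKED; /* vm_page_wakeup() */
        }
}

int
main(void)
{
        struct toy_page m = { .busy_count = PBUSY_LOCKED, .wire_count = 0 };

        toy_fault_success(&m, 0, 1);    /* hard-busied wire fault is fine */
        printf("wire_count=%d busy_count=%#x\n", m.wire_count, m.busy_count);
        return 0;
}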

sys/platform/pc64/include/pmap.h
sys/platform/pc64/x86_64/efirt.c
sys/platform/pc64/x86_64/pmap.c
sys/vm/vm_fault.c
sys/vm/vm_page.c
sys/vm/vm_page.h
sys/vm/vm_pageout.c

diff --git a/sys/platform/pc64/include/pmap.h b/sys/platform/pc64/include/pmap.h
index 8ec4019..b0540b5 100644
@@ -362,7 +362,8 @@ typedef struct pv_entry {
 #define PV_HOLD_UNUSED2000     0x20000000U
 #define PV_HOLD_MASK           0x1FFFFFFFU
 
-#define PV_FLAG_VMOBJECT       0x00000001U     /* shared pt in VM obj */
+#define PV_FLAG_UNUSED01       0x00000001U
+#define PV_FLAG_PGTABLE                0x00000002U     /* page table page */
 
 #ifdef _KERNEL
 
diff --git a/sys/platform/pc64/x86_64/efirt.c b/sys/platform/pc64/x86_64/efirt.c
index bd5c5f8..ac53e0b 100644
@@ -138,7 +138,7 @@ efi_destroy_1t1_map(void)
                while ((m = RB_ROOT(&obj->rb_memq)) != NULL) {
                        vm_page_busy_wait(m, FALSE, "efipg");
                        vm_page_unwire(m, 1);
-                       m->flags &= ~(PG_MAPPED | PG_WRITEABLE);
+                       vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
                        cdev_pager_free_page(obj, m);
                        kfree(m, M_EFI);
                }
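
Why the efirt.c hunk above switches from a plain "m->flags &= ~(...)"
to vm_page_flag_clear(): a non-atomic read-modify-write on a flags word
that other CPUs update concurrently can silently lose updates.  A small
userland demonstration of the principle (illustration only, plain C11
atomics, nothing kernel-specific; the "plain" variant is a deliberate
data race):

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NTHREADS 8
#define NROUNDS  100000

static unsigned flags_plain;            /* updated with a racy "|=" */
static atomic_uint flags_atomic;        /* updated with atomic_fetch_or() */
static pthread_barrier_t barrier;

static void *
worker(void *arg)
{
        unsigned bit = 1u << (unsigned)(uintptr_t)arg;
        int i;

        for (i = 0; i < NROUNDS; ++i) {
                pthread_barrier_wait(&barrier);         /* start of round */
                flags_plain |= bit;                     /* non-atomic RMW */
                atomic_fetch_or(&flags_atomic, bit);    /* atomic RMW */
                pthread_barrier_wait(&barrier);         /* end of round */
        }
        return NULL;
}

int
main(void)
{
        pthread_t td[NTHREADS];
        unsigned all = (1u << NTHREADS) - 1;
        long lost_plain = 0, lost_atomic = 0;
        int i;

        pthread_barrier_init(&barrier, NULL, NTHREADS + 1);
        for (i = 0; i < NTHREADS; ++i)
                pthread_create(&td[i], NULL, worker, (void *)(uintptr_t)i);

        for (i = 0; i < NROUNDS; ++i) {
                flags_plain = 0;
                atomic_store(&flags_atomic, 0);
                pthread_barrier_wait(&barrier);         /* run the round */
                pthread_barrier_wait(&barrier);         /* updates done */
                if (flags_plain != all)
                        ++lost_plain;                   /* a set bit was lost */
                if (atomic_load(&flags_atomic) != all)
                        ++lost_atomic;                  /* stays at zero */
        }
        for (i = 0; i < NTHREADS; ++i)
                pthread_join(td[i], NULL);

        printf("rounds with lost bits: plain %ld  atomic %ld\n",
            lost_plain, lost_atomic);
        return 0;
}
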
diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c
index 00ea232..92d4051 100644
@@ -271,6 +271,10 @@ SYSCTL_INT(_machdep, OID_AUTO, pmap_nx_enable, CTLFLAG_RD,
     &pmap_nx_enable, 0,
     "no-execute support (0=disabled, 1=w/READ, 2=w/READ & WRITE)");
 
+static int pmap_pv_debug = 50;
+SYSCTL_INT(_machdep, OID_AUTO, pmap_pv_debug, CTLFLAG_RW,
+    &pmap_pv_debug, 0, "");
+
 /* Standard user access funtions */
 extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
     size_t *lencopied);
@@ -2534,7 +2538,15 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
 
        vm_page_spin_lock(m);
        pmap_page_stats_adding(m);
+
+       /*
+        * PGTABLE pv's only exist in the context of the pmap RB tree
+        * (pmap->pm_pvroot).
+        */
+#if 0
        TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+#endif
+       pv->pv_flags |= PV_FLAG_PGTABLE;
        pv->pv_m = m;
        vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
        vm_page_spin_unlock(m);
@@ -3465,10 +3477,15 @@ pmap_remove_pv_page(pv_entry_t pv)
        vm_page_spin_lock(m);
        KKASSERT(m && m == pv->pv_m);
        pv->pv_m = NULL;
-       TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-       pmap_page_stats_deleting(m);
-       if (TAILQ_EMPTY(&m->md.pv_list))
+       if (pv->pv_flags & PV_FLAG_PGTABLE) {
                vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+               KKASSERT(TAILQ_EMPTY(&m->md.pv_list));
+       } else {
+               TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+               if (TAILQ_EMPTY(&m->md.pv_list))
+                       vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+       }
+       pmap_page_stats_deleting(m);
        vm_page_spin_unlock(m);
 
        return(m);
@@ -3795,6 +3812,7 @@ _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL)
                        pnew->pv_pmap = pmap;
                        pnew->pv_pindex = pindex;
                        pnew->pv_hold = PV_HOLD_LOCKED | 2;
+                       pnew->pv_flags = 0;
 #ifdef PMAP_DEBUG
                        pnew->pv_func = func;
                        pnew->pv_line = lineno;
@@ -4968,6 +4986,12 @@ pmap_remove_all(vm_page_t m)
 
        vm_page_spin_lock(m);
        while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
+               if (pv->pv_m != m) {
+                       kprintf("pmap_remove_all FAILURE\n");
+                       kprintf("pv %p pv->pv_m %p m %p\n", pv, pv->pv_m, m);
+                       kprintf("pvflags %08x\n", pv->pv_flags);
+               }
+
                KKASSERT(pv->pv_m == m);
                if (pv_hold_try(pv)) {
                        vm_page_spin_unlock(m);
@@ -5434,14 +5458,29 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
                 *
                 * Enter on the PV list if part of our managed memory.
                 */
+
+               if (m->object == NULL && pmap_pv_debug > 0) {
+                       --pmap_pv_debug;
+                       kprintf("pte_m %p pv_entry %p NOOBJ\n", m, pte_pv);
+                       print_backtrace(16);
+               }
+
                KKASSERT(pte_pv && (pte_pv->pv_m == NULL || pte_pv->pv_m == m));
                vm_page_spin_lock(m);
                pte_pv->pv_m = m;
                pmap_page_stats_adding(m);
                TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list);
-               vm_page_flag_set(m, PG_MAPPED);
-               if (newpte & pmap->pmap_bits[PG_RW_IDX])
+
+               /*
+                * Set vm_page flags.  Avoid a cache mastership change if
+                * the bits are already set.
+                */
+               if ((m->flags & PG_MAPPED) == 0)
+                       vm_page_flag_set(m, PG_MAPPED);
+               if ((newpte & pmap->pmap_bits[PG_RW_IDX]) &&
+                   (m->flags & PG_WRITEABLE) == 0) {
                        vm_page_flag_set(m, PG_WRITEABLE);
+               }
                vm_page_spin_unlock(m);
 
                if (pt_pv && opa &&
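
The "avoid a cache mastership change" idea from the pmap_enter() hunk
above, measured in userland (illustration only; the flag name, thread
count, and iteration count are made up): an atomic OR of a bit that is
already set still takes the cache line exclusive on the executing CPU,
while a plain load that finds the bit set leaves the line shared, so
the check-before-set pattern scales much better when many CPUs touch
the same word.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

#define PG_MAPPED 0x1u                  /* illustrative flag */
#define NTHREADS  4
#define NITERS    5000000L

static atomic_uint flags = PG_MAPPED;   /* already set, like a mapped page */
static int check_first;                 /* 0 = always OR, 1 = test first */

static void *
worker(void *arg)
{
        long i;

        (void)arg;
        for (i = 0; i < NITERS; ++i) {
                if (check_first) {
                        /* the pattern the pmap code now uses */
                        if ((atomic_load_explicit(&flags,
                            memory_order_relaxed) & PG_MAPPED) == 0)
                                atomic_fetch_or(&flags, PG_MAPPED);
                } else {
                        /* unconditional atomic RMW, bounces the cache line */
                        atomic_fetch_or(&flags, PG_MAPPED);
                }
        }
        return NULL;
}

static double
run(int mode)
{
        pthread_t td[NTHREADS];
        struct timespec t0, t1;
        int i;

        check_first = mode;
        clock_gettime(CLOCK_MONOTONIC, &t0);
        for (i = 0; i < NTHREADS; ++i)
                pthread_create(&td[i], NULL, worker, NULL);
        for (i = 0; i < NTHREADS; ++i)
                pthread_join(td[i], NULL);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
}

int
main(void)
{
        printf("unconditional atomic OR: %.3fs\n", run(0));
        printf("test before setting:     %.3fs\n", run(1));
        return 0;
}
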
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index d97ba5b..10fdc34 100644
@@ -163,6 +163,7 @@ static int vm_fault_quick_enable = 0;
 TUNABLE_INT("vm.fault_quick", &vm_fault_quick_enable);
 SYSCTL_INT(_vm, OID_AUTO, fault_quick, CTLFLAG_RW,
                &vm_fault_quick_enable, 0, "Allow fast vm_fault shortcut");
+#ifdef VM_FAULT_QUICK_DEBUG
 static long vm_fault_quick_success_count = 0;
 SYSCTL_LONG(_vm, OID_AUTO, fault_quick_success_count, CTLFLAG_RW,
                &vm_fault_quick_success_count, 0, "");
@@ -178,6 +179,7 @@ SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count3, CTLFLAG_RW,
 static long vm_fault_quick_failure_count4 = 0;
 SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count4, CTLFLAG_RW,
                &vm_fault_quick_failure_count4, 0, "");
+#endif
 
 static int vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
                        vm_prot_t fault_type);
@@ -721,12 +723,14 @@ RetryFault:
        }
 
 success:
-
        /*
         * On success vm_fault_object() does not unlock or deallocate, and fs.m
         * will contain a busied page.
         *
         * Enter the page into the pmap and do pmap-related adjustments.
+        *
+        * WARNING! Soft-busied fs.m's can only be manipulated in limited
+        *          ways.
         */
        KKASSERT(fs.lookup_still_valid == TRUE);
        vm_page_flag_set(fs.m, PG_REFERENCED);
@@ -739,19 +743,23 @@ success:
        /*
         * If the page is not wired down, then put it where the pageout daemon
         * can find it.
+        *
+        * NOTE: We cannot safely wire, unwire, or adjust queues for a
+        *       soft-busied page.
         */
-       if (fs.fault_flags & VM_FAULT_WIRE_MASK) {
-               if (fs.wflags & FW_WIRED)
-                       vm_page_wire(fs.m);
-               else
-                       vm_page_unwire(fs.m, 1);
-       } else {
-               vm_page_activate(fs.m);
-       }
        if (fs.msoftonly) {
                KKASSERT(fs.m->busy_count & PBUSY_MASK);
+               KKASSERT((fs.fault_flags & VM_FAULT_WIRE_MASK) == 0);
                vm_page_sbusy_drop(fs.m);
        } else {
+               if (fs.fault_flags & VM_FAULT_WIRE_MASK) {
+                       if (fs.wflags & FW_WIRED)
+                               vm_page_wire(fs.m);
+                       else
+                               vm_page_unwire(fs.m, 1);
+               } else {
+                       vm_page_activate(fs.m);
+               }
                KKASSERT(fs.m->busy_count & PBUSY_LOCKED);
                vm_page_wakeup(fs.m);
        }
@@ -858,11 +866,20 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
        if (obj->flags & OBJ_ONEMAPPING)
                return KERN_FAILURE;
 
+       /*
+        * This will try to wire/unwire a page, which can't be done with
+        * a soft-busied page.
+        */
+       if (fs->fault_flags & VM_FAULT_WIRE_MASK)
+               return KERN_FAILURE;
+
        /*
         * Ick, can't handle this
         */
        if (fs->entry->maptype == VM_MAPTYPE_VPAGETABLE) {
+#ifdef VM_FAULT_QUICK_DEBUG
                ++vm_fault_quick_failure_count1;
+#endif
                return KERN_FAILURE;
        }
 
@@ -872,7 +889,9 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
         */
        m = vm_page_hash_get(obj, first_pindex);
        if (m == NULL) {
+#ifdef VM_FAULT_QUICK_DEBUG
                ++vm_fault_quick_failure_count2;
+#endif
                return KERN_FAILURE;
        }
        if ((obj->flags & OBJ_DEAD) ||
@@ -880,7 +899,9 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
            m->queue - m->pc == PQ_CACHE ||
            (m->flags & PG_SWAPPED)) {
                vm_page_sbusy_drop(m);
+#ifdef VM_FAULT_QUICK_DEBUG
                ++vm_fault_quick_failure_count3;
+#endif
                return KERN_FAILURE;
        }
 
@@ -897,23 +918,35 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
        }
 
        /*
-        * Check write permissions.  We don't hold an object lock so the
-        * object must already be flagged writable and dirty.
+        * If this is a write fault the object and the page must already
+        * be writable.  Since we don't hold an object lock and only a
+        * soft-busy on the page, we cannot manipulate the object or
+        * the page state (other than the page queue).
         */
        if (fs->prot & VM_PROT_WRITE) {
                if ((obj->flags & (OBJ_WRITEABLE | OBJ_MIGHTBEDIRTY)) !=
                    (OBJ_WRITEABLE | OBJ_MIGHTBEDIRTY) ||
                    m->dirty != VM_PAGE_BITS_ALL) {
                        vm_page_sbusy_drop(m);
+#ifdef VM_FAULT_QUICK_DEBUG
                        ++vm_fault_quick_failure_count4;
+#endif
                        return KERN_FAILURE;
                }
                vm_set_nosync(m, fs->entry);
        }
+
+       /*
+        * Even though we are only soft-busied we can still move pages
+        * around in the normal queue(s).  The soft-busy prevents the
+        * page from being removed from the object, etc (normal operation).
+        */
        vm_page_activate(m);
        fs->m = m;
        fs->msoftonly = 1;
+#ifdef VM_FAULT_QUICK_DEBUG
        ++vm_fault_quick_success_count;
+#endif
 
        return KERN_SUCCESS;
 }
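
A side note on the "m->queue - m->pc" tests used above (and in the
vm_page.c/vm_pageout.c hunks below): each logical queue is an array of
per-color sub-queues, m->pc is the page color, and m->queue stores the
base index plus that color, so subtracting the color recovers the
queue class.  A standalone sketch with illustrative constants (not the
kernel's actual values):

#include <assert.h>
#include <stdio.h>

#define PQ_L2_SIZE  512                 /* sub-queues per class (illustrative) */
#define PQ_FREE     (0 * PQ_L2_SIZE + 1)
#define PQ_INACTIVE (1 * PQ_L2_SIZE + 1)
#define PQ_ACTIVE   (2 * PQ_L2_SIZE + 1)
#define PQ_CACHE    (3 * PQ_L2_SIZE + 1)
#define PQ_HOLD     (4 * PQ_L2_SIZE + 1)

struct toy_page {
        int queue;                      /* queue class + color */
        int pc;                         /* page color, 0..PQ_L2_SIZE-1 */
};

int
main(void)
{
        struct toy_page m = { .queue = PQ_CACHE + 137, .pc = 137 };

        /* the shape of the tests in the fault and pageout paths */
        assert(m.queue - m.pc == PQ_CACHE);
        assert(m.queue - m.pc != PQ_HOLD);
        printf("page is on the PQ_CACHE sub-queue for color %d\n", m.pc);
        return 0;
}
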
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index f499214..21a3a4a 100644
@@ -1303,8 +1303,15 @@ vm_page_hold(vm_page_t m)
  * The opposite of vm_page_hold().  If the page is on the HOLD queue
  * it was freed while held and must be moved back to the FREE queue.
  *
- * To avoid racing against vm_page_free*() we must test conditions
- * after obtaining the spin-lock.
+ * To avoid racing against vm_page_free*() we must re-test conditions
+ * after obtaining the spin-lock.  The initial test can also race a
+ * vm_page_free*() that is in the middle of moving a page to PQ_HOLD,
+ * leaving the page on PQ_HOLD with hold_count == 0.  Rather than
+ * throw a spin-lock in the critical path, we rely on the pageout
+ * daemon to clean-up these loose ends.
+ *
+ * More critically, the 'easy movement' between queues without busying
+ * a vm_page is only allowed for PQ_FREE<->PQ_HOLD.
  */
 void
 vm_page_unhold(vm_page_t m)
@@ -1314,7 +1321,8 @@ vm_page_unhold(vm_page_t m)
                 "on FREE queue (%d)",
                 m, m->hold_count, m->queue - m->pc));
 
-       if (atomic_fetchadd_int(&m->hold_count, -1) == 1) {
+       if (atomic_fetchadd_int(&m->hold_count, -1) == 1 &&
+           m->queue - m->pc == PQ_HOLD) {
                vm_page_spin_lock(m);
                if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
                        _vm_page_queue_spin_lock(m);
@@ -1976,14 +1984,20 @@ vm_page_select_free(u_short pg_color)
                        /*
                         * Theoretically if we are able to busy the page
                         * atomic with the queue removal (using the vm_page
-                        * lock) nobody else should be able to mess with the
-                        * page before us.
+                        * lock) nobody else should have been able to mess
+                        * with the page before us.
+                        *
+                        * Assert the page state.  Note that even though
+                        * wiring doesn't adjust queues, a page on the free
+                        * queue should never be wired at this point.
                         */
                        KKASSERT((m->flags & (PG_UNMANAGED |
                                              PG_NEED_COMMIT)) == 0);
-                       KASSERT(m->hold_count == 0, ("m->hold_count is not zero "
-                                                    "pg %p q=%d flags=%08x hold=%d wire=%d",
-                                                    m, m->queue, m->flags, m->hold_count, m->wire_count));
+                       KASSERT(m->hold_count == 0,
+                               ("m->hold_count is not zero "
+                                "pg %p q=%d flags=%08x hold=%d wire=%d",
+                                m, m->queue, m->flags,
+                                m->hold_count, m->wire_count));
                        KKASSERT(m->wire_count == 0);
                        vm_page_spin_unlock(m);
                        pagedaemon_wakeup();
@@ -2500,7 +2514,7 @@ vm_page_activate(vm_page_t m)
 
                if (oqueue == PQ_CACHE)
                        mycpu->gd_cnt.v_reactivated++;
-               if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
+               if ((m->flags & PG_UNMANAGED) == 0) {
                        if (m->act_count < ACT_INIT)
                                m->act_count = ACT_INIT;
                        _vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
@@ -2670,11 +2684,10 @@ vm_page_free_toq(vm_page_t m)
  * vm_page_unmanage()
  *
  * Prevent PV management from being done on the page.  The page is
- * removed from the paging queues as if it were wired, and as a 
- * consequence of no longer being managed the pageout daemon will not
- * touch it (since there is no way to locate the pte mappings for the
- * page).  madvise() calls that mess with the pmap will also no longer
- * operate on the page.
+ * also removed from the paging queues, and as a consequence of no longer
+ * being managed the pageout daemon will not touch it (since there is no
+ * way to locate the pte mappings for the page).  madvise() calls that
+ * mess with the pmap will also no longer operate on the page.
  *
  * Beyond that the page is still reasonably 'normal'.  Freeing the page
  * will clear the flag.
@@ -2691,15 +2704,14 @@ vm_page_unmanage(vm_page_t m)
 {
        KKASSERT(m->busy_count & PBUSY_LOCKED);
        if ((m->flags & PG_UNMANAGED) == 0) {
-               if (m->wire_count == 0)
-                       vm_page_unqueue(m);
+               vm_page_unqueue(m);
        }
        vm_page_flag_set(m, PG_UNMANAGED);
 }
 
 /*
- * Mark this page as wired down by yet another map, removing it from
- * paging queues as necessary.
+ * Mark this page as wired down by yet another map.  We do not adjust the
+ * queue the page is on, it will be checked for wiring as-needed.
  *
  * Caller must be holding the page busy.
  */
@@ -2715,8 +2727,6 @@ vm_page_wire(vm_page_t m)
        KKASSERT(m->busy_count & PBUSY_LOCKED);
        if ((m->flags & PG_FICTITIOUS) == 0) {
                if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
-                       if ((m->flags & PG_UNMANAGED) == 0)
-                               vm_page_unqueue(m);
                        atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count, 1);
                }
                KASSERT(m->wire_count != 0,
@@ -2727,6 +2737,10 @@ vm_page_wire(vm_page_t m)
 /*
  * Release one wiring of this page, potentially enabling it to be paged again.
  *
+ * Note that wired pages are no longer unconditionally removed from the
+ * paging queues, so the page may already be on a queue.  Move the page
+ * to the desired queue if necessary.
+ *
  * Many pages placed on the inactive queue should actually go
  * into the cache, but it is difficult to figure out which.  What
  * we do instead, if the inactive target is well met, is to put
@@ -2758,7 +2772,7 @@ vm_page_unwire(vm_page_t m, int activate)
        KKASSERT(m->busy_count & PBUSY_LOCKED);
        if (m->flags & PG_FICTITIOUS) {
                /* do nothing */
-       } else if (m->wire_count <= 0) {
+       } else if ((int)m->wire_count <= 0) {
                panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
        } else {
                if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
@@ -2766,32 +2780,38 @@ vm_page_unwire(vm_page_t m, int activate)
                        if (m->flags & PG_UNMANAGED) {
                                ;
                        } else if (activate || (m->flags & PG_NEED_COMMIT)) {
+                               vm_page_activate(m);
+#if 0
                                vm_page_spin_lock(m);
                                _vm_page_add_queue_spinlocked(m,
                                                        PQ_ACTIVE + m->pc, 0);
                                _vm_page_and_queue_spin_unlock(m);
+#endif
                        } else {
+                               vm_page_deactivate(m);
+#if 0
                                vm_page_spin_lock(m);
                                vm_page_flag_clear(m, PG_WINATCFLS);
                                _vm_page_add_queue_spinlocked(m,
                                                        PQ_INACTIVE + m->pc, 0);
-                               ++vm_swapcache_inactive_heuristic;
                                _vm_page_and_queue_spin_unlock(m);
+#endif
+                               ++vm_swapcache_inactive_heuristic;
                        }
                }
        }
 }
 
 /*
- * Move the specified page to the inactive queue.  If the page has
- * any associated swap, the swap is deallocated.
+ * Move the specified page to the inactive queue.
  *
  * Normally athead is 0 resulting in LRU operation.  athead is set
  * to 1 if we want this page to be 'as if it were placed in the cache',
  * except without unmapping it from the process address space.
  *
  * vm_page's spinlock must be held on entry and will remain held on return.
- * This routine may not block.
+ * This routine may not block.  The caller does not have to hold the page
+ * busied but should have some sort of interlock on its validity.
  */
 static void
 _vm_page_deactivate_locked(vm_page_t m, int athead)
@@ -2806,7 +2826,7 @@ _vm_page_deactivate_locked(vm_page_t m, int athead)
        _vm_page_queue_spin_lock(m);
        oqueue = _vm_page_rem_queue_spinlocked(m);
 
-       if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
+       if ((m->flags & PG_UNMANAGED) == 0) {
                if (oqueue == PQ_CACHE)
                        mycpu->gd_cnt.v_reactivated++;
                vm_page_flag_clear(m, PG_WINATCFLS);
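
The reworked vm_page_unhold() fast path from the hunk above, reduced to
a userland model (illustration only; a mutex stands in for the vm_page
spin-lock and the queue handling is collapsed to a single field): the
1->0 transition is detected with a lock-free fetch-add, and the lock is
taken only when the page also appears to be on PQ_HOLD.  A racing free
can still leave hold_count == 0 on PQ_HOLD, which is what the new
vm_pageout_scan_hold() pass in the vm_pageout.c diff below mops up.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define PQ_FREE 1                       /* illustrative values */
#define PQ_HOLD 2

struct toy_page {
        atomic_int hold_count;
        int queue;                      /* protected by lock */
        pthread_mutex_t lock;           /* stands in for the page spin-lock */
};

static void
toy_page_unhold(struct toy_page *m)
{
        /* lock-free unless this looks like the last hold on a PQ_HOLD page */
        if (atomic_fetch_add(&m->hold_count, -1) == 1 &&
            m->queue == PQ_HOLD) {
                pthread_mutex_lock(&m->lock);
                /* re-test under the lock before moving the page */
                if (atomic_load(&m->hold_count) == 0 && m->queue == PQ_HOLD)
                        m->queue = PQ_FREE;
                pthread_mutex_unlock(&m->lock);
        }
}

int
main(void)
{
        static struct toy_page m = {
                .hold_count = 1,
                .queue = PQ_HOLD,
                .lock = PTHREAD_MUTEX_INITIALIZER,
        };

        toy_page_unhold(&m);
        printf("hold_count=%d queue=%s\n", atomic_load(&m.hold_count),
            m.queue == PQ_FREE ? "PQ_FREE" : "PQ_HOLD");
        return 0;
}
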
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 6697104..550f2a3 100644
 
 /*
  * vm_page structure
+ *
+ * hard-busy: (PBUSY_LOCKED)
+ *
+ *     Hard-busying a page allows major manipulation of the page structure.
+ *     No new soft-busies can accumulate while a page is hard-busied.  The
+ *     page busying code typically waits for all soft-busies to drop before
+ *     allowing the hard-busy.
+ *
+ * soft-busy: (PBUSY_MASK)
+ *
+ *     Soft-busying a page typically indicates I/O or read-only use of
+ *     the content.  A page can have multiple soft-busies on it.  New
+ *     soft-busies block on any hard-busied page (wait for the hard-busy
+ *     to go away).
+ *
+ * hold_count
+ *
+ *     This prevents a page from being freed.  This does not prevent any
+ *     other operation.  The page may still be disassociated from its
+ *     object and essentially scrapped.  It just won't be reused while
+ *     a non-zero hold_count is present.
+ *
+ * wire_count
+ *
+ *     This indicates that the page has been wired into memory somewhere
+ *     (typically a buffer cache buffer, or a user wire).  The pageout
+ *     daemon will skip wired pages.
  */
 TAILQ_HEAD(pglist, vm_page);
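
The hard-busy/soft-busy rules documented in the comment above, modeled
as a standalone sketch (illustration only; the bit layout is an
assumption and the "try" variants fail where the kernel would block and
wait): one word carries a hard-busy bit plus a soft-busy count, new
soft-busies are refused while the hard-busy bit is set, and the
hard-busy can only be taken once the soft-busy count has drained.

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

#define TB_LOCKED 0x80000000u           /* hard-busy bit (illustrative) */
#define TB_MASK   0x0fffffffu           /* soft-busy count (illustrative) */

static atomic_uint busy_count;

static int
toy_sbusy_try(void)
{
        unsigned v = atomic_load(&busy_count);

        while ((v & TB_LOCKED) == 0) {          /* hard-busy blocks new soft */
                if (atomic_compare_exchange_weak(&busy_count, &v, v + 1))
                        return 1;               /* got a soft-busy */
        }
        return 0;
}

static int
toy_hardbusy_try(void)
{
        unsigned expect = 0;                    /* no soft or hard busies */

        return atomic_compare_exchange_strong(&busy_count, &expect,
            TB_LOCKED);
}

int
main(void)
{
        assert(toy_sbusy_try());                /* multiple soft-busies stack */
        assert(toy_sbusy_try());
        assert(!toy_hardbusy_try());            /* refused: soft-busies remain */
        atomic_fetch_sub(&busy_count, 2);       /* drop both soft-busies */
        assert(toy_hardbusy_try());
        assert(!toy_sbusy_try());               /* hard-busy excludes new soft */
        printf("busy_count=%#x (soft=%u)\n", atomic_load(&busy_count),
            atomic_load(&busy_count) & TB_MASK);
        return 0;
}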
 
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index b011de9..88d270d 100644
@@ -827,8 +827,8 @@ vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
                 * Remaining operations run with the page busy and neither
                 * the page or the queue will be spin-locked.
                 */
-               vm_page_queues_spin_unlock(PQ_INACTIVE + q);
                KKASSERT(m->queue == PQ_INACTIVE + q);
+               vm_page_queues_spin_unlock(PQ_INACTIVE + q);
 
                /*
                 * The emergency pager runs when the primary pager gets
@@ -938,18 +938,13 @@ vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
        int count = 0;
 
        /*
-        * It is possible for a page to be busied ad-hoc (e.g. the
-        * pmap_collect() code) and wired and race against the
-        * allocation of a new page.  vm_page_alloc() may be forced
-        * to deactivate the wired page in which case it winds up
-        * on the inactive queue and must be handled here.  We
-        * correct the problem simply by unqueuing the page.
+        * Wiring no longer removes a page from its queue.  The last unwiring
+        * will requeue the page.  Obviously wired pages cannot be paged out
+        * so unqueue it and return.
         */
        if (m->wire_count) {
                vm_page_unqueue_nowakeup(m);
                vm_page_wakeup(m);
-               kprintf("WARNING: pagedaemon: wired page on "
-                       "inactive queue %p\n", m);
                return 0;
        }
 
@@ -1199,6 +1194,16 @@ vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
                        }
                        vm_page_unhold(m);
 
+                       /*
+                        * If it was wired while we didn't own it.
+                        */
+                       if (m->wire_count) {
+                               vm_page_unqueue_nowakeup(m);
+                               vput(vp);
+                               vm_page_wakeup(m);
+                               return 0;
+                       }
+
                        /*
                         * (m) is busied again
                         *
@@ -1340,15 +1345,15 @@ vm_pageout_scan_active(int pass, int q,
                 * Remaining operations run with the page busy and neither
                 * the page or the queue will be spin-locked.
                 */
-               vm_page_queues_spin_unlock(PQ_ACTIVE + q);
                KKASSERT(m->queue == PQ_ACTIVE + q);
+               vm_page_queues_spin_unlock(PQ_ACTIVE + q);
 
 #if 0
                /*
                 * Don't deactivate pages that are held, even if we can
                 * busy them.  (XXX why not?)
                 */
-               if (m->hold_count != 0) {
+               if (m->hold_count) {
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
@@ -1363,6 +1368,14 @@ vm_pageout_scan_active(int pass, int q,
                        goto next;
                }
 #endif
+               /*
+                * We can just remove wired pages from the queue
+                */
+               if (m->wire_count) {
+                       vm_page_unqueue_nowakeup(m);
+                       vm_page_wakeup(m);
+                       goto next;
+               }
 
                /*
                 * The emergency pager ignores vnode-backed pages as these
@@ -1559,6 +1572,10 @@ vm_pageout_scan_cache(long avail_shortage, int pass,
                m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
                if (m == NULL)
                        break;
+
+               /*
+                * If the busy attempt fails we can still deactivate the page.
+                */
                /* page is returned removed from its queue and spinlocked */
                if (vm_page_busy_try(m, TRUE)) {
                        vm_page_deactivate_locked(m);
@@ -1733,6 +1750,42 @@ vm_pageout_scan_callback(struct proc *p, void *data)
        return(0);
 }
 
+/*
+ * This old guy slowly walks PQ_HOLD looking for pages which need to be
+ * moved back to PQ_FREE.  It is possible for pages to accumulate here
+ * when vm_page_free() races against vm_page_unhold(), resulting in a
+ * page being left on a PQ_HOLD queue with hold_count == 0.
+ *
+ * It is easier to handle this edge condition here, in non-critical code,
+ * rather than enforce a spin-lock for every 1->0 transition in
+ * vm_page_unhold().
+ *
+ * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
+ */
+static void
+vm_pageout_scan_hold(int q)
+{
+       vm_page_t m;
+
+       vm_page_queues_spin_lock(PQ_HOLD + q);
+       TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
+               if (m->flags & PG_MARKER)
+                       continue;
+
+               /*
+                * Process one page and return
+                */
+               if (m->hold_count)
+                       break;
+               kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
+               vm_page_hold(m);
+               vm_page_queues_spin_unlock(PQ_HOLD + q);
+               vm_page_unhold(m);      /* reprocess */
+               return;
+       }
+       vm_page_queues_spin_unlock(PQ_HOLD + q);
+}
+
 /*
  * This routine tries to maintain the pseudo LRU active queue,
  * so that during long periods of time where there is no paging,
@@ -1807,16 +1860,26 @@ vm_pageout_page_stats(int q)
                 * Remaining operations run with the page busy and neither
                 * the page or the queue will be spin-locked.
                 */
-               vm_page_queues_spin_unlock(PQ_ACTIVE + q);
                KKASSERT(m->queue == PQ_ACTIVE + q);
+               vm_page_queues_spin_unlock(PQ_ACTIVE + q);
+
+               /*
+                * We can just remove wired pages from the queue
+                */
+               if (m->wire_count) {
+                       vm_page_unqueue_nowakeup(m);
+                       vm_page_wakeup(m);
+                       goto next;
+               }
+
 
                /*
                 * We now have a safely busied page, the page and queue
                 * spinlocks have been released.
                 *
-                * Ignore held pages
+                * Ignore held and wired pages
                 */
-               if (m->hold_count) {
+               if (m->hold_count || m->wire_count) {
                        vm_page_wakeup(m);
                        goto next;
                }
@@ -1952,6 +2015,7 @@ vm_pageout_thread(void)
        int q;
        int q1iterator = 0;
        int q2iterator = 0;
+       int q3iterator = 0;
        int isep;
 
        curthread->td_flags |= TDF_SYSTHREAD;
@@ -2104,7 +2168,13 @@ skip_setup:
                } else {
                        /*
                         * Primary pagedaemon
+                        *
+                        * NOTE: We unconditionally cleanup PQ_HOLD even
+                        *       when there is no work to do.
                         */
+                       vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
+                       ++q3iterator;
+
                        if (vm_pages_needed == 0) {
                                error = tsleep(&vm_pages_needed,
                                               0, "psleep",