From: Matthew Dillon
Date: Sat, 23 Mar 2019 22:29:14 +0000 (-0700)
Subject: kernel - Preliminary vm_page hash lookup (2), cleanups, page wiring
X-Git-Tag: v5.7.0~434
X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/e05899ceef0e1a34c88b6d12dd59538f627df6d4

kernel - Preliminary vm_page hash lookup (2), cleanups, page wiring

* Correct a bug in vm.fault_quick operation.  Soft-busied pages cannot
  be safely wired or unwired.  This fixes a wire/unwire race-related
  panic.

* Optimize vm_page_unhold() so that it normally does not have to obtain
  any spin-locks at all, since related pages are almost never in the
  PQ_HOLD VM page queue.  This leaves open a minor race condition where
  pages with a hold_count of 0 can accumulate in PQ_HOLD.

* Add vm_pageout_scan_hold() to the pageout daemon.  Unconditionally
  scan PQ_HOLD very slowly to remove any pages whose hold_count is 0.

* REFACTOR PAGE WIRING.  Wiring vm_page's no longer removes them from
  whatever paging queue they are on.  Instead, proactively remove such
  pages from the queue only when we need to (typically in the pageout
  code).

* Remove unused PV_FLAG_VMOBJECT.

* Fix missing atomic-op in pc64/x86_64/efirt.c.

* Do not use m->md.pv_list for pagetable pages.  It is now only used
  for terminal pages.

* Properly initialize pv_flags to 0 when a pv_entry is allocated.

* Add debugging to detect managed pmap_enter()s without an object.

* Conditionalize the setting of PG_MAPPED and PG_WRITEABLE in the pmap
  code to avoid unnecessary cpu cache mastership changes.

* Move assertions in vm_pageout.c that could trigger improperly due
  to a race.
---

diff --git a/sys/platform/pc64/include/pmap.h b/sys/platform/pc64/include/pmap.h
index 8ec40194be..b0540b5779 100644
--- a/sys/platform/pc64/include/pmap.h
+++ b/sys/platform/pc64/include/pmap.h
@@ -362,7 +362,8 @@ typedef struct pv_entry {
 #define PV_HOLD_UNUSED2000	0x20000000U
 #define PV_HOLD_MASK		0x1FFFFFFFU
 
-#define PV_FLAG_VMOBJECT	0x00000001U	/* shared pt in VM obj */
+#define PV_FLAG_UNUSED01	0x00000001U
+#define PV_FLAG_PGTABLE		0x00000002U	/* page table page */
 
 #ifdef _KERNEL
diff --git a/sys/platform/pc64/x86_64/efirt.c b/sys/platform/pc64/x86_64/efirt.c
index bd5c5f86ad..ac53e0b178 100644
--- a/sys/platform/pc64/x86_64/efirt.c
+++ b/sys/platform/pc64/x86_64/efirt.c
@@ -138,7 +138,7 @@ efi_destroy_1t1_map(void)
 		while ((m = RB_ROOT(&obj->rb_memq)) != NULL) {
 			vm_page_busy_wait(m, FALSE, "efipg");
 			vm_page_unwire(m, 1);
-			m->flags &= ~(PG_MAPPED | PG_WRITEABLE);
+			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
 			cdev_pager_free_page(obj, m);
 			kfree(m, M_EFI);
 		}
diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c
index 00ea2320e3..92d4051a55 100644
--- a/sys/platform/pc64/x86_64/pmap.c
+++ b/sys/platform/pc64/x86_64/pmap.c
@@ -271,6 +271,10 @@
 SYSCTL_INT(_machdep, OID_AUTO, pmap_nx_enable, CTLFLAG_RD, &pmap_nx_enable, 0,
 	   "no-execute support (0=disabled, 1=w/READ, 2=w/READ & WRITE)");
 
+static int pmap_pv_debug = 50;
+SYSCTL_INT(_machdep, OID_AUTO, pmap_pv_debug, CTLFLAG_RW,
+	   &pmap_pv_debug, 0, "");
+
 /* Standard user access funtions */
 extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
 	   size_t *lencopied);
@@ -2534,7 +2538,15 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
 		vm_page_spin_lock(m);
 		pmap_page_stats_adding(m);
+
+		/*
+		 * PGTABLE pv's only exist in the context of the pmap RB tree
+		 * (pmap->pm_pvroot).
+		 */
+#if 0
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
+#endif
+		pv->pv_flags |= PV_FLAG_PGTABLE;
 		pv->pv_m = m;
 		vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
 		vm_page_spin_unlock(m);
@@ -3465,10 +3477,15 @@ pmap_remove_pv_page(pv_entry_t pv)
 	vm_page_spin_lock(m);
 	KKASSERT(m && m == pv->pv_m);
 	pv->pv_m = NULL;
-	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-	pmap_page_stats_deleting(m);
-	if (TAILQ_EMPTY(&m->md.pv_list))
+	if (pv->pv_flags & PV_FLAG_PGTABLE) {
 		vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+		KKASSERT(TAILQ_EMPTY(&m->md.pv_list));
+	} else {
+		TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
+		if (TAILQ_EMPTY(&m->md.pv_list))
+			vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+	}
+	pmap_page_stats_deleting(m);
 	vm_page_spin_unlock(m);
 	return(m);
@@ -3795,6 +3812,7 @@ _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL)
 		pnew->pv_pmap = pmap;
 		pnew->pv_pindex = pindex;
 		pnew->pv_hold = PV_HOLD_LOCKED | 2;
+		pnew->pv_flags = 0;
 #ifdef PMAP_DEBUG
 		pnew->pv_func = func;
 		pnew->pv_line = lineno;
@@ -4968,6 +4986,12 @@ pmap_remove_all(vm_page_t m)
 	vm_page_spin_lock(m);
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
+		if (pv->pv_m != m) {
+			kprintf("pmap_remove_all FAILURE\n");
+			kprintf("pv %p pv->pv_m %p m %p\n", pv, pv->pv_m, m);
+			kprintf("pvflags %08x\n", pv->pv_flags);
+		}
+		KKASSERT(pv->pv_m == m);
 		if (pv_hold_try(pv)) {
 			vm_page_spin_unlock(m);
@@ -5434,14 +5458,29 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
 	 *
 	 * Enter on the PV list if part of our managed memory.
 	 */
+
+	if (m->object == NULL && pmap_pv_debug > 0) {
+		--pmap_pv_debug;
+		kprintf("pte_m %p pv_entry %p NOOBJ\n", m, pte_pv);
+		print_backtrace(16);
+	}
+
 	KKASSERT(pte_pv && (pte_pv->pv_m == NULL || pte_pv->pv_m == m));
 	vm_page_spin_lock(m);
 	pte_pv->pv_m = m;
 	pmap_page_stats_adding(m);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list);
-	vm_page_flag_set(m, PG_MAPPED);
-	if (newpte & pmap->pmap_bits[PG_RW_IDX])
+
+	/*
+	 * Set vm_page flags.  Avoid a cache mastership change if
+	 * the bits are already set.
+	 */
+	if ((m->flags & PG_MAPPED) == 0)
+		vm_page_flag_set(m, PG_MAPPED);
+	if ((newpte & pmap->pmap_bits[PG_RW_IDX]) &&
+	    (m->flags & PG_WRITEABLE) == 0) {
 		vm_page_flag_set(m, PG_WRITEABLE);
+	}
 	vm_page_spin_unlock(m);
 
 	if (pt_pv && opa &&
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index d97ba5b530..10fdc34c83 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -163,6 +163,7 @@ static int vm_fault_quick_enable = 0;
 TUNABLE_INT("vm.fault_quick", &vm_fault_quick_enable);
 SYSCTL_INT(_vm, OID_AUTO, fault_quick, CTLFLAG_RW,
 	   &vm_fault_quick_enable, 0, "Allow fast vm_fault shortcut");
+#ifdef VM_FAULT_QUICK_DEBUG
 static long vm_fault_quick_success_count = 0;
 SYSCTL_LONG(_vm, OID_AUTO, fault_quick_success_count, CTLFLAG_RW,
 	    &vm_fault_quick_success_count, 0, "");
@@ -178,6 +179,7 @@ SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count3, CTLFLAG_RW,
 static long vm_fault_quick_failure_count4 = 0;
 SYSCTL_LONG(_vm, OID_AUTO, fault_quick_failure_count4, CTLFLAG_RW,
 	    &vm_fault_quick_failure_count4, 0, "");
+#endif
 
 static int vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
 			  vm_prot_t fault_type);
@@ -721,12 +723,14 @@ RetryFault:
 	}
 
success:
-
 	/*
 	 * On success vm_fault_object() does not unlock or deallocate, and fs.m
 	 * will contain a busied page.
 	 *
 	 * Enter the page into the pmap and do pmap-related adjustments.
+	 *
+	 * WARNING! Soft-busied fs.m's can only be manipulated in limited
+	 *	    ways.
 	 */
 	KKASSERT(fs.lookup_still_valid == TRUE);
 	vm_page_flag_set(fs.m, PG_REFERENCED);
@@ -739,19 +743,23 @@ success:
 	/*
 	 * If the page is not wired down, then put it where the pageout daemon
 	 * can find it.
+	 *
+	 * NOTE: We cannot safely wire, unwire, or adjust queues for a
+	 *	 soft-busied page.
 	 */
-	if (fs.fault_flags & VM_FAULT_WIRE_MASK) {
-		if (fs.wflags & FW_WIRED)
-			vm_page_wire(fs.m);
-		else
-			vm_page_unwire(fs.m, 1);
-	} else {
-		vm_page_activate(fs.m);
-	}
 	if (fs.msoftonly) {
 		KKASSERT(fs.m->busy_count & PBUSY_MASK);
+		KKASSERT((fs.fault_flags & VM_FAULT_WIRE_MASK) == 0);
 		vm_page_sbusy_drop(fs.m);
 	} else {
+		if (fs.fault_flags & VM_FAULT_WIRE_MASK) {
+			if (fs.wflags & FW_WIRED)
+				vm_page_wire(fs.m);
+			else
+				vm_page_unwire(fs.m, 1);
+		} else {
+			vm_page_activate(fs.m);
+		}
 		KKASSERT(fs.m->busy_count & PBUSY_LOCKED);
 		vm_page_wakeup(fs.m);
 	}
@@ -858,11 +866,20 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
 	if (obj->flags & OBJ_ONEMAPPING)
 		return KERN_FAILURE;
 
+	/*
+	 * This will try to wire/unwire a page, which can't be done with
+	 * a soft-busied page.
+	 */
+	if (fs->fault_flags & VM_FAULT_WIRE_MASK)
+		return KERN_FAILURE;
+
 	/*
 	 * Ick, can't handle this
 	 */
 	if (fs->entry->maptype == VM_MAPTYPE_VPAGETABLE) {
+#ifdef VM_FAULT_QUICK_DEBUG
 		++vm_fault_quick_failure_count1;
+#endif
 		return KERN_FAILURE;
 	}
@@ -872,7 +889,9 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
 	 */
 	m = vm_page_hash_get(obj, first_pindex);
 	if (m == NULL) {
+#ifdef VM_FAULT_QUICK_DEBUG
 		++vm_fault_quick_failure_count2;
+#endif
 		return KERN_FAILURE;
 	}
 	if ((obj->flags & OBJ_DEAD) ||
@@ -880,7 +899,9 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
 	    m->queue - m->pc == PQ_CACHE ||
 	    (m->flags & PG_SWAPPED)) {
 		vm_page_sbusy_drop(m);
+#ifdef VM_FAULT_QUICK_DEBUG
 		++vm_fault_quick_failure_count3;
+#endif
 		return KERN_FAILURE;
 	}
@@ -897,23 +918,35 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
 	}
 
 	/*
-	 * Check write permissions.  We don't hold an object lock so the
-	 * object must already be flagged writable and dirty.
+	 * If this is a write fault the object and the page must already
+	 * be writable.  Since we don't hold an object lock and only a
+	 * soft-busy on the page, we cannot manipulate the object or
+	 * the page state (other than the page queue).
 	 */
 	if (fs->prot & VM_PROT_WRITE) {
 		if ((obj->flags & (OBJ_WRITEABLE | OBJ_MIGHTBEDIRTY)) !=
 		    (OBJ_WRITEABLE | OBJ_MIGHTBEDIRTY) ||
 		    m->dirty != VM_PAGE_BITS_ALL) {
 			vm_page_sbusy_drop(m);
+#ifdef VM_FAULT_QUICK_DEBUG
 			++vm_fault_quick_failure_count4;
+#endif
 			return KERN_FAILURE;
 		}
 		vm_set_nosync(m, fs->entry);
 	}
+
+	/*
+	 * Even though we are only soft-busied we can still move pages
+	 * around in the normal queue(s).  The soft-busy prevents the
+	 * page from being removed from the object, etc (normal operation).
+	 */
 	vm_page_activate(m);
 	fs->m = m;
 	fs->msoftonly = 1;
+#ifdef VM_FAULT_QUICK_DEBUG
 	++vm_fault_quick_success_count;
+#endif
 	return KERN_SUCCESS;
 }
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index f4992140db..21a3a4a9d8 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -1303,8 +1303,15 @@ vm_page_hold(vm_page_t m)
  * The opposite of vm_page_hold().  If the page is on the HOLD queue
  * it was freed while held and must be moved back to the FREE queue.
  *
- * To avoid racing against vm_page_free*() we must test conditions
- * after obtaining the spin-lock.
+ * To avoid racing against vm_page_free*() we must re-test conditions
+ * after obtaining the spin-lock.  The initial test can also race a
+ * vm_page_free*() that is in the middle of moving a page to PQ_HOLD,
+ * leaving the page on PQ_HOLD with hold_count == 0.  Rather than
+ * throw a spin-lock in the critical path, we rely on the pageout
+ * daemon to clean-up these loose ends.
+ *
+ * More critically, the 'easy movement' between queues without busying
+ * a vm_page is only allowed for PQ_FREE<->PQ_HOLD.
  */
 void
 vm_page_unhold(vm_page_t m)
@@ -1314,7 +1321,8 @@
 		 "on FREE queue (%d)",
 		 m, m->hold_count, m->queue - m->pc));
 
-	if (atomic_fetchadd_int(&m->hold_count, -1) == 1) {
+	if (atomic_fetchadd_int(&m->hold_count, -1) == 1 &&
+	    m->queue - m->pc == PQ_HOLD) {
 		vm_page_spin_lock(m);
 		if (m->hold_count == 0 && m->queue - m->pc == PQ_HOLD) {
 			_vm_page_queue_spin_lock(m);
@@ -1976,14 +1984,20 @@ vm_page_select_free(u_short pg_color)
 			/*
 			 * Theoretically if we are able to busy the page
 			 * atomic with the queue removal (using the vm_page
-			 * lock) nobody else should be able to mess with the
-			 * page before us.
+			 * lock) nobody else should have been able to mess
+			 * with the page before us.
+			 *
+			 * Assert the page state.  Note that even though
+			 * wiring doesn't adjust queues, a page on the free
+			 * queue should never be wired at this point.
 			 */
 			KKASSERT((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) == 0);
-			KASSERT(m->hold_count == 0, ("m->hold_count is not zero "
-				"pg %p q=%d flags=%08x hold=%d wire=%d",
-				m, m->queue, m->flags, m->hold_count, m->wire_count));
+			KASSERT(m->hold_count == 0,
+				("m->hold_count is not zero "
+				 "pg %p q=%d flags=%08x hold=%d wire=%d",
+				 m, m->queue, m->flags,
+				 m->hold_count, m->wire_count));
 			KKASSERT(m->wire_count == 0);
 			vm_page_spin_unlock(m);
 			pagedaemon_wakeup();
@@ -2500,7 +2514,7 @@ vm_page_activate(vm_page_t m)
 		if (oqueue == PQ_CACHE)
 			mycpu->gd_cnt.v_reactivated++;
 
-		if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
+		if ((m->flags & PG_UNMANAGED) == 0) {
 			if (m->act_count < ACT_INIT)
 				m->act_count = ACT_INIT;
 			_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
@@ -2670,11 +2684,10 @@ vm_page_free_toq(vm_page_t m)
  * vm_page_unmanage()
  *
  * Prevent PV management from being done on the page.  The page is
- * removed from the paging queues as if it were wired, and as a
- * consequence of no longer being managed the pageout daemon will not
- * touch it (since there is no way to locate the pte mappings for the
- * page).  madvise() calls that mess with the pmap will also no longer
- * operate on the page.
+ * also removed from the paging queues, and as a consequence of no longer
+ * being managed the pageout daemon will not touch it (since there is no
+ * way to locate the pte mappings for the page).  madvise() calls that
+ * mess with the pmap will also no longer operate on the page.
  *
  * Beyond that the page is still reasonably 'normal'.  Freeing the page
  * will clear the flag.
@@ -2691,15 +2704,14 @@ vm_page_unmanage(vm_page_t m)
 {
 	KKASSERT(m->busy_count & PBUSY_LOCKED);
 	if ((m->flags & PG_UNMANAGED) == 0) {
-		if (m->wire_count == 0)
-			vm_page_unqueue(m);
+		vm_page_unqueue(m);
 	}
 	vm_page_flag_set(m, PG_UNMANAGED);
 }
 
 /*
- * Mark this page as wired down by yet another map, removing it from
- * paging queues as necessary.
+ * Mark this page as wired down by yet another map.  We do not adjust the
+ * queue the page is on, it will be checked for wiring as-needed.
  *
  * Caller must be holding the page busy.
  */
@@ -2715,8 +2727,6 @@ vm_page_wire(vm_page_t m)
 	KKASSERT(m->busy_count & PBUSY_LOCKED);
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
-			if ((m->flags & PG_UNMANAGED) == 0)
-				vm_page_unqueue(m);
 			atomic_add_long(&mycpu->gd_vmstats_adj.v_wire_count, 1);
 		}
 		KASSERT(m->wire_count != 0,
@@ -2727,6 +2737,10 @@
 /*
  * Release one wiring of this page, potentially enabling it to be paged again.
  *
+ * Note that wired pages are no longer unconditionally removed from the
+ * paging queues, so the page may already be on a queue.  Move the page
+ * to the desired queue if necessary.
+ *
  * Many pages placed on the inactive queue should actually go
  * into the cache, but it is difficult to figure out which.  What
  * we do instead, if the inactive target is well met, is to put
@@ -2758,7 +2772,7 @@ vm_page_unwire(vm_page_t m, int activate)
 	KKASSERT(m->busy_count & PBUSY_LOCKED);
 	if (m->flags & PG_FICTITIOUS) {
 		/* do nothing */
-	} else if (m->wire_count <= 0) {
+	} else if ((int)m->wire_count <= 0) {
 		panic("vm_page_unwire: invalid wire count: %d", m->wire_count);
 	} else {
 		if (atomic_fetchadd_int(&m->wire_count, -1) == 1) {
@@ -2766,32 +2780,38 @@ vm_page_unwire(vm_page_t m, int activate)
 			if (m->flags & PG_UNMANAGED) {
 				;
 			} else if (activate || (m->flags & PG_NEED_COMMIT)) {
+				vm_page_activate(m);
+#if 0
 				vm_page_spin_lock(m);
 				_vm_page_add_queue_spinlocked(m, PQ_ACTIVE + m->pc, 0);
 				_vm_page_and_queue_spin_unlock(m);
+#endif
 			} else {
+				vm_page_deactivate(m);
+#if 0
 				vm_page_spin_lock(m);
 				vm_page_flag_clear(m, PG_WINATCFLS);
 				_vm_page_add_queue_spinlocked(m, PQ_INACTIVE + m->pc, 0);
-				++vm_swapcache_inactive_heuristic;
 				_vm_page_and_queue_spin_unlock(m);
+#endif
+				++vm_swapcache_inactive_heuristic;
 			}
 		}
 	}
 }
 
 /*
- * Move the specified page to the inactive queue.  If the page has
- * any associated swap, the swap is deallocated.
+ * Move the specified page to the inactive queue.
  *
  * Normally athead is 0 resulting in LRU operation.  athead is set
  * to 1 if we want this page to be 'as if it were placed in the cache',
  * except without unmapping it from the process address space.
  *
  * vm_page's spinlock must be held on entry and will remain held on return.
- * This routine may not block.
+ * This routine may not block.  The caller does not have to hold the page
+ * busied but should have some sort of interlock on its validity.
  */
 static void
 _vm_page_deactivate_locked(vm_page_t m, int athead)
@@ -2806,7 +2826,7 @@ _vm_page_deactivate_locked(vm_page_t m, int athead)
 	_vm_page_queue_spin_lock(m);
 	oqueue = _vm_page_rem_queue_spinlocked(m);
-	if (m->wire_count == 0 && (m->flags & PG_UNMANAGED) == 0) {
+	if ((m->flags & PG_UNMANAGED) == 0) {
 		if (oqueue == PQ_CACHE)
 			mycpu->gd_cnt.v_reactivated++;
 		vm_page_flag_clear(m, PG_WINATCFLS);
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 6697104e96..550f2a3dbd 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -100,6 +100,33 @@
 /*
  * vm_page structure
+ *
+ * hard-busy: (PBUSY_LOCKED)
+ *
+ *	Hard-busying a page allows major manipulation of the page structure.
+ *	No new soft-busies can accumulate while a page is hard-busied.  The
+ *	page busying code typically waits for all soft-busies to drop before
+ *	allowing the hard-busy.
+ *
+ * soft-busy: (PBUSY_MASK)
+ *
+ *	Soft-busying a page typically indicates I/O or read-only use of
+ *	the content.  A page can have multiple soft-busies on it.  New
+ *	soft-busies block on any hard-busied page (wait for the hard-busy
+ *	to go away).
+ *
+ * hold_count
+ *
+ *	This prevents a page from being freed.  This does not prevent any
+ *	other operation.  The page may still be disassociated from its
+ *	object and essentially scrapped.  It just won't be reused while
+ *	a non-zero hold_count is present.
+ *
+ * wire_count
+ *
+ *	This indicates that the page has been wired into memory somewhere
+ *	(typically a buffer cache buffer, or a user wire).  The pageout
+ *	daemon will skip wired pages.
  */
 
 TAILQ_HEAD(pglist, vm_page);
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index b011de9b52..88d270dddd 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -827,8 +827,8 @@ vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
 		 * Remaining operations run with the page busy and neither
 		 * the page or the queue will be spin-locked.
 		 */
-		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
 		KKASSERT(m->queue == PQ_INACTIVE + q);
+		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
 
 		/*
 		 * The emergency pager runs when the primary pager gets
@@ -938,18 +938,13 @@ vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
 	int count = 0;
 
 	/*
-	 * It is possible for a page to be busied ad-hoc (e.g. the
-	 * pmap_collect() code) and wired and race against the
-	 * allocation of a new page.  vm_page_alloc() may be forced
-	 * to deactivate the wired page in which case it winds up
-	 * on the inactive queue and must be handled here.  We
-	 * correct the problem simply by unqueuing the page.
+	 * Wiring no longer removes a page from its queue.  The last unwiring
+	 * will requeue the page.  Obviously wired pages cannot be paged out
+	 * so unqueue it and return.
 	 */
 	if (m->wire_count) {
 		vm_page_unqueue_nowakeup(m);
 		vm_page_wakeup(m);
-		kprintf("WARNING: pagedaemon: wired page on "
-			"inactive queue %p\n", m);
 		return 0;
 	}
@@ -1199,6 +1194,16 @@ vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
 		}
 		vm_page_unhold(m);
 
+		/*
+		 * If it was wired while we didn't own it.
+		 */
+		if (m->wire_count) {
+			vm_page_unqueue_nowakeup(m);
+			vput(vp);
+			vm_page_wakeup(m);
+			return 0;
+		}
+
 		/*
 		 * (m) is busied again
 		 *
@@ -1340,15 +1345,15 @@ vm_pageout_scan_active(int pass, int q,
 		 * Remaining operations run with the page busy and neither
 		 * the page or the queue will be spin-locked.
 		 */
-		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
 		KKASSERT(m->queue == PQ_ACTIVE + q);
+		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
 
 #if 0
 		/*
 		 * Don't deactivate pages that are held, even if we can
 		 * busy them.  (XXX why not?)
 		 */
-		if (m->hold_count != 0) {
+		if (m->hold_count) {
 			vm_page_and_queue_spin_lock(m);
 			if (m->queue - m->pc == PQ_ACTIVE) {
 				TAILQ_REMOVE(
@@ -1363,6 +1368,14 @@ vm_pageout_scan_active(int pass, int q,
 			goto next;
 		}
 #endif
+		/*
+		 * We can just remove wired pages from the queue
+		 */
+		if (m->wire_count) {
+			vm_page_unqueue_nowakeup(m);
+			vm_page_wakeup(m);
+			goto next;
+		}
 
 		/*
 		 * The emergency pager ignores vnode-backed pages as these
@@ -1559,6 +1572,10 @@ vm_pageout_scan_cache(long avail_shortage, int pass,
 		m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
 		if (m == NULL)
 			break;
+
+		/*
+		 * If the busy attempt fails we can still deactivate the page.
+		 */
+
 		/* page is returned removed from its queue and spinlocked */
 		if (vm_page_busy_try(m, TRUE)) {
 			vm_page_deactivate_locked(m);
@@ -1733,6 +1750,42 @@ vm_pageout_scan_callback(struct proc *p, void *data)
 	return(0);
 }
 
+/*
+ * This old guy slowly walks PQ_HOLD looking for pages which need to be
+ * moved back to PQ_FREE.  It is possible for pages to accumulate here
+ * when vm_page_free() races against vm_page_unhold(), resulting in a
+ * page being left on a PQ_HOLD queue with hold_count == 0.
+ *
+ * It is easier to handle this edge condition here, in non-critical code,
+ * rather than enforce a spin-lock for every 1->0 transition in
+ * vm_page_unhold().
+ *
+ * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
+ */
+static void
+vm_pageout_scan_hold(int q)
+{
+	vm_page_t m;
+
+	vm_page_queues_spin_lock(PQ_HOLD + q);
+	TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
+		if (m->flags & PG_MARKER)
+			continue;
+
+		/*
+		 * Process one page and return
+		 */
+		if (m->hold_count)
+			break;
+		kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
+		vm_page_hold(m);
+		vm_page_queues_spin_unlock(PQ_HOLD + q);
+		vm_page_unhold(m);	/* reprocess */
+		return;
+	}
+	vm_page_queues_spin_unlock(PQ_HOLD + q);
+}
+
 /*
  * This routine tries to maintain the pseudo LRU active queue,
  * so that during long periods of time where there is no paging,
@@ -1807,16 +1860,26 @@ vm_pageout_page_stats(int q)
 		 * Remaining operations run with the page busy and neither
 		 * the page or the queue will be spin-locked.
 		 */
-		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
 		KKASSERT(m->queue == PQ_ACTIVE + q);
+		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
+
+		/*
+		 * We can just remove wired pages from the queue
+		 */
+		if (m->wire_count) {
+			vm_page_unqueue_nowakeup(m);
+			vm_page_wakeup(m);
+			goto next;
+		}
+
 		/*
 		 * We now have a safely busied page, the page and queue
 		 * spinlocks have been released.
 		 *
-		 * Ignore held pages
+		 * Ignore held and wired pages
 		 */
-		if (m->hold_count) {
+		if (m->hold_count || m->wire_count) {
 			vm_page_wakeup(m);
 			goto next;
 		}
@@ -1952,6 +2015,7 @@ vm_pageout_thread(void)
 	int q;
 	int q1iterator = 0;
 	int q2iterator = 0;
+	int q3iterator = 0;
 	int isep;
 
 	curthread->td_flags |= TDF_SYSTHREAD;
@@ -2104,7 +2168,13 @@ skip_setup:
 		} else {
 			/*
 			 * Primary pagedaemon
+			 *
+			 * NOTE: We unconditionally cleanup PQ_HOLD even
+			 *	 when there is no work to do.
 			 */
+			vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
+			++q3iterator;
+
 			if (vm_pages_needed == 0) {
 				error = tsleep(&vm_pages_needed, 0, "psleep",