From bb0d60931c54c7121a931f0ebf915700feac863d Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Fri, 27 Dec 2013 21:37:42 -0800 Subject: [PATCH] kernel - Attempt to fix a rare vm_page_queues panic * Change the way the vm_page_t markers are used in the pageout daemon. The structures are not stable because they are declared on the stack, but the vm pageout scan functions were assuming that a TAILQ_NEXT() call would yield a stable vm_page_t structure that could then be locked with vm_page_and_queue_spin_lock(). * Fix the problem by holding the appropriate vm_page_queues[] spinlock across the TAILQ_NEXT() lookup. * The panic could not be definitively traced to this issue so we will have to see if it reoccurs. * This also fixes a potential infinite marker hop-over case when multiple cpus are trying to scan the same queue at the same time. However, it would be exceedingly rare for this case to actually occur since only the pageout and swapcache daemons could compete in this way. Reported-by: swildner --- sys/vm/vm_fault.c | 10 +++ sys/vm/vm_page.c | 2 +- sys/vm/vm_pageout.c | 153 ++++++++++++++++++++++---------------------- 3 files changed, 86 insertions(+), 79 deletions(-) diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 762e7d275b..49bec6a63b 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -120,6 +120,8 @@ struct faultstate { struct vnode *vp; }; +static int debug_fault = 0; +SYSCTL_INT(_vm, OID_AUTO, debug_fault, CTLFLAG_RW, &debug_fault, 0, ""); static int debug_cluster = 0; SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, ""); int vm_shared_fault = 1; @@ -497,6 +499,14 @@ RetryFault: * we must not try to burst (we can't allocate VM pages). */ result = vm_fault_object(&fs, first_pindex, fault_type, 1); + + if (debug_fault > 0) { + --debug_fault; + kprintf("VM_FAULT result %d addr=%jx type=%02x flags=%02x fs.m=%p fs.prot=%02x fs.wired=%02x fs.entry=%p\n", + result, vaddr, fault_type, fault_flags, + fs.m, fs.prot, fs.wired, fs.entry); + } + if (result == KERN_TRY_AGAIN) { vm_object_drop(fs.first_object); ++retry; diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 902c10c655..3a008b7745 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -2229,7 +2229,6 @@ vm_page_wire(vm_page_t m) * be placed in the cache - for example, just after dirtying a page. * dirty pages in the cache are not allowed. * - * The page queues must be locked. * This routine may not block. */ void @@ -2294,6 +2293,7 @@ _vm_page_deactivate_locked(vm_page_t m, int athead) if (athead == 0) ++vm_swapcache_inactive_heuristic; } + /* NOTE: PQ_NONE if condition not taken */ _vm_page_queue_spin_unlock(m); /* leaves vm_page spinlocked */ } diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c index ecd20f1c4e..9fdf6f396c 100644 --- a/sys/vm/vm_pageout.c +++ b/sys/vm/vm_pageout.c @@ -773,17 +773,13 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, vm_page_queues_spin_lock(PQ_INACTIVE + q); TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt; - vm_page_queues_spin_unlock(PQ_INACTIVE + q); + /* + * Queue locked at top of loop to avoid stack marker issues. 
+ */ while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && maxscan-- > 0 && avail_shortage - delta > 0) { - vm_page_and_queue_spin_lock(m); - if (m != TAILQ_NEXT(&marker, pageq)) { - vm_page_and_queue_spin_unlock(m); - ++maxscan; - continue; - } KKASSERT(m->queue - m->pc == PQ_INACTIVE); TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); @@ -792,31 +788,26 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, mycpu->gd_cnt.v_pdpages++; /* - * Skip marker pages + * Skip marker pages (atomic against other markers to avoid + * infinite hop-over scans). */ - if (m->flags & PG_MARKER) { - vm_page_and_queue_spin_unlock(m); + if (m->flags & PG_MARKER) continue; - } /* * Try to busy the page. Don't mess with pages which are * already busy or reorder them in the queue. */ - if (vm_page_busy_try(m, TRUE)) { - vm_page_and_queue_spin_unlock(m); + if (vm_page_busy_try(m, TRUE)) continue; - } - vm_page_and_queue_spin_unlock(m); - KKASSERT(m->queue - m->pc == PQ_INACTIVE); - - lwkt_yield(); /* - * The page has been successfully busied and is now no - * longer spinlocked. The queue is no longer spinlocked - * either. + * Remaining operations run with the page busy and neither + * the page or the queue will be spin-locked. */ + vm_page_queues_spin_unlock(PQ_INACTIVE + q); + KKASSERT(m->queue - m->pc == PQ_INACTIVE); + lwkt_yield(); /* * It is possible for a page to be busied ad-hoc (e.g. the @@ -831,7 +822,7 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, vm_page_wakeup(m); kprintf("WARNING: pagedaemon: wired page on " "inactive queue %p\n", m); - continue; + goto next; } /* @@ -850,7 +841,7 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, } vm_page_and_queue_spin_unlock(m); vm_page_wakeup(m); - continue; + goto next; } if (m->object == NULL || m->object->ref_count == 0) { @@ -876,7 +867,7 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, vm_page_activate(m); m->act_count += (actcount + ACT_ADVANCE); vm_page_wakeup(m); - continue; + goto next; } /* @@ -893,7 +884,7 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, vm_page_activate(m); m->act_count += (actcount + ACT_ADVANCE + 1); vm_page_wakeup(m); - continue; + goto next; } /* @@ -998,7 +989,7 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, } vm_page_and_queue_spin_unlock(m); vm_page_wakeup(m); - continue; + goto next; } /* @@ -1050,7 +1041,7 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, if (object->flags & OBJ_MIGHTBEDIRTY) ++*vnodes_skippedp; vm_page_unhold(m); - continue; + goto next; } /* @@ -1067,7 +1058,7 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, ++*vnodes_skippedp; vput(vp); vm_page_unhold(m); - continue; + goto next; } /* @@ -1079,7 +1070,7 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, if (vm_page_busy_try(m, TRUE)) { vput(vp); vm_page_unhold(m); - continue; + goto next; } vm_page_unhold(m); @@ -1102,7 +1093,7 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, ++*vnodes_skippedp; vm_page_wakeup(m); vput(vp); - continue; + goto next; } /* (m) is left busied as we fall through */ } @@ -1133,6 +1124,7 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, vm_page_wakeup(m); } +next: /* * Systems with a ton of memory can wind up with huge * deactivation counts. 
Because the inactive scan is @@ -1148,10 +1140,12 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage, * caching pages, it is deactivating active pages, so it * will not by itself cause the abort condition. */ + vm_page_queues_spin_lock(PQ_INACTIVE + q); if (vm_paging_target() < 0) break; } - vm_page_queues_spin_lock(PQ_INACTIVE + q); + + /* page queue still spin-locked */ TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); vm_page_queues_spin_unlock(PQ_INACTIVE + q); @@ -1199,18 +1193,14 @@ vm_pageout_scan_active(int pass, int q, vm_page_queues_spin_lock(PQ_ACTIVE + q); TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt; - vm_page_queues_spin_unlock(PQ_ACTIVE + q); + /* + * Queue locked at top of loop to avoid stack marker issues. + */ while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && maxscan-- > 0 && (avail_shortage - delta > 0 || inactive_shortage > 0)) { - vm_page_and_queue_spin_lock(m); - if (m != TAILQ_NEXT(&marker, pageq)) { - vm_page_and_queue_spin_unlock(m); - ++maxscan; - continue; - } KKASSERT(m->queue - m->pc == PQ_ACTIVE); TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); @@ -1218,42 +1208,45 @@ vm_pageout_scan_active(int pass, int q, &marker, pageq); /* - * Skip marker pages + * Skip marker pages (atomic against other markers to avoid + * infinite hop-over scans). */ - if (m->flags & PG_MARKER) { - vm_page_and_queue_spin_unlock(m); + if (m->flags & PG_MARKER) continue; - } /* * Try to busy the page. Don't mess with pages which are * already busy or reorder them in the queue. */ - if (vm_page_busy_try(m, TRUE)) { - vm_page_and_queue_spin_unlock(m); + if (vm_page_busy_try(m, TRUE)) continue; - } + + /* + * Remaining operations run with the page busy and neither + * the page or the queue will be spin-locked. + */ + vm_page_queues_spin_unlock(PQ_ACTIVE + q); + KKASSERT(m->queue - m->pc == PQ_ACTIVE); + lwkt_yield(); /* * Don't deactivate pages that are held, even if we can * busy them. (XXX why not?) */ if (m->hold_count != 0) { - TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, - m, pageq); - TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE + q].pl, - m, pageq); + vm_page_and_queue_spin_lock(m); + if (m->queue - m->pc == PQ_ACTIVE) { + TAILQ_REMOVE( + &vm_page_queues[PQ_ACTIVE + q].pl, + m, pageq); + TAILQ_INSERT_TAIL( + &vm_page_queues[PQ_ACTIVE + q].pl, + m, pageq); + } vm_page_and_queue_spin_unlock(m); vm_page_wakeup(m); - continue; + goto next; } - vm_page_and_queue_spin_unlock(m); - lwkt_yield(); - - /* - * The page has been successfully busied and the page and - * queue are no longer locked. - */ /* * The count for pagedaemon pages is done after checking the @@ -1356,12 +1349,15 @@ vm_pageout_scan_active(int pass, int q, vm_page_wakeup(m); } } +next: + vm_page_queues_spin_lock(PQ_ACTIVE + q); } /* * Clean out our local marker. + * + * Page queue still spin-locked. */ - vm_page_queues_spin_lock(PQ_ACTIVE + q); TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); vm_page_queues_spin_unlock(PQ_ACTIVE + q); @@ -1424,8 +1420,8 @@ vm_pageout_scan_cache(int avail_shortage, int vnodes_skipped, int recycle_count) lwkt_yield(); /* - * Page has been successfully busied and it and its queue - * is no longer spinlocked. + * Remaining operations run with the page busy and neither + * the page or the queue will be spin-locked. 
*/ if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) || m->hold_count || @@ -1611,40 +1607,38 @@ vm_pageout_page_stats(int q) vm_page_queues_spin_lock(PQ_ACTIVE + q); TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); - vm_page_queues_spin_unlock(PQ_ACTIVE + q); + /* + * Queue locked at top of loop to avoid stack marker issues. + */ while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && pcount-- > 0) { int actcount; - vm_page_and_queue_spin_lock(m); - if (m != TAILQ_NEXT(&marker, pageq)) { - vm_page_and_queue_spin_unlock(m); - ++pcount; - continue; - } KKASSERT(m->queue - m->pc == PQ_ACTIVE); TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, &marker, pageq); /* - * Ignore markers + * Skip marker pages (atomic against other markers to avoid + * infinite hop-over scans). */ - if (m->flags & PG_MARKER) { - vm_page_and_queue_spin_unlock(m); + if (m->flags & PG_MARKER) continue; - } /* * Ignore pages we can't busy */ - if (vm_page_busy_try(m, TRUE)) { - vm_page_and_queue_spin_unlock(m); + if (vm_page_busy_try(m, TRUE)) continue; - } - vm_page_and_queue_spin_unlock(m); + + /* + * Remaining operations run with the page busy and neither + * the page or the queue will be spin-locked. + */ + vm_page_queues_spin_unlock(PQ_ACTIVE + q); KKASSERT(m->queue - m->pc == PQ_ACTIVE); /* @@ -1655,7 +1649,7 @@ vm_pageout_page_stats(int q) */ if (m->hold_count) { vm_page_wakeup(m); - continue; + goto next; } /* @@ -1686,7 +1680,7 @@ vm_pageout_page_stats(int q) } vm_page_and_queue_spin_unlock(m); vm_page_wakeup(m); - continue; + goto next; } if (m->act_count == 0) { @@ -1719,12 +1713,15 @@ vm_pageout_page_stats(int q) vm_page_and_queue_spin_unlock(m); } vm_page_wakeup(m); +next: + vm_page_queues_spin_lock(PQ_ACTIVE + q); } /* * Remove our local marker + * + * Page queue still spin-locked. */ - vm_page_queues_spin_lock(PQ_ACTIVE + q); TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); vm_page_queues_spin_unlock(PQ_ACTIVE + q); } -- 2.41.0
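
Editor's note: the core of this change is the scan-loop pattern described in the commit message, i.e. keeping the vm_page_queues[] spinlock held across the TAILQ_NEXT(&marker, pageq) lookup instead of unlocking first and re-validating the neighbor afterwards. The following is a minimal, stand-alone user-space sketch of that pattern, not DragonFly kernel code: the names page_queue, page_busy_try() and scan_queue() are hypothetical, and a pthread mutex stands in for the per-queue spinlock.

```c
/*
 * Illustrative sketch of a stack-marker queue scan that holds the queue
 * lock across the TAILQ_NEXT() lookup, dropping it only after the
 * candidate element has been busied.  Hypothetical names throughout.
 */
#include <sys/queue.h>
#include <pthread.h>
#include <stdbool.h>

#define PG_MARKER	0x01

struct page {
	TAILQ_ENTRY(page) pageq;
	int		  flags;
	int		  busy;
};
TAILQ_HEAD(pglist, page);

struct page_queue {
	struct pglist	  pl;		/* caller initializes with TAILQ_INIT() */
	pthread_mutex_t	  spin;		/* stand-in for the queue spinlock */
};

/* Try to busy a page; fails if someone else already has it busied. */
static bool
page_busy_try(struct page *m)
{
	if (m->busy)
		return (false);
	m->busy = 1;
	return (true);
}

static void
scan_queue(struct page_queue *q)
{
	struct page marker = { .flags = PG_MARKER };
	struct page *m;

	pthread_mutex_lock(&q->spin);
	TAILQ_INSERT_HEAD(&q->pl, &marker, pageq);

	/*
	 * The queue lock is held at the top of each iteration, so the
	 * element returned by TAILQ_NEXT(&marker, pageq) cannot be moved
	 * or freed by a competing scanner before we look at it.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL) {
		/* Requeue the marker just after the element being examined. */
		TAILQ_REMOVE(&q->pl, &marker, pageq);
		TAILQ_INSERT_AFTER(&q->pl, m, &marker, pageq);

		if (m->flags & PG_MARKER)	/* skip other scanners' markers */
			continue;
		if (!page_busy_try(m))		/* skip pages we cannot busy */
			continue;

		/* Page is busied; now it is safe to drop the queue lock. */
		pthread_mutex_unlock(&q->spin);

		/* ... per-page work runs here with the queue unlocked ... */

		m->busy = 0;
		pthread_mutex_lock(&q->spin);	/* re-lock before the next lookup */
	}

	TAILQ_REMOVE(&q->pl, &marker, pageq);
	pthread_mutex_unlock(&q->spin);
}
```

The point of the pattern is that the marker lives on the scanner's stack and is only meaningful while the queue lock pins its neighbors; the pre-patch code unlocked the queue, then tried to spin-lock whatever TAILQ_NEXT() had returned and re-check it against the marker, which is exactly the window the commit message identifies. Holding the queue lock across the lookup also removes the hop-over case where two scanners' markers leapfrog each other indefinitely.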