From 3e0c75830de92264feb6624f4f97d08890d0bf51 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 10 Nov 2004 17:39:20 +0000 Subject: [PATCH] Fix a very serious bug in contigmalloc() which we inherited from FreeBSD-4.x. The contigmalloc() code incorrectly assumes that a page in PQ_CACHE can be reused without having to do any further checks and it unconditionally busies and frees such pages, and assumes that the page becomes PQ_FREE even though it might actually have gone to a PQ_HOLD state. Additionally the contigmalloc() code unconditionally sets m->object to NULL, ignoring the fact that the page will be in the VM page bucket hash table if object happens to not be NULL, leading to page bucket hash table corruption. The fix is two fold. First, we add checks for m->busy, (m->flags & PG_BUSY), m->wire_count, and m->hold_count and do not reuse a page with any of those set. We do this for all pages, not just PQ_CACHE pages, though it is believed that it only needs to be done for PQ_CACHE pages. Second, we replace the m->object = NULL assignment with an assertion that it is already NULL, since it had better be NULL and we cannot just set it to NULL unconditionally without blowing up the VM page hash table. Symptoms of the bug include: * Filesystem corruption, in particular with slower disk drivers (e.g. like the 'twe' driver), or in systems with drivers which use contigmalloc() a lot (e.g. require bounce buffers). Mangled directory entries, bad indirect blocks (containing data instead of indirect block pointers), and files containing other file's data. * 'page not found in hash' panic. This is the last major VM issue in DragonFly, one that has plagued in particular David Rhodus (who is a heavy user of the 'twe' driver) for over a year. I would never have found this bug if not for DR's persistence and the dozens of kernel cores he was able to provide me over the last year. We finally got a core with a 'smoking gun', after having written a program (/usr/src/test/debug/vmpageinfo.c) to run through all the VM pages and check their hash table association for correctness it became obvious that pages were being reused without being removed from the hash table which finally led to contigmalloc*(). Many thanks to: David Rhodus! Free gift enclosed! --- sys/vm/vm_contig.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/sys/vm/vm_contig.c b/sys/vm/vm_contig.c index 1a8a658406..ed7716e69f 100644 --- a/sys/vm/vm_contig.c +++ b/sys/vm/vm_contig.c @@ -64,7 +64,7 @@ * SUCH DAMAGE. * * from: @(#)vm_page.c 7.4 (Berkeley) 5/7/91 - * $DragonFly: src/sys/vm/vm_contig.c,v 1.10 2004/10/12 19:21:16 dillon Exp $ + * $DragonFly: src/sys/vm/vm_contig.c,v 1.11 2004/11/10 17:39:20 dillon Exp $ */ /* @@ -200,6 +200,8 @@ vm_contig_pg_alloc( int i, start, pass; vm_offset_t phys; vm_page_t pga = vm_page_array; + vm_page_t m; + int pqtype; size = round_page(size); if (size == 0) @@ -209,7 +211,7 @@ vm_contig_pg_alloc( if ((boundary & (boundary - 1)) != 0) panic("vm_contig_pg_alloc: boundary must be a power of 2"); - start = 0; + start = 1; /* must start at 1 due to m[-1] check below */ for (pass = 0; pass <= 1; pass++) { crit_enter(); again: @@ -218,14 +220,19 @@ again: * such that the boundary won't be crossed. */ for (i = start; i < vmstats.v_page_count; i++) { - int pqtype; - phys = VM_PAGE_TO_PHYS(&pga[i]); - pqtype = pga[i].queue - pga[i].pc; + m = &pga[i]; + phys = VM_PAGE_TO_PHYS(m); + pqtype = m->queue - m->pc; if (((pqtype == PQ_FREE) || (pqtype == PQ_CACHE)) && (phys >= low) && (phys < high) && ((phys & (alignment - 1)) == 0) && - (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)) + (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0) && + m->busy == 0 && m->wire_count == 0 && + m->hold_count == 0 && (m->flags & PG_BUSY) == 0 + + ) { break; + } } /* @@ -254,11 +261,14 @@ again1: * (still in critical section) */ for (i = start + 1; i < (start + size / PAGE_SIZE); i++) { - int pqtype; - pqtype = pga[i].queue - pga[i].pc; - if ((VM_PAGE_TO_PHYS(&pga[i]) != - (VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)) || - ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE))) { + m = &pga[i]; + pqtype = m->queue - m->pc; + if ((VM_PAGE_TO_PHYS(&m[0]) != + (VM_PAGE_TO_PHYS(&m[-1]) + PAGE_SIZE)) || + ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE)) || + m->busy || m->wire_count || + m->hold_count || (m->flags & PG_BUSY) + ) { start++; goto again; } @@ -268,14 +278,13 @@ again1: * (still in critical section) */ for (i = start; i < (start + size / PAGE_SIZE); i++) { - int pqtype; - vm_page_t m = &pga[i]; - + m = &pga[i]; pqtype = m->queue - m->pc; if (pqtype == PQ_CACHE) { vm_page_busy(m); vm_page_free(m); } + KKASSERT(m->object == NULL); vm_page_unqueue_nowakeup(m); m->valid = VM_PAGE_BITS_ALL; if (m->flags & PG_ZERO) @@ -286,7 +295,6 @@ again1: ("vm_contig_pg_alloc: page %p was dirty", m)); m->wire_count = 0; m->busy = 0; - m->object = NULL; } /* -- 2.32.0