From 70fc5283006abf89e499f860fc3a7eab0e8e104a Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Fri, 12 Jan 2007 22:12:53 +0000 Subject: [PATCH] Fix the recently committed (and described) page writability nerf. The real kernel was unconditionally mapping writable pages read-only on read faults in order to be able to take another fault on a write attempt. This was needed for early virtual kernel support in order to set the Modify bit in the virtualized page table, but was being applied to ALL mappings rather then just those installed by the virtual kernel. Now the real kernel only does this for virtual kernel mappings. Additionally, the real kernel no longer makes the page read-only when clearing the Modify bit in the real page table (in order to rearm the write fault). When this case occurs VPTE_M has already been set in the virtual page table and no re-fault is required. The virtual kernel now only needs to invalidate the real kernel's page mapping when clearing the virtualized Modify bit in the virtual page table (VPTE_M), in order to rearm the real kernel's write fault so it can detect future modifications via the virtualized Modify bit. Also, the virtual kernel no longer needs to install read-only pages to detect the write fault. This allows the real kernel to do ALL the work required to handle VPTE_M and make the actual page writable. This greatly reduces the number of real page faults that occur and greatly reduces the number of page faults which have to be passed through to the virtual kernel. This fix reduces fork() overhead for processes running under a virtual kernel by 70%, from around 2100uS to around 650uS. --- sys/libkern/mcount.c | 4 ++- sys/platform/pc32/i386/pmap.c | 26 ++++++++++-------- sys/platform/vkernel/platform/pmap.c | 40 +++++++++++++++++----------- sys/vm/vm_fault.c | 5 ++-- 4 files changed, 45 insertions(+), 30 deletions(-) diff --git a/sys/libkern/mcount.c b/sys/libkern/mcount.c index 6a3f2743de..1aa815e5fb 100644 --- a/sys/libkern/mcount.c +++ b/sys/libkern/mcount.c @@ -31,7 +31,7 @@ * SUCH DAMAGE. * * $FreeBSD: src/sys/libkern/mcount.c,v 1.16 1999/12/29 04:54:41 peter Exp $ - * $DragonFly: src/sys/libkern/mcount.c,v 1.8 2006/11/07 17:51:23 dillon Exp $ + * $DragonFly: src/sys/libkern/mcount.c,v 1.9 2007/01/12 22:12:50 dillon Exp $ */ #include @@ -100,6 +100,8 @@ _MCOUNT_DECL(uintfptr_t frompc, uintfptr_t selfpc) * When we are called from an exception handler, frompci may be * for a user address. Convert such frompci's to the index of * user() to merge all user counts. + * + * XXX doesn't work properly with vkernel */ if (frompci >= p->textsize) { if (frompci + p->lowpc diff --git a/sys/platform/pc32/i386/pmap.c b/sys/platform/pc32/i386/pmap.c index 06ad7eec4c..abef842f55 100644 --- a/sys/platform/pc32/i386/pmap.c +++ b/sys/platform/pc32/i386/pmap.c @@ -40,7 +40,7 @@ * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $ - * $DragonFly: src/sys/platform/pc32/i386/pmap.c,v 1.70 2007/01/11 10:39:40 dillon Exp $ + * $DragonFly: src/sys/platform/pc32/i386/pmap.c,v 1.71 2007/01/12 22:12:51 dillon Exp $ */ /* @@ -2516,13 +2516,9 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, * Clear the modified and * accessed (referenced) bits * during the copy. - * - * Clear PG_RW to force a fault on - * modify so virtual page tables can - * be updated. */ m = PHYS_TO_VM_PAGE(ptetemp); - *dst_pte = ptetemp & ~(PG_M | PG_A | PG_RW); + *dst_pte = ptetemp & ~(PG_M | PG_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, m); @@ -2890,9 +2886,14 @@ pmap_clearbit(vm_page_t m, int bit) * Careful here. We can use a locked bus instruction to * clear PG_A or PG_M safely but we need to synchronize * with the target cpus when we mess with PG_RW. + * + * We do not have to force synchronization when clearing + * PG_M even for PTEs generated via virtual memory maps, + * because the virtual kernel will invalidate the pmap + * entry when/if it needs to resynchronize the Modify bit. */ pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va); - if (bit & (PG_RW | PG_M)) + if (bit & PG_RW) pmap_inval_add(&info, pv->pv_pmap, pv->pv_va); pbits = *pte; @@ -2903,11 +2904,14 @@ pmap_clearbit(vm_page_t m, int bit) atomic_clear_int(pte, PG_M|PG_RW); } else if (bit == PG_M) { /* - * When clearing the modified bit - * also make the page read only to - * force a fault-on-write. + * We could also clear PG_RW here to force + * a fault on write to redetect PG_M for + * virtual kernels, but it isn't necessary + * since virtual kernels invalidate the pte + * when they clear the VPTE_M bit in their + * virtual page tables. */ - atomic_clear_int(pte, PG_M|PG_RW); + atomic_clear_int(pte, PG_M); } else { atomic_clear_int(pte, bit); } diff --git a/sys/platform/vkernel/platform/pmap.c b/sys/platform/vkernel/platform/pmap.c index 1cd23919b9..a4a6b0db76 100644 --- a/sys/platform/vkernel/platform/pmap.c +++ b/sys/platform/vkernel/platform/pmap.c @@ -38,7 +38,7 @@ * * from: @(#)pmap.c 7.7 (Berkeley) 5/12/91 * $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $ - * $DragonFly: src/sys/platform/vkernel/platform/pmap.c,v 1.12 2007/01/12 18:03:48 dillon Exp $ + * $DragonFly: src/sys/platform/vkernel/platform/pmap.c,v 1.13 2007/01/12 22:12:52 dillon Exp $ */ /* * NOTE: PMAP_INVAL_ADD: In pc32 this function is called prior to adjusting @@ -2312,16 +2312,16 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, dstmpte = pmap_allocpte(dst_pmap, addr); if ((*dst_pte == 0) && (ptetemp = *src_pte)) { /* - * Clear the modified and - * accessed (referenced) bits - * during the copy. + * Clear the modified and accessed + * (referenced) bits during the copy. * - * Clear the Write bit to force a - * fault if under vpagetable - * management. + * We do not have to clear the write + * bit to force a fault-on-modify + * because the real kernel's target + * pmap is empty and will fault anyway. */ m = PHYS_TO_VM_PAGE(ptetemp); - *dst_pte = ptetemp & ~(VPTE_M | VPTE_A | VPTE_W); + *dst_pte = ptetemp & ~(VPTE_M | VPTE_A); dst_pmap->pm_stats.resident_count++; pmap_insert_entry(dst_pmap, addr, dstmpte, m); @@ -2634,7 +2634,8 @@ pmap_testbit(vm_page_t m, int bit) } /* - * this routine is used to modify bits in ptes + * This routine is used to clear bits in ptes. Certain bits require special + * handling, in particular (on virtual kernels) the VPTE_M (modify) bit. */ static __inline void pmap_clearbit(vm_page_t m, int bit) @@ -2675,9 +2676,10 @@ pmap_clearbit(vm_page_t m, int bit) * clear VPTE_A or VPTE_M safely but we need to synchronize * with the target cpus when we mess with VPTE_W. * - * On virtual kernels we also have to synchronize if clearing - * VPTE_M since the real kernel may have to force a fault - * to set it again. + * On virtual kernels we must force a new fault-on-write + * in the real kernel if we clear the Modify bit ourselves, + * otherwise the real kernel will not get a new fault and + * will never set our Modify bit again. */ pte = pmap_pte(pv->pv_pmap, pv->pv_va); if (bit & (VPTE_W|VPTE_M)) @@ -2692,11 +2694,17 @@ pmap_clearbit(vm_page_t m, int bit) atomic_clear_int(pte, VPTE_M|VPTE_W); } else if (bit == VPTE_M) { /* - * When clearing the modified bit also - * make the page read-only to force - * a fault. + * We do not have to make the page read-only + * when clearing the Modify bit. The real + * kernel will make the real PTE read-only + * or otherwise detect the write and set + * our VPTE_M again simply by us invalidating + * the real kernel VA for the pmap (as we did + * above). This allows the real kernel to + * handle the write fault without forwarding + * the fault to us. */ - atomic_clear_int(pte, VPTE_M|VPTE_W); + atomic_clear_int(pte, VPTE_M); } else { atomic_clear_int(pte, bit); } diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 141876476d..f38a62db55 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -67,7 +67,7 @@ * rights to redistribute these changes. * * $FreeBSD: src/sys/vm/vm_fault.c,v 1.108.2.8 2002/02/26 05:49:27 silby Exp $ - * $DragonFly: src/sys/vm/vm_fault.c,v 1.40 2007/01/11 20:53:42 dillon Exp $ + * $DragonFly: src/sys/vm/vm_fault.c,v 1.41 2007/01/12 22:12:53 dillon Exp $ */ /* @@ -737,7 +737,8 @@ vm_fault_object(struct faultstate *fs, * fs->prot above will simply not have VM_PROT_WRITE set. * * (2) If the mapping is a virtual page table we need to be able - * to detect writes so we can set VPTE_M. + * to detect writes so we can set VPTE_M in the virtual page + * table. * * (3) If the VM page is read-only or copy-on-write, upgrading would * just result in an unnecessary COW fault. -- 2.41.0