From 921c891ecf560602acfc7540df7a760f171e389e Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Wed, 12 Sep 2012 18:25:19 -0700
Subject: [PATCH] kernel - Implement segment pmap optimizations for x86-64

* Implement 2MB segment optimizations for x86-64.  Any shared read-only
  or read-write VM object mapped into memory, including physical objects
  (so both sysv_shm and mmap), which is a multiple of the segment size
  and segment-aligned can be optimized.

* Enable with sysctl machdep.pmap_mmu_optimize=1.  The default is off
  for now; this is an experimental feature.

* It works as follows: a VM object which is large enough will, as VM
  faults are generated, accumulate a truncated pmap (PD, PT, and PTEs)
  in the VM object itself.  VM faults whose vm_map_entry's can be
  optimized will cause the PTE, PT, and also the PD (for now) to be
  stored in a pmap embedded in the VM_OBJECT, instead of in the process
  pmap.  The process pmap then creates a PT entry in its PD page table
  that points to the PT page table page stored in the VM_OBJECT's pmap.

* This removes nearly all page table overhead from fork()'d processes,
  and even from unrelated processes which massively share data via
  mmap() or sysv_shm.  We still recommend using sysctl
  kern.ipc.shm_use_phys=1 (which is now the default), which also removes
  the PV entries associated with the shared pmap.  However, with this
  optimization PV entries are no longer a big issue since they will not
  be replicated in each process, only in the common pmap stored in the
  VM_OBJECT.

* Features of this optimization:

  * The number of PV entries is reduced to approximately the number of
    live pages and is no longer multiplied by the number of processes
    separately mapping the shared memory.

  * One process faulting in a page naturally makes the PTE available to
    all other processes mapping the same shared memory.  The other
    processes do not have to fault that same page in.

  * Page tables survive process exit and restart.

  * Once page tables are populated and cached, any new process that maps
    the shared memory will take far fewer faults because each fault will
    bring in an ENTIRE page table.  With Postgres and 64 clients, the VM
    fault rate was observed to drop from 1M faults/sec to less than 500
    at startup, and during the run the fault rate, instead of declining
    steadily into the hundreds of thousands, dropped almost instantly to
    virtually zero VM faults.

  * We no longer have to depend on sysv_shm to optimize the MMU.

  * CPU caches will do a better job caching page tables since most of
    them are now themselves shared.  Even when we invltlb, more of the
    page tables will be in the L1, L2, and L3 caches.

* EXPERIMENTAL!!!!!
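As a rough userland illustration (not part of the patch itself), the sketch
below shows how a qualifying mapping might be set up.  The sysctl name and
the 2MB/SEG_SIZE alignment rules come from this patch; the program itself,
the 16MB size choice, and the assumption that a MAP_SHARED anonymous mapping
ends up with VM_INHERIT_SHARE and a backing VM object are illustrative
assumptions, not something this patch adds.

    /*
     * Hypothetical sketch only: enable the experimental optimization and
     * create a shared mapping that satisfies the conditions checked by
     * pmap_allocpte_seg() -- a backing VM object, shared inheritance, and
     * a start address, object offset, and length that are all multiples
     * of the 2MB segment size.
     */
    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <sys/mman.h>
    #include <stdint.h>
    #include <stdio.h>

    #define SEG_SIZE    (2UL * 1024 * 1024)     /* 2MB segment (assumed) */

    int
    main(void)
    {
        int enable = 1;
        size_t len = 8 * SEG_SIZE;      /* must be a SEG_SIZE multiple */
        void *addr;

        /* machdep.pmap_mmu_optimize defaults to 0 (off). */
        if (sysctlbyname("machdep.pmap_mmu_optimize", NULL, NULL,
                         &enable, sizeof(enable)) < 0)
            perror("sysctlbyname");

        /* MAP_SHARED mappings are inherited shared across fork(). */
        addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
                    MAP_SHARED | MAP_ANON, -1, 0);
        if (addr == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        /* Only segment-aligned mappings can share page table pages. */
        if ((uintptr_t)addr & (SEG_SIZE - 1))
            printf("not 2MB-aligned; optimization will not apply\n");

        return 0;
    }

If the sysctl is left at its default of 0 the mapping still works; it simply
falls through to the normal per-process page table path.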
--- sys/kern/kern_slaballoc.c | 3 +- sys/platform/pc32/acpica5/acpi_wakeup.c | 6 +- sys/platform/pc32/i386/pmap.c | 38 +- sys/platform/pc32/include/pmap.h | 3 + sys/platform/pc64/include/pmap.h | 30 +- sys/platform/pc64/include/pmap_inval.h | 1 + sys/platform/pc64/x86_64/pmap.c | 646 ++++++++++++++++++++---- sys/platform/pc64/x86_64/pmap_inval.c | 6 + sys/platform/vkernel/include/pmap.h | 3 + sys/platform/vkernel/platform/pmap.c | 37 +- sys/platform/vkernel64/include/pmap.h | 3 + sys/platform/vkernel64/platform/pmap.c | 37 +- sys/vm/pmap.h | 11 +- sys/vm/vm_fault.c | 18 +- sys/vm/vm_object.c | 6 + sys/vm/vm_object.h | 6 + 16 files changed, 671 insertions(+), 183 deletions(-) diff --git a/sys/kern/kern_slaballoc.c b/sys/kern/kern_slaballoc.c index a923f2b122..9fc8a59114 100644 --- a/sys/kern/kern_slaballoc.c +++ b/sys/kern/kern_slaballoc.c @@ -1536,7 +1536,8 @@ kmem_slab_alloc(vm_size_t size, vm_offset_t align, int flags) */ m->valid = VM_PAGE_BITS_ALL; vm_page_wire(m); - pmap_enter(&kernel_pmap, addr + i, m, VM_PROT_ALL | VM_PROT_NOSYNC, 1); + pmap_enter(&kernel_pmap, addr + i, m, VM_PROT_ALL | VM_PROT_NOSYNC, + 1, NULL); if ((m->flags & PG_ZERO) == 0 && (flags & M_ZERO)) bzero((char *)addr + i, PAGE_SIZE); vm_page_flag_clear(m, PG_ZERO); diff --git a/sys/platform/pc32/acpica5/acpi_wakeup.c b/sys/platform/pc32/acpica5/acpi_wakeup.c index 53806a6539..2ff2d50cf6 100644 --- a/sys/platform/pc32/acpica5/acpi_wakeup.c +++ b/sys/platform/pc32/acpica5/acpi_wakeup.c @@ -223,7 +223,8 @@ acpi_sleep_machdep(struct acpi_softc *sc, int state) opage = PHYS_TO_VM_PAGE(oldphys); page = PHYS_TO_VM_PAGE(sc->acpi_wakephys); pmap_enter(pm, sc->acpi_wakephys, page, - VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE, 1); + VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE, + 1, NULL); ret_addr = 0; if (acpi_savecpu()) { @@ -294,7 +295,8 @@ out: vm_page_unlock_queues(); if (opage) { pmap_enter(pm, sc->acpi_wakephys, page, - VM_PROT_READ | VM_PROT_WRITE, 0); + VM_PROT_READ | VM_PROT_WRITE, + 0, NULL); } if (pteobj_allocated) { diff --git a/sys/platform/pc32/i386/pmap.c b/sys/platform/pc32/i386/pmap.c index 3d362ea55f..d2e14c5736 100644 --- a/sys/platform/pc32/i386/pmap.c +++ b/sys/platform/pc32/i386/pmap.c @@ -1611,27 +1611,6 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) vm_object_drop(kptobj); } -/* - * Retire the given physical map from service. - * - * Should only be called if the map contains no valid mappings. - * - * No requirements. - */ -void -pmap_destroy(pmap_t pmap) -{ - if (pmap == NULL) - return; - - lwkt_gettoken(&vm_token); - if (--pmap->pm_count == 0) { - pmap_release(pmap); - panic("destroying a pmap is not yet implemented"); - } - lwkt_reltoken(&vm_token); -} - /* * Add a reference to the specified pmap. * @@ -2190,7 +2169,7 @@ again: */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - boolean_t wired) + boolean_t wired, vm_map_entry_t entry __unused) { vm_paddr_t pa; unsigned *pte; @@ -2714,7 +2693,8 @@ pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) * No requirements. 
*/ void -pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) +pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired, + vm_map_entry_t entry __unused) { unsigned *pte; @@ -3621,3 +3601,15 @@ pmap_kvtom(vm_offset_t va) { return(PHYS_TO_VM_PAGE(*vtopte(va) & PG_FRAME)); } + +void +pmap_object_init(vm_object_t object) +{ + /* empty */ +} + +void +pmap_object_free(vm_object_t object) +{ + /* empty */ +} diff --git a/sys/platform/pc32/include/pmap.h b/sys/platform/pc32/include/pmap.h index a82e923169..f52a54cb53 100644 --- a/sys/platform/pc32/include/pmap.h +++ b/sys/platform/pc32/include/pmap.h @@ -212,6 +212,9 @@ struct md_page { TAILQ_HEAD(,pv_entry) pv_list; }; +struct md_object { +}; + /* * Each machine dependent implementation is expected to * keep certain statistics. They may do this anyway they diff --git a/sys/platform/pc64/include/pmap.h b/sys/platform/pc64/include/pmap.h index 870e49acc1..6c67d6c103 100644 --- a/sys/platform/pc64/include/pmap.h +++ b/sys/platform/pc64/include/pmap.h @@ -209,15 +209,29 @@ pte_store(pt_entry_t *ptep, pt_entry_t pte) /* * Pmap stuff */ +struct pmap; struct pv_entry; struct vm_page; struct vm_object; struct vmspace; +/* + * vm_page structures embed a list of related pv_entry's + */ struct md_page { TAILQ_HEAD(,pv_entry) pv_list; }; +/* + * vm_object's representing large mappings can contain embedded pmaps + * to organize sharing at higher page table levels for PROT_READ and + * PROT_READ|PROT_WRITE maps. + */ +struct md_object { + struct pmap *pmap_rw; + struct pmap *pmap_ro; +}; + /* * Each machine dependent implementation is expected to * keep certain statistics. They may do this anyway they @@ -244,7 +258,7 @@ struct pmap { RB_HEAD(pv_entry_rb_tree, pv_entry) pm_pvroot; int pm_count; /* reference count */ cpumask_t pm_active; /* active on cpus */ - int pm_filler02; /* (filler sync w/vkernel) */ + int pm_flags; struct pmap_statistics pm_stats; /* pmap statistics */ struct pv_entry *pm_pvhint; /* pv_entry lookup hint */ int pm_generation; /* detect pvlist deletions */ @@ -255,6 +269,8 @@ struct pmap { #define CPUMASK_LOCK CPUMASK(SMP_MAXCPU) #define CPUMASK_BIT SMP_MAXCPU /* for 1LLU << SMP_MAXCPU */ +#define PMAP_FLAG_SIMPLE 0x00000001 + #define pmap_resident_count(pmap) (pmap)->pm_stats.resident_count typedef struct pmap *pmap_t; @@ -274,17 +290,19 @@ typedef struct pv_entry { RB_ENTRY(pv_entry) pv_entry; struct vm_page *pv_m; /* page being mapped */ u_int pv_hold; /* interlock action */ - u_int pv_unused01; + u_int pv_flags; #ifdef PMAP_DEBUG const char *pv_func; int pv_line; #endif } *pv_entry_t; -#define PV_HOLD_LOCKED 0x80000000U -#define PV_HOLD_WAITING 0x40000000U -#define PV_HOLD_DELETED 0x20000000U -#define PV_HOLD_MASK 0x1FFFFFFFU +#define PV_HOLD_LOCKED 0x80000000U +#define PV_HOLD_WAITING 0x40000000U +#define PV_HOLD_DELETED 0x20000000U +#define PV_HOLD_MASK 0x1FFFFFFFU + +#define PV_FLAG_VMOBJECT 0x00000001U /* shared pt in VM obj */ #ifdef _KERNEL diff --git a/sys/platform/pc64/include/pmap_inval.h b/sys/platform/pc64/include/pmap_inval.h index a1685a96b5..5dcbc8bda0 100644 --- a/sys/platform/pc64/include/pmap_inval.h +++ b/sys/platform/pc64/include/pmap_inval.h @@ -61,6 +61,7 @@ typedef pmap_inval_info *pmap_inval_info_t; void pmap_inval_init(pmap_inval_info_t); void pmap_inval_interlock(pmap_inval_info_t, pmap_t, vm_offset_t); +void pmap_inval_invltlb(pmap_inval_info_t); void pmap_inval_deinterlock(pmap_inval_info_t, pmap_t); void pmap_inval_done(pmap_inval_info_t); diff --git a/sys/platform/pc64/x86_64/pmap.c 
b/sys/platform/pc64/x86_64/pmap.c index e23f5649ec..928ef7c86b 100644 --- a/sys/platform/pc64/x86_64/pmap.c +++ b/sys/platform/pc64/x86_64/pmap.c @@ -6,7 +6,7 @@ * Copyright (c) 2005-2008 Alan L. Cox * Copyright (c) 2008, 2009 The DragonFly Project. * Copyright (c) 2008, 2009 Jordan Gordeev. - * Copyright (c) 2011 Matthew Dillon + * Copyright (c) 2011-2012 Matthew Dillon * All rights reserved. * * This code is derived from software contributed to Berkeley by @@ -154,6 +154,8 @@ static int protection_codes[8]; struct pmap kernel_pmap; static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list); +MALLOC_DEFINE(M_OBJPMAP, "objpmap", "pmaps associated with VM objects"); + vm_paddr_t avail_start; /* PA of first available physical page */ vm_paddr_t avail_end; /* PA of last available physical page */ vm_offset_t virtual2_start; /* cutout free area prior to kernel start */ @@ -209,6 +211,9 @@ static caddr_t crashdumpmap; static int pmap_yield_count = 64; SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW, &pmap_yield_count, 0, "Yield during init_pt/release"); +static int pmap_mmu_optimize = 0; +SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW, + &pmap_mmu_optimize, 0, "Share page table pages when possible"); #define DISABLE_PSE @@ -230,16 +235,19 @@ static void pv_free(pv_entry_t pv); static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex); static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp); +static pv_entry_t pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, + pv_entry_t *pvpp, vm_map_entry_t entry, vm_offset_t va); static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info); static vm_page_t pmap_remove_pv_page(pv_entry_t pv); +static int pmap_release_pv(pv_entry_t pv); static void pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info, - pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va, - pt_entry_t *ptep, void *arg __unused); + pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, + vm_offset_t va, pt_entry_t *ptep, void *arg __unused); static void pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info, - pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va, - pt_entry_t *ptep, void *arg __unused); + pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, + vm_offset_t va, pt_entry_t *ptep, void *arg __unused); static void i386_protection_init (void); static void create_pagetables(vm_paddr_t *firstaddr); @@ -395,6 +403,9 @@ pmap_pdp_index(vm_offset_t va) /* * Generic procedure to index a pte from a pt, pd, or pdp. + * + * NOTE: Normally passed pindex as pmap_xx_index(). pmap_xx_pindex() is NOT + * a page table page index but is instead of PV lookup index. */ static void * @@ -1301,24 +1312,37 @@ pmap_pinit0(struct pmap *pmap) * Initialize a preallocated and zeroed pmap structure, * such as one in a vmspace structure. */ -void -pmap_pinit(struct pmap *pmap) +static void +pmap_pinit_simple(struct pmap *pmap) { - pv_entry_t pv; - int j; - /* * Misc initialization */ pmap->pm_count = 1; pmap->pm_active = 0; pmap->pm_pvhint = NULL; + pmap->pm_flags = PMAP_FLAG_SIMPLE; + + /* + * Don't blow up locks/tokens on re-use (XXX fix/use drop code + * for this). 
+ */ if (pmap->pm_pmlpv == NULL) { RB_INIT(&pmap->pm_pvroot); bzero(&pmap->pm_stats, sizeof pmap->pm_stats); spin_init(&pmap->pm_spin); lwkt_token_init(&pmap->pm_token, "pmap_tok"); } +} + +void +pmap_pinit(struct pmap *pmap) +{ + pv_entry_t pv; + int j; + + pmap_pinit_simple(pmap); + pmap->pm_flags &= ~PMAP_FLAG_SIMPLE; /* * No need to allocate page table space yet but we do need a valid @@ -1416,9 +1440,6 @@ pmap_puninit(pmap_t pmap) void pmap_pinit2(struct pmap *pmap) { - /* - * XXX copies current process, does not fill in MPPTDI - */ spin_lock(&pmap_spin); TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode); spin_unlock(&pmap_spin); @@ -1443,12 +1464,14 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) vm_pindex_t pt_pindex; vm_page_t m; int isnew; + int ispt; /* * If the pv already exists and we aren't being asked for the * parent page table page we can just return it. A locked+held pv * is returned. */ + ispt = 0; pv = pv_alloc(pmap, ptepindex, &isnew); if (isnew == 0 && pvpp == NULL) return(pv); @@ -1505,13 +1528,23 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) */ ptepindex = pv->pv_pindex - pmap_pt_pindex(0); ptepindex &= ((1ul << NPDEPGSHIFT) - 1); + ispt = 1; } else if (ptepindex < pmap_pdp_pindex(0)) { /* * pv is PD, pvp is PDP + * + * SIMPLE PMAP NOTE: Simple pmaps do not allocate above + * the PD. */ ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT; ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; - pvp = pmap_allocpte(pmap, ptepindex, NULL); + + if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { + KKASSERT(pvpp == NULL); + pvp = NULL; + } else { + pvp = pmap_allocpte(pmap, ptepindex, NULL); + } if (!isnew) goto notnew; @@ -1585,11 +1618,33 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) * we just put it away. * * No interlock is needed for pte 0 -> non-zero. + * + * In the situation where *ptep is valid we might have an unmanaged + * page table page shared from another page table which we need to + * unshare before installing our private page table page. */ if (pvp) { - vm_page_wire_quick(pvp->pv_m); ptep = pv_pte_lookup(pvp, ptepindex); - KKASSERT((*ptep & PG_V) == 0); + if (*ptep & PG_V) { + pt_entry_t pte; + pmap_inval_info info; + + kprintf("pmap_allocpte: restate shared pg table pg\n"); + + if (ispt == 0) { + panic("pmap_allocpte: unexpected pte %p/%d", + pvp, (int)ptepindex); + } + pmap_inval_init(&info); + pmap_inval_interlock(&info, pmap, (vm_offset_t)-1); + pte = pte_load_clear(ptep); + pmap_inval_deinterlock(&info, pmap); + pmap_inval_done(&info); + if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) + panic("pmap_allocpte: shared pgtable pg bad wirecount"); + } else { + vm_page_wire_quick(pvp->pv_m); + } *ptep = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V | PG_A | PG_M); } @@ -1602,6 +1657,206 @@ notnew: return (pv); } +/* + * This version of pmap_allocpte() checks for possible segment optimizations + * that would allow page-table sharing. It can be called for terminal + * page or page table page ptepindex's. + * + * The function is called with page table page ptepindex's for fictitious + * and unmanaged terminal pages. That is, we don't want to allocate a + * terminal pv, we just want the pt_pv. pvpp is usually passed as NULL + * for this case. + * + * This function can return a pv and *pvpp associated with the passed in pmap + * OR a pv and *pvpp associated with the shared pmap. In the latter case + * an unmanaged page table page will be entered into the pass in pmap. 
+ */ +static +pv_entry_t +pmap_allocpte_seg(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp, + vm_map_entry_t entry, vm_offset_t va) +{ + struct pmap_inval_info info; + vm_object_t object; + pmap_t obpmap; + pmap_t *obpmapp; + vm_offset_t b; + pv_entry_t pte_pv; /* in original or shared pmap */ + pv_entry_t pt_pv; /* in original or shared pmap */ + pv_entry_t proc_pd_pv; /* in original pmap */ + pv_entry_t proc_pt_pv; /* in original pmap */ + pv_entry_t xpv; /* PT in shared pmap */ + pd_entry_t *pt; /* PT entry in PD of original pmap */ + pd_entry_t opte; /* contents of *pt */ + pd_entry_t npte; /* contents of *pt */ + vm_page_t m; + + /* + * Basic tests, require a non-NULL vm_map_entry, require proper + * alignment and type for the vm_map_entry, require that the + * underlying object already be allocated. + * + * We currently allow any type of object to use this optimization. + * The object itself does NOT have to be sized to a multiple of the + * segment size, but the memory mapping does. + */ + if (entry == NULL || + pmap_mmu_optimize == 0 || /* not enabled */ + ptepindex >= pmap_pd_pindex(0) || /* not terminal */ + entry->inheritance != VM_INHERIT_SHARE || /* not shared */ + entry->maptype != VM_MAPTYPE_NORMAL || /* weird map type */ + entry->object.vm_object == NULL || /* needs VM object */ + (entry->offset & SEG_MASK) || /* must be aligned */ + (entry->start & SEG_MASK)) { + return(pmap_allocpte(pmap, ptepindex, pvpp)); + } + + /* + * Make sure the full segment can be represented. + */ + b = va & ~(vm_offset_t)SEG_MASK; + if (b < entry->start && b + SEG_SIZE > entry->end) + return(pmap_allocpte(pmap, ptepindex, pvpp)); + + /* + * If the full segment can be represented dive the VM object's + * shared pmap, allocating as required. + */ + object = entry->object.vm_object; + + if (entry->protection & VM_PROT_WRITE) + obpmapp = &object->md.pmap_rw; + else + obpmapp = &object->md.pmap_ro; + + /* + * We allocate what appears to be a normal pmap but because portions + * of this pmap are shared with other unrelated pmaps we have to + * set pm_active to point to all cpus. + * + * XXX Currently using pmap_spin to interlock the update, can't use + * vm_object_hold/drop because the token might already be held + * shared OR exclusive and we don't know. + */ + while ((obpmap = *obpmapp) == NULL) { + obpmap = kmalloc(sizeof(*obpmap), M_OBJPMAP, M_WAITOK|M_ZERO); + pmap_pinit_simple(obpmap); + pmap_pinit2(obpmap); + spin_lock(&pmap_spin); + if (*obpmapp != NULL) { + /* + * Handle race + */ + spin_unlock(&pmap_spin); + pmap_release(obpmap); + pmap_puninit(obpmap); + kfree(obpmap, M_OBJPMAP); + } else { + obpmap->pm_active = smp_active_mask; + *obpmapp = obpmap; + spin_unlock(&pmap_spin); + } + } + + /* + * Layering is: PTE, PT, PD, PDP, PML4. We have to return the + * pte/pt using the shared pmap from the object but also adjust + * the process pmap's page table page as a side effect. + */ + + /* + * Resolve the terminal PTE and PT in the shared pmap. This is what + * we will return. This is true if ptepindex represents a terminal + * page, otherwise pte_pv is actually the PT and pt_pv is actually + * the PD. + */ + pt_pv = NULL; + pte_pv = pmap_allocpte(obpmap, ptepindex, &pt_pv); + if (ptepindex >= pmap_pt_pindex(0)) + xpv = pte_pv; + else + xpv = pt_pv; + + /* + * Resolve the PD in the process pmap so we can properly share the + * page table page. Lock order is bottom-up (leaf first)! + * + * NOTE: proc_pt_pv can be NULL. 
+ */ + proc_pt_pv = pv_get(pmap, pmap_pt_pindex(b)); + proc_pd_pv = pmap_allocpte(pmap, pmap_pd_pindex(b), NULL); + + /* + * xpv is the page table page pv from the shared object + * (for convenience). + * + * Calculate the pte value for the PT to load into the process PD. + * If we have to change it we must properly dispose of the previous + * entry. + */ + pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b)); + npte = VM_PAGE_TO_PHYS(xpv->pv_m) | + (PG_U | PG_RW | PG_V | PG_A | PG_M); + if (*pt == 0) { + *pt = npte; + vm_page_wire_quick(xpv->pv_m); + vm_page_wire_quick(proc_pd_pv->pv_m); + atomic_add_long(&pmap->pm_stats.resident_count, 1); + } else if (*pt != npte) { + pmap_inval_init(&info); + pmap_inval_interlock(&info, pmap, (vm_offset_t)-1); + if (*pt != npte) { + opte = pte_load_clear(pt); + *pt = npte; + vm_page_wire_quick(xpv->pv_m); + + /* + * Clean up opte, bump the wire_count for the process + * PD page representing the new entry if it was + * previously empty. + * + * If the entry was not previously empty and we have + * a PT in the proc pmap then opte must match that + * pt. The proc pt must be retired (this is done + * later on in this procedure). + */ + if (opte & PG_V) { + m = PHYS_TO_VM_PAGE(opte & PG_FRAME); + if (proc_pt_pv) { + KKASSERT(proc_pt_pv->pv_m == m); + } else { + if (vm_page_unwire_quick(m)) { + panic("pmap_allocpte_seg: " + "bad wire count %p", + m); + } + } + } else { + vm_page_wire_quick(proc_pd_pv->pv_m); + } + } + pmap_inval_deinterlock(&info, pmap); + pmap_inval_done(&info); + } else { + KKASSERT(proc_pt_pv == NULL); + } + + /* + * The existing process page table was replaced and must be destroyed + * here. + */ + if (proc_pd_pv) + pv_put(proc_pd_pv); + if (proc_pt_pv) + pmap_release_pv(proc_pt_pv); + if (pvpp) + *pvpp = pt_pv; + else + pv_put(pt_pv); + + return (pte_pv); +} + /* * Release any resources held by the given physical map. * @@ -1647,7 +1902,9 @@ pmap_release(struct pmap *pmap) * One resident page (the pml4 page) should remain. * No wired pages should remain. */ - KKASSERT(pmap->pm_stats.resident_count == 1); + KKASSERT(pmap->pm_stats.resident_count == + ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1)); + KKASSERT(pmap->pm_stats.wired_count == 0); } @@ -1656,7 +1913,7 @@ pmap_release_callback(pv_entry_t pv, void *data) { struct pmap_release_info *info = data; pmap_t pmap = info->pmap; - vm_page_t p; + int r; if (pv_hold_try(pv)) { spin_unlock(&pmap->pm_spin); @@ -1670,6 +1927,19 @@ pmap_release_callback(pv_entry_t pv, void *data) return(-1); } } + r = pmap_release_pv(pv); + spin_lock(&pmap->pm_spin); + return(r); +} + +/* + * Called with held (i.e. also locked) pv. This function will dispose of + * the lock along with the pv. + */ +static int +pmap_release_pv(pv_entry_t pv) +{ + vm_page_t p; /* * The pmap is currently not spinlocked, pv is held+locked. @@ -1698,7 +1968,6 @@ pmap_release_callback(pv_entry_t pv, void *data) */ if (pv->pv_pindex == pmap_pml4_pindex()) { pv_put(pv); - spin_lock(&pmap->pm_spin); return(-1); } @@ -1719,12 +1988,16 @@ pmap_release_callback(pv_entry_t pv, void *data) vm_page_unwire(p, 0); KKASSERT(p->wire_count == 0); - /* JG eventually revert to using vm_page_free_zero() */ + + /* + * Theoretically this page, if not the pml4 page, should contain + * all-zeros. But its just too dangerous to mark it PG_ZERO. Free + * normally. 
+ */ vm_page_free(p); skip: pv_free(pv); - spin_lock(&pmap->pm_spin); - return(0); + return 0; } /* @@ -1776,6 +2049,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info) if (pvp == NULL) { pml4_pindex = pmap_pml4_pindex(); pvp = pv_get(pv->pv_pmap, pml4_pindex); + KKASSERT(pvp); gotpvp = 1; } pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)]; @@ -1785,7 +2059,11 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info) KKASSERT(info == NULL); } else if (ptepindex >= pmap_pd_pindex(0)) { /* - * Remove a PD page from the pdp + * Remove a PD page from the pdp + * + * SIMPLE PMAP NOTE: Non-existant pvp's are ok in the case + * of a simple pmap because it stops at + * the PD page. */ vm_pindex_t pdp_pindex; vm_pindex_t pd_index; @@ -1797,12 +2075,19 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info) pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + (pd_index >> NPML4EPGSHIFT); pvp = pv_get(pv->pv_pmap, pdp_pindex); - gotpvp = 1; + if (pvp) + gotpvp = 1; + } + if (pvp) { + pd = pv_pte_lookup(pvp, pd_index & + ((1ul << NPDPEPGSHIFT) - 1)); + KKASSERT((*pd & PG_V) != 0); + p = PHYS_TO_VM_PAGE(*pd & PG_FRAME); + *pd = 0; + } else { + KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE); + p = pv->pv_m; /* degenerate test later */ } - pd = pv_pte_lookup(pvp, pd_index & ((1ul << NPDPEPGSHIFT) - 1)); - KKASSERT((*pd & PG_V) != 0); - p = PHYS_TO_VM_PAGE(*pd & PG_FRAME); - *pd = 0; KKASSERT(info == NULL); } else if (ptepindex >= pmap_pt_pindex(0)) { /* @@ -1818,6 +2103,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info) pd_pindex = NUPTE_TOTAL + NUPT_TOTAL + (pt_index >> NPDPEPGSHIFT); pvp = pv_get(pv->pv_pmap, pd_pindex); + KKASSERT(pvp); gotpvp = 1; } pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1)); @@ -1848,6 +2134,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info) pt_pindex = NUPTE_TOTAL + (ptepindex >> NPDPEPGSHIFT); pvp = pv_get(pv->pv_pmap, pt_pindex); + KKASSERT(pvp); gotpvp = 1; } ptep = pv_pte_lookup(pvp, ptepindex & @@ -2042,36 +2329,24 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) } /* - * Retire the given physical map from service. - * Should only be called if the map contains - * no valid mappings. + * Add a reference to the specified pmap. */ void -pmap_destroy(pmap_t pmap) +pmap_reference(pmap_t pmap) { - int count; - - if (pmap == NULL) - return; - - lwkt_gettoken(&pmap->pm_token); - count = --pmap->pm_count; - if (count == 0) { - pmap_release(pmap); /* eats pm_token */ - panic("destroying a pmap is not yet implemented"); + if (pmap != NULL) { + lwkt_gettoken(&pmap->pm_token); + ++pmap->pm_count; + lwkt_reltoken(&pmap->pm_token); } - lwkt_reltoken(&pmap->pm_token); } -/* - * Add a reference to the specified pmap. - */ void -pmap_reference(pmap_t pmap) +pmap_drop(pmap_t pmap) { if (pmap != NULL) { lwkt_gettoken(&pmap->pm_token); - pmap->pm_count++; + --pmap->pm_count; lwkt_reltoken(&pmap->pm_token); } } @@ -2473,19 +2748,24 @@ pmap_collect(void) /* * Scan the pmap for active page table entries and issue a callback. - * The callback must dispose of pte_pv. + * The callback must dispose of pte_pv, whos PTE entry is at *ptep in + * its parent page table. * - * NOTE: Unmanaged page table entries will not have a pte_pv + * pte_pv will be NULL if the page is unmanaged. + * pt_pv will point to the page table page containing the pte for the page. * - * NOTE: Kernel page table entries will not have a pt_pv. 
That is, wiring - * counts are not tracked in kernel page table pages. + * NOTE! If we come across an unmanaged page TABLE (verses an unmanaged page), + * we pass a NULL pte_pv and we pass a pt_pv pointing to the passed + * process pmap's PD and page to the callback function. This can be + * confusing because the pt_pv is really a pd_pv, and the target page + * table page is simply aliased by the pmap and not owned by it. * * It is assumed that the start and end are properly rounded to the page size. */ static void pmap_scan(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva, void (*func)(pmap_t, struct pmap_inval_info *, - pv_entry_t, pv_entry_t, vm_offset_t, + pv_entry_t, pv_entry_t, int, vm_offset_t, pt_entry_t *, void *), void *arg) { @@ -2531,13 +2811,27 @@ pmap_scan(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva, ptep = vtopte(sva); } else { /* - * User mappings may or may not have a pte_pv but - * will always have a pt_pv if the page is present. + * User pages which are unmanaged will not have a + * pte_pv. User page table pages which are unmanaged + * (shared from elsewhere) will also not have a pt_pv. + * The func() callback will pass both pte_pv and pt_pv + * as NULL in that case. */ pte_pv = pv_get(pmap, pmap_pte_pindex(sva)); pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); if (pt_pv == NULL) { KKASSERT(pte_pv == NULL); + pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); + if (pd_pv) { + ptep = pv_pte_lookup(pd_pv, + pmap_pt_index(sva)); + if (*ptep) { + func(pmap, &info, + NULL, pd_pv, 1, + sva, ptep, arg); + } + pv_put(pd_pv); + } goto fast_skip; } ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); @@ -2555,12 +2849,12 @@ pmap_scan(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva, PG_V), ("bad *ptep %016lx sva %016lx pte_pv %p", *ptep, sva, pte_pv)); - func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg); + func(pmap, &info, pte_pv, pt_pv, 0, sva, ptep, arg); } else { KASSERT((*ptep & (PG_MANAGED|PG_V)) == PG_V, ("bad *ptep %016lx sva %016lx pte_pv NULL", *ptep, sva)); - func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg); + func(pmap, &info, NULL, pt_pv, 0, sva, ptep, arg); } if (pt_pv) pv_put(pt_pv); @@ -2659,10 +2953,29 @@ fast_skip: } /* - * We will scan or skip a page table page so adjust va_next - * either way. + * If pt_pv is NULL we either have an shared page table + * page and must issue a callback specific to that case, + * or there is no page table page. + * + * Either way we can skip the page table page. */ if (pt_pv == NULL) { + /* + * Possible unmanaged (shared from another pmap) + * page table page. + */ + if (pd_pv == NULL) + pd_pv = pv_get(pmap, pmap_pd_pindex(sva)); + KKASSERT(pd_pv != NULL); + ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva)); + if (*ptep & PG_V) { + func(pmap, &info, NULL, pd_pv, 1, + sva, ptep, arg); + } + + /* + * Done, move to next page table page. 
+ */ va_next = (sva + NBPDR) & ~PDRMASK; if (va_next < sva) va_next = eva; @@ -2735,6 +3048,12 @@ kernel_skip: pte_pv = NULL; pt_pv = pv_get(pmap, pmap_pt_pindex(sva)); + /* + * pt_pv reloaded, need new ptep + */ + KKASSERT(pt_pv != NULL); + ptep = pv_pte_lookup(pt_pv, + pmap_pte_index(sva)); continue; } } else { @@ -2768,16 +3087,16 @@ kernel_skip: ("bad *ptep %016lx sva %016lx " "pte_pv %p", *ptep, sva, pte_pv)); - func(pmap, &info, pte_pv, pt_pv, sva, - ptep, arg); + func(pmap, &info, pte_pv, pt_pv, 0, + sva, ptep, arg); } else { KASSERT((*ptep & (PG_MANAGED|PG_V)) == PG_V, ("bad *ptep %016lx sva %016lx " "pte_pv NULL", *ptep, sva)); - func(pmap, &info, pte_pv, pt_pv, sva, - ptep, arg); + func(pmap, &info, NULL, pt_pv, 0, + sva, ptep, arg); } pte_pv = NULL; sva += PAGE_SIZE; @@ -2808,8 +3127,8 @@ pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) static void pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info, - pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va, - pt_entry_t *ptep, void *arg __unused) + pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, + vm_offset_t va, pt_entry_t *ptep, void *arg __unused) { pt_entry_t pte; @@ -2821,8 +3140,10 @@ pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info, pmap_remove_pv_pte(pte_pv, pt_pv, info); pmap_remove_pv_page(pte_pv); pv_free(pte_pv); - } else { + } else if (sharept == 0) { /* + * Unmanaged page + * * pt_pv's wire_count is still bumped by unmanaged pages * so we must decrement it manually. */ @@ -2832,8 +3153,24 @@ pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info, if (pte & PG_W) atomic_add_long(&pmap->pm_stats.wired_count, -1); atomic_add_long(&pmap->pm_stats.resident_count, -1); - if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m)) + if (vm_page_unwire_quick(pt_pv->pv_m)) panic("pmap_remove: insufficient wirecount"); + } else { + /* + * Unmanaged page table, pt_pv is actually the pd_pv + * for our pmap (not the share object pmap). + * + * We have to unwire the target page table page and we + * have to unwire our page directory page. + */ + pmap_inval_interlock(info, pmap, va); + pte = pte_load_clear(ptep); + pmap_inval_deinterlock(info, pmap); + atomic_add_long(&pmap->pm_stats.resident_count, -1); + if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) + panic("pmap_remove: shared pgtable1 bad wirecount"); + if (vm_page_unwire_quick(pt_pv->pv_m)) + panic("pmap_remove: shared pgtable2 bad wirecount"); } } @@ -2882,13 +3219,14 @@ pmap_remove_all(vm_page_t m) } /* - * pmap_protect: + * Set the physical protection on the specified range of this map + * as requested. This function is typically only used for debug watchpoints + * and COW pages. * - * Set the physical protection on the specified range of this map - * as requested. + * This function may not be called from an interrupt if the map is + * not the kernel_pmap. * - * This function may not be called from an interrupt if the map is - * not the kernel_pmap. + * NOTE! For shared page table pages we just unmap the page. 
*/ void pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) @@ -2909,11 +3247,12 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) static void pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info, - pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va, - pt_entry_t *ptep, void *arg __unused) + pv_entry_t pte_pv, pv_entry_t pt_pv, int sharept, + vm_offset_t va, pt_entry_t *ptep, void *arg __unused) { pt_entry_t pbits; pt_entry_t cbits; + pt_entry_t pte; vm_page_t m; /* @@ -2939,10 +3278,30 @@ again: cbits &= ~PG_M; } } + } else if (sharept) { + /* + * Unmanaged page table, pt_pv is actually the pd_pv + * for our pmap (not the share object pmap). + * + * When asked to protect something in a shared page table + * page we just unmap the page table page. We have to + * invalidate the tlb in this situation. + */ + pte = pte_load_clear(ptep); + pmap_inval_invltlb(info); + if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME))) + panic("pmap_protect: pgtable1 pg bad wirecount"); + if (vm_page_unwire_quick(pt_pv->pv_m)) + panic("pmap_protect: pgtable2 pg bad wirecount"); + ptep = NULL; } - cbits &= ~PG_RW; - if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) { - goto again; + /* else unmanaged page, adjust bits, no wire changes */ + + if (ptep) { + cbits &= ~PG_RW; + if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) { + goto again; + } } pmap_inval_deinterlock(info, pmap); if (pte_pv) @@ -2953,12 +3312,17 @@ again: * Insert the vm_page (m) at the virtual address (va), replacing any prior * mapping at that address. Set protection and wiring as requested. * + * If entry is non-NULL we check to see if the SEG_SIZE optimization is + * possible. If it is we enter the page into the appropriate shared pmap + * hanging off the related VM object instead of the passed pmap, then we + * share the page table page from the VM object's pmap into the current pmap. + * * NOTE: This routine MUST insert the page into the pmap now, it cannot * lazy-evaluate. */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - boolean_t wired) + boolean_t wired, vm_map_entry_t entry __unused) { pmap_inval_info info; pv_entry_t pt_pv; /* page table */ @@ -3015,24 +3379,31 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, pte_pv = NULL; pt_pv = NULL; ptep = vtopte(va); - } else if (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) { + } else if (m->flags & (PG_FICTITIOUS | PG_UNMANAGED)) { /* XXX */ pte_pv = NULL; if (va >= VM_MAX_USER_ADDRESS) { pt_pv = NULL; ptep = vtopte(va); } else { - pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL); + pt_pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), + NULL, entry, va); ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); } KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED) == 0); } else { if (va >= VM_MAX_USER_ADDRESS) { + /* + * Kernel map, pv_entry-tracked. 
+ */ pt_pv = NULL; pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL); ptep = vtopte(va); } else { - pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), - &pt_pv); + /* + * User map + */ + pte_pv = pmap_allocpte_seg(pmap, pmap_pte_pindex(va), + &pt_pv, entry, va); ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); } KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED)); @@ -3090,12 +3461,20 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, if (pte_pv->pv_m) pmap_remove_pv_page(pte_pv); } else if (prot & VM_PROT_NOSYNC) { - /* leave wire count on PT page intact */ + /* + * Unmanaged page, NOSYNC (no mmu sync) requested. + * + * Leave wire count on PT page intact. + */ (void)pte_load_clear(ptep); cpu_invlpg((void *)va); atomic_add_long(&pmap->pm_stats.resident_count, -1); } else { - /* leave wire count on PT page intact */ + /* + * Unmanaged page, normal enter. + * + * Leave wire count on PT page intact. + */ pmap_inval_interlock(&info, pmap, va); (void)pte_load_clear(ptep); pmap_inval_deinterlock(&info, pmap); @@ -3129,9 +3508,10 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, } /* - * Ok, for UVM (pt_pv != NULL) we don't need to interlock or - * invalidate anything, the TLB won't have any stale entries to - * remove. + * Kernel VMAs (pt_pv == NULL) require pmap invalidation interlocks. + * + * User VMAs do not because those will be zero->non-zero, so no + * stale entries to worry about at this point. * * For KVM there appear to still be issues. Theoretically we * should be able to scrap the interlocks entirely but we @@ -3139,6 +3519,10 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, */ if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL) pmap_inval_interlock(&info, pmap, va); + + /* + * Set the pte + */ *(volatile pt_entry_t *)ptep = newpte; if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL) @@ -3146,12 +3530,22 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, else if (pt_pv == NULL) cpu_invlpg((void *)va); - if (wired) - atomic_add_long(&pmap->pm_stats.wired_count, 1); + if (wired) { + if (pte_pv) { + atomic_add_long(&pte_pv->pv_pmap->pm_stats.wired_count, + 1); + } else { + atomic_add_long(&pmap->pm_stats.wired_count, 1); + } + } if (newpte & PG_RW) vm_page_flag_set(m, PG_WRITEABLE); - if (pte_pv == NULL) - atomic_add_long(&pmap->pm_stats.resident_count, 1); + + /* + * Unmanaged pages need manual resident_count tracking. + */ + if (pte_pv == NULL && pt_pv) + atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1); /* * Cleanup @@ -3180,7 +3574,7 @@ done: void pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m) { - pmap_enter(pmap, va, m, VM_PROT_READ, FALSE); + pmap_enter(pmap, va, m, VM_PROT_READ, FALSE, NULL); } /* @@ -3228,6 +3622,9 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) return; + /* + * Misc additional checks + */ psize = x86_64_btop(size); if ((object->type != OBJT_VNODE) || @@ -3245,6 +3642,18 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot, if (psize == 0) return; + /* + * If everything is segment-aligned do not pre-init here. Instead + * allow the normal vm_fault path to pass a segment hint to + * pmap_enter() which will then use an object-referenced shared + * page table page. 
+ */ + if ((addr & SEG_MASK) == 0 && + (ctob(psize) & SEG_MASK) == 0 && + (ctob(pindex) & SEG_MASK) == 0) { + return; + } + /* * Use a red-black scan to traverse the requested range and load * any valid pages found into the pmap. @@ -3332,7 +3741,8 @@ pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) * exist in the pmap. The mapping may or may not be managed. */ void -pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) +pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired, + vm_map_entry_t entry) { pt_entry_t *ptep; pv_entry_t pv; @@ -3340,13 +3750,13 @@ pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) if (pmap == NULL) return; lwkt_gettoken(&pmap->pm_token); - pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL); + pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), NULL, entry, va); ptep = pv_pte_lookup(pv, pmap_pte_index(va)); if (wired && !pmap_pte_w(ptep)) - atomic_add_long(&pmap->pm_stats.wired_count, 1); + atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, 1); else if (!wired && pmap_pte_w(ptep)) - atomic_add_long(&pmap->pm_stats.wired_count, -1); + atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, -1); /* * Wiring is not a hardware characteristic so there is no need to @@ -4082,3 +4492,47 @@ pmap_kvtom(vm_offset_t va) { return(PHYS_TO_VM_PAGE(*vtopte(va) & PG_FRAME)); } + +/* + * Initialize machine-specific shared page directory support. This + * is executed when a VM object is created. + */ +void +pmap_object_init(vm_object_t object) +{ + object->md.pmap_rw = NULL; + object->md.pmap_ro = NULL; +} + +/* + * Clean up machine-specific shared page directory support. This + * is executed when a VM object is destroyed. + */ +void +pmap_object_free(vm_object_t object) +{ + pmap_t pmap; + + if ((pmap = object->md.pmap_rw) != NULL) { + object->md.pmap_rw = NULL; + kprintf("pmap_object_free: destroying pmap %p in obj %p\n", + pmap, object); + pmap_remove_pages(pmap, + VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS); + pmap->pm_active = 0; + pmap_release(pmap); + pmap_puninit(pmap); + kfree(pmap, M_OBJPMAP); + } + if ((pmap = object->md.pmap_ro) != NULL) { + object->md.pmap_ro = NULL; + kprintf("pmap_object_free: destroying pmap %p in obj %p\n", + pmap, object); + pmap_remove_pages(pmap, + VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS); + pmap->pm_active = 0; + pmap_release(pmap); + pmap_puninit(pmap); + kfree(pmap, M_OBJPMAP); + } +} diff --git a/sys/platform/pc64/x86_64/pmap_inval.c b/sys/platform/pc64/x86_64/pmap_inval.c index f93a713483..874acbd4b4 100644 --- a/sys/platform/pc64/x86_64/pmap_inval.c +++ b/sys/platform/pc64/x86_64/pmap_inval.c @@ -116,6 +116,12 @@ pmap_inval_interlock(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va) lwkt_cpusync_interlock(&info->pir_cpusync); } +void +pmap_inval_invltlb(pmap_inval_info_t info) +{ + info->pir_va = (vm_offset_t)-1; +} + void pmap_inval_deinterlock(pmap_inval_info_t info, pmap_t pmap) { diff --git a/sys/platform/vkernel/include/pmap.h b/sys/platform/vkernel/include/pmap.h index 20974f0fcc..56ad7357e8 100644 --- a/sys/platform/vkernel/include/pmap.h +++ b/sys/platform/vkernel/include/pmap.h @@ -113,6 +113,9 @@ struct md_page { TAILQ_HEAD(,pv_entry) pv_list; }; +struct md_object { +}; + /* * Each machine dependent implementation is expected to * keep certain statistics. 
They may do this anyway they diff --git a/sys/platform/vkernel/platform/pmap.c b/sys/platform/vkernel/platform/pmap.c index cb85225666..e455580b56 100644 --- a/sys/platform/vkernel/platform/pmap.c +++ b/sys/platform/vkernel/platform/pmap.c @@ -375,26 +375,6 @@ pmap_release_callback(struct vm_page *p, void *data) return(0); } -/* - * Retire the given physical map from service. Should only be called if - * the map contains no valid mappings. - * - * No requirements. - */ -void -pmap_destroy(pmap_t pmap) -{ - if (pmap == NULL) - return; - - lwkt_gettoken(&vm_token); - if (--pmap->pm_count == 0) { - pmap_release(pmap); - panic("destroying a pmap is not yet implemented"); - } - lwkt_reltoken(&vm_token); -} - /* * Add a reference to the specified pmap. * @@ -1752,7 +1732,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - boolean_t wired) + boolean_t wired, vm_map_entry_t entry __unused) { vm_paddr_t pa; vpte_t *pte; @@ -2180,7 +2160,8 @@ pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) * No other requirements. */ void -pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) +pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired, + vm_map_entry_t entry __unused) { vpte_t *pte; @@ -3091,3 +3072,15 @@ pmap_kvtom(vm_offset_t va) ptep = KernelPTA + (va >> PAGE_SHIFT); return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); } + +void +pmap_object_init(vm_object_t object) +{ + /* empty */ +} + +void +pmap_object_free(vm_object_t object) +{ + /* empty */ +} diff --git a/sys/platform/vkernel64/include/pmap.h b/sys/platform/vkernel64/include/pmap.h index 2a9ffc2d29..0c8c0643f5 100644 --- a/sys/platform/vkernel64/include/pmap.h +++ b/sys/platform/vkernel64/include/pmap.h @@ -138,6 +138,9 @@ struct md_page { TAILQ_HEAD(,pv_entry) pv_list; }; +struct md_object { +}; + /* * Each machine dependent implementation is expected to * keep certain statistics. They may do this anyway they diff --git a/sys/platform/vkernel64/platform/pmap.c b/sys/platform/vkernel64/platform/pmap.c index e871124caa..2248790393 100644 --- a/sys/platform/vkernel64/platform/pmap.c +++ b/sys/platform/vkernel64/platform/pmap.c @@ -1595,26 +1595,6 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) vm_object_drop(kptobj); } -/* - * Retire the given physical map from service. Should only be called - * if the map contains no valid mappings. - * - * No requirements. - */ -void -pmap_destroy(pmap_t pmap) -{ - if (pmap == NULL) - return; - - lwkt_gettoken(&vm_token); - if (--pmap->pm_count == 0) { - pmap_release(pmap); - panic("destroying a pmap is not yet implemented"); - } - lwkt_reltoken(&vm_token); -} - /* * Add a reference to the specified pmap. * @@ -2200,7 +2180,7 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) */ void pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, - boolean_t wired) + boolean_t wired, vm_map_entry_t entry __unused) { vm_paddr_t pa; pd_entry_t *pde; @@ -2602,7 +2582,8 @@ pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) * No other requirements. 
*/ void -pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired) +pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired, + vm_map_entry_t entry __unused) { pt_entry_t *pte; @@ -3295,3 +3276,15 @@ pmap_kvtom(vm_offset_t va) ptep = vtopte(va); return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); } + +void +pmap_object_init(vm_object_t object) +{ + /* empty */ +} + +void +pmap_object_free(vm_object_t object) +{ + /* empty */ +} diff --git a/sys/vm/pmap.h b/sys/vm/pmap.h index 97c41c551b..315f807985 100644 --- a/sys/vm/pmap.h +++ b/sys/vm/pmap.h @@ -97,6 +97,7 @@ struct thread; struct vm_page; struct vmspace; struct vmspace_entry; +struct vm_map_entry; /* * Most of these variables represent parameters set up by low level MD kernel @@ -143,7 +144,8 @@ kva_p(const void *addr) #endif } -void pmap_change_wiring (pmap_t, vm_offset_t, boolean_t); +void pmap_change_wiring (pmap_t, vm_offset_t, boolean_t, + vm_map_entry_t); void pmap_clear_modify (struct vm_page *m); void pmap_clear_reference (struct vm_page *m); void pmap_collect (void); @@ -151,9 +153,8 @@ void pmap_copy (pmap_t, pmap_t, vm_offset_t, vm_size_t, vm_offset_t); void pmap_copy_page (vm_paddr_t, vm_paddr_t); void pmap_copy_page_frag (vm_paddr_t, vm_paddr_t, size_t bytes); -void pmap_destroy (pmap_t); void pmap_enter (pmap_t, vm_offset_t, struct vm_page *, - vm_prot_t, boolean_t); + vm_prot_t, boolean_t, struct vm_map_entry *); void pmap_enter_quick (pmap_t, vm_offset_t, struct vm_page *); vm_paddr_t pmap_extract (pmap_t pmap, vm_offset_t va); void pmap_growkernel (vm_offset_t, vm_offset_t); @@ -183,6 +184,7 @@ void pmap_kmodify_nc(vm_offset_t va); void pmap_kremove (vm_offset_t); void pmap_kremove_quick (vm_offset_t); void pmap_reference (pmap_t); +void pmap_drop (pmap_t); void pmap_remove (pmap_t, vm_offset_t, vm_offset_t); void pmap_remove_pages (pmap_t, vm_offset_t, vm_offset_t); void pmap_zero_page (vm_paddr_t); @@ -199,6 +201,9 @@ vm_offset_t pmap_addr_hint (vm_object_t obj, vm_offset_t addr, vm_size_t size); void *pmap_kenter_temporary (vm_paddr_t pa, long i); void pmap_init2 (void); struct vm_page *pmap_kvtom(vm_offset_t va); +void pmap_object_init(vm_object_t object); +void pmap_object_free(vm_object_t object); + #endif /* _KERNEL */ diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 86d08076d9..6fcc5decee 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -511,7 +511,7 @@ quick: * Enter the page into the pmap and do pmap-related adjustments. 
*/ vm_page_flag_set(fs.m, PG_REFERENCED); - pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired); + pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired, fs.entry); mycpu->gd_cnt.v_vm_faults++; if (curthread->td_lwp) ++curthread->td_lwp->lwp_ru.ru_minflt; @@ -770,7 +770,7 @@ RetryFault: */ vm_page_flag_set(fs.m, PG_REFERENCED); #if 0 - pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired); + pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired, NULL); mycpu->gd_cnt.v_vm_faults++; if (curthread->td_lwp) ++curthread->td_lwp->lwp_ru.ru_minflt; @@ -1830,7 +1830,7 @@ vm_fault_wire(vm_map_t map, vm_map_entry_t entry, boolean_t user_wire) va -= PAGE_SIZE; if ((pa = pmap_extract(pmap, va)) == 0) continue; - pmap_change_wiring(pmap, va, FALSE); + pmap_change_wiring(pmap, va, FALSE, entry); if (!fictitious) { m = PHYS_TO_VM_PAGE(pa); vm_page_busy_wait(m, FALSE, "vmwrpg"); @@ -1880,7 +1880,7 @@ vm_fault_unwire(vm_map_t map, vm_map_entry_t entry) for (va = start; va < end; va += PAGE_SIZE) { pa = pmap_extract(pmap, va); if (pa != 0) { - pmap_change_wiring(pmap, va, FALSE); + pmap_change_wiring(pmap, va, FALSE, entry); if (!fictitious) { m = PHYS_TO_VM_PAGE(pa); vm_page_busy_wait(m, FALSE, "vmwupg"); @@ -1901,6 +1901,8 @@ vm_fault_unwire(vm_map_t map, vm_map_entry_t entry) * entry corresponding to a main map entry that is wired down). * * No other requirements. + * + * XXX do segment optimization */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, @@ -1968,7 +1970,7 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, */ vm_page_flag_clear(dst_m, PG_ZERO); - pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE); + pmap_enter(dst_map->pmap, vaddr, dst_m, prot, FALSE, dst_entry); /* * Mark it no longer busy, and put it on the active list. @@ -2427,7 +2429,7 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot, */ if (pprot & VM_PROT_WRITE) vm_set_nosync(m, entry); - pmap_enter(pmap, addr, m, pprot, 0); + pmap_enter(pmap, addr, m, pprot, 0, entry); mycpu->gd_cnt.v_vm_faults++; if (curthread->td_lwp) ++curthread->td_lwp->lwp_ru.ru_minflt; @@ -2464,7 +2466,7 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot, } if (pprot & VM_PROT_WRITE) vm_set_nosync(m, entry); - pmap_enter(pmap, addr, m, pprot, 0); + pmap_enter(pmap, addr, m, pprot, 0, entry); mycpu->gd_cnt.v_vm_faults++; if (curthread->td_lwp) ++curthread->td_lwp->lwp_ru.ru_minflt; @@ -2599,7 +2601,7 @@ vm_prefault_quick(pmap_t pmap, vm_offset_t addra, swap_pager_unswapped(m); } } - pmap_enter(pmap, addr, m, prot, 0); + pmap_enter(pmap, addr, m, prot, 0, entry); mycpu->gd_cnt.v_vm_faults++; if (curthread->td_lwp) ++curthread->td_lwp->lwp_ru.ru_minflt; diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index e0cbf5ec28..c822fb848f 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -430,6 +430,7 @@ _vm_object_allocate(objtype_t type, vm_pindex_t size, vm_object_t object) object->swblock_count = 0; RB_INIT(&object->swblock_root); vm_object_lock_init(object); + pmap_object_init(object); vm_object_hold(object); lwkt_gettoken(&vmobj_token); @@ -929,6 +930,11 @@ vm_object_terminate(vm_object_t object) "ref_count=%d", object->ref_count); } + /* + * Cleanup any shared pmaps associated with this object. + */ + pmap_object_free(object); + /* * Now free any remaining pages. For internal objects, this also * removes them from paging queues. 
Don't free wired pages, just diff --git a/sys/vm/vm_object.h b/sys/vm/vm_object.h index 108180e6c3..bf3b3dc5af 100644 --- a/sys/vm/vm_object.h +++ b/sys/vm/vm_object.h @@ -82,7 +82,12 @@ #ifndef _SYS_THREAD_H_ #include <sys/thread.h> #endif +#ifndef _MACHINE_PMAP_H_ +#include <machine/pmap.h> +#endif +#ifndef _MACHINE_ATOMIC_H_ #include <machine/atomic.h> +#endif #ifndef _VM_VM_H_ #include <vm/vm.h> #endif @@ -186,6 +191,7 @@ struct vm_object { RB_HEAD(swblock_rb_tree, swblock) swblock_root; int swblock_count; struct lwkt_token token; + struct md_object md; /* machine specific (typ pmap) */ }; /* -- 2.41.0