gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1991 Regents of the University of California.
	3	* Copyright (c) 1994 John S. Dyson
	4	* Copyright (c) 1994 David Greenman
	5	* Copyright (c) 2003 Peter Wemm
	6	* Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
	7	* Copyright (c) 2008, 2009 The DragonFly Project.
	8	* Copyright (c) 2008, 2009 Jordan Gordeev.
	9	* Copyright (c) 2011 Matthew Dillon
	10	* All rights reserved.
	11	*
	12	* This code is derived from software contributed to Berkeley by
	13	* the Systems Programming Group of the University of Utah Computer
	14	* Science Department and William Jolitz of UUNET Technologies Inc.
	15	*
	16	* Redistribution and use in source and binary forms, with or without
	17	* modification, are permitted provided that the following conditions
	18	* are met:
	19	* 1. Redistributions of source code must retain the above copyright
	20	* notice, this list of conditions and the following disclaimer.
	21	* 2. Redistributions in binary form must reproduce the above copyright
	22	* notice, this list of conditions and the following disclaimer in the
	23	* documentation and/or other materials provided with the distribution.
	24	* 3. All advertising materials mentioning features or use of this software
	25	* must display the following acknowledgement:
	26	* This product includes software developed by the University of
	27	* California, Berkeley and its contributors.
	28	* 4. Neither the name of the University nor the names of its contributors
	29	* may be used to endorse or promote products derived from this software
	30	* without specific prior written permission.
	31	*
	32	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	33	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	34	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	35	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	36	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	37	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	38	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	39	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	40	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	41	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	42	* SUCH DAMAGE.
	43	*/
	44	/*
	45	* Manage physical address maps for x86-64 systems.
	46	*/
	47
	48	#if JG
	49	#include "opt_disable_pse.h"
	50	#include "opt_pmap.h"
	51	#endif
	52	#include "opt_msgbuf.h"
	53
	54	#include <sys/param.h>
	55	#include <sys/systm.h>
	56	#include <sys/kernel.h>
	57	#include <sys/proc.h>
	58	#include <sys/msgbuf.h>
	59	#include <sys/vmmeter.h>
	60	#include <sys/mman.h>
	61
	62	#include <vm/vm.h>
	63	#include <vm/vm_param.h>
	64	#include <sys/sysctl.h>
	65	#include <sys/lock.h>
	66	#include <vm/vm_kern.h>
	67	#include <vm/vm_page.h>
	68	#include <vm/vm_map.h>
	69	#include <vm/vm_object.h>
	70	#include <vm/vm_extern.h>
	71	#include <vm/vm_pageout.h>
	72	#include <vm/vm_pager.h>
	73	#include <vm/vm_zone.h>
	74
	75	#include <sys/user.h>
	76	#include <sys/thread2.h>
	77	#include <sys/sysref2.h>
	78	#include <sys/spinlock2.h>
	79	#include <vm/vm_page2.h>
	80
	81	#include <machine/cputypes.h>
	82	#include <machine/md_var.h>
	83	#include <machine/specialreg.h>
	84	#include <machine/smp.h>
	85	#include <machine_base/apic/apicreg.h>
	86	#include <machine/globaldata.h>
	87	#include <machine/pmap.h>
	88	#include <machine/pmap_inval.h>
	89	#include <machine/inttypes.h>
	90
	91	#include <ddb/ddb.h>
	92
	93	#define PMAP_KEEP_PDIRS
	94	#ifndef PMAP_SHPGPERPROC
	95	#define PMAP_SHPGPERPROC 2000
	96	#endif
	97
	98	#if defined(DIAGNOSTIC)
	99	#define PMAP_DIAGNOSTIC
	100	#endif
	101
	102	#define MINPV 2048
	103
	104	/*
	105	* pmap debugging will report who owns a pv lock when blocking. With PMAP_DEBUG defined the pv_*() wrappers below append __func__/__LINE__ to each call; otherwise the extra arguments expand to nothing.
	106	*/
	107	#ifdef PMAP_DEBUG
	108
	109	#define PMAP_DEBUG_DECL ,const char *func, int lineno /* extra parameters for the _pv_*() prototypes */
	110	#define PMAP_DEBUG_ARGS , __func__, __LINE__ /* extra arguments supplied at each call site */
	111	#define PMAP_DEBUG_COPY , func, lineno /* forwards the received func/lineno to a nested call */
	112
	113	#define pv_get(pmap, pindex) _pv_get(pmap, pindex \
	114	PMAP_DEBUG_ARGS)
	115	#define pv_lock(pv) _pv_lock(pv \
	116	PMAP_DEBUG_ARGS)
	117	#define pv_hold_try(pv) _pv_hold_try(pv \
	118	PMAP_DEBUG_ARGS)
	119	#define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp \
	120	PMAP_DEBUG_ARGS)
	121
	122	#else
	123
	124	#define PMAP_DEBUG_DECL /* empty: no debug parameters */
	125	#define PMAP_DEBUG_ARGS /* empty */
	126	#define PMAP_DEBUG_COPY /* empty */
	127
	128	#define pv_get(pmap, pindex) _pv_get(pmap, pindex)
	129	#define pv_lock(pv) _pv_lock(pv)
	130	#define pv_hold_try(pv) _pv_hold_try(pv)
	131	#define pv_alloc(pmap, pindex, isnewp) _pv_alloc(pmap, pindex, isnewp)
	132
	133	#endif
	134
	135	/*
	136	 * Get PDEs and PTEs for user/kernel address space. The pmap_*_v/w/m/u() predicates take a POINTER to the entry and test one flag bit in it.
	137	 */
	138	#define pdir_pde(m, v) ((m)[(vm_offset_t)(v) >> PDRSHIFT])
	139
	140	#define pmap_pde_v(pte) ((*(pd_entry_t *)(pte) & PG_V) != 0)	/* entry valid */
	141	#define pmap_pte_w(pte) ((*(pt_entry_t *)(pte) & PG_W) != 0)	/* PG_W set */
	142	#define pmap_pte_m(pte) ((*(pt_entry_t *)(pte) & PG_M) != 0)	/* PG_M (modified) set */
	143	#define pmap_pte_u(pte) ((*(pt_entry_t *)(pte) & PG_A) != 0)	/* PG_A (accessed) set */
	144	#define pmap_pte_v(pte) ((*(pt_entry_t *)(pte) & PG_V) != 0)	/* entry valid */
	145
	146	/*
	147	 * Given a map and a machine independent protection code,
	148	 * return the matching protection_codes[] entry. The map argument is not used by the macro.
	149	 */
	150	#define pte_prot(m, p) \
	151		(protection_codes[(p) & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
	152	static int protection_codes[8];	/* one slot per READ/WRITE/EXECUTE combination */
	153
	154	struct pmap kernel_pmap; /* statically allocated kernel pmap, filled in by pmap_bootstrap() */
	155	static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
	156
	157	vm_paddr_t avail_start; /* PA of first available physical page */
	158	vm_paddr_t avail_end; /* PA of last available physical page */
	159	vm_offset_t virtual2_start; /* cutout free area prior to kernel start */
	160	vm_offset_t virtual2_end;
	161	vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */
	162	vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
	163	vm_offset_t KvaStart; /* VA start of KVA space */
	164	vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */
	165	vm_offset_t KvaSize; /* max size of kernel virtual address space */
	166	static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */
	167	static int pgeflag; /* PG_G or-in */
	168	static int pseflag; /* PG_PS or-in */
	169
	170	static int ndmpdp; /* number of 1GB direct-map units; computed in create_pagetables() */
	171	static vm_paddr_t dmaplimit; /* ndmpdp << PDPSHIFT; set in create_pagetables() */
	172	static int nkpt; /* NOTE(review): read by create_pagetables() but never assigned in the visible code */
	173	vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
	174
	175	static uint64_t KPTbase; /* page table pages for the KERNBASE area; allocated in create_pagetables() */
	176	static uint64_t KPTphys; /* page table pages for the start of kvm; allocated in create_pagetables() */
	177	static uint64_t KPDphys; /* phys addr of kernel level 2 */
	178	static uint64_t KPDbase; /* phys addr of kernel level 2 @ KERNBASE */
	179	uint64_t KPDPphys; /* phys addr of kernel level 3 */
	180	uint64_t KPML4phys; /* phys addr of kernel level 4 */
	181
	182	static uint64_t DMPDphys; /* phys addr of direct mapped level 2 */
	183	static uint64_t DMPDPphys; /* phys addr of direct mapped level 3 */
	184
	185	/*
	186	* Data for the pv entry allocation mechanism
	187	*/
	188	static vm_zone_t pvzone;		/* points at pvzone_store once pmap_init() has run */
	189	static struct vm_zone pvzone_store;
	190	static struct vm_object pvzone_obj;	/* handed to zinitna() in pmap_init2() */
	191	static int pv_entry_max=0, pv_entry_high_water=0;	/* computed in pmap_init2() */
	192	static int pmap_pagedaemon_waken = 0;
	193	static struct pv_entry *pvinit;		/* initial pv_entry array allocated in pmap_init() */
	194
	195	/*
	196	 * All those kernel PT submaps that BSD is so fond of
	197	 */
	198	pt_entry_t *CMAP1 = 0, *ptmmap;		/* PTE slots; assigned by SYSMAP() in pmap_bootstrap() */
	199	caddr_t CADDR1 = 0, ptvmmap = 0;	/* the VAs those PTE slots map */
	200	static pt_entry_t *msgbufmap;
	201	struct msgbuf *msgbufp=0;
	202
	203	/*
	204	 * Crashdump maps.
	205	 */
	206	static pt_entry_t *pt_crashdumpmap;
	207	static caddr_t crashdumpmap;
	208
	209	static int pmap_yield_count = 64;
	210	SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
	211		&pmap_yield_count, 0, "Yield during init_pt/release");
	212
	213	#define DISABLE_PSE	/* compiles out the PSE block in pmap_bootstrap() */
	214
	215	static void pv_hold(pv_entry_t pv);
	216	static int _pv_hold_try(pv_entry_t pv
	217					PMAP_DEBUG_DECL);
	218	static void pv_drop(pv_entry_t pv);
	219	static void _pv_lock(pv_entry_t pv
	220					PMAP_DEBUG_DECL);
	221	static void pv_unlock(pv_entry_t pv);
	222	static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
	223					PMAP_DEBUG_DECL);
	224	static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex
	225					PMAP_DEBUG_DECL);
	226	static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp);
	227	static pv_entry_t pv_find(pmap_t pmap, vm_pindex_t pindex);
	228	static void pv_put(pv_entry_t pv);
	229	static void pv_free(pv_entry_t pv);
	230	static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
	231	static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
	232			pv_entry_t *pvpp);
	233	static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
	234			struct pmap_inval_info *info);
	235	static vm_page_t pmap_remove_pv_page(pv_entry_t pv);
	236
	237	static void pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
	238			pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
	239			pt_entry_t *ptep, void *arg __unused);
	240	static void pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info,
	241			pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
	242			pt_entry_t *ptep, void *arg __unused);
	243
	244	static void i386_protection_init (void);
	245	static void create_pagetables(vm_paddr_t *firstaddr);
	246	static void pmap_remove_all (vm_page_t m);
	247	static boolean_t pmap_testbit (vm_page_t m, int bit);
	248
	249	static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
	250	static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
	251
	252	static unsigned pdir4mb;	/* cleared in pmap_bootstrap(); only set when the PSE block is compiled in */
	253
	254	static int
	255	pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
	256	{
	257		/*
	258		 * Three-way ordering on pv_pindex: -1, 0 or 1.
	259		 */
	260		if (pv1->pv_pindex == pv2->pv_pindex)
	261			return(0);
	262		return((pv1->pv_pindex < pv2->pv_pindex) ? -1 : 1); }
	263
	264	RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
	265	pv_entry_compare, vm_pindex_t, pv_pindex); /* presumably emits the pv_entry_rb_tree routines ordered by pv_entry_compare and keyed on pv_pindex; confirm against the tree header */
	266
	267	/*
	268	* Move the kernel virtual free pointer to the next
	269	* 2MB. This is used to help improve performance
	270	* by using a large (2MB) page for much of the kernel
	271	* (.text, .data, .bss)
	272	*/
	273	static
	274	vm_offset_t
	275	pmap_kmem_choose(vm_offset_t addr)
	276	{
	277		/* Round addr up to the next NBPDR boundary; an aligned addr is unchanged. */
	278		vm_offset_t bumped = addr + (NBPDR - 1);
	279
	280		return (bumped & ~(NBPDR - 1));
	281	}
	282
	283	/*
	284	* pmap_pte_quick:
	285	*
	286	* Super fast pmap_pte routine best used when scanning the pv lists.
	287	* This eliminates many coarse-grained invltlb calls. Note that many of
	288	* the pv list scans are across different pmaps and it is very wasteful
	289	* to do an entire invltlb when checking a single mapping.
	290	*/
	291	static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va); /* forward declaration; defined further down */
	292
	293	static
	294	pt_entry_t *
	295	pmap_pte_quick(pmap_t pmap, vm_offset_t va)
	296	{
	297	return pmap_pte(pmap, va); /* thin wrapper: same result as pmap_pte(), including NULL when no valid PT exists */
	298	}
	299
	300	/*
	301	* Returns the pindex of a page table entry (representing a terminal page).
	302	* There are NUPTE_TOTAL page table entries possible (a huge number)
	303	*
	304	* x86-64 has a 48-bit address space, where bit 47 is sign-extended out.
	305	* We want to properly translate negative KVAs.
	306	*/
	307	static __inline
	308	vm_pindex_t
	309	pmap_pte_pindex(vm_offset_t va)
	310	{
	311	return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1)); /* page number of va, masked so the sign-extended upper bits are dropped (assumes NUPTE_TOTAL is a power of two) */
	312	}
	313
	314	/*
	315	* Returns the pindex of a page table.
	316	*/
	317	static __inline
	318	vm_pindex_t
	319	pmap_pt_pindex(vm_offset_t va)
	320	{
	321	return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1))); /* page-table indexes start right after the NUPTE_TOTAL pte indexes */
	322	}
	323
	324	/*
	325	* Returns the pindex of a page directory.
	326	*/
	327	static __inline
	328	vm_pindex_t
	329	pmap_pd_pindex(vm_offset_t va)
	330	{
	331	return (NUPTE_TOTAL + NUPT_TOTAL + /* page-directory indexes follow the pte and pt ranges */
	332	((va >> PDPSHIFT) & (NUPD_TOTAL - 1)));
	333	}
	334
	335	static __inline
	336	vm_pindex_t
	337	pmap_pdp_pindex(vm_offset_t va)
	338	{
	339	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + /* pindex of the PDP page covering va; follows the pte, pt and pd ranges */
	340	((va >> PML4SHIFT) & (NUPDP_TOTAL - 1)));
	341	}
	342
	343	static __inline
	344	vm_pindex_t
	345	pmap_pml4_pindex(void)
	346	{
	347	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL); /* single pindex for the PML4 page, placed after all lower-level ranges */
	348	}
	349
	350	/*
	351	* Return various clipped indexes for a given VA
	352	*
	353	* Returns the index of a pte in a page table, representing a terminal
	354	* page.
	355	*/
	356	static __inline
	357	vm_pindex_t
	358	pmap_pte_index(vm_offset_t va)
	359	{
	360	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); /* keep only the low NPTEPGSHIFT bits of the page number: slot within one page table */
	361	}
	362
	363	/*
	364	* Returns the index of a pt in a page directory, representing a page
	365	* table.
	366	*/
	367	static __inline
	368	vm_pindex_t
	369	pmap_pt_index(vm_offset_t va)
	370	{
	371	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); /* slot within one page directory */
	372	}
	373
	374	/*
	375	* Returns the index of a pd in a page directory page, representing a page
	376	* directory.
	377	*/
	378	static __inline
	379	vm_pindex_t
	380	pmap_pd_index(vm_offset_t va)
	381	{
	382	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); /* slot within one PDP page */
	383	}
	384
	385	/*
	386	* Returns the index of a pdp in the pml4 table, representing a page
	387	* directory page.
	388	*/
	389	static __inline
	390	vm_pindex_t
	391	pmap_pdp_index(vm_offset_t va)
	392	{
	393	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); /* slot within the PML4 */
	394	}
	395
	396	/*
	397	* Generic procedure to index a pte from a pt, pd, or pdp.
	398	*/
	399	static
	400	void *
	401	pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex)
	402	{
	403	pt_entry_t *pte;
	404
	405	pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m)); /* view the page behind pv->pv_m as an entry array through the direct map */
	406	return(&pte[pindex]); /* pindex is the slot within that page; it is not range-checked here */
	407	}
	408
	409	/*
	410	* Return pointer to PDP slot in the PML4
	411	*/
	412	static __inline
	413	pml4_entry_t *
	414	pmap_pdp(pmap_t pmap, vm_offset_t va)
	415	{
	416	return (&pmap->pm_pml4[pmap_pdp_index(va)]); /* address of the slot only; the entry's PG_V bit is not checked here */
	417	}
	418
	419	/*
	420	* Return pointer to PD slot in the PDP given a pointer to the PDP
	421	*/
	422	static __inline
	423	pdp_entry_t *
	424	pmap_pdp_to_pd(pml4_entry_t *pdp, vm_offset_t va)
	425	{
	426		pdp_entry_t *pd;
	427
	428		pd = (pdp_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);	/* frame held in *pdp, reached via the direct map; caller must have checked PG_V */
	429		return (&pd[pmap_pd_index(va)]);
	430	}
	431
	432	/*
	433	* Return pointer to PD slot in the PDP
	434	**/
	435	static __inline
	436	pdp_entry_t *
	437	pmap_pd(pmap_t pmap, vm_offset_t va)
	438	{
	439	pml4_entry_t *pdp;
	440
	441	pdp = pmap_pdp(pmap, va);
	442	if ((*pdp & PG_V) == 0) /* no valid PDP page for this va */
	443	return NULL;
	444	return (pmap_pdp_to_pd(pdp, va)); /* returned slot may itself be invalid; callers test PG_V */
	445	}
	446
	447	/*
	448	* Return pointer to PT slot in the PD given a pointer to the PD
	449	*/
	450	static __inline
	451	pd_entry_t *
	452	pmap_pd_to_pt(pdp_entry_t *pd, vm_offset_t va)
	453	{
	454		pd_entry_t *pt;
	455
	456		pt = (pd_entry_t *)PHYS_TO_DMAP(*pd & PG_FRAME);	/* frame held in *pd, reached via the direct map; caller must have checked PG_V */
	457		return (&pt[pmap_pt_index(va)]);
	458	}
	459
	460	/*
	461	* Return pointer to PT slot in the PD
	462	*/
	463	static __inline
	464	pd_entry_t *
	465	pmap_pt(pmap_t pmap, vm_offset_t va)
	466	{
	467		pdp_entry_t *pd;
	468
	469		pd = pmap_pd(pmap, va);
	470		if (pd == NULL || (*pd & PG_V) == 0)	/* missing PDP page or invalid PD entry */
	471			return NULL;
	472		return (pmap_pd_to_pt(pd, va));		/* returned slot may itself be invalid; callers test PG_V */
	473	}
	474
	475	/*
	476	* Return pointer to PTE slot in the PT given a pointer to the PT
	477	*/
	478	static __inline
	479	pt_entry_t *
	480	pmap_pt_to_pte(pd_entry_t *pt, vm_offset_t va)
	481	{
	482		pt_entry_t *pte;
	483
	484		pte = (pt_entry_t *)PHYS_TO_DMAP(*pt & PG_FRAME);	/* frame held in *pt, reached via the direct map; caller must have checked PG_V and !PG_PS */
	485		return (&pte[pmap_pte_index(va)]);
	486	}
	487
	488	/*
	489	* Return pointer to PTE slot in the PT
	490	*/
	491	static __inline
	492	pt_entry_t *
	493	pmap_pte(pmap_t pmap, vm_offset_t va)
	494	{
	495		pd_entry_t *pt;
	496
	497		pt = pmap_pt(pmap, va);
	498		if (pt == NULL || (*pt & PG_V) == 0)	/* no valid page table covers va */
	499			return NULL;
	500		if ((*pt & PG_PS) != 0)			/* large page: the PD entry is the terminal entry */
	501			return ((pt_entry_t *)pt);
	502		return (pmap_pt_to_pte(pt, va));	/* slot may be invalid; callers test PG_V */
	503	}
	504
	505	/*
	506	* Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is
	507	* the PT layer. This will speed up core pmap operations considerably.
	508	*/
	509	static __inline
	510	void
	511	pv_cache(pv_entry_t pv, vm_pindex_t pindex)
	512	{
	513		if (pindex >= pmap_pt_pindex(0) && pindex < pmap_pd_pindex(0))	/* PT range only: pmap_pd_pindex(0) is the first PD index */
	514			pv->pv_pmap->pm_pvhint = pv;	/* remember this PT pv as the pmap's lookup hint */
	515	}
	516
	517
	518	/*
	519	* KVM - return address of PT slot in PD
	520	*/
	521	static __inline
	522	pd_entry_t *
	523	vtopt(vm_offset_t va)
	524	{
	525	uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + /* keep the PD, PDP and PML4 index bits */
	526	NPML4EPGSHIFT)) - 1);
	527
	528	return (PDmap + ((va >> PDRSHIFT) & mask)); /* PDmap is declared elsewhere; presumably the recursive-mapping window - confirm. No validity check is made */
	529	}
	530
	531	/*
	532	* KVM - return address of PTE slot in PT
	533	*/
	534	static __inline
	535	pt_entry_t *
	536	vtopte(vm_offset_t va)
	537	{
	538	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + /* keep the PT, PD, PDP and PML4 index bits */
	539	NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
	540
	541	return (PTmap + ((va >> PAGE_SHIFT) & mask)); /* PTmap is declared elsewhere; presumably the recursive-mapping window - confirm. No validity check is made */
	542	}
	543
	544	static uint64_t
	545	allocpages(vm_paddr_t *firstaddr, long n)	/* carve n zeroed pages off *firstaddr; returns their base address */
	546	{
	547		uint64_t ret;
	548
	549		ret = *firstaddr;
	550		bzero((void *)ret, n * PAGE_SIZE);	/* address used directly as a pointer (caller runs V=P) */
	551		*firstaddr += n * PAGE_SIZE;		/* advance the caller's cursor, not the local pointer */
	552		return (ret);
	553	}
	554
	555	static
	556	void
	557	create_pagetables(vm_paddr_t *firstaddr)	/* builds the boot page tables; *firstaddr is advanced past every page allocated */
	558	{
	559		long i;		/* must be 64 bits */
	560		long nkpt_base;
	561		long nkpt_phys;
	562		int j;
	563
	564		/*
	565		 * We are running (mostly) V=P at this point
	566		 *
	567		 * Calculate NKPT - number of kernel page tables. We have to
	568		 * accommodate preallocation of the vm_page_array, dump bitmap,
	569		 * MSGBUF_SIZE, and other stuff. Be generous.
	570		 *
	571		 * Maxmem is in pages.
	572		 *
	573		 * ndmpdp is the number of 1GB pages we wish to map.
	574		 */
	575		ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	576		if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
	577			ndmpdp = 4;
	578		KKASSERT(ndmpdp <= NKPDPE * NPDEPG);
	579
	580		/*
	581		 * Starting at the beginning of kvm (not KERNBASE).
	582		 */
	583		nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR;
	584		nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR;
	585		nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E +
	586			       ndmpdp) + 511) / 512;
	587		nkpt_phys += 128;
	588
	589		/*
	590		 * Starting at KERNBASE - map 2G worth of page table pages.
	591		 * KERNBASE is offset -2G from the end of kvm.
	592		 */
	593		nkpt_base = (NPDPEPG - KPDPI) * NPTEPG;	/* typically 2 x 512 */
	594
	595		/*
	596		 * Allocate pages
	597		 */
	598		KPTbase = allocpages(firstaddr, nkpt_base);
	599		KPTphys = allocpages(firstaddr, nkpt_phys);
	600		KPML4phys = allocpages(firstaddr, 1);
	601		KPDPphys = allocpages(firstaddr, NKPML4E);
	602		KPDphys = allocpages(firstaddr, NKPDPE);
	603
	604		/*
	605		 * Calculate the page directory base for KERNBASE,
	606		 * that is where we start populating the page table pages.
	607		 * Basically this is the end - 2.
	608		 */
	609		KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT);
	610
	611		DMPDPphys = allocpages(firstaddr, NDMPML4E);
	612		if ((amd_feature & AMDID_PAGE1GB) == 0)		/* PD pages are only needed for the 2MB direct map */
	613			DMPDphys = allocpages(firstaddr, ndmpdp);
	614		dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
	615
	616		/*
	617		 * Fill in the underlying page table pages for the area around
	618		 * KERNBASE. This remaps low physical memory to KERNBASE.
	619		 *
	620		 * Mapped PG_RW | PG_V | PG_G from zero to *firstaddr
	621		 * XXX not fully used, underneath 2M pages
	622		 */
	623		for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
	624			((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
	625			((pt_entry_t *)KPTbase)[i] |= PG_RW | PG_V | PG_G;
	626		}
	627
	628		/*
	629		 * Now map the initial kernel page tables. One block of page
	630		 * tables is placed at the beginning of kernel virtual memory,
	631		 * and another block is placed at KERNBASE to map the kernel binary,
	632		 * data, bss, and initial pre-allocations.
	633		 */
	634		for (i = 0; i < nkpt_base; i++) {
	635			((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
	636			((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V;
	637		}
	638		for (i = 0; i < nkpt_phys; i++) {
	639			((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
	640			((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
	641		}
	642
	643		/*
	644		 * Map from zero to end of allocations using 2M pages as an
	645		 * optimization. This will bypass some of the KPTBase pages
	646		 * above in the KERNBASE area.
	647		 */
	648		for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
	649			((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
	650			((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V | PG_PS | PG_G;
	651		}
	652
	653		/*
	654		 * And connect up the PD to the PDP. The kernel pmap is expected
	655		 * to pre-populate all of its PDs. See NKPDPE in vmparam.h.
	656		 */
	657		for (i = 0; i < NKPDPE; i++) {
	658			((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] =
	659					KPDphys + (i << PAGE_SHIFT);
	660			((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |=
	661					PG_RW | PG_V | PG_U;
	662		}
	663
	664		/*
	665		 * Now set up the direct map space using either 2MB or 1GB pages
	666		 * Preset PG_M and PG_A because demotion expects it.
	667		 *
	668		 * When filling in entries in the PD pages make sure any excess
	669		 * entries are set to zero as we allocated enough PD pages
	670		 */
	671		if ((amd_feature & AMDID_PAGE1GB) == 0) {
	672			for (i = 0; i < NPDEPG * ndmpdp; i++) {
	673				((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
	674				((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS |
	675							       PG_G | PG_M | PG_A;
	676			}
	677
	678			/*
	679			 * And the direct map space's PDP
	680			 */
	681			for (i = 0; i < ndmpdp; i++) {
	682				((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
	683								(i << PAGE_SHIFT);
	684				((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
	685			}
	686		} else {
	687			for (i = 0; i < ndmpdp; i++) {
	688				((pdp_entry_t *)DMPDPphys)[i] =
	689						(vm_paddr_t)i << PDPSHIFT;
	690				((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS |
	691								 PG_G | PG_M | PG_A;
	692			}
	693		}
	694
	695		/* And recursively map PML4 to itself in order to get PTmap */
	696		((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
	697		((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
	698
	699		/*
	700		 * Connect the Direct Map slots up to the PML4. The NDMPML4E PDP pages were allocated contiguously, so slot j points at DMPDPphys + j pages.
	701		 */
	702		for (j = 0; j < NDMPML4E; ++j) {
	703			((pdp_entry_t *)KPML4phys)[DMPML4I + j] =
	704				(DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) |
	705				PG_RW | PG_V | PG_U;
	706		}
	707
	708		/*
	709		 * Connect the KVA slot up to the PML4
	710		 */
	711		((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
	712		((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
	713	}
	714
	715	/*
	716	* Bootstrap the system enough to run with virtual memory.
	717	*
	718	* On the i386 this is called after mapping has already been enabled
	719	* and just syncs the pmap module with what has already been done.
	720	* [We can't call it easily with mapping off since the kernel is not
	721	* mapped with PA == VA, hence we would have to relocate every address
	722	* from the linked base (virtual) address "KERNBASE" to the actual
	723	* (physical) address starting relative to 0]
	724	*/
	725	void
	726	pmap_bootstrap(vm_paddr_t *firstaddr)	/* *firstaddr is advanced by create_pagetables() */
	727	{
	728		vm_offset_t va;
	729		pt_entry_t *pte;
	730		struct mdglobaldata *gd;	/* assigned below but not otherwise used in this function */
	731		int pg;				/* assigned below but not otherwise used in this function */
	732
	733		KvaStart = VM_MIN_KERNEL_ADDRESS;
	734		KvaEnd = VM_MAX_KERNEL_ADDRESS;
	735		KvaSize = KvaEnd - KvaStart;
	736
	737		avail_start = *firstaddr;
	738
	739		/*
	740		 * Create an initial set of page tables to run the kernel in.
	741		 */
	742		create_pagetables(firstaddr);
	743
	744		virtual2_start = KvaStart;
	745		virtual2_end = PTOV_OFFSET;
	746
	747		virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr;
	748		virtual_start = pmap_kmem_choose(virtual_start);
	749
	750		virtual_end = VM_MAX_KERNEL_ADDRESS;
	751
	752		/* XXX do %cr0 as well */
	753		load_cr4(rcr4() | CR4_PGE | CR4_PSE);
	754		load_cr3(KPML4phys);
	755
	756		/*
	757		 * Initialize protection array.
	758		 */
	759		i386_protection_init();
	760
	761		/*
	762		 * The kernel's pmap is statically allocated so we don't have to use
	763		 * pmap_create, which is unlikely to work correctly at this part of
	764		 * the boot sequence (XXX and which no longer exists).
	765		 */
	766		kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
	767		kernel_pmap.pm_count = 1;
	768		kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
	769		RB_INIT(&kernel_pmap.pm_pvroot);
	770		spin_init(&kernel_pmap.pm_spin);
	771		lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok");
	772
	773		/*
	774		 * Reserve some special page table entries/VA space for temporary
	775		 * mapping of pages. SYSMAP hands out n pages of VA and the matching n PTE slots.
	776		 */
	777	#define SYSMAP(c, p, v, n) \
	778		v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
	779
	780		va = virtual_start;
	781		pte = vtopte(va);
	782
	783		/*
	784		 * CMAP1/CMAP2 are used for zeroing and copying pages.
	785		 */
	786		SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	787
	788		/*
	789		 * Crashdump maps.
	790		 */
	791		SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
	792
	793		/*
	794		 * ptvmmap is used for reading arbitrary physical pages via
	795		 * /dev/mem.
	796		 */
	797		SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
	798
	799		/*
	800		 * msgbufp is used to map the system message buffer.
	801		 * XXX msgbufmap is not used.
	802		 */
	803		SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
	804		       atop(round_page(MSGBUF_SIZE)))
	805
	806		virtual_start = va;
	807
	808		*CMAP1 = 0;
	809
	810		/*
	811		 * PG_G is terribly broken on SMP because we IPI invltlb's in some
	812		 * cases rather than invl1pg. Actually, I don't even know why it
	813		 * works under UP because self-referential page table mappings
	814		 */
	815	#ifdef SMP
	816		pgeflag = 0;
	817	#else
	818		if (cpu_feature & CPUID_PGE)
	819			pgeflag = PG_G;
	820	#endif
	821
	822		/*
	823		 * Initialize the 4MB page size flag
	824		 */
	825		pseflag = 0;
	826		/*
	827		 * The 4MB page version of the initial
	828		 * kernel page mapping.
	829		 */
	830		pdir4mb = 0;
	831
	832	#if !defined(DISABLE_PSE)
	833		if (cpu_feature & CPUID_PSE) {
	834			pt_entry_t ptditmp;
	835			/*
	836			 * Note that we have enabled PSE mode
	837			 */
	838			pseflag = PG_PS;
	839			ptditmp = *(PTmap + x86_64_btop(KERNBASE));
	840			ptditmp &= ~(NBPDR - 1);
	841			ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
	842			pdir4mb = ptditmp;
	843
	844	#ifndef SMP
	845			/*
	846			 * Enable the PSE mode. If we are SMP we can't do this
	847			 * now because the APs will not be able to use it when
	848			 * they boot up.
	849			 */
	850			load_cr4(rcr4() | CR4_PSE);
	851
	852			/*
	853			 * We can do the mapping here for the single processor
	854			 * case. We simply ignore the old page table page from
	855			 * now on.
	856			 */
	857			/*
	858			 * For SMP, we still need 4K pages to bootstrap APs,
	859			 * PSE will be enabled as soon as all APs are up.
	860			 */
	861			PTD[KPTDI] = (pd_entry_t)ptditmp;
	862			cpu_invltlb();
	863	#endif
	864		}
	865	#endif
	866
	867		/*
	868		 * We need to finish setting up the globaldata page for the BSP.
	869		 * locore has already populated the page table for the mdglobaldata
	870		 * portion.
	871		 */
	872		pg = MDGLOBALDATA_BASEALLOC_PAGES;
	873		gd = &CPU_prvspace[0].mdglobaldata;
	874
	875		cpu_invltlb();
	876	}
	877
	878	#ifdef SMP
	879	/*
	880	 * Set 4mb pdir for mp startup: enable CR4_PSE when pmap_bootstrap() recorded PSE use.
	881	 */
	882	void
	883	pmap_set_opt(void)
	884	{
	885		if (pseflag && (cpu_feature & CPUID_PSE)) {
	886			load_cr4(rcr4() | CR4_PSE);
	887			if (pdir4mb && mycpu->gd_cpuid == 0) {	/* only on BSP */
	888				cpu_invltlb();
	889			}
	890		}
	891	}
	892	#endif
	893
	894	/*
	895	* Initialize the pmap module.
	896	* Called by vm_init, to initialize any structures that the pmap
	897	* system needs to map virtual memory.
	898	* pmap_init has been enhanced to support in a fairly consistent
	899	* way, discontiguous physical memory.
	900	*/
	901	void
	902	pmap_init(void)
	903	{
	904	int i;
	905	int initial_pvs;
	906
	907	/*
	908	* Allocate memory for random pmap data structures. Includes the
	909	* pv_head_table.
	910	*/
	911
	912	for (i = 0; i < vm_page_array_size; i++) { /* give every vm_page an empty pv list */
	913	vm_page_t m;
	914
	915	m = &vm_page_array[i];
	916	TAILQ_INIT(&m->md.pv_list);
	917	}
	918
	919	/*
	920	* init the pv free list
	921	*/
	922	initial_pvs = vm_page_array_size; /* one pv_entry per page, but never fewer than MINPV */
	923	if (initial_pvs < MINPV)
	924	initial_pvs = MINPV;
	925	pvzone = &pvzone_store;
	926	pvinit = (void *)kmem_alloc(&kernel_map, /* NOTE(review): the result is not checked before being handed to zbootinit() */
	927	initial_pvs * sizeof (struct pv_entry));
	928	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry),
	929	pvinit, initial_pvs);
	930
	931	/*
	932	* Now it is safe to enable pv_table recording.
	933	*/
	934	pmap_initialized = TRUE;
	935	}
	936
	937	/*
	938	* Initialize the address space (zone) for the pv_entries. Set a
	939	* high water mark so that the system can recover from excessive
	940	* numbers of pv entries.
	941	*/
	942	void
	943	pmap_init2(void)
	944	{
	945	int shpgperproc = PMAP_SHPGPERPROC; /* default; may be overridden by the vm.pmap.shpgperproc tunable */
	946	int entry_max;
	947
	948	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	949	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	950	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max); /* fetched after the computation, so the tunable can replace the computed value */
	951	pv_entry_high_water = 9 * (pv_entry_max / 10); /* roughly 90% of the maximum */
	952
	953	/*
	954	* Subtract out pages already installed in the zone (hack)
	955	*/
	956	entry_max = pv_entry_max - vm_page_array_size;
	957	if (entry_max <= 0) /* keep the zone size positive */
	958	entry_max = 1;
	959
	960	zinitna(pvzone, &pvzone_obj, NULL, 0, entry_max, ZONE_INTERRUPT, 1);
	961	}
	962
	963
	964	/***************************************************
	965	* Low level helper routines.....
	966	***************************************************/
	967
	968	/*
	969	* this routine defines the region(s) of memory that should
	970	* not be tested for the modified bit.
	971	*/
	972	static __inline
	973	int
	974	pmap_track_modified(vm_pindex_t pindex)	/* returns 1 when the page lies outside [clean_sva, clean_eva), else 0 */
	975	{
	976		vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT;	/* pte pindex back to a virtual address */
	977		if ((va < clean_sva) || (va >= clean_eva))
	978			return 1;
	979		else
	980			return 0;
	981	}
	982
	983	/*
	984	* Extract the physical page address associated with the map/VA pair.
	985	* The page must be wired for this to work reliably.
	986	*
	987	* XXX for the moment we're using pv_find() instead of pv_get(), as
	988	* callers might be expecting non-blocking operation.
	989	*/
	990	vm_paddr_t
	991	pmap_extract(pmap_t pmap, vm_offset_t va)
	992	{
	993		vm_paddr_t rtval;
	994		pv_entry_t pt_pv;
	995		pt_entry_t *ptep;
	996
	997		rtval = 0;	/* 0 is returned when no valid mapping is found */
	998		if (va >= VM_MAX_USER_ADDRESS) {
	999			/*
	1000			 * Kernel page directories might be direct-mapped and
	1001			 * there is typically no PV tracking of pte's
	1002			 */
	1003			pd_entry_t *pt;
	1004
	1005			pt = pmap_pt(pmap, va);
	1006			if (pt && (*pt & PG_V)) {
	1007				if (*pt & PG_PS) {
	1008					rtval = *pt & PG_PS_FRAME;
	1009					rtval |= va & PDRMASK;
	1010				} else {
	1011					ptep = pmap_pt_to_pte(pt, va);
	1012					if (*ptep & PG_V) {	/* test the PTE itself; *pt was already validated above */
	1013						rtval = *ptep & PG_FRAME;
	1014						rtval |= va & PAGE_MASK;
	1015					}
	1016				}
	1017			}
	1018		} else {
	1019			/*
	1020			 * User pages currently do not direct-map the page directory
	1021			 * and some pages might not use managed PVs. But all PT's
	1022			 * will have a PV.
	1023			 */
	1024			pt_pv = pv_find(pmap, pmap_pt_pindex(va));
	1025			if (pt_pv) {
	1026				ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
	1027				if (*ptep & PG_V) {
	1028					rtval = *ptep & PG_FRAME;
	1029					rtval |= va & PAGE_MASK;
	1030				}
	1031				pv_drop(pt_pv);	/* release the pv returned by pv_find() */
	1032			}
	1033		}
	1034		return rtval;
	1035	}
	1036
	1037	/*
	1038	* Extract the physical page address associated with a kernel virtual address.
	1039	*/
	1040	vm_paddr_t
	1041	pmap_kextract(vm_offset_t va)
	1042	{
	1043		pd_entry_t pt;		/* pt entry in pd */
	1044		vm_paddr_t pa;
	1045
	1046		if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
	1047			pa = DMAP_TO_PHYS(va);	/* direct-map addresses translate arithmetically */
	1048		} else {
	1049			pt = *vtopt(va);	/* snapshot the PD entry; its PG_V bit is not checked here */
	1050			if (pt & PG_PS) {
	1051				pa = (pt & PG_PS_FRAME) | (va & PDRMASK);
	1052			} else {
	1053				/*
	1054				 * Beware of a concurrent promotion that changes the
	1055				 * PDE at this point! For example, vtopte() must not
	1056				 * be used to access the PTE because it would use the
	1057				 * new PDE. It is, however, safe to use the old PDE
	1058				 * because the page table page is preserved by the
	1059				 * promotion.
	1060				 */
	1061				pa = *pmap_pt_to_pte(&pt, va);
	1062				pa = (pa & PG_FRAME) | (va & PAGE_MASK);
	1063			}
	1064		}
	1065		return pa;
	1066	}
	1067
	1068	/***************************************************
	1069	* Low level mapping routines.....
	1070	***************************************************/
	1071
	1072	/*
	1073	* Routine: pmap_kenter
	1074	* Function:
	1075	* Add a wired page to the KVA
	1076	* NOTE! note that in order for the mapping to take effect -- you
	1077	* should do an invltlb after doing the pmap_kenter().
	1078	*/
	1079	void
	1080	pmap_kenter(vm_offset_t va, vm_paddr_t pa)
	1081	{
	1082	pt_entry_t *pte;
	1083	pt_entry_t npte;
	1084	pmap_inval_info info;
	1085
	1086	pmap_inval_init(&info); /* XXX remove */
	1087	npte = pa \| PG_RW \| PG_V \| pgeflag;
	1088	pte = vtopte(va);
	1089	pmap_inval_interlock(&info, &kernel_pmap, va); /* XXX remove */
	1090	*pte = npte;
	1091	pmap_inval_deinterlock(&info, &kernel_pmap); /* XXX remove */
	1092	pmap_inval_done(&info); /* XXX remove */
	1093	}
	1094
	1095	/*
	1096	* Routine: pmap_kenter_quick
	1097	* Function:
	1098	* Similar to pmap_kenter(), except we only invalidate the
	1099	* mapping on the current CPU.
	1100	*/
	1101	void
	1102	pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
	1103	{
	1104	pt_entry_t *pte;
	1105	pt_entry_t npte;
	1106
	1107	npte = pa \| PG_RW \| PG_V \| pgeflag;
	1108	pte = vtopte(va);
	1109	*pte = npte;
	1110	cpu_invlpg((void *)va);
	1111	}
	1112
	1113	void
	1114	pmap_kenter_sync(vm_offset_t va)
	1115	{
	1116	pmap_inval_info info;
	1117
	1118	pmap_inval_init(&info);
	1119	pmap_inval_interlock(&info, &kernel_pmap, va);
	1120	pmap_inval_deinterlock(&info, &kernel_pmap);
	1121	pmap_inval_done(&info);
	1122	}
	1123
	1124	void
	1125	pmap_kenter_sync_quick(vm_offset_t va)
	1126	{
	1127	cpu_invlpg((void *)va);
	1128	}
	1129
	1130	/*
	1131	* remove a page from the kernel pagetables
	1132	*/
	1133	void
	1134	pmap_kremove(vm_offset_t va)
	1135	{
	1136	pt_entry_t *pte;
	1137	pmap_inval_info info;
	1138
	1139	pmap_inval_init(&info);
	1140	pte = vtopte(va);
	1141	pmap_inval_interlock(&info, &kernel_pmap, va);
	1142	(void)pte_load_clear(pte);
	1143	pmap_inval_deinterlock(&info, &kernel_pmap);
	1144	pmap_inval_done(&info);
	1145	}
	1146
	1147	void
	1148	pmap_kremove_quick(vm_offset_t va)
	1149	{
	1150	pt_entry_t *pte;
	1151	pte = vtopte(va);
	1152	(void)pte_load_clear(pte);
	1153	cpu_invlpg((void *)va);
	1154	}
	1155
	1156	/*
	1157	* XXX these need to be recoded. They are not used in any critical path.
	1158	*/
	1159	void
	1160	pmap_kmodify_rw(vm_offset_t va)
	1161	{
	1162	atomic_set_long(vtopte(va), PG_RW);
	1163	cpu_invlpg((void *)va);
	1164	}
	1165
	1166	void
	1167	pmap_kmodify_nc(vm_offset_t va)
	1168	{
	1169	atomic_set_long(vtopte(va), PG_N);
	1170	cpu_invlpg((void *)va);
	1171	}
	1172
	1173	/*
	1174	* Used to map a range of physical addresses into kernel virtual
	1175	* address space during the low level boot, typically to map the
	1176	* dump bitmap, message buffer, and vm_page_array.
	1177	*
	1178	* These mappings are typically made at some pointer after the end of the
	1179	* kernel text+data.
	1180	*
	1181	* We could return PHYS_TO_DMAP(start) here and not allocate any
	1182	* via (*virtp), but then kmem from userland and kernel dumps won't
	1183	* have access to the related pointers.
	1184	*/
	1185	vm_offset_t
	1186	pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
	1187	{
	1188	vm_offset_t va;
	1189	vm_offset_t va_start;
	1190
	1191	/return PHYS_TO_DMAP(start);/
	1192
	1193	va_start = *virtp;
	1194	va = va_start;
	1195
	1196	while (start < end) {
	1197	pmap_kenter_quick(va, start);
	1198	va += PAGE_SIZE;
	1199	start += PAGE_SIZE;
	1200	}
	1201	*virtp = va;
	1202	return va_start;
	1203	}
	1204
	1205
	1206	/*
	1207	* Add a list of wired pages to the kva
	1208	* this routine is only used for temporary
	1209	* kernel mappings that do not need to have
	1210	* page modification or references recorded.
	1211	* Note that old mappings are simply written
	1212	* over. The page must be wired.
	1213	*/
	1214	void
	1215	pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
	1216	{
	1217	vm_offset_t end_va;
	1218
	1219	end_va = va + count * PAGE_SIZE;
	1220
	1221	while (va < end_va) {
	1222	pt_entry_t *pte;
	1223
	1224	pte = vtopte(va);
	1225	pte = VM_PAGE_TO_PHYS(m) \| PG_RW \| PG_V \| pgeflag;
	1226	cpu_invlpg((void *)va);
	1227	va += PAGE_SIZE;
	1228	m++;
	1229	}
	1230	smp_invltlb();
	1231	}
	1232
	1233	/*
	1234	* This routine jerks page mappings from the
	1235	* kernel -- it is meant only for temporary mappings.
	1236	*
	1237	* MPSAFE, INTERRUPT SAFE (cluster callback)
	1238	*/
	1239	void
	1240	pmap_qremove(vm_offset_t va, int count)
	1241	{
	1242	vm_offset_t end_va;
	1243
	1244	end_va = va + count * PAGE_SIZE;
	1245
	1246	while (va < end_va) {
	1247	pt_entry_t *pte;
	1248
	1249	pte = vtopte(va);
	1250	(void)pte_load_clear(pte);
	1251	cpu_invlpg((void *)va);
	1252	va += PAGE_SIZE;
	1253	}
	1254	smp_invltlb();
	1255	}
	1256
	1257	/*
	1258	* Create a new thread and optionally associate it with a (new) process.
	1259	* NOTE! the new thread's cpu may not equal the current cpu.
	1260	*/
	1261	void
	1262	pmap_init_thread(thread_t td)
	1263	{
	1264	/* enforce pcb placement & alignment */
	1265	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
	1266	td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF);
	1267	td->td_savefpu = &td->td_pcb->pcb_save;
	1268	td->td_sp = (char )td->td_pcb; / no -16 */
	1269	}
	1270
	1271	/*
	1272	* This routine directly affects the fork perf for a process.
	1273	*/
	1274	void
	1275	pmap_init_proc(struct proc *p)
	1276	{
	1277	}
	1278
	1279	/*
	1280	* Initialize pmap0/vmspace0. This pmap is not added to pmap_list because
	1281	* it, and IdlePTD, represents the template used to update all other pmaps.
	1282	*
	1283	* On architectures where the kernel pmap is not integrated into the user
	1284	* process pmap, this pmap represents the process pmap, not the kernel pmap.
	1285	* kernel_pmap should be used to directly access the kernel_pmap.
	1286	*/
	1287	void
	1288	pmap_pinit0(struct pmap *pmap)
	1289	{
	1290	pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys);
	1291	pmap->pm_count = 1;
	1292	pmap->pm_active = 0;
	1293	pmap->pm_pvhint = NULL;
	1294	RB_INIT(&pmap->pm_pvroot);
	1295	spin_init(&pmap->pm_spin);
	1296	lwkt_token_init(&pmap->pm_token, "pmap_tok");
	1297	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	1298	}
	1299
	1300	/*
	1301	* Initialize a preallocated and zeroed pmap structure,
	1302	* such as one in a vmspace structure.
	1303	*/
	1304	void
	1305	pmap_pinit(struct pmap *pmap)
	1306	{
	1307	pv_entry_t pv;
	1308	int j;
	1309
	1310	/*
	1311	* Misc initialization
	1312	*/
	1313	pmap->pm_count = 1;
	1314	pmap->pm_active = 0;
	1315	pmap->pm_pvhint = NULL;
	1316	if (pmap->pm_pmlpv == NULL) {
	1317	RB_INIT(&pmap->pm_pvroot);
	1318	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	1319	spin_init(&pmap->pm_spin);
	1320	lwkt_token_init(&pmap->pm_token, "pmap_tok");
	1321	}
	1322
	1323	/*
	1324	* No need to allocate page table space yet but we do need a valid
	1325	* page directory table.
	1326	*/
	1327	if (pmap->pm_pml4 == NULL) {
	1328	pmap->pm_pml4 =
	1329	(pml4_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
	1330	}
	1331
	1332	/*
	1333	* Allocate the page directory page, which wires it even though
	1334	* it isn't being entered into some higher level page table (it
	1335	* being the highest level). If one is already cached we don't
	1336	* have to do anything.
	1337	*/
	1338	if ((pv = pmap->pm_pmlpv) == NULL) {
	1339	pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
	1340	pmap->pm_pmlpv = pv;
	1341	pmap_kenter((vm_offset_t)pmap->pm_pml4,
	1342	VM_PAGE_TO_PHYS(pv->pv_m));
	1343	pv_put(pv);
	1344
	1345	/*
	1346	* Install DMAP and KMAP.
	1347	*/
	1348	for (j = 0; j < NDMPML4E; ++j) {
	1349	pmap->pm_pml4[DMPML4I + j] =
	1350	(DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) \|
	1351	PG_RW \| PG_V \| PG_U;
	1352	}
	1353	pmap->pm_pml4[KPML4I] = KPDPphys \| PG_RW \| PG_V \| PG_U;
	1354
	1355	/*
	1356	* install self-referential address mapping entry
	1357	*/
	1358	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) \|
	1359	PG_V \| PG_RW \| PG_A \| PG_M;
	1360	} else {
	1361	KKASSERT(pv->pv_m->flags & PG_MAPPED);
	1362	KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
	1363	}
	1364	}
	1365
	1366	/*
	1367	* Clean up a pmap structure so it can be physically freed. This routine
	1368	* is called by the vmspace dtor function. A great deal of pmap data is
	1369	* left passively mapped to improve vmspace management so we have a bit
	1370	* of cleanup work to do here.
	1371	*/
	1372	void
	1373	pmap_puninit(pmap_t pmap)
	1374	{
	1375	pv_entry_t pv;
	1376	vm_page_t p;
	1377
	1378	KKASSERT(pmap->pm_active == 0);
	1379	if ((pv = pmap->pm_pmlpv) != NULL) {
	1380	if (pv_hold_try(pv) == 0)
	1381	pv_lock(pv);
	1382	p = pmap_remove_pv_page(pv);
	1383	pv_free(pv);
	1384	pmap_kremove((vm_offset_t)pmap->pm_pml4);
	1385	vm_page_busy_wait(p, FALSE, "pgpun");
	1386	KKASSERT(p->flags & (PG_FICTITIOUS\|PG_UNMANAGED));
	1387	vm_page_unwire(p, 0);
	1388	vm_page_flag_clear(p, PG_MAPPED \| PG_WRITEABLE);
	1389
	1390	/*
	1391	* XXX eventually clean out PML4 static entries and
	1392	* use vm_page_free_zero()
	1393	*/
	1394	vm_page_free(p);
	1395	pmap->pm_pmlpv = NULL;
	1396	}
	1397	if (pmap->pm_pml4) {
	1398	KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
	1399	kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE);
	1400	pmap->pm_pml4 = NULL;
	1401	}
	1402	KKASSERT(pmap->pm_stats.resident_count == 0);
	1403	KKASSERT(pmap->pm_stats.wired_count == 0);
	1404	}
	1405
	1406	/*
	1407	* Wire in kernel global address entries. To avoid a race condition
	1408	* between pmap initialization and pmap_growkernel, this procedure
	1409	* adds the pmap to the master list (which growkernel scans to update),
	1410	* then copies the template.
	1411	*/
	1412	void
	1413	pmap_pinit2(struct pmap *pmap)
	1414	{
	1415	/*
	1416	* XXX copies current process, does not fill in MPPTDI
	1417	*/
	1418	spin_lock(&pmap_spin);
	1419	TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
	1420	spin_unlock(&pmap_spin);
	1421	}
	1422
	1423	/*
	1424	* This routine is called when various levels in the page table need to
	1425	* be populated. This routine cannot fail.
	1426	*
	1427	* This function returns two locked pv_entry's, one representing the
	1428	* requested pv and one representing the requested pv's parent pv. If
	1429	* the pv did not previously exist it will be mapped into its parent
	1430	* and wired, otherwise no additional wire count will be added.
	1431	*/
	1432	static
	1433	pv_entry_t
	1434	pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
	1435	{
	1436	pt_entry_t *ptep;
	1437	pv_entry_t pv;
	1438	pv_entry_t pvp;
	1439	vm_pindex_t pt_pindex;
	1440	vm_page_t m;
	1441	int isnew;
	1442
	1443	/*
	1444	* If the pv already exists and we aren't being asked for the
	1445	* parent page table page we can just return it. A locked+held pv
	1446	* is returned.
	1447	*/
	1448	pv = pv_alloc(pmap, ptepindex, &isnew);
	1449	if (isnew == 0 && pvpp == NULL)
	1450	return(pv);
	1451
	1452	/*
	1453	* This is a new PV, we have to resolve its parent page table and
	1454	* add an additional wiring to the page if necessary.
	1455	*/
	1456
	1457	/*
	1458	* Special case terminal PVs. These are not page table pages so
	1459	* no vm_page is allocated (the caller supplied the vm_page). If
	1460	* pvpp is non-NULL we are being asked to also removed the pt_pv
	1461	* for this pv.
	1462	*
	1463	* Note that pt_pv's are only returned for user VAs. We assert that
	1464	* a pt_pv is not being requested for kernel VAs.
	1465	*/
	1466	if (ptepindex < pmap_pt_pindex(0)) {
	1467	if (ptepindex >= NUPTE_USER)
	1468	KKASSERT(pvpp == NULL);
	1469	else
	1470	KKASSERT(pvpp != NULL);
	1471	if (pvpp) {
	1472	pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT);
	1473	pvp = pmap_allocpte(pmap, pt_pindex, NULL);
	1474	if (isnew)
	1475	vm_page_wire_quick(pvp->pv_m);
	1476	*pvpp = pvp;
	1477	} else {
	1478	pvp = NULL;
	1479	}
	1480	return(pv);
	1481	}
	1482
	1483	/*
	1484	* Non-terminal PVs allocate a VM page to represent the page table,
	1485	* so we have to resolve pvp and calculate ptepindex for the pvp
	1486	* and then for the page table entry index in the pvp for
	1487	* fall-through.
	1488	*/
	1489	if (ptepindex < pmap_pd_pindex(0)) {
	1490	/*
	1491	* pv is PT, pvp is PD
	1492	*/
	1493	ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT;
	1494	ptepindex += NUPTE_TOTAL + NUPT_TOTAL;
	1495	pvp = pmap_allocpte(pmap, ptepindex, NULL);
	1496	if (!isnew)
	1497	goto notnew;
	1498
	1499	/*
	1500	* PT index in PD
	1501	*/
	1502	ptepindex = pv->pv_pindex - pmap_pt_pindex(0);
	1503	ptepindex &= ((1ul << NPDEPGSHIFT) - 1);
	1504	} else if (ptepindex < pmap_pdp_pindex(0)) {
	1505	/*
	1506	* pv is PD, pvp is PDP
	1507	*/
	1508	ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT;
	1509	ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
	1510	pvp = pmap_allocpte(pmap, ptepindex, NULL);
	1511	if (!isnew)
	1512	goto notnew;
	1513
	1514	/*
	1515	* PD index in PDP
	1516	*/
	1517	ptepindex = pv->pv_pindex - pmap_pd_pindex(0);
	1518	ptepindex &= ((1ul << NPDPEPGSHIFT) - 1);
	1519	} else if (ptepindex < pmap_pml4_pindex()) {
	1520	/*
	1521	* pv is PDP, pvp is the root pml4 table
	1522	*/
	1523	pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
	1524	if (!isnew)
	1525	goto notnew;
	1526
	1527	/*
	1528	* PDP index in PML4
	1529	*/
	1530	ptepindex = pv->pv_pindex - pmap_pdp_pindex(0);
	1531	ptepindex &= ((1ul << NPML4EPGSHIFT) - 1);
	1532	} else {
	1533	/*
	1534	* pv represents the top-level PML4, there is no parent.
	1535	*/
	1536	pvp = NULL;
	1537	if (!isnew)
	1538	goto notnew;
	1539	}
	1540
	1541	/*
	1542	* This code is only reached if isnew is TRUE and this is not a
	1543	* terminal PV. We need to allocate a vm_page for the page table
	1544	* at this level and enter it into the parent page table.
	1545	*
	1546	* page table pages are marked PG_WRITEABLE and PG_MAPPED.
	1547	*/
	1548	for (;;) {
	1549	m = vm_page_alloc(NULL, pv->pv_pindex,
	1550	VM_ALLOC_NORMAL \| VM_ALLOC_SYSTEM \|
	1551	VM_ALLOC_INTERRUPT);
	1552	if (m)
	1553	break;
	1554	vm_wait(0);
	1555	}
	1556	vm_page_spin_lock(m);
	1557	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
	1558	pv->pv_m = m;
	1559	vm_page_flag_set(m, PG_MAPPED \| PG_WRITEABLE);
	1560	vm_page_spin_unlock(m);
	1561	vm_page_unmanage(m); /* m must be spinunlocked */
	1562
	1563	if ((m->flags & PG_ZERO) == 0) {
	1564	pmap_zero_page(VM_PAGE_TO_PHYS(m));
	1565	}
	1566	#ifdef PMAP_DEBUG
	1567	else {
	1568	pmap_page_assertzero(VM_PAGE_TO_PHYS(m));
	1569	}
	1570	#endif
	1571	m->valid = VM_PAGE_BITS_ALL;
	1572	vm_page_flag_clear(m, PG_ZERO);
	1573	vm_page_wire(m); /* wire for mapping in parent */
	1574
	1575	/*
	1576	* Wire the page into pvp, bump the wire-count for pvp's page table
	1577	* page. Bump the resident_count for the pmap. There is no pvp
	1578	* for the top level, address the pm_pml4[] array directly.
	1579	*
	1580	* If the caller wants the parent we return it, otherwise
	1581	* we just put it away.
	1582	*
	1583	* No interlock is needed for pte 0 -> non-zero.
	1584	*/
	1585	if (pvp) {
	1586	vm_page_wire_quick(pvp->pv_m);
	1587	ptep = pv_pte_lookup(pvp, ptepindex);
	1588	KKASSERT((*ptep & PG_V) == 0);
	1589	*ptep = VM_PAGE_TO_PHYS(m) \| (PG_U \| PG_RW \| PG_V \|
	1590	PG_A \| PG_M);
	1591	}
	1592	vm_page_wakeup(m);
	1593	notnew:
	1594	if (pvpp)
	1595	*pvpp = pvp;
	1596	else if (pvp)
	1597	pv_put(pvp);
	1598	return (pv);
	1599	}
	1600
	1601	/*
	1602	* Release any resources held by the given physical map.
	1603	*
	1604	* Called when a pmap initialized by pmap_pinit is being released. Should
	1605	* only be called if the map contains no valid mappings.
	1606	*
	1607	* Caller must hold pmap->pm_token
	1608	*/
	1609	struct pmap_release_info {
	1610	pmap_t pmap;
	1611	int retry;
	1612	};
	1613
	1614	static int pmap_release_callback(pv_entry_t pv, void *data);
	1615
	1616	void
	1617	pmap_release(struct pmap *pmap)
	1618	{
	1619	struct pmap_release_info info;
	1620
	1621	KASSERT(pmap->pm_active == 0,
	1622	("pmap still active! %016jx", (uintmax_t)pmap->pm_active));
	1623
	1624	spin_lock(&pmap_spin);
	1625	TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
	1626	spin_unlock(&pmap_spin);
	1627
	1628	/*
	1629	* Pull pv's off the RB tree in order from low to high and release
	1630	* each page.
	1631	*/
	1632	info.pmap = pmap;
	1633	do {
	1634	info.retry = 0;
	1635	spin_lock(&pmap->pm_spin);
	1636	RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL,
	1637	pmap_release_callback, &info);
	1638	spin_unlock(&pmap->pm_spin);
	1639	} while (info.retry);
	1640
	1641
	1642	/*
	1643	* One resident page (the pml4 page) should remain.
	1644	* No wired pages should remain.
	1645	*/
	1646	KKASSERT(pmap->pm_stats.resident_count == 1);
	1647	KKASSERT(pmap->pm_stats.wired_count == 0);
	1648	}
	1649
	1650	static int
	1651	pmap_release_callback(pv_entry_t pv, void *data)
	1652	{
	1653	struct pmap_release_info *info = data;
	1654	pmap_t pmap = info->pmap;
	1655	vm_page_t p;
	1656
	1657	if (pv_hold_try(pv)) {
	1658	spin_unlock(&pmap->pm_spin);
	1659	} else {
	1660	spin_unlock(&pmap->pm_spin);
	1661	pv_lock(pv);
	1662	if (pv->pv_pmap != pmap) {
	1663	pv_put(pv);
	1664	spin_lock(&pmap->pm_spin);
	1665	info->retry = 1;
	1666	return(-1);
	1667	}
	1668	}
	1669
	1670	/*
	1671	* The pmap is currently not spinlocked, pv is held+locked.
	1672	* Remove the pv's page from its parent's page table. The
	1673	* parent's page table page's wire_count will be decremented.
	1674	*/
	1675	pmap_remove_pv_pte(pv, NULL, NULL);
	1676
	1677	/*
	1678	* Terminal pvs are unhooked from their vm_pages. Because
	1679	* terminal pages aren't page table pages they aren't wired
	1680	* by us, so we have to be sure not to unwire them either.
	1681	*/
	1682	if (pv->pv_pindex < pmap_pt_pindex(0)) {
	1683	pmap_remove_pv_page(pv);
	1684	goto skip;
	1685	}
	1686
	1687	/*
	1688	* We leave the top-level page table page cached, wired, and
	1689	* mapped in the pmap until the dtor function (pmap_puninit())
	1690	* gets called.
	1691	*
	1692	* Since we are leaving the top-level pv intact we need
	1693	* to break out of what would otherwise be an infinite loop.
	1694	*/
	1695	if (pv->pv_pindex == pmap_pml4_pindex()) {
	1696	pv_put(pv);
	1697	spin_lock(&pmap->pm_spin);
	1698	return(-1);
	1699	}
	1700
	1701	/*
	1702	* For page table pages (other than the top-level page),
	1703	* remove and free the vm_page. The representitive mapping
	1704	* removed above by pmap_remove_pv_pte() did not undo the
	1705	* last wire_count so we have to do that as well.
	1706	*/
	1707	p = pmap_remove_pv_page(pv);
	1708	vm_page_busy_wait(p, FALSE, "pmaprl");
	1709	if (p->wire_count != 1) {
	1710	kprintf("p->wire_count was %016lx %d\n",
	1711	pv->pv_pindex, p->wire_count);
	1712	}
	1713	KKASSERT(p->wire_count == 1);
	1714	KKASSERT(p->flags & PG_UNMANAGED);
	1715
	1716	vm_page_unwire(p, 0);
	1717	KKASSERT(p->wire_count == 0);
	1718	/* JG eventually revert to using vm_page_free_zero() */
	1719	vm_page_free(p);
	1720	skip:
	1721	pv_free(pv);
	1722	spin_lock(&pmap->pm_spin);
	1723	return(0);
	1724	}
	1725
	1726	/*
	1727	* This function will remove the pte associated with a pv from its parent.
	1728	* Terminal pv's are supported. The removal will be interlocked if info
	1729	* is non-NULL. The caller must dispose of pv instead of just unlocking
	1730	* it.
	1731	*
	1732	* The wire count will be dropped on the parent page table. The wire
	1733	* count on the page being removed (pv->pv_m) from the parent page table
	1734	* is NOT touched. Note that terminal pages will not have any additional
	1735	* wire counts while page table pages will have at least one representing
	1736	* the mapping, plus others representing sub-mappings.
	1737	*
	1738	* NOTE: Cannot be called on kernel page table pages, only KVM terminal
	1739	* pages and user page table and terminal pages.
	1740	*
	1741	* The pv must be locked.
	1742	*
	1743	* XXX must lock parent pv's if they exist to remove pte XXX
	1744	*/
	1745	static
	1746	void
	1747	pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
	1748	{
	1749	vm_pindex_t ptepindex = pv->pv_pindex;
	1750	pmap_t pmap = pv->pv_pmap;
	1751	vm_page_t p;
	1752	int gotpvp = 0;
	1753
	1754	KKASSERT(pmap);
	1755
	1756	if (ptepindex == pmap_pml4_pindex()) {
	1757	/*
	1758	* We are the top level pml4 table, there is no parent.
	1759	*/
	1760	p = pmap->pm_pmlpv->pv_m;
	1761	} else if (ptepindex >= pmap_pdp_pindex(0)) {
	1762	/*
	1763	* Remove a PDP page from the pml4e. This can only occur
	1764	* with user page tables. We do not have to lock the
	1765	* pml4 PV so just ignore pvp.
	1766	*/
	1767	vm_pindex_t pml4_pindex;
	1768	vm_pindex_t pdp_index;
	1769	pml4_entry_t *pdp;
	1770
	1771	pdp_index = ptepindex - pmap_pdp_pindex(0);
	1772	if (pvp == NULL) {
	1773	pml4_pindex = pmap_pml4_pindex();
	1774	pvp = pv_get(pv->pv_pmap, pml4_pindex);
	1775	gotpvp = 1;
	1776	}
	1777	pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
	1778	KKASSERT((*pdp & PG_V) != 0);
	1779	p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
	1780	*pdp = 0;
	1781	KKASSERT(info == NULL);
	1782	} else if (ptepindex >= pmap_pd_pindex(0)) {
	1783	/*
	1784	* Remove a PD page from the pdp
	1785	*/
	1786	vm_pindex_t pdp_pindex;
	1787	vm_pindex_t pd_index;
	1788	pdp_entry_t *pd;
	1789
	1790	pd_index = ptepindex - pmap_pd_pindex(0);
	1791
	1792	if (pvp == NULL) {
	1793	pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
	1794	(pd_index >> NPML4EPGSHIFT);
	1795	pvp = pv_get(pv->pv_pmap, pdp_pindex);
	1796	gotpvp = 1;
	1797	}
	1798	pd = pv_pte_lookup(pvp, pd_index & ((1ul << NPDPEPGSHIFT) - 1));
	1799	KKASSERT((*pd & PG_V) != 0);
	1800	p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
	1801	*pd = 0;
	1802	KKASSERT(info == NULL);
	1803	} else if (ptepindex >= pmap_pt_pindex(0)) {
	1804	/*
	1805	* Remove a PT page from the pd
	1806	*/
	1807	vm_pindex_t pd_pindex;
	1808	vm_pindex_t pt_index;
	1809	pd_entry_t *pt;
	1810
	1811	pt_index = ptepindex - pmap_pt_pindex(0);
	1812
	1813	if (pvp == NULL) {
	1814	pd_pindex = NUPTE_TOTAL + NUPT_TOTAL +
	1815	(pt_index >> NPDPEPGSHIFT);
	1816	pvp = pv_get(pv->pv_pmap, pd_pindex);
	1817	gotpvp = 1;
	1818	}
	1819	pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1));
	1820	KKASSERT((*pt & PG_V) != 0);
	1821	p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
	1822	*pt = 0;
	1823	KKASSERT(info == NULL);
	1824	} else {
	1825	/*
	1826	* Remove a PTE from the PT page
	1827	*
	1828	* NOTE: pv's must be locked bottom-up to avoid deadlocking.
	1829	* pv is a pte_pv so we can safely lock pt_pv.
	1830	*/
	1831	vm_pindex_t pt_pindex;
	1832	pt_entry_t *ptep;
	1833	pt_entry_t pte;
	1834	vm_offset_t va;
	1835
	1836	pt_pindex = ptepindex >> NPTEPGSHIFT;
	1837	va = (vm_offset_t)ptepindex << PAGE_SHIFT;
	1838
	1839	if (ptepindex >= NUPTE_USER) {
	1840	ptep = vtopte(ptepindex << PAGE_SHIFT);
	1841	KKASSERT(pvp == NULL);
	1842	} else {
	1843	if (pvp == NULL) {
	1844	pt_pindex = NUPTE_TOTAL +
	1845	(ptepindex >> NPDPEPGSHIFT);
	1846	pvp = pv_get(pv->pv_pmap, pt_pindex);
	1847	gotpvp = 1;
	1848	}
	1849	ptep = pv_pte_lookup(pvp, ptepindex &
	1850	((1ul << NPDPEPGSHIFT) - 1));
	1851	}
	1852
	1853	if (info)
	1854	pmap_inval_interlock(info, pmap, va);
	1855	pte = pte_load_clear(ptep);
	1856	if (info)
	1857	pmap_inval_deinterlock(info, pmap);
	1858	else
	1859	cpu_invlpg((void *)va);
	1860
	1861	/*
	1862	* Now update the vm_page_t
	1863	*/
	1864	if ((pte & (PG_MANAGED\|PG_V)) != (PG_MANAGED\|PG_V)) {
	1865	kprintf("remove_pte badpte %016lx %016lx %d\n",
	1866	pte, pv->pv_pindex,
	1867	pv->pv_pindex < pmap_pt_pindex(0));
	1868	}
	1869	/KKASSERT((pte & (PG_MANAGED\|PG_V)) == (PG_MANAGED\|PG_V));/
	1870	p = PHYS_TO_VM_PAGE(pte & PG_FRAME);
	1871
	1872	if (pte & PG_M) {
	1873	if (pmap_track_modified(ptepindex))
	1874	vm_page_dirty(p);
	1875	}
	1876	if (pte & PG_A) {
	1877	vm_page_flag_set(p, PG_REFERENCED);
	1878	}
	1879	if (pte & PG_W)
	1880	atomic_add_long(&pmap->pm_stats.wired_count, -1);
	1881	if (pte & PG_G)
	1882	cpu_invlpg((void *)va);
	1883	}
	1884
	1885	/*
	1886	* Unwire the parent page table page. The wire_count cannot go below
	1887	* 1 here because the parent page table page is itself still mapped.
	1888	*
	1889	* XXX remove the assertions later.
	1890	*/
	1891	KKASSERT(pv->pv_m == p);
	1892	if (pvp && vm_page_unwire_quick(pvp->pv_m))
	1893	panic("pmap_remove_pv_pte: Insufficient wire_count");
	1894
	1895	if (gotpvp)
	1896	pv_put(pvp);
	1897	}
	1898
	1899	static
	1900	vm_page_t
	1901	pmap_remove_pv_page(pv_entry_t pv)
	1902	{
	1903	vm_page_t m;
	1904
	1905	m = pv->pv_m;
	1906	KKASSERT(m);
	1907	vm_page_spin_lock(m);
	1908	pv->pv_m = NULL;
	1909	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
	1910	/*
	1911	if (m->object)
	1912	atomic_add_int(&m->object->agg_pv_list_count, -1);
	1913	*/
	1914	if (TAILQ_EMPTY(&m->md.pv_list))
	1915	vm_page_flag_clear(m, PG_MAPPED \| PG_WRITEABLE);
	1916	vm_page_spin_unlock(m);
	1917	return(m);
	1918	}
	1919
	1920	/*
	1921	* Grow the number of kernel page table entries, if needed.
	1922	*
	1923	* This routine is always called to validate any address space
	1924	* beyond KERNBASE (for kldloads). kernel_vm_end only governs the address
	1925	* space below KERNBASE.
	1926	*/
	1927	void
	1928	pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
	1929	{
	1930	vm_paddr_t paddr;
	1931	vm_offset_t ptppaddr;
	1932	vm_page_t nkpg;
	1933	pd_entry_t *pt, newpt;
	1934	pdp_entry_t newpd;
	1935	int update_kernel_vm_end;
	1936
	1937	/*
	1938	* bootstrap kernel_vm_end on first real VM use
	1939	*/
	1940	if (kernel_vm_end == 0) {
	1941	kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
	1942	nkpt = 0;
	1943	while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & PG_V) != 0) {
	1944	kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
	1945	~(PAGE_SIZE * NPTEPG - 1);
	1946	nkpt++;
	1947	if (kernel_vm_end - 1 >= kernel_map.max_offset) {
	1948	kernel_vm_end = kernel_map.max_offset;
	1949	break;
	1950	}
	1951	}
	1952	}
	1953
	1954	/*
	1955	* Fill in the gaps. kernel_vm_end is only adjusted for ranges
	1956	* below KERNBASE. Ranges above KERNBASE are kldloaded and we
	1957	* do not want to force-fill 128G worth of page tables.
	1958	*/
	1959	if (kstart < KERNBASE) {
	1960	if (kstart > kernel_vm_end)
	1961	kstart = kernel_vm_end;
	1962	KKASSERT(kend <= KERNBASE);
	1963	update_kernel_vm_end = 1;
	1964	} else {
	1965	update_kernel_vm_end = 0;
	1966	}
	1967
	1968	kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG);
	1969	kend = roundup2(kend, PAGE_SIZE * NPTEPG);
	1970
	1971	if (kend - 1 >= kernel_map.max_offset)
	1972	kend = kernel_map.max_offset;
	1973
	1974	while (kstart < kend) {
	1975	pt = pmap_pt(&kernel_pmap, kstart);
	1976	if (pt == NULL) {
	1977	/* We need a new PDP entry */
	1978	nkpg = vm_page_alloc(NULL, nkpt,
	1979	VM_ALLOC_NORMAL \|
	1980	VM_ALLOC_SYSTEM \|
	1981	VM_ALLOC_INTERRUPT);
	1982	if (nkpg == NULL) {
	1983	panic("pmap_growkernel: no memory to grow "
	1984	"kernel");
	1985	}
	1986	paddr = VM_PAGE_TO_PHYS(nkpg);
	1987	if ((nkpg->flags & PG_ZERO) == 0)
	1988	pmap_zero_page(paddr);
	1989	vm_page_flag_clear(nkpg, PG_ZERO);
	1990	newpd = (pdp_entry_t)
	1991	(paddr \| PG_V \| PG_RW \| PG_A \| PG_M);
	1992	*pmap_pd(&kernel_pmap, kstart) = newpd;
	1993	nkpt++;
	1994	continue; /* try again */
	1995	}
	1996	if ((*pt & PG_V) != 0) {
	1997	kstart = (kstart + PAGE_SIZE * NPTEPG) &
	1998	~(PAGE_SIZE * NPTEPG - 1);
	1999	if (kstart - 1 >= kernel_map.max_offset) {
	2000	kstart = kernel_map.max_offset;
	2001	break;
	2002	}
	2003	continue;
	2004	}
	2005
	2006	/*
	2007	* This index is bogus, but out of the way
	2008	*/
	2009	nkpg = vm_page_alloc(NULL, nkpt,
	2010	VM_ALLOC_NORMAL \|
	2011	VM_ALLOC_SYSTEM \|
	2012	VM_ALLOC_INTERRUPT);
	2013	if (nkpg == NULL)
	2014	panic("pmap_growkernel: no memory to grow kernel");
	2015
	2016	vm_page_wire(nkpg);
	2017	ptppaddr = VM_PAGE_TO_PHYS(nkpg);
	2018	pmap_zero_page(ptppaddr);
	2019	vm_page_flag_clear(nkpg, PG_ZERO);
	2020	newpt = (pd_entry_t) (ptppaddr \| PG_V \| PG_RW \| PG_A \| PG_M);
	2021	*pmap_pt(&kernel_pmap, kstart) = newpt;
	2022	nkpt++;
	2023
	2024	kstart = (kstart + PAGE_SIZE * NPTEPG) &
	2025	~(PAGE_SIZE * NPTEPG - 1);
	2026
	2027	if (kstart - 1 >= kernel_map.max_offset) {
	2028	kstart = kernel_map.max_offset;
	2029	break;
	2030	}
	2031	}
	2032
	2033	/*
	2034	* Only update kernel_vm_end for areas below KERNBASE.
	2035	*/
	2036	if (update_kernel_vm_end && kernel_vm_end < kstart)
	2037	kernel_vm_end = kstart;
	2038	}
	2039
	2040	/*
	2041	* Retire the given physical map from service.
	2042	* Should only be called if the map contains
	2043	* no valid mappings.
	2044	*/
	2045	void
	2046	pmap_destroy(pmap_t pmap)
	2047	{
	2048	int count;
	2049
	2050	if (pmap == NULL)
	2051	return;
	2052
	2053	lwkt_gettoken(&pmap->pm_token);
	2054	count = --pmap->pm_count;
	2055	if (count == 0) {
	2056	pmap_release(pmap); /* eats pm_token */
	2057	panic("destroying a pmap is not yet implemented");
	2058	}
	2059	lwkt_reltoken(&pmap->pm_token);
	2060	}
	2061
	2062	/*
	2063	* Add a reference to the specified pmap.
	2064	*/
	2065	void
	2066	pmap_reference(pmap_t pmap)
	2067	{
	2068	if (pmap != NULL) {
	2069	lwkt_gettoken(&pmap->pm_token);
	2070	pmap->pm_count++;
	2071	lwkt_reltoken(&pmap->pm_token);
	2072	}
	2073	}
	2074
	2075	/***************************************************
	2076	* page management routines.
	2077	***************************************************/
	2078
	2079	/*
	2080	* Hold a pv without locking it
	2081	*/
	2082	static void
	2083	pv_hold(pv_entry_t pv)
	2084	{
	2085	u_int count;
	2086
	2087	if (atomic_cmpset_int(&pv->pv_hold, 0, 1))
	2088	return;
	2089
	2090	for (;;) {
	2091	count = pv->pv_hold;
	2092	cpu_ccfence();
	2093	if (atomic_cmpset_int(&pv->pv_hold, count, count + 1))
	2094	return;
	2095	/* retry */
	2096	}
	2097	}
	2098
	2099	/*
	2100	* Hold a pv_entry, preventing its destruction. TRUE is returned if the pv
	2101	* was successfully locked, FALSE if it wasn't. The caller must dispose of
	2102	* the pv properly.
	2103	*
	2104	* Either the pmap->pm_spin or the related vm_page_spin (if traversing a
	2105	* pv list via its page) must be held by the caller.
	2106	*/
	2107	static int
	2108	_pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL)
	2109	{
	2110	u_int count;
	2111
	2112	if (atomic_cmpset_int(&pv->pv_hold, 0, PV_HOLD_LOCKED \| 1)) {
	2113	#ifdef PMAP_DEBUG
	2114	pv->pv_func = func;
	2115	pv->pv_line = lineno;
	2116	#endif
	2117	return TRUE;
	2118	}
	2119
	2120	for (;;) {
	2121	count = pv->pv_hold;
	2122	cpu_ccfence();
	2123	if ((count & PV_HOLD_LOCKED) == 0) {
	2124	if (atomic_cmpset_int(&pv->pv_hold, count,
	2125	(count + 1) \| PV_HOLD_LOCKED)) {
	2126	#ifdef PMAP_DEBUG
	2127	pv->pv_func = func;
	2128	pv->pv_line = lineno;
	2129	#endif
	2130	return TRUE;
	2131	}
	2132	} else {
	2133	if (atomic_cmpset_int(&pv->pv_hold, count, count + 1))
	2134	return FALSE;
	2135	}
	2136	/* retry */
	2137	}
	2138	}
	2139
	2140	/*
	2141	* Drop a previously held pv_entry which could not be locked, allowing its
	2142	* destruction.
	2143	*
	2144	* Must not be called with a spinlock held as we might zfree() the pv if it
	2145	* is no longer associated with a pmap and this was the last hold count.
	2146	*/
	2147	static void
	2148	pv_drop(pv_entry_t pv)
	2149	{
	2150	u_int count;
	2151
	2152	if (atomic_cmpset_int(&pv->pv_hold, 1, 0)) {
	2153	if (pv->pv_pmap == NULL)
	2154	zfree(pvzone, pv);
	2155	return;
	2156	}
	2157
	2158	for (;;) {
	2159	count = pv->pv_hold;
	2160	cpu_ccfence();
	2161	KKASSERT((count & PV_HOLD_MASK) > 0);
	2162	KKASSERT((count & (PV_HOLD_LOCKED \| PV_HOLD_MASK)) !=
	2163	(PV_HOLD_LOCKED \| 1));
	2164	if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) {
	2165	if (count == 1 && pv->pv_pmap == NULL)
	2166	zfree(pvzone, pv);
	2167	return;
	2168	}
	2169	/* retry */
	2170	}
	2171	}
	2172
	2173	/*
	2174	* Find or allocate the requested PV entry, returning a locked pv
	2175	*/
	2176	static
	2177	pv_entry_t
	2178	_pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL)
	2179	{
	2180	pv_entry_t pv;
	2181	pv_entry_t pnew = NULL;
	2182
	2183	spin_lock(&pmap->pm_spin);
	2184	for (;;) {
	2185	if ((pv = pmap->pm_pvhint) == NULL \|\| pv->pv_pindex != pindex) {
	2186	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
	2187	pindex);
	2188	}
	2189	if (pv == NULL) {
	2190	if (pnew == NULL) {
	2191	spin_unlock(&pmap->pm_spin);
	2192	pnew = zalloc(pvzone);
	2193	spin_lock(&pmap->pm_spin);
	2194	continue;
	2195	}
	2196	pnew->pv_pmap = pmap;
	2197	pnew->pv_pindex = pindex;
	2198	pnew->pv_hold = PV_HOLD_LOCKED \| 1;
	2199	#ifdef PMAP_DEBUG
	2200	pnew->pv_func = func;
	2201	pnew->pv_line = lineno;
	2202	#endif
	2203	pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew);
	2204	atomic_add_long(&pmap->pm_stats.resident_count, 1);
	2205	spin_unlock(&pmap->pm_spin);
	2206	*isnew = 1;
	2207	return(pnew);
	2208	}
	2209	if (pnew) {
	2210	spin_unlock(&pmap->pm_spin);
	2211	zfree(pvzone, pnew);
	2212	pnew = NULL;
	2213	spin_lock(&pmap->pm_spin);
	2214	continue;
	2215	}
	2216	if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
	2217	spin_unlock(&pmap->pm_spin);
	2218	*isnew = 0;
	2219	return(pv);
	2220	}
	2221	spin_unlock(&pmap->pm_spin);
	2222	_pv_lock(pv PMAP_DEBUG_COPY);
	2223	if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) {
	2224	*isnew = 0;
	2225	return(pv);
	2226	}
	2227	pv_put(pv);
	2228	spin_lock(&pmap->pm_spin);
	2229	}
	2230
	2231
	2232	}
	2233
	2234	/*
	2235	* Find the requested PV entry, returning a locked+held pv or NULL
	2236	*/
	2237	static
	2238	pv_entry_t
	2239	_pv_get(pmap_t pmap, vm_pindex_t pindex PMAP_DEBUG_DECL)
	2240	{
	2241	pv_entry_t pv;
	2242
	2243	spin_lock(&pmap->pm_spin);
	2244	for (;;) {
	2245	/*
	2246	* Shortcut cache
	2247	*/
	2248	if ((pv = pmap->pm_pvhint) == NULL \|\| pv->pv_pindex != pindex) {
	2249	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
	2250	pindex);
	2251	}
	2252	if (pv == NULL) {
	2253	spin_unlock(&pmap->pm_spin);
	2254	return NULL;
	2255	}
	2256	if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
	2257	pv_cache(pv, pindex);
	2258	spin_unlock(&pmap->pm_spin);
	2259	return(pv);
	2260	}
	2261	spin_unlock(&pmap->pm_spin);
	2262	_pv_lock(pv PMAP_DEBUG_COPY);
	2263	if (pv->pv_pmap == pmap && pv->pv_pindex == pindex)
	2264	return(pv);
	2265	pv_put(pv);
	2266	spin_lock(&pmap->pm_spin);
	2267	}
	2268	}
	2269
	2270	/*
	2271	* Lookup, hold, and attempt to lock (pmap,pindex).
	2272	*
	2273	* If the entry does not exist NULL is returned and *errorp is set to 0
	2274	*
	2275	* If the entry exists and could be successfully locked it is returned and
	2276	* errorp is set to 0.
	2277	*
	2278	* If the entry exists but could NOT be successfully locked it is returned
	2279	* held and *errorp is set to 1.
	2280	*/
	2281	static
	2282	pv_entry_t
	2283	pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp)
	2284	{
	2285	pv_entry_t pv;
	2286
	2287	spin_lock(&pmap->pm_spin);
	2288	if ((pv = pmap->pm_pvhint) == NULL \|\| pv->pv_pindex != pindex)
	2289	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
	2290	if (pv == NULL) {
	2291	spin_unlock(&pmap->pm_spin);
	2292	*errorp = 0;
	2293	return NULL;
	2294	}
	2295	if (pv_hold_try(pv)) {
	2296	pv_cache(pv, pindex);
	2297	spin_unlock(&pmap->pm_spin);
	2298	*errorp = 0;
	2299	return(pv); /* lock succeeded */
	2300	}
	2301	spin_unlock(&pmap->pm_spin);
	2302	*errorp = 1;
	2303	return (pv); /* lock failed */
	2304	}
	2305
	2306	/*
	2307	* Find the requested PV entry, returning a held pv or NULL
	2308	*/
	2309	static
	2310	pv_entry_t
	2311	pv_find(pmap_t pmap, vm_pindex_t pindex)
	2312	{
	2313	pv_entry_t pv;
	2314
	2315	spin_lock(&pmap->pm_spin);
	2316
	2317	if ((pv = pmap->pm_pvhint) == NULL \|\| pv->pv_pindex != pindex)
	2318	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
	2319	if (pv == NULL) {
	2320	spin_unlock(&pmap->pm_spin);
	2321	return NULL;
	2322	}
	2323	pv_hold(pv);
	2324	pv_cache(pv, pindex);
	2325	spin_unlock(&pmap->pm_spin);
	2326	return(pv);
	2327	}
	2328
	2329	/*
	2330	* Lock a held pv, keeping the hold count
	2331	*/
	2332	static
	2333	void
	2334	_pv_lock(pv_entry_t pv PMAP_DEBUG_DECL)
	2335	{
	2336	u_int count;
	2337
	2338	for (;;) {
	2339	count = pv->pv_hold;
	2340	cpu_ccfence();
	2341	if ((count & PV_HOLD_LOCKED) == 0) {
	2342	if (atomic_cmpset_int(&pv->pv_hold, count,
	2343	count \| PV_HOLD_LOCKED)) {
	2344	#ifdef PMAP_DEBUG
	2345	pv->pv_func = func;
	2346	pv->pv_line = lineno;
	2347	#endif
	2348	return;
	2349	}
	2350	continue;
	2351	}
	2352	tsleep_interlock(pv, 0);
	2353	if (atomic_cmpset_int(&pv->pv_hold, count,
	2354	count \| PV_HOLD_WAITING)) {
	2355	#ifdef PMAP_DEBUG
	2356	kprintf("pv waiting on %s:%d\n",
	2357	pv->pv_func, pv->pv_line);
	2358	#endif
	2359	tsleep(pv, PINTERLOCKED, "pvwait", hz);
	2360	}
	2361	/* retry */
	2362	}
	2363	}
	2364
	2365	/*
	2366	* Unlock a held and locked pv, keeping the hold count.
	2367	*/
	2368	static
	2369	void
	2370	pv_unlock(pv_entry_t pv)
	2371	{
	2372	u_int count;
	2373
	2374	if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED \| 1, 1))
	2375	return;
	2376
	2377	for (;;) {
	2378	count = pv->pv_hold;
	2379	cpu_ccfence();
	2380	KKASSERT((count & (PV_HOLD_LOCKED\|PV_HOLD_MASK)) >=
	2381	(PV_HOLD_LOCKED \| 1));
	2382	if (atomic_cmpset_int(&pv->pv_hold, count,
	2383	count &
	2384	~(PV_HOLD_LOCKED \| PV_HOLD_WAITING))) {
	2385	if (count & PV_HOLD_WAITING)
	2386	wakeup(pv);
	2387	break;
	2388	}
	2389	}
	2390	}
	2391
	2392	/*
	2393	* Unlock and drop a pv. If the pv is no longer associated with a pmap
	2394	* and the hold count drops to zero we will free it.
	2395	*
	2396	* Caller should not hold any spin locks. We are protected from hold races
	2397	* by virtue of holds only occuring only with a pmap_spin or vm_page_spin
	2398	* lock held. A pv cannot be located otherwise.
	2399	*/
	2400	static
	2401	void
	2402	pv_put(pv_entry_t pv)
	2403	{
	2404	if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED \| 1, 0)) {
	2405	if (pv->pv_pmap == NULL)
	2406	zfree(pvzone, pv);
	2407	return;
	2408	}
	2409	pv_unlock(pv);
	2410	pv_drop(pv);
	2411	}
	2412
	2413	/*
	2414	* Unlock, drop, and free a pv, destroying it. The pv is removed from its
	2415	* pmap. Any pte operations must have already been completed.
	2416	*/
	2417	static
	2418	void
	2419	pv_free(pv_entry_t pv)
	2420	{
	2421	pmap_t pmap;
	2422
	2423	KKASSERT(pv->pv_m == NULL);
	2424	if ((pmap = pv->pv_pmap) != NULL) {
	2425	spin_lock(&pmap->pm_spin);
	2426	pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
	2427	if (pmap->pm_pvhint == pv)
	2428	pmap->pm_pvhint = NULL;
	2429	atomic_add_long(&pmap->pm_stats.resident_count, -1);
	2430	pv->pv_pmap = NULL;
	2431	pv->pv_pindex = 0;
	2432	spin_unlock(&pmap->pm_spin);
	2433	}
	2434	pv_put(pv);
	2435	}
	2436
	2437	/*
	2438	* This routine is very drastic, but can save the system
	2439	* in a pinch.
	2440	*/
	2441	void
	2442	pmap_collect(void)
	2443	{
	2444	int i;
	2445	vm_page_t m;
	2446	static int warningdone=0;
	2447
	2448	if (pmap_pagedaemon_waken == 0)
	2449	return;
	2450	pmap_pagedaemon_waken = 0;
	2451	if (warningdone < 5) {
	2452	kprintf("pmap_collect: collecting pv entries -- "
	2453	"suggest increasing PMAP_SHPGPERPROC\n");
	2454	warningdone++;
	2455	}
	2456
	2457	for (i = 0; i < vm_page_array_size; i++) {
	2458	m = &vm_page_array[i];
	2459	if (m->wire_count \|\| m->hold_count)
	2460	continue;
	2461	if (vm_page_busy_try(m, TRUE) == 0) {
	2462	if (m->wire_count == 0 && m->hold_count == 0) {
	2463	pmap_remove_all(m);
	2464	}
	2465	vm_page_wakeup(m);
	2466	}
	2467	}
	2468	}
	2469
	2470	/*
	2471	* Scan the pmap for active page table entries and issue a callback.
	2472	* The callback must dispose of pte_pv.
	2473	*
	2474	* NOTE: Unmanaged page table entries will not have a pte_pv
	2475	*
	2476	* NOTE: Kernel page table entries will not have a pt_pv. That is, wiring
	2477	* counts are not tracked in kernel page table pages.
	2478	*
	2479	* It is assumed that the start and end are properly rounded to the page size.
	2480	*/
	2481	static void
	2482	pmap_scan(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva,
	2483	void (func)(pmap_t, struct pmap_inval_info ,
	2484	pv_entry_t, pv_entry_t, vm_offset_t,
	2485	pt_entry_t , void ),
	2486	void *arg)
	2487	{
	2488	pv_entry_t pdp_pv; /* A page directory page PV */
	2489	pv_entry_t pd_pv; /* A page directory PV */
	2490	pv_entry_t pt_pv; /* A page table PV */
	2491	pv_entry_t pte_pv; /* A page table entry PV */
	2492	pt_entry_t *ptep;
	2493	vm_offset_t va_next;
	2494	struct pmap_inval_info info;
	2495	int error;
	2496
	2497	if (pmap == NULL)
	2498	return;
	2499
	2500	/*
	2501	* Hold the token for stability; if the pmap is empty we have nothing
	2502	* to do.
	2503	*/
	2504	lwkt_gettoken(&pmap->pm_token);
	2505	#if 0
	2506	if (pmap->pm_stats.resident_count == 0) {
	2507	lwkt_reltoken(&pmap->pm_token);
	2508	return;
	2509	}
	2510	#endif
	2511
	2512	pmap_inval_init(&info);
	2513
	2514	/*
	2515	* Special handling for removing one page, which is a very common
	2516	* operation (it is?).
	2517	* NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4
	2518	*/
	2519	if (sva + PAGE_SIZE == eva) {
	2520	if (sva >= VM_MAX_USER_ADDRESS) {
	2521	/*
	2522	* Kernel mappings do not track wire counts on
	2523	* page table pages.
	2524	*/
	2525	pt_pv = NULL;
	2526	pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
	2527	ptep = vtopte(sva);
	2528	} else {
	2529	/*
	2530	* User mappings may or may not have a pte_pv but
	2531	* will always have a pt_pv if the page is present.
	2532	*/
	2533	pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
	2534	pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
	2535	if (pt_pv == NULL) {
	2536	KKASSERT(pte_pv == NULL);
	2537	goto fast_skip;
	2538	}
	2539	ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
	2540	}
	2541	if (*ptep == 0) {
	2542	/*
	2543	* Unlike the pv_find() case below we actually
	2544	* acquired a locked pv in this case so any
	2545	* race should have been resolved. It is expected
	2546	* to not exist.
	2547	*/
	2548	KKASSERT(pte_pv == NULL);
	2549	} else if (pte_pv) {
	2550	KASSERT((*ptep & (PG_MANAGED\|PG_V)) == (PG_MANAGED\|
	2551	PG_V),
	2552	("bad *ptep %016lx sva %016lx pte_pv %p",
	2553	*ptep, sva, pte_pv));
	2554	func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg);
	2555	} else {
	2556	KASSERT((*ptep & (PG_MANAGED\|PG_V)) == PG_V,
	2557	("bad *ptep %016lx sva %016lx pte_pv NULL",
	2558	*ptep, sva));
	2559	func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg);
	2560	}
	2561	if (pt_pv)
	2562	pv_put(pt_pv);
	2563	fast_skip:
	2564	pmap_inval_done(&info);
	2565	lwkt_reltoken(&pmap->pm_token);
	2566	return;
	2567	}
	2568
	2569	/*
	2570	* NOTE: kernel mappings do not track page table pages, only
	2571	* terminal pages.
	2572	*
	2573	* NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4.
	2574	* However, for the scan to be efficient we try to
	2575	* cache items top-down.
	2576	*/
	2577	pdp_pv = NULL;
	2578	pd_pv = NULL;
	2579	pt_pv = NULL;
	2580
	2581	for (; sva < eva; sva = va_next) {
	2582	lwkt_yield();
	2583	if (sva >= VM_MAX_USER_ADDRESS) {
	2584	if (pt_pv) {
	2585	pv_put(pt_pv);
	2586	pt_pv = NULL;
	2587	}
	2588	goto kernel_skip;
	2589	}
	2590
	2591	/*
	2592	* PDP cache
	2593	*/
	2594	if (pdp_pv == NULL) {
	2595	pdp_pv = pv_get(pmap, pmap_pdp_pindex(sva));
	2596	} else if (pdp_pv->pv_pindex != pmap_pdp_pindex(sva)) {
	2597	pv_put(pdp_pv);
	2598	pdp_pv = pv_get(pmap, pmap_pdp_pindex(sva));
	2599	}
	2600	if (pdp_pv == NULL) {
	2601	va_next = (sva + NBPML4) & ~PML4MASK;
	2602	if (va_next < sva)
	2603	va_next = eva;
	2604	continue;
	2605	}
	2606
	2607	/*
	2608	* PD cache
	2609	*/
	2610	if (pd_pv == NULL) {
	2611	if (pdp_pv) {
	2612	pv_put(pdp_pv);
	2613	pdp_pv = NULL;
	2614	}
	2615	pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
	2616	} else if (pd_pv->pv_pindex != pmap_pd_pindex(sva)) {
	2617	pv_put(pd_pv);
	2618	if (pdp_pv) {
	2619	pv_put(pdp_pv);
	2620	pdp_pv = NULL;
	2621	}
	2622	pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
	2623	}
	2624	if (pd_pv == NULL) {
	2625	va_next = (sva + NBPDP) & ~PDPMASK;
	2626	if (va_next < sva)
	2627	va_next = eva;
	2628	continue;
	2629	}
	2630
	2631	/*
	2632	* PT cache
	2633	*/
	2634	if (pt_pv == NULL) {
	2635	if (pdp_pv) {
	2636	pv_put(pdp_pv);
	2637	pdp_pv = NULL;
	2638	}
	2639	if (pd_pv) {
	2640	pv_put(pd_pv);
	2641	pd_pv = NULL;
	2642	}
	2643	pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
	2644	} else if (pt_pv->pv_pindex != pmap_pt_pindex(sva)) {
	2645	if (pdp_pv) {
	2646	pv_put(pdp_pv);
	2647	pdp_pv = NULL;
	2648	}
	2649	if (pd_pv) {
	2650	pv_put(pd_pv);
	2651	pd_pv = NULL;
	2652	}
	2653	pv_put(pt_pv);
	2654	pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
	2655	}
	2656
	2657	/*
	2658	* We will scan or skip a page table page so adjust va_next
	2659	* either way.
	2660	*/
	2661	if (pt_pv == NULL) {
	2662	va_next = (sva + NBPDR) & ~PDRMASK;
	2663	if (va_next < sva)
	2664	va_next = eva;
	2665	continue;
	2666	}
	2667
	2668	/*
	2669	* From this point in the loop testing pt_pv for non-NULL
	2670	* means we are in UVM, else if it is NULL we are in KVM.
	2671	*/
	2672	kernel_skip:
	2673	va_next = (sva + NBPDR) & ~PDRMASK;
	2674	if (va_next < sva)
	2675	va_next = eva;
	2676
	2677	/*
	2678	* Limit our scan to either the end of the va represented
	2679	* by the current page table page, or to the end of the
	2680	* range being removed.
	2681	*
	2682	* Scan the page table for pages. Some pages may not be
	2683	* managed (might not have a pv_entry).
	2684	*
	2685	* There is no page table management for kernel pages so
	2686	* pt_pv will be NULL in that case, but otherwise pt_pv
	2687	* is non-NULL, locked, and referenced.
	2688	*/
	2689	if (va_next > eva)
	2690	va_next = eva;
	2691
	2692	/*
	2693	* At this point a non-NULL pt_pv means a UVA, and a NULL
	2694	* pt_pv means a KVA.
	2695	*/
	2696	if (pt_pv)
	2697	ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
	2698	else
	2699	ptep = vtopte(sva);
	2700
	2701	while (sva < va_next) {
	2702	/*
	2703	* Acquire the related pte_pv, if any. If *ptep == 0
	2704	* the related pte_pv should not exist, but if *ptep
	2705	* is not zero the pte_pv may or may not exist (e.g.
	2706	* will not exist for an unmanaged page).
	2707	*
	2708	* However a multitude of races are possible here.
	2709	*
	2710	* In addition, the (pt_pv, pte_pv) lock order is
	2711	* backwards, so we have to be careful in aquiring
	2712	* a properly locked pte_pv.
	2713	*/
	2714	lwkt_yield();
	2715	if (pt_pv) {
	2716	pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva),
	2717	&error);
	2718	if (error) {
	2719	if (pdp_pv) {
	2720	pv_put(pdp_pv);
	2721	pdp_pv = NULL;
	2722	}
	2723	if (pd_pv) {
	2724	pv_put(pd_pv);
	2725	pd_pv = NULL;
	2726	}
	2727	pv_put(pt_pv); /* must be non-NULL */
	2728	pt_pv = NULL;
	2729	pv_lock(pte_pv); /* safe to block now */
	2730	pv_put(pte_pv);
	2731	pte_pv = NULL;
	2732	pt_pv = pv_get(pmap,
	2733	pmap_pt_pindex(sva));
	2734	continue;
	2735	}
	2736	} else {
	2737	pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
	2738	}
	2739
	2740	/*
	2741	* Ok, if *ptep == 0 we had better NOT have a pte_pv.
	2742	*/
	2743	if (*ptep == 0) {
	2744	if (pte_pv) {
	2745	kprintf("Unexpected non-NULL pte_pv "
	2746	"%p pt_pv %p *ptep = %016lx\n",
	2747	pte_pv, pt_pv, *ptep);
	2748	panic("Unexpected non-NULL pte_pv");
	2749	}
	2750	sva += PAGE_SIZE;
	2751	++ptep;
	2752	continue;
	2753	}
	2754
	2755	/*
	2756	* Ready for the callback. The locked pte_pv (if any)
	2757	* is consumed by the callback. pte_pv will exist if
	2758	* the page is managed, and will not exist if it
	2759	* isn't.
	2760	*/
	2761	if (pte_pv) {
	2762	KASSERT((*ptep & (PG_MANAGED\|PG_V)) ==
	2763	(PG_MANAGED\|PG_V),
	2764	("bad *ptep %016lx sva %016lx "
	2765	"pte_pv %p",
	2766	*ptep, sva, pte_pv));
	2767	func(pmap, &info, pte_pv, pt_pv, sva,
	2768	ptep, arg);
	2769	} else {
	2770	KASSERT((*ptep & (PG_MANAGED\|PG_V)) ==
	2771	PG_V,
	2772	("bad *ptep %016lx sva %016lx "
	2773	"pte_pv NULL",
	2774	*ptep, sva));
	2775	func(pmap, &info, pte_pv, pt_pv, sva,
	2776	ptep, arg);
	2777	}
	2778	pte_pv = NULL;
	2779	sva += PAGE_SIZE;
	2780	++ptep;
	2781	}
	2782	}
	2783	if (pdp_pv) {
	2784	pv_put(pdp_pv);
	2785	pdp_pv = NULL;
	2786	}
	2787	if (pd_pv) {
	2788	pv_put(pd_pv);
	2789	pd_pv = NULL;
	2790	}
	2791	if (pt_pv) {
	2792	pv_put(pt_pv);
	2793	pt_pv = NULL;
	2794	}
	2795	pmap_inval_done(&info);
	2796	lwkt_reltoken(&pmap->pm_token);
	2797	}
	2798
	2799	void
	2800	pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
	2801	{
	2802	pmap_scan(pmap, sva, eva, pmap_remove_callback, NULL);
	2803	}
	2804
	2805	static void
	2806	pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
	2807	pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
	2808	pt_entry_t ptep, void arg __unused)
	2809	{
	2810	pt_entry_t pte;
	2811
	2812	if (pte_pv) {
	2813	/*
	2814	* This will also drop pt_pv's wire_count. Note that
	2815	* terminal pages are not wired based on mmu presence.
	2816	*/
	2817	pmap_remove_pv_pte(pte_pv, pt_pv, info);
	2818	pmap_remove_pv_page(pte_pv);
	2819	pv_free(pte_pv);
	2820	} else {
	2821	/*
	2822	* pt_pv's wire_count is still bumped by unmanaged pages
	2823	* so we must decrement it manually.
	2824	*/
	2825	pmap_inval_interlock(info, pmap, va);
	2826	pte = pte_load_clear(ptep);
	2827	pmap_inval_deinterlock(info, pmap);
	2828	if (pte & PG_W)
	2829	atomic_add_long(&pmap->pm_stats.wired_count, -1);
	2830	atomic_add_long(&pmap->pm_stats.resident_count, -1);
	2831	if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m))
	2832	panic("pmap_remove: insufficient wirecount");
	2833	}
	2834	}
	2835
	2836	/*
	2837	* Removes this physical page from all physical maps in which it resides.
	2838	* Reflects back modify bits to the pager.
	2839	*
	2840	* This routine may not be called from an interrupt.
	2841	*/
	2842	static
	2843	void
	2844	pmap_remove_all(vm_page_t m)
	2845	{
	2846	struct pmap_inval_info info;
	2847	pv_entry_t pv;
	2848
	2849	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	2850	return;
	2851
	2852	pmap_inval_init(&info);
	2853	vm_page_spin_lock(m);
	2854	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
	2855	KKASSERT(pv->pv_m == m);
	2856	if (pv_hold_try(pv)) {
	2857	vm_page_spin_unlock(m);
	2858	} else {
	2859	vm_page_spin_unlock(m);
	2860	pv_lock(pv);
	2861	if (pv->pv_m != m) {
	2862	pv_put(pv);
	2863	vm_page_spin_lock(m);
	2864	continue;
	2865	}
	2866	}
	2867	/*
	2868	* Holding no spinlocks, pv is locked.
	2869	*/
	2870	pmap_remove_pv_pte(pv, NULL, &info);
	2871	pmap_remove_pv_page(pv);
	2872	pv_free(pv);
	2873	vm_page_spin_lock(m);
	2874	}
	2875	KKASSERT((m->flags & (PG_MAPPED\|PG_WRITEABLE)) == 0);
	2876	vm_page_spin_unlock(m);
	2877	pmap_inval_done(&info);
	2878	}
	2879
	2880	/*
	2881	* pmap_protect:
	2882	*
	2883	* Set the physical protection on the specified range of this map
	2884	* as requested.
	2885	*
	2886	* This function may not be called from an interrupt if the map is
	2887	* not the kernel_pmap.
	2888	*/
	2889	void
	2890	pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
	2891	{
	2892	/* JG review for NX */
	2893
	2894	if (pmap == NULL)
	2895	return;
	2896	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
	2897	pmap_remove(pmap, sva, eva);
	2898	return;
	2899	}
	2900	if (prot & VM_PROT_WRITE)
	2901	return;
	2902	pmap_scan(pmap, sva, eva, pmap_protect_callback, &prot);
	2903	}
	2904
	2905	static
	2906	void
	2907	pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info,
	2908	pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
	2909	pt_entry_t ptep, void arg __unused)
	2910	{
	2911	pt_entry_t pbits;
	2912	pt_entry_t cbits;
	2913	vm_page_t m;
	2914
	2915	/*
	2916	* XXX non-optimal.
	2917	*/
	2918	pmap_inval_interlock(info, pmap, va);
	2919	again:
	2920	pbits = *ptep;
	2921	cbits = pbits;
	2922	if (pte_pv) {
	2923	m = NULL;
	2924	if (pbits & PG_A) {
	2925	m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
	2926	KKASSERT(m == pte_pv->pv_m);
	2927	vm_page_flag_set(m, PG_REFERENCED);
	2928	cbits &= ~PG_A;
	2929	}
	2930	if (pbits & PG_M) {
	2931	if (pmap_track_modified(pte_pv->pv_pindex)) {
	2932	if (m == NULL)
	2933	m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
	2934	vm_page_dirty(m);
	2935	cbits &= ~PG_M;
	2936	}
	2937	}
	2938	}
	2939	cbits &= ~PG_RW;
	2940	if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) {
	2941	goto again;
	2942	}
	2943	pmap_inval_deinterlock(info, pmap);
	2944	if (pte_pv)
	2945	pv_put(pte_pv);
	2946	}
	2947
	2948	/*
	2949	* Insert the vm_page (m) at the virtual address (va), replacing any prior
	2950	* mapping at that address. Set protection and wiring as requested.
	2951	*
	2952	* NOTE: This routine MUST insert the page into the pmap now, it cannot
	2953	* lazy-evaluate.
	2954	*/
	2955	void
	2956	pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
	2957	boolean_t wired)
	2958	{
	2959	pmap_inval_info info;
	2960	pv_entry_t pt_pv; /* page table */
	2961	pv_entry_t pte_pv; /* page table entry */
	2962	pt_entry_t *ptep;
	2963	vm_paddr_t opa;
	2964	pt_entry_t origpte, newpte;
	2965	vm_paddr_t pa;
	2966
	2967	if (pmap == NULL)
	2968	return;
	2969	va = trunc_page(va);
	2970	#ifdef PMAP_DIAGNOSTIC
	2971	if (va >= KvaEnd)
	2972	panic("pmap_enter: toobig");
	2973	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
	2974	panic("pmap_enter: invalid to pmap_enter page table "
	2975	"pages (va: 0x%lx)", va);
	2976	#endif
	2977	if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
	2978	kprintf("Warning: pmap_enter called on UVA with "
	2979	"kernel_pmap\n");
	2980	#ifdef DDB
	2981	db_print_backtrace();
	2982	#endif
	2983	}
	2984	if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
	2985	kprintf("Warning: pmap_enter called on KVA without"
	2986	"kernel_pmap\n");
	2987	#ifdef DDB
	2988	db_print_backtrace();
	2989	#endif
	2990	}
	2991
	2992	/*
	2993	* Get locked PV entries for our new page table entry (pte_pv)
	2994	* and for its parent page table (pt_pv). We need the parent
	2995	* so we can resolve the location of the ptep.
	2996	*
	2997	* Only hardware MMU actions can modify the ptep out from
	2998	* under us.
	2999	*
	3000	* if (m) is fictitious or unmanaged we do not create a managing
	3001	* pte_pv for it. Any pre-existing page's management state must
	3002	* match (avoiding code complexity).
	3003	*
	3004	* If the pmap is still being initialized we assume existing
	3005	* page tables.
	3006	*
	3007	* Kernel mapppings do not track page table pages (i.e. pt_pv).
	3008	* pmap_allocpte() checks the
	3009	*/
	3010	if (pmap_initialized == FALSE) {
	3011	pte_pv = NULL;
	3012	pt_pv = NULL;
	3013	ptep = vtopte(va);
	3014	} else if (m->flags & (PG_FICTITIOUS \| PG_UNMANAGED)) {
	3015	pte_pv = NULL;
	3016	if (va >= VM_MAX_USER_ADDRESS) {
	3017	pt_pv = NULL;
	3018	ptep = vtopte(va);
	3019	} else {
	3020	pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL);
	3021	ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
	3022	}
	3023	KKASSERT(ptep == 0 \|\| (ptep & PG_MANAGED) == 0);
	3024	} else {
	3025	if (va >= VM_MAX_USER_ADDRESS) {
	3026	pt_pv = NULL;
	3027	pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL);
	3028	ptep = vtopte(va);
	3029	} else {
	3030	pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va),
	3031	&pt_pv);
	3032	ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
	3033	}
	3034	KKASSERT(ptep == 0 \|\| (ptep & PG_MANAGED));
	3035	}
	3036
	3037	pa = VM_PAGE_TO_PHYS(m);
	3038	origpte = *ptep;
	3039	opa = origpte & PG_FRAME;
	3040
	3041	newpte = (pt_entry_t)(pa \| pte_prot(pmap, prot) \| PG_V \| PG_A);
	3042	if (wired)
	3043	newpte \|= PG_W;
	3044	if (va < VM_MAX_USER_ADDRESS)
	3045	newpte \|= PG_U;
	3046	if (pte_pv)
	3047	newpte \|= PG_MANAGED;
	3048	if (pmap == &kernel_pmap)
	3049	newpte \|= pgeflag;
	3050
	3051	/*
	3052	* It is possible for multiple faults to occur in threaded
	3053	* environments, the existing pte might be correct.
	3054	*/
	3055	if (((origpte ^ newpte) & ~(pt_entry_t)(PG_M\|PG_A)) == 0)
	3056	goto done;
	3057
	3058	if ((prot & VM_PROT_NOSYNC) == 0)
	3059	pmap_inval_init(&info);
	3060
	3061	/*
	3062	* Ok, either the address changed or the protection or wiring
	3063	* changed.
	3064	*
	3065	* Clear the current entry, interlocking the removal. For managed
	3066	* pte's this will also flush the modified state to the vm_page.
	3067	* Atomic ops are mandatory in order to ensure that PG_M events are
	3068	* not lost during any transition.
	3069	*/
	3070	if (opa) {
	3071	if (pte_pv) {
	3072	/*
	3073	* pmap_remove_pv_pte() unwires pt_pv and assumes
	3074	* we will free pte_pv, but since we are reusing
	3075	* pte_pv we want to retain the wire count.
	3076	*
	3077	* pt_pv won't exist for a kernel page (managed or
	3078	* otherwise).
	3079	*/
	3080	if (pt_pv)
	3081	vm_page_wire_quick(pt_pv->pv_m);
	3082	if (prot & VM_PROT_NOSYNC)
	3083	pmap_remove_pv_pte(pte_pv, pt_pv, NULL);
	3084	else
	3085	pmap_remove_pv_pte(pte_pv, pt_pv, &info);
	3086	if (pte_pv->pv_m)
	3087	pmap_remove_pv_page(pte_pv);
	3088	} else if (prot & VM_PROT_NOSYNC) {
	3089	/* leave wire count on PT page intact */
	3090	(void)pte_load_clear(ptep);
	3091	cpu_invlpg((void *)va);
	3092	atomic_add_long(&pmap->pm_stats.resident_count, -1);
	3093	} else {
	3094	/* leave wire count on PT page intact */
	3095	pmap_inval_interlock(&info, pmap, va);
	3096	(void)pte_load_clear(ptep);
	3097	pmap_inval_deinterlock(&info, pmap);
	3098	atomic_add_long(&pmap->pm_stats.resident_count, -1);
	3099	}
	3100	KKASSERT(*ptep == 0);
	3101	}
	3102
	3103	if (pte_pv) {
	3104	/*
	3105	* Enter on the PV list if part of our managed memory.
	3106	* Wiring of the PT page is already handled.
	3107	*/
	3108	KKASSERT(pte_pv->pv_m == NULL);
	3109	vm_page_spin_lock(m);
	3110	pte_pv->pv_m = m;
	3111	TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list);
	3112	/*
	3113	if (m->object)
	3114	atomic_add_int(&m->object->agg_pv_list_count, 1);
	3115	*/
	3116	vm_page_flag_set(m, PG_MAPPED);
	3117	vm_page_spin_unlock(m);
	3118	} else if (pt_pv && opa == 0) {
	3119	/*
	3120	* We have to adjust the wire count on the PT page ourselves
	3121	* for unmanaged entries. If opa was non-zero we retained
	3122	* the existing wire count from the removal.
	3123	*/
	3124	vm_page_wire_quick(pt_pv->pv_m);
	3125	}
	3126
	3127	/*
	3128	* Ok, for UVM (pt_pv != NULL) we don't need to interlock or
	3129	* invalidate anything, the TLB won't have any stale entries to
	3130	* remove.
	3131	*
	3132	* For KVM there appear to still be issues. Theoretically we
	3133	* should be able to scrap the interlocks entirely but we
	3134	* get crashes.
	3135	*/
	3136	if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL)
	3137	pmap_inval_interlock(&info, pmap, va);
	3138	(volatile pt_entry_t )ptep = newpte;
	3139
	3140	if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL)
	3141	pmap_inval_deinterlock(&info, pmap);
	3142	else if (pt_pv == NULL)
	3143	cpu_invlpg((void *)va);
	3144
	3145	if (wired)
	3146	atomic_add_long(&pmap->pm_stats.wired_count, 1);
	3147	if (newpte & PG_RW)
	3148	vm_page_flag_set(m, PG_WRITEABLE);
	3149	if (pte_pv == NULL)
	3150	atomic_add_long(&pmap->pm_stats.resident_count, 1);
	3151
	3152	/*
	3153	* Cleanup
	3154	*/
	3155	if ((prot & VM_PROT_NOSYNC) == 0 \|\| pte_pv == NULL)
	3156	pmap_inval_done(&info);
	3157	done:
	3158	KKASSERT((newpte & PG_MANAGED) == 0 \|\| (m->flags & PG_MAPPED));
	3159
	3160	/*
	3161	* Cleanup the pv entry, allowing other accessors.
	3162	*/
	3163	if (pte_pv)
	3164	pv_put(pte_pv);
	3165	if (pt_pv)
	3166	pv_put(pt_pv);
	3167	}
	3168
	3169	/*
	3170	* This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
	3171	* This code also assumes that the pmap has no pre-existing entry for this
	3172	* VA.
	3173	*
	3174	* This code currently may only be used on user pmaps, not kernel_pmap.
	3175	*/
	3176	void
	3177	pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
	3178	{
	3179	pmap_enter(pmap, va, m, VM_PROT_READ, FALSE);
	3180	}
	3181
	3182	/*
	3183	* Make a temporary mapping for a physical address. This is only intended
	3184	* to be used for panic dumps.
	3185	*
	3186	* The caller is responsible for calling smp_invltlb().
	3187	*/
	3188	void *
	3189	pmap_kenter_temporary(vm_paddr_t pa, long i)
	3190	{
	3191	pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
	3192	return ((void *)crashdumpmap);
	3193	}
	3194
	3195	#define MAX_INIT_PT (96)
	3196
	3197	/*
	3198	* This routine preloads the ptes for a given object into the specified pmap.
	3199	* This eliminates the blast of soft faults on process startup and
	3200	* immediately after an mmap.
	3201	*/
	3202	static int pmap_object_init_pt_callback(vm_page_t p, void *data);
	3203
	3204	void
	3205	pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
	3206	vm_object_t object, vm_pindex_t pindex,
	3207	vm_size_t size, int limit)
	3208	{
	3209	struct rb_vm_page_scan_info info;
	3210	struct lwp *lp;
	3211	vm_size_t psize;
	3212
	3213	/*
	3214	* We can't preinit if read access isn't set or there is no pmap
	3215	* or object.
	3216	*/
	3217	if ((prot & VM_PROT_READ) == 0 \|\| pmap == NULL \|\| object == NULL)
	3218	return;
	3219
	3220	/*
	3221	* We can't preinit if the pmap is not the current pmap
	3222	*/
	3223	lp = curthread->td_lwp;
	3224	if (lp == NULL \|\| pmap != vmspace_pmap(lp->lwp_vmspace))
	3225	return;
	3226
	3227	psize = x86_64_btop(size);
	3228
	3229	if ((object->type != OBJT_VNODE) \|\|
	3230	((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
	3231	(object->resident_page_count > MAX_INIT_PT))) {
	3232	return;
	3233	}
	3234
	3235	if (pindex + psize > object->size) {
	3236	if (object->size < pindex)
	3237	return;
	3238	psize = object->size - pindex;
	3239	}
	3240
	3241	if (psize == 0)
	3242	return;
	3243
	3244	/*
	3245	* Use a red-black scan to traverse the requested range and load
	3246	* any valid pages found into the pmap.
	3247	*
	3248	* We cannot safely scan the object's memq without holding the
	3249	* object token.
	3250	*/
	3251	info.start_pindex = pindex;
	3252	info.end_pindex = pindex + psize - 1;
	3253	info.limit = limit;
	3254	info.mpte = NULL;
	3255	info.addr = addr;
	3256	info.pmap = pmap;
	3257
	3258	vm_object_hold_shared(object);
	3259	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
	3260	pmap_object_init_pt_callback, &info);
	3261	vm_object_drop(object);
	3262	}
	3263
	3264	static
	3265	int
	3266	pmap_object_init_pt_callback(vm_page_t p, void *data)
	3267	{
	3268	struct rb_vm_page_scan_info *info = data;
	3269	vm_pindex_t rel_index;
	3270
	3271	/*
	3272	* don't allow an madvise to blow away our really
	3273	* free pages allocating pv entries.
	3274	*/
	3275	if ((info->limit & MAP_PREFAULT_MADVISE) &&
	3276	vmstats.v_free_count < vmstats.v_free_reserved) {
	3277	return(-1);
	3278	}
	3279	if (vm_page_busy_try(p, TRUE))
	3280	return 0;
	3281	if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
	3282	(p->flags & PG_FICTITIOUS) == 0) {
	3283	if ((p->queue - p->pc) == PQ_CACHE)
	3284	vm_page_deactivate(p);
	3285	rel_index = p->pindex - info->start_pindex;
	3286	pmap_enter_quick(info->pmap,
	3287	info->addr + x86_64_ptob(rel_index), p);
	3288	}
	3289	vm_page_wakeup(p);
	3290	lwkt_yield();
	3291	return(0);
	3292	}
	3293
	3294	/*
	3295	* Return TRUE if the pmap is in shape to trivially pre-fault the specified
	3296	* address.
	3297	*
	3298	* Returns FALSE if it would be non-trivial or if a pte is already loaded
	3299	* into the slot.
	3300	*
	3301	* XXX This is safe only because page table pages are not freed.
	3302	*/
	3303	int
	3304	pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
	3305	{
	3306	pt_entry_t *pte;
	3307
	3308	/spin_lock(&pmap->pm_spin);/
	3309	if ((pte = pmap_pte(pmap, addr)) != NULL) {
	3310	if (*pte & PG_V) {
	3311	/spin_unlock(&pmap->pm_spin);/
	3312	return FALSE;
	3313	}
	3314	}
	3315	/spin_unlock(&pmap->pm_spin);/
	3316	return TRUE;
	3317	}
	3318
	3319	/*
	3320	* Change the wiring attribute for a pmap/va pair. The mapping must already
	3321	* exist in the pmap. The mapping may or may not be managed.
	3322	*/
	3323	void
	3324	pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
	3325	{
	3326	pt_entry_t *ptep;
	3327	pv_entry_t pv;
	3328
	3329	if (pmap == NULL)
	3330	return;
	3331	lwkt_gettoken(&pmap->pm_token);
	3332	pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL);
	3333	ptep = pv_pte_lookup(pv, pmap_pte_index(va));
	3334
	3335	if (wired && !pmap_pte_w(ptep))
	3336	atomic_add_long(&pmap->pm_stats.wired_count, 1);
	3337	else if (!wired && pmap_pte_w(ptep))
	3338	atomic_add_long(&pmap->pm_stats.wired_count, -1);
	3339
	3340	/*
	3341	* Wiring is not a hardware characteristic so there is no need to
	3342	* invalidate TLB. However, in an SMP environment we must use
	3343	* a locked bus cycle to update the pte (if we are not using
	3344	* the pmap_inval_*() API that is)... it's ok to do this for simple
	3345	* wiring changes.
	3346	*/
	3347	#ifdef SMP
	3348	if (wired)
	3349	atomic_set_long(ptep, PG_W);
	3350	else
	3351	atomic_clear_long(ptep, PG_W);
	3352	#else
	3353	if (wired)
	3354	atomic_set_long_nonlocked(ptep, PG_W);
	3355	else
	3356	atomic_clear_long_nonlocked(ptep, PG_W);
	3357	#endif
	3358	pv_put(pv);
	3359	lwkt_reltoken(&pmap->pm_token);
	3360	}
	3361
	3362
	3363
	3364	/*
	3365	* Copy the range specified by src_addr/len from the source map to
	3366	* the range dst_addr/len in the destination map.
	3367	*
	3368	* This routine is only advisory and need not do anything.
	3369	*/
	3370	void
	3371	pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
	3372	vm_size_t len, vm_offset_t src_addr)
	3373	{
	3374	}
	3375
	3376	/*
	3377	* pmap_zero_page:
	3378	*
	3379	* Zero the specified physical page.
	3380	*
	3381	* This function may be called from an interrupt and no locking is
	3382	* required.
	3383	*/
	3384	void
	3385	pmap_zero_page(vm_paddr_t phys)
	3386	{
	3387	vm_offset_t va = PHYS_TO_DMAP(phys);
	3388
	3389	pagezero((void *)va);
	3390	}
	3391
	3392	/*
	3393	* pmap_page_assertzero:
	3394	*
	3395	* Assert that a page is empty, panic if it isn't.
	3396	*/
	3397	void
	3398	pmap_page_assertzero(vm_paddr_t phys)
	3399	{
	3400	vm_offset_t va = PHYS_TO_DMAP(phys);
	3401	size_t i;
	3402
	3403	for (i = 0; i < PAGE_SIZE; i += sizeof(long)) {
	3404	if ((long )((char *)va + i) != 0) {
	3405	panic("pmap_page_assertzero() @ %p not zero!\n",
	3406	(void *)(intptr_t)va);
	3407	}
	3408	}
	3409	}
	3410
	3411	/*
	3412	* pmap_zero_page:
	3413	*
	3414	* Zero part of a physical page by mapping it into memory and clearing
	3415	* its contents with bzero.
	3416	*
	3417	* off and size may not cover an area beyond a single hardware page.
	3418	*/
	3419	void
	3420	pmap_zero_page_area(vm_paddr_t phys, int off, int size)
	3421	{
	3422	vm_offset_t virt = PHYS_TO_DMAP(phys);
	3423
	3424	bzero((char *)virt + off, size);
	3425	}
	3426
	3427	/*
	3428	* pmap_copy_page:
	3429	*
	3430	* Copy the physical page from the source PA to the target PA.
	3431	* This function may be called from an interrupt. No locking
	3432	* is required.
	3433	*/
	3434	void
	3435	pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
	3436	{
	3437	vm_offset_t src_virt, dst_virt;
	3438
	3439	src_virt = PHYS_TO_DMAP(src);
	3440	dst_virt = PHYS_TO_DMAP(dst);
	3441	bcopy((void )src_virt, (void )dst_virt, PAGE_SIZE);
	3442	}
	3443
	3444	/*
	3445	* pmap_copy_page_frag:
	3446	*
	3447	* Copy the physical page from the source PA to the target PA.
	3448	* This function may be called from an interrupt. No locking
	3449	* is required.
	3450	*/
	3451	void
	3452	pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
	3453	{
	3454	vm_offset_t src_virt, dst_virt;
	3455
	3456	src_virt = PHYS_TO_DMAP(src);
	3457	dst_virt = PHYS_TO_DMAP(dst);
	3458
	3459	bcopy((char *)src_virt + (src & PAGE_MASK),
	3460	(char *)dst_virt + (dst & PAGE_MASK),
	3461	bytes);
	3462	}
	3463
	3464	/*
	3465	* Returns true if the pmap's pv is one of the first 16 pvs linked to from
	3466	* this page. This count may be changed upwards or downwards in the future;
	3467	* it is only necessary that true be returned for a small subset of pmaps
	3468	* for proper page aging.
	3469	*/
	3470	boolean_t
	3471	pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
	3472	{
	3473	pv_entry_t pv;
	3474	int loops = 0;
	3475
	3476	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	3477	return FALSE;
	3478
	3479	vm_page_spin_lock(m);
	3480	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3481	if (pv->pv_pmap == pmap) {
	3482	vm_page_spin_unlock(m);
	3483	return TRUE;
	3484	}
	3485	loops++;
	3486	if (loops >= 16)
	3487	break;
	3488	}
	3489	vm_page_spin_unlock(m);
	3490	return (FALSE);
	3491	}
	3492
	3493	/*
	3494	* Remove all pages from specified address space this aids process exit
	3495	* speeds. Also, this code may be special cased for the current process
	3496	* only.
	3497	*/
	3498	void
	3499	pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
	3500	{
	3501	pmap_remove(pmap, sva, eva);
	3502	}
	3503
	3504	/*
	3505	* pmap_testbit tests bits in pte's note that the testbit/clearbit
	3506	* routines are inline, and a lot of things compile-time evaluate.
	3507	*/
	3508	static
	3509	boolean_t
	3510	pmap_testbit(vm_page_t m, int bit)
	3511	{
	3512	pv_entry_t pv;
	3513	pt_entry_t *pte;
	3514
	3515	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	3516	return FALSE;
	3517
	3518	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
	3519	return FALSE;
	3520	vm_page_spin_lock(m);
	3521	if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
	3522	vm_page_spin_unlock(m);
	3523	return FALSE;
	3524	}
	3525
	3526	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3527	/*
	3528	* if the bit being tested is the modified bit, then
	3529	* mark clean_map and ptes as never
	3530	* modified.
	3531	*/
	3532	if (bit & (PG_A\|PG_M)) {
	3533	if (!pmap_track_modified(pv->pv_pindex))
	3534	continue;
	3535	}
	3536
	3537	#if defined(PMAP_DIAGNOSTIC)
	3538	if (pv->pv_pmap == NULL) {
	3539	kprintf("Null pmap (tb) at pindex: %"PRIu64"\n",
	3540	pv->pv_pindex);
	3541	continue;
	3542	}
	3543	#endif
	3544	pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
	3545	if (*pte & bit) {
	3546	vm_page_spin_unlock(m);
	3547	return TRUE;
	3548	}
	3549	}
	3550	vm_page_spin_unlock(m);
	3551	return (FALSE);
	3552	}
	3553
	3554	/*
	3555	* This routine is used to modify bits in ptes. Only one bit should be
	3556	* specified. PG_RW requires special handling.
	3557	*
	3558	* Caller must NOT hold any spin locks
	3559	*/
	3560	static __inline
	3561	void
	3562	pmap_clearbit(vm_page_t m, int bit)
	3563	{
	3564	struct pmap_inval_info info;
	3565	pv_entry_t pv;
	3566	pt_entry_t *pte;
	3567	pt_entry_t pbits;
	3568	pmap_t save_pmap;
	3569
	3570	if (bit == PG_RW)
	3571	vm_page_flag_clear(m, PG_WRITEABLE);
	3572	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS)) {
	3573	return;
	3574	}
	3575
	3576	/*
	3577	* PG_M or PG_A case
	3578	*
	3579	* Loop over all current mappings setting/clearing as appropos If
	3580	* setting RO do we need to clear the VAC?
	3581	*
	3582	* NOTE: When clearing PG_M we could also (not implemented) drop
	3583	* through to the PG_RW code and clear PG_RW too, forcing
	3584	* a fault on write to redetect PG_M for virtual kernels, but
	3585	* it isn't necessary since virtual kernels invalidate the
	3586	* pte when they clear the VPTE_M bit in their virtual page
	3587	* tables.
	3588	*
	3589	* NOTE: Does not re-dirty the page when clearing only PG_M.
	3590	*/
	3591	if ((bit & PG_RW) == 0) {
	3592	vm_page_spin_lock(m);
	3593	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3594	#if defined(PMAP_DIAGNOSTIC)
	3595	if (pv->pv_pmap == NULL) {
	3596	kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
	3597	pv->pv_pindex);
	3598	continue;
	3599	}
	3600	#endif
	3601	pte = pmap_pte_quick(pv->pv_pmap,
	3602	pv->pv_pindex << PAGE_SHIFT);
	3603	pbits = *pte;
	3604	if (pbits & bit)
	3605	atomic_clear_long(pte, bit);
	3606	}
	3607	vm_page_spin_unlock(m);
	3608	return;
	3609	}
	3610
	3611	/*
	3612	* Clear PG_RW. Also clears PG_M and marks the page dirty if PG_M
	3613	* was set.
	3614	*/
	3615	pmap_inval_init(&info);
	3616
	3617	restart:
	3618	vm_page_spin_lock(m);
	3619	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3620	/*
	3621	* don't write protect pager mappings
	3622	*/
	3623	if (!pmap_track_modified(pv->pv_pindex))
	3624	continue;
	3625
	3626	#if defined(PMAP_DIAGNOSTIC)
	3627	if (pv->pv_pmap == NULL) {
	3628	kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
	3629	pv->pv_pindex);
	3630	continue;
	3631	}
	3632	#endif
	3633	/*
	3634	* Skip pages which do not have PG_RW set.
	3635	*/
	3636	pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
	3637	if ((*pte & PG_RW) == 0)
	3638	continue;
	3639
	3640	/*
	3641	* Lock the PV
	3642	*/
	3643	if (pv_hold_try(pv) == 0) {
	3644	vm_page_spin_unlock(m);
	3645	pv_lock(pv); /* held, now do a blocking lock */
	3646	pv_put(pv); /* and release */
	3647	goto restart; /* anything could have happened */
	3648	}
	3649
	3650	save_pmap = pv->pv_pmap;
	3651	vm_page_spin_unlock(m);
	3652	pmap_inval_interlock(&info, save_pmap,
	3653	(vm_offset_t)pv->pv_pindex << PAGE_SHIFT);
	3654	KKASSERT(pv->pv_pmap == save_pmap);
	3655	for (;;) {
	3656	pbits = *pte;
	3657	cpu_ccfence();
	3658	if (atomic_cmpset_long(pte, pbits,
	3659	pbits & ~(PG_RW\|PG_M))) {
	3660	break;
	3661	}
	3662	}
	3663	pmap_inval_deinterlock(&info, save_pmap);
	3664	vm_page_spin_lock(m);
	3665
	3666	/*
	3667	* If PG_M was found to be set while we were clearing PG_RW
	3668	* we also clear PG_M (done above) and mark the page dirty.
	3669	* Callers expect this behavior.
	3670	*/
	3671	if (pbits & PG_M)
	3672	vm_page_dirty(m);
	3673	pv_put(pv);
	3674	}
	3675	vm_page_spin_unlock(m);
	3676	pmap_inval_done(&info);
	3677	}
	3678
	3679	/*
	3680	* Lower the permission for all mappings to a given page.
	3681	*
	3682	* Page must be busied by caller.
	3683	*/
	3684	void
	3685	pmap_page_protect(vm_page_t m, vm_prot_t prot)
	3686	{
	3687	/* JG NX support? */
	3688	if ((prot & VM_PROT_WRITE) == 0) {
	3689	if (prot & (VM_PROT_READ \| VM_PROT_EXECUTE)) {
	3690	/*
	3691	* NOTE: pmap_clearbit(.. PG_RW) also clears
	3692	* the PG_WRITEABLE flag in (m).
	3693	*/
	3694	pmap_clearbit(m, PG_RW);
	3695	} else {
	3696	pmap_remove_all(m);
	3697	}
	3698	}
	3699	}
	3700
	3701	vm_paddr_t
	3702	pmap_phys_address(vm_pindex_t ppn)
	3703	{
	3704	return (x86_64_ptob(ppn));
	3705	}
	3706
	3707	/*
	3708	* Return a count of reference bits for a page, clearing those bits.
	3709	* It is not necessary for every reference bit to be cleared, but it
	3710	* is necessary that 0 only be returned when there are truly no
	3711	* reference bits set.
	3712	*
	3713	* XXX: The exact number of bits to check and clear is a matter that
	3714	* should be tested and standardized at some point in the future for
	3715	* optimal aging of shared pages.
	3716	*
	3717	* This routine may not block.
	3718	*/
	3719	int
	3720	pmap_ts_referenced(vm_page_t m)
	3721	{
	3722	pv_entry_t pv;
	3723	pt_entry_t *pte;
	3724	int rtval = 0;
	3725
	3726	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	3727	return (rtval);
	3728
	3729	vm_page_spin_lock(m);
	3730	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3731	if (!pmap_track_modified(pv->pv_pindex))
	3732	continue;
	3733	pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
	3734	if (pte && (*pte & PG_A)) {
	3735	#ifdef SMP
	3736	atomic_clear_long(pte, PG_A);
	3737	#else
	3738	atomic_clear_long_nonlocked(pte, PG_A);
	3739	#endif
	3740	rtval++;
	3741	if (rtval > 4)
	3742	break;
	3743	}
	3744	}
	3745	vm_page_spin_unlock(m);
	3746	return (rtval);
	3747	}
	3748
	3749	/*
	3750	* pmap_is_modified:
	3751	*
	3752	* Return whether or not the specified physical page was modified
	3753	* in any physical maps.
	3754	*/
	3755	boolean_t
	3756	pmap_is_modified(vm_page_t m)
	3757	{
	3758	boolean_t res;
	3759
	3760	res = pmap_testbit(m, PG_M);
	3761	return (res);
	3762	}
	3763
	3764	/*
	3765	* Clear the modify bits on the specified physical page.
	3766	*/
	3767	void
	3768	pmap_clear_modify(vm_page_t m)
	3769	{
	3770	pmap_clearbit(m, PG_M);
	3771	}
	3772
	3773	/*
	3774	* pmap_clear_reference:
	3775	*
	3776	* Clear the reference bit on the specified physical page.
	3777	*/
	3778	void
	3779	pmap_clear_reference(vm_page_t m)
	3780	{
	3781	pmap_clearbit(m, PG_A);
	3782	}
	3783
	3784	/*
	3785	* Miscellaneous support routines follow
	3786	*/
	3787
	3788	static
	3789	void
	3790	i386_protection_init(void)
	3791	{
	3792	int *kp, prot;
	3793
	3794	/* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit */
	3795	kp = protection_codes;
	3796	for (prot = 0; prot < 8; prot++) {
	3797	switch (prot) {
	3798	case VM_PROT_NONE \| VM_PROT_NONE \| VM_PROT_NONE:
	3799	/*
	3800	* Read access is also 0. There isn't any execute bit,
	3801	* so just make it readable.
	3802	*/
	3803	case VM_PROT_READ \| VM_PROT_NONE \| VM_PROT_NONE:
	3804	case VM_PROT_READ \| VM_PROT_NONE \| VM_PROT_EXECUTE:
	3805	case VM_PROT_NONE \| VM_PROT_NONE \| VM_PROT_EXECUTE:
	3806	*kp++ = 0;
	3807	break;
	3808	case VM_PROT_NONE \| VM_PROT_WRITE \| VM_PROT_NONE:
	3809	case VM_PROT_NONE \| VM_PROT_WRITE \| VM_PROT_EXECUTE:
	3810	case VM_PROT_READ \| VM_PROT_WRITE \| VM_PROT_NONE:
	3811	case VM_PROT_READ \| VM_PROT_WRITE \| VM_PROT_EXECUTE:
	3812	*kp++ = PG_RW;
	3813	break;
	3814	}
	3815	}
	3816	}
	3817
	3818	/*
	3819	* Map a set of physical memory pages into the kernel virtual
	3820	* address space. Return a pointer to where it is mapped. This
	3821	* routine is intended to be used for mapping device memory,
	3822	* NOT real memory.
	3823	*
	3824	* NOTE: we can't use pgeflag unless we invalidate the pages one at
	3825	* a time.
	3826	*/
	3827	void *
	3828	pmap_mapdev(vm_paddr_t pa, vm_size_t size)
	3829	{
	3830	vm_offset_t va, tmpva, offset;
	3831	pt_entry_t *pte;
	3832
	3833	offset = pa & PAGE_MASK;
	3834	size = roundup(offset + size, PAGE_SIZE);
	3835
	3836	va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE);
	3837	if (va == 0)
	3838	panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
	3839
	3840	pa = pa & ~PAGE_MASK;
	3841	for (tmpva = va; size > 0;) {
	3842	pte = vtopte(tmpva);
	3843	pte = pa \| PG_RW \| PG_V; / \| pgeflag; */
	3844	size -= PAGE_SIZE;
	3845	tmpva += PAGE_SIZE;
	3846	pa += PAGE_SIZE;
	3847	}
	3848	cpu_invltlb();
	3849	smp_invltlb();
	3850
	3851	return ((void *)(va + offset));
	3852	}
	3853
	3854	void *
	3855	pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size)
	3856	{
	3857	vm_offset_t va, tmpva, offset;
	3858	pt_entry_t *pte;
	3859
	3860	offset = pa & PAGE_MASK;
	3861	size = roundup(offset + size, PAGE_SIZE);
	3862
	3863	va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE);
	3864	if (va == 0)
	3865	panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
	3866
	3867	pa = pa & ~PAGE_MASK;
	3868	for (tmpva = va; size > 0;) {
	3869	pte = vtopte(tmpva);
	3870	pte = pa \| PG_RW \| PG_V \| PG_N; / \| pgeflag; */
	3871	size -= PAGE_SIZE;
	3872	tmpva += PAGE_SIZE;
	3873	pa += PAGE_SIZE;
	3874	}
	3875	cpu_invltlb();
	3876	smp_invltlb();
	3877
	3878	return ((void *)(va + offset));
	3879	}
	3880
	3881	void
	3882	pmap_unmapdev(vm_offset_t va, vm_size_t size)
	3883	{
	3884	vm_offset_t base, offset;
	3885
	3886	base = va & ~PAGE_MASK;
	3887	offset = va & PAGE_MASK;
	3888	size = roundup(offset + size, PAGE_SIZE);
	3889	pmap_qremove(va, size >> PAGE_SHIFT);
	3890	kmem_free(&kernel_map, base, size);
	3891	}
	3892
	3893	/*
	3894	* perform the pmap work for mincore
	3895	*/
	3896	int
	3897	pmap_mincore(pmap_t pmap, vm_offset_t addr)
	3898	{
	3899	pt_entry_t *ptep, pte;
	3900	vm_page_t m;
	3901	int val = 0;
	3902
	3903	lwkt_gettoken(&pmap->pm_token);
	3904	ptep = pmap_pte(pmap, addr);
	3905
	3906	if (ptep && (pte = *ptep) != 0) {
	3907	vm_offset_t pa;
	3908
	3909	val = MINCORE_INCORE;
	3910	if ((pte & PG_MANAGED) == 0)
	3911	goto done;
	3912
	3913	pa = pte & PG_FRAME;
	3914
	3915	m = PHYS_TO_VM_PAGE(pa);
	3916
	3917	/*
	3918	* Modified by us
	3919	*/
	3920	if (pte & PG_M)
	3921	val \|= MINCORE_MODIFIED\|MINCORE_MODIFIED_OTHER;
	3922	/*
	3923	* Modified by someone
	3924	*/
	3925	else if (m->dirty \|\| pmap_is_modified(m))
	3926	val \|= MINCORE_MODIFIED_OTHER;
	3927	/*
	3928	* Referenced by us
	3929	*/
	3930	if (pte & PG_A)
	3931	val \|= MINCORE_REFERENCED\|MINCORE_REFERENCED_OTHER;
	3932
	3933	/*
	3934	* Referenced by someone
	3935	*/
	3936	else if ((m->flags & PG_REFERENCED) \|\| pmap_ts_referenced(m)) {
	3937	val \|= MINCORE_REFERENCED_OTHER;
	3938	vm_page_flag_set(m, PG_REFERENCED);
	3939	}
	3940	}
	3941	done:
	3942	lwkt_reltoken(&pmap->pm_token);
	3943
	3944	return val;
	3945	}
	3946
	3947	/*
	3948	* Replace p->p_vmspace with a new one. If adjrefs is non-zero the new
	3949	* vmspace will be ref'd and the old one will be deref'd.
	3950	*
	3951	* The vmspace for all lwps associated with the process will be adjusted
	3952	* and cr3 will be reloaded if any lwp is the current lwp.
	3953	*
	3954	* The process must hold the vmspace->vm_map.token for oldvm and newvm
	3955	*/
	3956	void
	3957	pmap_replacevm(struct proc p, struct vmspace newvm, int adjrefs)
	3958	{
	3959	struct vmspace *oldvm;
	3960	struct lwp *lp;
	3961
	3962	oldvm = p->p_vmspace;
	3963	if (oldvm != newvm) {
	3964	if (adjrefs)
	3965	sysref_get(&newvm->vm_sysref);
	3966	p->p_vmspace = newvm;
	3967	KKASSERT(p->p_nthreads == 1);
	3968	lp = RB_ROOT(&p->p_lwp_tree);
	3969	pmap_setlwpvm(lp, newvm);
	3970	if (adjrefs)
	3971	sysref_put(&oldvm->vm_sysref);
	3972	}
	3973	}
	3974
	3975	/*
	3976	* Set the vmspace for a LWP. The vmspace is almost universally set the
	3977	* same as the process vmspace, but virtual kernels need to swap out contexts
	3978	* on a per-lwp basis.
	3979	*
	3980	* Caller does not necessarily hold any vmspace tokens. Caller must control
	3981	* the lwp (typically be in the context of the lwp). We use a critical
	3982	* section to protect against statclock and hardclock (statistics collection).
	3983	*/
	3984	void
	3985	pmap_setlwpvm(struct lwp lp, struct vmspace newvm)
	3986	{
	3987	struct vmspace *oldvm;
	3988	struct pmap *pmap;
	3989
	3990	oldvm = lp->lwp_vmspace;
	3991
	3992	if (oldvm != newvm) {
	3993	crit_enter();
	3994	lp->lwp_vmspace = newvm;
	3995	if (curthread->td_lwp == lp) {
	3996	pmap = vmspace_pmap(newvm);
	3997	#if defined(SMP)
	3998	atomic_set_cpumask(&pmap->pm_active, mycpu->gd_cpumask);
	3999	if (pmap->pm_active & CPUMASK_LOCK)
	4000	pmap_interlock_wait(newvm);
	4001	#else
	4002	pmap->pm_active \|= 1;
	4003	#endif
	4004	#if defined(SWTCH_OPTIM_STATS)
	4005	tlb_flush_count++;
	4006	#endif
	4007	curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
	4008	curthread->td_pcb->pcb_cr3 \|= PG_RW \| PG_U \| PG_V;
	4009	load_cr3(curthread->td_pcb->pcb_cr3);
	4010	pmap = vmspace_pmap(oldvm);
	4011	#if defined(SMP)
	4012	atomic_clear_cpumask(&pmap->pm_active, mycpu->gd_cpumask);
	4013	#else
	4014	pmap->pm_active &= ~(cpumask_t)1;
	4015	#endif
	4016	}
	4017	crit_exit();
	4018	}
	4019	}
	4020
	4021	#ifdef SMP
	4022
	4023	/*
	4024	* Called when switching to a locked pmap, used to interlock against pmaps
	4025	* undergoing modifications to prevent us from activating the MMU for the
	4026	* target pmap until all such modifications have completed. We have to do
	4027	* this because the thread making the modifications has already set up its
	4028	* SMP synchronization mask.
	4029	*
	4030	* This function cannot sleep!
	4031	*
	4032	* No requirements.
	4033	*/
	4034	void
	4035	pmap_interlock_wait(struct vmspace *vm)
	4036	{
	4037	struct pmap *pmap = &vm->vm_pmap;
	4038
	4039	if (pmap->pm_active & CPUMASK_LOCK) {
	4040	crit_enter();
	4041	DEBUG_PUSH_INFO("pmap_interlock_wait");
	4042	while (pmap->pm_active & CPUMASK_LOCK) {
	4043	cpu_ccfence();
	4044	lwkt_process_ipiq();
	4045	}
	4046	DEBUG_POP_INFO();
	4047	crit_exit();
	4048	}
	4049	}
	4050
	4051	#endif
	4052
	4053	vm_offset_t
	4054	pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
	4055	{
	4056
	4057	if ((obj == NULL) \|\| (size < NBPDR) \|\| (obj->type != OBJT_DEVICE)) {
	4058	return addr;
	4059	}
	4060
	4061	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	4062	return addr;
	4063	}
	4064
	4065	/*
	4066	* Used by kmalloc/kfree, page already exists at va
	4067	*/
	4068	vm_page_t
	4069	pmap_kvtom(vm_offset_t va)
	4070	{
	4071	return(PHYS_TO_VM_PAGE(*vtopte(va) & PG_FRAME));
	4072	}