gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1991 Regents of the University of California.
	3	* Copyright (c) 1994 John S. Dyson
	4	* Copyright (c) 1994 David Greenman
	5	* Copyright (c) 2003 Peter Wemm
	6	* Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
	7	* Copyright (c) 2008, 2009 The DragonFly Project.
	8	* Copyright (c) 2008, 2009 Jordan Gordeev.
	9	* Copyright (c) 2011 Matthew Dillon
	10	* All rights reserved.
	11	*
	12	* This code is derived from software contributed to Berkeley by
	13	* the Systems Programming Group of the University of Utah Computer
	14	* Science Department and William Jolitz of UUNET Technologies Inc.
	15	*
	16	* Redistribution and use in source and binary forms, with or without
	17	* modification, are permitted provided that the following conditions
	18	* are met:
	19	* 1. Redistributions of source code must retain the above copyright
	20	* notice, this list of conditions and the following disclaimer.
	21	* 2. Redistributions in binary form must reproduce the above copyright
	22	* notice, this list of conditions and the following disclaimer in the
	23	* documentation and/or other materials provided with the distribution.
	24	* 3. All advertising materials mentioning features or use of this software
	25	* must display the following acknowledgement:
	26	* This product includes software developed by the University of
	27	* California, Berkeley and its contributors.
	28	* 4. Neither the name of the University nor the names of its contributors
	29	* may be used to endorse or promote products derived from this software
	30	* without specific prior written permission.
	31	*
	32	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	33	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	34	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	35	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	36	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	37	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	38	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	39	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	40	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	41	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	42	* SUCH DAMAGE.
	43	*/
	44	/*
	45	* Manage physical address maps for x86-64 systems.
	46	*/
	47
	48	#if JG
	49	#include "opt_disable_pse.h"
	50	#include "opt_pmap.h"
	51	#endif
	52	#include "opt_msgbuf.h"
	53
	54	#include <sys/param.h>
	55	#include <sys/systm.h>
	56	#include <sys/kernel.h>
	57	#include <sys/proc.h>
	58	#include <sys/msgbuf.h>
	59	#include <sys/vmmeter.h>
	60	#include <sys/mman.h>
	61
	62	#include <vm/vm.h>
	63	#include <vm/vm_param.h>
	64	#include <sys/sysctl.h>
	65	#include <sys/lock.h>
	66	#include <vm/vm_kern.h>
	67	#include <vm/vm_page.h>
	68	#include <vm/vm_map.h>
	69	#include <vm/vm_object.h>
	70	#include <vm/vm_extern.h>
	71	#include <vm/vm_pageout.h>
	72	#include <vm/vm_pager.h>
	73	#include <vm/vm_zone.h>
	74
	75	#include <sys/user.h>
	76	#include <sys/thread2.h>
	77	#include <sys/sysref2.h>
	78	#include <sys/spinlock2.h>
	79	#include <vm/vm_page2.h>
	80
	81	#include <machine/cputypes.h>
	82	#include <machine/md_var.h>
	83	#include <machine/specialreg.h>
	84	#include <machine/smp.h>
	85	#include <machine_base/apic/apicreg.h>
	86	#include <machine/globaldata.h>
	87	#include <machine/pmap.h>
	88	#include <machine/pmap_inval.h>
	89	#include <machine/inttypes.h>
	90
	91	#include <ddb/ddb.h>
	92
	#define PMAP_KEEP_PDIRS

	/* Default for the vm.pmap.shpgperproc tunable read by pmap_init2(). */
	#ifndef PMAP_SHPGPERPROC
	#define PMAP_SHPGPERPROC 2000
	#endif

	#ifdef DIAGNOSTIC
	#define PMAP_DIAGNOSTIC
	#endif

	/* Lower bound on the number of pv entries preallocated by pmap_init(). */
	#define MINPV 2048

	/*
	 * pv_get(), pv_lock(), pv_hold_try() and pv_alloc() are macros wrapping
	 * the underscore-prefixed implementations.
	 *
	 * With PMAP_DEBUG defined each wrapper appends __func__ and __LINE__ to
	 * the call so the pmap code can report who owns a pv lock when blocking:
	 *
	 *	PMAP_DEBUG_DECL - extra parameters for a function declaration
	 *	PMAP_DEBUG_ARGS - extra arguments supplied at the call site
	 *	PMAP_DEBUG_COPY - passes the extra parameters on to a nested call
	 *
	 * Without PMAP_DEBUG all three expand to nothing and the wrappers are
	 * plain calls.
	 */
	#ifdef PMAP_DEBUG

	#define PMAP_DEBUG_DECL		,const char *func, int lineno
	#define PMAP_DEBUG_ARGS		, __func__, __LINE__
	#define PMAP_DEBUG_COPY		, func, lineno

	#define pv_get(pmap, pindex)	_pv_get(pmap, pindex PMAP_DEBUG_ARGS)
	#define pv_lock(pv)		_pv_lock(pv PMAP_DEBUG_ARGS)
	#define pv_hold_try(pv)		_pv_hold_try(pv PMAP_DEBUG_ARGS)
	#define pv_alloc(pmap, pindex, isnewp)	\
		_pv_alloc(pmap, pindex, isnewp PMAP_DEBUG_ARGS)

	#else

	#define PMAP_DEBUG_DECL
	#define PMAP_DEBUG_ARGS
	#define PMAP_DEBUG_COPY

	#define pv_get(pmap, pindex)	_pv_get(pmap, pindex)
	#define pv_lock(pv)		_pv_lock(pv)
	#define pv_hold_try(pv)		_pv_hold_try(pv)
	#define pv_alloc(pmap, pindex, isnewp)	\
		_pv_alloc(pmap, pindex, isnewp)

	#endif
	134
	/*
	 * Get PDEs and PTEs for user/kernel address space
	 */
	#define pdir_pde(m, v)	((m)[(vm_offset_t)(v) >> PDRSHIFT])

	/*
	 * Flag tests.  Each macro takes a POINTER to a page directory / page
	 * table entry, loads the entry and evaluates to non-zero when the
	 * named bit is set:
	 *
	 *	pmap_pde_v - PG_V in a page directory entry
	 *	pmap_pte_w - PG_W
	 *	pmap_pte_m - PG_M
	 *	pmap_pte_u - PG_A
	 *	pmap_pte_v - PG_V
	 */
	#define pmap_pde_v(pte)	((*(pd_entry_t *)(pte) & PG_V) != 0)
	#define pmap_pte_w(pte)	((*(pt_entry_t *)(pte) & PG_W) != 0)
	#define pmap_pte_m(pte)	((*(pt_entry_t *)(pte) & PG_M) != 0)
	#define pmap_pte_u(pte)	((*(pt_entry_t *)(pte) & PG_A) != 0)
	#define pmap_pte_v(pte)	((*(pt_entry_t *)(pte) & PG_V) != 0)

	/*
	 * Given a map and a machine independent protection code, convert to
	 * the machine dependent protection bits by indexing protection_codes[]
	 * with the READ/WRITE/EXECUTE bits of `p'.  The map argument `m' is
	 * not used by the expansion.
	 */
	#define pte_prot(m, p)		\
		(protection_codes[(p) & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
	static int protection_codes[8];
	153
	154	struct pmap kernel_pmap;
	155	static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
	156
	157	vm_paddr_t avail_start; /* PA of first available physical page */
	158	vm_paddr_t avail_end; /* PA of last available physical page */
	159	vm_offset_t virtual2_start; /* cutout free area prior to kernel start */
	160	vm_offset_t virtual2_end;
	161	vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */
	162	vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
	163	vm_offset_t KvaStart; /* VA start of KVA space */
	164	vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */
	165	vm_offset_t KvaSize; /* max size of kernel virtual address space */
	166	static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */
	167	static int pgeflag; /* PG_G or-in */
	168	static int pseflag; /* PG_PS or-in */
	169
	170	static int ndmpdp;
	171	static vm_paddr_t dmaplimit;
	172	static int nkpt;
	173	vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
	174
	175	static uint64_t KPTbase;
	176	static uint64_t KPTphys;
	177	static uint64_t KPDphys; /* phys addr of kernel level 2 */
	178	static uint64_t KPDbase; /* phys addr of kernel level 2 @ KERNBASE */
	179	uint64_t KPDPphys; /* phys addr of kernel level 3 */
	180	uint64_t KPML4phys; /* phys addr of kernel level 4 */
	181
	182	static uint64_t DMPDphys; /* phys addr of direct mapped level 2 */
	183	static uint64_t DMPDPphys; /* phys addr of direct mapped level 3 */
	184
	185	/*
	186	* Data for the pv entry allocation mechanism
	187	*/
	188	static vm_zone_t pvzone;
	189	static struct vm_zone pvzone_store;
	190	static struct vm_object pvzone_obj;
	191	static int pv_entry_max=0, pv_entry_high_water=0;
	192	static int pmap_pagedaemon_waken = 0;
	193	static struct pv_entry *pvinit;
	194
	195	/*
	196	* All those kernel PT submaps that BSD is so fond of
	197	*/
	198	pt_entry_t CMAP1 = 0, ptmmap;
	199	caddr_t CADDR1 = 0, ptvmmap = 0;
	200	static pt_entry_t *msgbufmap;
	201	struct msgbuf *msgbufp=0;
	202
	203	/*
	204	* Crashdump maps.
	205	*/
	206	static pt_entry_t *pt_crashdumpmap;
	207	static caddr_t crashdumpmap;
	208
	209	static int pmap_yield_count = 64;
	210	SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
	211	&pmap_yield_count, 0, "Yield during init_pt/release");
	212
	213	#define DISABLE_PSE
	214
	215	static void pv_hold(pv_entry_t pv);
	216	static int _pv_hold_try(pv_entry_t pv
	217	PMAP_DEBUG_DECL);
	218	static void pv_drop(pv_entry_t pv);
	219	static void _pv_lock(pv_entry_t pv
	220	PMAP_DEBUG_DECL);
	221	static void pv_unlock(pv_entry_t pv);
	222	static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
	223	PMAP_DEBUG_DECL);
	224	static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex
	225	PMAP_DEBUG_DECL);
	226	static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp);
	227	static pv_entry_t pv_find(pmap_t pmap, vm_pindex_t pindex);
	228	static void pv_put(pv_entry_t pv);
	229	static void pv_free(pv_entry_t pv);
	230	static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
	231	static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
	232	pv_entry_t *pvpp);
	233	static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
	234	struct pmap_inval_info *info);
	235	static vm_page_t pmap_remove_pv_page(pv_entry_t pv);
	236
	237	static void pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
	238	pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
	239	pt_entry_t ptep, void arg __unused);
	240	static void pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info,
	241	pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
	242	pt_entry_t ptep, void arg __unused);
	243
	244	static void i386_protection_init (void);
	245	static void create_pagetables(vm_paddr_t *firstaddr);
	246	static void pmap_remove_all (vm_page_t m);
	247	static boolean_t pmap_testbit (vm_page_t m, int bit);
	248
	249	static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
	250	static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
	251
	252	static unsigned pdir4mb;
	253
	254	static int
	255	pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
	256	{
	257	if (pv1->pv_pindex < pv2->pv_pindex)
	258	return(-1);
	259	if (pv1->pv_pindex > pv2->pv_pindex)
	260	return(1);
	261	return(0);
	262	}
	263
	264	RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
	265	pv_entry_compare, vm_pindex_t, pv_pindex);
	266
	267	/*
	268	* Move the kernel virtual free pointer to the next
	269	* 2MB. This is used to help improve performance
	270	* by using a large (2MB) page for much of the kernel
	271	* (.text, .data, .bss)
	272	*/
	273	static
	274	vm_offset_t
	275	pmap_kmem_choose(vm_offset_t addr)
	276	{
	277	vm_offset_t newaddr = addr;
	278
	279	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	280	return newaddr;
	281	}
	282
	283	/*
	284	* pmap_pte_quick:
	285	*
	286	* Super fast pmap_pte routine best used when scanning the pv lists.
	287	* This eliminates many course-grained invltlb calls. Note that many of
	288	* the pv list scans are across different pmaps and it is very wasteful
	289	* to do an entire invltlb when checking a single mapping.
	290	*/
	291	static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);
	292
	293	static
	294	pt_entry_t *
	295	pmap_pte_quick(pmap_t pmap, vm_offset_t va)
	296	{
	297	return pmap_pte(pmap, va);
	298	}
	299
	300	/*
	301	* Returns the pindex of a page table entry (representing a terminal page).
	302	* There are NUPTE_TOTAL page table entries possible (a huge number)
	303	*
	304	* x86-64 has a 48-bit address space, where bit 47 is sign-extended out.
	305	* We want to properly translate negative KVAs.
	306	*/
	307	static __inline
	308	vm_pindex_t
	309	pmap_pte_pindex(vm_offset_t va)
	310	{
	311	return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1));
	312	}
	313
	314	/*
	315	* Returns the pindex of a page table.
	316	*/
	317	static __inline
	318	vm_pindex_t
	319	pmap_pt_pindex(vm_offset_t va)
	320	{
	321	return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1)));
	322	}
	323
	324	/*
	325	* Returns the pindex of a page directory.
	326	*/
	327	static __inline
	328	vm_pindex_t
	329	pmap_pd_pindex(vm_offset_t va)
	330	{
	331	return (NUPTE_TOTAL + NUPT_TOTAL +
	332	((va >> PDPSHIFT) & (NUPD_TOTAL - 1)));
	333	}
	334
	335	static __inline
	336	vm_pindex_t
	337	pmap_pdp_pindex(vm_offset_t va)
	338	{
	339	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
	340	((va >> PML4SHIFT) & (NUPDP_TOTAL - 1)));
	341	}
	342
	343	static __inline
	344	vm_pindex_t
	345	pmap_pml4_pindex(void)
	346	{
	347	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL);
	348	}
	349
	350	/*
	351	* Return various clipped indexes for a given VA
	352	*
	353	* Returns the index of a pte in a page table, representing a terminal
	354	* page.
	355	*/
	356	static __inline
	357	vm_pindex_t
	358	pmap_pte_index(vm_offset_t va)
	359	{
	360	return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1));
	361	}
	362
	363	/*
	364	* Returns the index of a pt in a page directory, representing a page
	365	* table.
	366	*/
	367	static __inline
	368	vm_pindex_t
	369	pmap_pt_index(vm_offset_t va)
	370	{
	371	return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
	372	}
	373
	374	/*
	375	* Returns the index of a pd in a page directory page, representing a page
	376	* directory.
	377	*/
	378	static __inline
	379	vm_pindex_t
	380	pmap_pd_index(vm_offset_t va)
	381	{
	382	return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1));
	383	}
	384
	385	/*
	386	* Returns the index of a pdp in the pml4 table, representing a page
	387	* directory page.
	388	*/
	389	static __inline
	390	vm_pindex_t
	391	pmap_pdp_index(vm_offset_t va)
	392	{
	393	return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1));
	394	}
	395
	396	/*
	397	* Generic procedure to index a pte from a pt, pd, or pdp.
	398	*/
	399	static
	400	void *
	401	pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex)
	402	{
	403	pt_entry_t *pte;
	404
	405	pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m));
	406	return(&pte[pindex]);
	407	}
	408
	409	/*
	410	* Return pointer to PDP slot in the PML4
	411	*/
	412	static __inline
	413	pml4_entry_t *
	414	pmap_pdp(pmap_t pmap, vm_offset_t va)
	415	{
	416	return (&pmap->pm_pml4[pmap_pdp_index(va)]);
	417	}
	418
	419	/*
	420	* Return pointer to PD slot in the PDP given a pointer to the PDP
	421	*/
	422	static __inline
	423	pdp_entry_t *
	424	pmap_pdp_to_pd(pml4_entry_t *pdp, vm_offset_t va)
	425	{
	426	pdp_entry_t *pd;
	427
	428	pd = (pdp_entry_t )PHYS_TO_DMAP(pdp & PG_FRAME);
	429	return (&pd[pmap_pd_index(va)]);
	430	}
	431
	432	/*
	433	* Return pointer to PD slot in the PDP
	434	**/
	435	static __inline
	436	pdp_entry_t *
	437	pmap_pd(pmap_t pmap, vm_offset_t va)
	438	{
	439	pml4_entry_t *pdp;
	440
	441	pdp = pmap_pdp(pmap, va);
	442	if ((*pdp & PG_V) == 0)
	443	return NULL;
	444	return (pmap_pdp_to_pd(pdp, va));
	445	}
	446
	447	/*
	448	* Return pointer to PT slot in the PD given a pointer to the PD
	449	*/
	450	static __inline
	451	pd_entry_t *
	452	pmap_pd_to_pt(pdp_entry_t *pd, vm_offset_t va)
	453	{
	454	pd_entry_t *pt;
	455
	456	pt = (pd_entry_t )PHYS_TO_DMAP(pd & PG_FRAME);
	457	return (&pt[pmap_pt_index(va)]);
	458	}
	459
	460	/*
	461	* Return pointer to PT slot in the PD
	462	*/
	463	static __inline
	464	pd_entry_t *
	465	pmap_pt(pmap_t pmap, vm_offset_t va)
	466	{
	467	pdp_entry_t *pd;
	468
	469	pd = pmap_pd(pmap, va);
	470	if (pd == NULL \|\| (*pd & PG_V) == 0)
	471	return NULL;
	472	return (pmap_pd_to_pt(pd, va));
	473	}
	474
	475	/*
	476	* Return pointer to PTE slot in the PT given a pointer to the PT
	477	*/
	478	static __inline
	479	pt_entry_t *
	480	pmap_pt_to_pte(pd_entry_t *pt, vm_offset_t va)
	481	{
	482	pt_entry_t *pte;
	483
	484	pte = (pt_entry_t )PHYS_TO_DMAP(pt & PG_FRAME);
	485	return (&pte[pmap_pte_index(va)]);
	486	}
	487
	488	/*
	489	* Return pointer to PTE slot in the PT
	490	*/
	491	static __inline
	492	pt_entry_t *
	493	pmap_pte(pmap_t pmap, vm_offset_t va)
	494	{
	495	pd_entry_t *pt;
	496
	497	pt = pmap_pt(pmap, va);
	498	if (pt == NULL \|\| (*pt & PG_V) == 0)
	499	return NULL;
	500	if ((*pt & PG_PS) != 0)
	501	return ((pt_entry_t *)pt);
	502	return (pmap_pt_to_pte(pt, va));
	503	}
	504
	505	/*
	506	* Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is
	507	* the PT layer. This will speed up core pmap operations considerably.
	508	*/
	509	static __inline
	510	void
	511	pv_cache(pv_entry_t pv, vm_pindex_t pindex)
	512	{
	513	if (pindex >= pmap_pt_pindex(0) && pindex <= pmap_pd_pindex(0))
	514	pv->pv_pmap->pm_pvhint = pv;
	515	}
	516
	517
	518	/*
	519	* KVM - return address of PT slot in PD
	520	*/
	521	static __inline
	522	pd_entry_t *
	523	vtopt(vm_offset_t va)
	524	{
	525	uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
	526	NPML4EPGSHIFT)) - 1);
	527
	528	return (PDmap + ((va >> PDRSHIFT) & mask));
	529	}
	530
	531	/*
	532	* KVM - return address of PTE slot in PT
	533	*/
	534	static __inline
	535	pt_entry_t *
	536	vtopte(vm_offset_t va)
	537	{
	538	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
	539	NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
	540
	541	return (PTmap + ((va >> PAGE_SHIFT) & mask));
	542	}
	543
	544	static uint64_t
	545	allocpages(vm_paddr_t *firstaddr, long n)
	546	{
	547	uint64_t ret;
	548
	549	ret = *firstaddr;
	550	bzero((void )ret, n PAGE_SIZE);
	551	firstaddr += n PAGE_SIZE;
	552	return (ret);
	553	}
	554
	555	static
	556	void
	557	create_pagetables(vm_paddr_t *firstaddr)
	558	{
	559	long i; /* must be 64 bits */
	560	long nkpt_base;
	561	long nkpt_phys;
	562	int j;
	563
	564	/*
	565	* We are running (mostly) V=P at this point
	566	*
	567	* Calculate NKPT - number of kernel page tables. We have to
	568	* accomodoate prealloction of the vm_page_array, dump bitmap,
	569	* MSGBUF_SIZE, and other stuff. Be generous.
	570	*
	571	* Maxmem is in pages.
	572	*
	573	* ndmpdp is the number of 1GB pages we wish to map.
	574	*/
	575	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	576	if (ndmpdp < 4) /* Minimum 4GB of dirmap */
	577	ndmpdp = 4;
	578	KKASSERT(ndmpdp <= NKPDPE * NPDEPG);
	579
	580	/*
	581	* Starting at the beginning of kvm (not KERNBASE).
	582	*/
	583	nkpt_phys = (Maxmem * sizeof(struct vm_page) + NBPDR - 1) / NBPDR;
	584	nkpt_phys += (Maxmem * sizeof(struct pv_entry) + NBPDR - 1) / NBPDR;
	585	nkpt_phys += ((nkpt + nkpt + 1 + NKPML4E + NKPDPE + NDMPML4E +
	586	ndmpdp) + 511) / 512;
	587	nkpt_phys += 128;
	588
	589	/*
	590	* Starting at KERNBASE - map 2G worth of page table pages.
	591	* KERNBASE is offset -2G from the end of kvm.
	592	*/
	593	nkpt_base = (NPDPEPG - KPDPI) * NPTEPG; /* typically 2 x 512 */
	594
	595	/*
	596	* Allocate pages
	597	*/
	598	KPTbase = allocpages(firstaddr, nkpt_base);
	599	KPTphys = allocpages(firstaddr, nkpt_phys);
	600	KPML4phys = allocpages(firstaddr, 1);
	601	KPDPphys = allocpages(firstaddr, NKPML4E);
	602	KPDphys = allocpages(firstaddr, NKPDPE);
	603
	604	/*
	605	* Calculate the page directory base for KERNBASE,
	606	* that is where we start populating the page table pages.
	607	* Basically this is the end - 2.
	608	*/
	609	KPDbase = KPDphys + ((NKPDPE - (NPDPEPG - KPDPI)) << PAGE_SHIFT);
	610
	611	DMPDPphys = allocpages(firstaddr, NDMPML4E);
	612	if ((amd_feature & AMDID_PAGE1GB) == 0)
	613	DMPDphys = allocpages(firstaddr, ndmpdp);
	614	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
	615
	616	/*
	617	* Fill in the underlying page table pages for the area around
	618	* KERNBASE. This remaps low physical memory to KERNBASE.
	619	*
	620	* Read-only from zero to physfree
	621	* XXX not fully used, underneath 2M pages
	622	*/
	623	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
	624	((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
	625	((pt_entry_t *)KPTbase)[i] \|= PG_RW \| PG_V \| PG_G;
	626	}
	627
	628	/*
	629	* Now map the initial kernel page tables. One block of page
	630	* tables is placed at the beginning of kernel virtual memory,
	631	* and another block is placed at KERNBASE to map the kernel binary,
	632	* data, bss, and initial pre-allocations.
	633	*/
	634	for (i = 0; i < nkpt_base; i++) {
	635	((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
	636	((pd_entry_t *)KPDbase)[i] \|= PG_RW \| PG_V;
	637	}
	638	for (i = 0; i < nkpt_phys; i++) {
	639	((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
	640	((pd_entry_t *)KPDphys)[i] \|= PG_RW \| PG_V;
	641	}
	642
	643	/*
	644	* Map from zero to end of allocations using 2M pages as an
	645	* optimization. This will bypass some of the KPTBase pages
	646	* above in the KERNBASE area.
	647	*/
	648	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
	649	((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
	650	((pd_entry_t *)KPDbase)[i] \|= PG_RW \| PG_V \| PG_PS \| PG_G;
	651	}
	652
	653	/*
	654	* And connect up the PD to the PDP. The kernel pmap is expected
	655	* to pre-populate all of its PDs. See NKPDPE in vmparam.h.
	656	*/
	657	for (i = 0; i < NKPDPE; i++) {
	658	((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] =
	659	KPDphys + (i << PAGE_SHIFT);
	660	((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] \|=
	661	PG_RW \| PG_V \| PG_U;
	662	}
	663
	664	/*
	665	* Now set up the direct map space using either 2MB or 1GB pages
	666	* Preset PG_M and PG_A because demotion expects it.
	667	*
	668	* When filling in entries in the PD pages make sure any excess
	669	* entries are set to zero as we allocated enough PD pages
	670	*/
	671	if ((amd_feature & AMDID_PAGE1GB) == 0) {
	672	for (i = 0; i < NPDEPG * ndmpdp; i++) {
	673	((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
	674	((pd_entry_t *)DMPDphys)[i] \|= PG_RW \| PG_V \| PG_PS \|
	675	PG_G \| PG_M \| PG_A;
	676	}
	677
	678	/*
	679	* And the direct map space's PDP
	680	*/
	681	for (i = 0; i < ndmpdp; i++) {
	682	((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
	683	(i << PAGE_SHIFT);
	684	((pdp_entry_t *)DMPDPphys)[i] \|= PG_RW \| PG_V \| PG_U;
	685	}
	686	} else {
	687	for (i = 0; i < ndmpdp; i++) {
	688	((pdp_entry_t *)DMPDPphys)[i] =
	689	(vm_paddr_t)i << PDPSHIFT;
	690	((pdp_entry_t *)DMPDPphys)[i] \|= PG_RW \| PG_V \| PG_PS \|
	691	PG_G \| PG_M \| PG_A;
	692	}
	693	}
	694
	695	/* And recursively map PML4 to itself in order to get PTmap */
	696	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
	697	((pdp_entry_t *)KPML4phys)[PML4PML4I] \|= PG_RW \| PG_V \| PG_U;
	698
	699	/*
	700	* Connect the Direct Map slots up to the PML4
	701	*/
	702	for (j = 0; j < NDMPML4E; ++j) {
	703	((pdp_entry_t *)KPML4phys)[DMPML4I + j] =
	704	(DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) \|
	705	PG_RW \| PG_V \| PG_U;
	706	}
	707
	708	/*
	709	* Connect the KVA slot up to the PML4
	710	*/
	711	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
	712	((pdp_entry_t *)KPML4phys)[KPML4I] \|= PG_RW \| PG_V \| PG_U;
	713	}
	714
	715	/*
	716	* Bootstrap the system enough to run with virtual memory.
	717	*
	718	* On the i386 this is called after mapping has already been enabled
	719	* and just syncs the pmap module with what has already been done.
	720	* [We can't call it easily with mapping off since the kernel is not
	721	* mapped with PA == VA, hence we would have to relocate every address
	722	* from the linked base (virtual) address "KERNBASE" to the actual
	723	* (physical) address starting relative to 0]
	724	*/
	725	void
	726	pmap_bootstrap(vm_paddr_t *firstaddr)
	727	{
	728	vm_offset_t va;
	729	pt_entry_t *pte;
	730	struct mdglobaldata *gd;
	731	int pg;
	732
	733	KvaStart = VM_MIN_KERNEL_ADDRESS;
	734	KvaEnd = VM_MAX_KERNEL_ADDRESS;
	735	KvaSize = KvaEnd - KvaStart;
	736
	737	avail_start = *firstaddr;
	738
	739	/*
	740	* Create an initial set of page tables to run the kernel in.
	741	*/
	742	create_pagetables(firstaddr);
	743
	744	virtual2_start = KvaStart;
	745	virtual2_end = PTOV_OFFSET;
	746
	747	virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr;
	748	virtual_start = pmap_kmem_choose(virtual_start);
	749
	750	virtual_end = VM_MAX_KERNEL_ADDRESS;
	751
	752	/* XXX do %cr0 as well */
	753	load_cr4(rcr4() \| CR4_PGE \| CR4_PSE);
	754	load_cr3(KPML4phys);
	755
	756	/*
	757	* Initialize protection array.
	758	*/
	759	i386_protection_init();
	760
	761	/*
	762	* The kernel's pmap is statically allocated so we don't have to use
	763	* pmap_create, which is unlikely to work correctly at this part of
	764	* the boot sequence (XXX and which no longer exists).
	765	*/
	766	kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
	767	kernel_pmap.pm_count = 1;
	768	kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
	769	RB_INIT(&kernel_pmap.pm_pvroot);
	770	spin_init(&kernel_pmap.pm_spin);
	771	lwkt_token_init(&kernel_pmap.pm_token, "kpmap_tok");
	772
	773	/*
	774	* Reserve some special page table entries/VA space for temporary
	775	* mapping of pages.
	776	*/
	777	#define SYSMAP(c, p, v, n) \
	778	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
	779
	780	va = virtual_start;
	781	pte = vtopte(va);
	782
	783	/*
	784	* CMAP1/CMAP2 are used for zeroing and copying pages.
	785	*/
	786	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	787
	788	/*
	789	* Crashdump maps.
	790	*/
	791	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
	792
	793	/*
	794	* ptvmmap is used for reading arbitrary physical pages via
	795	* /dev/mem.
	796	*/
	797	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
	798
	799	/*
	800	* msgbufp is used to map the system message buffer.
	801	* XXX msgbufmap is not used.
	802	*/
	803	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
	804	atop(round_page(MSGBUF_SIZE)))
	805
	806	virtual_start = va;
	807
	808	*CMAP1 = 0;
	809
	810	/*
	811	* PG_G is terribly broken on SMP because we IPI invltlb's in some
	812	* cases rather then invl1pg. Actually, I don't even know why it
	813	* works under UP because self-referential page table mappings
	814	*/
	815	#ifdef SMP
	816	pgeflag = 0;
	817	#else
	818	if (cpu_feature & CPUID_PGE)
	819	pgeflag = PG_G;
	820	#endif
	821
	822	/*
	823	* Initialize the 4MB page size flag
	824	*/
	825	pseflag = 0;
	826	/*
	827	* The 4MB page version of the initial
	828	* kernel page mapping.
	829	*/
	830	pdir4mb = 0;
	831
	832	#if !defined(DISABLE_PSE)
	833	if (cpu_feature & CPUID_PSE) {
	834	pt_entry_t ptditmp;
	835	/*
	836	* Note that we have enabled PSE mode
	837	*/
	838	pseflag = PG_PS;
	839	ptditmp = *(PTmap + x86_64_btop(KERNBASE));
	840	ptditmp &= ~(NBPDR - 1);
	841	ptditmp \|= PG_V \| PG_RW \| PG_PS \| PG_U \| pgeflag;
	842	pdir4mb = ptditmp;
	843
	844	#ifndef SMP
	845	/*
	846	* Enable the PSE mode. If we are SMP we can't do this
	847	* now because the APs will not be able to use it when
	848	* they boot up.
	849	*/
	850	load_cr4(rcr4() \| CR4_PSE);
	851
	852	/*
	853	* We can do the mapping here for the single processor
	854	* case. We simply ignore the old page table page from
	855	* now on.
	856	*/
	857	/*
	858	* For SMP, we still need 4K pages to bootstrap APs,
	859	* PSE will be enabled as soon as all APs are up.
	860	*/
	861	PTD[KPTDI] = (pd_entry_t)ptditmp;
	862	cpu_invltlb();
	863	#endif
	864	}
	865	#endif
	866
	867	/*
	868	* We need to finish setting up the globaldata page for the BSP.
	869	* locore has already populated the page table for the mdglobaldata
	870	* portion.
	871	*/
	872	pg = MDGLOBALDATA_BASEALLOC_PAGES;
	873	gd = &CPU_prvspace[0].mdglobaldata;
	874
	875	cpu_invltlb();
	876	}
	877
	#ifdef SMP
	/*
	 * Set 4mb pdir for mp startup
	 *
	 * When pseflag is set and the CPU advertises CPUID_PSE, turn on
	 * CR4_PSE on the calling cpu.  If a large-page kernel mapping was
	 * prepared (pdir4mb != 0) the boot processor (gd_cpuid 0)
	 * additionally flushes its TLB.
	 */
	void
	pmap_set_opt(void)
	{
		if (pseflag && (cpu_feature & CPUID_PSE)) {
			load_cr4(rcr4() | CR4_PSE);
			if (pdir4mb && mycpu->gd_cpuid == 0) {	/* only on BSP */
				cpu_invltlb();
			}
		}
	}
	#endif
	893
	894	/*
	895	* Initialize the pmap module.
	896	* Called by vm_init, to initialize any structures that the pmap
	897	* system needs to map virtual memory.
	898	* pmap_init has been enhanced to support in a fairly consistant
	899	* way, discontiguous physical memory.
	900	*/
	901	void
	902	pmap_init(void)
	903	{
	904	int i;
	905	int initial_pvs;
	906
	907	/*
	908	* Allocate memory for random pmap data structures. Includes the
	909	* pv_head_table.
	910	*/
	911
	912	for (i = 0; i < vm_page_array_size; i++) {
	913	vm_page_t m;
	914
	915	m = &vm_page_array[i];
	916	TAILQ_INIT(&m->md.pv_list);
	917	}
	918
	919	/*
	920	* init the pv free list
	921	*/
	922	initial_pvs = vm_page_array_size;
	923	if (initial_pvs < MINPV)
	924	initial_pvs = MINPV;
	925	pvzone = &pvzone_store;
	926	pvinit = (void *)kmem_alloc(&kernel_map,
	927	initial_pvs * sizeof (struct pv_entry));
	928	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry),
	929	pvinit, initial_pvs);
	930
	931	/*
	932	* Now it is safe to enable pv_table recording.
	933	*/
	934	pmap_initialized = TRUE;
	935	}
	936
	937	/*
	938	* Initialize the address space (zone) for the pv_entries. Set a
	939	* high water mark so that the system can recover from excessive
	940	* numbers of pv entries.
	941	*/
	942	void
	943	pmap_init2(void)
	944	{
	945	int shpgperproc = PMAP_SHPGPERPROC;
	946	int entry_max;
	947
	948	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	949	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	950	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	951	pv_entry_high_water = 9 * (pv_entry_max / 10);
	952
	953	/*
	954	* Subtract out pages already installed in the zone (hack)
	955	*/
	956	entry_max = pv_entry_max - vm_page_array_size;
	957	if (entry_max <= 0)
	958	entry_max = 1;
	959
	960	zinitna(pvzone, &pvzone_obj, NULL, 0, entry_max, ZONE_INTERRUPT, 1);
	961	}
	962
	963
	964	/***************************************************
	965	* Low level helper routines.....
	966	***************************************************/
	967
	968	/*
	969	* this routine defines the region(s) of memory that should
	970	* not be tested for the modified bit.
	971	*/
	972	static __inline
	973	int
	974	pmap_track_modified(vm_pindex_t pindex)
	975	{
	976	vm_offset_t va = (vm_offset_t)pindex << PAGE_SHIFT;
	977	if ((va < clean_sva) \|\| (va >= clean_eva))
	978	return 1;
	979	else
	980	return 0;
	981	}
	982
	983	/*
	984	* Extract the physical page address associated with the map/VA pair.
	985	* The page must be wired for this to work reliably.
	986	*
	987	* XXX for the moment we're using pv_find() instead of pv_get(), as
	988	* callers might be expecting non-blocking operation.
	989	*/
	990	vm_paddr_t
	991	pmap_extract(pmap_t pmap, vm_offset_t va)
	992	{
	993	vm_paddr_t rtval;
	994	pv_entry_t pt_pv;
	995	pt_entry_t *ptep;
	996
	997	rtval = 0;
	998	if (va >= VM_MAX_USER_ADDRESS) {
	999	/*
	1000	* Kernel page directories might be direct-mapped and
	1001	* there is typically no PV tracking of pte's
	1002	*/
	1003	pd_entry_t *pt;
	1004
	1005	pt = pmap_pt(pmap, va);
	1006	if (pt && (*pt & PG_V)) {
	1007	if (*pt & PG_PS) {
	1008	rtval = *pt & PG_PS_FRAME;
	1009	rtval \|= va & PDRMASK;
	1010	} else {
	1011	ptep = pmap_pt_to_pte(pt, va);
	1012	if (*pt & PG_V) {
	1013	rtval = *ptep & PG_FRAME;
	1014	rtval \|= va & PAGE_MASK;
	1015	}
	1016	}
	1017	}
	1018	} else {
	1019	/*
	1020	* User pages currently do not direct-map the page directory
	1021	* and some pages might not used managed PVs. But all PT's
	1022	* will have a PV.
	1023	*/
	1024	pt_pv = pv_find(pmap, pmap_pt_pindex(va));
	1025	if (pt_pv) {
	1026	ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
	1027	if (*ptep & PG_V) {
	1028	rtval = *ptep & PG_FRAME;
	1029	rtval \|= va & PAGE_MASK;
	1030	}
	1031	pv_drop(pt_pv);
	1032	}
	1033	}
	1034	return rtval;
	1035	}
	1036
	1037	/*
	1038	* Extract the physical page address associated kernel virtual address.
	1039	*/
	1040	vm_paddr_t
	1041	pmap_kextract(vm_offset_t va)
	1042	{
	1043	pd_entry_t pt; /* pt entry in pd */
	1044	vm_paddr_t pa;
	1045
	1046	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
	1047	pa = DMAP_TO_PHYS(va);
	1048	} else {
	1049	pt = *vtopt(va);
	1050	if (pt & PG_PS) {
	1051	pa = (pt & PG_PS_FRAME) \| (va & PDRMASK);
	1052	} else {
	1053	/*
	1054	* Beware of a concurrent promotion that changes the
	1055	* PDE at this point! For example, vtopte() must not
	1056	* be used to access the PTE because it would use the
	1057	* new PDE. It is, however, safe to use the old PDE
	1058	* because the page table page is preserved by the
	1059	* promotion.
	1060	*/
	1061	pa = *pmap_pt_to_pte(&pt, va);
	1062	pa = (pa & PG_FRAME) \| (va & PAGE_MASK);
	1063	}
	1064	}
	1065	return pa;
	1066	}
	1067
	1068	/***************************************************
	1069	* Low level mapping routines.....
	1070	***************************************************/
	1071
	1072	/*
	1073	* Routine: pmap_kenter
	1074	* Function:
	1075	* Add a wired page to the KVA
	1076	* NOTE! note that in order for the mapping to take effect -- you
	1077	* should do an invltlb after doing the pmap_kenter().
	1078	*/
	1079	void
	1080	pmap_kenter(vm_offset_t va, vm_paddr_t pa)
	1081	{
	1082	pt_entry_t *pte;
	1083	pt_entry_t npte;
	1084	pmap_inval_info info;
	1085
	1086	pmap_inval_init(&info); /* XXX remove */
	1087	npte = pa \| PG_RW \| PG_V \| pgeflag;
	1088	pte = vtopte(va);
	1089	pmap_inval_interlock(&info, &kernel_pmap, va); /* XXX remove */
	1090	*pte = npte;
	1091	pmap_inval_deinterlock(&info, &kernel_pmap); /* XXX remove */
	1092	pmap_inval_done(&info); /* XXX remove */
	1093	}
	1094
	1095	/*
	1096	* Routine: pmap_kenter_quick
	1097	* Function:
	1098	* Similar to pmap_kenter(), except we only invalidate the
	1099	* mapping on the current CPU.
	1100	*/
	1101	void
	1102	pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
	1103	{
	1104	pt_entry_t *pte;
	1105	pt_entry_t npte;
	1106
	1107	npte = pa \| PG_RW \| PG_V \| pgeflag;
	1108	pte = vtopte(va);
	1109	*pte = npte;
	1110	cpu_invlpg((void *)va);
	1111	}
	1112
	1113	void
	1114	pmap_kenter_sync(vm_offset_t va)
	1115	{
	1116	pmap_inval_info info;
	1117
	1118	pmap_inval_init(&info);
	1119	pmap_inval_interlock(&info, &kernel_pmap, va);
	1120	pmap_inval_deinterlock(&info, &kernel_pmap);
	1121	pmap_inval_done(&info);
	1122	}
	1123
	1124	void
	1125	pmap_kenter_sync_quick(vm_offset_t va)
	1126	{
	1127	cpu_invlpg((void *)va);
	1128	}
	1129
	1130	/*
	1131	* remove a page from the kernel pagetables
	1132	*/
	1133	void
	1134	pmap_kremove(vm_offset_t va)
	1135	{
	1136	pt_entry_t *pte;
	1137	pmap_inval_info info;
	1138
	1139	pmap_inval_init(&info);
	1140	pte = vtopte(va);
	1141	pmap_inval_interlock(&info, &kernel_pmap, va);
	1142	(void)pte_load_clear(pte);
	1143	pmap_inval_deinterlock(&info, &kernel_pmap);
	1144	pmap_inval_done(&info);
	1145	}
	1146
	1147	void
	1148	pmap_kremove_quick(vm_offset_t va)
	1149	{
	1150	pt_entry_t *pte;
	1151	pte = vtopte(va);
	1152	(void)pte_load_clear(pte);
	1153	cpu_invlpg((void *)va);
	1154	}
	1155
	1156	/*
	1157	* XXX these need to be recoded. They are not used in any critical path.
	1158	*/
	1159	void
	1160	pmap_kmodify_rw(vm_offset_t va)
	1161	{
	1162	atomic_set_long(vtopte(va), PG_RW);
	1163	cpu_invlpg((void *)va);
	1164	}
	1165
	1166	void
	1167	pmap_kmodify_nc(vm_offset_t va)
	1168	{
	1169	atomic_set_long(vtopte(va), PG_N);
	1170	cpu_invlpg((void *)va);
	1171	}
	1172
	1173	/*
	1174	* Used to map a range of physical addresses into kernel virtual
	1175	* address space during the low level boot, typically to map the
	1176	* dump bitmap, message buffer, and vm_page_array.
	1177	*
	1178	* These mappings are typically made at some pointer after the end of the
	1179	* kernel text+data.
	1180	*
	1181	* We could return PHYS_TO_DMAP(start) here and not allocate any
	1182	* via (*virtp), but then kmem from userland and kernel dumps won't
	1183	* have access to the related pointers.
	1184	*/
	1185	vm_offset_t
	1186	pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot)
	1187	{
	1188	vm_offset_t va;
	1189	vm_offset_t va_start;
	1190
	1191	/return PHYS_TO_DMAP(start);/
	1192
	1193	va_start = *virtp;
	1194	va = va_start;
	1195
	1196	while (start < end) {
	1197	pmap_kenter_quick(va, start);
	1198	va += PAGE_SIZE;
	1199	start += PAGE_SIZE;
	1200	}
	1201	*virtp = va;
	1202	return va_start;
	1203	}
	1204
	1205
	1206	/*
	1207	* Add a list of wired pages to the kva
	1208	* this routine is only used for temporary
	1209	* kernel mappings that do not need to have
	1210	* page modification or references recorded.
	1211	* Note that old mappings are simply written
	1212	* over. The page must be wired.
	1213	*/
	1214	void
	1215	pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
	1216	{
	1217	vm_offset_t end_va;
	1218
	1219	end_va = va + count * PAGE_SIZE;
	1220
	1221	while (va < end_va) {
	1222	pt_entry_t *pte;
	1223
	1224	pte = vtopte(va);
	1225	pte = VM_PAGE_TO_PHYS(m) \| PG_RW \| PG_V \| pgeflag;
	1226	cpu_invlpg((void *)va);
	1227	va += PAGE_SIZE;
	1228	m++;
	1229	}
	1230	smp_invltlb();
	1231	}
	1232
	1233	/*
	1234	* This routine jerks page mappings from the
	1235	* kernel -- it is meant only for temporary mappings.
	1236	*
	1237	* MPSAFE, INTERRUPT SAFE (cluster callback)
	1238	*/
	1239	void
	1240	pmap_qremove(vm_offset_t va, int count)
	1241	{
	1242	vm_offset_t end_va;
	1243
	1244	end_va = va + count * PAGE_SIZE;
	1245
	1246	while (va < end_va) {
	1247	pt_entry_t *pte;
	1248
	1249	pte = vtopte(va);
	1250	(void)pte_load_clear(pte);
	1251	cpu_invlpg((void *)va);
	1252	va += PAGE_SIZE;
	1253	}
	1254	smp_invltlb();
	1255	}
	1256
	1257	/*
	1258	* Create a new thread and optionally associate it with a (new) process.
	1259	* NOTE! the new thread's cpu may not equal the current cpu.
	1260	*/
	1261	void
	1262	pmap_init_thread(thread_t td)
	1263	{
	1264	/* enforce pcb placement & alignment */
	1265	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
	1266	td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF);
	1267	td->td_savefpu = &td->td_pcb->pcb_save;
	1268	td->td_sp = (char )td->td_pcb; / no -16 */
	1269	}
	1270
	/*
	 * Per-process pmap initialization hook.  Nothing needs to be done on
	 * this platform, so the body is intentionally empty.
	 *
	 * This routine directly affects the fork perf for a process.
	 */
	void
	pmap_init_proc(struct proc *p)
	{
		/* nothing to do */
	}
	1278
	1279	/*
	1280	* Initialize pmap0/vmspace0. This pmap is not added to pmap_list because
	1281	* it, and IdlePTD, represents the template used to update all other pmaps.
	1282	*
	1283	* On architectures where the kernel pmap is not integrated into the user
	1284	* process pmap, this pmap represents the process pmap, not the kernel pmap.
	1285	* kernel_pmap should be used to directly access the kernel_pmap.
	1286	*/
	1287	void
	1288	pmap_pinit0(struct pmap *pmap)
	1289	{
	1290	pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys);
	1291	pmap->pm_count = 1;
	1292	pmap->pm_active = 0;
	1293	pmap->pm_pvhint = NULL;
	1294	RB_INIT(&pmap->pm_pvroot);
	1295	spin_init(&pmap->pm_spin);
	1296	lwkt_token_init(&pmap->pm_token, "pmap_tok");
	1297	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	1298	}
	1299
	1300	/*
	1301	* Initialize a preallocated and zeroed pmap structure,
	1302	* such as one in a vmspace structure.
	1303	*/
	1304	void
	1305	pmap_pinit(struct pmap *pmap)
	1306	{
	1307	pv_entry_t pv;
	1308	int j;
	1309
	1310	/*
	1311	* Misc initialization
	1312	*/
	1313	pmap->pm_count = 1;
	1314	pmap->pm_active = 0;
	1315	pmap->pm_pvhint = NULL;
	1316	if (pmap->pm_pmlpv == NULL) {
	1317	RB_INIT(&pmap->pm_pvroot);
	1318	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	1319	spin_init(&pmap->pm_spin);
	1320	lwkt_token_init(&pmap->pm_token, "pmap_tok");
	1321	}
	1322
	1323	/*
	1324	* No need to allocate page table space yet but we do need a valid
	1325	* page directory table.
	1326	*/
	1327	if (pmap->pm_pml4 == NULL) {
	1328	pmap->pm_pml4 =
	1329	(pml4_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
	1330	}
	1331
	1332	/*
	1333	* Allocate the page directory page, which wires it even though
	1334	* it isn't being entered into some higher level page table (it
	1335	* being the highest level). If one is already cached we don't
	1336	* have to do anything.
	1337	*/
	1338	if ((pv = pmap->pm_pmlpv) == NULL) {
	1339	pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
	1340	pmap->pm_pmlpv = pv;
	1341	pmap_kenter((vm_offset_t)pmap->pm_pml4,
	1342	VM_PAGE_TO_PHYS(pv->pv_m));
	1343	pv_put(pv);
	1344
	1345	/*
	1346	* Install DMAP and KMAP.
	1347	*/
	1348	for (j = 0; j < NDMPML4E; ++j) {
	1349	pmap->pm_pml4[DMPML4I + j] =
	1350	(DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) \|
	1351	PG_RW \| PG_V \| PG_U;
	1352	}
	1353	pmap->pm_pml4[KPML4I] = KPDPphys \| PG_RW \| PG_V \| PG_U;
	1354
	1355	/*
	1356	* install self-referential address mapping entry
	1357	*/
	1358	pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) \|
	1359	PG_V \| PG_RW \| PG_A \| PG_M;
	1360	} else {
	1361	KKASSERT(pv->pv_m->flags & PG_MAPPED);
	1362	KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
	1363	}
	1364	KKASSERT(pmap->pm_pml4[255] == 0);
	1365	KKASSERT(RB_ROOT(&pmap->pm_pvroot) == pv);
	1366	KKASSERT(pv->pv_entry.rbe_left == NULL);
	1367	KKASSERT(pv->pv_entry.rbe_right == NULL);
	1368	}
	1369
	1370	/*
	1371	* Clean up a pmap structure so it can be physically freed. This routine
	1372	* is called by the vmspace dtor function. A great deal of pmap data is
	1373	* left passively mapped to improve vmspace management so we have a bit
	1374	* of cleanup work to do here.
	1375	*/
	1376	void
	1377	pmap_puninit(pmap_t pmap)
	1378	{
	1379	pv_entry_t pv;
	1380	vm_page_t p;
	1381
	1382	KKASSERT(pmap->pm_active == 0);
	1383	if ((pv = pmap->pm_pmlpv) != NULL) {
	1384	if (pv_hold_try(pv) == 0)
	1385	pv_lock(pv);
	1386	p = pmap_remove_pv_page(pv);
	1387	pv_free(pv);
	1388	pmap_kremove((vm_offset_t)pmap->pm_pml4);
	1389	vm_page_busy_wait(p, FALSE, "pgpun");
	1390	KKASSERT(p->flags & (PG_FICTITIOUS\|PG_UNMANAGED));
	1391	vm_page_unwire(p, 0);
	1392	vm_page_flag_clear(p, PG_MAPPED \| PG_WRITEABLE);
	1393
	1394	/*
	1395	* XXX eventually clean out PML4 static entries and
	1396	* use vm_page_free_zero()
	1397	*/
	1398	vm_page_free(p);
	1399	pmap->pm_pmlpv = NULL;
	1400	}
	1401	if (pmap->pm_pml4) {
	1402	KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
	1403	kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE);
	1404	pmap->pm_pml4 = NULL;
	1405	}
	1406	KKASSERT(pmap->pm_stats.resident_count == 0);
	1407	KKASSERT(pmap->pm_stats.wired_count == 0);
	1408	}
	1409
	1410	/*
	1411	* Wire in kernel global address entries. To avoid a race condition
	1412	* between pmap initialization and pmap_growkernel, this procedure
	1413	* adds the pmap to the master list (which growkernel scans to update),
	1414	* then copies the template.
	1415	*/
	1416	void
	1417	pmap_pinit2(struct pmap *pmap)
	1418	{
	1419	/*
	1420	* XXX copies current process, does not fill in MPPTDI
	1421	*/
	1422	spin_lock(&pmap_spin);
	1423	TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
	1424	spin_unlock(&pmap_spin);
	1425	}
	1426
	1427	/*
	1428	* This routine is called when various levels in the page table need to
	1429	* be populated. This routine cannot fail.
	1430	*
	1431	* This function returns two locked pv_entry's, one representing the
	1432	* requested pv and one representing the requested pv's parent pv. If
	1433	* the pv did not previously exist it will be mapped into its parent
	1434	* and wired, otherwise no additional wire count will be added.
	1435	*/
	1436	static
	1437	pv_entry_t
	1438	pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
	1439	{
	1440	pt_entry_t *ptep;
	1441	pv_entry_t pv;
	1442	pv_entry_t pvp;
	1443	vm_pindex_t pt_pindex;
	1444	vm_page_t m;
	1445	int isnew;
	1446
	1447	/*
	1448	* If the pv already exists and we aren't being asked for the
	1449	* parent page table page we can just return it. A locked+held pv
	1450	* is returned.
	1451	*/
	1452	pv = pv_alloc(pmap, ptepindex, &isnew);
	1453	if (isnew == 0 && pvpp == NULL)
	1454	return(pv);
	1455
	1456	/*
	1457	* This is a new PV, we have to resolve its parent page table and
	1458	* add an additional wiring to the page if necessary.
	1459	*/
	1460
	1461	/*
	1462	* Special case terminal PVs. These are not page table pages so
	1463	* no vm_page is allocated (the caller supplied the vm_page). If
	1464	* pvpp is non-NULL we are being asked to also removed the pt_pv
	1465	* for this pv.
	1466	*
	1467	* Note that pt_pv's are only returned for user VAs. We assert that
	1468	* a pt_pv is not being requested for kernel VAs.
	1469	*/
	1470	if (ptepindex < pmap_pt_pindex(0)) {
	1471	if (ptepindex >= NUPTE_USER)
	1472	KKASSERT(pvpp == NULL);
	1473	else
	1474	KKASSERT(pvpp != NULL);
	1475	if (pvpp) {
	1476	pt_pindex = NUPTE_TOTAL + (ptepindex >> NPTEPGSHIFT);
	1477	pvp = pmap_allocpte(pmap, pt_pindex, NULL);
	1478	if (isnew)
	1479	vm_page_wire_quick(pvp->pv_m);
	1480	*pvpp = pvp;
	1481	} else {
	1482	pvp = NULL;
	1483	}
	1484	return(pv);
	1485	}
	1486
	1487	/*
	1488	* Non-terminal PVs allocate a VM page to represent the page table,
	1489	* so we have to resolve pvp and calculate ptepindex for the pvp
	1490	* and then for the page table entry index in the pvp for
	1491	* fall-through.
	1492	*/
	1493	if (ptepindex < pmap_pd_pindex(0)) {
	1494	/*
	1495	* pv is PT, pvp is PD
	1496	*/
	1497	ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT;
	1498	ptepindex += NUPTE_TOTAL + NUPT_TOTAL;
	1499	pvp = pmap_allocpte(pmap, ptepindex, NULL);
	1500	if (!isnew)
	1501	goto notnew;
	1502
	1503	/*
	1504	* PT index in PD
	1505	*/
	1506	ptepindex = pv->pv_pindex - pmap_pt_pindex(0);
	1507	ptepindex &= ((1ul << NPDEPGSHIFT) - 1);
	1508	} else if (ptepindex < pmap_pdp_pindex(0)) {
	1509	/*
	1510	* pv is PD, pvp is PDP
	1511	*/
	1512	ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT;
	1513	ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
	1514	pvp = pmap_allocpte(pmap, ptepindex, NULL);
	1515	if (!isnew)
	1516	goto notnew;
	1517
	1518	/*
	1519	* PD index in PDP
	1520	*/
	1521	ptepindex = pv->pv_pindex - pmap_pd_pindex(0);
	1522	ptepindex &= ((1ul << NPDPEPGSHIFT) - 1);
	1523	} else if (ptepindex < pmap_pml4_pindex()) {
	1524	/*
	1525	* pv is PDP, pvp is the root pml4 table
	1526	*/
	1527	pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
	1528	if (!isnew)
	1529	goto notnew;
	1530
	1531	/*
	1532	* PDP index in PML4
	1533	*/
	1534	ptepindex = pv->pv_pindex - pmap_pdp_pindex(0);
	1535	ptepindex &= ((1ul << NPML4EPGSHIFT) - 1);
	1536	} else {
	1537	/*
	1538	* pv represents the top-level PML4, there is no parent.
	1539	*/
	1540	pvp = NULL;
	1541	if (!isnew)
	1542	goto notnew;
	1543	}
	1544
	1545	/*
	1546	* This code is only reached if isnew is TRUE and this is not a
	1547	* terminal PV. We need to allocate a vm_page for the page table
	1548	* at this level and enter it into the parent page table.
	1549	*
	1550	* page table pages are marked PG_WRITEABLE and PG_MAPPED.
	1551	*/
	1552	for (;;) {
	1553	m = vm_page_alloc(NULL, pv->pv_pindex,
	1554	VM_ALLOC_NORMAL \| VM_ALLOC_SYSTEM \|
	1555	VM_ALLOC_INTERRUPT);
	1556	if (m)
	1557	break;
	1558	vm_wait(0);
	1559	}
	1560	vm_page_spin_lock(m);
	1561	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
	1562	pv->pv_m = m;
	1563	vm_page_flag_set(m, PG_MAPPED \| PG_WRITEABLE);
	1564	vm_page_spin_unlock(m);
	1565	vm_page_unmanage(m); /* m must be spinunlocked */
	1566
	1567	if ((m->flags & PG_ZERO) == 0) {
	1568	pmap_zero_page(VM_PAGE_TO_PHYS(m));
	1569	}
	1570	#ifdef PMAP_DEBUG
	1571	else {
	1572	pmap_page_assertzero(VM_PAGE_TO_PHYS(m));
	1573	}
	1574	#endif
	1575	m->valid = VM_PAGE_BITS_ALL;
	1576	vm_page_flag_clear(m, PG_ZERO);
	1577	vm_page_wire(m); /* wire for mapping in parent */
	1578
	1579	/*
	1580	* Wire the page into pvp, bump the wire-count for pvp's page table
	1581	* page. Bump the resident_count for the pmap. There is no pvp
	1582	* for the top level, address the pm_pml4[] array directly.
	1583	*
	1584	* If the caller wants the parent we return it, otherwise
	1585	* we just put it away.
	1586	*
	1587	* No interlock is needed for pte 0 -> non-zero.
	1588	*/
	1589	if (pvp) {
	1590	vm_page_wire_quick(pvp->pv_m);
	1591	ptep = pv_pte_lookup(pvp, ptepindex);
	1592	KKASSERT((*ptep & PG_V) == 0);
	1593	*ptep = VM_PAGE_TO_PHYS(m) \| (PG_U \| PG_RW \| PG_V \|
	1594	PG_A \| PG_M);
	1595	}
	1596	vm_page_wakeup(m);
	1597	notnew:
	1598	if (pvpp)
	1599	*pvpp = pvp;
	1600	else if (pvp)
	1601	pv_put(pvp);
	1602	return (pv);
	1603	}
	1604
	1605	/*
	1606	* Release any resources held by the given physical map.
	1607	*
	1608	* Called when a pmap initialized by pmap_pinit is being released. Should
	1609	* only be called if the map contains no valid mappings.
	1610	*
	1611	* Caller must hold pmap->pm_token
	1612	*/
	/* Argument block handed to pmap_release_callback() through RB_SCAN. */
	1613	struct pmap_release_info {
	1614	pmap_t pmap;	/* pmap being released; set by pmap_release() */
	1615	int retry;	/* set to 1 by the callback to request another scan */
	1616	};
	1617
	/* RB_SCAN callback used by pmap_release(); defined below. */
	1618	static int pmap_release_callback(pv_entry_t pv, void *data);
	1619
	1620	void
	1621	pmap_release(struct pmap *pmap)
	1622	{
	1623	struct pmap_release_info info;
	1624
	1625	KASSERT(pmap->pm_active == 0,
	1626	("pmap still active! %016jx", (uintmax_t)pmap->pm_active));
	1627
	1628	spin_lock(&pmap_spin);
	1629	TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
	1630	spin_unlock(&pmap_spin);
	1631
	1632	/*
	1633	* Pull pv's off the RB tree in order from low to high and release
	1634	* each page.
	1635	*/
	1636	info.pmap = pmap;
	1637	do {
	1638	info.retry = 0;
	1639	spin_lock(&pmap->pm_spin);
	1640	RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL,
	1641	pmap_release_callback, &info);
	1642	spin_unlock(&pmap->pm_spin);
	1643	} while (info.retry);
	1644
	1645
	1646	/*
	1647	* One resident page (the pml4 page) should remain.
	1648	* No wired pages should remain.
	1649	*/
	1650	KKASSERT(pmap->pm_stats.resident_count == 1);
	1651	KKASSERT(pmap->pm_stats.wired_count == 0);
	1652	}
	1653
	1654	static int
	1655	pmap_release_callback(pv_entry_t pv, void *data)
	1656	{
	1657	struct pmap_release_info *info = data;
	1658	pmap_t pmap = info->pmap;
	1659	vm_page_t p;
	1660
	1661	if (pv_hold_try(pv)) {
	1662	spin_unlock(&pmap->pm_spin);
	1663	} else {
	1664	spin_unlock(&pmap->pm_spin);
	1665	pv_lock(pv);
	1666	if (pv->pv_pmap != pmap) {
	1667	pv_put(pv);
	1668	spin_lock(&pmap->pm_spin);
	1669	info->retry = 1;
	1670	return(-1);
	1671	}
	1672	}
	1673
	1674	/*
	1675	* The pmap is currently not spinlocked, pv is held+locked.
	1676	* Remove the pv's page from its parent's page table. The
	1677	* parent's page table page's wire_count will be decremented.
	1678	*/
	1679	pmap_remove_pv_pte(pv, NULL, NULL);
	1680
	1681	/*
	1682	* Terminal pvs are unhooked from their vm_pages. Because
	1683	* terminal pages aren't page table pages they aren't wired
	1684	* by us, so we have to be sure not to unwire them either.
	1685	*/
	1686	if (pv->pv_pindex < pmap_pt_pindex(0)) {
	1687	pmap_remove_pv_page(pv);
	1688	goto skip;
	1689	}
	1690
	1691	/*
	1692	* We leave the top-level page table page cached, wired, and
	1693	* mapped in the pmap until the dtor function (pmap_puninit())
	1694	* gets called.
	1695	*
	1696	* Since we are leaving the top-level pv intact we need
	1697	* to break out of what would otherwise be an infinite loop.
	1698	*/
	1699	if (pv->pv_pindex == pmap_pml4_pindex()) {
	1700	pv_put(pv);
	1701	spin_lock(&pmap->pm_spin);
	1702	return(-1);
	1703	}
	1704
	1705	/*
	1706	* For page table pages (other than the top-level page),
	1707	* remove and free the vm_page. The representitive mapping
	1708	* removed above by pmap_remove_pv_pte() did not undo the
	1709	* last wire_count so we have to do that as well.
	1710	*/
	1711	p = pmap_remove_pv_page(pv);
	1712	vm_page_busy_wait(p, FALSE, "pmaprl");
	1713	if (p->wire_count != 1) {
	1714	kprintf("p->wire_count was %016lx %d\n",
	1715	pv->pv_pindex, p->wire_count);
	1716	}
	1717	KKASSERT(p->wire_count == 1);
	1718	KKASSERT(p->flags & PG_UNMANAGED);
	1719
	1720	vm_page_unwire(p, 0);
	1721	KKASSERT(p->wire_count == 0);
	1722	/* JG eventually revert to using vm_page_free_zero() */
	1723	vm_page_free(p);
	1724	skip:
	1725	pv_free(pv);
	1726	spin_lock(&pmap->pm_spin);
	1727	return(0);
	1728	}
	1729
	1730	/*
	1731	* This function will remove the pte associated with a pv from its parent.
	1732	* Terminal pv's are supported. The removal will be interlocked if info
	1733	* is non-NULL. The caller must dispose of pv instead of just unlocking
	1734	* it.
	1735	*
	1736	* The wire count will be dropped on the parent page table. The wire
	1737	* count on the page being removed (pv->pv_m) from the parent page table
	1738	* is NOT touched. Note that terminal pages will not have any additional
	1739	* wire counts while page table pages will have at least one representing
	1740	* the mapping, plus others representing sub-mappings.
	1741	*
	1742	* NOTE: Cannot be called on kernel page table pages, only KVM terminal
	1743	* pages and user page table and terminal pages.
	1744	*
	1745	* The pv must be locked.
	1746	*
	1747	* XXX must lock parent pv's if they exist to remove pte XXX
	1748	*/
	1749	static
	1750	void
	1751	pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
	1752	{
	1753	vm_pindex_t ptepindex = pv->pv_pindex;
	1754	pmap_t pmap = pv->pv_pmap;
	1755	vm_page_t p;
	1756	int gotpvp = 0;
	1757
	1758	KKASSERT(pmap);
	1759
	1760	if (ptepindex == pmap_pml4_pindex()) {
	1761	/*
	1762	* We are the top level pml4 table, there is no parent.
	1763	*/
	1764	p = pmap->pm_pmlpv->pv_m;
	1765	} else if (ptepindex >= pmap_pdp_pindex(0)) {
	1766	/*
	1767	* Remove a PDP page from the pml4e. This can only occur
	1768	* with user page tables. We do not have to lock the
	1769	* pml4 PV so just ignore pvp.
	1770	*/
	1771	vm_pindex_t pml4_pindex;
	1772	vm_pindex_t pdp_index;
	1773	pml4_entry_t *pdp;
	1774
	1775	pdp_index = ptepindex - pmap_pdp_pindex(0);
	1776	if (pvp == NULL) {
	1777	pml4_pindex = pmap_pml4_pindex();
	1778	pvp = pv_get(pv->pv_pmap, pml4_pindex);
	1779	gotpvp = 1;
	1780	}
	1781	pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
	1782	KKASSERT((*pdp & PG_V) != 0);
	1783	p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
	1784	*pdp = 0;
	1785	KKASSERT(info == NULL);
	1786	} else if (ptepindex >= pmap_pd_pindex(0)) {
	1787	/*
	1788	* Remove a PD page from the pdp
	1789	*/
	1790	vm_pindex_t pdp_pindex;
	1791	vm_pindex_t pd_index;
	1792	pdp_entry_t *pd;
	1793
	1794	pd_index = ptepindex - pmap_pd_pindex(0);
	1795
	1796	if (pvp == NULL) {
	1797	pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
	1798	(pd_index >> NPML4EPGSHIFT);
	1799	pvp = pv_get(pv->pv_pmap, pdp_pindex);
	1800	gotpvp = 1;
	1801	}
	1802	pd = pv_pte_lookup(pvp, pd_index & ((1ul << NPDPEPGSHIFT) - 1));
	1803	KKASSERT((*pd & PG_V) != 0);
	1804	p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
	1805	*pd = 0;
	1806	KKASSERT(info == NULL);
	1807	} else if (ptepindex >= pmap_pt_pindex(0)) {
	1808	/*
	1809	* Remove a PT page from the pd
	1810	*/
	1811	vm_pindex_t pd_pindex;
	1812	vm_pindex_t pt_index;
	1813	pd_entry_t *pt;
	1814
	1815	pt_index = ptepindex - pmap_pt_pindex(0);
	1816
	1817	if (pvp == NULL) {
	1818	pd_pindex = NUPTE_TOTAL + NUPT_TOTAL +
	1819	(pt_index >> NPDPEPGSHIFT);
	1820	pvp = pv_get(pv->pv_pmap, pd_pindex);
	1821	gotpvp = 1;
	1822	}
	1823	pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1));
	1824	KKASSERT((*pt & PG_V) != 0);
	1825	p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
	1826	*pt = 0;
	1827	KKASSERT(info == NULL);
	1828	} else {
	1829	/*
	1830	* Remove a PTE from the PT page
	1831	*
	1832	* NOTE: pv's must be locked bottom-up to avoid deadlocking.
	1833	* pv is a pte_pv so we can safely lock pt_pv.
	1834	*/
	1835	vm_pindex_t pt_pindex;
	1836	pt_entry_t *ptep;
	1837	pt_entry_t pte;
	1838	vm_offset_t va;
	1839
	1840	pt_pindex = ptepindex >> NPTEPGSHIFT;
	1841	va = (vm_offset_t)ptepindex << PAGE_SHIFT;
	1842
	1843	if (ptepindex >= NUPTE_USER) {
	1844	ptep = vtopte(ptepindex << PAGE_SHIFT);
	1845	KKASSERT(pvp == NULL);
	1846	} else {
	1847	if (pvp == NULL) {
	1848	pt_pindex = NUPTE_TOTAL +
	1849	(ptepindex >> NPDPEPGSHIFT);
	1850	pvp = pv_get(pv->pv_pmap, pt_pindex);
	1851	gotpvp = 1;
	1852	}
	1853	ptep = pv_pte_lookup(pvp, ptepindex &
	1854	((1ul << NPDPEPGSHIFT) - 1));
	1855	}
	1856
	1857	if (info)
	1858	pmap_inval_interlock(info, pmap, va);
	1859	pte = pte_load_clear(ptep);
	1860	if (info)
	1861	pmap_inval_deinterlock(info, pmap);
	1862	else
	1863	cpu_invlpg((void *)va);
	1864
	1865	/*
	1866	* Now update the vm_page_t
	1867	*/
	1868	if ((pte & (PG_MANAGED\|PG_V)) != (PG_MANAGED\|PG_V)) {
	1869	kprintf("remove_pte badpte %016lx %016lx %d\n",
	1870	pte, pv->pv_pindex,
	1871	pv->pv_pindex < pmap_pt_pindex(0));
	1872	}
	1873	/KKASSERT((pte & (PG_MANAGED\|PG_V)) == (PG_MANAGED\|PG_V));/
	1874	p = PHYS_TO_VM_PAGE(pte & PG_FRAME);
	1875
	1876	if (pte & PG_M) {
	1877	if (pmap_track_modified(ptepindex))
	1878	vm_page_dirty(p);
	1879	}
	1880	if (pte & PG_A) {
	1881	vm_page_flag_set(p, PG_REFERENCED);
	1882	}
	1883	if (pte & PG_W)
	1884	atomic_add_long(&pmap->pm_stats.wired_count, -1);
	1885	if (pte & PG_G)
	1886	cpu_invlpg((void *)va);
	1887	}
	1888
	1889	/*
	1890	* Unwire the parent page table page. The wire_count cannot go below
	1891	* 1 here because the parent page table page is itself still mapped.
	1892	*
	1893	* XXX remove the assertions later.
	1894	*/
	1895	KKASSERT(pv->pv_m == p);
	1896	if (pvp && vm_page_unwire_quick(pvp->pv_m))
	1897	panic("pmap_remove_pv_pte: Insufficient wire_count");
	1898
	1899	if (gotpvp)
	1900	pv_put(pvp);
	1901	}
	1902
	1903	static
	1904	vm_page_t
	1905	pmap_remove_pv_page(pv_entry_t pv)
	1906	{
	1907	vm_page_t m;
	1908
	1909	m = pv->pv_m;
	1910	KKASSERT(m);
	1911	vm_page_spin_lock(m);
	1912	pv->pv_m = NULL;
	1913	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
	1914	/*
	1915	if (m->object)
	1916	atomic_add_int(&m->object->agg_pv_list_count, -1);
	1917	*/
	1918	if (TAILQ_EMPTY(&m->md.pv_list))
	1919	vm_page_flag_clear(m, PG_MAPPED \| PG_WRITEABLE);
	1920	vm_page_spin_unlock(m);
	1921	return(m);
	1922	}
	1923
	1924	/*
	1925	* Grow the number of kernel page table entries, if needed.
	1926	*
	1927	* This routine is always called to validate any address space
	1928	* beyond KERNBASE (for kldloads). kernel_vm_end only governs the address
	1929	* space below KERNBASE.
	1930	*/
	1931	void
	1932	pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
	1933	{
	1934	vm_paddr_t paddr;
	1935	vm_offset_t ptppaddr;
	1936	vm_page_t nkpg;
	1937	pd_entry_t *pt, newpt;
	1938	pdp_entry_t newpd;
	1939	int update_kernel_vm_end;
	1940
	1941	/*
	1942	* bootstrap kernel_vm_end on first real VM use
	1943	*/
	1944	if (kernel_vm_end == 0) {
	1945	kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
	1946	nkpt = 0;
	1947	while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & PG_V) != 0) {
	1948	kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
	1949	~(PAGE_SIZE * NPTEPG - 1);
	1950	nkpt++;
	1951	if (kernel_vm_end - 1 >= kernel_map.max_offset) {
	1952	kernel_vm_end = kernel_map.max_offset;
	1953	break;
	1954	}
	1955	}
	1956	}
	1957
	1958	/*
	1959	* Fill in the gaps. kernel_vm_end is only adjusted for ranges
	1960	* below KERNBASE. Ranges above KERNBASE are kldloaded and we
	1961	* do not want to force-fill 128G worth of page tables.
	1962	*/
	1963	if (kstart < KERNBASE) {
	1964	if (kstart > kernel_vm_end)
	1965	kstart = kernel_vm_end;
	1966	KKASSERT(kend <= KERNBASE);
	1967	update_kernel_vm_end = 1;
	1968	} else {
	1969	update_kernel_vm_end = 0;
	1970	}
	1971
	1972	kstart = rounddown2(kstart, PAGE_SIZE * NPTEPG);
	1973	kend = roundup2(kend, PAGE_SIZE * NPTEPG);
	1974
	1975	if (kend - 1 >= kernel_map.max_offset)
	1976	kend = kernel_map.max_offset;
	1977
	1978	while (kstart < kend) {
	1979	pt = pmap_pt(&kernel_pmap, kstart);
	1980	if (pt == NULL) {
	1981	/* We need a new PDP entry */
	1982	nkpg = vm_page_alloc(NULL, nkpt,
	1983	VM_ALLOC_NORMAL \|
	1984	VM_ALLOC_SYSTEM \|
	1985	VM_ALLOC_INTERRUPT);
	1986	if (nkpg == NULL) {
	1987	panic("pmap_growkernel: no memory to grow "
	1988	"kernel");
	1989	}
	1990	paddr = VM_PAGE_TO_PHYS(nkpg);
	1991	if ((nkpg->flags & PG_ZERO) == 0)
	1992	pmap_zero_page(paddr);
	1993	vm_page_flag_clear(nkpg, PG_ZERO);
	1994	newpd = (pdp_entry_t)
	1995	(paddr \| PG_V \| PG_RW \| PG_A \| PG_M);
	1996	*pmap_pd(&kernel_pmap, kstart) = newpd;
	1997	nkpt++;
	1998	continue; /* try again */
	1999	}
	2000	if ((*pt & PG_V) != 0) {
	2001	kstart = (kstart + PAGE_SIZE * NPTEPG) &
	2002	~(PAGE_SIZE * NPTEPG - 1);
	2003	if (kstart - 1 >= kernel_map.max_offset) {
	2004	kstart = kernel_map.max_offset;
	2005	break;
	2006	}
	2007	continue;
	2008	}
	2009
	2010	/*
	2011	* This index is bogus, but out of the way
	2012	*/
	2013	nkpg = vm_page_alloc(NULL, nkpt,
	2014	VM_ALLOC_NORMAL \|
	2015	VM_ALLOC_SYSTEM \|
	2016	VM_ALLOC_INTERRUPT);
	2017	if (nkpg == NULL)
	2018	panic("pmap_growkernel: no memory to grow kernel");
	2019
	2020	vm_page_wire(nkpg);
	2021	ptppaddr = VM_PAGE_TO_PHYS(nkpg);
	2022	pmap_zero_page(ptppaddr);
	2023	vm_page_flag_clear(nkpg, PG_ZERO);
	2024	newpt = (pd_entry_t) (ptppaddr \| PG_V \| PG_RW \| PG_A \| PG_M);
	2025	*pmap_pt(&kernel_pmap, kstart) = newpt;
	2026	nkpt++;
	2027
	2028	kstart = (kstart + PAGE_SIZE * NPTEPG) &
	2029	~(PAGE_SIZE * NPTEPG - 1);
	2030
	2031	if (kstart - 1 >= kernel_map.max_offset) {
	2032	kstart = kernel_map.max_offset;
	2033	break;
	2034	}
	2035	}
	2036
	2037	/*
	2038	* Only update kernel_vm_end for areas below KERNBASE.
	2039	*/
	2040	if (update_kernel_vm_end && kernel_vm_end < kstart)
	2041	kernel_vm_end = kstart;
	2042	}
	2043
	2044	/*
	2045	* Retire the given physical map from service.
	2046	* Should only be called if the map contains
	2047	* no valid mappings.
	2048	*/
	2049	void
	2050	pmap_destroy(pmap_t pmap)
	2051	{
	2052	int count;
	2053
	2054	if (pmap == NULL)
	2055	return;
	2056
	2057	lwkt_gettoken(&pmap->pm_token);
	2058	count = --pmap->pm_count;
	2059	if (count == 0) {
	2060	pmap_release(pmap); /* eats pm_token */
	2061	panic("destroying a pmap is not yet implemented");
	2062	}
	2063	lwkt_reltoken(&pmap->pm_token);
	2064	}
	2065
	2066	/*
	2067	* Add a reference to the specified pmap.
	2068	*/
	2069	void
	2070	pmap_reference(pmap_t pmap)
	2071	{
	2072	if (pmap != NULL) {
	2073	lwkt_gettoken(&pmap->pm_token);
	2074	pmap->pm_count++;
	2075	lwkt_reltoken(&pmap->pm_token);
	2076	}
	2077	}
	2078
	2079	/***************************************************
	2080	* page management routines.
	2081	***************************************************/
	2082
	2083	/*
	2084	* Hold a pv without locking it
	2085	*/
	2086	static void
	2087	pv_hold(pv_entry_t pv)
	2088	{
	2089	u_int count;
	2090
	2091	if (atomic_cmpset_int(&pv->pv_hold, 0, 1))
	2092	return;
	2093
	2094	for (;;) {
	2095	count = pv->pv_hold;
	2096	cpu_ccfence();
	2097	if (atomic_cmpset_int(&pv->pv_hold, count, count + 1))
	2098	return;
	2099	/* retry */
	2100	}
	2101	}
	2102
	2103	/*
	2104	* Hold a pv_entry, preventing its destruction. TRUE is returned if the pv
	2105	* was successfully locked, FALSE if it wasn't. The caller must dispose of
	2106	* the pv properly.
	2107	*
	2108	* Either the pmap->pm_spin or the related vm_page_spin (if traversing a
	2109	* pv list via its page) must be held by the caller.
	2110	*/
	2111	static int
	2112	_pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL)
	2113	{
	2114	u_int count;
	2115
	2116	if (atomic_cmpset_int(&pv->pv_hold, 0, PV_HOLD_LOCKED \| 1)) {
	2117	#ifdef PMAP_DEBUG
	2118	pv->pv_func = func;
	2119	pv->pv_line = lineno;
	2120	#endif
	2121	return TRUE;
	2122	}
	2123
	2124	for (;;) {
	2125	count = pv->pv_hold;
	2126	cpu_ccfence();
	2127	if ((count & PV_HOLD_LOCKED) == 0) {
	2128	if (atomic_cmpset_int(&pv->pv_hold, count,
	2129	(count + 1) \| PV_HOLD_LOCKED)) {
	2130	#ifdef PMAP_DEBUG
	2131	pv->pv_func = func;
	2132	pv->pv_line = lineno;
	2133	#endif
	2134	return TRUE;
	2135	}
	2136	} else {
	2137	if (atomic_cmpset_int(&pv->pv_hold, count, count + 1))
	2138	return FALSE;
	2139	}
	2140	/* retry */
	2141	}
	2142	}
	2143
	2144	/*
	2145	* Drop a previously held pv_entry which could not be locked, allowing its
	2146	* destruction.
	2147	*
	2148	* Must not be called with a spinlock held as we might zfree() the pv if it
	2149	* is no longer associated with a pmap and this was the last hold count.
	2150	*/
	2151	static void
	2152	pv_drop(pv_entry_t pv)
	2153	{
	2154	u_int count;
	2155
	2156	if (atomic_cmpset_int(&pv->pv_hold, 1, 0)) {
	2157	if (pv->pv_pmap == NULL)
	2158	zfree(pvzone, pv);
	2159	return;
	2160	}
	2161
	2162	for (;;) {
	2163	count = pv->pv_hold;
	2164	cpu_ccfence();
	2165	KKASSERT((count & PV_HOLD_MASK) > 0);
	2166	KKASSERT((count & (PV_HOLD_LOCKED \| PV_HOLD_MASK)) !=
	2167	(PV_HOLD_LOCKED \| 1));
	2168	if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) {
	2169	if (count == 1 && pv->pv_pmap == NULL)
	2170	zfree(pvzone, pv);
	2171	return;
	2172	}
	2173	/* retry */
	2174	}
	2175	}
	2176
	2177	/*
	2178	* Find or allocate the requested PV entry, returning a locked pv
	2179	*/
	2180	static
	2181	pv_entry_t
	2182	_pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL)
	2183	{
	2184	pv_entry_t pv;
	2185	pv_entry_t pnew = NULL;
	2186
	2187	spin_lock(&pmap->pm_spin);
	2188	for (;;) {
	2189	if ((pv = pmap->pm_pvhint) == NULL \|\| pv->pv_pindex != pindex) {
	2190	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
	2191	pindex);
	2192	}
	2193	if (pv == NULL) {
	2194	if (pnew == NULL) {
	2195	spin_unlock(&pmap->pm_spin);
	2196	pnew = zalloc(pvzone);
	2197	spin_lock(&pmap->pm_spin);
	2198	continue;
	2199	}
	2200	pnew->pv_pmap = pmap;
	2201	pnew->pv_pindex = pindex;
	2202	pnew->pv_hold = PV_HOLD_LOCKED \| 1;
	2203	#ifdef PMAP_DEBUG
	2204	pnew->pv_func = func;
	2205	pnew->pv_line = lineno;
	2206	#endif
	2207	pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew);
	2208	atomic_add_long(&pmap->pm_stats.resident_count, 1);
	2209	spin_unlock(&pmap->pm_spin);
	2210	*isnew = 1;
	2211	return(pnew);
	2212	}
	2213	if (pnew) {
	2214	spin_unlock(&pmap->pm_spin);
	2215	zfree(pvzone, pnew);
	2216	pnew = NULL;
	2217	spin_lock(&pmap->pm_spin);
	2218	continue;
	2219	}
	2220	if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
	2221	spin_unlock(&pmap->pm_spin);
	2222	*isnew = 0;
	2223	return(pv);
	2224	}
	2225	spin_unlock(&pmap->pm_spin);
	2226	_pv_lock(pv PMAP_DEBUG_COPY);
	2227	if (pv->pv_pmap == pmap && pv->pv_pindex == pindex) {
	2228	*isnew = 0;
	2229	return(pv);
	2230	}
	2231	pv_put(pv);
	2232	spin_lock(&pmap->pm_spin);
	2233	}
	2234
	2235
	2236	}
	2237
	2238	/*
	2239	* Find the requested PV entry, returning a locked+held pv or NULL
	2240	*/
	2241	static
	2242	pv_entry_t
	2243	_pv_get(pmap_t pmap, vm_pindex_t pindex PMAP_DEBUG_DECL)
	2244	{
	2245	pv_entry_t pv;
	2246
	2247	spin_lock(&pmap->pm_spin);
	2248	for (;;) {
	2249	/*
	2250	* Shortcut cache
	2251	*/
	2252	if ((pv = pmap->pm_pvhint) == NULL \|\| pv->pv_pindex != pindex) {
	2253	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot,
	2254	pindex);
	2255	}
	2256	if (pv == NULL) {
	2257	spin_unlock(&pmap->pm_spin);
	2258	return NULL;
	2259	}
	2260	if (_pv_hold_try(pv PMAP_DEBUG_COPY)) {
	2261	pv_cache(pv, pindex);
	2262	spin_unlock(&pmap->pm_spin);
	2263	return(pv);
	2264	}
	2265	spin_unlock(&pmap->pm_spin);
	2266	_pv_lock(pv PMAP_DEBUG_COPY);
	2267	if (pv->pv_pmap == pmap && pv->pv_pindex == pindex)
	2268	return(pv);
	2269	pv_put(pv);
	2270	spin_lock(&pmap->pm_spin);
	2271	}
	2272	}
	2273
	2274	/*
	2275	* Lookup, hold, and attempt to lock (pmap,pindex).
	2276	*
	2277	* If the entry does not exist NULL is returned and *errorp is set to 0
	2278	*
	2279	* If the entry exists and could be successfully locked it is returned and
	2280	* errorp is set to 0.
	2281	*
	2282	* If the entry exists but could NOT be successfully locked it is returned
	2283	* held and *errorp is set to 1.
	2284	*/
	2285	static
	2286	pv_entry_t
	2287	pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp)
	2288	{
	2289	pv_entry_t pv;
	2290
	2291	spin_lock(&pmap->pm_spin);
	2292	if ((pv = pmap->pm_pvhint) == NULL \|\| pv->pv_pindex != pindex)
	2293	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
	2294	if (pv == NULL) {
	2295	spin_unlock(&pmap->pm_spin);
	2296	*errorp = 0;
	2297	return NULL;
	2298	}
	2299	if (pv_hold_try(pv)) {
	2300	pv_cache(pv, pindex);
	2301	spin_unlock(&pmap->pm_spin);
	2302	*errorp = 0;
	2303	return(pv); /* lock succeeded */
	2304	}
	2305	spin_unlock(&pmap->pm_spin);
	2306	*errorp = 1;
	2307	return (pv); /* lock failed */
	2308	}
	2309
	2310	/*
	2311	* Find the requested PV entry, returning a held pv or NULL
	2312	*/
	2313	static
	2314	pv_entry_t
	2315	pv_find(pmap_t pmap, vm_pindex_t pindex)
	2316	{
	2317	pv_entry_t pv;
	2318
	2319	spin_lock(&pmap->pm_spin);
	2320
	2321	if ((pv = pmap->pm_pvhint) == NULL \|\| pv->pv_pindex != pindex)
	2322	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
	2323	if (pv == NULL) {
	2324	spin_unlock(&pmap->pm_spin);
	2325	return NULL;
	2326	}
	2327	pv_hold(pv);
	2328	pv_cache(pv, pindex);
	2329	spin_unlock(&pmap->pm_spin);
	2330	return(pv);
	2331	}
	2332
	2333	/*
	2334	* Lock a held pv, keeping the hold count
	2335	*/
	2336	static
	2337	void
	2338	_pv_lock(pv_entry_t pv PMAP_DEBUG_DECL)
	2339	{
	2340	u_int count;
	2341
	2342	for (;;) {
	2343	count = pv->pv_hold;
	2344	cpu_ccfence();
	2345	if ((count & PV_HOLD_LOCKED) == 0) {
	2346	if (atomic_cmpset_int(&pv->pv_hold, count,
	2347	count \| PV_HOLD_LOCKED)) {
	2348	#ifdef PMAP_DEBUG
	2349	pv->pv_func = func;
	2350	pv->pv_line = lineno;
	2351	#endif
	2352	return;
	2353	}
	2354	continue;
	2355	}
	2356	tsleep_interlock(pv, 0);
	2357	if (atomic_cmpset_int(&pv->pv_hold, count,
	2358	count \| PV_HOLD_WAITING)) {
	2359	#ifdef PMAP_DEBUG
	2360	kprintf("pv waiting on %s:%d\n",
	2361	pv->pv_func, pv->pv_line);
	2362	#endif
	2363	tsleep(pv, PINTERLOCKED, "pvwait", hz);
	2364	}
	2365	/* retry */
	2366	}
	2367	}
	2368
	2369	/*
	2370	* Unlock a held and locked pv, keeping the hold count.
	2371	*/
	2372	static
	2373	void
	2374	pv_unlock(pv_entry_t pv)
	2375	{
	2376	u_int count;
	2377
	2378	if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED \| 1, 1))
	2379	return;
	2380
	2381	for (;;) {
	2382	count = pv->pv_hold;
	2383	cpu_ccfence();
	2384	KKASSERT((count & (PV_HOLD_LOCKED\|PV_HOLD_MASK)) >=
	2385	(PV_HOLD_LOCKED \| 1));
	2386	if (atomic_cmpset_int(&pv->pv_hold, count,
	2387	count &
	2388	~(PV_HOLD_LOCKED \| PV_HOLD_WAITING))) {
	2389	if (count & PV_HOLD_WAITING)
	2390	wakeup(pv);
	2391	break;
	2392	}
	2393	}
	2394	}
	2395
	2396	/*
	2397	* Unlock and drop a pv. If the pv is no longer associated with a pmap
	2398	* and the hold count drops to zero we will free it.
	2399	*
	2400	* Caller should not hold any spin locks. We are protected from hold races
	2401	* by virtue of holds only occuring only with a pmap_spin or vm_page_spin
	2402	* lock held. A pv cannot be located otherwise.
	2403	*/
	2404	static
	2405	void
	2406	pv_put(pv_entry_t pv)
	2407	{
	2408	if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED \| 1, 0)) {
	2409	if (pv->pv_pmap == NULL)
	2410	zfree(pvzone, pv);
	2411	return;
	2412	}
	2413	pv_unlock(pv);
	2414	pv_drop(pv);
	2415	}
	2416
	2417	/*
	2418	* Unlock, drop, and free a pv, destroying it. The pv is removed from its
	2419	* pmap. Any pte operations must have already been completed.
	2420	*/
	2421	static
	2422	void
	2423	pv_free(pv_entry_t pv)
	2424	{
	2425	pmap_t pmap;
	2426
	2427	KKASSERT(pv->pv_m == NULL);
	2428	if ((pmap = pv->pv_pmap) != NULL) {
	2429	spin_lock(&pmap->pm_spin);
	2430	pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
	2431	if (pmap->pm_pvhint == pv)
	2432	pmap->pm_pvhint = NULL;
	2433	atomic_add_long(&pmap->pm_stats.resident_count, -1);
	2434	pv->pv_pmap = NULL;
	2435	pv->pv_pindex = 0;
	2436	spin_unlock(&pmap->pm_spin);
	2437	}
	2438	pv_put(pv);
	2439	}
	2440
	2441	/*
	2442	* This routine is very drastic, but can save the system
	2443	* in a pinch.
	2444	*/
	2445	void
	2446	pmap_collect(void)
	2447	{
	2448	int i;
	2449	vm_page_t m;
	2450	static int warningdone=0;
	2451
	2452	if (pmap_pagedaemon_waken == 0)
	2453	return;
	2454	pmap_pagedaemon_waken = 0;
	2455	if (warningdone < 5) {
	2456	kprintf("pmap_collect: collecting pv entries -- "
	2457	"suggest increasing PMAP_SHPGPERPROC\n");
	2458	warningdone++;
	2459	}
	2460
	2461	for (i = 0; i < vm_page_array_size; i++) {
	2462	m = &vm_page_array[i];
	2463	if (m->wire_count \|\| m->hold_count)
	2464	continue;
	2465	if (vm_page_busy_try(m, TRUE) == 0) {
	2466	if (m->wire_count == 0 && m->hold_count == 0) {
	2467	pmap_remove_all(m);
	2468	}
	2469	vm_page_wakeup(m);
	2470	}
	2471	}
	2472	}
	2473
	2474	/*
	2475	* Scan the pmap for active page table entries and issue a callback.
	2476	* The callback must dispose of pte_pv.
	2477	*
	2478	* NOTE: Unmanaged page table entries will not have a pte_pv
	2479	*
	2480	* NOTE: Kernel page table entries will not have a pt_pv. That is, wiring
	2481	* counts are not tracked in kernel page table pages.
	2482	*
	2483	* It is assumed that the start and end are properly rounded to the page size.
	2484	*/
	2485	static void
	2486	pmap_scan(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva,
	2487	void (func)(pmap_t, struct pmap_inval_info ,
	2488	pv_entry_t, pv_entry_t, vm_offset_t,
	2489	pt_entry_t , void ),
	2490	void *arg)
	2491	{
	2492	pv_entry_t pdp_pv; /* A page directory page PV */
	2493	pv_entry_t pd_pv; /* A page directory PV */
	2494	pv_entry_t pt_pv; /* A page table PV */
	2495	pv_entry_t pte_pv; /* A page table entry PV */
	2496	pt_entry_t *ptep;
	2497	vm_offset_t va_next;
	2498	struct pmap_inval_info info;
	2499	int error;
	2500
	2501	if (pmap == NULL)
	2502	return;
	2503
	2504	/*
	2505	* Hold the token for stability; if the pmap is empty we have nothing
	2506	* to do.
	2507	*/
	2508	lwkt_gettoken(&pmap->pm_token);
	2509	#if 0
	2510	if (pmap->pm_stats.resident_count == 0) {
	2511	lwkt_reltoken(&pmap->pm_token);
	2512	return;
	2513	}
	2514	#endif
	2515
	2516	pmap_inval_init(&info);
	2517
	2518	/*
	2519	* Special handling for removing one page, which is a very common
	2520	* operation (it is?).
	2521	* NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4
	2522	*/
	2523	if (sva + PAGE_SIZE == eva) {
	2524	if (sva >= VM_MAX_USER_ADDRESS) {
	2525	/*
	2526	* Kernel mappings do not track wire counts on
	2527	* page table pages.
	2528	*/
	2529	pt_pv = NULL;
	2530	pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
	2531	ptep = vtopte(sva);
	2532	} else {
	2533	/*
	2534	* User mappings may or may not have a pte_pv but
	2535	* will always have a pt_pv if the page is present.
	2536	*/
	2537	pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
	2538	pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
	2539	if (pt_pv == NULL) {
	2540	KKASSERT(pte_pv == NULL);
	2541	goto fast_skip;
	2542	}
	2543	ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
	2544	}
	2545	if (*ptep == 0) {
	2546	/*
	2547	* Unlike the pv_find() case below we actually
	2548	* acquired a locked pv in this case so any
	2549	* race should have been resolved. It is expected
	2550	* to not exist.
	2551	*/
	2552	KKASSERT(pte_pv == NULL);
	2553	} else if (pte_pv) {
	2554	KASSERT((*ptep & (PG_MANAGED\|PG_V)) == (PG_MANAGED\|
	2555	PG_V),
	2556	("bad *ptep %016lx sva %016lx pte_pv %p",
	2557	*ptep, sva, pte_pv));
	2558	func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg);
	2559	} else {
	2560	KASSERT((*ptep & (PG_MANAGED\|PG_V)) == PG_V,
	2561	("bad *ptep %016lx sva %016lx pte_pv NULL",
	2562	*ptep, sva));
	2563	func(pmap, &info, pte_pv, pt_pv, sva, ptep, arg);
	2564	}
	2565	if (pt_pv)
	2566	pv_put(pt_pv);
	2567	fast_skip:
	2568	pmap_inval_done(&info);
	2569	lwkt_reltoken(&pmap->pm_token);
	2570	return;
	2571	}
	2572
	2573	/*
	2574	* NOTE: kernel mappings do not track page table pages, only
	2575	* terminal pages.
	2576	*
	2577	* NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4.
	2578	* However, for the scan to be efficient we try to
	2579	* cache items top-down.
	2580	*/
	2581	pdp_pv = NULL;
	2582	pd_pv = NULL;
	2583	pt_pv = NULL;
	2584
	2585	for (; sva < eva; sva = va_next) {
	2586	lwkt_yield();
	2587	if (sva >= VM_MAX_USER_ADDRESS) {
	2588	if (pt_pv) {
	2589	pv_put(pt_pv);
	2590	pt_pv = NULL;
	2591	}
	2592	goto kernel_skip;
	2593	}
	2594
	2595	/*
	2596	* PDP cache
	2597	*/
	2598	if (pdp_pv == NULL) {
	2599	pdp_pv = pv_get(pmap, pmap_pdp_pindex(sva));
	2600	} else if (pdp_pv->pv_pindex != pmap_pdp_pindex(sva)) {
	2601	pv_put(pdp_pv);
	2602	pdp_pv = pv_get(pmap, pmap_pdp_pindex(sva));
	2603	}
	2604	if (pdp_pv == NULL) {
	2605	va_next = (sva + NBPML4) & ~PML4MASK;
	2606	if (va_next < sva)
	2607	va_next = eva;
	2608	continue;
	2609	}
	2610
	2611	/*
	2612	* PD cache
	2613	*/
	2614	if (pd_pv == NULL) {
	2615	if (pdp_pv) {
	2616	pv_put(pdp_pv);
	2617	pdp_pv = NULL;
	2618	}
	2619	pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
	2620	} else if (pd_pv->pv_pindex != pmap_pd_pindex(sva)) {
	2621	pv_put(pd_pv);
	2622	if (pdp_pv) {
	2623	pv_put(pdp_pv);
	2624	pdp_pv = NULL;
	2625	}
	2626	pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
	2627	}
	2628	if (pd_pv == NULL) {
	2629	va_next = (sva + NBPDP) & ~PDPMASK;
	2630	if (va_next < sva)
	2631	va_next = eva;
	2632	continue;
	2633	}
	2634
	2635	/*
	2636	* PT cache
	2637	*/
	2638	if (pt_pv == NULL) {
	2639	if (pdp_pv) {
	2640	pv_put(pdp_pv);
	2641	pdp_pv = NULL;
	2642	}
	2643	if (pd_pv) {
	2644	pv_put(pd_pv);
	2645	pd_pv = NULL;
	2646	}
	2647	pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
	2648	} else if (pt_pv->pv_pindex != pmap_pt_pindex(sva)) {
	2649	if (pdp_pv) {
	2650	pv_put(pdp_pv);
	2651	pdp_pv = NULL;
	2652	}
	2653	if (pd_pv) {
	2654	pv_put(pd_pv);
	2655	pd_pv = NULL;
	2656	}
	2657	pv_put(pt_pv);
	2658	pt_pv = pv_get(pmap, pmap_pt_pindex(sva));
	2659	}
	2660
	2661	/*
	2662	* We will scan or skip a page table page so adjust va_next
	2663	* either way.
	2664	*/
	2665	if (pt_pv == NULL) {
	2666	va_next = (sva + NBPDR) & ~PDRMASK;
	2667	if (va_next < sva)
	2668	va_next = eva;
	2669	continue;
	2670	}
	2671
	2672	/*
	2673	* From this point in the loop testing pt_pv for non-NULL
	2674	* means we are in UVM, else if it is NULL we are in KVM.
	2675	*/
	2676	kernel_skip:
	2677	va_next = (sva + NBPDR) & ~PDRMASK;
	2678	if (va_next < sva)
	2679	va_next = eva;
	2680
	2681	/*
	2682	* Limit our scan to either the end of the va represented
	2683	* by the current page table page, or to the end of the
	2684	* range being removed.
	2685	*
	2686	* Scan the page table for pages. Some pages may not be
	2687	* managed (might not have a pv_entry).
	2688	*
	2689	* There is no page table management for kernel pages so
	2690	* pt_pv will be NULL in that case, but otherwise pt_pv
	2691	* is non-NULL, locked, and referenced.
	2692	*/
	2693	if (va_next > eva)
	2694	va_next = eva;
	2695
	2696	/*
	2697	* At this point a non-NULL pt_pv means a UVA, and a NULL
	2698	* pt_pv means a KVA.
	2699	*/
	2700	if (pt_pv)
	2701	ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva));
	2702	else
	2703	ptep = vtopte(sva);
	2704
	2705	while (sva < va_next) {
	2706	/*
	2707	* Acquire the related pte_pv, if any. If *ptep == 0
	2708	* the related pte_pv should not exist, but if *ptep
	2709	* is not zero the pte_pv may or may not exist (e.g.
	2710	* will not exist for an unmanaged page).
	2711	*
	2712	* However a multitude of races are possible here.
	2713	*
	2714	* In addition, the (pt_pv, pte_pv) lock order is
	2715	* backwards, so we have to be careful in aquiring
	2716	* a properly locked pte_pv.
	2717	*/
	2718	lwkt_yield();
	2719	if (pt_pv) {
	2720	pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva),
	2721	&error);
	2722	if (error) {
	2723	if (pdp_pv) {
	2724	pv_put(pdp_pv);
	2725	pdp_pv = NULL;
	2726	}
	2727	if (pd_pv) {
	2728	pv_put(pd_pv);
	2729	pd_pv = NULL;
	2730	}
	2731	pv_put(pt_pv); /* must be non-NULL */
	2732	pt_pv = NULL;
	2733	pv_lock(pte_pv); /* safe to block now */
	2734	pv_put(pte_pv);
	2735	pte_pv = NULL;
	2736	pt_pv = pv_get(pmap,
	2737	pmap_pt_pindex(sva));
	2738	continue;
	2739	}
	2740	} else {
	2741	pte_pv = pv_get(pmap, pmap_pte_pindex(sva));
	2742	}
	2743
	2744	/*
	2745	* Ok, if *ptep == 0 we had better NOT have a pte_pv.
	2746	*/
	2747	if (*ptep == 0) {
	2748	if (pte_pv) {
	2749	kprintf("Unexpected non-NULL pte_pv "
	2750	"%p pt_pv %p *ptep = %016lx\n",
	2751	pte_pv, pt_pv, *ptep);
	2752	panic("Unexpected non-NULL pte_pv");
	2753	}
	2754	sva += PAGE_SIZE;
	2755	++ptep;
	2756	continue;
	2757	}
	2758
	2759	/*
	2760	* Ready for the callback. The locked pte_pv (if any)
	2761	* is consumed by the callback. pte_pv will exist if
	2762	* the page is managed, and will not exist if it
	2763	* isn't.
	2764	*/
	2765	if (pte_pv) {
	2766	KASSERT((*ptep & (PG_MANAGED\|PG_V)) ==
	2767	(PG_MANAGED\|PG_V),
	2768	("bad *ptep %016lx sva %016lx "
	2769	"pte_pv %p",
	2770	*ptep, sva, pte_pv));
	2771	func(pmap, &info, pte_pv, pt_pv, sva,
	2772	ptep, arg);
	2773	} else {
	2774	KASSERT((*ptep & (PG_MANAGED\|PG_V)) ==
	2775	PG_V,
	2776	("bad *ptep %016lx sva %016lx "
	2777	"pte_pv NULL",
	2778	*ptep, sva));
	2779	func(pmap, &info, pte_pv, pt_pv, sva,
	2780	ptep, arg);
	2781	}
	2782	pte_pv = NULL;
	2783	sva += PAGE_SIZE;
	2784	++ptep;
	2785	}
	2786	}
	2787	if (pdp_pv) {
	2788	pv_put(pdp_pv);
	2789	pdp_pv = NULL;
	2790	}
	2791	if (pd_pv) {
	2792	pv_put(pd_pv);
	2793	pd_pv = NULL;
	2794	}
	2795	if (pt_pv) {
	2796	pv_put(pt_pv);
	2797	pt_pv = NULL;
	2798	}
	2799	pmap_inval_done(&info);
	2800	lwkt_reltoken(&pmap->pm_token);
	2801	}
	2802
	2803	void
	2804	pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
	2805	{
	2806	pmap_scan(pmap, sva, eva, pmap_remove_callback, NULL);
	2807	}
	2808
	2809	static void
	2810	pmap_remove_callback(pmap_t pmap, struct pmap_inval_info *info,
	2811	pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
	2812	pt_entry_t ptep, void arg __unused)
	2813	{
	2814	pt_entry_t pte;
	2815
	2816	if (pte_pv) {
	2817	/*
	2818	* This will also drop pt_pv's wire_count. Note that
	2819	* terminal pages are not wired based on mmu presence.
	2820	*/
	2821	pmap_remove_pv_pte(pte_pv, pt_pv, info);
	2822	pmap_remove_pv_page(pte_pv);
	2823	pv_free(pte_pv);
	2824	} else {
	2825	/*
	2826	* pt_pv's wire_count is still bumped by unmanaged pages
	2827	* so we must decrement it manually.
	2828	*/
	2829	pmap_inval_interlock(info, pmap, va);
	2830	pte = pte_load_clear(ptep);
	2831	pmap_inval_deinterlock(info, pmap);
	2832	if (pte & PG_W)
	2833	atomic_add_long(&pmap->pm_stats.wired_count, -1);
	2834	atomic_add_long(&pmap->pm_stats.resident_count, -1);
	2835	if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m))
	2836	panic("pmap_remove: insufficient wirecount");
	2837	}
	2838	}
	2839
	2840	/*
	2841	* Removes this physical page from all physical maps in which it resides.
	2842	* Reflects back modify bits to the pager.
	2843	*
	2844	* This routine may not be called from an interrupt.
	2845	*/
	2846	static
	2847	void
	2848	pmap_remove_all(vm_page_t m)
	2849	{
	2850	struct pmap_inval_info info;
	2851	pv_entry_t pv;
	2852
	2853	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	2854	return;
	2855
	2856	pmap_inval_init(&info);
	2857	vm_page_spin_lock(m);
	2858	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
	2859	KKASSERT(pv->pv_m == m);
	2860	if (pv_hold_try(pv)) {
	2861	vm_page_spin_unlock(m);
	2862	} else {
	2863	vm_page_spin_unlock(m);
	2864	pv_lock(pv);
	2865	if (pv->pv_m != m) {
	2866	pv_put(pv);
	2867	vm_page_spin_lock(m);
	2868	continue;
	2869	}
	2870	}
	2871	/*
	2872	* Holding no spinlocks, pv is locked.
	2873	*/
	2874	pmap_remove_pv_pte(pv, NULL, &info);
	2875	pmap_remove_pv_page(pv);
	2876	pv_free(pv);
	2877	vm_page_spin_lock(m);
	2878	}
	2879	KKASSERT((m->flags & (PG_MAPPED\|PG_WRITEABLE)) == 0);
	2880	vm_page_spin_unlock(m);
	2881	pmap_inval_done(&info);
	2882	}
	2883
	2884	/*
	2885	* pmap_protect:
	2886	*
	2887	* Set the physical protection on the specified range of this map
	2888	* as requested.
	2889	*
	2890	* This function may not be called from an interrupt if the map is
	2891	* not the kernel_pmap.
	2892	*/
	2893	void
	2894	pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
	2895	{
	2896	/* JG review for NX */
	2897
	2898	if (pmap == NULL)
	2899	return;
	2900	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
	2901	pmap_remove(pmap, sva, eva);
	2902	return;
	2903	}
	2904	if (prot & VM_PROT_WRITE)
	2905	return;
	2906	pmap_scan(pmap, sva, eva, pmap_protect_callback, &prot);
	2907	}
	2908
	2909	static
	2910	void
	2911	pmap_protect_callback(pmap_t pmap, struct pmap_inval_info *info,
	2912	pv_entry_t pte_pv, pv_entry_t pt_pv, vm_offset_t va,
	2913	pt_entry_t ptep, void arg __unused)
	2914	{
	2915	pt_entry_t pbits;
	2916	pt_entry_t cbits;
	2917	vm_page_t m;
	2918
	2919	/*
	2920	* XXX non-optimal.
	2921	*/
	2922	pmap_inval_interlock(info, pmap, va);
	2923	again:
	2924	pbits = *ptep;
	2925	cbits = pbits;
	2926	if (pte_pv) {
	2927	m = NULL;
	2928	if (pbits & PG_A) {
	2929	m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
	2930	KKASSERT(m == pte_pv->pv_m);
	2931	vm_page_flag_set(m, PG_REFERENCED);
	2932	cbits &= ~PG_A;
	2933	}
	2934	if (pbits & PG_M) {
	2935	if (pmap_track_modified(pte_pv->pv_pindex)) {
	2936	if (m == NULL)
	2937	m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
	2938	vm_page_dirty(m);
	2939	cbits &= ~PG_M;
	2940	}
	2941	}
	2942	}
	2943	cbits &= ~PG_RW;
	2944	if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) {
	2945	goto again;
	2946	}
	2947	pmap_inval_deinterlock(info, pmap);
	2948	if (pte_pv)
	2949	pv_put(pte_pv);
	2950	}
	2951
	2952	/*
	2953	* Insert the vm_page (m) at the virtual address (va), replacing any prior
	2954	* mapping at that address. Set protection and wiring as requested.
	2955	*
	2956	* NOTE: This routine MUST insert the page into the pmap now, it cannot
	2957	* lazy-evaluate.
	2958	*/
	2959	void
	2960	pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
	2961	boolean_t wired)
	2962	{
	2963	pmap_inval_info info;
	2964	pv_entry_t pt_pv; /* page table */
	2965	pv_entry_t pte_pv; /* page table entry */
	2966	pt_entry_t *ptep;
	2967	vm_paddr_t opa;
	2968	pt_entry_t origpte, newpte;
	2969	vm_paddr_t pa;
	2970
	2971	if (pmap == NULL)
	2972	return;
	2973	va = trunc_page(va);
	2974	#ifdef PMAP_DIAGNOSTIC
	2975	if (va >= KvaEnd)
	2976	panic("pmap_enter: toobig");
	2977	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
	2978	panic("pmap_enter: invalid to pmap_enter page table "
	2979	"pages (va: 0x%lx)", va);
	2980	#endif
	2981	if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
	2982	kprintf("Warning: pmap_enter called on UVA with "
	2983	"kernel_pmap\n");
	2984	#ifdef DDB
	2985	db_print_backtrace();
	2986	#endif
	2987	}
	2988	if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
	2989	kprintf("Warning: pmap_enter called on KVA without"
	2990	"kernel_pmap\n");
	2991	#ifdef DDB
	2992	db_print_backtrace();
	2993	#endif
	2994	}
	2995
	2996	/*
	2997	* Get locked PV entries for our new page table entry (pte_pv)
	2998	* and for its parent page table (pt_pv). We need the parent
	2999	* so we can resolve the location of the ptep.
	3000	*
	3001	* Only hardware MMU actions can modify the ptep out from
	3002	* under us.
	3003	*
	3004	* if (m) is fictitious or unmanaged we do not create a managing
	3005	* pte_pv for it. Any pre-existing page's management state must
	3006	* match (avoiding code complexity).
	3007	*
	3008	* If the pmap is still being initialized we assume existing
	3009	* page tables.
	3010	*
	3011	* Kernel mapppings do not track page table pages (i.e. pt_pv).
	3012	* pmap_allocpte() checks the
	3013	*/
	3014	if (pmap_initialized == FALSE) {
	3015	pte_pv = NULL;
	3016	pt_pv = NULL;
	3017	ptep = vtopte(va);
	3018	} else if (m->flags & (PG_FICTITIOUS \| PG_UNMANAGED)) {
	3019	pte_pv = NULL;
	3020	if (va >= VM_MAX_USER_ADDRESS) {
	3021	pt_pv = NULL;
	3022	ptep = vtopte(va);
	3023	} else {
	3024	pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL);
	3025	ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
	3026	}
	3027	KKASSERT(ptep == 0 \|\| (ptep & PG_MANAGED) == 0);
	3028	} else {
	3029	if (va >= VM_MAX_USER_ADDRESS) {
	3030	pt_pv = NULL;
	3031	pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va), NULL);
	3032	ptep = vtopte(va);
	3033	} else {
	3034	pte_pv = pmap_allocpte(pmap, pmap_pte_pindex(va),
	3035	&pt_pv);
	3036	ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
	3037	}
	3038	KKASSERT(ptep == 0 \|\| (ptep & PG_MANAGED));
	3039	}
	3040
	3041	pa = VM_PAGE_TO_PHYS(m);
	3042	origpte = *ptep;
	3043	opa = origpte & PG_FRAME;
	3044
	3045	newpte = (pt_entry_t)(pa \| pte_prot(pmap, prot) \| PG_V \| PG_A);
	3046	if (wired)
	3047	newpte \|= PG_W;
	3048	if (va < VM_MAX_USER_ADDRESS)
	3049	newpte \|= PG_U;
	3050	if (pte_pv)
	3051	newpte \|= PG_MANAGED;
	3052	if (pmap == &kernel_pmap)
	3053	newpte \|= pgeflag;
	3054
	3055	/*
	3056	* It is possible for multiple faults to occur in threaded
	3057	* environments, the existing pte might be correct.
	3058	*/
	3059	if (((origpte ^ newpte) & ~(pt_entry_t)(PG_M\|PG_A)) == 0)
	3060	goto done;
	3061
	3062	if ((prot & VM_PROT_NOSYNC) == 0)
	3063	pmap_inval_init(&info);
	3064
	3065	/*
	3066	* Ok, either the address changed or the protection or wiring
	3067	* changed.
	3068	*
	3069	* Clear the current entry, interlocking the removal. For managed
	3070	* pte's this will also flush the modified state to the vm_page.
	3071	* Atomic ops are mandatory in order to ensure that PG_M events are
	3072	* not lost during any transition.
	3073	*/
	3074	if (opa) {
	3075	if (pte_pv) {
	3076	/*
	3077	* pmap_remove_pv_pte() unwires pt_pv and assumes
	3078	* we will free pte_pv, but since we are reusing
	3079	* pte_pv we want to retain the wire count.
	3080	*
	3081	* pt_pv won't exist for a kernel page (managed or
	3082	* otherwise).
	3083	*/
	3084	if (pt_pv)
	3085	vm_page_wire_quick(pt_pv->pv_m);
	3086	if (prot & VM_PROT_NOSYNC)
	3087	pmap_remove_pv_pte(pte_pv, pt_pv, NULL);
	3088	else
	3089	pmap_remove_pv_pte(pte_pv, pt_pv, &info);
	3090	if (pte_pv->pv_m)
	3091	pmap_remove_pv_page(pte_pv);
	3092	} else if (prot & VM_PROT_NOSYNC) {
	3093	/* leave wire count on PT page intact */
	3094	(void)pte_load_clear(ptep);
	3095	cpu_invlpg((void *)va);
	3096	atomic_add_long(&pmap->pm_stats.resident_count, -1);
	3097	} else {
	3098	/* leave wire count on PT page intact */
	3099	pmap_inval_interlock(&info, pmap, va);
	3100	(void)pte_load_clear(ptep);
	3101	pmap_inval_deinterlock(&info, pmap);
	3102	atomic_add_long(&pmap->pm_stats.resident_count, -1);
	3103	}
	3104	KKASSERT(*ptep == 0);
	3105	}
	3106
	3107	if (pte_pv) {
	3108	/*
	3109	* Enter on the PV list if part of our managed memory.
	3110	* Wiring of the PT page is already handled.
	3111	*/
	3112	KKASSERT(pte_pv->pv_m == NULL);
	3113	vm_page_spin_lock(m);
	3114	pte_pv->pv_m = m;
	3115	TAILQ_INSERT_TAIL(&m->md.pv_list, pte_pv, pv_list);
	3116	/*
	3117	if (m->object)
	3118	atomic_add_int(&m->object->agg_pv_list_count, 1);
	3119	*/
	3120	vm_page_flag_set(m, PG_MAPPED);
	3121	vm_page_spin_unlock(m);
	3122	} else if (pt_pv && opa == 0) {
	3123	/*
	3124	* We have to adjust the wire count on the PT page ourselves
	3125	* for unmanaged entries. If opa was non-zero we retained
	3126	* the existing wire count from the removal.
	3127	*/
	3128	vm_page_wire_quick(pt_pv->pv_m);
	3129	}
	3130
	3131	/*
	3132	* Ok, for UVM (pt_pv != NULL) we don't need to interlock or
	3133	* invalidate anything, the TLB won't have any stale entries to
	3134	* remove.
	3135	*
	3136	* For KVM there appear to still be issues. Theoretically we
	3137	* should be able to scrap the interlocks entirely but we
	3138	* get crashes.
	3139	*/
	3140	if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL)
	3141	pmap_inval_interlock(&info, pmap, va);
	3142	(volatile pt_entry_t )ptep = newpte;
	3143
	3144	if ((prot & VM_PROT_NOSYNC) == 0 && pt_pv == NULL)
	3145	pmap_inval_deinterlock(&info, pmap);
	3146	else if (pt_pv == NULL)
	3147	cpu_invlpg((void *)va);
	3148
	3149	if (wired)
	3150	atomic_add_long(&pmap->pm_stats.wired_count, 1);
	3151	if (newpte & PG_RW)
	3152	vm_page_flag_set(m, PG_WRITEABLE);
	3153	if (pte_pv == NULL)
	3154	atomic_add_long(&pmap->pm_stats.resident_count, 1);
	3155
	3156	/*
	3157	* Cleanup
	3158	*/
	3159	if ((prot & VM_PROT_NOSYNC) == 0 \|\| pte_pv == NULL)
	3160	pmap_inval_done(&info);
	3161	done:
	3162	KKASSERT((newpte & PG_MANAGED) == 0 \|\| (m->flags & PG_MAPPED));
	3163
	3164	/*
	3165	* Cleanup the pv entry, allowing other accessors.
	3166	*/
	3167	if (pte_pv)
	3168	pv_put(pte_pv);
	3169	if (pt_pv)
	3170	pv_put(pt_pv);
	3171	}
	3172
	3173	/*
	3174	* This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
	3175	* This code also assumes that the pmap has no pre-existing entry for this
	3176	* VA.
	3177	*
	3178	* This code currently may only be used on user pmaps, not kernel_pmap.
	3179	*/
	3180	void
	3181	pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
	3182	{
	3183	pmap_enter(pmap, va, m, VM_PROT_READ, FALSE);
	3184	}
	3185
	3186	/*
	3187	* Make a temporary mapping for a physical address. This is only intended
	3188	* to be used for panic dumps.
	3189	*
	3190	* The caller is responsible for calling smp_invltlb().
	3191	*/
	3192	void *
	3193	pmap_kenter_temporary(vm_paddr_t pa, long i)
	3194	{
	3195	pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
	3196	return ((void *)crashdumpmap);
	3197	}
	3198
	3199	#define MAX_INIT_PT (96)
	3200
	3201	/*
	3202	* This routine preloads the ptes for a given object into the specified pmap.
	3203	* This eliminates the blast of soft faults on process startup and
	3204	* immediately after an mmap.
	3205	*/
	3206	static int pmap_object_init_pt_callback(vm_page_t p, void *data);
	3207
	3208	void
	3209	pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
	3210	vm_object_t object, vm_pindex_t pindex,
	3211	vm_size_t size, int limit)
	3212	{
	3213	struct rb_vm_page_scan_info info;
	3214	struct lwp *lp;
	3215	vm_size_t psize;
	3216
	3217	/*
	3218	* We can't preinit if read access isn't set or there is no pmap
	3219	* or object.
	3220	*/
	3221	if ((prot & VM_PROT_READ) == 0 \|\| pmap == NULL \|\| object == NULL)
	3222	return;
	3223
	3224	/*
	3225	* We can't preinit if the pmap is not the current pmap
	3226	*/
	3227	lp = curthread->td_lwp;
	3228	if (lp == NULL \|\| pmap != vmspace_pmap(lp->lwp_vmspace))
	3229	return;
	3230
	3231	psize = x86_64_btop(size);
	3232
	3233	if ((object->type != OBJT_VNODE) \|\|
	3234	((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
	3235	(object->resident_page_count > MAX_INIT_PT))) {
	3236	return;
	3237	}
	3238
	3239	if (pindex + psize > object->size) {
	3240	if (object->size < pindex)
	3241	return;
	3242	psize = object->size - pindex;
	3243	}
	3244
	3245	if (psize == 0)
	3246	return;
	3247
	3248	/*
	3249	* Use a red-black scan to traverse the requested range and load
	3250	* any valid pages found into the pmap.
	3251	*
	3252	* We cannot safely scan the object's memq without holding the
	3253	* object token.
	3254	*/
	3255	info.start_pindex = pindex;
	3256	info.end_pindex = pindex + psize - 1;
	3257	info.limit = limit;
	3258	info.mpte = NULL;
	3259	info.addr = addr;
	3260	info.pmap = pmap;
	3261
	3262	vm_object_hold_shared(object);
	3263	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
	3264	pmap_object_init_pt_callback, &info);
	3265	vm_object_drop(object);
	3266	}
	3267
	3268	static
	3269	int
	3270	pmap_object_init_pt_callback(vm_page_t p, void *data)
	3271	{
	3272	struct rb_vm_page_scan_info *info = data;
	3273	vm_pindex_t rel_index;
	3274
	3275	/*
	3276	* don't allow an madvise to blow away our really
	3277	* free pages allocating pv entries.
	3278	*/
	3279	if ((info->limit & MAP_PREFAULT_MADVISE) &&
	3280	vmstats.v_free_count < vmstats.v_free_reserved) {
	3281	return(-1);
	3282	}
	3283
	3284	/*
	3285	* Ignore list markers and ignore pages we cannot instantly
	3286	* busy (while holding the object token).
	3287	*/
	3288	if (p->flags & PG_MARKER)
	3289	return 0;
	3290	if (vm_page_busy_try(p, TRUE))
	3291	return 0;
	3292	if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
	3293	(p->flags & PG_FICTITIOUS) == 0) {
	3294	if ((p->queue - p->pc) == PQ_CACHE)
	3295	vm_page_deactivate(p);
	3296	rel_index = p->pindex - info->start_pindex;
	3297	pmap_enter_quick(info->pmap,
	3298	info->addr + x86_64_ptob(rel_index), p);
	3299	}
	3300	vm_page_wakeup(p);
	3301	lwkt_yield();
	3302	return(0);
	3303	}
	3304
	3305	/*
	3306	* Return TRUE if the pmap is in shape to trivially pre-fault the specified
	3307	* address.
	3308	*
	3309	* Returns FALSE if it would be non-trivial or if a pte is already loaded
	3310	* into the slot.
	3311	*
	3312	* XXX This is safe only because page table pages are not freed.
	3313	*/
	3314	int
	3315	pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
	3316	{
	3317	pt_entry_t *pte;
	3318
	3319	/spin_lock(&pmap->pm_spin);/
	3320	if ((pte = pmap_pte(pmap, addr)) != NULL) {
	3321	if (*pte & PG_V) {
	3322	/spin_unlock(&pmap->pm_spin);/
	3323	return FALSE;
	3324	}
	3325	}
	3326	/spin_unlock(&pmap->pm_spin);/
	3327	return TRUE;
	3328	}
	3329
	3330	/*
	3331	* Change the wiring attribute for a pmap/va pair. The mapping must already
	3332	* exist in the pmap. The mapping may or may not be managed.
	3333	*/
	3334	void
	3335	pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
	3336	{
	3337	pt_entry_t *ptep;
	3338	pv_entry_t pv;
	3339
	3340	if (pmap == NULL)
	3341	return;
	3342	lwkt_gettoken(&pmap->pm_token);
	3343	pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL);
	3344	ptep = pv_pte_lookup(pv, pmap_pte_index(va));
	3345
	3346	if (wired && !pmap_pte_w(ptep))
	3347	atomic_add_long(&pmap->pm_stats.wired_count, 1);
	3348	else if (!wired && pmap_pte_w(ptep))
	3349	atomic_add_long(&pmap->pm_stats.wired_count, -1);
	3350
	3351	/*
	3352	* Wiring is not a hardware characteristic so there is no need to
	3353	* invalidate TLB. However, in an SMP environment we must use
	3354	* a locked bus cycle to update the pte (if we are not using
	3355	* the pmap_inval_*() API that is)... it's ok to do this for simple
	3356	* wiring changes.
	3357	*/
	3358	#ifdef SMP
	3359	if (wired)
	3360	atomic_set_long(ptep, PG_W);
	3361	else
	3362	atomic_clear_long(ptep, PG_W);
	3363	#else
	3364	if (wired)
	3365	atomic_set_long_nonlocked(ptep, PG_W);
	3366	else
	3367	atomic_clear_long_nonlocked(ptep, PG_W);
	3368	#endif
	3369	pv_put(pv);
	3370	lwkt_reltoken(&pmap->pm_token);
	3371	}
	3372
	3373
	3374
	3375	/*
	3376	* Copy the range specified by src_addr/len from the source map to
	3377	* the range dst_addr/len in the destination map.
	3378	*
	3379	* This routine is only advisory and need not do anything.
	3380	*/
	3381	void
	3382	pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
	3383	vm_size_t len, vm_offset_t src_addr)
	3384	{
	3385	}
	3386
	3387	/*
	3388	* pmap_zero_page:
	3389	*
	3390	* Zero the specified physical page.
	3391	*
	3392	* This function may be called from an interrupt and no locking is
	3393	* required.
	3394	*/
	3395	void
	3396	pmap_zero_page(vm_paddr_t phys)
	3397	{
	3398	vm_offset_t va = PHYS_TO_DMAP(phys);
	3399
	3400	pagezero((void *)va);
	3401	}
	3402
	3403	/*
	3404	* pmap_page_assertzero:
	3405	*
	3406	* Assert that a page is empty, panic if it isn't.
	3407	*/
	3408	void
	3409	pmap_page_assertzero(vm_paddr_t phys)
	3410	{
	3411	vm_offset_t va = PHYS_TO_DMAP(phys);
	3412	size_t i;
	3413
	3414	for (i = 0; i < PAGE_SIZE; i += sizeof(long)) {
	3415	if ((long )((char *)va + i) != 0) {
	3416	panic("pmap_page_assertzero() @ %p not zero!\n",
	3417	(void *)(intptr_t)va);
	3418	}
	3419	}
	3420	}
	3421
	3422	/*
	3423	* pmap_zero_page:
	3424	*
	3425	* Zero part of a physical page by mapping it into memory and clearing
	3426	* its contents with bzero.
	3427	*
	3428	* off and size may not cover an area beyond a single hardware page.
	3429	*/
	3430	void
	3431	pmap_zero_page_area(vm_paddr_t phys, int off, int size)
	3432	{
	3433	vm_offset_t virt = PHYS_TO_DMAP(phys);
	3434
	3435	bzero((char *)virt + off, size);
	3436	}
	3437
	3438	/*
	3439	* pmap_copy_page:
	3440	*
	3441	* Copy the physical page from the source PA to the target PA.
	3442	* This function may be called from an interrupt. No locking
	3443	* is required.
	3444	*/
	3445	void
	3446	pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
	3447	{
	3448	vm_offset_t src_virt, dst_virt;
	3449
	3450	src_virt = PHYS_TO_DMAP(src);
	3451	dst_virt = PHYS_TO_DMAP(dst);
	3452	bcopy((void )src_virt, (void )dst_virt, PAGE_SIZE);
	3453	}
	3454
	3455	/*
	3456	* pmap_copy_page_frag:
	3457	*
	3458	* Copy the physical page from the source PA to the target PA.
	3459	* This function may be called from an interrupt. No locking
	3460	* is required.
	3461	*/
	3462	void
	3463	pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
	3464	{
	3465	vm_offset_t src_virt, dst_virt;
	3466
	3467	src_virt = PHYS_TO_DMAP(src);
	3468	dst_virt = PHYS_TO_DMAP(dst);
	3469
	3470	bcopy((char *)src_virt + (src & PAGE_MASK),
	3471	(char *)dst_virt + (dst & PAGE_MASK),
	3472	bytes);
	3473	}
	3474
	3475	/*
	3476	* Returns true if the pmap's pv is one of the first 16 pvs linked to from
	3477	* this page. This count may be changed upwards or downwards in the future;
	3478	* it is only necessary that true be returned for a small subset of pmaps
	3479	* for proper page aging.
	3480	*/
	3481	boolean_t
	3482	pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
	3483	{
	3484	pv_entry_t pv;
	3485	int loops = 0;
	3486
	3487	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	3488	return FALSE;
	3489
	3490	vm_page_spin_lock(m);
	3491	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3492	if (pv->pv_pmap == pmap) {
	3493	vm_page_spin_unlock(m);
	3494	return TRUE;
	3495	}
	3496	loops++;
	3497	if (loops >= 16)
	3498	break;
	3499	}
	3500	vm_page_spin_unlock(m);
	3501	return (FALSE);
	3502	}
	3503
	3504	/*
	3505	* Remove all pages from specified address space this aids process exit
	3506	* speeds. Also, this code may be special cased for the current process
	3507	* only.
	3508	*/
	3509	void
	3510	pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
	3511	{
	3512	pmap_remove(pmap, sva, eva);
	3513	}
	3514
	3515	/*
	3516	* pmap_testbit tests bits in pte's note that the testbit/clearbit
	3517	* routines are inline, and a lot of things compile-time evaluate.
	3518	*/
	3519	static
	3520	boolean_t
	3521	pmap_testbit(vm_page_t m, int bit)
	3522	{
	3523	pv_entry_t pv;
	3524	pt_entry_t *pte;
	3525
	3526	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	3527	return FALSE;
	3528
	3529	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
	3530	return FALSE;
	3531	vm_page_spin_lock(m);
	3532	if (TAILQ_FIRST(&m->md.pv_list) == NULL) {
	3533	vm_page_spin_unlock(m);
	3534	return FALSE;
	3535	}
	3536
	3537	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3538	/*
	3539	* if the bit being tested is the modified bit, then
	3540	* mark clean_map and ptes as never
	3541	* modified.
	3542	*/
	3543	if (bit & (PG_A\|PG_M)) {
	3544	if (!pmap_track_modified(pv->pv_pindex))
	3545	continue;
	3546	}
	3547
	3548	#if defined(PMAP_DIAGNOSTIC)
	3549	if (pv->pv_pmap == NULL) {
	3550	kprintf("Null pmap (tb) at pindex: %"PRIu64"\n",
	3551	pv->pv_pindex);
	3552	continue;
	3553	}
	3554	#endif
	3555	pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
	3556	if (*pte & bit) {
	3557	vm_page_spin_unlock(m);
	3558	return TRUE;
	3559	}
	3560	}
	3561	vm_page_spin_unlock(m);
	3562	return (FALSE);
	3563	}
	3564
	3565	/*
	3566	* This routine is used to modify bits in ptes. Only one bit should be
	3567	* specified. PG_RW requires special handling.
	3568	*
	3569	* Caller must NOT hold any spin locks
	3570	*/
	3571	static __inline
	3572	void
	3573	pmap_clearbit(vm_page_t m, int bit)
	3574	{
	3575	struct pmap_inval_info info;
	3576	pv_entry_t pv;
	3577	pt_entry_t *pte;
	3578	pt_entry_t pbits;
	3579	pmap_t save_pmap;
	3580
	3581	if (bit == PG_RW)
	3582	vm_page_flag_clear(m, PG_WRITEABLE);
	3583	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS)) {
	3584	return;
	3585	}
	3586
	3587	/*
	3588	* PG_M or PG_A case
	3589	*
	3590	* Loop over all current mappings setting/clearing as appropos If
	3591	* setting RO do we need to clear the VAC?
	3592	*
	3593	* NOTE: When clearing PG_M we could also (not implemented) drop
	3594	* through to the PG_RW code and clear PG_RW too, forcing
	3595	* a fault on write to redetect PG_M for virtual kernels, but
	3596	* it isn't necessary since virtual kernels invalidate the
	3597	* pte when they clear the VPTE_M bit in their virtual page
	3598	* tables.
	3599	*
	3600	* NOTE: Does not re-dirty the page when clearing only PG_M.
	3601	*/
	3602	if ((bit & PG_RW) == 0) {
	3603	vm_page_spin_lock(m);
	3604	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3605	#if defined(PMAP_DIAGNOSTIC)
	3606	if (pv->pv_pmap == NULL) {
	3607	kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
	3608	pv->pv_pindex);
	3609	continue;
	3610	}
	3611	#endif
	3612	pte = pmap_pte_quick(pv->pv_pmap,
	3613	pv->pv_pindex << PAGE_SHIFT);
	3614	pbits = *pte;
	3615	if (pbits & bit)
	3616	atomic_clear_long(pte, bit);
	3617	}
	3618	vm_page_spin_unlock(m);
	3619	return;
	3620	}
	3621
	3622	/*
	3623	* Clear PG_RW. Also clears PG_M and marks the page dirty if PG_M
	3624	* was set.
	3625	*/
	3626	pmap_inval_init(&info);
	3627
	3628	restart:
	3629	vm_page_spin_lock(m);
	3630	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3631	/*
	3632	* don't write protect pager mappings
	3633	*/
	3634	if (!pmap_track_modified(pv->pv_pindex))
	3635	continue;
	3636
	3637	#if defined(PMAP_DIAGNOSTIC)
	3638	if (pv->pv_pmap == NULL) {
	3639	kprintf("Null pmap (cb) at pindex: %"PRIu64"\n",
	3640	pv->pv_pindex);
	3641	continue;
	3642	}
	3643	#endif
	3644	/*
	3645	* Skip pages which do not have PG_RW set.
	3646	*/
	3647	pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
	3648	if ((*pte & PG_RW) == 0)
	3649	continue;
	3650
	3651	/*
	3652	* Lock the PV
	3653	*/
	3654	if (pv_hold_try(pv) == 0) {
	3655	vm_page_spin_unlock(m);
	3656	pv_lock(pv); /* held, now do a blocking lock */
	3657	pv_put(pv); /* and release */
	3658	goto restart; /* anything could have happened */
	3659	}
	3660
	3661	save_pmap = pv->pv_pmap;
	3662	vm_page_spin_unlock(m);
	3663	pmap_inval_interlock(&info, save_pmap,
	3664	(vm_offset_t)pv->pv_pindex << PAGE_SHIFT);
	3665	KKASSERT(pv->pv_pmap == save_pmap);
	3666	for (;;) {
	3667	pbits = *pte;
	3668	cpu_ccfence();
	3669	if (atomic_cmpset_long(pte, pbits,
	3670	pbits & ~(PG_RW\|PG_M))) {
	3671	break;
	3672	}
	3673	}
	3674	pmap_inval_deinterlock(&info, save_pmap);
	3675	vm_page_spin_lock(m);
	3676
	3677	/*
	3678	* If PG_M was found to be set while we were clearing PG_RW
	3679	* we also clear PG_M (done above) and mark the page dirty.
	3680	* Callers expect this behavior.
	3681	*/
	3682	if (pbits & PG_M)
	3683	vm_page_dirty(m);
	3684	pv_put(pv);
	3685	}
	3686	vm_page_spin_unlock(m);
	3687	pmap_inval_done(&info);
	3688	}
	3689
	3690	/*
	3691	* Lower the permission for all mappings to a given page.
	3692	*
	3693	* Page must be busied by caller.
	3694	*/
	3695	void
	3696	pmap_page_protect(vm_page_t m, vm_prot_t prot)
	3697	{
	3698	/* JG NX support? */
	3699	if ((prot & VM_PROT_WRITE) == 0) {
	3700	if (prot & (VM_PROT_READ \| VM_PROT_EXECUTE)) {
	3701	/*
	3702	* NOTE: pmap_clearbit(.. PG_RW) also clears
	3703	* the PG_WRITEABLE flag in (m).
	3704	*/
	3705	pmap_clearbit(m, PG_RW);
	3706	} else {
	3707	pmap_remove_all(m);
	3708	}
	3709	}
	3710	}
	3711
	3712	vm_paddr_t
	3713	pmap_phys_address(vm_pindex_t ppn)
	3714	{
	3715	return (x86_64_ptob(ppn));
	3716	}
	3717
	3718	/*
	3719	* Return a count of reference bits for a page, clearing those bits.
	3720	* It is not necessary for every reference bit to be cleared, but it
	3721	* is necessary that 0 only be returned when there are truly no
	3722	* reference bits set.
	3723	*
	3724	* XXX: The exact number of bits to check and clear is a matter that
	3725	* should be tested and standardized at some point in the future for
	3726	* optimal aging of shared pages.
	3727	*
	3728	* This routine may not block.
	3729	*/
	3730	int
	3731	pmap_ts_referenced(vm_page_t m)
	3732	{
	3733	pv_entry_t pv;
	3734	pt_entry_t *pte;
	3735	int rtval = 0;
	3736
	3737	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	3738	return (rtval);
	3739
	3740	vm_page_spin_lock(m);
	3741	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3742	if (!pmap_track_modified(pv->pv_pindex))
	3743	continue;
	3744	pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
	3745	if (pte && (*pte & PG_A)) {
	3746	#ifdef SMP
	3747	atomic_clear_long(pte, PG_A);
	3748	#else
	3749	atomic_clear_long_nonlocked(pte, PG_A);
	3750	#endif
	3751	rtval++;
	3752	if (rtval > 4)
	3753	break;
	3754	}
	3755	}
	3756	vm_page_spin_unlock(m);
	3757	return (rtval);
	3758	}
	3759
	3760	/*
	3761	* pmap_is_modified:
	3762	*
	3763	* Return whether or not the specified physical page was modified
	3764	* in any physical maps.
	3765	*/
	3766	boolean_t
	3767	pmap_is_modified(vm_page_t m)
	3768	{
	3769	boolean_t res;
	3770
	3771	res = pmap_testbit(m, PG_M);
	3772	return (res);
	3773	}
	3774
	3775	/*
	3776	* Clear the modify bits on the specified physical page.
	3777	*/
	3778	void
	3779	pmap_clear_modify(vm_page_t m)
	3780	{
	3781	pmap_clearbit(m, PG_M);
	3782	}
	3783
	3784	/*
	3785	* pmap_clear_reference:
	3786	*
	3787	* Clear the reference bit on the specified physical page.
	3788	*/
	3789	void
	3790	pmap_clear_reference(vm_page_t m)
	3791	{
	3792	pmap_clearbit(m, PG_A);
	3793	}
	3794
	3795	/*
	3796	* Miscellaneous support routines follow
	3797	*/
	3798
	3799	static
	3800	void
	3801	i386_protection_init(void)
	3802	{
	3803	int *kp, prot;
	3804
	3805	/* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit */
	3806	kp = protection_codes;
	3807	for (prot = 0; prot < 8; prot++) {
	3808	switch (prot) {
	3809	case VM_PROT_NONE \| VM_PROT_NONE \| VM_PROT_NONE:
	3810	/*
	3811	* Read access is also 0. There isn't any execute bit,
	3812	* so just make it readable.
	3813	*/
	3814	case VM_PROT_READ \| VM_PROT_NONE \| VM_PROT_NONE:
	3815	case VM_PROT_READ \| VM_PROT_NONE \| VM_PROT_EXECUTE:
	3816	case VM_PROT_NONE \| VM_PROT_NONE \| VM_PROT_EXECUTE:
	3817	*kp++ = 0;
	3818	break;
	3819	case VM_PROT_NONE \| VM_PROT_WRITE \| VM_PROT_NONE:
	3820	case VM_PROT_NONE \| VM_PROT_WRITE \| VM_PROT_EXECUTE:
	3821	case VM_PROT_READ \| VM_PROT_WRITE \| VM_PROT_NONE:
	3822	case VM_PROT_READ \| VM_PROT_WRITE \| VM_PROT_EXECUTE:
	3823	*kp++ = PG_RW;
	3824	break;
	3825	}
	3826	}
	3827	}
	3828
	3829	/*
	3830	* Map a set of physical memory pages into the kernel virtual
	3831	* address space. Return a pointer to where it is mapped. This
	3832	* routine is intended to be used for mapping device memory,
	3833	* NOT real memory.
	3834	*
	3835	* NOTE: we can't use pgeflag unless we invalidate the pages one at
	3836	* a time.
	3837	*/
	3838	void *
	3839	pmap_mapdev(vm_paddr_t pa, vm_size_t size)
	3840	{
	3841	vm_offset_t va, tmpva, offset;
	3842	pt_entry_t *pte;
	3843
	3844	offset = pa & PAGE_MASK;
	3845	size = roundup(offset + size, PAGE_SIZE);
	3846
	3847	va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE);
	3848	if (va == 0)
	3849	panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
	3850
	3851	pa = pa & ~PAGE_MASK;
	3852	for (tmpva = va; size > 0;) {
	3853	pte = vtopte(tmpva);
	3854	pte = pa \| PG_RW \| PG_V; / \| pgeflag; */
	3855	size -= PAGE_SIZE;
	3856	tmpva += PAGE_SIZE;
	3857	pa += PAGE_SIZE;
	3858	}
	3859	cpu_invltlb();
	3860	smp_invltlb();
	3861
	3862	return ((void *)(va + offset));
	3863	}
	3864
	3865	void *
	3866	pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size)
	3867	{
	3868	vm_offset_t va, tmpva, offset;
	3869	pt_entry_t *pte;
	3870
	3871	offset = pa & PAGE_MASK;
	3872	size = roundup(offset + size, PAGE_SIZE);
	3873
	3874	va = kmem_alloc_nofault(&kernel_map, size, PAGE_SIZE);
	3875	if (va == 0)
	3876	panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
	3877
	3878	pa = pa & ~PAGE_MASK;
	3879	for (tmpva = va; size > 0;) {
	3880	pte = vtopte(tmpva);
	3881	pte = pa \| PG_RW \| PG_V \| PG_N; / \| pgeflag; */
	3882	size -= PAGE_SIZE;
	3883	tmpva += PAGE_SIZE;
	3884	pa += PAGE_SIZE;
	3885	}
	3886	cpu_invltlb();
	3887	smp_invltlb();
	3888
	3889	return ((void *)(va + offset));
	3890	}
	3891
	3892	void
	3893	pmap_unmapdev(vm_offset_t va, vm_size_t size)
	3894	{
	3895	vm_offset_t base, offset;
	3896
	3897	base = va & ~PAGE_MASK;
	3898	offset = va & PAGE_MASK;
	3899	size = roundup(offset + size, PAGE_SIZE);
	3900	pmap_qremove(va, size >> PAGE_SHIFT);
	3901	kmem_free(&kernel_map, base, size);
	3902	}
	3903
	3904	/*
	3905	* perform the pmap work for mincore
	3906	*/
	3907	int
	3908	pmap_mincore(pmap_t pmap, vm_offset_t addr)
	3909	{
	3910	pt_entry_t *ptep, pte;
	3911	vm_page_t m;
	3912	int val = 0;
	3913
	3914	lwkt_gettoken(&pmap->pm_token);
	3915	ptep = pmap_pte(pmap, addr);
	3916
	3917	if (ptep && (pte = *ptep) != 0) {
	3918	vm_offset_t pa;
	3919
	3920	val = MINCORE_INCORE;
	3921	if ((pte & PG_MANAGED) == 0)
	3922	goto done;
	3923
	3924	pa = pte & PG_FRAME;
	3925
	3926	m = PHYS_TO_VM_PAGE(pa);
	3927
	3928	/*
	3929	* Modified by us
	3930	*/
	3931	if (pte & PG_M)
	3932	val \|= MINCORE_MODIFIED\|MINCORE_MODIFIED_OTHER;
	3933	/*
	3934	* Modified by someone
	3935	*/
	3936	else if (m->dirty \|\| pmap_is_modified(m))
	3937	val \|= MINCORE_MODIFIED_OTHER;
	3938	/*
	3939	* Referenced by us
	3940	*/
	3941	if (pte & PG_A)
	3942	val \|= MINCORE_REFERENCED\|MINCORE_REFERENCED_OTHER;
	3943
	3944	/*
	3945	* Referenced by someone
	3946	*/
	3947	else if ((m->flags & PG_REFERENCED) \|\| pmap_ts_referenced(m)) {
	3948	val \|= MINCORE_REFERENCED_OTHER;
	3949	vm_page_flag_set(m, PG_REFERENCED);
	3950	}
	3951	}
	3952	done:
	3953	lwkt_reltoken(&pmap->pm_token);
	3954
	3955	return val;
	3956	}
	3957
	3958	/*
	3959	* Replace p->p_vmspace with a new one. If adjrefs is non-zero the new
	3960	* vmspace will be ref'd and the old one will be deref'd.
	3961	*
	3962	* The vmspace for all lwps associated with the process will be adjusted
	3963	* and cr3 will be reloaded if any lwp is the current lwp.
	3964	*
	3965	* The process must hold the vmspace->vm_map.token for oldvm and newvm
	3966	*/
	3967	void
	3968	pmap_replacevm(struct proc p, struct vmspace newvm, int adjrefs)
	3969	{
	3970	struct vmspace *oldvm;
	3971	struct lwp *lp;
	3972
	3973	oldvm = p->p_vmspace;
	3974	if (oldvm != newvm) {
	3975	if (adjrefs)
	3976	sysref_get(&newvm->vm_sysref);
	3977	p->p_vmspace = newvm;
	3978	KKASSERT(p->p_nthreads == 1);
	3979	lp = RB_ROOT(&p->p_lwp_tree);
	3980	pmap_setlwpvm(lp, newvm);
	3981	if (adjrefs)
	3982	sysref_put(&oldvm->vm_sysref);
	3983	}
	3984	}
	3985
	3986	/*
	3987	* Set the vmspace for a LWP. The vmspace is almost universally set the
	3988	* same as the process vmspace, but virtual kernels need to swap out contexts
	3989	* on a per-lwp basis.
	3990	*
	3991	* Caller does not necessarily hold any vmspace tokens. Caller must control
	3992	* the lwp (typically be in the context of the lwp). We use a critical
	3993	* section to protect against statclock and hardclock (statistics collection).
	3994	*/
	3995	void
	3996	pmap_setlwpvm(struct lwp lp, struct vmspace newvm)
	3997	{
	3998	struct vmspace *oldvm;
	3999	struct pmap *pmap;
	4000
	4001	oldvm = lp->lwp_vmspace;
	4002
	4003	if (oldvm != newvm) {
	4004	crit_enter();
	4005	lp->lwp_vmspace = newvm;
	4006	if (curthread->td_lwp == lp) {
	4007	pmap = vmspace_pmap(newvm);
	4008	#if defined(SMP)
	4009	atomic_set_cpumask(&pmap->pm_active, mycpu->gd_cpumask);
	4010	if (pmap->pm_active & CPUMASK_LOCK)
	4011	pmap_interlock_wait(newvm);
	4012	#else
	4013	pmap->pm_active \|= 1;
	4014	#endif
	4015	#if defined(SWTCH_OPTIM_STATS)
	4016	tlb_flush_count++;
	4017	#endif
	4018	curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
	4019	curthread->td_pcb->pcb_cr3 \|= PG_RW \| PG_U \| PG_V;
	4020	load_cr3(curthread->td_pcb->pcb_cr3);
	4021	pmap = vmspace_pmap(oldvm);
	4022	#if defined(SMP)
	4023	atomic_clear_cpumask(&pmap->pm_active, mycpu->gd_cpumask);
	4024	#else
	4025	pmap->pm_active &= ~(cpumask_t)1;
	4026	#endif
	4027	}
	4028	crit_exit();
	4029	}
	4030	}
	4031
	4032	#ifdef SMP
	4033
	4034	/*
	4035	* Called when switching to a locked pmap, used to interlock against pmaps
	4036	* undergoing modifications to prevent us from activating the MMU for the
	4037	* target pmap until all such modifications have completed. We have to do
	4038	* this because the thread making the modifications has already set up its
	4039	* SMP synchronization mask.
	4040	*
	4041	* This function cannot sleep!
	4042	*
	4043	* No requirements.
	4044	*/
	4045	void
	4046	pmap_interlock_wait(struct vmspace *vm)
	4047	{
	4048	struct pmap *pmap = &vm->vm_pmap;
	4049
	4050	if (pmap->pm_active & CPUMASK_LOCK) {
	4051	crit_enter();
	4052	KKASSERT(curthread->td_critcount >= 2);
	4053	DEBUG_PUSH_INFO("pmap_interlock_wait");
	4054	while (pmap->pm_active & CPUMASK_LOCK) {
	4055	cpu_ccfence();
	4056	lwkt_process_ipiq();
	4057	}
	4058	DEBUG_POP_INFO();
	4059	crit_exit();
	4060	}
	4061	}
	4062
	4063	#endif
	4064
	4065	vm_offset_t
	4066	pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
	4067	{
	4068
	4069	if ((obj == NULL) \|\| (size < NBPDR) \|\| (obj->type != OBJT_DEVICE)) {
	4070	return addr;
	4071	}
	4072
	4073	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	4074	return addr;
	4075	}
	4076
	4077	/*
	4078	* Used by kmalloc/kfree, page already exists at va
	4079	*/
	4080	vm_page_t
	4081	pmap_kvtom(vm_offset_t va)
	4082	{
	4083	return(PHYS_TO_VM_PAGE(*vtopte(va) & PG_FRAME));
	4084	}