gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1991 Regents of the University of California.
	3	* Copyright (c) 1994 John S. Dyson
	4	* Copyright (c) 1994 David Greenman
	5	* Copyright (c) 2008 The DragonFly Project.
	6	* Copyright (c) 2008 Jordan Gordeev.
	7	* All rights reserved.
	8	*
	9	* This code is derived from software contributed to Berkeley by
	10	* the Systems Programming Group of the University of Utah Computer
	11	* Science Department and William Jolitz of UUNET Technologies Inc.
	12	*
	13	* Redistribution and use in source and binary forms, with or without
	14	* modification, are permitted provided that the following conditions
	15	* are met:
	16	* 1. Redistributions of source code must retain the above copyright
	17	* notice, this list of conditions and the following disclaimer.
	18	* 2. Redistributions in binary form must reproduce the above copyright
	19	* notice, this list of conditions and the following disclaimer in the
	20	* documentation and/or other materials provided with the distribution.
	21	* 3. All advertising materials mentioning features or use of this software
	22	* must display the following acknowledgement:
	23	* This product includes software developed by the University of
	24	* California, Berkeley and its contributors.
	25	* 4. Neither the name of the University nor the names of its contributors
	26	* may be used to endorse or promote products derived from this software
	27	* without specific prior written permission.
	28	*
	29	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	30	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	31	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	32	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	33	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	34	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	35	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	36	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	37	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	38	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	39	* SUCH DAMAGE.
	40	*
	41	* from: @(#)pmap.c 7.7 (Berkeley) 5/12/91
	42	* $FreeBSD: src/sys/i386/i386/pmap.c,v 1.250.2.18 2002/03/06 22:48:53 silby Exp $
	43	* $DragonFly: src/sys/platform/pc64/amd64/pmap.c,v 1.3 2008/08/29 17:07:10 dillon Exp $
	44	*/
	45
	46	/*
	47	* Manages physical address maps.
	48	*
	49	* In addition to hardware address maps, this
	50	* module is called upon to provide software-use-only
	51	* maps which may or may not be stored in the same
	52	* form as hardware maps. These pseudo-maps are
	53	* used to store intermediate results from copy
	54	* operations to and from address spaces.
	55	*
	56	* Since the information managed by this module is
	57	* also stored by the logical address mapping module,
	58	* this module may throw away valid virtual-to-physical
	59	* mappings at almost any time. However, invalidations
	60	* of virtual-to-physical mappings must be done as
	61	* requested.
	62	*
	63	* In order to cope with hardware architectures which
	64	* make virtual-to-physical map invalidates expensive,
	65	* this module may delay invalidate or reduced protection
	66	* operations until such time as they are actually
	67	* necessary. This module is given full information as
	68	* to which processors are currently using which maps,
	69	* and to when physical maps must be made correct.
	70	*/
	71
	72	#if JG
	73	#include "opt_disable_pse.h"
	74	#include "opt_pmap.h"
	75	#endif
	76	#include "opt_msgbuf.h"
	77
	78	#include <sys/param.h>
	79	#include <sys/systm.h>
	80	#include <sys/kernel.h>
	81	#include <sys/proc.h>
	82	#include <sys/msgbuf.h>
	83	#include <sys/vmmeter.h>
	84	#include <sys/mman.h>
	85
	86	#include <vm/vm.h>
	87	#include <vm/vm_param.h>
	88	#include <sys/sysctl.h>
	89	#include <sys/lock.h>
	90	#include <vm/vm_kern.h>
	91	#include <vm/vm_page.h>
	92	#include <vm/vm_map.h>
	93	#include <vm/vm_object.h>
	94	#include <vm/vm_extern.h>
	95	#include <vm/vm_pageout.h>
	96	#include <vm/vm_pager.h>
	97	#include <vm/vm_zone.h>
	98
	99	#include <sys/user.h>
	100	#include <sys/thread2.h>
	101	#include <sys/sysref2.h>
	102
	103	#include <machine/cputypes.h>
	104	#include <machine/md_var.h>
	105	#include <machine/specialreg.h>
	106	#include <machine/smp.h>
	107	#include <machine_base/apic/apicreg.h>
	108	#include <machine/globaldata.h>
	109	#include <machine/pmap.h>
	110	#include <machine/pmap_inval.h>
	111
	112	#define PMAP_KEEP_PDIRS
	113	#ifndef PMAP_SHPGPERPROC
	114	#define PMAP_SHPGPERPROC 200
	115	#endif
	116
	117	#if defined(DIAGNOSTIC)
	118	#define PMAP_DIAGNOSTIC
	119	#endif
	120
	121	#define MINPV 2048
	122
	123	#if !defined(PMAP_DIAGNOSTIC)
	124	#define PMAP_INLINE __inline
	125	#else
	126	#define PMAP_INLINE
	127	#endif
	128
	129	/*
	130	* Get PDEs and PTEs for user/kernel address space
	131	*/
	132	#define pmap_pde(m, v) (&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
	133	#define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
	134
	135	#define pmap_pde_v(pte) (((pd_entry_t )pte & PG_V) != 0)
	136	#define pmap_pte_w(pte) (((pt_entry_t )pte & PG_W) != 0)
	137	#define pmap_pte_m(pte) (((pt_entry_t )pte & PG_M) != 0)
	138	#define pmap_pte_u(pte) (((pt_entry_t )pte & PG_A) != 0)
	139	#define pmap_pte_v(pte) (((pt_entry_t )pte & PG_V) != 0)
	140
	141
	142	/*
	143	* Given a map and a machine independent protection code,
	144	* convert to a vax protection code.
	145	*/
	146	#define pte_prot(m, p) \
	147	(protection_codes[p & (VM_PROT_READ\|VM_PROT_WRITE\|VM_PROT_EXECUTE)])
	148	static int protection_codes[8];
	149
	150	struct pmap kernel_pmap;
	151	static TAILQ_HEAD(,pmap) pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
	152
	153	vm_paddr_t avail_start; /* PA of first available physical page */
	154	vm_paddr_t avail_end; /* PA of last available physical page */
	155	vm_offset_t virtual_start; /* VA of first avail page (after kernel bss) */
	156	vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
	157	vm_offset_t KvaStart; /* VA start of KVA space */
	158	vm_offset_t KvaEnd; /* VA end of KVA space (non-inclusive) */
	159	vm_offset_t KvaSize; /* max size of kernel virtual address space */
	160	static boolean_t pmap_initialized = FALSE; /* Has pmap_init completed? */
	161	static int pgeflag; /* PG_G or-in */
	162	static int pseflag; /* PG_PS or-in */
	163
	164	static vm_object_t kptobj;
	165
	166	static int nkpt;
	167	vm_offset_t kernel_vm_end;
	168
	169	/*
	170	* Data for the pv entry allocation mechanism
	171	*/
	172	static vm_zone_t pvzone;
	173	static struct vm_zone pvzone_store;
	174	static struct vm_object pvzone_obj;
	175	static int pv_entry_count=0, pv_entry_max=0, pv_entry_high_water=0;
	176	static int pmap_pagedaemon_waken = 0;
	177	static struct pv_entry *pvinit;
	178
	179	/*
	180	* All those kernel PT submaps that BSD is so fond of
	181	*/
	182	pt_entry_t CMAP1 = 0, ptmmap;
	183	caddr_t CADDR1 = 0, ptvmmap = 0;
	184	static pt_entry_t *msgbufmap;
	185	struct msgbuf *msgbufp=0;
	186
	187	/*
	188	* Crashdump maps.
	189	*/
	190	static pt_entry_t *pt_crashdumpmap;
	191	static caddr_t crashdumpmap;
	192
	193	extern uint64_t KPTphys;
	194	extern pt_entry_t *SMPpt;
	195	extern uint64_t SMPptpa;
	196
	197	#define DISABLE_PSE
	198
	199	static PMAP_INLINE void free_pv_entry (pv_entry_t pv);
	200	static pt_entry_t * get_ptbase (pmap_t pmap);
	201	static pv_entry_t get_pv_entry (void);
	202	static void i386_protection_init (void);
	203	static __inline void pmap_clearbit (vm_page_t m, int bit);
	204
	205	static void pmap_remove_all (vm_page_t m);
	206	static void pmap_enter_quick (pmap_t pmap, vm_offset_t va, vm_page_t m);
	207	static int pmap_remove_pte (struct pmap pmap, pt_entry_t ptq,
	208	vm_offset_t sva, pmap_inval_info_t info);
	209	static void pmap_remove_page (struct pmap *pmap,
	210	vm_offset_t va, pmap_inval_info_t info);
	211	static int pmap_remove_entry (struct pmap *pmap, vm_page_t m,
	212	vm_offset_t va, pmap_inval_info_t info);
	213	static boolean_t pmap_testbit (vm_page_t m, int bit);
	214	static void pmap_insert_entry (pmap_t pmap, vm_offset_t va,
	215	vm_page_t mpte, vm_page_t m);
	216
	217	static vm_page_t pmap_allocpte (pmap_t pmap, vm_offset_t va);
	218
	219	static int pmap_release_free_page (pmap_t pmap, vm_page_t p);
	220	static vm_page_t _pmap_allocpte (pmap_t pmap, vm_pindex_t ptepindex);
	221	static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
	222	static vm_page_t pmap_page_lookup (vm_object_t object, vm_pindex_t pindex);
	223	static int pmap_unuse_pt (pmap_t, vm_offset_t, vm_page_t, pmap_inval_info_t);
	224	static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
	225
	226	static unsigned pdir4mb;
	227
	228	/*
	229	* Move the kernel virtual free pointer to the next
	230	* 4MB. This is used to help improve performance
	231	* by using a large (4MB) page for much of the kernel
	232	* (.text, .data, .bss)
	233	*/
	234	static vm_offset_t
	235	pmap_kmem_choose(vm_offset_t addr)
	236	{
	237	vm_offset_t newaddr = addr;
	238	#ifndef DISABLE_PSE
	239	if (cpu_feature & CPUID_PSE) {
	240	newaddr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	241	}
	242	#endif
	243	return newaddr;
	244	}
	245
	246	/*
	247	* pmap_pte:
	248	*
	249	* Extract the page table entry associated with the given map/virtual
	250	* pair.
	251	*
	252	* This function may NOT be called from an interrupt.
	253	*/
	254	PMAP_INLINE pt_entry_t *
	255	pmap_pte(pmap_t pmap, vm_offset_t va)
	256	{
	257	pd_entry_t *pdeaddr;
	258
	259	if (pmap) {
	260	pdeaddr = pmap_pde(pmap, va);
	261	if (*pdeaddr & PG_PS)
	262	return pdeaddr;
	263	if (*pdeaddr) {
	264	return get_ptbase(pmap) + amd64_btop(va);
	265	}
	266	}
	267	return (0);
	268	}
	269
	270	/*
	271	* pmap_pte_quick:
	272	*
	273	* Super fast pmap_pte routine best used when scanning the pv lists.
	274	* This eliminates many course-grained invltlb calls. Note that many of
	275	* the pv list scans are across different pmaps and it is very wasteful
	276	* to do an entire invltlb when checking a single mapping.
	277	*
	278	* Should only be called while in a critical section.
	279	*/
	280	static pt_entry_t *
	281	pmap_pte_quick(pmap_t pmap, vm_offset_t va)
	282	{
	283	struct mdglobaldata *gd = mdcpu;
	284	pd_entry_t pde, newpf;
	285
	286	if ((pde = pmap->pm_pdir[va >> PDRSHIFT]) != 0) {
	287	pd_entry_t frame = pmap->pm_pdir[PTDPTDI] & PG_FRAME;
	288	vm_pindex_t index = amd64_btop(va);
	289	/* are we current address space or kernel? */
	290	if ((pmap == &kernel_pmap) \|\|
	291	(frame == (PTDpde & PG_FRAME))) {
	292	return (pt_entry_t *) PTmap + index;
	293	}
	294	newpf = pde & PG_FRAME;
	295	if ( ((* (pt_entry_t *) gd->gd_PMAP1) & PG_FRAME) != newpf) {
	296	* (pt_entry_t *) gd->gd_PMAP1 = newpf \| PG_RW \| PG_V;
	297	cpu_invlpg(gd->gd_PADDR1);
	298	}
	299	return gd->gd_PADDR1 + (index & (NPTEPG - 1));
	300	}
	301	return (0);
	302	}
	303
	304
	305	static u_int64_t
	306	allocpages(vm_paddr_t *firstaddr, int n)
	307	{
	308	u_int64_t ret;
	309
	310	ret = *firstaddr;
	311	bzero((void )ret, n PAGE_SIZE);
	312	firstaddr += n PAGE_SIZE;
	313	return (ret);
	314	}
	315
	316	void
	317	create_pagetables(vm_paddr_t *firstaddr)
	318	{
	319	int i;
	320	int count;
	321	uint64_t cpu0pp, cpu0idlestk;
	322	int idlestk_page_offset = offsetof(struct privatespace, idlestack) / PAGE_SIZE;
	323
	324	/* we are running (mostly) V=P at this point */
	325
	326	common_lvl4_phys = allocpages(firstaddr, 1); /* 512 512G mappings */
	327	common_lvl3_phys = allocpages(firstaddr, 1); /* 512 1G mappings */
	328	KPTphys = allocpages(firstaddr, NKPT); /* kernel page table */
	329	IdlePTD = allocpages(firstaddr, 1); /* kernel page dir */
	330	cpu0pp = allocpages(firstaddr, MDGLOBALDATA_BASEALLOC_PAGES);
	331	cpu0idlestk = allocpages(firstaddr, UPAGES);
	332	SMPptpa = allocpages(firstaddr, 1);
	333	SMPpt = (void *)(SMPptpa + KERNBASE);
	334
	335
	336	/*
	337	* Load kernel page table with kernel memory mappings
	338	*/
	339	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
	340	((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
	341	((pt_entry_t *)KPTphys)[i] \|= PG_RW \| PG_V;
	342	}
	343
	344	#ifndef JG
	345	for (i = 0; i < NKPT; i++) {
	346	((pd_entry_t *)IdlePTD)[i] = KPTphys + (i << PAGE_SHIFT);
	347	((pd_entry_t *)IdlePTD)[i] \|= PG_RW \| PG_V;
	348	}
	349	#endif
	350
	351	/*
	352	* Set up the kernel page table itself.
	353	*/
	354	for (i = 0; i < NKPT; i++) {
	355	((pd_entry_t *)IdlePTD)[KPTDI + i] = KPTphys + (i << PAGE_SHIFT);
	356	((pd_entry_t *)IdlePTD)[KPTDI + i] \|= PG_RW \| PG_V;
	357	}
	358
	359	#ifndef JG
	360	count = ISA_HOLE_LENGTH >> PAGE_SHIFT;
	361	for (i = 0; i < count; i++) {
	362	((pt_entry_t *)KPTphys)[amd64_btop(ISA_HOLE_START) + i] = \
	363	(ISA_HOLE_START + i * PAGE_SIZE) \| PG_RW \| PG_V;
	364	}
	365	#endif
	366
	367	/*
	368	* Self-mapping
	369	*/
	370	((pd_entry_t *)IdlePTD)[PTDPTDI] = (pd_entry_t)IdlePTD \| PG_RW \| PG_V;
	371
	372	/*
	373	* Map CPU_prvspace[0].mdglobaldata
	374	*/
	375	for (i = 0; i < MDGLOBALDATA_BASEALLOC_PAGES; i++) {
	376	((pt_entry_t *)SMPptpa)[i] = \
	377	(cpu0pp + i * PAGE_SIZE) \| PG_RW \| PG_V;
	378	}
	379
	380	/*
	381	* Map CPU_prvspace[0].idlestack
	382	*/
	383	for (i = 0; i < UPAGES; i++) {
	384	((pt_entry_t *)SMPptpa)[idlestk_page_offset + i] = \
	385	(cpu0idlestk + i * PAGE_SIZE) \| PG_RW \| PG_V;
	386	}
	387
	388	/*
	389	* Link SMPpt.
	390	*/
	391	((pd_entry_t *)IdlePTD)[MPPTDI] = SMPptpa \| PG_RW \| PG_V;
	392
	393	/*
	394	* PML4 maps level 3
	395	*/
	396	((pml4_entry_t *)common_lvl4_phys)[LINKPML4I] = common_lvl3_phys \| PG_RW \| PG_V \| PG_U;
	397
	398	/*
	399	* location of "virtual CR3" - a PDP entry that is loaded
	400	* with a PD physical address (+ page attributes).
	401	* Matt: location of user page directory entry (representing 1G)
	402	*/
	403	link_pdpe = &((pdp_entry_t *)common_lvl3_phys)[LINKPDPI];
	404	}
	405
	406	void
	407	init_paging(vm_paddr_t *firstaddr) {
	408	create_pagetables(firstaddr);
	409
	410	/* switch to the newly created page table */
	411	*link_pdpe = IdlePTD \| PG_RW \| PG_V \| PG_U;
	412	load_cr3(common_lvl4_phys);
	413	link_pdpe = (void )((char )link_pdpe + KERNBASE);
	414
	415	KvaStart = (vm_offset_t)VADDR(PTDPTDI, 0);
	416	KvaEnd = (vm_offset_t)VADDR(APTDPTDI, 0);
	417	KvaSize = KvaEnd - KvaStart;
	418	}
	419
	420	/*
	421	* Bootstrap the system enough to run with virtual memory.
	422	*
	423	* On the i386 this is called after mapping has already been enabled
	424	* and just syncs the pmap module with what has already been done.
	425	* [We can't call it easily with mapping off since the kernel is not
	426	* mapped with PA == VA, hence we would have to relocate every address
	427	* from the linked base (virtual) address "KERNBASE" to the actual
	428	* (physical) address starting relative to 0]
	429	*/
	430	void
	431	pmap_bootstrap(vm_paddr_t *firstaddr, vm_paddr_t loadaddr)
	432	{
	433	vm_offset_t va;
	434	pt_entry_t *pte;
	435	struct mdglobaldata *gd;
	436	int i;
	437	int pg;
	438
	439	avail_start = *firstaddr;
	440
	441	/*
	442	* XXX The calculation of virtual_start is wrong. It's NKPT*PAGE_SIZE
	443	* too large. It should instead be correctly calculated in locore.s and
	444	* not based on 'first' (which is a physical address, not a virtual
	445	* address, for the start of unused physical memory). The kernel
	446	* page tables are NOT double mapped and thus should not be included
	447	* in this calculation.
	448	*/
	449	virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr;
	450	virtual_start = pmap_kmem_choose(virtual_start);
	451	virtual_end = VADDR(KPTDI+NKPDE-1, NPTEPG-1);
	452
	453	/*
	454	* Initialize protection array.
	455	*/
	456	i386_protection_init();
	457
	458	/*
	459	* The kernel's pmap is statically allocated so we don't have to use
	460	* pmap_create, which is unlikely to work correctly at this part of
	461	* the boot sequence (XXX and which no longer exists).
	462	*/
	463	kernel_pmap.pm_pdir = (pd_entry_t *)(PTOV_OFFSET + (uint64_t)IdlePTD);
	464	kernel_pmap.pm_count = 1;
	465	kernel_pmap.pm_active = (cpumask_t)-1; /* don't allow deactivation */
	466	TAILQ_INIT(&kernel_pmap.pm_pvlist);
	467	nkpt = NKPT;
	468
	469	/*
	470	* Reserve some special page table entries/VA space for temporary
	471	* mapping of pages.
	472	*/
	473	#define SYSMAP(c, p, v, n) \
	474	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
	475
	476	va = virtual_start;
	477	pte = (pt_entry_t *) pmap_pte(&kernel_pmap, va);
	478
	479	/*
	480	* CMAP1/CMAP2 are used for zeroing and copying pages.
	481	*/
	482	SYSMAP(caddr_t, CMAP1, CADDR1, 1)
	483
	484	/*
	485	* Crashdump maps.
	486	*/
	487	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);
	488
	489	/*
	490	* ptvmmap is used for reading arbitrary physical pages via
	491	* /dev/mem.
	492	*/
	493	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)
	494
	495	/*
	496	* msgbufp is used to map the system message buffer.
	497	* XXX msgbufmap is not used.
	498	*/
	499	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
	500	atop(round_page(MSGBUF_SIZE)))
	501
	502	virtual_start = va;
	503
	504	*CMAP1 = 0;
	505	for (i = 0; i < NKPT; i++)
	506	PTD[i] = 0;
	507
	508	/*
	509	* PG_G is terribly broken on SMP because we IPI invltlb's in some
	510	* cases rather then invl1pg. Actually, I don't even know why it
	511	* works under UP because self-referential page table mappings
	512	*/
	513	#ifdef SMP
	514	pgeflag = 0;
	515	#else
	516	if (cpu_feature & CPUID_PGE)
	517	pgeflag = PG_G;
	518	#endif
	519
	520	/*
	521	* Initialize the 4MB page size flag
	522	*/
	523	pseflag = 0;
	524	/*
	525	* The 4MB page version of the initial
	526	* kernel page mapping.
	527	*/
	528	pdir4mb = 0;
	529
	530	#if !defined(DISABLE_PSE)
	531	if (cpu_feature & CPUID_PSE) {
	532	pt_entry_t ptditmp;
	533	/*
	534	* Note that we have enabled PSE mode
	535	*/
	536	pseflag = PG_PS;
	537	ptditmp = *(PTmap + amd64_btop(KERNBASE));
	538	ptditmp &= ~(NBPDR - 1);
	539	ptditmp \|= PG_V \| PG_RW \| PG_PS \| PG_U \| pgeflag;
	540	pdir4mb = ptditmp;
	541
	542	#ifndef SMP
	543	/*
	544	* Enable the PSE mode. If we are SMP we can't do this
	545	* now because the APs will not be able to use it when
	546	* they boot up.
	547	*/
	548	load_cr4(rcr4() \| CR4_PSE);
	549
	550	/*
	551	* We can do the mapping here for the single processor
	552	* case. We simply ignore the old page table page from
	553	* now on.
	554	*/
	555	/*
	556	* For SMP, we still need 4K pages to bootstrap APs,
	557	* PSE will be enabled as soon as all APs are up.
	558	*/
	559	PTD[KPTDI] = (pd_entry_t)ptditmp;
	560	kernel_pmap.pm_pdir[KPTDI] = (pd_entry_t)ptditmp;
	561	cpu_invltlb();
	562	#endif
	563	}
	564	#endif
	565	#ifdef SMP
	566	if (cpu_apic_address == 0)
	567	panic("pmap_bootstrap: no local apic!");
	568
	569	/* local apic is mapped on last page */
	570	SMPpt[NPTEPG - 1] = (pt_entry_t)(PG_V \| PG_RW \| PG_N \| pgeflag \|
	571	(cpu_apic_address & PG_FRAME));
	572	#endif
	573
	574	/*
	575	* We need to finish setting up the globaldata page for the BSP.
	576	* locore has already populated the page table for the mdglobaldata
	577	* portion.
	578	*/
	579	pg = MDGLOBALDATA_BASEALLOC_PAGES;
	580	gd = &CPU_prvspace[0].mdglobaldata;
	581	gd->gd_CMAP1 = &SMPpt[pg + 0];
	582	gd->gd_CMAP2 = &SMPpt[pg + 1];
	583	gd->gd_CMAP3 = &SMPpt[pg + 2];
	584	gd->gd_PMAP1 = &SMPpt[pg + 3];
	585	gd->gd_CADDR1 = CPU_prvspace[0].CPAGE1;
	586	gd->gd_CADDR2 = CPU_prvspace[0].CPAGE2;
	587	gd->gd_CADDR3 = CPU_prvspace[0].CPAGE3;
	588	gd->gd_PADDR1 = (pt_entry_t *)CPU_prvspace[0].PPAGE1;
	589
	590	cpu_invltlb();
	591	}
	592
	593	#ifdef SMP
	594	/*
	595	* Set 4mb pdir for mp startup
	596	*/
	597	void
	598	pmap_set_opt(void)
	599	{
	600	if (pseflag && (cpu_feature & CPUID_PSE)) {
	601	load_cr4(rcr4() \| CR4_PSE);
	602	if (pdir4mb && mycpu->gd_cpuid == 0) { /* only on BSP */
	603	kernel_pmap.pm_pdir[KPTDI] =
	604	PTD[KPTDI] = (pd_entry_t)pdir4mb;
	605	cpu_invltlb();
	606	}
	607	}
	608	}
	609	#endif
	610
	611	/*
	612	* Initialize the pmap module.
	613	* Called by vm_init, to initialize any structures that the pmap
	614	* system needs to map virtual memory.
	615	* pmap_init has been enhanced to support in a fairly consistant
	616	* way, discontiguous physical memory.
	617	*/
	618	void
	619	pmap_init(void)
	620	{
	621	int i;
	622	int initial_pvs;
	623
	624	/*
	625	* object for kernel page table pages
	626	*/
	627	kptobj = vm_object_allocate(OBJT_DEFAULT, NKPDE);
	628
	629	/*
	630	* Allocate memory for random pmap data structures. Includes the
	631	* pv_head_table.
	632	*/
	633
	634	for(i = 0; i < vm_page_array_size; i++) {
	635	vm_page_t m;
	636
	637	m = &vm_page_array[i];
	638	TAILQ_INIT(&m->md.pv_list);
	639	m->md.pv_list_count = 0;
	640	}
	641
	642	/*
	643	* init the pv free list
	644	*/
	645	initial_pvs = vm_page_array_size;
	646	if (initial_pvs < MINPV)
	647	initial_pvs = MINPV;
	648	pvzone = &pvzone_store;
	649	pvinit = (struct pv_entry *) kmem_alloc(&kernel_map,
	650	initial_pvs * sizeof (struct pv_entry));
	651	zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), pvinit,
	652	initial_pvs);
	653
	654	/*
	655	* Now it is safe to enable pv_table recording.
	656	*/
	657	pmap_initialized = TRUE;
	658	}
	659
	660	/*
	661	* Initialize the address space (zone) for the pv_entries. Set a
	662	* high water mark so that the system can recover from excessive
	663	* numbers of pv entries.
	664	*/
	665	void
	666	pmap_init2(void)
	667	{
	668	int shpgperproc = PMAP_SHPGPERPROC;
	669
	670	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
	671	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
	672	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
	673	pv_entry_high_water = 9 * (pv_entry_max / 10);
	674	zinitna(pvzone, &pvzone_obj, NULL, 0, pv_entry_max, ZONE_INTERRUPT, 1);
	675	}
	676
	677
	678	/***************************************************
	679	* Low level helper routines.....
	680	***************************************************/
	681
	682	#if defined(PMAP_DIAGNOSTIC)
	683
	684	/*
	685	* This code checks for non-writeable/modified pages.
	686	* This should be an invalid condition.
	687	*/
	688	static int
	689	pmap_nw_modified(pt_entry_t ptea)
	690	{
	691	int pte;
	692
	693	pte = (int) ptea;
	694
	695	if ((pte & (PG_M\|PG_RW)) == PG_M)
	696	return 1;
	697	else
	698	return 0;
	699	}
	700	#endif
	701
	702
	703	/*
	704	* this routine defines the region(s) of memory that should
	705	* not be tested for the modified bit.
	706	*/
	707	static PMAP_INLINE int
	708	pmap_track_modified(vm_offset_t va)
	709	{
	710	if ((va < clean_sva) \|\| (va >= clean_eva))
	711	return 1;
	712	else
	713	return 0;
	714	}
	715
	716	static pt_entry_t *
	717	get_ptbase(pmap_t pmap)
	718	{
	719	pd_entry_t frame = pmap->pm_pdir[PTDPTDI] & PG_FRAME;
	720	struct globaldata *gd = mycpu;
	721
	722	/* are we current address space or kernel? */
	723	if (pmap == &kernel_pmap \|\| frame == (PTDpde & PG_FRAME)) {
	724	return (pt_entry_t *) PTmap;
	725	}
	726
	727	/* otherwise, we are alternate address space */
	728	KKASSERT(gd->gd_intr_nesting_level == 0 &&
	729	(gd->gd_curthread->td_flags & TDF_INTTHREAD) == 0);
	730
	731	if (frame != (((pd_entry_t) APTDpde) & PG_FRAME)) {
	732	APTDpde = (pd_entry_t)(frame \| PG_RW \| PG_V);
	733	/* The page directory is not shared between CPUs */
	734	cpu_invltlb();
	735	}
	736	return (pt_entry_t *) APTmap;
	737	}
	738
	739	/*
	740	* pmap_extract:
	741	*
	742	* Extract the physical page address associated with the map/VA pair.
	743	*
	744	* This function may not be called from an interrupt if the pmap is
	745	* not kernel_pmap.
	746	*/
	747	vm_paddr_t
	748	pmap_extract(pmap_t pmap, vm_offset_t va)
	749	{
	750	vm_offset_t rtval;
	751	vm_offset_t pdirindex;
	752
	753	pdirindex = va >> PDRSHIFT;
	754	if (pmap && (rtval = pmap->pm_pdir[pdirindex])) {
	755	pt_entry_t *pte;
	756	if ((rtval & PG_PS) != 0) {
	757	rtval &= ~(NBPDR - 1);
	758	rtval \|= va & (NBPDR - 1);
	759	return rtval;
	760	}
	761	pte = get_ptbase(pmap) + amd64_btop(va);
	762	rtval = ((*pte & PG_FRAME) \| (va & PAGE_MASK));
	763	return rtval;
	764	}
	765	return 0;
	766	}
	767
	768	/***************************************************
	769	* Low level mapping routines.....
	770	***************************************************/
	771
	772	/*
	773	* Routine: pmap_kenter
	774	* Function:
	775	* Add a wired page to the KVA
	776	* NOTE! note that in order for the mapping to take effect -- you
	777	* should do an invltlb after doing the pmap_kenter().
	778	*/
	779	void
	780	pmap_kenter(vm_offset_t va, vm_paddr_t pa)
	781	{
	782	pt_entry_t *pte;
	783	pt_entry_t npte;
	784	pmap_inval_info info;
	785
	786	pmap_inval_init(&info);
	787	npte = pa \| PG_RW \| PG_V \| pgeflag;
	788	pte = vtopte(va);
	789	pmap_inval_add(&info, &kernel_pmap, va);
	790	*pte = npte;
	791	pmap_inval_flush(&info);
	792	}
	793
	794	/*
	795	* Routine: pmap_kenter_quick
	796	* Function:
	797	* Similar to pmap_kenter(), except we only invalidate the
	798	* mapping on the current CPU.
	799	*/
	800	void
	801	pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
	802	{
	803	pt_entry_t *pte;
	804	pt_entry_t npte;
	805
	806	npte = pa \| PG_RW \| PG_V \| pgeflag;
	807	pte = vtopte(va);
	808	*pte = npte;
	809	cpu_invlpg((void *)va);
	810	}
	811
	812	void
	813	pmap_kenter_sync(vm_offset_t va)
	814	{
	815	pmap_inval_info info;
	816
	817	pmap_inval_init(&info);
	818	pmap_inval_add(&info, &kernel_pmap, va);
	819	pmap_inval_flush(&info);
	820	}
	821
	822	void
	823	pmap_kenter_sync_quick(vm_offset_t va)
	824	{
	825	cpu_invlpg((void *)va);
	826	}
	827
	828	/*
	829	* remove a page from the kernel pagetables
	830	*/
	831	void
	832	pmap_kremove(vm_offset_t va)
	833	{
	834	pt_entry_t *pte;
	835	pmap_inval_info info;
	836
	837	pmap_inval_init(&info);
	838	pte = vtopte(va);
	839	pmap_inval_add(&info, &kernel_pmap, va);
	840	*pte = 0;
	841	pmap_inval_flush(&info);
	842	}
	843
	844	void
	845	pmap_kremove_quick(vm_offset_t va)
	846	{
	847	pt_entry_t *pte;
	848	pte = vtopte(va);
	849	*pte = 0;
	850	cpu_invlpg((void *)va);
	851	}
	852
	853	/*
	854	* XXX these need to be recoded. They are not used in any critical path.
	855	*/
	856	void
	857	pmap_kmodify_rw(vm_offset_t va)
	858	{
	859	*vtopte(va) \|= PG_RW;
	860	cpu_invlpg((void *)va);
	861	}
	862
	863	void
	864	pmap_kmodify_nc(vm_offset_t va)
	865	{
	866	*vtopte(va) \|= PG_N;
	867	cpu_invlpg((void *)va);
	868	}
	869
	870	/*
	871	* Used to map a range of physical addresses into kernel
	872	* virtual address space.
	873	*
	874	* For now, VM is already on, we only need to map the
	875	* specified memory.
	876	*/
	877	vm_offset_t
	878	pmap_map(vm_offset_t virt, vm_paddr_t start, vm_paddr_t end, int prot)
	879	{
	880	while (start < end) {
	881	pmap_kenter(virt, start);
	882	virt += PAGE_SIZE;
	883	start += PAGE_SIZE;
	884	}
	885	return (virt);
	886	}
	887
	888
	889	/*
	890	* Add a list of wired pages to the kva
	891	* this routine is only used for temporary
	892	* kernel mappings that do not need to have
	893	* page modification or references recorded.
	894	* Note that old mappings are simply written
	895	* over. The page must be wired.
	896	*/
	897	void
	898	pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
	899	{
	900	vm_offset_t end_va;
	901
	902	end_va = va + count * PAGE_SIZE;
	903
	904	while (va < end_va) {
	905	pt_entry_t *pte;
	906
	907	pte = vtopte(va);
	908	pte = VM_PAGE_TO_PHYS(m) \| PG_RW \| PG_V \| pgeflag;
	909	cpu_invlpg((void *)va);
	910	va += PAGE_SIZE;
	911	m++;
	912	}
	913	#ifdef SMP
	914	smp_invltlb(); /* XXX */
	915	#endif
	916	}
	917
	918	void
	919	pmap_qenter2(vm_offset_t va, vm_page_t m, int count, cpumask_t mask)
	920	{
	921	vm_offset_t end_va;
	922	cpumask_t cmask = mycpu->gd_cpumask;
	923
	924	end_va = va + count * PAGE_SIZE;
	925
	926	while (va < end_va) {
	927	pt_entry_t *pte;
	928	pt_entry_t pteval;
	929
	930	/*
	931	* Install the new PTE. If the pte changed from the prior
	932	* mapping we must reset the cpu mask and invalidate the page.
	933	* If the pte is the same but we have not seen it on the
	934	* current cpu, invlpg the existing mapping. Otherwise the
	935	* entry is optimal and no invalidation is required.
	936	*/
	937	pte = vtopte(va);
	938	pteval = VM_PAGE_TO_PHYS(*m) \| PG_A \| PG_RW \| PG_V \| pgeflag;
	939	if (*pte != pteval) {
	940	*mask = 0;
	941	*pte = pteval;
	942	cpu_invlpg((void *)va);
	943	} else if ((*mask & cmask) == 0) {
	944	cpu_invlpg((void *)va);
	945	}
	946	va += PAGE_SIZE;
	947	m++;
	948	}
	949	*mask \|= cmask;
	950	}
	951
	952	/*
	953	* this routine jerks page mappings from the
	954	* kernel -- it is meant only for temporary mappings.
	955	*/
	956	void
	957	pmap_qremove(vm_offset_t va, int count)
	958	{
	959	vm_offset_t end_va;
	960
	961	end_va = va + count*PAGE_SIZE;
	962
	963	while (va < end_va) {
	964	pt_entry_t *pte;
	965
	966	pte = vtopte(va);
	967	*pte = 0;
	968	cpu_invlpg((void *)va);
	969	va += PAGE_SIZE;
	970	}
	971	#ifdef SMP
	972	smp_invltlb();
	973	#endif
	974	}
	975
	976	/*
	977	* This routine works like vm_page_lookup() but also blocks as long as the
	978	* page is busy. This routine does not busy the page it returns.
	979	*
	980	* Unless the caller is managing objects whos pages are in a known state,
	981	* the call should be made with a critical section held so the page's object
	982	* association remains valid on return.
	983	*/
	984	static vm_page_t
	985	pmap_page_lookup(vm_object_t object, vm_pindex_t pindex)
	986	{
	987	vm_page_t m;
	988
	989	do {
	990	m = vm_page_lookup(object, pindex);
	991	} while (m && vm_page_sleep_busy(m, FALSE, "pplookp"));
	992
	993	return(m);
	994	}
	995
	996	/*
	997	* Create a new thread and optionally associate it with a (new) process.
	998	* NOTE! the new thread's cpu may not equal the current cpu.
	999	*/
	1000	void
	1001	pmap_init_thread(thread_t td)
	1002	{
	1003	/* enforce pcb placement */
	1004	td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1;
	1005	td->td_savefpu = &td->td_pcb->pcb_save;
	1006	td->td_sp = (char *)td->td_pcb - 16;
	1007	}
	1008
	1009	/*
	1010	* This routine directly affects the fork perf for a process.
	1011	*/
	1012	void
	1013	pmap_init_proc(struct proc *p)
	1014	{
	1015	}
	1016
	1017	/*
	1018	* Dispose the UPAGES for a process that has exited.
	1019	* This routine directly impacts the exit perf of a process.
	1020	*/
	1021	void
	1022	pmap_dispose_proc(struct proc *p)
	1023	{
	1024	KASSERT(p->p_lock == 0, ("attempt to dispose referenced proc! %p", p));
	1025	}
	1026
	1027	/***************************************************
	1028	* Page table page management routines.....
	1029	***************************************************/
	1030
	1031	/*
	1032	* This routine unholds page table pages, and if the hold count
	1033	* drops to zero, then it decrements the wire count.
	1034	*/
	1035	static int
	1036	_pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
	1037	{
	1038	/*
	1039	* Wait until we can busy the page ourselves. We cannot have
	1040	* any active flushes if we block.
	1041	*/
	1042	if (m->flags & PG_BUSY) {
	1043	pmap_inval_flush(info);
	1044	while (vm_page_sleep_busy(m, FALSE, "pmuwpt"))
	1045	;
	1046	}
	1047	KASSERT(m->queue == PQ_NONE,
	1048	("_pmap_unwire_pte_hold: %p->queue != PQ_NONE", m));
	1049
	1050	if (m->hold_count == 1) {
	1051	/*
	1052	* Unmap the page table page
	1053	*/
	1054	vm_page_busy(m);
	1055	pmap_inval_add(info, pmap, -1);
	1056	pmap->pm_pdir[m->pindex] = 0;
	1057
	1058	KKASSERT(pmap->pm_stats.resident_count > 0);
	1059	--pmap->pm_stats.resident_count;
	1060
	1061	if (pmap->pm_ptphint == m)
	1062	pmap->pm_ptphint = NULL;
	1063
	1064	/*
	1065	* This was our last hold, the page had better be unwired
	1066	* after we decrement wire_count.
	1067	*
	1068	* FUTURE NOTE: shared page directory page could result in
	1069	* multiple wire counts.
	1070	*/
	1071	vm_page_unhold(m);
	1072	--m->wire_count;
	1073	KKASSERT(m->wire_count == 0);
	1074	--vmstats.v_wire_count;
	1075	vm_page_flag_clear(m, PG_MAPPED \| PG_WRITEABLE);
	1076	vm_page_flash(m);
	1077	vm_page_free_zero(m);
	1078	return 1;
	1079	} else {
	1080	KKASSERT(m->hold_count > 1);
	1081	vm_page_unhold(m);
	1082	return 0;
	1083	}
	1084	}
	1085
	1086	static PMAP_INLINE int
	1087	pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
	1088	{
	1089	KKASSERT(m->hold_count > 0);
	1090	if (m->hold_count > 1) {
	1091	vm_page_unhold(m);
	1092	return 0;
	1093	} else {
	1094	return _pmap_unwire_pte_hold(pmap, m, info);
	1095	}
	1096	}
	1097
	1098	/*
	1099	* After removing a page table entry, this routine is used to
	1100	* conditionally free the page, and manage the hold/wire counts.
	1101	*/
	1102	static int
	1103	pmap_unuse_pt(pmap_t pmap, vm_offset_t va, vm_page_t mpte,
	1104	pmap_inval_info_t info)
	1105	{
	1106	vm_pindex_t ptepindex;
	1107	if (va >= UPT_MIN_ADDRESS)
	1108	return 0;
	1109
	1110	if (mpte == NULL) {
	1111	ptepindex = (va >> PDRSHIFT);
	1112	if (pmap->pm_ptphint &&
	1113	(pmap->pm_ptphint->pindex == ptepindex)) {
	1114	mpte = pmap->pm_ptphint;
	1115	} else {
	1116	pmap_inval_flush(info);
	1117	mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
	1118	pmap->pm_ptphint = mpte;
	1119	}
	1120	}
	1121
	1122	return pmap_unwire_pte_hold(pmap, mpte, info);
	1123	}
	1124
	1125	/*
	1126	* Initialize pmap0/vmspace0. This pmap is not added to pmap_list because
	1127	* it, and IdlePTD, represents the template used to update all other pmaps.
	1128	*
	1129	* On architectures where the kernel pmap is not integrated into the user
	1130	* process pmap, this pmap represents the process pmap, not the kernel pmap.
	1131	* kernel_pmap should be used to directly access the kernel_pmap.
	1132	*/
	1133	void
	1134	pmap_pinit0(struct pmap *pmap)
	1135	{
	1136	pmap->pm_pdir =
	1137	(pd_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
	1138	pmap_kenter((vm_offset_t)pmap->pm_pdir, (vm_offset_t) IdlePTD);
	1139	pmap->pm_count = 1;
	1140	pmap->pm_active = 0;
	1141	pmap->pm_ptphint = NULL;
	1142	TAILQ_INIT(&pmap->pm_pvlist);
	1143	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	1144	}
	1145
	1146	/*
	1147	* Initialize a preallocated and zeroed pmap structure,
	1148	* such as one in a vmspace structure.
	1149	*/
	1150	void
	1151	pmap_pinit(struct pmap *pmap)
	1152	{
	1153	vm_page_t ptdpg;
	1154
	1155	/*
	1156	* No need to allocate page table space yet but we do need a valid
	1157	* page directory table.
	1158	*/
	1159	if (pmap->pm_pdir == NULL) {
	1160	pmap->pm_pdir =
	1161	(pd_entry_t *)kmem_alloc_pageable(&kernel_map, PAGE_SIZE);
	1162	}
	1163
	1164	/*
	1165	* Allocate an object for the ptes
	1166	*/
	1167	if (pmap->pm_pteobj == NULL)
	1168	pmap->pm_pteobj = vm_object_allocate(OBJT_DEFAULT, PTDPTDI + 1);
	1169
	1170	/*
	1171	* Allocate the page directory page, unless we already have
	1172	* one cached. If we used the cached page the wire_count will
	1173	* already be set appropriately.
	1174	*/
	1175	if ((ptdpg = pmap->pm_pdirm) == NULL) {
	1176	ptdpg = vm_page_grab(pmap->pm_pteobj, PTDPTDI,
	1177	VM_ALLOC_NORMAL \| VM_ALLOC_RETRY);
	1178	pmap->pm_pdirm = ptdpg;
	1179	vm_page_flag_clear(ptdpg, PG_MAPPED \| PG_BUSY);
	1180	ptdpg->valid = VM_PAGE_BITS_ALL;
	1181	ptdpg->wire_count = 1;
	1182	++vmstats.v_wire_count;
	1183	pmap_kenter((vm_offset_t)pmap->pm_pdir, VM_PAGE_TO_PHYS(ptdpg));
	1184	}
	1185	if ((ptdpg->flags & PG_ZERO) == 0)
	1186	bzero(pmap->pm_pdir, PAGE_SIZE);
	1187
	1188	pmap->pm_pdir[MPPTDI] = PTD[MPPTDI];
	1189
	1190	/* install self-referential address mapping entry */
	1191	(pd_entry_t ) (pmap->pm_pdir + PTDPTDI) =
	1192	VM_PAGE_TO_PHYS(ptdpg) \| PG_V \| PG_RW \| PG_A \| PG_M;
	1193
	1194	pmap->pm_count = 1;
	1195	pmap->pm_active = 0;
	1196	pmap->pm_ptphint = NULL;
	1197	TAILQ_INIT(&pmap->pm_pvlist);
	1198	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
	1199	pmap->pm_stats.resident_count = 1;
	1200	}
	1201
	1202	/*
	1203	* Clean up a pmap structure so it can be physically freed. This routine
	1204	* is called by the vmspace dtor function. A great deal of pmap data is
	1205	* left passively mapped to improve vmspace management so we have a bit
	1206	* of cleanup work to do here.
	1207	*/
	1208	void
	1209	pmap_puninit(pmap_t pmap)
	1210	{
	1211	vm_page_t p;
	1212
	1213	KKASSERT(pmap->pm_active == 0);
	1214	if ((p = pmap->pm_pdirm) != NULL) {
	1215	KKASSERT(pmap->pm_pdir != NULL);
	1216	pmap_kremove((vm_offset_t)pmap->pm_pdir);
	1217	p->wire_count--;
	1218	vmstats.v_wire_count--;
	1219	KKASSERT((p->flags & PG_BUSY) == 0);
	1220	vm_page_busy(p);
	1221	vm_page_free_zero(p);
	1222	pmap->pm_pdirm = NULL;
	1223	}
	1224	if (pmap->pm_pdir) {
	1225	kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pdir, PAGE_SIZE);
	1226	pmap->pm_pdir = NULL;
	1227	}
	1228	if (pmap->pm_pteobj) {
	1229	vm_object_deallocate(pmap->pm_pteobj);
	1230	pmap->pm_pteobj = NULL;
	1231	}
	1232	}
	1233
	1234	/*
	1235	* Wire in kernel global address entries. To avoid a race condition
	1236	* between pmap initialization and pmap_growkernel, this procedure
	1237	* adds the pmap to the master list (which growkernel scans to update),
	1238	* then copies the template.
	1239	*/
	1240	void
	1241	pmap_pinit2(struct pmap *pmap)
	1242	{
	1243	crit_enter();
	1244	TAILQ_INSERT_TAIL(&pmap_list, pmap, pm_pmnode);
	1245	/* XXX copies current process, does not fill in MPPTDI */
	1246	bcopy(PTD + KPTDI, pmap->pm_pdir + KPTDI, nkpt * PTESIZE);
	1247	crit_exit();
	1248	}
	1249
	1250	/*
	1251	* Attempt to release and free a vm_page in a pmap. Returns 1 on success,
	1252	* 0 on failure (if the procedure had to sleep).
	1253	*
	1254	* When asked to remove the page directory page itself, we actually just
	1255	* leave it cached so we do not have to incur the SMP inval overhead of
	1256	* removing the kernel mapping. pmap_puninit() will take care of it.
	1257	*/
	1258	static int
	1259	pmap_release_free_page(struct pmap *pmap, vm_page_t p)
	1260	{
	1261	pd_entry_t pde = (pd_entry_t ) pmap->pm_pdir;
	1262	/*
	1263	* This code optimizes the case of freeing non-busy
	1264	* page-table pages. Those pages are zero now, and
	1265	* might as well be placed directly into the zero queue.
	1266	*/
	1267	if (vm_page_sleep_busy(p, FALSE, "pmaprl"))
	1268	return 0;
	1269
	1270	vm_page_busy(p);
	1271
	1272	/*
	1273	* Remove the page table page from the processes address space.
	1274	*/
	1275	pde[p->pindex] = 0;
	1276	KKASSERT(pmap->pm_stats.resident_count > 0);
	1277	--pmap->pm_stats.resident_count;
	1278
	1279	if (p->hold_count) {
	1280	panic("pmap_release: freeing held page table page");
	1281	}
	1282	if (pmap->pm_ptphint && (pmap->pm_ptphint->pindex == p->pindex))
	1283	pmap->pm_ptphint = NULL;
	1284
	1285	/*
	1286	* We leave the page directory page cached, wired, and mapped in
	1287	* the pmap until the dtor function (pmap_puninit()) gets called.
	1288	* However, still clean it up so we can set PG_ZERO.
	1289	*/
	1290	if (p->pindex == PTDPTDI) {
	1291	bzero(pde + KPTDI, nkpt * PTESIZE);
	1292	pde[MPPTDI] = 0;
	1293	pde[APTDPTDI] = 0;
	1294	vm_page_flag_set(p, PG_ZERO);
	1295	vm_page_wakeup(p);
	1296	} else {
	1297	p->wire_count--;
	1298	vmstats.v_wire_count--;
	1299	vm_page_free_zero(p);
	1300	}
	1301	return 1;
	1302	}
	1303
	1304	/*
	1305	* this routine is called if the page table page is not
	1306	* mapped correctly.
	1307	*/
	1308	static vm_page_t
	1309	_pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
	1310	{
	1311	vm_offset_t pteva, ptepa;
	1312	vm_page_t m;
	1313
	1314	/*
	1315	* Find or fabricate a new pagetable page
	1316	*/
	1317	m = vm_page_grab(pmap->pm_pteobj, ptepindex,
	1318	VM_ALLOC_NORMAL \| VM_ALLOC_ZERO \| VM_ALLOC_RETRY);
	1319
	1320	KASSERT(m->queue == PQ_NONE,
	1321	("_pmap_allocpte: %p->queue != PQ_NONE", m));
	1322
	1323	/*
	1324	* Increment the hold count for the page we will be returning to
	1325	* the caller.
	1326	*/
	1327	m->hold_count++;
	1328
	1329	/*
	1330	* It is possible that someone else got in and mapped by the page
	1331	* directory page while we were blocked, if so just unbusy and
	1332	* return the held page.
	1333	*/
	1334	if ((ptepa = pmap->pm_pdir[ptepindex]) != 0) {
	1335	KKASSERT((ptepa & PG_FRAME) == VM_PAGE_TO_PHYS(m));
	1336	vm_page_wakeup(m);
	1337	return(m);
	1338	}
	1339
	1340	if (m->wire_count == 0)
	1341	vmstats.v_wire_count++;
	1342	m->wire_count++;
	1343
	1344
	1345	/*
	1346	* Map the pagetable page into the process address space, if
	1347	* it isn't already there.
	1348	*/
	1349
	1350	++pmap->pm_stats.resident_count;
	1351
	1352	ptepa = VM_PAGE_TO_PHYS(m);
	1353	pmap->pm_pdir[ptepindex] =
	1354	(pd_entry_t) (ptepa \| PG_U \| PG_RW \| PG_V \| PG_A \| PG_M);
	1355
	1356	/*
	1357	* Set the page table hint
	1358	*/
	1359	pmap->pm_ptphint = m;
	1360
	1361	/*
	1362	* Try to use the new mapping, but if we cannot, then
	1363	* do it with the routine that maps the page explicitly.
	1364	*/
	1365	if ((m->flags & PG_ZERO) == 0) {
	1366	if ((pmap->pm_pdir[PTDPTDI] & PG_FRAME) ==
	1367	(((pd_entry_t) PTDpde) & PG_FRAME)) {
	1368	pteva = UPT_MIN_ADDRESS + amd64_ptob(ptepindex);
	1369	bzero((caddr_t) pteva, PAGE_SIZE);
	1370	} else {
	1371	pmap_zero_page(ptepa);
	1372	}
	1373	}
	1374
	1375	m->valid = VM_PAGE_BITS_ALL;
	1376	vm_page_flag_clear(m, PG_ZERO);
	1377	vm_page_flag_set(m, PG_MAPPED);
	1378	vm_page_wakeup(m);
	1379
	1380	return m;
	1381	}
	1382
	1383	static vm_page_t
	1384	pmap_allocpte(pmap_t pmap, vm_offset_t va)
	1385	{
	1386	vm_pindex_t ptepindex;
	1387	vm_offset_t ptepa;
	1388	vm_page_t m;
	1389
	1390	/*
	1391	* Calculate pagetable page index
	1392	*/
	1393	ptepindex = va >> PDRSHIFT;
	1394
	1395	/*
	1396	* Get the page directory entry
	1397	*/
	1398	ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
	1399
	1400	/*
	1401	* This supports switching from a 4MB page to a
	1402	* normal 4K page.
	1403	*/
	1404	if (ptepa & PG_PS) {
	1405	pmap->pm_pdir[ptepindex] = 0;
	1406	ptepa = 0;
	1407	cpu_invltlb();
	1408	smp_invltlb();
	1409	}
	1410
	1411	/*
	1412	* If the page table page is mapped, we just increment the
	1413	* hold count, and activate it.
	1414	*/
	1415	if (ptepa) {
	1416	/*
	1417	* In order to get the page table page, try the
	1418	* hint first.
	1419	*/
	1420	if (pmap->pm_ptphint &&
	1421	(pmap->pm_ptphint->pindex == ptepindex)) {
	1422	m = pmap->pm_ptphint;
	1423	} else {
	1424	m = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
	1425	pmap->pm_ptphint = m;
	1426	}
	1427	m->hold_count++;
	1428	return m;
	1429	}
	1430	/*
	1431	* Here if the pte page isn't mapped, or if it has been deallocated.
	1432	*/
	1433	return _pmap_allocpte(pmap, ptepindex);
	1434	}
	1435
	1436
	1437	/***************************************************
	1438	* Pmap allocation/deallocation routines.
	1439	***************************************************/
	1440
	1441	/*
	1442	* Release any resources held by the given physical map.
	1443	* Called when a pmap initialized by pmap_pinit is being released.
	1444	* Should only be called if the map contains no valid mappings.
	1445	*/
	1446	static int pmap_release_callback(struct vm_page p, void data);
	1447
	1448	void
	1449	pmap_release(struct pmap *pmap)
	1450	{
	1451	vm_object_t object = pmap->pm_pteobj;
	1452	struct rb_vm_page_scan_info info;
	1453
	1454	KASSERT(pmap->pm_active == 0, ("pmap still active! %08x", pmap->pm_active));
	1455	#if defined(DIAGNOSTIC)
	1456	if (object->ref_count != 1)
	1457	panic("pmap_release: pteobj reference count != 1");
	1458	#endif
	1459
	1460	info.pmap = pmap;
	1461	info.object = object;
	1462	crit_enter();
	1463	TAILQ_REMOVE(&pmap_list, pmap, pm_pmnode);
	1464	crit_exit();
	1465
	1466	do {
	1467	crit_enter();
	1468	info.error = 0;
	1469	info.mpte = NULL;
	1470	info.limit = object->generation;
	1471
	1472	vm_page_rb_tree_RB_SCAN(&object->rb_memq, NULL,
	1473	pmap_release_callback, &info);
	1474	if (info.error == 0 && info.mpte) {
	1475	if (!pmap_release_free_page(pmap, info.mpte))
	1476	info.error = 1;
	1477	}
	1478	crit_exit();
	1479	} while (info.error);
	1480	}
	1481
	1482	static int
	1483	pmap_release_callback(struct vm_page p, void data)
	1484	{
	1485	struct rb_vm_page_scan_info *info = data;
	1486
	1487	if (p->pindex == PTDPTDI) {
	1488	info->mpte = p;
	1489	return(0);
	1490	}
	1491	if (!pmap_release_free_page(info->pmap, p)) {
	1492	info->error = 1;
	1493	return(-1);
	1494	}
	1495	if (info->object->generation != info->limit) {
	1496	info->error = 1;
	1497	return(-1);
	1498	}
	1499	return(0);
	1500	}
	1501
	1502	/*
	1503	* Grow the number of kernel page table entries, if needed.
	1504	*/
	1505
	1506	void
	1507	pmap_growkernel(vm_offset_t addr)
	1508	{
	1509	struct pmap *pmap;
	1510	vm_offset_t ptppaddr;
	1511	vm_page_t nkpg;
	1512	pd_entry_t newpdir;
	1513
	1514	crit_enter();
	1515	if (kernel_vm_end == 0) {
	1516	kernel_vm_end = KERNBASE;
	1517	nkpt = 0;
	1518	while (pdir_pde(PTD, kernel_vm_end)) {
	1519	kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
	1520	nkpt++;
	1521	}
	1522	}
	1523	addr = (addr + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
	1524	while (kernel_vm_end < addr) {
	1525	if (pdir_pde(PTD, kernel_vm_end)) {
	1526	kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & ~(PAGE_SIZE * NPTEPG - 1);
	1527	continue;
	1528	}
	1529
	1530	/*
	1531	* This index is bogus, but out of the way
	1532	*/
	1533	nkpg = vm_page_alloc(kptobj, nkpt,
	1534	VM_ALLOC_NORMAL \| VM_ALLOC_SYSTEM \| VM_ALLOC_INTERRUPT);
	1535	if (nkpg == NULL)
	1536	panic("pmap_growkernel: no memory to grow kernel");
	1537
	1538	vm_page_wire(nkpg);
	1539	ptppaddr = VM_PAGE_TO_PHYS(nkpg);
	1540	pmap_zero_page(ptppaddr);
	1541	newpdir = (pd_entry_t) (ptppaddr \| PG_V \| PG_RW \| PG_A \| PG_M);
	1542	pdir_pde(PTD, kernel_vm_end) = newpdir;
	1543	*pmap_pde(&kernel_pmap, kernel_vm_end) = newpdir;
	1544	nkpt++;
	1545
	1546	/*
	1547	* This update must be interlocked with pmap_pinit2.
	1548	*/
	1549	TAILQ_FOREACH(pmap, &pmap_list, pm_pmnode) {
	1550	*pmap_pde(pmap, kernel_vm_end) = newpdir;
	1551	}
	1552	kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
	1553	~(PAGE_SIZE * NPTEPG - 1);
	1554	}
	1555	crit_exit();
	1556	}
	1557
	1558	/*
	1559	* Retire the given physical map from service.
	1560	* Should only be called if the map contains
	1561	* no valid mappings.
	1562	*/
	1563	void
	1564	pmap_destroy(pmap_t pmap)
	1565	{
	1566	int count;
	1567
	1568	if (pmap == NULL)
	1569	return;
	1570
	1571	count = --pmap->pm_count;
	1572	if (count == 0) {
	1573	pmap_release(pmap);
	1574	panic("destroying a pmap is not yet implemented");
	1575	}
	1576	}
	1577
	1578	/*
	1579	* Add a reference to the specified pmap.
	1580	*/
	1581	void
	1582	pmap_reference(pmap_t pmap)
	1583	{
	1584	if (pmap != NULL) {
	1585	pmap->pm_count++;
	1586	}
	1587	}
	1588
	1589	/***************************************************
	1590	* page management routines.
	1591	***************************************************/
	1592
	1593	/*
	1594	* free the pv_entry back to the free list. This function may be
	1595	* called from an interrupt.
	1596	*/
	1597	static PMAP_INLINE void
	1598	free_pv_entry(pv_entry_t pv)
	1599	{
	1600	pv_entry_count--;
	1601	zfree(pvzone, pv);
	1602	}
	1603
	1604	/*
	1605	* get a new pv_entry, allocating a block from the system
	1606	* when needed. This function may be called from an interrupt.
	1607	*/
	1608	static pv_entry_t
	1609	get_pv_entry(void)
	1610	{
	1611	pv_entry_count++;
	1612	if (pv_entry_high_water &&
	1613	(pv_entry_count > pv_entry_high_water) &&
	1614	(pmap_pagedaemon_waken == 0)) {
	1615	pmap_pagedaemon_waken = 1;
	1616	wakeup (&vm_pages_needed);
	1617	}
	1618	return zalloc(pvzone);
	1619	}
	1620
	1621	/*
	1622	* This routine is very drastic, but can save the system
	1623	* in a pinch.
	1624	*/
	1625	void
	1626	pmap_collect(void)
	1627	{
	1628	int i;
	1629	vm_page_t m;
	1630	static int warningdone=0;
	1631
	1632	if (pmap_pagedaemon_waken == 0)
	1633	return;
	1634	pmap_pagedaemon_waken = 0;
	1635
	1636	if (warningdone < 5) {
	1637	kprintf("pmap_collect: collecting pv entries -- suggest increasing PMAP_SHPGPERPROC\n");
	1638	warningdone++;
	1639	}
	1640
	1641	for(i = 0; i < vm_page_array_size; i++) {
	1642	m = &vm_page_array[i];
	1643	if (m->wire_count \|\| m->hold_count \|\| m->busy \|\|
	1644	(m->flags & PG_BUSY))
	1645	continue;
	1646	pmap_remove_all(m);
	1647	}
	1648	}
	1649
	1650
	1651	/*
	1652	* If it is the first entry on the list, it is actually
	1653	* in the header and we must copy the following entry up
	1654	* to the header. Otherwise we must search the list for
	1655	* the entry. In either case we free the now unused entry.
	1656	*/
	1657	static int
	1658	pmap_remove_entry(struct pmap *pmap, vm_page_t m,
	1659	vm_offset_t va, pmap_inval_info_t info)
	1660	{
	1661	pv_entry_t pv;
	1662	int rtval;
	1663
	1664	crit_enter();
	1665	if (m->md.pv_list_count < pmap->pm_stats.resident_count) {
	1666	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	1667	if (pmap == pv->pv_pmap && va == pv->pv_va)
	1668	break;
	1669	}
	1670	} else {
	1671	TAILQ_FOREACH(pv, &pmap->pm_pvlist, pv_plist) {
	1672	if (va == pv->pv_va)
	1673	break;
	1674	}
	1675	}
	1676
	1677	rtval = 0;
	1678	if (pv) {
	1679	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
	1680	m->md.pv_list_count--;
	1681	if (TAILQ_EMPTY(&m->md.pv_list))
	1682	vm_page_flag_clear(m, PG_MAPPED \| PG_WRITEABLE);
	1683	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
	1684	++pmap->pm_generation;
	1685	rtval = pmap_unuse_pt(pmap, va, pv->pv_ptem, info);
	1686	free_pv_entry(pv);
	1687	}
	1688	crit_exit();
	1689	return rtval;
	1690	}
	1691
	1692	/*
	1693	* Create a pv entry for page at pa for
	1694	* (pmap, va).
	1695	*/
	1696	static void
	1697	pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t mpte, vm_page_t m)
	1698	{
	1699	pv_entry_t pv;
	1700
	1701	crit_enter();
	1702	pv = get_pv_entry();
	1703	pv->pv_va = va;
	1704	pv->pv_pmap = pmap;
	1705	pv->pv_ptem = mpte;
	1706
	1707	TAILQ_INSERT_TAIL(&pmap->pm_pvlist, pv, pv_plist);
	1708	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
	1709	m->md.pv_list_count++;
	1710
	1711	crit_exit();
	1712	}
	1713
	1714	/*
	1715	* pmap_remove_pte: do the things to unmap a page in a process
	1716	*/
	1717	static int
	1718	pmap_remove_pte(struct pmap pmap, pt_entry_t ptq, vm_offset_t va,
	1719	pmap_inval_info_t info)
	1720	{
	1721	pt_entry_t oldpte;
	1722	vm_page_t m;
	1723
	1724	pmap_inval_add(info, pmap, va);
	1725	oldpte = pte_load_clear(ptq);
	1726	if (oldpte & PG_W)
	1727	pmap->pm_stats.wired_count -= 1;
	1728	/*
	1729	* Machines that don't support invlpg, also don't support
	1730	* PG_G. XXX PG_G is disabled for SMP so don't worry about
	1731	* the SMP case.
	1732	*/
	1733	if (oldpte & PG_G)
	1734	cpu_invlpg((void *)va);
	1735	KKASSERT(pmap->pm_stats.resident_count > 0);
	1736	--pmap->pm_stats.resident_count;
	1737	if (oldpte & PG_MANAGED) {
	1738	m = PHYS_TO_VM_PAGE(oldpte);
	1739	if (oldpte & PG_M) {
	1740	#if defined(PMAP_DIAGNOSTIC)
	1741	if (pmap_nw_modified((pt_entry_t) oldpte)) {
	1742	kprintf(
	1743	"pmap_remove: modified page not writable: va: 0x%x, pte: 0x%x\n",
	1744	va, oldpte);
	1745	}
	1746	#endif
	1747	if (pmap_track_modified(va))
	1748	vm_page_dirty(m);
	1749	}
	1750	if (oldpte & PG_A)
	1751	vm_page_flag_set(m, PG_REFERENCED);
	1752	return pmap_remove_entry(pmap, m, va, info);
	1753	} else {
	1754	return pmap_unuse_pt(pmap, va, NULL, info);
	1755	}
	1756
	1757	return 0;
	1758	}
	1759
	1760	/*
	1761	* pmap_remove_page:
	1762	*
	1763	* Remove a single page from a process address space.
	1764	*
	1765	* This function may not be called from an interrupt if the pmap is
	1766	* not kernel_pmap.
	1767	*/
	1768	static void
	1769	pmap_remove_page(struct pmap *pmap, vm_offset_t va, pmap_inval_info_t info)
	1770	{
	1771	pt_entry_t *ptq;
	1772
	1773	/*
	1774	* if there is no pte for this address, just skip it!!! Otherwise
	1775	* get a local va for mappings for this pmap and remove the entry.
	1776	*/
	1777	if (*pmap_pde(pmap, va) != 0) {
	1778	ptq = get_ptbase(pmap) + amd64_btop(va);
	1779	if (*ptq) {
	1780	pmap_remove_pte(pmap, ptq, va, info);
	1781	}
	1782	}
	1783	}
	1784
	1785	/*
	1786	* pmap_remove:
	1787	*
	1788	* Remove the given range of addresses from the specified map.
	1789	*
	1790	* It is assumed that the start and end are properly
	1791	* rounded to the page size.
	1792	*
	1793	* This function may not be called from an interrupt if the pmap is
	1794	* not kernel_pmap.
	1795	*/
	1796	void
	1797	pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
	1798	{
	1799	pt_entry_t *ptbase;
	1800	vm_offset_t pdnxt;
	1801	vm_offset_t ptpaddr;
	1802	vm_offset_t sindex, eindex;
	1803	struct pmap_inval_info info;
	1804
	1805	if (pmap == NULL)
	1806	return;
	1807
	1808	if (pmap->pm_stats.resident_count == 0)
	1809	return;
	1810
	1811	pmap_inval_init(&info);
	1812
	1813	/*
	1814	* special handling of removing one page. a very
	1815	* common operation and easy to short circuit some
	1816	* code.
	1817	*/
	1818	if (((sva + PAGE_SIZE) == eva) &&
	1819	((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
	1820	pmap_remove_page(pmap, sva, &info);
	1821	pmap_inval_flush(&info);
	1822	return;
	1823	}
	1824
	1825	/*
	1826	* Get a local virtual address for the mappings that are being
	1827	* worked with.
	1828	*/
	1829	sindex = amd64_btop(sva);
	1830	eindex = amd64_btop(eva);
	1831
	1832	for (; sindex < eindex; sindex = pdnxt) {
	1833	vm_pindex_t pdirindex;
	1834
	1835	/*
	1836	* Calculate index for next page table.
	1837	*/
	1838	pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
	1839	if (pmap->pm_stats.resident_count == 0)
	1840	break;
	1841
	1842	pdirindex = sindex / NPDEPG;
	1843	if (((ptpaddr = pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
	1844	pmap_inval_add(&info, pmap, -1);
	1845	pmap->pm_pdir[pdirindex] = 0;
	1846	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
	1847	continue;
	1848	}
	1849
	1850	/*
	1851	* Weed out invalid mappings. Note: we assume that the page
	1852	* directory table is always allocated, and in kernel virtual.
	1853	*/
	1854	if (ptpaddr == 0)
	1855	continue;
	1856
	1857	/*
	1858	* Limit our scan to either the end of the va represented
	1859	* by the current page table page, or to the end of the
	1860	* range being removed.
	1861	*/
	1862	if (pdnxt > eindex) {
	1863	pdnxt = eindex;
	1864	}
	1865
	1866	/*
	1867	* NOTE: pmap_remove_pte() can block.
	1868	*/
	1869	for (; sindex != pdnxt; sindex++) {
	1870	vm_offset_t va;
	1871
	1872	ptbase = get_ptbase(pmap);
	1873	if (ptbase[sindex] == 0)
	1874	continue;
	1875	va = amd64_ptob(sindex);
	1876	if (pmap_remove_pte(pmap, ptbase + sindex, va, &info))
	1877	break;
	1878	}
	1879	}
	1880	pmap_inval_flush(&info);
	1881	}
	1882
	1883	/*
	1884	* pmap_remove_all:
	1885	*
	1886	* Removes this physical page from all physical maps in which it resides.
	1887	* Reflects back modify bits to the pager.
	1888	*
	1889	* This routine may not be called from an interrupt.
	1890	*/
	1891
	1892	static void
	1893	pmap_remove_all(vm_page_t m)
	1894	{
	1895	struct pmap_inval_info info;
	1896	pt_entry_t *pte, tpte;
	1897	pv_entry_t pv;
	1898
	1899	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	1900	return;
	1901
	1902	pmap_inval_init(&info);
	1903	crit_enter();
	1904	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
	1905	KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0);
	1906	--pv->pv_pmap->pm_stats.resident_count;
	1907
	1908	pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
	1909	pmap_inval_add(&info, pv->pv_pmap, pv->pv_va);
	1910	tpte = pte_load_clear(pte);
	1911
	1912	if (tpte & PG_W)
	1913	pv->pv_pmap->pm_stats.wired_count--;
	1914
	1915	if (tpte & PG_A)
	1916	vm_page_flag_set(m, PG_REFERENCED);
	1917
	1918	/*
	1919	* Update the vm_page_t clean and reference bits.
	1920	*/
	1921	if (tpte & PG_M) {
	1922	#if defined(PMAP_DIAGNOSTIC)
	1923	if (pmap_nw_modified((pt_entry_t) tpte)) {
	1924	kprintf(
	1925	"pmap_remove_all: modified page not writable: va: 0x%x, pte: 0x%x\n",
	1926	pv->pv_va, tpte);
	1927	}
	1928	#endif
	1929	if (pmap_track_modified(pv->pv_va))
	1930	vm_page_dirty(m);
	1931	}
	1932	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
	1933	TAILQ_REMOVE(&pv->pv_pmap->pm_pvlist, pv, pv_plist);
	1934	++pv->pv_pmap->pm_generation;
	1935	m->md.pv_list_count--;
	1936	if (TAILQ_EMPTY(&m->md.pv_list))
	1937	vm_page_flag_clear(m, PG_MAPPED \| PG_WRITEABLE);
	1938	pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info);
	1939	free_pv_entry(pv);
	1940	}
	1941	crit_exit();
	1942	KKASSERT((m->flags & (PG_MAPPED\|PG_WRITEABLE)) == 0);
	1943	pmap_inval_flush(&info);
	1944	}
	1945
	1946	/*
	1947	* pmap_protect:
	1948	*
	1949	* Set the physical protection on the specified range of this map
	1950	* as requested.
	1951	*
	1952	* This function may not be called from an interrupt if the map is
	1953	* not the kernel_pmap.
	1954	*/
	1955	void
	1956	pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
	1957	{
	1958	pt_entry_t *ptbase;
	1959	vm_offset_t pdnxt, ptpaddr;
	1960	vm_pindex_t sindex, eindex;
	1961	pmap_inval_info info;
	1962
	1963	if (pmap == NULL)
	1964	return;
	1965
	1966	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
	1967	pmap_remove(pmap, sva, eva);
	1968	return;
	1969	}
	1970
	1971	if (prot & VM_PROT_WRITE)
	1972	return;
	1973
	1974	pmap_inval_init(&info);
	1975
	1976	ptbase = get_ptbase(pmap);
	1977
	1978	sindex = amd64_btop(sva);
	1979	eindex = amd64_btop(eva);
	1980
	1981	for (; sindex < eindex; sindex = pdnxt) {
	1982
	1983	vm_pindex_t pdirindex;
	1984
	1985	pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
	1986
	1987	pdirindex = sindex / NPDEPG;
	1988	if (((ptpaddr = pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
	1989	pmap_inval_add(&info, pmap, -1);
	1990	pmap->pm_pdir[pdirindex] &= ~(PG_M\|PG_RW);
	1991	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
	1992	continue;
	1993	}
	1994
	1995	/*
	1996	* Weed out invalid mappings. Note: we assume that the page
	1997	* directory table is always allocated, and in kernel virtual.
	1998	*/
	1999	if (ptpaddr == 0)
	2000	continue;
	2001
	2002	if (pdnxt > eindex) {
	2003	pdnxt = eindex;
	2004	}
	2005
	2006	for (; sindex != pdnxt; sindex++) {
	2007
	2008	pt_entry_t pbits;
	2009	vm_page_t m;
	2010
	2011	/*
	2012	* XXX non-optimal. Note also that there can be
	2013	* no pmap_inval_flush() calls until after we modify
	2014	* ptbase[sindex] (or otherwise we have to do another
	2015	* pmap_inval_add() call).
	2016	*/
	2017	pmap_inval_add(&info, pmap, amd64_ptob(sindex));
	2018	pbits = ptbase[sindex];
	2019
	2020	if (pbits & PG_MANAGED) {
	2021	m = NULL;
	2022	if (pbits & PG_A) {
	2023	m = PHYS_TO_VM_PAGE(pbits);
	2024	vm_page_flag_set(m, PG_REFERENCED);
	2025	pbits &= ~PG_A;
	2026	}
	2027	if (pbits & PG_M) {
	2028	if (pmap_track_modified(amd64_ptob(sindex))) {
	2029	if (m == NULL)
	2030	m = PHYS_TO_VM_PAGE(pbits);
	2031	vm_page_dirty(m);
	2032	pbits &= ~PG_M;
	2033	}
	2034	}
	2035	}
	2036
	2037	pbits &= ~PG_RW;
	2038
	2039	if (pbits != ptbase[sindex]) {
	2040	ptbase[sindex] = pbits;
	2041	}
	2042	}
	2043	}
	2044	pmap_inval_flush(&info);
	2045	}
	2046
	2047	/*
	2048	* Insert the given physical page (p) at
	2049	* the specified virtual address (v) in the
	2050	* target physical map with the protection requested.
	2051	*
	2052	* If specified, the page will be wired down, meaning
	2053	* that the related pte can not be reclaimed.
	2054	*
	2055	* NB: This is the only routine which MAY NOT lazy-evaluate
	2056	* or lose information. That is, this routine must actually
	2057	* insert this page into the given map NOW.
	2058	*/
	2059	void
	2060	pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
	2061	boolean_t wired)
	2062	{
	2063	vm_paddr_t pa;
	2064	pt_entry_t *pte;
	2065	vm_paddr_t opa;
	2066	vm_offset_t origpte, newpte;
	2067	vm_page_t mpte;
	2068	pmap_inval_info info;
	2069
	2070	if (pmap == NULL)
	2071	return;
	2072
	2073	va &= PG_FRAME;
	2074	#ifdef PMAP_DIAGNOSTIC
	2075	if (va >= KvaEnd)
	2076	panic("pmap_enter: toobig");
	2077	if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS))
	2078	panic("pmap_enter: invalid to pmap_enter page table pages (va: 0x%x)", va);
	2079	#endif
	2080	if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
	2081	kprintf("Warning: pmap_enter called on UVA with kernel_pmap\n");
	2082	print_backtrace();
	2083	}
	2084	if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
	2085	kprintf("Warning: pmap_enter called on KVA without kernel_pmap\n");
	2086	print_backtrace();
	2087	}
	2088
	2089	/*
	2090	* In the case that a page table page is not
	2091	* resident, we are creating it here.
	2092	*/
	2093	if (va < UPT_MIN_ADDRESS)
	2094	mpte = pmap_allocpte(pmap, va);
	2095	else
	2096	mpte = NULL;
	2097
	2098	pmap_inval_init(&info);
	2099	pte = pmap_pte(pmap, va);
	2100
	2101	/*
	2102	* Page Directory table entry not valid, we need a new PT page
	2103	*/
	2104	if (pte == NULL) {
	2105	panic("pmap_enter: invalid page directory pdir=%x, va=0x%x\n",
	2106	pmap->pm_pdir[PTDPTDI], va);
	2107	}
	2108
	2109	pa = VM_PAGE_TO_PHYS(m) & PG_FRAME;
	2110	origpte = (vm_offset_t )pte;
	2111	opa = origpte & PG_FRAME;
	2112
	2113	if (origpte & PG_PS)
	2114	panic("pmap_enter: attempted pmap_enter on 4MB page");
	2115
	2116	/*
	2117	* Mapping has not changed, must be protection or wiring change.
	2118	*/
	2119	if (origpte && (opa == pa)) {
	2120	/*
	2121	* Wiring change, just update stats. We don't worry about
	2122	* wiring PT pages as they remain resident as long as there
	2123	* are valid mappings in them. Hence, if a user page is wired,
	2124	* the PT page will be also.
	2125	*/
	2126	if (wired && ((origpte & PG_W) == 0))
	2127	pmap->pm_stats.wired_count++;
	2128	else if (!wired && (origpte & PG_W))
	2129	pmap->pm_stats.wired_count--;
	2130
	2131	#if defined(PMAP_DIAGNOSTIC)
	2132	if (pmap_nw_modified((pt_entry_t) origpte)) {
	2133	kprintf(
	2134	"pmap_enter: modified page not writable: va: 0x%x, pte: 0x%x\n",
	2135	va, origpte);
	2136	}
	2137	#endif
	2138
	2139	/*
	2140	* Remove the extra pte reference. Note that we cannot
	2141	* optimize the RO->RW case because we have adjusted the
	2142	* wiring count above and may need to adjust the wiring
	2143	* bits below.
	2144	*/
	2145	if (mpte)
	2146	mpte->hold_count--;
	2147
	2148	/*
	2149	* We might be turning off write access to the page,
	2150	* so we go ahead and sense modify status.
	2151	*/
	2152	if (origpte & PG_MANAGED) {
	2153	if ((origpte & PG_M) && pmap_track_modified(va)) {
	2154	vm_page_t om;
	2155	om = PHYS_TO_VM_PAGE(opa);
	2156	vm_page_dirty(om);
	2157	}
	2158	pa \|= PG_MANAGED;
	2159	KKASSERT(m->flags & PG_MAPPED);
	2160	}
	2161	goto validate;
	2162	}
	2163	/*
	2164	* Mapping has changed, invalidate old range and fall through to
	2165	* handle validating new mapping.
	2166	*/
	2167	if (opa) {
	2168	int err;
	2169	err = pmap_remove_pte(pmap, pte, va, &info);
	2170	if (err)
	2171	panic("pmap_enter: pte vanished, va: 0x%x", va);
	2172	}
	2173
	2174	/*
	2175	* Enter on the PV list if part of our managed memory. Note that we
	2176	* raise IPL while manipulating pv_table since pmap_enter can be
	2177	* called at interrupt time.
	2178	*/
	2179	if (pmap_initialized &&
	2180	(m->flags & (PG_FICTITIOUS\|PG_UNMANAGED)) == 0) {
	2181	pmap_insert_entry(pmap, va, mpte, m);
	2182	pa \|= PG_MANAGED;
	2183	vm_page_flag_set(m, PG_MAPPED);
	2184	}
	2185
	2186	/*
	2187	* Increment counters
	2188	*/
	2189	++pmap->pm_stats.resident_count;
	2190	if (wired)
	2191	pmap->pm_stats.wired_count++;
	2192
	2193	validate:
	2194	/*
	2195	* Now validate mapping with desired protection/wiring.
	2196	*/
	2197	newpte = (vm_offset_t) (pa \| pte_prot(pmap, prot) \| PG_V);
	2198
	2199	if (wired)
	2200	newpte \|= PG_W;
	2201	if (va < UPT_MIN_ADDRESS)
	2202	newpte \|= PG_U;
	2203	if (pmap == &kernel_pmap)
	2204	newpte \|= pgeflag;
	2205
	2206	/*
	2207	* if the mapping or permission bits are different, we need
	2208	* to update the pte.
	2209	*/
	2210	if ((origpte & ~(PG_M\|PG_A)) != newpte) {
	2211	pmap_inval_add(&info, pmap, va);
	2212	*pte = newpte \| PG_A;
	2213	if (newpte & PG_RW)
	2214	vm_page_flag_set(m, PG_WRITEABLE);
	2215	}
	2216	KKASSERT((newpte & PG_MANAGED) == 0 \|\| (m->flags & PG_MAPPED));
	2217	pmap_inval_flush(&info);
	2218	}
	2219
	2220	/*
	2221	* This code works like pmap_enter() but assumes VM_PROT_READ and not-wired.
	2222	* This code also assumes that the pmap has no pre-existing entry for this
	2223	* VA.
	2224	*
	2225	* This code currently may only be used on user pmaps, not kernel_pmap.
	2226	*/
	2227	static void
	2228	pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
	2229	{
	2230	pt_entry_t *pte;
	2231	vm_paddr_t pa;
	2232	vm_page_t mpte;
	2233	vm_pindex_t ptepindex;
	2234	vm_offset_t ptepa;
	2235	pmap_inval_info info;
	2236
	2237	pmap_inval_init(&info);
	2238
	2239	if (va < UPT_MAX_ADDRESS && pmap == &kernel_pmap) {
	2240	kprintf("Warning: pmap_enter_quick called on UVA with kernel_pmap\n");
	2241	print_backtrace();
	2242	}
	2243	if (va >= UPT_MAX_ADDRESS && pmap != &kernel_pmap) {
	2244	kprintf("Warning: pmap_enter_quick called on KVA without kernel_pmap\n");
	2245	print_backtrace();
	2246	}
	2247
	2248	KKASSERT(va < UPT_MIN_ADDRESS); /* assert used on user pmaps only */
	2249
	2250	/*
	2251	* Calculate the page table page (mpte), allocating it if necessary.
	2252	*
	2253	* A held page table page (mpte), or NULL, is passed onto the
	2254	* section following.
	2255	*/
	2256	if (va < UPT_MIN_ADDRESS) {
	2257	/*
	2258	* Calculate pagetable page index
	2259	*/
	2260	ptepindex = va >> PDRSHIFT;
	2261
	2262	do {
	2263	/*
	2264	* Get the page directory entry
	2265	*/
	2266	ptepa = (vm_offset_t) pmap->pm_pdir[ptepindex];
	2267
	2268	/*
	2269	* If the page table page is mapped, we just increment
	2270	* the hold count, and activate it.
	2271	*/
	2272	if (ptepa) {
	2273	if (ptepa & PG_PS)
	2274	panic("pmap_enter_quick: unexpected mapping into 4MB page");
	2275	if (pmap->pm_ptphint &&
	2276	(pmap->pm_ptphint->pindex == ptepindex)) {
	2277	mpte = pmap->pm_ptphint;
	2278	} else {
	2279	mpte = pmap_page_lookup( pmap->pm_pteobj, ptepindex);
	2280	pmap->pm_ptphint = mpte;
	2281	}
	2282	if (mpte)
	2283	mpte->hold_count++;
	2284	} else {
	2285	mpte = _pmap_allocpte(pmap, ptepindex);
	2286	}
	2287	} while (mpte == NULL);
	2288	} else {
	2289	mpte = NULL;
	2290	/* this code path is not yet used */
	2291	}
	2292
	2293	/*
	2294	* With a valid (and held) page directory page, we can just use
	2295	* vtopte() to get to the pte. If the pte is already present
	2296	* we do not disturb it.
	2297	*/
	2298	pte = vtopte(va);
	2299	if (*pte & PG_V) {
	2300	if (mpte)
	2301	pmap_unwire_pte_hold(pmap, mpte, &info);
	2302	pa = VM_PAGE_TO_PHYS(m);
	2303	KKASSERT(((*pte ^ pa) & PG_FRAME) == 0);
	2304	return;
	2305	}
	2306
	2307	/*
	2308	* Enter on the PV list if part of our managed memory
	2309	*/
	2310	if ((m->flags & (PG_FICTITIOUS\|PG_UNMANAGED)) == 0) {
	2311	pmap_insert_entry(pmap, va, mpte, m);
	2312	vm_page_flag_set(m, PG_MAPPED);
	2313	}
	2314
	2315	/*
	2316	* Increment counters
	2317	*/
	2318	++pmap->pm_stats.resident_count;
	2319
	2320	pa = VM_PAGE_TO_PHYS(m);
	2321
	2322	/*
	2323	* Now validate mapping with RO protection
	2324	*/
	2325	if (m->flags & (PG_FICTITIOUS\|PG_UNMANAGED))
	2326	*pte = pa \| PG_V \| PG_U;
	2327	else
	2328	*pte = pa \| PG_V \| PG_U \| PG_MANAGED;
	2329	/* pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */
	2330	pmap_inval_flush(&info);
	2331	}
	2332
	2333	/*
	2334	* Make a temporary mapping for a physical address. This is only intended
	2335	* to be used for panic dumps.
	2336	*/
	2337	void *
	2338	pmap_kenter_temporary(vm_paddr_t pa, int i)
	2339	{
	2340	pmap_kenter((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa);
	2341	return ((void *)crashdumpmap);
	2342	}
	2343
	2344	#define MAX_INIT_PT (96)
	2345
	2346	/*
	2347	* This routine preloads the ptes for a given object into the specified pmap.
	2348	* This eliminates the blast of soft faults on process startup and
	2349	* immediately after an mmap.
	2350	*/
	2351	static int pmap_object_init_pt_callback(vm_page_t p, void *data);
	2352
	2353	void
	2354	pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
	2355	vm_object_t object, vm_pindex_t pindex,
	2356	vm_size_t size, int limit)
	2357	{
	2358	struct rb_vm_page_scan_info info;
	2359	struct lwp *lp;
	2360	int psize;
	2361
	2362	/*
	2363	* We can't preinit if read access isn't set or there is no pmap
	2364	* or object.
	2365	*/
	2366	if ((prot & VM_PROT_READ) == 0 \|\| pmap == NULL \|\| object == NULL)
	2367	return;
	2368
	2369	/*
	2370	* We can't preinit if the pmap is not the current pmap
	2371	*/
	2372	lp = curthread->td_lwp;
	2373	if (lp == NULL \|\| pmap != vmspace_pmap(lp->lwp_vmspace))
	2374	return;
	2375
	2376	psize = amd64_btop(size);
	2377
	2378	if ((object->type != OBJT_VNODE) \|\|
	2379	((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) &&
	2380	(object->resident_page_count > MAX_INIT_PT))) {
	2381	return;
	2382	}
	2383
	2384	if (psize + pindex > object->size) {
	2385	if (object->size < pindex)
	2386	return;
	2387	psize = object->size - pindex;
	2388	}
	2389
	2390	if (psize == 0)
	2391	return;
	2392
	2393	/*
	2394	* Use a red-black scan to traverse the requested range and load
	2395	* any valid pages found into the pmap.
	2396	*
	2397	* We cannot safely scan the object's memq unless we are in a
	2398	* critical section since interrupts can remove pages from objects.
	2399	*/
	2400	info.start_pindex = pindex;
	2401	info.end_pindex = pindex + psize - 1;
	2402	info.limit = limit;
	2403	info.mpte = NULL;
	2404	info.addr = addr;
	2405	info.pmap = pmap;
	2406
	2407	crit_enter();
	2408	vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
	2409	pmap_object_init_pt_callback, &info);
	2410	crit_exit();
	2411	}
	2412
	2413	static
	2414	int
	2415	pmap_object_init_pt_callback(vm_page_t p, void *data)
	2416	{
	2417	struct rb_vm_page_scan_info *info = data;
	2418	vm_pindex_t rel_index;
	2419	/*
	2420	* don't allow an madvise to blow away our really
	2421	* free pages allocating pv entries.
	2422	*/
	2423	if ((info->limit & MAP_PREFAULT_MADVISE) &&
	2424	vmstats.v_free_count < vmstats.v_free_reserved) {
	2425	return(-1);
	2426	}
	2427	if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
	2428	(p->busy == 0) && (p->flags & (PG_BUSY \| PG_FICTITIOUS)) == 0) {
	2429	if ((p->queue - p->pc) == PQ_CACHE)
	2430	vm_page_deactivate(p);
	2431	vm_page_busy(p);
	2432	rel_index = p->pindex - info->start_pindex;
	2433	pmap_enter_quick(info->pmap,
	2434	info->addr + amd64_ptob(rel_index), p);
	2435	vm_page_wakeup(p);
	2436	}
	2437	return(0);
	2438	}
	2439
	2440	/*
	2441	* pmap_prefault provides a quick way of clustering pagefaults into a
	2442	* processes address space. It is a "cousin" of pmap_object_init_pt,
	2443	* except it runs at page fault time instead of mmap time.
	2444	*/
	2445	#define PFBAK 4
	2446	#define PFFOR 4
	2447	#define PAGEORDER_SIZE (PFBAK+PFFOR)
	2448
	2449	static int pmap_prefault_pageorder[] = {
	2450	-PAGE_SIZE, PAGE_SIZE,
	2451	-2 * PAGE_SIZE, 2 * PAGE_SIZE,
	2452	-3 * PAGE_SIZE, 3 * PAGE_SIZE,
	2453	-4 * PAGE_SIZE, 4 * PAGE_SIZE
	2454	};
	2455
	2456	void
	2457	pmap_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry)
	2458	{
	2459	int i;
	2460	vm_offset_t starta;
	2461	vm_offset_t addr;
	2462	vm_pindex_t pindex;
	2463	vm_page_t m;
	2464	vm_object_t object;
	2465	struct lwp *lp;
	2466
	2467	/*
	2468	* We do not currently prefault mappings that use virtual page
	2469	* tables. We do not prefault foreign pmaps.
	2470	*/
	2471	if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
	2472	return;
	2473	lp = curthread->td_lwp;
	2474	if (lp == NULL \|\| (pmap != vmspace_pmap(lp->lwp_vmspace)))
	2475	return;
	2476
	2477	object = entry->object.vm_object;
	2478
	2479	starta = addra - PFBAK * PAGE_SIZE;
	2480	if (starta < entry->start)
	2481	starta = entry->start;
	2482	else if (starta > addra)
	2483	starta = 0;
	2484
	2485	/*
	2486	* critical section protection is required to maintain the
	2487	* page/object association, interrupts can free pages and remove
	2488	* them from their objects.
	2489	*/
	2490	crit_enter();
	2491	for (i = 0; i < PAGEORDER_SIZE; i++) {
	2492	vm_object_t lobject;
	2493	pt_entry_t *pte;
	2494
	2495	addr = addra + pmap_prefault_pageorder[i];
	2496	if (addr > addra + (PFFOR * PAGE_SIZE))
	2497	addr = 0;
	2498
	2499	if (addr < starta \|\| addr >= entry->end)
	2500	continue;
	2501
	2502	if ((*pmap_pde(pmap, addr)) == 0)
	2503	continue;
	2504
	2505	pte = vtopte(addr);
	2506	if (*pte)
	2507	continue;
	2508
	2509	pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
	2510	lobject = object;
	2511
	2512	for (m = vm_page_lookup(lobject, pindex);
	2513	(!m && (lobject->type == OBJT_DEFAULT) &&
	2514	(lobject->backing_object));
	2515	lobject = lobject->backing_object
	2516	) {
	2517	if (lobject->backing_object_offset & PAGE_MASK)
	2518	break;
	2519	pindex += (lobject->backing_object_offset >> PAGE_SHIFT);
	2520	m = vm_page_lookup(lobject->backing_object, pindex);
	2521	}
	2522
	2523	/*
	2524	* give-up when a page is not in memory
	2525	*/
	2526	if (m == NULL)
	2527	break;
	2528
	2529	if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
	2530	(m->busy == 0) &&
	2531	(m->flags & (PG_BUSY \| PG_FICTITIOUS)) == 0) {
	2532
	2533	if ((m->queue - m->pc) == PQ_CACHE) {
	2534	vm_page_deactivate(m);
	2535	}
	2536	vm_page_busy(m);
	2537	pmap_enter_quick(pmap, addr, m);
	2538	vm_page_wakeup(m);
	2539	}
	2540	}
	2541	crit_exit();
	2542	}
	2543
	2544	/*
	2545	* Routine: pmap_change_wiring
	2546	* Function: Change the wiring attribute for a map/virtual-address
	2547	* pair.
	2548	* In/out conditions:
	2549	* The mapping must already exist in the pmap.
	2550	*/
	2551	void
	2552	pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired)
	2553	{
	2554	pt_entry_t *pte;
	2555
	2556	if (pmap == NULL)
	2557	return;
	2558
	2559	pte = pmap_pte(pmap, va);
	2560
	2561	if (wired && !pmap_pte_w(pte))
	2562	pmap->pm_stats.wired_count++;
	2563	else if (!wired && pmap_pte_w(pte))
	2564	pmap->pm_stats.wired_count--;
	2565
	2566	/*
	2567	* Wiring is not a hardware characteristic so there is no need to
	2568	* invalidate TLB. However, in an SMP environment we must use
	2569	* a locked bus cycle to update the pte (if we are not using
	2570	* the pmap_inval_*() API that is)... it's ok to do this for simple
	2571	* wiring changes.
	2572	*/
	2573	#ifdef SMP
	2574	if (wired)
	2575	atomic_set_int(pte, PG_W);
	2576	else
	2577	atomic_clear_int(pte, PG_W);
	2578	#else
	2579	if (wired)
	2580	atomic_set_int_nonlocked(pte, PG_W);
	2581	else
	2582	atomic_clear_int_nonlocked(pte, PG_W);
	2583	#endif
	2584	}
	2585
	2586
	2587
	2588	/*
	2589	* Copy the range specified by src_addr/len
	2590	* from the source map to the range dst_addr/len
	2591	* in the destination map.
	2592	*
	2593	* This routine is only advisory and need not do anything.
	2594	*/
	2595	void
	2596	pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
	2597	vm_size_t len, vm_offset_t src_addr)
	2598	{
	2599	pmap_inval_info info;
	2600	vm_offset_t addr;
	2601	vm_offset_t end_addr = src_addr + len;
	2602	vm_offset_t pdnxt;
	2603	pd_entry_t src_frame, dst_frame;
	2604	vm_page_t m;
	2605
	2606	if (dst_addr != src_addr)
	2607	return;
	2608	/*
	2609	* XXX BUGGY. Amoung other things srcmpte is assumed to remain
	2610	* valid through blocking calls, and that's just not going to
	2611	* be the case.
	2612	*
	2613	* FIXME!
	2614	*/
	2615	return;
	2616
	2617	src_frame = src_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
	2618	if (src_frame != (PTDpde & PG_FRAME)) {
	2619	return;
	2620	}
	2621
	2622	dst_frame = dst_pmap->pm_pdir[PTDPTDI] & PG_FRAME;
	2623	if (dst_frame != (APTDpde & PG_FRAME)) {
	2624	APTDpde = (pd_entry_t) (dst_frame \| PG_RW \| PG_V);
	2625	/* The page directory is not shared between CPUs */
	2626	cpu_invltlb();
	2627	}
	2628	pmap_inval_init(&info);
	2629	pmap_inval_add(&info, dst_pmap, -1);
	2630	pmap_inval_add(&info, src_pmap, -1);
	2631
	2632	/*
	2633	* critical section protection is required to maintain the page/object
	2634	* association, interrupts can free pages and remove them from
	2635	* their objects.
	2636	*/
	2637	crit_enter();
	2638	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
	2639	pt_entry_t src_pte, dst_pte;
	2640	vm_page_t dstmpte, srcmpte;
	2641	vm_offset_t srcptepaddr;
	2642	vm_pindex_t ptepindex;
	2643
	2644	if (addr >= UPT_MIN_ADDRESS)
	2645	panic("pmap_copy: invalid to pmap_copy page tables\n");
	2646
	2647	/*
	2648	* Don't let optional prefaulting of pages make us go
	2649	* way below the low water mark of free pages or way
	2650	* above high water mark of used pv entries.
	2651	*/
	2652	if (vmstats.v_free_count < vmstats.v_free_reserved \|\|
	2653	pv_entry_count > pv_entry_high_water)
	2654	break;
	2655
	2656	pdnxt = ((addr + PAGE_SIZENPTEPG) & ~(PAGE_SIZENPTEPG - 1));
	2657	ptepindex = addr >> PDRSHIFT;
	2658
	2659	srcptepaddr = (vm_offset_t) src_pmap->pm_pdir[ptepindex];
	2660	if (srcptepaddr == 0)
	2661	continue;
	2662
	2663	if (srcptepaddr & PG_PS) {
	2664	if (dst_pmap->pm_pdir[ptepindex] == 0) {
	2665	dst_pmap->pm_pdir[ptepindex] = (pd_entry_t) srcptepaddr;
	2666	dst_pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
	2667	}
	2668	continue;
	2669	}
	2670
	2671	srcmpte = vm_page_lookup(src_pmap->pm_pteobj, ptepindex);
	2672	if ((srcmpte == NULL) \|\| (srcmpte->hold_count == 0) \|\|
	2673	(srcmpte->flags & PG_BUSY)) {
	2674	continue;
	2675	}
	2676
	2677	if (pdnxt > end_addr)
	2678	pdnxt = end_addr;
	2679
	2680	src_pte = vtopte(addr);
	2681	dst_pte = avtopte(addr);
	2682	while (addr < pdnxt) {
	2683	pt_entry_t ptetemp;
	2684
	2685	ptetemp = *src_pte;
	2686	/*
	2687	* we only virtual copy managed pages
	2688	*/
	2689	if ((ptetemp & PG_MANAGED) != 0) {
	2690	/*
	2691	* We have to check after allocpte for the
	2692	* pte still being around... allocpte can
	2693	* block.
	2694	*
	2695	* pmap_allocpte() can block. If we lose
	2696	* our page directory mappings we stop.
	2697	*/
	2698	dstmpte = pmap_allocpte(dst_pmap, addr);
	2699
	2700	if (src_frame != (PTDpde & PG_FRAME) \|\|
	2701	dst_frame != (APTDpde & PG_FRAME)
	2702	) {
	2703	kprintf("WARNING: pmap_copy: detected and corrected race\n");
	2704	pmap_unwire_pte_hold(dst_pmap, dstmpte, &info);
	2705	goto failed;
	2706	} else if ((*dst_pte == 0) &&
	2707	(ptetemp = *src_pte) != 0 &&
	2708	(ptetemp & PG_MANAGED)) {
	2709	/*
	2710	* Clear the modified and
	2711	* accessed (referenced) bits
	2712	* during the copy.
	2713	*/
	2714	m = PHYS_TO_VM_PAGE(ptetemp);
	2715	*dst_pte = ptetemp & ~(PG_M \| PG_A);
	2716	++dst_pmap->pm_stats.resident_count;
	2717	pmap_insert_entry(dst_pmap, addr,
	2718	dstmpte, m);
	2719	KKASSERT(m->flags & PG_MAPPED);
	2720	} else {
	2721	kprintf("WARNING: pmap_copy: dst_pte race detected and corrected\n");
	2722	pmap_unwire_pte_hold(dst_pmap, dstmpte, &info);
	2723	goto failed;
	2724	}
	2725	if (dstmpte->hold_count >= srcmpte->hold_count)
	2726	break;
	2727	}
	2728	addr += PAGE_SIZE;
	2729	src_pte++;
	2730	dst_pte++;
	2731	}
	2732	}
	2733	failed:
	2734	crit_exit();
	2735	pmap_inval_flush(&info);
	2736	}
	2737
	2738	/*
	2739	* pmap_zero_page:
	2740	*
	2741	* Zero the specified PA by mapping the page into KVM and clearing its
	2742	* contents.
	2743	*
	2744	* This function may be called from an interrupt and no locking is
	2745	* required.
	2746	*/
	2747	void
	2748	pmap_zero_page(vm_paddr_t phys)
	2749	{
	2750	struct mdglobaldata *gd = mdcpu;
	2751
	2752	crit_enter();
	2753	if (*gd->gd_CMAP3)
	2754	panic("pmap_zero_page: CMAP3 busy");
	2755	*gd->gd_CMAP3 =
	2756	PG_V \| PG_RW \| (phys & PG_FRAME) \| PG_A \| PG_M;
	2757	cpu_invlpg(gd->gd_CADDR3);
	2758
	2759	#if defined(I686_CPU)
	2760	if (cpu_class == CPUCLASS_686)
	2761	i686_pagezero(gd->gd_CADDR3);
	2762	else
	2763	#endif
	2764	bzero(gd->gd_CADDR3, PAGE_SIZE);
	2765	*gd->gd_CMAP3 = 0;
	2766	crit_exit();
	2767	}
	2768
	2769	/*
	2770	* pmap_page_assertzero:
	2771	*
	2772	* Assert that a page is empty, panic if it isn't.
	2773	*/
	2774	void
	2775	pmap_page_assertzero(vm_paddr_t phys)
	2776	{
	2777	struct mdglobaldata *gd = mdcpu;
	2778	int i;
	2779
	2780	crit_enter();
	2781	if (*gd->gd_CMAP3)
	2782	panic("pmap_zero_page: CMAP3 busy");
	2783	*gd->gd_CMAP3 =
	2784	PG_V \| PG_RW \| (phys & PG_FRAME) \| PG_A \| PG_M;
	2785	cpu_invlpg(gd->gd_CADDR3);
	2786	for (i = 0; i < PAGE_SIZE; i += sizeof(int)) {
	2787	if ((int )((char *)gd->gd_CADDR3 + i) != 0) {
	2788	panic("pmap_page_assertzero() @ %p not zero!\n",
	2789	(void *)gd->gd_CADDR3);
	2790	}
	2791	}
	2792	*gd->gd_CMAP3 = 0;
	2793	crit_exit();
	2794	}
	2795
	2796	/*
	2797	* pmap_zero_page:
	2798	*
	2799	* Zero part of a physical page by mapping it into memory and clearing
	2800	* its contents with bzero.
	2801	*
	2802	* off and size may not cover an area beyond a single hardware page.
	2803	*/
	2804	void
	2805	pmap_zero_page_area(vm_paddr_t phys, int off, int size)
	2806	{
	2807	struct mdglobaldata *gd = mdcpu;
	2808
	2809	crit_enter();
	2810	if (*gd->gd_CMAP3)
	2811	panic("pmap_zero_page: CMAP3 busy");
	2812	*gd->gd_CMAP3 = PG_V \| PG_RW \| (phys & PG_FRAME) \| PG_A \| PG_M;
	2813	cpu_invlpg(gd->gd_CADDR3);
	2814
	2815	#if defined(I686_CPU)
	2816	if (cpu_class == CPUCLASS_686 && off == 0 && size == PAGE_SIZE)
	2817	i686_pagezero(gd->gd_CADDR3);
	2818	else
	2819	#endif
	2820	bzero((char *)gd->gd_CADDR3 + off, size);
	2821	*gd->gd_CMAP3 = 0;
	2822	crit_exit();
	2823	}
	2824
	2825	/*
	2826	* pmap_copy_page:
	2827	*
	2828	* Copy the physical page from the source PA to the target PA.
	2829	* This function may be called from an interrupt. No locking
	2830	* is required.
	2831	*/
	2832	void
	2833	pmap_copy_page(vm_paddr_t src, vm_paddr_t dst)
	2834	{
	2835	struct mdglobaldata *gd = mdcpu;
	2836
	2837	crit_enter();
	2838	if (*gd->gd_CMAP1)
	2839	panic("pmap_copy_page: CMAP1 busy");
	2840	if (*gd->gd_CMAP2)
	2841	panic("pmap_copy_page: CMAP2 busy");
	2842
	2843	*gd->gd_CMAP1 = PG_V \| (src & PG_FRAME) \| PG_A;
	2844	*gd->gd_CMAP2 = PG_V \| PG_RW \| (dst & PG_FRAME) \| PG_A \| PG_M;
	2845
	2846	cpu_invlpg(gd->gd_CADDR1);
	2847	cpu_invlpg(gd->gd_CADDR2);
	2848
	2849	bcopy(gd->gd_CADDR1, gd->gd_CADDR2, PAGE_SIZE);
	2850
	2851	*gd->gd_CMAP1 = 0;
	2852	*gd->gd_CMAP2 = 0;
	2853	crit_exit();
	2854	}
	2855
	2856	/*
	2857	* pmap_copy_page_frag:
	2858	*
	2859	* Copy the physical page from the source PA to the target PA.
	2860	* This function may be called from an interrupt. No locking
	2861	* is required.
	2862	*/
	2863	void
	2864	pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes)
	2865	{
	2866	struct mdglobaldata *gd = mdcpu;
	2867
	2868	crit_enter();
	2869	if (*gd->gd_CMAP1)
	2870	panic("pmap_copy_page: CMAP1 busy");
	2871	if (*gd->gd_CMAP2)
	2872	panic("pmap_copy_page: CMAP2 busy");
	2873
	2874	*gd->gd_CMAP1 = PG_V \| (src & PG_FRAME) \| PG_A;
	2875	*gd->gd_CMAP2 = PG_V \| PG_RW \| (dst & PG_FRAME) \| PG_A \| PG_M;
	2876
	2877	cpu_invlpg(gd->gd_CADDR1);
	2878	cpu_invlpg(gd->gd_CADDR2);
	2879
	2880	bcopy((char *)gd->gd_CADDR1 + (src & PAGE_MASK),
	2881	(char *)gd->gd_CADDR2 + (dst & PAGE_MASK),
	2882	bytes);
	2883
	2884	*gd->gd_CMAP1 = 0;
	2885	*gd->gd_CMAP2 = 0;
	2886	crit_exit();
	2887	}
	2888
	2889	/*
	2890	* Returns true if the pmap's pv is one of the first
	2891	* 16 pvs linked to from this page. This count may
	2892	* be changed upwards or downwards in the future; it
	2893	* is only necessary that true be returned for a small
	2894	* subset of pmaps for proper page aging.
	2895	*/
	2896	boolean_t
	2897	pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
	2898	{
	2899	pv_entry_t pv;
	2900	int loops = 0;
	2901
	2902	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	2903	return FALSE;
	2904
	2905	crit_enter();
	2906
	2907	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	2908	if (pv->pv_pmap == pmap) {
	2909	crit_exit();
	2910	return TRUE;
	2911	}
	2912	loops++;
	2913	if (loops >= 16)
	2914	break;
	2915	}
	2916	crit_exit();
	2917	return (FALSE);
	2918	}
	2919
	2920	/*
	2921	* Remove all pages from specified address space
	2922	* this aids process exit speeds. Also, this code
	2923	* is special cased for current process only, but
	2924	* can have the more generic (and slightly slower)
	2925	* mode enabled. This is much faster than pmap_remove
	2926	* in the case of running down an entire address space.
	2927	*/
	2928	void
	2929	pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
	2930	{
	2931	struct lwp *lp;
	2932	pt_entry_t *pte, tpte;
	2933	pv_entry_t pv, npv;
	2934	vm_page_t m;
	2935	pmap_inval_info info;
	2936	int iscurrentpmap;
	2937	int32_t save_generation;
	2938
	2939	lp = curthread->td_lwp;
	2940	if (lp && pmap == vmspace_pmap(lp->lwp_vmspace))
	2941	iscurrentpmap = 1;
	2942	else
	2943	iscurrentpmap = 0;
	2944
	2945	pmap_inval_init(&info);
	2946	crit_enter();
	2947	for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
	2948	if (pv->pv_va >= eva \|\| pv->pv_va < sva) {
	2949	npv = TAILQ_NEXT(pv, pv_plist);
	2950	continue;
	2951	}
	2952
	2953	KKASSERT(pmap == pv->pv_pmap);
	2954
	2955	if (iscurrentpmap)
	2956	pte = vtopte(pv->pv_va);
	2957	else
	2958	pte = pmap_pte_quick(pmap, pv->pv_va);
	2959	if (pmap->pm_active)
	2960	pmap_inval_add(&info, pmap, pv->pv_va);
	2961
	2962	/*
	2963	* We cannot remove wired pages from a process' mapping
	2964	* at this time
	2965	*/
	2966	if (*pte & PG_W) {
	2967	npv = TAILQ_NEXT(pv, pv_plist);
	2968	continue;
	2969	}
	2970	tpte = pte_load_clear(pte);
	2971
	2972	m = PHYS_TO_VM_PAGE(tpte);
	2973
	2974	KASSERT(m < &vm_page_array[vm_page_array_size],
	2975	("pmap_remove_pages: bad tpte %x", tpte));
	2976
	2977	KKASSERT(pmap->pm_stats.resident_count > 0);
	2978	--pmap->pm_stats.resident_count;
	2979
	2980	/*
	2981	* Update the vm_page_t clean and reference bits.
	2982	*/
	2983	if (tpte & PG_M) {
	2984	vm_page_dirty(m);
	2985	}
	2986
	2987	npv = TAILQ_NEXT(pv, pv_plist);
	2988	TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
	2989	save_generation = ++pmap->pm_generation;
	2990
	2991	m->md.pv_list_count--;
	2992	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
	2993	if (TAILQ_EMPTY(&m->md.pv_list))
	2994	vm_page_flag_clear(m, PG_MAPPED \| PG_WRITEABLE);
	2995
	2996	pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem, &info);
	2997	free_pv_entry(pv);
	2998
	2999	/*
	3000	* Restart the scan if we blocked during the unuse or free
	3001	* calls and other removals were made.
	3002	*/
	3003	if (save_generation != pmap->pm_generation) {
	3004	kprintf("Warning: pmap_remove_pages race-A avoided\n");
	3005	pv = TAILQ_FIRST(&pmap->pm_pvlist);
	3006	}
	3007	}
	3008	pmap_inval_flush(&info);
	3009	crit_exit();
	3010	}
	3011
	3012	/*
	3013	* pmap_testbit tests bits in pte's
	3014	* note that the testbit/clearbit routines are inline,
	3015	* and a lot of things compile-time evaluate.
	3016	*/
	3017	static boolean_t
	3018	pmap_testbit(vm_page_t m, int bit)
	3019	{
	3020	pv_entry_t pv;
	3021	pt_entry_t *pte;
	3022
	3023	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	3024	return FALSE;
	3025
	3026	if (TAILQ_FIRST(&m->md.pv_list) == NULL)
	3027	return FALSE;
	3028
	3029	crit_enter();
	3030
	3031	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3032	/*
	3033	* if the bit being tested is the modified bit, then
	3034	* mark clean_map and ptes as never
	3035	* modified.
	3036	*/
	3037	if (bit & (PG_A\|PG_M)) {
	3038	if (!pmap_track_modified(pv->pv_va))
	3039	continue;
	3040	}
	3041
	3042	#if defined(PMAP_DIAGNOSTIC)
	3043	if (!pv->pv_pmap) {
	3044	kprintf("Null pmap (tb) at va: 0x%x\n", pv->pv_va);
	3045	continue;
	3046	}
	3047	#endif
	3048	pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
	3049	if (*pte & bit) {
	3050	crit_exit();
	3051	return TRUE;
	3052	}
	3053	}
	3054	crit_exit();
	3055	return (FALSE);
	3056	}
	3057
	3058	/*
	3059	* this routine is used to modify bits in ptes
	3060	*/
	3061	static __inline void
	3062	pmap_clearbit(vm_page_t m, int bit)
	3063	{
	3064	struct pmap_inval_info info;
	3065	pv_entry_t pv;
	3066	pt_entry_t *pte;
	3067	pt_entry_t pbits;
	3068
	3069	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	3070	return;
	3071
	3072	pmap_inval_init(&info);
	3073	crit_enter();
	3074
	3075	/*
	3076	* Loop over all current mappings setting/clearing as appropos If
	3077	* setting RO do we need to clear the VAC?
	3078	*/
	3079	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3080	/*
	3081	* don't write protect pager mappings
	3082	*/
	3083	if (bit == PG_RW) {
	3084	if (!pmap_track_modified(pv->pv_va))
	3085	continue;
	3086	}
	3087
	3088	#if defined(PMAP_DIAGNOSTIC)
	3089	if (!pv->pv_pmap) {
	3090	kprintf("Null pmap (cb) at va: 0x%x\n", pv->pv_va);
	3091	continue;
	3092	}
	3093	#endif
	3094
	3095	/*
	3096	* Careful here. We can use a locked bus instruction to
	3097	* clear PG_A or PG_M safely but we need to synchronize
	3098	* with the target cpus when we mess with PG_RW.
	3099	*
	3100	* We do not have to force synchronization when clearing
	3101	* PG_M even for PTEs generated via virtual memory maps,
	3102	* because the virtual kernel will invalidate the pmap
	3103	* entry when/if it needs to resynchronize the Modify bit.
	3104	*/
	3105	if (bit & PG_RW)
	3106	pmap_inval_add(&info, pv->pv_pmap, pv->pv_va);
	3107	pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
	3108	again:
	3109	pbits = *pte;
	3110	if (pbits & bit) {
	3111	if (bit == PG_RW) {
	3112	if (pbits & PG_M) {
	3113	vm_page_dirty(m);
	3114	atomic_clear_int(pte, PG_M\|PG_RW);
	3115	} else {
	3116	/*
	3117	* The cpu may be trying to set PG_M
	3118	* simultaniously with our clearing
	3119	* of PG_RW.
	3120	*/
	3121	if (!atomic_cmpset_int(pte, pbits,
	3122	pbits & ~PG_RW))
	3123	goto again;
	3124	}
	3125	} else if (bit == PG_M) {
	3126	/*
	3127	* We could also clear PG_RW here to force
	3128	* a fault on write to redetect PG_M for
	3129	* virtual kernels, but it isn't necessary
	3130	* since virtual kernels invalidate the pte
	3131	* when they clear the VPTE_M bit in their
	3132	* virtual page tables.
	3133	*/
	3134	atomic_clear_int(pte, PG_M);
	3135	} else {
	3136	atomic_clear_int(pte, bit);
	3137	}
	3138	}
	3139	}
	3140	pmap_inval_flush(&info);
	3141	crit_exit();
	3142	}
	3143
	3144	/*
	3145	* pmap_page_protect:
	3146	*
	3147	* Lower the permission for all mappings to a given page.
	3148	*/
	3149	void
	3150	pmap_page_protect(vm_page_t m, vm_prot_t prot)
	3151	{
	3152	if ((prot & VM_PROT_WRITE) == 0) {
	3153	if (prot & (VM_PROT_READ \| VM_PROT_EXECUTE)) {
	3154	pmap_clearbit(m, PG_RW);
	3155	vm_page_flag_clear(m, PG_WRITEABLE);
	3156	} else {
	3157	pmap_remove_all(m);
	3158	}
	3159	}
	3160	}
	3161
	3162	vm_paddr_t
	3163	pmap_phys_address(vm_pindex_t ppn)
	3164	{
	3165	return (amd64_ptob(ppn));
	3166	}
	3167
	3168	/*
	3169	* pmap_ts_referenced:
	3170	*
	3171	* Return a count of reference bits for a page, clearing those bits.
	3172	* It is not necessary for every reference bit to be cleared, but it
	3173	* is necessary that 0 only be returned when there are truly no
	3174	* reference bits set.
	3175	*
	3176	* XXX: The exact number of bits to check and clear is a matter that
	3177	* should be tested and standardized at some point in the future for
	3178	* optimal aging of shared pages.
	3179	*/
	3180	int
	3181	pmap_ts_referenced(vm_page_t m)
	3182	{
	3183	pv_entry_t pv, pvf, pvn;
	3184	pt_entry_t *pte;
	3185	int rtval = 0;
	3186
	3187	if (!pmap_initialized \|\| (m->flags & PG_FICTITIOUS))
	3188	return (rtval);
	3189
	3190	crit_enter();
	3191
	3192	if ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
	3193
	3194	pvf = pv;
	3195
	3196	do {
	3197	pvn = TAILQ_NEXT(pv, pv_list);
	3198
	3199	TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
	3200
	3201	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_list);
	3202
	3203	if (!pmap_track_modified(pv->pv_va))
	3204	continue;
	3205
	3206	pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
	3207
	3208	if (pte && (*pte & PG_A)) {
	3209	#ifdef SMP
	3210	atomic_clear_int(pte, PG_A);
	3211	#else
	3212	atomic_clear_int_nonlocked(pte, PG_A);
	3213	#endif
	3214	rtval++;
	3215	if (rtval > 4) {
	3216	break;
	3217	}
	3218	}
	3219	} while ((pv = pvn) != NULL && pv != pvf);
	3220	}
	3221	crit_exit();
	3222
	3223	return (rtval);
	3224	}
	3225
	3226	/*
	3227	* pmap_is_modified:
	3228	*
	3229	* Return whether or not the specified physical page was modified
	3230	* in any physical maps.
	3231	*/
	3232	boolean_t
	3233	pmap_is_modified(vm_page_t m)
	3234	{
	3235	return pmap_testbit(m, PG_M);
	3236	}
	3237
	3238	/*
	3239	* Clear the modify bits on the specified physical page.
	3240	*/
	3241	void
	3242	pmap_clear_modify(vm_page_t m)
	3243	{
	3244	pmap_clearbit(m, PG_M);
	3245	}
	3246
	3247	/*
	3248	* pmap_clear_reference:
	3249	*
	3250	* Clear the reference bit on the specified physical page.
	3251	*/
	3252	void
	3253	pmap_clear_reference(vm_page_t m)
	3254	{
	3255	pmap_clearbit(m, PG_A);
	3256	}
	3257
	3258	/*
	3259	* Miscellaneous support routines follow
	3260	*/
	3261
	3262	static void
	3263	i386_protection_init(void)
	3264	{
	3265	int *kp, prot;
	3266
	3267	kp = protection_codes;
	3268	for (prot = 0; prot < 8; prot++) {
	3269	switch (prot) {
	3270	case VM_PROT_NONE \| VM_PROT_NONE \| VM_PROT_NONE:
	3271	/*
	3272	* Read access is also 0. There isn't any execute bit,
	3273	* so just make it readable.
	3274	*/
	3275	case VM_PROT_READ \| VM_PROT_NONE \| VM_PROT_NONE:
	3276	case VM_PROT_READ \| VM_PROT_NONE \| VM_PROT_EXECUTE:
	3277	case VM_PROT_NONE \| VM_PROT_NONE \| VM_PROT_EXECUTE:
	3278	*kp++ = 0;
	3279	break;
	3280	case VM_PROT_NONE \| VM_PROT_WRITE \| VM_PROT_NONE:
	3281	case VM_PROT_NONE \| VM_PROT_WRITE \| VM_PROT_EXECUTE:
	3282	case VM_PROT_READ \| VM_PROT_WRITE \| VM_PROT_NONE:
	3283	case VM_PROT_READ \| VM_PROT_WRITE \| VM_PROT_EXECUTE:
	3284	*kp++ = PG_RW;
	3285	break;
	3286	}
	3287	}
	3288	}
	3289
	3290	/*
	3291	* Map a set of physical memory pages into the kernel virtual
	3292	* address space. Return a pointer to where it is mapped. This
	3293	* routine is intended to be used for mapping device memory,
	3294	* NOT real memory.
	3295	*
	3296	* NOTE: we can't use pgeflag unless we invalidate the pages one at
	3297	* a time.
	3298	*/
	3299	void *
	3300	pmap_mapdev(vm_paddr_t pa, vm_size_t size)
	3301	{
	3302	vm_offset_t va, tmpva, offset;
	3303	pt_entry_t *pte;
	3304
	3305	offset = pa & PAGE_MASK;
	3306	size = roundup(offset + size, PAGE_SIZE);
	3307
	3308	va = kmem_alloc_nofault(&kernel_map, size);
	3309	if (!va)
	3310	panic("pmap_mapdev: Couldn't alloc kernel virtual memory");
	3311
	3312	pa = pa & PG_FRAME;
	3313	for (tmpva = va; size > 0;) {
	3314	pte = vtopte(tmpva);
	3315	pte = pa \| PG_RW \| PG_V; / \| pgeflag; */
	3316	size -= PAGE_SIZE;
	3317	tmpva += PAGE_SIZE;
	3318	pa += PAGE_SIZE;
	3319	}
	3320	cpu_invltlb();
	3321	smp_invltlb();
	3322
	3323	return ((void *)(va + offset));
	3324	}
	3325
	3326	void
	3327	pmap_unmapdev(vm_offset_t va, vm_size_t size)
	3328	{
	3329	vm_offset_t base, offset;
	3330
	3331	base = va & PG_FRAME;
	3332	offset = va & PAGE_MASK;
	3333	size = roundup(offset + size, PAGE_SIZE);
	3334	pmap_qremove(va, size >> PAGE_SHIFT);
	3335	kmem_free(&kernel_map, base, size);
	3336	}
	3337
	3338	/*
	3339	* perform the pmap work for mincore
	3340	*/
	3341	int
	3342	pmap_mincore(pmap_t pmap, vm_offset_t addr)
	3343	{
	3344	pt_entry_t *ptep, pte;
	3345	vm_page_t m;
	3346	int val = 0;
	3347
	3348	ptep = pmap_pte(pmap, addr);
	3349	if (ptep == 0) {
	3350	return 0;
	3351	}
	3352
	3353	if ((pte = *ptep) != 0) {
	3354	vm_offset_t pa;
	3355
	3356	val = MINCORE_INCORE;
	3357	if ((pte & PG_MANAGED) == 0)
	3358	return val;
	3359
	3360	pa = pte & PG_FRAME;
	3361
	3362	m = PHYS_TO_VM_PAGE(pa);
	3363
	3364	/*
	3365	* Modified by us
	3366	*/
	3367	if (pte & PG_M)
	3368	val \|= MINCORE_MODIFIED\|MINCORE_MODIFIED_OTHER;
	3369	/*
	3370	* Modified by someone
	3371	*/
	3372	else if (m->dirty \|\| pmap_is_modified(m))
	3373	val \|= MINCORE_MODIFIED_OTHER;
	3374	/*
	3375	* Referenced by us
	3376	*/
	3377	if (pte & PG_A)
	3378	val \|= MINCORE_REFERENCED\|MINCORE_REFERENCED_OTHER;
	3379
	3380	/*
	3381	* Referenced by someone
	3382	*/
	3383	else if ((m->flags & PG_REFERENCED) \|\| pmap_ts_referenced(m)) {
	3384	val \|= MINCORE_REFERENCED_OTHER;
	3385	vm_page_flag_set(m, PG_REFERENCED);
	3386	}
	3387	}
	3388	return val;
	3389	}
	3390
	3391	/*
	3392	* Replace p->p_vmspace with a new one. If adjrefs is non-zero the new
	3393	* vmspace will be ref'd and the old one will be deref'd.
	3394	*
	3395	* The vmspace for all lwps associated with the process will be adjusted
	3396	* and cr3 will be reloaded if any lwp is the current lwp.
	3397	*/
	3398	void
	3399	pmap_replacevm(struct proc p, struct vmspace newvm, int adjrefs)
	3400	{
	3401	struct vmspace *oldvm;
	3402	struct lwp *lp;
	3403
	3404	crit_enter();
	3405	oldvm = p->p_vmspace;
	3406	if (oldvm != newvm) {
	3407	p->p_vmspace = newvm;
	3408	KKASSERT(p->p_nthreads == 1);
	3409	lp = RB_ROOT(&p->p_lwp_tree);
	3410	pmap_setlwpvm(lp, newvm);
	3411	if (adjrefs) {
	3412	sysref_get(&newvm->vm_sysref);
	3413	sysref_put(&oldvm->vm_sysref);
	3414	}
	3415	}
	3416	crit_exit();
	3417	}
	3418
	3419	/*
	3420	* Set the vmspace for a LWP. The vmspace is almost universally set the
	3421	* same as the process vmspace, but virtual kernels need to swap out contexts
	3422	* on a per-lwp basis.
	3423	*/
	3424	void
	3425	pmap_setlwpvm(struct lwp lp, struct vmspace newvm)
	3426	{
	3427	struct vmspace *oldvm;
	3428	struct pmap *pmap;
	3429
	3430	crit_enter();
	3431	oldvm = lp->lwp_vmspace;
	3432
	3433	if (oldvm != newvm) {
	3434	lp->lwp_vmspace = newvm;
	3435	if (curthread->td_lwp == lp) {
	3436	pmap = vmspace_pmap(newvm);
	3437	#if defined(SMP)
	3438	atomic_set_int(&pmap->pm_active, 1 << mycpu->gd_cpuid);
	3439	#else
	3440	pmap->pm_active \|= 1;
	3441	#endif
	3442	#if defined(SWTCH_OPTIM_STATS)
	3443	tlb_flush_count++;
	3444	#endif
	3445	curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pdir);
	3446	curthread->td_pcb->pcb_cr3 \|= PG_RW \| PG_U \| PG_V;
	3447	*link_pdpe = curthread->td_pcb->pcb_cr3 \| PG_RW \| PG_U \| PG_V;
	3448	load_cr3(common_lvl4_phys);
	3449	pmap = vmspace_pmap(oldvm);
	3450	#if defined(SMP)
	3451	atomic_clear_int(&pmap->pm_active,
	3452	1 << mycpu->gd_cpuid);
	3453	#else
	3454	pmap->pm_active &= ~1;
	3455	#endif
	3456	}
	3457	}
	3458	crit_exit();
	3459	}
	3460
	3461	vm_offset_t
	3462	pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
	3463	{
	3464
	3465	if ((obj == NULL) \|\| (size < NBPDR) \|\| (obj->type != OBJT_DEVICE)) {
	3466	return addr;
	3467	}
	3468
	3469	addr = (addr + (NBPDR - 1)) & ~(NBPDR - 1);
	3470	return addr;
	3471	}
	3472
	3473
	3474	#if defined(DEBUG)
	3475
	3476	static void pads (pmap_t pm);
	3477	void pmap_pvdump (vm_paddr_t pa);
	3478
	3479	/* print address space of pmap*/
	3480	static void
	3481	pads(pmap_t pm)
	3482	{
	3483	vm_offset_t va;
	3484	unsigned i, j;
	3485	pt_entry_t *ptep;
	3486
	3487	if (pm == &kernel_pmap)
	3488	return;
	3489	crit_enter();
	3490	for (i = 0; i < NPDEPG; i++) {
	3491	if (pm->pm_pdir[i]) {
	3492	for (j = 0; j < NPTEPG; j++) {
	3493	va = (i << PDRSHIFT) + (j << PAGE_SHIFT);
	3494	if (pm == &kernel_pmap && va < KERNBASE)
	3495	continue;
	3496	if (pm != &kernel_pmap && va > UPT_MAX_ADDRESS)
	3497	continue;
	3498	ptep = pmap_pte_quick(pm, va);
	3499	if (pmap_pte_v(ptep))
	3500	kprintf("%lx:%lx ", va, *ptep);
	3501	};
	3502	}
	3503	}
	3504	crit_exit();
	3505
	3506	}
	3507
	3508	void
	3509	pmap_pvdump(vm_paddr_t pa)
	3510	{
	3511	pv_entry_t pv;
	3512	vm_page_t m;
	3513
	3514	kprintf("pa %08llx", (long long)pa);
	3515	m = PHYS_TO_VM_PAGE(pa);
	3516	TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
	3517	#ifdef used_to_be
	3518	kprintf(" -> pmap %p, va %x, flags %x",
	3519	(void *)pv->pv_pmap, pv->pv_va, pv->pv_flags);
	3520	#endif
	3521	kprintf(" -> pmap %p, va %x", (void *)pv->pv_pmap, pv->pv_va);
	3522	pads(pv->pv_pmap);
	3523	}
	3524	kprintf(" ");
	3525	}
	3526	#endif