/*
 * Copyright (c) 1991 Regents of the University of California.
 * Copyright (c) 1994 John S. Dyson
 * Copyright (c) 1994 David Greenman
 * Copyright (c) 2003 Peter Wemm
 * Copyright (c) 2005-2008 Alan L. Cox <alc@cs.rice.edu>
 * Copyright (c) 2008, 2009 The DragonFly Project.
 * Copyright (c) 2008, 2009 Jordan Gordeev.
 * Copyright (c) 2011-2019 Matthew Dillon
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Manage physical address maps for x86-64 systems.
 *
 * Some notes:
 *	- The 'M'odified bit is only applicable to terminal PTEs.
 *
 *	- The 'U'ser access bit can be set for higher-level PTEs as
 *	  long as it isn't set for terminal PTEs for pages we don't
 *	  want user access to.
 */
#include "opt_ddb.h"
#include "opt_msgbuf.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/msgbuf.h>
#include <sys/vmmeter.h>
#include <sys/mman.h>
#include <sys/systm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_zone.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

#include <machine/cputypes.h>
#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine_base/apic/apicreg.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <ddb/ddb.h>

#define PMAP_KEEP_PDIRS

#if defined(DIAGNOSTIC)
#define PMAP_DIAGNOSTIC
#endif

#define MINPV 2048

/*
 * pmap debugging will report who owns a pv lock when blocking.
 */
#ifdef PMAP_DEBUG

#define PMAP_DEBUG_DECL		, const char *func, int lineno
#define PMAP_DEBUG_ARGS		, __func__, __LINE__
#define PMAP_DEBUG_COPY		, func, lineno

#define pv_get(pmap, pindex, pmarkp)	_pv_get(pmap, pindex, pmarkp	\
							PMAP_DEBUG_ARGS)
#define pv_lock(pv)			_pv_lock(pv			\
							PMAP_DEBUG_ARGS)
#define pv_hold_try(pv)			_pv_hold_try(pv			\
							PMAP_DEBUG_ARGS)
#define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp	\
							PMAP_DEBUG_ARGS)

#define pv_free(pv, pvp)		_pv_free(pv, pvp PMAP_DEBUG_ARGS)

#else

#define PMAP_DEBUG_DECL
#define PMAP_DEBUG_ARGS
#define PMAP_DEBUG_COPY

#define pv_get(pmap, pindex, pmarkp)	_pv_get(pmap, pindex, pmarkp)
#define pv_lock(pv)			_pv_lock(pv)
#define pv_hold_try(pv)			_pv_hold_try(pv)
#define pv_alloc(pmap, pindex, isnewp)	_pv_alloc(pmap, pindex, isnewp)
#define pv_free(pv, pvp)		_pv_free(pv, pvp)

#endif

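/*
 * Illustrative expansion (a sketch, not additional code): with PMAP_DEBUG
 * defined, a call such as pv_lock(pv) compiles to
 * _pv_lock(pv, __func__, __LINE__), so the blocking-lock diagnostics can
 * report the call site that owns a contested pv lock.
 */
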
/*
 * Get PDEs and PTEs for user/kernel address space
 */
#define pdir_pde(m, v)	(m[(vm_offset_t)(v) >> PDRSHIFT])

#define pmap_pde_v(pmap, pde)	\
	((*(pd_entry_t *)pde & pmap->pmap_bits[PG_V_IDX]) != 0)
#define pmap_pte_w(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0)
#define pmap_pte_m(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0)
#define pmap_pte_u(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0)
#define pmap_pte_v(pmap, pte)	\
	((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)

/*
 * Given a map and a machine independent protection code,
 * convert to a vax protection code.
 */
#define pte_prot(m, p)		\
	(m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
static uint64_t protection_codes[PROTECTION_CODES_SIZE];

/*
 * Backing scan macros.  Note that in the use case 'ipte' is only a
 * tentative value and must be validated by a pmap_inval_smp_cmpset*()
 * or equivalent function.
 *
 * NOTE: cpu_ccfence() is required to prevent excessive optimization
 *	 of the (ipte) variable.
 *
 * NOTE: We don't bother locking the backing object if it isn't mapped
 *	 to anything (backing_list is empty).
 *
 * NOTE: For now guarantee an interlock via iobj->backing_lk if the
 *	 object exists and do not shortcut the lock by checking to see
 *	 if the list is empty first.
 */
#define PMAP_PAGE_BACKING_SCAN(m, match_pmap, ipmap, iptep, ipte, iva)	\
	if (m->object) {						\
		vm_object_t iobj = m->object;				\
		vm_map_backing_t iba, next_ba;				\
		struct pmap *ipmap;					\
		pt_entry_t ipte;					\
		pt_entry_t *iptep;					\
		vm_offset_t iva;					\
		vm_pindex_t ipindex_start;				\
		vm_pindex_t ipindex_end;				\
									\
		lockmgr(&iobj->backing_lk, LK_SHARED);			\
		next_ba = TAILQ_FIRST(&iobj->backing_list);		\
		while ((iba = next_ba) != NULL) {			\
			next_ba = TAILQ_NEXT(iba, entry);		\
			ipmap = iba->pmap;				\
			if (match_pmap && ipmap != match_pmap)		\
				continue;				\
			ipindex_start = iba->offset >> PAGE_SHIFT;	\
			ipindex_end = ipindex_start +			\
				  ((iba->end - iba->start) >> PAGE_SHIFT); \
			if (m->pindex < ipindex_start ||		\
			    m->pindex >= ipindex_end) {			\
				continue;				\
			}						\
			iva = iba->start +				\
			      ((m->pindex - ipindex_start) << PAGE_SHIFT); \
			iptep = pmap_pte(ipmap, iva);			\
			if (iptep == NULL)				\
				continue;				\
			ipte = *iptep;					\
			cpu_ccfence();					\
			if (m->phys_addr != (ipte & PG_FRAME))		\
				continue;				\

#define PMAP_PAGE_BACKING_RETRY						\
		{							\
			next_ba = iba;					\
			continue;					\
		}							\

#define PMAP_PAGE_BACKING_DONE						\
		}							\
		lockmgr(&iobj->backing_lk, LK_RELEASE);			\
	}								\

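/*
 * Illustrative use of the backing-scan macros (a sketch, not code from
 * this file).  The body runs once per candidate (ipmap, iva, iptep)
 * mapping of page m; 'ipte' is tentative and must be revalidated
 * atomically, retrying the same entry on failure:
 *
 *	PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
 *		npte = ipte & ~ipmap->pmap_bits[PG_RW_IDX];
 *		if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, npte))
 *			PMAP_PAGE_BACKING_RETRY;
 *	} PMAP_PAGE_BACKING_DONE;
 */
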
static struct pmap iso_pmap;
static struct pmap kernel_pmap_store;
struct pmap *kernel_pmap = &kernel_pmap_store;

vm_paddr_t avail_start;		/* PA of first available physical page */
vm_paddr_t avail_end;		/* PA of last available physical page */
vm_offset_t virtual2_start;	/* cutout free area prior to kernel start */
vm_offset_t virtual2_end;
vm_offset_t virtual_start;	/* VA of first avail page (after kernel BSS) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t KvaStart;		/* VA start of KVA space */
vm_offset_t KvaEnd;		/* VA end of KVA space (non-inclusive) */
vm_offset_t KvaSize;		/* max size of KVA space */
vm_offset_t DMapMaxAddress;

/* Has pmap_init completed? */
__read_frequently static boolean_t pmap_initialized = FALSE;
//static int pgeflag;		/* PG_G or-in */
static uint64_t PatMsr;		/* value of MSR_PAT */

static int ndmpdp;
static vm_paddr_t dmaplimit;
vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;

static pt_entry_t pat_pte_index[PAT_INDEX_SIZE];	/* PAT -> PG_ bits */
static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];	/* PAT -> PG_ bits */

static uint64_t KPTbase;
static uint64_t KPTphys;
static uint64_t KPDphys;	/* phys addr of kernel level 2 */
static uint64_t KPDbase;	/* phys addr of kernel level 2 @ KERNBASE */
uint64_t KPDPphys;		/* phys addr of kernel level 3 */
uint64_t KPML4phys;		/* phys addr of kernel level 4 */

static uint64_t DMPDphys;	/* phys addr of direct mapped level 2 */
static uint64_t DMPDPphys;	/* phys addr of direct mapped level 3 */

/*
 * Data for the pv entry allocation mechanism
 */
__read_mostly static vm_zone_t pvzone;
__read_mostly static int pmap_pagedaemon_waken = 0;
static struct vm_zone pvzone_store;
static struct pv_entry *pvinit;

/*
 * All those kernel PT submaps that BSD is so fond of
 */
pt_entry_t *CMAP1 = NULL;
caddr_t CADDR1 = NULL, ptvmmap = NULL;
static pt_entry_t *msgbufmap, *ptmmap;
struct msgbuf *msgbufp = NULL;

/*
 * PG_* bits for regular (x86) pmap.
 */
__read_frequently static uint64_t pmap_bits_default[PG_BITS_SIZE] = {
	[TYPE_IDX]	= REGULAR_PMAP,
	[PG_V_IDX]	= X86_PG_V,
	[PG_RW_IDX]	= X86_PG_RW,
	[PG_U_IDX]	= X86_PG_U,
	[PG_A_IDX]	= X86_PG_A,
	[PG_M_IDX]	= X86_PG_M,
	[PG_PS_IDX]	= X86_PG_PS,
	[PG_G_IDX]	= X86_PG_G,
	[PG_W_IDX]	= X86_PG_AVAIL1,
	[PG_MANAGED_IDX] = X86_PG_AVAIL2,
	[PG_N_IDX]	= X86_PG_NC_PWT | X86_PG_NC_PCD,
	[PG_NX_IDX]	= X86_PG_NX,
};
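
/*
 * Usage sketch (illustrative): code in this file does not test the raw
 * X86_PG_* bits directly; it indirects through the per-pmap bit table so
 * the same logic can serve pmap types with different hardware encodings:
 *
 *	if (*pte & pmap->pmap_bits[PG_V_IDX])		// entry valid?
 *		*pte |= pmap->pmap_bits[PG_M_IDX];	// mark modified
 */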

/*
 * Crashdump maps.
 */
static pt_entry_t *pt_crashdumpmap;
static caddr_t crashdumpmap;

static int pmap_debug = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_debug, CTLFLAG_RW,
    &pmap_debug, 0, "Debug pmap's");
#ifdef PMAP_DEBUG2
static int pmap_enter_debug = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_enter_debug, CTLFLAG_RW,
    &pmap_enter_debug, 0, "Debug pmap_enter's");
#endif
static int pmap_yield_count = 64;
SYSCTL_INT(_machdep, OID_AUTO, pmap_yield_count, CTLFLAG_RW,
    &pmap_yield_count, 0, "Yield during init_pt/release");
static int pmap_fast_kernel_cpusync = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_fast_kernel_cpusync, CTLFLAG_RW,
    &pmap_fast_kernel_cpusync, 0, "Share page table pages when possible");
static int pmap_dynamic_delete = 0;
SYSCTL_INT(_machdep, OID_AUTO, pmap_dynamic_delete, CTLFLAG_RW,
    &pmap_dynamic_delete, 0, "Dynamically delete PT/PD/PDPs");
static int pmap_lock_delay = 100;
SYSCTL_INT(_machdep, OID_AUTO, pmap_lock_delay, CTLFLAG_RW,
    &pmap_lock_delay, 0, "Spin loops");
static int meltdown_mitigation = -1;
TUNABLE_INT("machdep.meltdown_mitigation", &meltdown_mitigation);
SYSCTL_INT(_machdep, OID_AUTO, meltdown_mitigation, CTLFLAG_RW,
    &meltdown_mitigation, 0, "Userland pmap isolation");

static int pmap_nx_enable = -1;		/* -1 = auto */
/* needs manual TUNABLE in early probe, see below */
SYSCTL_INT(_machdep, OID_AUTO, pmap_nx_enable, CTLFLAG_RD,
    &pmap_nx_enable, 0,
    "no-execute support (0=disabled, 1=w/READ, 2=w/READ & WRITE)");

static int pmap_pv_debug = 50;
SYSCTL_INT(_machdep, OID_AUTO, pmap_pv_debug, CTLFLAG_RW,
    &pmap_pv_debug, 0, "");

static long vm_pmap_pv_entries;
SYSCTL_LONG(_vm, OID_AUTO, pmap_pv_entries, CTLFLAG_RD,
    &vm_pmap_pv_entries, 0, "");

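/*
 * Usage note (illustrative): the knobs above are runtime-tunable via
 * sysctl(8), e.g. "sysctl machdep.pmap_dynamic_delete=1".  The
 * meltdown_mitigation knob is additionally fetched as a boot-time
 * loader tunable via TUNABLE_INT().
 */
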
/* Standard user access functions */
extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
    size_t *lencopied);
extern int std_copyin (const void *udaddr, void *kaddr, size_t len);
extern int std_copyout (const void *kaddr, void *udaddr, size_t len);
extern int std_fubyte (const uint8_t *base);
extern int std_subyte (uint8_t *base, uint8_t byte);
extern int32_t std_fuword32 (const uint32_t *base);
extern int64_t std_fuword64 (const uint64_t *base);
extern int std_suword64 (uint64_t *base, uint64_t word);
extern int std_suword32 (uint32_t *base, int word);
extern uint32_t std_swapu32 (volatile uint32_t *base, uint32_t v);
extern uint64_t std_swapu64 (volatile uint64_t *base, uint64_t v);
extern uint32_t std_fuwordadd32 (volatile uint32_t *base, uint32_t v);
extern uint64_t std_fuwordadd64 (volatile uint64_t *base, uint64_t v);

#if 0
static void pv_hold(pv_entry_t pv);
#endif
static int _pv_hold_try(pv_entry_t pv
				PMAP_DEBUG_DECL);
static void pv_drop(pv_entry_t pv);
static void _pv_lock(pv_entry_t pv
				PMAP_DEBUG_DECL);
static void pv_unlock(pv_entry_t pv);
static pv_entry_t _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew
				PMAP_DEBUG_DECL);
static pv_entry_t _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp
				PMAP_DEBUG_DECL);
static void _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL);
static pv_entry_t pv_get_try(pmap_t pmap, vm_pindex_t pindex,
				vm_pindex_t **pmarkp, int *errorp);
static void pv_put(pv_entry_t pv);
static void *pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex);
static pv_entry_t pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
				pv_entry_t *pvpp);
static void pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp,
				pmap_inval_bulk_t *bulk, int destroy);
static vm_page_t pmap_remove_pv_page(pv_entry_t pv, int clrpgbits);
static int pmap_release_pv(pv_entry_t pv, pv_entry_t pvp,
				pmap_inval_bulk_t *bulk);

struct pmap_scan_info;
static void pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
		      vm_pindex_t *pte_placemark, pv_entry_t pt_pv,
		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);
static void pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info,
		      vm_pindex_t *pte_placemark, pv_entry_t pt_pv,
		      vm_offset_t va, pt_entry_t *ptep, void *arg __unused);

static void x86_64_protection_init (void);
static void create_pagetables(vm_paddr_t *firstaddr);
static void pmap_remove_all (vm_page_t m);
static boolean_t pmap_testbit (vm_page_t m, int bit);

static pt_entry_t *pmap_pte_quick (pmap_t pmap, vm_offset_t va);
static vm_offset_t pmap_kmem_choose(vm_offset_t addr);

static void pmap_pinit_defaults(struct pmap *pmap);
static void pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark);
static void pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark);

static int
pv_entry_compare(pv_entry_t pv1, pv_entry_t pv2)
{
	if (pv1->pv_pindex < pv2->pv_pindex)
		return(-1);
	if (pv1->pv_pindex > pv2->pv_pindex)
		return(1);
	return(0);
}

RB_GENERATE2(pv_entry_rb_tree, pv_entry, pv_entry,
	     pv_entry_compare, vm_pindex_t, pv_pindex);

/*
 * We have removed a managed pte.  The page might not be hard or soft-busied
 * at this point so we have to be careful.
 *
 * If advanced mode is enabled we can clear PG_MAPPED/WRITEABLE only if
 * MAPPEDMULTI is not set.  This must be done atomically against possible
 * concurrent pmap_enter()s occurring at the same time.  If MULTI is set
 * then the kernel may have to call vm_page_protect() later on to clean
 * the bits up.  This is particularly important for kernel_map/kernel_object
 * mappings due to the expense of scanning the kernel_object's vm_backing's.
 *
 * If advanced mode is not enabled we update our tracking counts and
 * synchronize PG_MAPPED/WRITEABLE later on in pmap_mapped_sync().
 */
static __inline
void
pmap_removed_pte(vm_page_t m, pt_entry_t pte)
{
	int flags;
	int nflags;

	flags = m->flags;
	cpu_ccfence();

	/*
	 * Only attempt to clear PG_MAPPED/PG_WRITEABLE while the page has
	 * a single remaining mapping.  atomic_fcmpset_int() reloads 'flags'
	 * with the current value on failure, so a concurrent transition to
	 * MAPPEDMULTI terminates the loop.
	 */
	while ((flags & PG_MAPPEDMULTI) == 0) {
		nflags = flags & ~(PG_MAPPED | PG_WRITEABLE);
		if (atomic_fcmpset_int(&m->flags, &flags, nflags))
			break;
	}
}

/*
 * Move the kernel virtual free pointer to the next
 * 2MB.  This is used to help improve performance
 * by using a large (2MB) page for much of the kernel
 * (.text, .data, .bss)
 */
static
vm_offset_t
pmap_kmem_choose(vm_offset_t addr)
{
	vm_offset_t newaddr = addr;

	newaddr = roundup2(addr, NBPDR);
	return newaddr;
}

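/*
 * Worked example (illustrative): with NBPDR == 2MB,
 * roundup2(0x00345678, NBPDR) returns 0x00400000, i.e. the free pointer
 * is advanced to the next 2MB-aligned boundary.
 */
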
/*
 * Returns the pindex of a page table entry (representing a terminal page).
 * There are NUPTE_TOTAL page table entries possible (a huge number)
 *
 * x86-64 has a 48-bit address space, where bit 47 is sign-extended out.
 * We want to properly translate negative KVAs.
 */
static __inline
vm_pindex_t
pmap_pte_pindex(vm_offset_t va)
{
	return ((va >> PAGE_SHIFT) & (NUPTE_TOTAL - 1));
}

/*
 * Returns the pindex of a page table.
 */
static __inline
vm_pindex_t
pmap_pt_pindex(vm_offset_t va)
{
	return (NUPTE_TOTAL + ((va >> PDRSHIFT) & (NUPT_TOTAL - 1)));
}

/*
 * Returns the pindex of a page directory.
 */
static __inline
vm_pindex_t
pmap_pd_pindex(vm_offset_t va)
{
	return (NUPTE_TOTAL + NUPT_TOTAL +
		((va >> PDPSHIFT) & (NUPD_TOTAL - 1)));
}

static __inline
vm_pindex_t
pmap_pdp_pindex(vm_offset_t va)
{
	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL +
		((va >> PML4SHIFT) & (NUPDP_TOTAL - 1)));
}

static __inline
vm_pindex_t
pmap_pml4_pindex(void)
{
	return (NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL);
}

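/*
 * Illustrative layout of the single pv_entry pindex space built by the
 * functions above (each range begins where the previous one ends):
 *
 *	[0, NUPTE_TOTAL)	terminal PTEs
 *	[+, NUPT_TOTAL)		page tables (PT)
 *	[+, NUPD_TOTAL)		page directories (PD)
 *	[+, NUPDP_TOTAL)	page directory pointer (PDP) pages
 *	final single index	the PML4 page itself
 */
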
/*
 * Return various *clipped* indexes for a given VA.
 *
 * Returns the index of a PTE in a page table (PT), representing
 * a terminal page.
 */
static __inline
vm_pindex_t
pmap_pte_index(vm_offset_t va)
{
	return ((va >> PAGE_SHIFT) & ((1UL << NPTEPGSHIFT) - 1));
}

/*
 * Returns the index of a PDE in a page directory (PD) table, representing
 * a page table (PT).
 */
static __inline
vm_pindex_t
pmap_pt_index(vm_offset_t va)
{
	return ((va >> PDRSHIFT) & ((1UL << NPDEPGSHIFT) - 1));
}

/*
 * Returns the index of a PDPE in a page directory pointer (PDP) table,
 * representing a page directory (PD) table.
 */
static __inline
vm_pindex_t
pmap_pd_index(vm_offset_t va)
{
	return ((va >> PDPSHIFT) & ((1UL << NPDPEPGSHIFT) - 1));
}

/*
 * Returns the index of a PML4E in the PML4 table, representing a page
 * directory pointer (PDP) table.
 */
static __inline
vm_pindex_t
pmap_pdp_index(vm_offset_t va)
{
	return ((va >> PML4SHIFT) & ((1UL << NPML4EPGSHIFT) - 1));
}

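/*
 * Worked example (illustrative): for va = 0x40201000 (1GB + 2MB + 4KB),
 * pmap_pdp_index(va) == 0, pmap_pd_index(va) == 1, pmap_pt_index(va) == 1
 * and pmap_pte_index(va) == 1, each clipped to its 512-entry table.
 */
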
/*
 * Of all the layers (PT, PD, PDP, PML4) the best one to cache is
 * the PT layer.  This will speed up core pmap operations considerably.
 *
 * NOTE: The pmap spinlock does not need to be held but the passed-in pv
 *	 must be in a known associated state (typically by being locked when
 *	 the pmap spinlock isn't held).  We allow the race for that case.
 *
 * NOTE: pm_pvhint* is only accessed (read) with the spin-lock held, using
 *	 cpu_ccfence() to prevent compiler optimizations from reloading the
 *	 field.
 */
static __inline
void
pv_cache(pmap_t pmap, pv_entry_t pv, vm_pindex_t pindex)
{
	if (pindex < pmap_pt_pindex(0)) {
		;
	} else if (pindex < pmap_pd_pindex(0)) {
		pmap->pm_pvhint_pt = pv;
	}
}

/*
 * Locate the pv_entry for the requested pindex, using and refreshing
 * the cached PT-layer hint when possible.
 */
static __inline
pv_entry_t
pv_entry_lookup(pmap_t pmap, vm_pindex_t pindex)
{
	pv_entry_t pv;

	if (pindex < pmap_pt_pindex(0))
		return NULL;
#if 1
	if (pindex < pmap_pd_pindex(0))
		pv = pmap->pm_pvhint_pt;
	else
		pv = NULL;
	cpu_ccfence();
	if (pv == NULL || pv->pv_pmap != pmap) {
		pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
		if (pv)
			pv_cache(pmap, pv, pindex);
	} else if (pv->pv_pindex != pindex) {
		pv = pv_entry_rb_tree_RB_LOOKUP_REL(&pmap->pm_pvroot,
						    pindex, pv);
		if (pv)
			pv_cache(pmap, pv, pindex);
	}
#else
	pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
#endif
	return pv;
}

/*
 * pmap_pte_quick:
 *
 *	Super fast pmap_pte routine best used when scanning the pv lists.
 *	This eliminates many coarse-grained invltlb calls.  Note that many
 *	of the pv list scans are across different pmaps and it is very
 *	wasteful to do an entire invltlb when checking a single mapping.
 */
static __inline pt_entry_t *pmap_pte(pmap_t pmap, vm_offset_t va);

static
pt_entry_t *
pmap_pte_quick(pmap_t pmap, vm_offset_t va)
{
	return pmap_pte(pmap, va);
}

/*
 * The placemarker hash must be broken up into four zones so lock
 * ordering semantics continue to work (e.g. pte, pt, pd, then pdp).
 *
 * Placemarkers are used to 'lock' page table indices that do not have
 * a pv_entry.  This allows the pmap to support managed and unmanaged
 * pages and shared page tables.
 */
#define PM_PLACE_BASE	(PM_PLACEMARKS >> 2)

static __inline
vm_pindex_t *
pmap_placemarker_hash(pmap_t pmap, vm_pindex_t pindex)
{
	int hi;

	if (pindex < pmap_pt_pindex(0))		/* zone 0 - PTE */
		hi = 0;
	else if (pindex < pmap_pd_pindex(0))	/* zone 1 - PT */
		hi = PM_PLACE_BASE;
	else if (pindex < pmap_pdp_pindex(0))	/* zone 2 - PD */
		hi = PM_PLACE_BASE << 1;
	else					/* zone 3 - PDP (and PML4E) */
		hi = PM_PLACE_BASE | (PM_PLACE_BASE << 1);
	hi += pindex & (PM_PLACE_BASE - 1);

	return (&pmap->pm_placemarks[hi]);
}

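/*
 * Illustrative note: the zone split preserves the pte < pt < pd < pdp
 * lock ordering described above.  A PTE pindex and a PT pindex can never
 * hash to the same placemarker slot, so holding a PTE placemarker while
 * acquiring a PT placemarker cannot self-deadlock.
 */
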
/*
 * Generic procedure to index a pte from a pt, pd, or pdp.
 *
 * NOTE: Normally passed pindex as pmap_xx_index().  pmap_xx_pindex() is
 *	 NOT a page table page index but is instead a PV lookup index.
 */
static
void *
pv_pte_lookup(pv_entry_t pv, vm_pindex_t pindex)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pv->pv_m));
	return(&pte[pindex]);
}

/*
 * Return pointer to PDP slot in the PML4
 */
static __inline
pml4_entry_t *
pmap_pdp(pmap_t pmap, vm_offset_t va)
{
	return (&pmap->pm_pml4[pmap_pdp_index(va)]);
}

/*
 * Return pointer to PD slot in the PDP given a pointer to the PDP
 */
static __inline
pdp_entry_t *
pmap_pdp_to_pd(pml4_entry_t pdp_pte, vm_offset_t va)
{
	pdp_entry_t *pd;

	pd = (pdp_entry_t *)PHYS_TO_DMAP(pdp_pte & PG_FRAME);
	return (&pd[pmap_pd_index(va)]);
}

/*
 * Return pointer to PD slot in the PDP.
 */
static __inline
pdp_entry_t *
pmap_pd(pmap_t pmap, vm_offset_t va)
{
	pml4_entry_t *pdp;

	pdp = pmap_pdp(pmap, va);
	if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0)
		return NULL;
	return (pmap_pdp_to_pd(*pdp, va));
}

/*
 * Return pointer to PT slot in the PD given a pointer to the PD
 */
static __inline
pd_entry_t *
pmap_pd_to_pt(pdp_entry_t pd_pte, vm_offset_t va)
{
	pd_entry_t *pt;

	pt = (pd_entry_t *)PHYS_TO_DMAP(pd_pte & PG_FRAME);
	return (&pt[pmap_pt_index(va)]);
}

/*
 * Return pointer to PT slot in the PD
 *
 * SIMPLE PMAP NOTE: Simple pmaps (embedded in objects) do not have PDPs,
 *		     so we cannot lookup the PD via the PDP.  Instead we
 *		     must look it up via the pmap.
 */
static __inline
pd_entry_t *
pmap_pt(pmap_t pmap, vm_offset_t va)
{
	pdp_entry_t *pd;
	pv_entry_t pv;
	vm_pindex_t pd_pindex;
	vm_paddr_t phys;

	if (pmap->pm_flags & PMAP_FLAG_SIMPLE) {
		pd_pindex = pmap_pd_pindex(va);
		spin_lock_shared(&pmap->pm_spin);
		pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pd_pindex);
		if (pv == NULL || pv->pv_m == NULL) {
			spin_unlock_shared(&pmap->pm_spin);
			return NULL;
		}
		phys = VM_PAGE_TO_PHYS(pv->pv_m);
		spin_unlock_shared(&pmap->pm_spin);
		return (pmap_pd_to_pt(phys, va));
	} else {
		pd = pmap_pd(pmap, va);
		if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0)
			return NULL;
		return (pmap_pd_to_pt(*pd, va));
	}
}

/*
 * Return pointer to PTE slot in the PT given a pointer to the PT
 */
static __inline
pt_entry_t *
pmap_pt_to_pte(pd_entry_t pt_pte, vm_offset_t va)
{
	pt_entry_t *pte;

	pte = (pt_entry_t *)PHYS_TO_DMAP(pt_pte & PG_FRAME);
	return (&pte[pmap_pte_index(va)]);
}

/*
 * Return pointer to PTE slot in the PT
 */
static __inline
pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *pt;

	pt = pmap_pt(pmap, va);
	if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0)
		return NULL;
	if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0)
		return ((pt_entry_t *)pt);
	return (pmap_pt_to_pte(*pt, va));
}

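/*
 * Illustrative sketch (not code from this file): resolving a physical
 * address through the helpers above, assuming a terminal 4KB page:
 *
 *	pt_entry_t *ptep = pmap_pte(pmap, va);
 *	if (ptep && (*ptep & pmap->pmap_bits[PG_V_IDX]))
 *		pa = (*ptep & PG_FRAME) | (va & PAGE_MASK);
 *
 * Compare uservtophys() below, which performs a similar computation via
 * the recursive PTmap instead of walking pmap_pt()/pmap_pd().
 */
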
/*
 * Return address of PT slot in PD (KVM only)
 *
 * Cannot be used for user page tables because it might interfere with
 * the shared page-table-page optimization (pmap_mmu_optimize).
 */
static __inline
pd_entry_t *
vtopt(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPDEPGSHIFT + NPDPEPGSHIFT +
				  NPML4EPGSHIFT)) - 1);

	return (PDmap + ((va >> PDRSHIFT) & mask));
}

/*
 * KVM - return address of PTE slot in PT
 */
static __inline
pt_entry_t *
vtopte(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);

	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}

/*
 * Returns the physical address translation from va for a user address.
 * (vm_paddr_t)-1 is returned on failure.
 */
vm_paddr_t
uservtophys(vm_offset_t va)
{
	uint64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT +
				  NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1);
	vm_paddr_t pa;
	pt_entry_t pte;
	pmap_t pmap;

	pmap = vmspace_pmap(mycpu->gd_curthread->td_lwp->lwp_vmspace);
	pa = (vm_paddr_t)-1;
	if (va < VM_MAX_USER_ADDRESS) {
		pte = kreadmem64(PTmap + ((va >> PAGE_SHIFT) & mask));
		if (pte & pmap->pmap_bits[PG_V_IDX])
			pa = (pte & PG_FRAME) | (va & PAGE_MASK);
	}
	return pa;
}

static uint64_t
allocpages(vm_paddr_t *firstaddr, long n)
{
	uint64_t ret;

	ret = *firstaddr;
	bzero((void *)ret, n * PAGE_SIZE);
	*firstaddr += n * PAGE_SIZE;
	return (ret);
}

static
void
create_pagetables(vm_paddr_t *firstaddr)
{
	long i;		/* must be 64 bits */
	long nkpt_base;
	long nkpt_phys;
	long nkpd_phys;
	int j;

	/*
	 * We are running (mostly) V=P at this point
	 *
	 * Calculate how many 1GB PD entries in our PDP pages are needed
	 * for the DMAP.  This is only allocated if the system does not
	 * support 1GB pages.  Otherwise ndmpdp is simply a count of the
	 * number of 1G terminal entries needed in our PDP pages.
	 *
	 * NOTE: Maxmem is in pages
	 */
	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
	if (ndmpdp < 4)		/* Minimum 4GB of DMAP */
		ndmpdp = 4;

#if 0
	/*
	 * HACK XXX fix me - Some laptops map the EFI framebuffer in
	 * very high physical addresses and the DMAP winds up being too
	 * small.  The EFI framebuffer has to be mapped for the console
	 * very early and the DMAP is how it does it.
	 */
	if (ndmpdp < 512)	/* Minimum 512GB of DMAP */
		ndmpdp = 512;
#endif

	KKASSERT(ndmpdp <= NDMPML4E * NPML4EPG);
	DMapMaxAddress = DMAP_MIN_ADDRESS +
			 ((ndmpdp * NPDEPG) << PDRSHIFT);

	/*
	 * Starting at KERNBASE - map all 2G worth of page table pages.
	 * KERNBASE is offset -2G from the end of kvm.  This will accommodate
	 * all KVM allocations above KERNBASE, including the SYSMAPs below.
	 *
	 * We do this by allocating 2*512 PT pages.  Each PT page can map
	 * 2MB, for 2GB total.
	 */
	nkpt_base = (NPDPEPG - KPDPI) * NPTEPG;	/* typically 2 x 512 */

	/*
	 * Starting at the beginning of kvm (VM_MIN_KERNEL_ADDRESS),
	 * calculate how many page table pages we need to preallocate
	 * for early vm_map allocations.
	 *
	 * A few extra won't hurt, they will get used up in the running
	 * system.
	 *
	 * vm_page array
	 * initial pventry's
	 */
	nkpt_phys = howmany(Maxmem * sizeof(struct vm_page), NBPDR);
	nkpt_phys += howmany(Maxmem * sizeof(struct pv_entry), NBPDR);
	nkpt_phys += 128;	/* a few extra */

	/*
	 * The highest value nkpd_phys can be set to is
	 * NKPDPE - (NPDPEPG - KPDPI) (i.e. NKPDPE - 2).
	 *
	 * Doing so would cause all PD pages to be pre-populated for
	 * a maximal KVM space (approximately 16*512 pages, or 32MB).
	 * We can save memory by not doing this.
	 */
	nkpd_phys = (nkpt_phys + NPDPEPG - 1) / NPDPEPG;

	/*
	 * Allocate pages
	 *
	 * Normally NKPML4E=1-16 (1-16 kernel PDP page)
	 * Normally NKPDPE= NKPML4E*512-1 (511 min kernel PD pages)
	 *
	 * Only allocate enough PD pages
	 * NOTE: We allocate all kernel PD pages up-front, typically
	 *	 ~511G of KVM, requiring 511 PD pages.
	 */
	KPTbase = allocpages(firstaddr, nkpt_base);	/* KERNBASE to end */
	KPTphys = allocpages(firstaddr, nkpt_phys);	/* KVA start */
	KPML4phys = allocpages(firstaddr, 1);		/* recursive PML4 map */
	KPDPphys = allocpages(firstaddr, NKPML4E);	/* kernel PDP pages */
	KPDphys = allocpages(firstaddr, nkpd_phys);	/* kernel PD pages */

	/*
	 * Alloc PD pages for the area starting at KERNBASE.
	 */
	KPDbase = allocpages(firstaddr, NPDPEPG - KPDPI);

	/*
	 * Stuff for our DMAP.  Use 2MB pages even when 1GB pages
	 * are available in order to allow APU code to adjust page
	 * attributes on a fixed grain (see pmap_change_attr()).
	 */
	DMPDPphys = allocpages(firstaddr, NDMPML4E);
#if 1
	DMPDphys = allocpages(firstaddr, ndmpdp);
#else
	if ((amd_feature & AMDID_PAGE1GB) == 0)
		DMPDphys = allocpages(firstaddr, ndmpdp);
#endif
	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;

	/*
	 * Fill in the underlying page table pages for the area around
	 * KERNBASE.  This remaps low physical memory to KERNBASE.
	 *
	 * Read-only from zero to physfree
	 * XXX not fully used, underneath 2M pages
	 */
	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
		((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
		((pt_entry_t *)KPTbase)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_G_IDX];
	}

	/*
	 * Now map the initial kernel page tables.  One block of page
	 * tables is placed at the beginning of kernel virtual memory,
	 * and another block is placed at KERNBASE to map the kernel binary,
	 * data, bss, and initial pre-allocations.
	 */
	for (i = 0; i < nkpt_base; i++) {
		((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDbase)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX];
	}
	for (i = 0; i < nkpt_phys; i++) {
		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
		((pd_entry_t *)KPDphys)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX];
	}

	/*
	 * Map from zero to end of allocations using 2M pages as an
	 * optimization.  This will bypass some of the KPTbase pages
	 * above in the KERNBASE area.
	 */
	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
		((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
		((pd_entry_t *)KPDbase)[i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_PS_IDX] |
		    pmap_bits_default[PG_G_IDX];
	}

	/*
	 * Load PD addresses into the PDP pages for primary KVA space to
	 * cover existing page tables.  PD's for KERNBASE are handled in
	 * the next loop.
	 *
	 * The kernel is expected to pre-populate all of its PDs.  See
	 * NKPDPE in vmparam.h.
	 */
	for (i = 0; i < nkpd_phys; i++) {
		((pdp_entry_t *)KPDPphys)[NKPML4E * NPDPEPG - NKPDPE + i] =
		    KPDphys + (i << PAGE_SHIFT);
		((pdp_entry_t *)KPDPphys)[NKPML4E * NPDPEPG - NKPDPE + i] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_A_IDX];
	}

	/*
	 * Load PDs for KERNBASE to the end
	 */
	i = (NKPML4E - 1) * NPDPEPG + KPDPI;
	for (j = 0; j < NPDPEPG - KPDPI; ++j) {
		((pdp_entry_t *)KPDPphys)[i + j] =
		    KPDbase + (j << PAGE_SHIFT);
		((pdp_entry_t *)KPDPphys)[i + j] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_A_IDX];
	}

	/*
	 * Now set up the direct map space using either 2MB or 1GB pages.
	 * Preset PG_M and PG_A because demotion expects it.
	 *
	 * When filling in entries in the PD pages make sure any excess
	 * entries are set to zero as we allocated enough PD pages.
	 *
	 * Stuff for our DMAP.  Use 2MB pages even when 1GB pages
	 * are available in order to allow APU code to adjust page
	 * attributes on a fixed grain (see pmap_change_attr()).
	 */
#if 0
	if ((amd_feature & AMDID_PAGE1GB) == 0)
#endif
	{
		/*
		 * Use 2MB pages
		 */
		for (i = 0; i < NPDEPG * ndmpdp; i++) {
			((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
			((pd_entry_t *)DMPDphys)[i] |=
			    pmap_bits_default[PG_RW_IDX] |
			    pmap_bits_default[PG_V_IDX] |
			    pmap_bits_default[PG_PS_IDX] |
			    pmap_bits_default[PG_G_IDX] |
			    pmap_bits_default[PG_M_IDX] |
			    pmap_bits_default[PG_A_IDX];
		}

		/*
		 * And the direct map space's PDP
		 */
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
							(i << PAGE_SHIFT);
			((pdp_entry_t *)DMPDPphys)[i] |=
			    pmap_bits_default[PG_RW_IDX] |
			    pmap_bits_default[PG_V_IDX] |
			    pmap_bits_default[PG_A_IDX];
		}
	}
#if 0
	else {
		/*
		 * 1GB pages
		 */
		for (i = 0; i < ndmpdp; i++) {
			((pdp_entry_t *)DMPDPphys)[i] =
						(vm_paddr_t)i << PDPSHIFT;
			((pdp_entry_t *)DMPDPphys)[i] |=
			    pmap_bits_default[PG_RW_IDX] |
			    pmap_bits_default[PG_V_IDX] |
			    pmap_bits_default[PG_PS_IDX] |
			    pmap_bits_default[PG_G_IDX] |
			    pmap_bits_default[PG_M_IDX] |
			    pmap_bits_default[PG_A_IDX];
		}
	}
#endif

	/* And recursively map PML4 to itself in order to get PTmap */
	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
	((pdp_entry_t *)KPML4phys)[PML4PML4I] |=
	    pmap_bits_default[PG_RW_IDX] |
	    pmap_bits_default[PG_V_IDX] |
	    pmap_bits_default[PG_A_IDX];

	/*
	 * Connect the Direct Map slots up to the PML4
	 */
	for (j = 0; j < NDMPML4E; ++j) {
		((pdp_entry_t *)KPML4phys)[DMPML4I + j] =
		    (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) |
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_A_IDX];
	}

	/*
	 * Connect the KVA slot up to the PML4
	 */
	for (j = 0; j < NKPML4E; ++j) {
		((pdp_entry_t *)KPML4phys)[KPML4I + j] =
		    KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT);
		((pdp_entry_t *)KPML4phys)[KPML4I + j] |=
		    pmap_bits_default[PG_RW_IDX] |
		    pmap_bits_default[PG_V_IDX] |
		    pmap_bits_default[PG_A_IDX];
	}
	cpu_mfence();
	cpu_invltlb();
}

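/*
 * Resulting boot-time hierarchy (an illustrative sketch of what
 * create_pagetables() built, under the defaults above):
 *
 *	KPML4phys (PML4)
 *	    KPML4I  .. +NKPML4E-1   -> KPDPphys   (kernel KVA PDP pages)
 *	    DMPML4I .. +NDMPML4E-1  -> DMPDPphys  (direct map PDP pages)
 *	    PML4PML4I               -> KPML4phys  (recursive slot, PTmap)
 *	KPDPphys  -> KPDphys/KPDbase (PDs) -> KPTphys/KPTbase (PTs)
 *	DMPDPphys -> DMPDphys (2MB terminal PDEs covering physical memory)
 */
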
/*
 * Bootstrap the system enough to run with virtual memory.
 *
 * On x86_64 this is called after mapping has already been enabled
 * and just syncs the pmap module with what has already been done.
 * [We can't call it easily with mapping off since the kernel is not
 * mapped with PA == VA, hence we would have to relocate every address
 * from the linked base (virtual) address "KERNBASE" to the actual
 * (physical) address starting relative to 0]
 */
void
pmap_bootstrap(vm_paddr_t *firstaddr)
{
	vm_offset_t va;
	pt_entry_t *pte;
	int i;

	KvaStart = VM_MIN_KERNEL_ADDRESS;
	KvaEnd = VM_MAX_KERNEL_ADDRESS;
	KvaSize = KvaEnd - KvaStart;

	avail_start = *firstaddr;

	/*
	 * Create an initial set of page tables to run the kernel in.
	 */
	create_pagetables(firstaddr);

	virtual2_start = KvaStart;
	virtual2_end = PTOV_OFFSET;

	virtual_start = (vm_offset_t) PTOV_OFFSET + *firstaddr;
	virtual_start = pmap_kmem_choose(virtual_start);

	virtual_end = VM_MAX_KERNEL_ADDRESS;

	/* XXX do %cr0 as well */
	load_cr4(rcr4() | CR4_PGE | CR4_PSE);
	load_cr3(KPML4phys);

	/*
	 * Initialize protection array.
	 */
	x86_64_protection_init();

	/*
	 * The kernel's pmap is statically allocated so we don't have to use
	 * pmap_create, which is unlikely to work correctly at this part of
	 * the boot sequence (XXX and which no longer exists).
	 */
	kernel_pmap->pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
	kernel_pmap->pm_count = 1;
	CPUMASK_ASSALLONES(kernel_pmap->pm_active);
	RB_INIT(&kernel_pmap->pm_pvroot);
	spin_init(&kernel_pmap->pm_spin, "pmapbootstrap");
	for (i = 0; i < PM_PLACEMARKS; ++i)
		kernel_pmap->pm_placemarks[i] = PM_NOPLACEMARK;

	/*
	 * Reserve some special page table entries/VA space for temporary
	 * mapping of pages.
	 */
#define SYSMAP(c, p, v, n)	\
	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);

	va = virtual_start;
	pte = vtopte(va);

	/*
	 * CMAP1/CMAP2 are used for zeroing and copying pages.
	 */
	SYSMAP(caddr_t, CMAP1, CADDR1, 1)

	/*
	 * Crashdump maps.
	 */
	SYSMAP(caddr_t, pt_crashdumpmap, crashdumpmap, MAXDUMPPGS);

	/*
	 * ptvmmap is used for reading arbitrary physical pages via
	 * /dev/mem.
	 */
	SYSMAP(caddr_t, ptmmap, ptvmmap, 1)

	/*
	 * msgbufp is used to map the system message buffer.
	 * XXX msgbufmap is not used.
	 */
	SYSMAP(struct msgbuf *, msgbufmap, msgbufp,
	       atop(round_page(MSGBUF_SIZE)))

	virtual_start = va;
	virtual_start = pmap_kmem_choose(virtual_start);

	*CMAP1 = 0;

	/*
	 * PG_G is terribly broken on SMP because we IPI invltlb's in some
	 * cases rather than invlpg.  Actually, I don't even know why it
	 * works under UP because self-referential page table mappings
	 */
	// pgeflag = 0;

	cpu_invltlb();

	/* Initialize the PAT MSR */
	pmap_init_pat();
	pmap_pinit_defaults(kernel_pmap);

	TUNABLE_INT_FETCH("machdep.pmap_fast_kernel_cpusync",
			  &pmap_fast_kernel_cpusync);
}

1244 | } |
1245 | ||
1246 | /* | |
1247 | * Setup the PAT MSR. | |
1248 | */ | |
1249 | void | |
1250 | pmap_init_pat(void) | |
1251 | { | |
1252 | uint64_t pat_msr; | |
1253 | u_long cr0, cr4; | |
c2ec3418 | 1254 | int i; |
b524ca76 MD |
1255 | |
1256 | /* | |
1257 | * Default values mapping PATi,PCD,PWT bits at system reset. | |
1258 | * The default values effectively ignore the PATi bit by | |
1259 | * repeating the encodings for 0-3 in 4-7, and map the PCD | |
1260 | * and PWT bit combinations to the expected PAT types. | |
1261 | */ | |
1262 | pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) | /* 000 */ | |
1263 | PAT_VALUE(1, PAT_WRITE_THROUGH) | /* 001 */ | |
1264 | PAT_VALUE(2, PAT_UNCACHED) | /* 010 */ | |
1265 | PAT_VALUE(3, PAT_UNCACHEABLE) | /* 011 */ | |
1266 | PAT_VALUE(4, PAT_WRITE_BACK) | /* 100 */ | |
1267 | PAT_VALUE(5, PAT_WRITE_THROUGH) | /* 101 */ | |
1268 | PAT_VALUE(6, PAT_UNCACHED) | /* 110 */ | |
1269 | PAT_VALUE(7, PAT_UNCACHEABLE); /* 111 */ | |
1270 | pat_pte_index[PAT_WRITE_BACK] = 0; | |
a86ce0cd MD |
1271 | pat_pte_index[PAT_WRITE_THROUGH]= 0 | X86_PG_NC_PWT; |
1272 | pat_pte_index[PAT_UNCACHED] = X86_PG_NC_PCD; | |
1273 | pat_pte_index[PAT_UNCACHEABLE] = X86_PG_NC_PCD | X86_PG_NC_PWT; | |
b524ca76 MD |
1274 | pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE]; |
1275 | pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE]; | |
1276 | ||
1277 | if (cpu_feature & CPUID_PAT) { | |
1278 | /* | |
1279 | * If we support the PAT then set-up entries for | |
1280 | * WRITE_PROTECTED and WRITE_COMBINING using bit patterns | |
704f404a | 1281 | * 5 and 6. |
b524ca76 | 1282 | */ |
b524ca76 | 1283 | pat_msr = (pat_msr & ~PAT_MASK(5)) | |
704f404a MD |
1284 | PAT_VALUE(5, PAT_WRITE_PROTECTED); |
1285 | pat_msr = (pat_msr & ~PAT_MASK(6)) | | |
1286 | PAT_VALUE(6, PAT_WRITE_COMBINING); | |
1287 | pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | X86_PG_NC_PWT; | |
1288 | pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT | X86_PG_NC_PCD; | |
b524ca76 MD |
1289 | |
1290 | /* | |
1291 | * Then enable the PAT | |
1292 | */ | |
1293 | ||
1294 | /* Disable PGE. */ | |
1295 | cr4 = rcr4(); | |
1296 | load_cr4(cr4 & ~CR4_PGE); | |
1297 | ||
1298 | /* Disable caches (CD = 1, NW = 0). */ | |
1299 | cr0 = rcr0(); | |
1300 | load_cr0((cr0 & ~CR0_NW) | CR0_CD); | |
1301 | ||
1302 | /* Flushes caches and TLBs. */ | |
1303 | wbinvd(); | |
1304 | cpu_invltlb(); | |
1305 | ||
1306 | /* Update PAT and index table. */ | |
1307 | wrmsr(MSR_PAT, pat_msr); | |
1308 | ||
1309 | /* Flush caches and TLBs again. */ | |
1310 | wbinvd(); | |
1311 | cpu_invltlb(); | |
1312 | ||
1313 | /* Restore caches and PGE. */ | |
1314 | load_cr0(cr0); | |
1315 | load_cr4(cr4); | |
1316 | PatMsr = pat_msr; | |
1317 | } | |
c2ec3418 MD |
1318 | |
1319 | for (i = 0; i < 8; ++i) { | |
1320 | pt_entry_t pte; | |
1321 | ||
1322 | pte = pat_pte_index[i]; | |
1323 | if (pte & X86_PG_PTE_PAT) { | |
1324 | pte &= ~X86_PG_PTE_PAT; | |
1325 | pte |= X86_PG_PDE_PAT; | |
1326 | } | |
1327 | pat_pde_index[i] = pte; | |
1328 | } | |
d7f50089 YY |
1329 | } |
1330 | ||
1331 | /* | |
c8fe38ae | 1332 | * Set 4mb pdir for mp startup |
d7f50089 YY |
1333 | */ |
1334 | void | |
c8fe38ae MD |
1335 | pmap_set_opt(void) |
1336 | { | |
a86ce0cd | 1337 | if (cpu_feature & CPUID_PSE) { |
c8fe38ae | 1338 | load_cr4(rcr4() | CR4_PSE); |
fb59d445 | 1339 | if (mycpu->gd_cpuid == 0) /* only on BSP */ |
c8fe38ae | 1340 | cpu_invltlb(); |
c8fe38ae | 1341 | } |
2f6148a6 MD |
1342 | |
1343 | /* | |
1344 | * Check for SMAP support and enable if available. Must be done | |
1345 | * after cr3 is loaded, and on all cores. | |
1346 | */ | |
1347 | if (cpu_stdext_feature & CPUID_STDEXT_SMAP) { | |
1348 | load_cr4(rcr4() | CR4_SMAP); | |
1349 | } | |
1350 | if (cpu_stdext_feature & CPUID_STDEXT_SMEP) { | |
1351 | load_cr4(rcr4() | CR4_SMEP); | |
1352 | } | |
d7f50089 YY |
1353 | } |
1354 | ||
8d2aaeec MD |
1355 | /* |
1356 | * SMAP is just a processor flag, but SMEP can only be enabled | |
1357 | * and disabled via CR4. We still use the processor flag to | |
1358 | * disable SMAP because the page-fault/trap code checks it, in | |
1359 | * order to allow a page-fault to actually occur. | |
1360 | */ | |
1361 | void | |
1362 | smap_smep_disable(void) | |
1363 | { | |
1364 | /* | |
1365 | * disable SMAP. This also bypasses a software failsafe check | |
1366 | * in the trap() code. | |
1367 | */ | |
1368 | smap_open(); | |
1369 | ||
1370 | /* | |
1371 | * Also needed to bypass a software failsafe check in the trap() | |
1372 | * code and allow the userspace address fault from kernel mode | |
1373 | * to proceed. | |
1374 | * | |
1375 | * Note that This will not reload %rip because pcb_onfault_rsp will | |
1376 | * not match. Just setting it to non-NULL is sufficient to bypass | |
1377 | * the checks. | |
1378 | */ | |
1379 | curthread->td_pcb->pcb_onfault = (void *)1; | |
1380 | ||
1381 | /* | |
1382 | * Disable SMEP (requires modifying cr4) | |
1383 | */ | |
1384 | if (cpu_stdext_feature & CPUID_STDEXT_SMEP) | |
1385 | load_cr4(rcr4() & ~CR4_SMEP); | |
1386 | } | |
1387 | ||
1388 | void | |
1389 | smap_smep_enable(void) | |
1390 | { | |
1391 | if (cpu_stdext_feature & CPUID_STDEXT_SMEP) | |
1392 | load_cr4(rcr4() | CR4_SMEP); | |
1393 | curthread->td_pcb->pcb_onfault = NULL; | |
1394 | smap_close(); | |
1395 | } | |
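/*
 * Illustrative sketch only (not part of the original source): a
 * deliberate kernel-mode access to a userspace address is expected
 * to be bracketed roughly as follows, so that the trap code sees
 * pcb_onfault set and SMAP/SMEP are out of the way for the access:
 *
 *	smap_smep_disable();
 *	... touch the userspace address from kernel mode ...
 *	smap_smep_enable();
 */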
1396 | ||
c8fe38ae | 1397 | /* |
6a0fee04 MD |
1398 | * Early initialization of the pmap module. |
1399 | * | |
1400 | * Called by vm_init to initialize any structures that the pmap | |
1401 | * system needs to map virtual memory. pmap_init has been enhanced | |
1402 | * to support discontiguous physical memory in a fairly consistent way. | |
d7f50089 YY |
1403 | */ |
1404 | void | |
c8fe38ae | 1405 | pmap_init(void) |
d7f50089 | 1406 | { |
b7ea2f3f MD |
1407 | vm_pindex_t initial_pvs; |
1408 | vm_pindex_t i; | |
c8fe38ae | 1409 | |
c8fe38ae MD |
1410 | /* |
1411 | * Allocate memory for random pmap data structures. Includes the | |
1412 | * pv_head_table. | |
1413 | */ | |
701c977e | 1414 | for (i = 0; i < vm_page_array_size; i++) { |
c8fe38ae MD |
1415 | vm_page_t m; |
1416 | ||
1417 | m = &vm_page_array[i]; | |
c2830aa6 | 1418 | m->md.interlock_count = 0; |
c8fe38ae MD |
1419 | } |
1420 | ||
1421 | /* | |
1422 | * init the pv free list | |
1423 | */ | |
1424 | initial_pvs = vm_page_array_size; | |
1425 | if (initial_pvs < MINPV) | |
1426 | initial_pvs = MINPV; | |
1427 | pvzone = &pvzone_store; | |
1eeaf6b2 | 1428 | pvinit = (void *)kmem_alloc(kernel_map, |
3091de50 MD |
1429 | initial_pvs * sizeof (struct pv_entry), |
1430 | VM_SUBSYS_PVENTRY); | |
948209ce MD |
1431 | zbootinit(pvzone, "PV ENTRY", sizeof (struct pv_entry), |
1432 | pvinit, initial_pvs); | |
c8fe38ae MD |
1433 | |
1434 | /* | |
1435 | * Now it is safe to enable pv_table recording. | |
1436 | */ | |
1437 | pmap_initialized = TRUE; | |
d7f50089 YY |
1438 | } |
1439 | ||
c8fe38ae MD |
1440 | /* |
1441 | * Initialize the address space (zone) for the pv_entries. Set a | |
1442 | * high water mark so that the system can recover from excessive | |
1443 | * numbers of pv entries. | |
9e24b495 MD |
1444 | * |
1445 | * Also create the kernel page table template for isolated user | |
1446 | * pmaps. | |
c8fe38ae | 1447 | */ |
9e24b495 MD |
1448 | static void pmap_init_iso_range(vm_offset_t base, size_t bytes); |
1449 | static void pmap_init2_iso_pmap(void); | |
1450 | #if 0 | |
1451 | static void dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base); | |
1452 | #endif | |
1453 | ||
d7f50089 | 1454 | void |
c8fe38ae | 1455 | pmap_init2(void) |
d7f50089 | 1456 | { |
b7ea2f3f | 1457 | vm_pindex_t entry_max; |
c8fe38ae | 1458 | |
df49ec1e MD |
1459 | /* |
1460 | * We can significantly reduce pv_entry_max from historical | |
1461 | * levels because pv_entry's are no longer used for PTEs at the | |
1462 | * leaves. This prevents excessive pcpu caching on many-core | |
1463 | * boxes (even with the further '/ 16' done in zinitna()). | |
1464 | * | |
1465 | * Remember, however, that processes can share physical pages | |
1466 | * with each process still needing the pdp/pd/pt infrastructure | |
1467 | * (which still use pv_entry's). And don't just assume that | |
1468 | * every PT will be completely filled up. So don't make it | |
1469 | * too small. | |
1470 | */ | |
1471 | entry_max = maxproc * 32 + vm_page_array_size / 16; | |
1472 | TUNABLE_LONG_FETCH("vm.pmap.pv_entries", &entry_max); | |
1473 | vm_pmap_pv_entries = entry_max; | |
948209ce MD |
1474 | |
1475 | /* | |
1476 | * Subtract out pages already installed in the zone (hack) | |
1477 | */ | |
df49ec1e MD |
1478 | if (entry_max <= MINPV) |
1479 | entry_max = MINPV; | |
948209ce | 1480 | |
e16c650d | 1481 | zinitna(pvzone, NULL, 0, entry_max, ZONE_INTERRUPT); |
8983ce70 MD |
1482 | |
1483 | /* | |
1484 | * Enable dynamic deletion of empty higher-level page table pages | |
1485 | * by default only if system memory is < 8GB (use 7GB for slop). | |
1486 | * This can save a little memory, but imposes significant | |
1487 | * performance overhead for things like bulk builds, and for programs | |
1488 | * which do a lot of memory mapping and memory unmapping. | |
1489 | */ | |
567a6398 | 1490 | #if 0 |
8983ce70 MD |
1491 | if (pmap_dynamic_delete < 0) { |
1492 | if (vmstats.v_page_count < 7LL * 1024 * 1024 * 1024 / PAGE_SIZE) | |
1493 | pmap_dynamic_delete = 1; | |
1494 | else | |
1495 | pmap_dynamic_delete = 0; | |
1496 | } | |
567a6398 MD |
1497 | #endif |
1498 | /* | |
1499 | * Disable so vm_map_backing iterations do not race | |
1500 | */ | |
1501 | pmap_dynamic_delete = 0; | |
9e24b495 | 1502 | |
6a0fee04 MD |
1503 | /* |
1504 | * Automatic detection of Intel meltdown bug requiring user/kernel | |
1505 | * mmap isolation. | |
1506 | * | |
1507 | * Currently there are so many Intel cpus impacted that it's better | |
1508 | * to whitelist future Intel CPUs. Most (perhaps all) AMD cpus are | |
1509 | * not impacted, so the default is off for AMD. | |
1510 | */ | |
94c5f25a | 1511 | if (meltdown_mitigation < 0) { |
d7883524 | 1512 | if (cpu_vendor_id == CPU_VENDOR_INTEL) { |
fe9a7b1b MD |
1513 | meltdown_mitigation = 1; |
1514 | if (cpu_ia32_arch_caps & IA32_ARCH_CAP_RDCL_NO) | |
1515 | meltdown_mitigation = 0; | |
d7883524 | 1516 | } else { |
94c5f25a | 1517 | meltdown_mitigation = 0; |
d7883524 | 1518 | } |
6a0fee04 | 1519 | } |
94c5f25a MD |
1520 | if (meltdown_mitigation) { |
1521 | kprintf("machdep.meltdown_mitigation enabled to " | |
6a0fee04 MD |
1522 | "protect against (mostly Intel) meltdown bug\n"); |
1523 | kprintf("system call performance will be impacted\n"); | |
1524 | } | |
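/*
 * Illustrative note (assumption, not stated in this file): the
 * auto-detection above only runs while meltdown_mitigation is
 * still negative, so an administrator can presumably force the
 * setting either way ahead of time via the machdep.meltdown_mitigation
 * tunable named in the message above.
 */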
1525 | ||
9e24b495 MD |
1526 | pmap_init2_iso_pmap(); |
1527 | } | |
1528 | ||
1529 | /* | |
1530 | * Create the isolation pmap template. Once created, the template | |
1531 | * is static and its PML4e entries are used to populate the | |
1532 | * kernel portion of any isolated user pmaps. | |
1533 | * | |
1534 | * Our isolation pmap must contain: | |
1535 | * (1) trampoline area for all cpus | |
1536 | * (2) common_tss area for all cpus (it's part of the trampoline area now) | |
1537 | * (3) IDT for all cpus | |
1538 | * (4) GDT for all cpus | |
1539 | */ | |
1540 | static void | |
1541 | pmap_init2_iso_pmap(void) | |
1542 | { | |
1543 | int n; | |
1544 | ||
6a0fee04 MD |
1545 | if (bootverbose) |
1546 | kprintf("Initialize isolation pmap\n"); | |
9e24b495 MD |
1547 | |
1548 | /* | |
1549 | * Try to use our normal API calls to make this easier. We have | |
1550 | * to scrap the shadowed kernel PDPs pmap_pinit() creates for our | |
1551 | * iso_pmap. | |
1552 | */ | |
1553 | pmap_pinit(&iso_pmap); | |
1554 | bzero(iso_pmap.pm_pml4, PAGE_SIZE); | |
1555 | ||
1556 | /* | |
1557 | * Install areas needed by the cpu and trampoline. | |
1558 | */ | |
1559 | for (n = 0; n < ncpus; ++n) { | |
1560 | struct privatespace *ps; | |
1561 | ||
1562 | ps = CPU_prvspace[n]; | |
1563 | pmap_init_iso_range((vm_offset_t)&ps->trampoline, | |
1564 | sizeof(ps->trampoline)); | |
85b33048 MD |
1565 | pmap_init_iso_range((vm_offset_t)&ps->dblstack, |
1566 | sizeof(ps->dblstack)); | |
1567 | pmap_init_iso_range((vm_offset_t)&ps->dbgstack, | |
1568 | sizeof(ps->dbgstack)); | |
9e24b495 MD |
1569 | pmap_init_iso_range((vm_offset_t)&ps->common_tss, |
1570 | sizeof(ps->common_tss)); | |
1571 | pmap_init_iso_range(r_idt_arr[n].rd_base, | |
1572 | r_idt_arr[n].rd_limit + 1); | |
31815141 MD |
1573 | pmap_init_iso_range((register_t)ps->mdglobaldata.gd_gdt, |
1574 | MAXGDT_LIMIT); | |
9e24b495 | 1575 | } |
9e24b495 MD |
1576 | pmap_init_iso_range((vm_offset_t)(int *)btext, |
1577 | (vm_offset_t)(int *)etext - | |
1578 | (vm_offset_t)(int *)btext); | |
1579 | ||
1580 | #if 0 | |
1581 | kprintf("Dump iso_pmap:\n"); | |
1582 | dump_pmap(&iso_pmap, vtophys(iso_pmap.pm_pml4), 0, 0); | |
1583 | kprintf("\nDump kernel_pmap:\n"); | |
c713db65 | 1584 | dump_pmap(kernel_pmap, vtophys(kernel_pmap->pm_pml4), 0, 0); |
9e24b495 MD |
1585 | #endif |
1586 | } | |
1587 | ||
1588 | /* | |
1589 | * This adds a kernel virtual address range to the isolation pmap. | |
1590 | */ | |
1591 | static void | |
1592 | pmap_init_iso_range(vm_offset_t base, size_t bytes) | |
1593 | { | |
1594 | pv_entry_t pv; | |
1595 | pv_entry_t pvp; | |
1596 | pt_entry_t *ptep; | |
1597 | pt_entry_t pte; | |
1598 | vm_offset_t va; | |
1599 | ||
6a0fee04 MD |
1600 | if (bootverbose) { |
1601 | kprintf("isolate %016jx-%016jx (%zd)\n", | |
1602 | base, base + bytes, bytes); | |
1603 | } | |
9e24b495 MD |
1604 | va = base & ~(vm_offset_t)PAGE_MASK; |
1605 | while (va < base + bytes) { | |
1606 | if ((va & PDRMASK) == 0 && va + NBPDR <= base + bytes && | |
c713db65 AL |
1607 | (ptep = pmap_pt(kernel_pmap, va)) != NULL && |
1608 | (*ptep & kernel_pmap->pmap_bits[PG_V_IDX]) && | |
1609 | (*ptep & kernel_pmap->pmap_bits[PG_PS_IDX])) { | |
9e24b495 MD |
1610 | /* |
1611 | * Use 2MB pages if possible | |
1612 | */ | |
1613 | pte = *ptep; | |
1614 | pv = pmap_allocpte(&iso_pmap, pmap_pd_pindex(va), &pvp); | |
1615 | ptep = pv_pte_lookup(pv, (va >> PDRSHIFT) & 511); | |
1616 | *ptep = pte; | |
1617 | va += NBPDR; | |
1618 | } else { | |
1619 | /* | |
1620 | * Otherwise use 4KB pages | |
1621 | */ | |
1622 | pv = pmap_allocpte(&iso_pmap, pmap_pt_pindex(va), &pvp); | |
1623 | ptep = pv_pte_lookup(pv, (va >> PAGE_SHIFT) & 511); | |
c713db65 AL |
1624 | *ptep = vtophys(va) | kernel_pmap->pmap_bits[PG_RW_IDX] | |
1625 | kernel_pmap->pmap_bits[PG_V_IDX] | | |
1626 | kernel_pmap->pmap_bits[PG_A_IDX] | | |
1627 | kernel_pmap->pmap_bits[PG_M_IDX]; | |
9e24b495 MD |
1628 | |
1629 | va += PAGE_SIZE; | |
1630 | } | |
1631 | pv_put(pv); | |
1632 | pv_put(pvp); | |
1633 | } | |
1634 | } | |
1635 | ||
1636 | #if 0 | |
1637 | /* | |
1638 | * Useful debugging pmap dumper, do not remove (#if 0 when not in use) | |
1639 | */ | |
1640 | static | |
1641 | void | |
1642 | dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base) | |
1643 | { | |
1644 | pt_entry_t *ptp; | |
1645 | vm_offset_t incr; | |
1646 | int i; | |
1647 | ||
1648 | switch(level) { | |
1649 | case 0: /* PML4e page, 512G entries */ | |
1650 | incr = (1LL << 48) / 512; | |
1651 | break; | |
1652 | case 1: /* PDP page, 1G entries */ | |
1653 | incr = (1LL << 39) / 512; | |
1654 | break; | |
1655 | case 2: /* PD page, 2MB entries */ | |
1656 | incr = (1LL << 30) / 512; | |
1657 | break; | |
1658 | case 3: /* PT page, 4KB entries */ | |
1659 | incr = (1LL << 21) / 512; | |
1660 | break; | |
1661 | default: | |
1662 | incr = 0; | |
1663 | break; | |
1664 | } | |
1665 | ||
1666 | if (level == 0) | |
1667 | kprintf("cr3 %016jx @ va=%016jx\n", pte, base); | |
1668 | ptp = (void *)PHYS_TO_DMAP(pte & ~(pt_entry_t)PAGE_MASK); | |
1669 | for (i = 0; i < 512; ++i) { | |
1670 | if (level == 0 && i == 128) | |
1671 | base += 0xFFFF000000000000LLU; | |
1672 | if (ptp[i]) { | |
1673 | kprintf("%*.*s ", level * 4, level * 4, ""); | |
1674 | if (level == 1 && (ptp[i] & 0x180) == 0x180) { | |
1675 | kprintf("va=%016jx %3d term %016jx (1GB)\n", | |
1676 | base, i, ptp[i]); | |
1677 | } else if (level == 2 && (ptp[i] & 0x180) == 0x180) { | |
1678 | kprintf("va=%016jx %3d term %016jx (2MB)\n", | |
1679 | base, i, ptp[i]); | |
1680 | } else if (level == 3) { | |
1681 | kprintf("va=%016jx %3d term %016jx\n", | |
1682 | base, i, ptp[i]); | |
1683 | } else { | |
1684 | kprintf("va=%016jx %3d deep %016jx\n", | |
1685 | base, i, ptp[i]); | |
1686 | dump_pmap(pmap, ptp[i], level + 1, base); | |
1687 | } | |
1688 | } | |
1689 | base += incr; | |
1690 | } | |
d7f50089 YY |
1691 | } |
1692 | ||
9e24b495 MD |
1693 | #endif |
1694 | ||
9e5e1578 MD |
1695 | /* |
1696 | * Typically used to initialize a fictitious page by vm/device_pager.c | |
1697 | */ | |
1698 | void | |
1699 | pmap_page_init(struct vm_page *m) | |
1700 | { | |
1701 | vm_page_init(m); | |
c2830aa6 | 1702 | m->md.interlock_count = 0; |
9e5e1578 | 1703 | } |
c8fe38ae MD |
1704 | |
1705 | /*************************************************** | |
1706 | * Low level helper routines..... | |
1707 | ***************************************************/ | |
1708 | ||
d7f50089 | 1709 | /* |
10d6182e | 1710 | * Extract the physical page address associated with the map/VA pair. |
701c977e | 1711 | * The page must be wired for this to work reliably. |
d7f50089 | 1712 | */ |
c8fe38ae | 1713 | vm_paddr_t |
76f1911e | 1714 | pmap_extract(pmap_t pmap, vm_offset_t va, void **handlep) |
d7f50089 | 1715 | { |
48ffc236 | 1716 | vm_paddr_t rtval; |
701c977e MD |
1717 | pv_entry_t pt_pv; |
1718 | pt_entry_t *ptep; | |
c8fe38ae | 1719 | |
48ffc236 | 1720 | rtval = 0; |
701c977e MD |
1721 | if (va >= VM_MAX_USER_ADDRESS) { |
1722 | /* | |
1723 | * Kernel page directories might be direct-mapped and | |
1724 | * there is typically no PV tracking of pte's | |
1725 | */ | |
1726 | pd_entry_t *pt; | |
1727 | ||
1728 | pt = pmap_pt(pmap, va); | |
a86ce0cd MD |
1729 | if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) { |
1730 | if (*pt & pmap->pmap_bits[PG_PS_IDX]) { | |
701c977e MD |
1731 | rtval = *pt & PG_PS_FRAME; |
1732 | rtval |= va & PDRMASK; | |
48ffc236 | 1733 | } else { |
eb010d6e | 1734 | ptep = pmap_pt_to_pte(*pt, va); |
a86ce0cd | 1735 | if (*pt & pmap->pmap_bits[PG_V_IDX]) { |
701c977e MD |
1736 | rtval = *ptep & PG_FRAME; |
1737 | rtval |= va & PAGE_MASK; | |
1738 | } | |
1739 | } | |
1740 | } | |
76f1911e MD |
1741 | if (handlep) |
1742 | *handlep = NULL; | |
701c977e MD |
1743 | } else { |
1744 | /* | |
1745 | * User pages currently do not direct-map the page directory | |
1746 | * and some pages might not use managed PVs. But all PTs | |
1747 | * will have a PV. | |
1748 | */ | |
76f1911e | 1749 | pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); |
701c977e MD |
1750 | if (pt_pv) { |
1751 | ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); | |
a86ce0cd | 1752 | if (*ptep & pmap->pmap_bits[PG_V_IDX]) { |
701c977e MD |
1753 | rtval = *ptep & PG_FRAME; |
1754 | rtval |= va & PAGE_MASK; | |
48ffc236 | 1755 | } |
76f1911e MD |
1756 | if (handlep) |
1757 | *handlep = pt_pv; /* locked until done */ | |
1758 | else | |
1759 | pv_put(pt_pv); | |
1760 | } else if (handlep) { | |
1761 | *handlep = NULL; | |
c8fe38ae | 1762 | } |
c8fe38ae | 1763 | } |
48ffc236 JG |
1764 | return rtval; |
1765 | } | |
1766 | ||
76f1911e MD |
1767 | void |
1768 | pmap_extract_done(void *handle) | |
1769 | { | |
1770 | if (handle) | |
1771 | pv_put((pv_entry_t)handle); | |
1772 | } | |
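/*
 * Illustrative usage sketch (not part of the original source): a
 * caller that needs the translation to remain stable holds the
 * handle across the window of use.  pmap_extract_done() accepts a
 * NULL handle, so the pattern works for both address ranges:
 *
 *	void *handle;
 *	vm_paddr_t pa;
 *
 *	pa = pmap_extract(pmap, va, &handle);
 *	... use pa while the underlying pt_pv (if any) stays locked ...
 *	pmap_extract_done(handle);
 */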
1773 | ||
a86ce0cd MD |
1774 | /* |
1775 | * Similar to pmap_extract() but also checks protections; an SMP-friendly | |
1776 | * short-cut for vm_fault_page[_quick](). Can return NULL to cause the | |
dc039ae0 MD |
1777 | * caller to fall through to the real fault code. Does not work with | |
1778 | * HVM page tables. | |
a86ce0cd | 1779 | * |
08abdbfc MD |
1780 | * if busyp is NULL the returned page, if not NULL, is held (and not busied). |
1781 | * | |
1782 | * If busyp is not NULL and this function sets *busyp non-zero, the returned | |
1783 | * page is busied (and not held). | |
1784 | * | |
1785 | * If busyp is not NULL and this function sets *busyp to zero, the returned | |
1786 | * page is held (and not busied). | |
1787 | * | |
7a45978d MD |
1788 | * If VM_PROT_WRITE is set in prot, and the pte is already writable, the |
1789 | * returned page will be dirtied. If the pte is not already writable, | |
1790 | * NULL is returned. In other words, if the bit is set and a vm_page_t is returned, | |
1791 | * any COW will already have happened and that page can be written by the | |
1792 | * caller. | |
a36803d2 MD |
1793 | * |
1794 | * WARNING! THE RETURNED PAGE IS ONLY HELD AND NOT SUITABLE FOR READING | |
1795 | * OR WRITING AS-IS. | |
a86ce0cd MD |
1796 | */ |
1797 | vm_page_t | |
dc039ae0 | 1798 | pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot, int *busyp) |
a86ce0cd | 1799 | { |
dc039ae0 MD |
1800 | if (pmap && |
1801 | va < VM_MAX_USER_ADDRESS && | |
1802 | (pmap->pm_flags & PMAP_HVM) == 0) { | |
a86ce0cd MD |
1803 | pv_entry_t pt_pv; |
1804 | pv_entry_t pte_pv; | |
1805 | pt_entry_t *ptep; | |
1806 | pt_entry_t req; | |
1807 | vm_page_t m; | |
1808 | int error; | |
1809 | ||
1810 | req = pmap->pmap_bits[PG_V_IDX] | | |
1811 | pmap->pmap_bits[PG_U_IDX]; | |
7a45978d | 1812 | if (prot & VM_PROT_WRITE) |
a86ce0cd MD |
1813 | req |= pmap->pmap_bits[PG_RW_IDX]; |
1814 | ||
76f1911e | 1815 | pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); |
a86ce0cd MD |
1816 | if (pt_pv == NULL) |
1817 | return (NULL); | |
1818 | ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); | |
1819 | if ((*ptep & req) != req) { | |
76f1911e | 1820 | pv_put(pt_pv); |
a86ce0cd MD |
1821 | return (NULL); |
1822 | } | |
76f1911e | 1823 | pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), NULL, &error); |
a86ce0cd MD |
1824 | if (pte_pv && error == 0) { |
1825 | m = pte_pv->pv_m; | |
7a45978d | 1826 | if (prot & VM_PROT_WRITE) { |
dc039ae0 | 1827 | /* interlocked by presence of pv_entry */ |
a86ce0cd | 1828 | vm_page_dirty(m); |
dc039ae0 MD |
1829 | } |
1830 | if (busyp) { | |
1831 | if (prot & VM_PROT_WRITE) { | |
1832 | if (vm_page_busy_try(m, TRUE)) | |
1833 | m = NULL; | |
1834 | *busyp = 1; | |
1835 | } else { | |
1836 | vm_page_hold(m); | |
1837 | *busyp = 0; | |
1838 | } | |
1839 | } else { | |
1840 | vm_page_hold(m); | |
1841 | } | |
a86ce0cd MD |
1842 | pv_put(pte_pv); |
1843 | } else if (pte_pv) { | |
1844 | pv_drop(pte_pv); | |
1845 | m = NULL; | |
1846 | } else { | |
e989b548 | 1847 | /* error, since we didn't request a placemarker */ |
a86ce0cd MD |
1848 | m = NULL; |
1849 | } | |
76f1911e | 1850 | pv_put(pt_pv); |
a86ce0cd MD |
1851 | return(m); |
1852 | } else { | |
1853 | return(NULL); | |
1854 | } | |
1855 | } | |
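/*
 * Illustrative usage sketch (not part of the original source),
 * showing the busyp protocol documented above:
 *
 *	int busy;
 *	vm_page_t m;
 *
 *	m = pmap_fault_page_quick(pmap, va, prot, &busy);
 *	if (m == NULL) {
 *		... fall through to the real fault path ...
 *	} else {
 *		... use the page ...
 *		if (busy)
 *			vm_page_wakeup(m);	(page was busied)
 *		else
 *			vm_page_unhold(m);	(page was only held)
 *	}
 */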
1856 | ||
48ffc236 | 1857 | /* |
10d6182e | 1858 | * Extract the physical page address associated with the kernel virtual address. | |
48ffc236 JG |
1859 | */ |
1860 | vm_paddr_t | |
1861 | pmap_kextract(vm_offset_t va) | |
48ffc236 | 1862 | { |
701c977e | 1863 | pd_entry_t pt; /* pt entry in pd */ |
48ffc236 JG |
1864 | vm_paddr_t pa; |
1865 | ||
1866 | if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { | |
1867 | pa = DMAP_TO_PHYS(va); | |
1868 | } else { | |
701c977e | 1869 | pt = *vtopt(va); |
c713db65 | 1870 | if (pt & kernel_pmap->pmap_bits[PG_PS_IDX]) { |
701c977e | 1871 | pa = (pt & PG_PS_FRAME) | (va & PDRMASK); |
48ffc236 JG |
1872 | } else { |
1873 | /* | |
1874 | * Beware of a concurrent promotion that changes the | |
1875 | * PDE at this point! For example, vtopte() must not | |
1876 | * be used to access the PTE because it would use the | |
1877 | * new PDE. It is, however, safe to use the old PDE | |
1878 | * because the page table page is preserved by the | |
1879 | * promotion. | |
1880 | */ | |
eb010d6e | 1881 | pa = *pmap_pt_to_pte(pt, va); |
48ffc236 JG |
1882 | pa = (pa & PG_FRAME) | (va & PAGE_MASK); |
1883 | } | |
1884 | } | |
1885 | return pa; | |
d7f50089 YY |
1886 | } |
1887 | ||
c8fe38ae MD |
1888 | /*************************************************** |
1889 | * Low level mapping routines..... | |
1890 | ***************************************************/ | |
1891 | ||
d7f50089 | 1892 | /* |
737b020b | 1893 | * Add a wired page to the KVA and invalidate the mapping on all CPUs. |
d7f50089 | 1894 | */ |
737b020b | 1895 | void |
d7f50089 YY |
1896 | pmap_kenter(vm_offset_t va, vm_paddr_t pa) |
1897 | { | |
79f2da03 | 1898 | pt_entry_t *ptep; |
c8fe38ae | 1899 | pt_entry_t npte; |
c8fe38ae | 1900 | |
a86ce0cd | 1901 | npte = pa | |
c713db65 AL |
1902 | kernel_pmap->pmap_bits[PG_RW_IDX] | |
1903 | kernel_pmap->pmap_bits[PG_V_IDX]; | |
e989b548 | 1904 | // pgeflag; |
79f2da03 | 1905 | ptep = vtopte(va); |
1af05cbf | 1906 | #if 1 |
c713db65 | 1907 | pmap_inval_smp(kernel_pmap, va, 1, ptep, npte); |
1af05cbf MD |
1908 | #else |
1909 | /* FUTURE */ | |
79f2da03 | 1910 | if (*ptep) |
c713db65 | 1911 | pmap_inval_smp(kernel_pmap, va, ptep, npte); |
79f2da03 MD |
1912 | else |
1913 | *ptep = npte; | |
1af05cbf | 1914 | #endif |
d7f50089 YY |
1915 | } |
1916 | ||
1917 | /* | |
79f2da03 MD |
1918 | * Similar to pmap_kenter(), except we only invalidate the mapping on the |
1919 | * current CPU. Returns 0 if the previous pte was 0, 1 if it wasn't | |
1920 | * (caller can conditionalize calling smp_invltlb()). | |
d7f50089 | 1921 | */ |
79f2da03 | 1922 | int |
c8fe38ae MD |
1923 | pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) |
1924 | { | |
79f2da03 | 1925 | pt_entry_t *ptep; |
c8fe38ae | 1926 | pt_entry_t npte; |
79f2da03 | 1927 | int res; |
c8fe38ae | 1928 | |
c713db65 AL |
1929 | npte = pa | kernel_pmap->pmap_bits[PG_RW_IDX] | |
1930 | kernel_pmap->pmap_bits[PG_V_IDX]; | |
76f1911e | 1931 | // npte |= pgeflag; |
79f2da03 | 1932 | ptep = vtopte(va); |
1af05cbf MD |
1933 | #if 1 |
1934 | res = 1; | |
1935 | #else | |
1936 | /* FUTURE */ | |
79f2da03 | 1937 | res = (*ptep != 0); |
1af05cbf | 1938 | #endif |
76f1911e | 1939 | atomic_swap_long(ptep, npte); |
c8fe38ae | 1940 | cpu_invlpg((void *)va); |
c8fe38ae | 1941 | |
79f2da03 | 1942 | return res; |
d7f50089 YY |
1943 | } |
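/*
 * Illustrative note (not part of the original source): a caller
 * entering many pages can accumulate the return value and issue a
 * single smp_invltlb() at the end, per the comment above:
 *
 *	int doinval = 0;
 *
 *	doinval |= pmap_kenter_quick(va, pa);
 *	...
 *	if (doinval)
 *		smp_invltlb();
 */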
1944 | ||
ccd67bf6 MD |
1945 | /* |
1946 | * Enter addresses into the kernel pmap but don't bother | |
1947 | * doing any tlb invalidations. Caller will do a rollup | |
1948 | * invalidation via pmap_rollup_inval(). | |
1949 | */ | |
1950 | int | |
1951 | pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) | |
1952 | { | |
1953 | pt_entry_t *ptep; | |
1954 | pt_entry_t npte; | |
1955 | int res; | |
1956 | ||
1957 | npte = pa | | |
c713db65 AL |
1958 | kernel_pmap->pmap_bits[PG_RW_IDX] | |
1959 | kernel_pmap->pmap_bits[PG_V_IDX]; | |
ccd67bf6 MD |
1960 | // pgeflag; |
1961 | ptep = vtopte(va); | |
1962 | #if 1 | |
1963 | res = 1; | |
1964 | #else | |
1965 | /* FUTURE */ | |
1966 | res = (*ptep != 0); | |
1967 | #endif | |
76f1911e | 1968 | atomic_swap_long(ptep, npte); |
ccd67bf6 MD |
1969 | cpu_invlpg((void *)va); |
1970 | ||
1971 | return res; | |
1972 | } | |
1973 | ||
d7f50089 | 1974 | /* |
c8fe38ae | 1975 | * remove a page from the kernel pagetables |
d7f50089 YY |
1976 | */ |
1977 | void | |
c8fe38ae | 1978 | pmap_kremove(vm_offset_t va) |
d7f50089 | 1979 | { |
79f2da03 | 1980 | pt_entry_t *ptep; |
c8fe38ae | 1981 | |
79f2da03 | 1982 | ptep = vtopte(va); |
c713db65 | 1983 | pmap_inval_smp(kernel_pmap, va, 1, ptep, 0); |
c8fe38ae MD |
1984 | } |
1985 | ||
1986 | void | |
1987 | pmap_kremove_quick(vm_offset_t va) | |
1988 | { | |
79f2da03 MD |
1989 | pt_entry_t *ptep; |
1990 | ||
1991 | ptep = vtopte(va); | |
8078b160 | 1992 | atomic_readandclear_long(ptep); |
c8fe38ae | 1993 | cpu_invlpg((void *)va); |
d7f50089 YY |
1994 | } |
1995 | ||
ccd67bf6 MD |
1996 | /* |
1997 | * Remove addresses from the kernel pmap but don't bother | |
1998 | * doing any tlb invalidations. Caller will do a rollup | |
1999 | * invalidation via pmap_rollup_inval(). | |
2000 | */ | |
2001 | void | |
2002 | pmap_kremove_noinval(vm_offset_t va) | |
2003 | { | |
2004 | pt_entry_t *ptep; | |
2005 | ||
2006 | ptep = vtopte(va); | |
8078b160 | 2007 | atomic_readandclear_long(ptep); |
ccd67bf6 MD |
2008 | } |
2009 | ||
d7f50089 | 2010 | /* |
c8fe38ae | 2011 | * XXX these need to be recoded. They are not used in any critical path. |
d7f50089 YY |
2012 | */ |
2013 | void | |
c8fe38ae | 2014 | pmap_kmodify_rw(vm_offset_t va) |
d7f50089 | 2015 | { |
c713db65 | 2016 | atomic_set_long(vtopte(va), kernel_pmap->pmap_bits[PG_RW_IDX]); |
c8fe38ae | 2017 | cpu_invlpg((void *)va); |
d7f50089 YY |
2018 | } |
2019 | ||
a86ce0cd | 2020 | /* NOT USED |
c8fe38ae MD |
2021 | void |
2022 | pmap_kmodify_nc(vm_offset_t va) | |
2023 | { | |
701c977e | 2024 | atomic_set_long(vtopte(va), PG_N); |
c8fe38ae MD |
2025 | cpu_invlpg((void *)va); |
2026 | } | |
a86ce0cd | 2027 | */ |
d7f50089 YY |
2028 | |
2029 | /* | |
ad54aa11 MD |
2030 | * Used to map a range of physical addresses into kernel virtual |
2031 | * address space during the low level boot, typically to map the | |
2032 | * dump bitmap, message buffer, and vm_page_array. | |
c8fe38ae | 2033 | * |
ad54aa11 MD |
2034 | * These mappings are typically made at some pointer after the end of the |
2035 | * kernel text+data. | |
2036 | * | |
2037 | * We could return PHYS_TO_DMAP(start) here and not allocate any | |
2038 | * via (*virtp), but then kmem from userland and kernel dumps won't | |
2039 | * have access to the related pointers. | |
d7f50089 YY |
2040 | */ |
2041 | vm_offset_t | |
8e5e6f1b | 2042 | pmap_map(vm_offset_t *virtp, vm_paddr_t start, vm_paddr_t end, int prot) |
d7f50089 | 2043 | { |
ad54aa11 MD |
2044 | vm_offset_t va; |
2045 | vm_offset_t va_start; | |
2046 | ||
2047 | /*return PHYS_TO_DMAP(start);*/ | |
2048 | ||
2049 | va_start = *virtp; | |
2050 | va = va_start; | |
2051 | ||
2052 | while (start < end) { | |
2053 | pmap_kenter_quick(va, start); | |
2054 | va += PAGE_SIZE; | |
2055 | start += PAGE_SIZE; | |
2056 | } | |
2057 | *virtp = va; | |
2058 | return va_start; | |
d7f50089 YY |
2059 | } |
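/*
 * Illustrative sketch (hypothetical caller, not from this file):
 * early boot code advances a running KVA cursor, e.g. something like
 *
 *	msgbuf_va = pmap_map(&virtual_start, msgbuf_pa,
 *			     msgbuf_pa + msgbuf_size, VM_PROT_ALL);
 *
 * where virtual_start, msgbuf_pa, and msgbuf_size are stand-in names.
 */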
2060 | ||
c174861d FT |
2061 | #define PMAP_CLFLUSH_THRESHOLD (2 * 1024 * 1024) |
2062 | ||
2063 | /* | |
2064 | * Remove the specified set of pages from the data and instruction caches. | |
2065 | * | |
2066 | * In contrast to pmap_invalidate_cache_range(), this function does not | |
2067 | * rely on the CPU's self-snoop feature, because it is intended for use | |
2068 | * when moving pages into a different cache domain. | |
2069 | */ | |
2070 | void | |
2071 | pmap_invalidate_cache_pages(vm_page_t *pages, int count) | |
2072 | { | |
2073 | vm_offset_t daddr, eva; | |
2074 | int i; | |
2075 | ||
2076 | if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE || | |
2077 | (cpu_feature & CPUID_CLFSH) == 0) | |
2078 | wbinvd(); | |
2079 | else { | |
2080 | cpu_mfence(); | |
2081 | for (i = 0; i < count; i++) { | |
2082 | daddr = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[i])); | |
2083 | eva = daddr + PAGE_SIZE; | |
2084 | for (; daddr < eva; daddr += cpu_clflush_line_size) | |
2085 | clflush(daddr); | |
2086 | } | |
2087 | cpu_mfence(); | |
2088 | } | |
2089 | } | |
2090 | ||
300a6373 JH |
2091 | void |
2092 | pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva) | |
2093 | { | |
fd300b3d FT |
2094 | KASSERT((sva & PAGE_MASK) == 0, |
2095 | ("pmap_invalidate_cache_range: sva not page-aligned")); | |
2096 | KASSERT((eva & PAGE_MASK) == 0, | |
2097 | ("pmap_invalidate_cache_range: eva not page-aligned")); | |
2098 | ||
2099 | if (cpu_feature & CPUID_SS) { | |
2100 | ; /* If "Self Snoop" is supported, do nothing. */ | |
2101 | } else { | |
2102 | /* Globally invalidate caches */ | |
2103 | cpu_wbinvd_on_all_cpus(); | |
2104 | } | |
300a6373 | 2105 | } |
ccd67bf6 MD |
2106 | |
2107 | /* | |
2108 | * Invalidate the specified range of virtual memory on all cpus associated | |
2109 | * with the pmap. | |
2110 | */ | |
300a6373 JH |
2111 | void |
2112 | pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) | |
2113 | { | |
ccd67bf6 | 2114 | pmap_inval_smp(pmap, sva, (eva - sva) >> PAGE_SHIFT, NULL, 0); |
300a6373 | 2115 | } |
c8fe38ae | 2116 | |
d7f50089 | 2117 | /* |
d0f59917 MD |
2118 | * Add a list of wired pages to the kva. This routine is used for temporary |
2119 | * kernel mappings such as those found in buffer cache buffers. Page | |
2120 | * modifications and accesses are not tracked or recorded. | |
2121 | * | |
2122 | * NOTE! Old mappings are simply overwritten, and we cannot assume relaxed | |
2123 | * semantics as previous mappings may have been zeroed without any | |
2124 | * invalidation. | |
2125 | * | |
2126 | * The page *must* be wired. | |
d7f50089 | 2127 | */ |
f3bd2fce MD |
2128 | static __inline void |
2129 | _pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count, int doinval) | |
d7f50089 | 2130 | { |
c8fe38ae | 2131 | vm_offset_t end_va; |
ccd67bf6 | 2132 | vm_offset_t va; |
c8fe38ae | 2133 | |
ccd67bf6 MD |
2134 | end_va = beg_va + count * PAGE_SIZE; |
2135 | ||
2136 | for (va = beg_va; va < end_va; va += PAGE_SIZE) { | |
76f1911e MD |
2137 | pt_entry_t pte; |
2138 | pt_entry_t *ptep; | |
c8fe38ae | 2139 | |
76f1911e MD |
2140 | ptep = vtopte(va); |
2141 | pte = VM_PAGE_TO_PHYS(*m) | | |
c713db65 AL |
2142 | kernel_pmap->pmap_bits[PG_RW_IDX] | |
2143 | kernel_pmap->pmap_bits[PG_V_IDX] | | |
2144 | kernel_pmap->pmap_cache_bits_pte[(*m)->pat_mode]; | |
a86ce0cd | 2145 | // pgeflag; |
76f1911e | 2146 | atomic_swap_long(ptep, pte); |
c8fe38ae MD |
2147 | m++; |
2148 | } | |
f3bd2fce | 2149 | if (doinval) |
c713db65 | 2150 | pmap_invalidate_range(kernel_pmap, beg_va, end_va); |
f3bd2fce MD |
2151 | } |
2152 | ||
2153 | void | |
2154 | pmap_qenter(vm_offset_t beg_va, vm_page_t *m, int count) | |
2155 | { | |
2156 | _pmap_qenter(beg_va, m, count, 1); | |
2157 | } | |
2158 | ||
2159 | void | |
2160 | pmap_qenter_noinval(vm_offset_t beg_va, vm_page_t *m, int count) | |
2161 | { | |
2162 | _pmap_qenter(beg_va, m, count, 0); | |
c8fe38ae MD |
2163 | } |
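/*
 * Illustrative note (not part of the original source): per the
 * comment above, pmap_qenter_noinval() leaves the TLB stale and the
 * caller must perform the rollup invalidation itself, e.g.
 *
 *	pmap_qenter_noinval(va, mlist, count);
 *	...
 *	pmap_rollup_inval(...);		(covering the affected range)
 */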
2164 | ||
d7f50089 | 2165 | /* |
d0f59917 MD |
2166 | * This routine jerks page mappings from the kernel -- it is meant only |
2167 | * for temporary mappings such as those found in buffer cache buffers. | |
2168 | * No modified or access status is recorded. | |
7155fc7d MD |
2169 | * |
2170 | * MPSAFE, INTERRUPT SAFE (cluster callback) | |
d7f50089 | 2171 | */ |
c8fe38ae | 2172 | void |
d0f59917 | 2173 | pmap_qremove(vm_offset_t beg_va, int count) |
d7f50089 | 2174 | { |
c8fe38ae | 2175 | vm_offset_t end_va; |
d0f59917 | 2176 | vm_offset_t va; |
c8fe38ae | 2177 | |
d0f59917 | 2178 | end_va = beg_va + count * PAGE_SIZE; |
c8fe38ae | 2179 | |
d0f59917 | 2180 | for (va = beg_va; va < end_va; va += PAGE_SIZE) { |
c8fe38ae MD |
2181 | pt_entry_t *pte; |
2182 | ||
2183 | pte = vtopte(va); | |
8078b160 | 2184 | atomic_readandclear_long(pte); |
c8fe38ae | 2185 | cpu_invlpg((void *)va); |
c8fe38ae | 2186 | } |
c713db65 | 2187 | pmap_invalidate_range(kernel_pmap, beg_va, end_va); |
d0f59917 MD |
2188 | } |
2189 | ||
2190 | /* | |
2191 | * This routine removes temporary kernel mappings, only invalidating them | |
2192 | * on the current cpu. It should only be used under carefully controlled | |
2193 | * conditions. | |
2194 | */ | |
2195 | void | |
2196 | pmap_qremove_quick(vm_offset_t beg_va, int count) | |
2197 | { | |
2198 | vm_offset_t end_va; | |
2199 | vm_offset_t va; | |
2200 | ||
2201 | end_va = beg_va + count * PAGE_SIZE; | |
2202 | ||
2203 | for (va = beg_va; va < end_va; va += PAGE_SIZE) { | |
2204 | pt_entry_t *pte; | |
2205 | ||
2206 | pte = vtopte(va); | |
8078b160 | 2207 | atomic_readandclear_long(pte); |
d0f59917 MD |
2208 | cpu_invlpg((void *)va); |
2209 | } | |
2210 | } | |
2211 | ||
2212 | /* | |
2213 | * This routine removes temporary kernel mappings *without* invalidating | |
2214 | * the TLB. It can only be used on permanent kva reservations such as those | |
2215 | * found in buffer cache buffers, under carefully controlled circumstances. | |
2216 | * | |
2217 | * NOTE: Repopulating these KVAs requires unconditional invalidation. | |
2218 | * (pmap_qenter() does unconditional invalidation). | |
2219 | */ | |
2220 | void | |
2221 | pmap_qremove_noinval(vm_offset_t beg_va, int count) | |
2222 | { | |
2223 | vm_offset_t end_va; | |
2224 | vm_offset_t va; | |
2225 | ||
2226 | end_va = beg_va + count * PAGE_SIZE; | |
2227 | ||
2228 | for (va = beg_va; va < end_va; va += PAGE_SIZE) { | |
2229 | pt_entry_t *pte; | |
2230 | ||
2231 | pte = vtopte(va); | |
8078b160 | 2232 | atomic_readandclear_long(pte); |
d0f59917 | 2233 | } |
d7f50089 YY |
2234 | } |
2235 | ||
d7f50089 | 2236 | /* |
c8fe38ae MD |
2237 | * Create a new thread and optionally associate it with a (new) process. |
2238 | * NOTE! the new thread's cpu may not equal the current cpu. | |
d7f50089 YY |
2239 | */ |
2240 | void | |
c8fe38ae | 2241 | pmap_init_thread(thread_t td) |
d7f50089 | 2242 | { |
d1368d1a | 2243 | /* enforce pcb placement & alignment */ |
c8fe38ae | 2244 | td->td_pcb = (struct pcb *)(td->td_kstack + td->td_kstack_size) - 1; |
d1368d1a | 2245 | td->td_pcb = (struct pcb *)((intptr_t)td->td_pcb & ~(intptr_t)0xF); |
c8fe38ae | 2246 | td->td_savefpu = &td->td_pcb->pcb_save; |
d1368d1a | 2247 | td->td_sp = (char *)td->td_pcb; /* no -16 */ |
d7f50089 YY |
2248 | } |
2249 | ||
2250 | /* | |
c8fe38ae | 2251 | * This routine directly affects the fork performance of a process. | |
d7f50089 YY |
2252 | */ |
2253 | void | |
c8fe38ae | 2254 | pmap_init_proc(struct proc *p) |
d7f50089 YY |
2255 | { |
2256 | } | |
2257 | ||
a86ce0cd | 2258 | static void |
8e2efb11 MD |
2259 | pmap_pinit_defaults(struct pmap *pmap) |
2260 | { | |
2261 | bcopy(pmap_bits_default, pmap->pmap_bits, | |
2262 | sizeof(pmap_bits_default)); | |
2263 | bcopy(protection_codes, pmap->protection_codes, | |
2264 | sizeof(protection_codes)); | |
c2ec3418 | 2265 | bcopy(pat_pte_index, pmap->pmap_cache_bits_pte, |
8e2efb11 | 2266 | sizeof(pat_pte_index)); |
c2ec3418 MD |
2267 | bcopy(pat_pde_index, pmap->pmap_cache_bits_pde, |
2268 | sizeof(pat_pte_index)); | |
2269 | pmap->pmap_cache_mask_pte = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT; | |
2270 | pmap->pmap_cache_mask_pde = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PDE_PAT; | |
a86ce0cd MD |
2271 | pmap->copyinstr = std_copyinstr; |
2272 | pmap->copyin = std_copyin; | |
2273 | pmap->copyout = std_copyout; | |
2274 | pmap->fubyte = std_fubyte; | |
2275 | pmap->subyte = std_subyte; | |
5947157e MD |
2276 | pmap->fuword32 = std_fuword32; |
2277 | pmap->fuword64 = std_fuword64; | |
a86ce0cd | 2278 | pmap->suword32 = std_suword32; |
7f4bfbe7 MD |
2279 | pmap->suword64 = std_suword64; |
2280 | pmap->swapu32 = std_swapu32; | |
2281 | pmap->swapu64 = std_swapu64; | |
6481baf4 MD |
2282 | pmap->fuwordadd32 = std_fuwordadd32; |
2283 | pmap->fuwordadd64 = std_fuwordadd64; | |
a86ce0cd | 2284 | } |
9bbbdb7e | 2285 | |
d7f50089 | 2286 | /* |
73b1bfb1 | 2287 | * Initialize pmap0/vmspace0. |
c8fe38ae MD |
2288 | * |
2289 | * On architectures where the kernel pmap is not integrated into the user | |
2290 | * process pmap, this pmap represents the process pmap, not the kernel pmap. | |
2291 | * kernel_pmap should be used to directly access the kernel_pmap. | |
d7f50089 YY |
2292 | */ |
2293 | void | |
c8fe38ae | 2294 | pmap_pinit0(struct pmap *pmap) |
d7f50089 | 2295 | { |
76f1911e MD |
2296 | int i; |
2297 | ||
48ffc236 | 2298 | pmap->pm_pml4 = (pml4_entry_t *)(PTOV_OFFSET + KPML4phys); |
c8fe38ae | 2299 | pmap->pm_count = 1; |
c07315c4 | 2300 | CPUMASK_ASSZERO(pmap->pm_active); |
bb1339f8 | 2301 | pmap->pm_pvhint_pt = NULL; |
567a6398 | 2302 | pmap->pm_pvhint_unused = NULL; |
701c977e | 2303 | RB_INIT(&pmap->pm_pvroot); |
ba87a4ab | 2304 | spin_init(&pmap->pm_spin, "pmapinit0"); |
76f1911e MD |
2305 | for (i = 0; i < PM_PLACEMARKS; ++i) |
2306 | pmap->pm_placemarks[i] = PM_NOPLACEMARK; | |
c8fe38ae | 2307 | bzero(&pmap->pm_stats, sizeof pmap->pm_stats); |
a86ce0cd | 2308 | pmap_pinit_defaults(pmap); |
d7f50089 YY |
2309 | } |
2310 | ||
2311 | /* | |
c8fe38ae MD |
2312 | * Initialize a preallocated and zeroed pmap structure, |
2313 | * such as one in a vmspace structure. | |
d7f50089 | 2314 | */ |
921c891e MD |
2315 | static void |
2316 | pmap_pinit_simple(struct pmap *pmap) | |
d7f50089 | 2317 | { |
76f1911e MD |
2318 | int i; |
2319 | ||
701c977e MD |
2320 | /* |
2321 | * Misc initialization | |
2322 | */ | |
2323 | pmap->pm_count = 1; | |
c07315c4 | 2324 | CPUMASK_ASSZERO(pmap->pm_active); |
bb1339f8 | 2325 | pmap->pm_pvhint_pt = NULL; |
567a6398 | 2326 | pmap->pm_pvhint_unused = NULL; |
921c891e MD |
2327 | pmap->pm_flags = PMAP_FLAG_SIMPLE; |
2328 | ||
a86ce0cd MD |
2329 | pmap_pinit_defaults(pmap); |
2330 | ||
921c891e MD |
2331 | /* |
2332 | * Don't blow up locks/tokens on re-use (XXX fix/use drop code | |
2333 | * for this). | |
2334 | */ | |
701c977e MD |
2335 | if (pmap->pm_pmlpv == NULL) { |
2336 | RB_INIT(&pmap->pm_pvroot); | |
2337 | bzero(&pmap->pm_stats, sizeof pmap->pm_stats); | |
ba87a4ab | 2338 | spin_init(&pmap->pm_spin, "pmapinitsimple"); |
76f1911e MD |
2339 | for (i = 0; i < PM_PLACEMARKS; ++i) |
2340 | pmap->pm_placemarks[i] = PM_NOPLACEMARK; | |
701c977e | 2341 | } |
921c891e MD |
2342 | } |
2343 | ||
2344 | void | |
2345 | pmap_pinit(struct pmap *pmap) | |
2346 | { | |
2347 | pv_entry_t pv; | |
2348 | int j; | |
2349 | ||
a86ce0cd | 2350 | if (pmap->pm_pmlpv) { |
6379cf29 | 2351 | /* Completely clear the cached pmap if not REGULAR_PMAP. */ |
a86ce0cd MD |
2352 | if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) { |
2353 | pmap_puninit(pmap); | |
2354 | } | |
2355 | } | |
2356 | ||
921c891e MD |
2357 | pmap_pinit_simple(pmap); |
2358 | pmap->pm_flags &= ~PMAP_FLAG_SIMPLE; | |
c8fe38ae MD |
2359 | |
2360 | /* | |
2361 | * No need to allocate page table space yet but we do need a valid | |
2362 | * page directory table. | |
2363 | */ | |
48ffc236 JG |
2364 | if (pmap->pm_pml4 == NULL) { |
2365 | pmap->pm_pml4 = | |
1eeaf6b2 | 2366 | (pml4_entry_t *)kmem_alloc_pageable(kernel_map, |
4611d87f | 2367 | PAGE_SIZE * 2, |
3091de50 | 2368 | VM_SUBSYS_PML4); |
4611d87f | 2369 | pmap->pm_pml4_iso = (void *)((char *)pmap->pm_pml4 + PAGE_SIZE); |
c8fe38ae MD |
2370 | } |
2371 | ||
2372 | /* | |
4611d87f MD |
2373 | * Allocate the PML4e table, which wires it even though it isn't |
2374 | * being entered into some higher level page table (it being the | |
2375 | * highest level). If one is already cached we don't have to do | |
2376 | * anything. | |
c8fe38ae | 2377 | */ |
701c977e MD |
2378 | if ((pv = pmap->pm_pmlpv) == NULL) { |
2379 | pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); | |
2380 | pmap->pm_pmlpv = pv; | |
b12defdc | 2381 | pmap_kenter((vm_offset_t)pmap->pm_pml4, |
701c977e MD |
2382 | VM_PAGE_TO_PHYS(pv->pv_m)); |
2383 | pv_put(pv); | |
33fb3ba1 MD |
2384 | |
2385 | /* | |
2386 | * Install DMAP and KMAP. | |
2387 | */ | |
2388 | for (j = 0; j < NDMPML4E; ++j) { | |
2389 | pmap->pm_pml4[DMPML4I + j] = | |
f70051b1 | 2390 | (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | |
a86ce0cd MD |
2391 | pmap->pmap_bits[PG_RW_IDX] | |
2392 | pmap->pmap_bits[PG_V_IDX] | | |
9e24b495 | 2393 | pmap->pmap_bits[PG_A_IDX]; |
33fb3ba1 | 2394 | } |
8ff9866b MD |
2395 | for (j = 0; j < NKPML4E; ++j) { |
2396 | pmap->pm_pml4[KPML4I + j] = | |
2397 | (KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) | | |
2398 | pmap->pmap_bits[PG_RW_IDX] | | |
2399 | pmap->pmap_bits[PG_V_IDX] | | |
9e24b495 | 2400 | pmap->pmap_bits[PG_A_IDX]; |
8ff9866b | 2401 | } |
701c977e | 2402 | |
33fb3ba1 MD |
2403 | /* |
2404 | * install self-referential address mapping entry | |
2405 | */ | |
701c977e | 2406 | pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) | |
a86ce0cd MD |
2407 | pmap->pmap_bits[PG_V_IDX] | |
2408 | pmap->pmap_bits[PG_RW_IDX] | | |
9e24b495 | 2409 | pmap->pmap_bits[PG_A_IDX]; |
701c977e MD |
2410 | } else { |
2411 | KKASSERT(pv->pv_m->flags & PG_MAPPED); | |
2412 | KKASSERT(pv->pv_m->flags & PG_WRITEABLE); | |
b12defdc | 2413 | } |
993bac44 | 2414 | KKASSERT(pmap->pm_pml4[255] == 0); |
4611d87f MD |
2415 | |
2416 | /* | |
2417 | * When implementing an isolated userland pmap, a second PML4e table | |
2418 | * is needed. We use pmap_pml4_pindex() + 1 for convenience, but | |
2419 | * note that we do not operate on this table using our API functions | |
2420 | * so handling of the + 1 case is mostly just to prevent implosions. | |
9e24b495 MD |
2421 | * |
2422 | * We install an isolated version of the kernel PDPs into this | |
2423 | * second PML4e table. The pmap code will mirror all user PDPs | |
2424 | * between the primary and secondary PML4e table. | |
4611d87f | 2425 | */ |
94c5f25a | 2426 | if ((pv = pmap->pm_pmlpv_iso) == NULL && meltdown_mitigation && |
9e24b495 | 2427 | pmap != &iso_pmap) { |
4611d87f MD |
2428 | pv = pmap_allocpte(pmap, pmap_pml4_pindex() + 1, NULL); |
2429 | pmap->pm_pmlpv_iso = pv; | |
2430 | pmap_kenter((vm_offset_t)pmap->pm_pml4_iso, | |
2431 | VM_PAGE_TO_PHYS(pv->pv_m)); | |
2432 | pv_put(pv); | |
2433 | ||
2434 | /* | |
9e24b495 MD |
2435 | * Install an isolated version of the kernel pmap for |
2436 | * user consumption, using PDPs constructed in iso_pmap. | |
4611d87f MD |
2437 | */ |
2438 | for (j = 0; j < NKPML4E; ++j) { | |
2439 | pmap->pm_pml4_iso[KPML4I + j] = | |
9e24b495 | 2440 | iso_pmap.pm_pml4[KPML4I + j]; |
4611d87f | 2441 | } |
4611d87f MD |
2442 | } else if (pv) { |
2443 | KKASSERT(pv->pv_m->flags & PG_MAPPED); | |
2444 | KKASSERT(pv->pv_m->flags & PG_WRITEABLE); | |
2445 | } | |
d7f50089 YY |
2446 | } |
2447 | ||
2448 | /* | |
c8fe38ae MD |
2449 | * Clean up a pmap structure so it can be physically freed. This routine |
2450 | * is called by the vmspace dtor function. A great deal of pmap data is | |
2451 | * left passively mapped to improve vmspace management so we have a bit | |
2452 | * of cleanup work to do here. | |
d7f50089 YY |
2453 | */ |
2454 | void | |
c8fe38ae | 2455 | pmap_puninit(pmap_t pmap) |
d7f50089 | 2456 | { |
701c977e | 2457 | pv_entry_t pv; |
c8fe38ae MD |
2458 | vm_page_t p; |
2459 | ||
c07315c4 | 2460 | KKASSERT(CPUMASK_TESTZERO(pmap->pm_active)); |
701c977e MD |
2461 | if ((pv = pmap->pm_pmlpv) != NULL) { |
2462 | if (pv_hold_try(pv) == 0) | |
2463 | pv_lock(pv); | |
8e2efb11 | 2464 | KKASSERT(pv == pmap->pm_pmlpv); |
c2830aa6 | 2465 | p = pmap_remove_pv_page(pv, 1); |
76f1911e | 2466 | pv_free(pv, NULL); |
a7a03a5f | 2467 | pv = NULL; /* safety */ |
48ffc236 | 2468 | pmap_kremove((vm_offset_t)pmap->pm_pml4); |
b12defdc | 2469 | vm_page_busy_wait(p, FALSE, "pgpun"); |
831a8507 | 2470 | KKASSERT(p->flags & PG_UNQUEUED); |
b12defdc | 2471 | vm_page_unwire(p, 0); |
701c977e | 2472 | vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); |
701c977e MD |
2473 | vm_page_free(p); |
2474 | pmap->pm_pmlpv = NULL; | |
c8fe38ae | 2475 | } |
4611d87f MD |
2476 | if ((pv = pmap->pm_pmlpv_iso) != NULL) { |
2477 | if (pv_hold_try(pv) == 0) | |
2478 | pv_lock(pv); | |
2479 | KKASSERT(pv == pmap->pm_pmlpv_iso); | |
c2830aa6 | 2480 | p = pmap_remove_pv_page(pv, 1); |
4611d87f MD |
2481 | pv_free(pv, NULL); |
2482 | pv = NULL; /* safety */ | |
2483 | pmap_kremove((vm_offset_t)pmap->pm_pml4_iso); | |
2484 | vm_page_busy_wait(p, FALSE, "pgpun"); | |
831a8507 | 2485 | KKASSERT(p->flags & PG_UNQUEUED); |
4611d87f MD |
2486 | vm_page_unwire(p, 0); |
2487 | vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); | |
2488 | vm_page_free(p); | |
2489 | pmap->pm_pmlpv_iso = NULL; | |
2490 | } | |
48ffc236 | 2491 | if (pmap->pm_pml4) { |
bfc09ba0 | 2492 | KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys)); |
1eeaf6b2 | 2493 | kmem_free(kernel_map, |
4611d87f | 2494 | (vm_offset_t)pmap->pm_pml4, PAGE_SIZE * 2); |
48ffc236 | 2495 | pmap->pm_pml4 = NULL; |
4611d87f | 2496 | pmap->pm_pml4_iso = NULL; |
c8fe38ae | 2497 | } |
701c977e MD |
2498 | KKASSERT(pmap->pm_stats.resident_count == 0); |
2499 | KKASSERT(pmap->pm_stats.wired_count == 0); | |
d7f50089 YY |
2500 | } |
2501 | ||
2502 | /* | |
73b1bfb1 | 2503 | * This function is now unused (used to add the pmap to the pmap_list) |
d7f50089 YY |
2504 | */ |
2505 | void | |
c8fe38ae | 2506 | pmap_pinit2(struct pmap *pmap) |
d7f50089 YY |
2507 | { |
2508 | } | |
2509 | ||
29701988 AL |
2510 | /* |
2511 | * Transform an initialized pmap for Intel EPT. | |
2512 | */ | |
2513 | void | |
2514 | pmap_ept_transform(pmap_t pmap, int flags) | |
2515 | { | |
2516 | uint64_t pmap_bits_ept[PG_BITS_SIZE] = { | |
2517 | [TYPE_IDX] = EPT_PMAP, | |
2518 | [PG_V_IDX] = EPT_PG_READ | EPT_PG_EXECUTE, | |
2519 | [PG_RW_IDX] = EPT_PG_WRITE, | |
2520 | [PG_U_IDX] = 0, /* no support in EPT */ | |
2521 | [PG_A_IDX] = EPT_PG_A, | |
2522 | [PG_M_IDX] = EPT_PG_M, | |
2523 | [PG_PS_IDX] = EPT_PG_PS, | |
2524 | [PG_G_IDX] = 0, /* no support in EPT */ | |
2525 | [PG_W_IDX] = EPT_PG_AVAIL1, | |
2526 | [PG_MANAGED_IDX] = EPT_PG_AVAIL2, | |
2527 | [PG_N_IDX] = EPT_PG_IGNORE_PAT | EPT_MEM_TYPE_UC, | |
2528 | [PG_NX_IDX] = 0, /* no support in EPT */ | |
2529 | }; | |
2530 | uint64_t protection_codes_ept[PROTECTION_CODES_SIZE] = { | |
2531 | [VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE ] = 0, | |
2532 | [VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE ] = 0, | |
2533 | [VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE] = 0, | |
2534 | [VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE] = 0, | |
2535 | [VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE ] = | |
2536 | pmap_bits_ept[PG_RW_IDX], | |
2537 | [VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE] = | |
2538 | pmap_bits_ept[PG_RW_IDX], | |
2539 | [VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE ] = | |
2540 | pmap_bits_ept[PG_RW_IDX], | |
2541 | [VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE] = | |
2542 | pmap_bits_ept[PG_RW_IDX], | |
2543 | }; | |
2544 | pt_entry_t pmap_cache_bits_ept[PAT_INDEX_SIZE] = { | |
2545 | [PAT_UNCACHEABLE] = EPT_PG_IGNORE_PAT | EPT_MEM_TYPE_UC, | |
2546 | [PAT_WRITE_COMBINING] = EPT_PG_IGNORE_PAT | EPT_MEM_TYPE_WC, | |
2547 | [PAT_WRITE_THROUGH] = EPT_PG_IGNORE_PAT | EPT_MEM_TYPE_WT, | |
2548 | [PAT_WRITE_PROTECTED] = EPT_PG_IGNORE_PAT | EPT_MEM_TYPE_WP, | |
2549 | [PAT_WRITE_BACK] = EPT_PG_IGNORE_PAT | EPT_MEM_TYPE_WB, | |
2550 | [PAT_UNCACHED] = EPT_PG_IGNORE_PAT | EPT_MEM_TYPE_UC, | |
2551 | }; | |
2552 | pt_entry_t pmap_cache_mask_ept = EPT_PG_IGNORE_PAT | EPT_MEM_TYPE_MASK; | |
2553 | ||
2554 | pmap->pm_flags |= (flags | PMAP_HVM); | |
2555 | bcopy(pmap_bits_ept, pmap->pmap_bits, sizeof(pmap_bits_ept)); | |
2556 | bcopy(protection_codes_ept, pmap->protection_codes, | |
2557 | sizeof(protection_codes_ept)); | |
2558 | bcopy(pmap_cache_bits_ept, pmap->pmap_cache_bits_pte, | |
2559 | sizeof(pmap_cache_bits_ept)); | |
2560 | bcopy(pmap_cache_bits_ept, pmap->pmap_cache_bits_pde, | |
2561 | sizeof(pmap_cache_bits_ept)); | |
2562 | pmap->pmap_cache_mask_pte = pmap_cache_mask_ept; | |
2563 | pmap->pmap_cache_mask_pde = pmap_cache_mask_ept; | |
2564 | ||
2565 | /* | |
2566 | * Zero out page directories. These are only used by the VM. Note | |
2567 | * that the valid area is two pages if there is a pm_pmlpv_iso PTE | |
2568 | * installed, otherwise it is only one page. The ISO page isn't used | |
2569 | * either way but clean it out anyway if it exists. | |
2570 | */ | |
2571 | if (pmap->pm_pmlpv_iso != NULL) | |
2572 | bzero(pmap->pm_pml4, PAGE_SIZE * 2); | |
2573 | else | |
2574 | bzero(pmap->pm_pml4, PAGE_SIZE); | |
2575 | } | |
2576 | ||
bb11cce6 AL |
2577 | /* |
2578 | * Transform an initialized pmap for AMD NPT/RVI. | |
2579 | */ | |
2580 | void | |
2581 | pmap_npt_transform(pmap_t pmap, int flags) | |
2582 | { | |
2583 | uint64_t protection_codes_npt[PROTECTION_CODES_SIZE] = { | |
2584 | [VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE ] = 0, | |
2585 | [VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE ] = 0, | |
2586 | [VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE] = 0, | |
2587 | [VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE] = 0, | |
2588 | [VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE ] = | |
2589 | pmap_bits_default[PG_RW_IDX], | |
2590 | [VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE] = | |
2591 | pmap_bits_default[PG_RW_IDX], | |
2592 | [VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE ] = | |
2593 | pmap_bits_default[PG_RW_IDX], | |
2594 | [VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE] = | |
2595 | pmap_bits_default[PG_RW_IDX], | |
2596 | }; | |
2597 | ||
2598 | pmap->pm_flags |= (flags | PMAP_HVM); | |
2599 | pmap->pmap_bits[TYPE_IDX] = NPT_PMAP; | |
2600 | /* Set PG_G and PG_NX bits to 0, similar to the EPT case above. */ | |
2601 | pmap->pmap_bits[PG_G_IDX] = 0; | |
2602 | pmap->pmap_bits[PG_NX_IDX] = 0; | |
2603 | ||
2604 | bcopy(protection_codes_npt, pmap->protection_codes, | |
2605 | sizeof(protection_codes_npt)); | |
2606 | ||
2607 | if (pmap->pm_pmlpv_iso != NULL) | |
2608 | bzero(pmap->pm_pml4, PAGE_SIZE * 2); | |
2609 | else | |
2610 | bzero(pmap->pm_pml4, PAGE_SIZE); | |
2611 | } | |
2612 | ||
d7f50089 | 2613 | /* |
701c977e MD |
2614 | * This routine is called when various levels in the page table need to |
2615 | * be populated. This routine cannot fail. | |
d7f50089 | 2616 | * |
701c977e MD |
2617 | * This function returns two locked pv_entry's, one representing the |
2618 | * requested pv and one representing the requested pv's parent pv. If | |
a7a03a5f MD |
2619 | * an intermediate page table does not exist it will be created, mapped, |
2620 | * wired, and the parent page table will be given an additional hold | |
2621 | * count representing the presence of the child pv_entry. | |
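*
* Added commentary (believed accurate from the index arithmetic below):
* ptepindex values live in a single linear namespace.  Terminal PTE
* indices come first, followed by the PT, PD, and PDP page indices
* (offset by NUPTE_TOTAL, NUPT_TOTAL, and NUPD_TOTAL respectively),
* with pmap_pml4_pindex() at the very top.  This is what allows one
* RB tree per pmap to hold the pv_entry's for every level.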
d7f50089 | 2622 | */ |
bfc09ba0 | 2623 | static |
701c977e MD |
2624 | pv_entry_t |
2625 | pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp) | |
d7f50089 | 2626 | { |
701c977e | 2627 | pt_entry_t *ptep; |
4611d87f | 2628 | pt_entry_t *ptep_iso; |
701c977e MD |
2629 | pv_entry_t pv; |
2630 | pv_entry_t pvp; | |
76f1911e | 2631 | pt_entry_t v; |
701c977e MD |
2632 | vm_page_t m; |
2633 | int isnew; | |
921c891e | 2634 | int ispt; |
701c977e | 2635 | |
c8fe38ae | 2636 | /* |
701c977e MD |
2637 | * If the pv already exists and we aren't being asked for the |
2638 | * parent page table page we can just return it. A locked+held pv | |
8e2efb11 MD |
2639 | * is returned. The pv will also have a second hold related to the |
2640 | * pmap association that we don't have to worry about. | |
c8fe38ae | 2641 | */ |
921c891e | 2642 | ispt = 0; |
701c977e MD |
2643 | pv = pv_alloc(pmap, ptepindex, &isnew); |
2644 | if (isnew == 0 && pvpp == NULL) | |
2645 | return(pv); | |
2646 | ||
701c977e | 2647 | /* |
567a6398 | 2648 | * DragonFly doesn't use PV's to represent terminal PTEs any more. |
0600465e MD |
2649 | * The index range is still used for placemarkers, but not for |
2650 | * actual pv_entry's. | |
701c977e | 2651 | */ |
567a6398 | 2652 | KKASSERT(ptepindex >= pmap_pt_pindex(0)); |
567a6398 MD |
2653 | |
2654 | /* | |
2655 | * Note that pt_pv's are only returned for user VAs. We assert that | |
2656 | * a pt_pv is not being requested for kernel VAs. The kernel | |
2657 | * pre-wires all higher-level page tables so don't overload managed | |
2658 | * higher-level page tables on top of it! | |
2659 | * | |
2660 | * However, it's convenient for us to allow the case when creating | |
2661 | * iso_pmap. This is a bit of a hack but it simplifies iso_pmap | |
2662 | * a lot. | |
2663 | */ | |
c8fe38ae | 2664 | |
582f286d MD |
2665 | /* |
2666 | * The kernel never uses managed PT/PD/PDP pages. | |
2667 | */ | |
c713db65 | 2668 | KKASSERT(pmap != kernel_pmap); |
582f286d | 2669 | |
c8fe38ae | 2670 | /* |
701c977e MD |
2671 | * Non-terminal PVs allocate a VM page to represent the page table, |
2672 | * so we have to resolve pvp and calculate ptepindex for the pvp | |
2673 | * and then for the page table entry index in the pvp for | |
2674 | * fall-through. | |
c8fe38ae | 2675 | */ |
701c977e | 2676 | if (ptepindex < pmap_pd_pindex(0)) { |
4a4ea614 | 2677 | /* |
701c977e | 2678 | * pv is PT, pvp is PD |
4a4ea614 | 2679 | */ |
701c977e MD |
2680 | ptepindex = (ptepindex - pmap_pt_pindex(0)) >> NPDEPGSHIFT; |
2681 | ptepindex += NUPTE_TOTAL + NUPT_TOTAL; | |
2682 | pvp = pmap_allocpte(pmap, ptepindex, NULL); | |
701c977e | 2683 | |
1b2e0b92 | 2684 | /* |
701c977e | 2685 | * PT index in PD |
1b2e0b92 | 2686 | */ |
701c977e MD |
2687 | ptepindex = pv->pv_pindex - pmap_pt_pindex(0); |
2688 | ptepindex &= ((1ul << NPDEPGSHIFT) - 1); | |
921c891e | 2689 | ispt = 1; |
701c977e | 2690 | } else if (ptepindex < pmap_pdp_pindex(0)) { |
1b2e0b92 | 2691 | /* |
701c977e | 2692 | * pv is PD, pvp is PDP |
921c891e MD |
2693 | * |
2694 | * SIMPLE PMAP NOTE: Simple pmaps do not allocate above | |
2695 | * the PD. | |
1b2e0b92 | 2696 | */ |
701c977e MD |
2697 | ptepindex = (ptepindex - pmap_pd_pindex(0)) >> NPDPEPGSHIFT; |
2698 | ptepindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; | |
921c891e MD |
2699 | |
2700 | if (pmap->pm_flags & PMAP_FLAG_SIMPLE) { | |
2701 | KKASSERT(pvpp == NULL); | |
2702 | pvp = NULL; | |
2703 | } else { | |
2704 | pvp = pmap_allocpte(pmap, ptepindex, NULL); | |
2705 | } | |
701c977e MD |
2706 | |
2707 | /* | |
2708 | * PD index in PDP | |
2709 | */ | |
2710 | ptepindex = pv->pv_pindex - pmap_pd_pindex(0); | |
2711 | ptepindex &= ((1ul << NPDPEPGSHIFT) - 1); | |
2712 | } else if (ptepindex < pmap_pml4_pindex()) { | |
700e22f7 | 2713 | /* |
701c977e | 2714 | * pv is PDP, pvp is the root pml4 table |
1b2e0b92 | 2715 | */ |
701c977e | 2716 | pvp = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL); |
700e22f7 | 2717 | |
701c977e MD |
2718 | /* |
2719 | * PDP index in PML4 | |
2720 | */ | |
2721 | ptepindex = pv->pv_pindex - pmap_pdp_pindex(0); | |
2722 | ptepindex &= ((1ul << NPML4EPGSHIFT) - 1); | |
2723 | } else { | |
2724 | /* | |
2725 | * pv represents the top-level PML4, there is no parent. | |
2726 | */ | |
2727 | pvp = NULL; | |
1b2e0b92 | 2728 | } |
700e22f7 | 2729 | |
76f1911e MD |
2730 | if (isnew == 0) |
2731 | goto notnew; | |
2732 | ||
700e22f7 | 2733 | /* |
737b020b | 2734 | * (isnew) is TRUE. |
a7a03a5f MD |
2735 | * |
2736 | * (1) Add a wire count to the parent page table (pvp). | |
2737 | * (2) Allocate a VM page for the page table. | |
2738 | * (3) Enter the VM page into the parent page table. | |
701c977e MD |
2739 | * |
2740 | * page table pages are marked PG_WRITEABLE and PG_MAPPED. | |
1b2e0b92 | 2741 | */ |
a7a03a5f MD |
2742 | if (pvp) |
2743 | vm_page_wire_quick(pvp->pv_m); | |
2744 | ||
701c977e MD |
2745 | for (;;) { |
2746 | m = vm_page_alloc(NULL, pv->pv_pindex, | |
2747 | VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | | |
2748 | VM_ALLOC_INTERRUPT); | |
2749 | if (m) | |
2750 | break; | |
2751 | vm_wait(0); | |
1b2e0b92 | 2752 | } |
76f1911e | 2753 | vm_page_wire(m); /* wire for mapping in parent */ |
76f1911e MD |
2754 | pmap_zero_page(VM_PAGE_TO_PHYS(m)); |
2755 | m->valid = VM_PAGE_BITS_ALL; | |
831a8507 MD |
2756 | vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE | PG_UNQUEUED); |
2757 | KKASSERT(m->queue == PQ_NONE); | |
76f1911e | 2758 | |
701c977e | 2759 | pv->pv_m = m; |
701c977e MD |
2760 | |
2761 | /* | |
737b020b | 2762 | * (isnew) is TRUE. |
76f1911e | 2763 | * |
a7a03a5f MD |
2764 | * Wire the page into pvp. Bump the resident_count for the pmap. |
2765 | * There is no pvp for the top level, address the pm_pml4[] array | |
2766 | * directly. | |
701c977e MD |
2767 | * |
2768 | * If the caller wants the parent we return it, otherwise | |
2769 | * we just put it away. | |
2770 | * | |
2771 | * No interlock is needed for pte 0 -> non-zero. | |
921c891e MD |
2772 | * |
2773 | * In the situation where *ptep is valid we might have an unmanaged | |
2774 | * page table page shared from another page table which we need to | |
2775 | * unshare before installing our private page table page. | |
701c977e MD |
2776 | */ |
2777 | if (pvp) { | |
76f1911e | 2778 | v = VM_PAGE_TO_PHYS(m) | |
9e24b495 | 2779 | (pmap->pmap_bits[PG_RW_IDX] | |
76f1911e | 2780 | pmap->pmap_bits[PG_V_IDX] | |
9e24b495 MD |
2781 | pmap->pmap_bits[PG_A_IDX]); |
2782 | if (ptepindex < NUPTE_USER) | |
2783 | v |= pmap->pmap_bits[PG_U_IDX]; | |
2784 | if (ptepindex < pmap_pt_pindex(0)) | |
2785 | v |= pmap->pmap_bits[PG_M_IDX]; | |
2786 | ||
701c977e | 2787 | ptep = pv_pte_lookup(pvp, ptepindex); |
4611d87f MD |
2788 | if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso) |
2789 | ptep_iso = pv_pte_lookup(pmap->pm_pmlpv_iso, ptepindex); | |
2790 | else | |
2791 | ptep_iso = NULL; | |
a86ce0cd | 2792 | if (*ptep & pmap->pmap_bits[PG_V_IDX]) { |
0600465e | 2793 | panic("pmap_allocpte: ptpte present without pv_entry!"); |
76f1911e MD |
2794 | } else { |
2795 | pt_entry_t pte; | |
2796 | ||
2797 | pte = atomic_swap_long(ptep, v); | |
4611d87f MD |
2798 | if (ptep_iso) |
2799 | atomic_swap_long(ptep_iso, v); | |
76f1911e MD |
2800 | if (pte != 0) { |
2801 | kprintf("install pgtbl mixup 0x%016jx " | |
2802 | "old/new 0x%016jx/0x%016jx\n", | |
2803 | (intmax_t)ptepindex, pte, v); | |
2804 | } | |
921c891e | 2805 | } |
701c977e MD |
2806 | } |
2807 | vm_page_wakeup(m); | |
76f1911e | 2808 | |
737b020b | 2809 | notnew: |
76f1911e | 2810 | /* |
737b020b | 2811 | * (isnew) may be TRUE or FALSE. |
76f1911e | 2812 | */ |
76f1911e MD |
2813 | if (pvp) { |
2814 | KKASSERT(pvp->pv_m != NULL); | |
2815 | ptep = pv_pte_lookup(pvp, ptepindex); | |
2816 | v = VM_PAGE_TO_PHYS(pv->pv_m) | | |
9e24b495 | 2817 | (pmap->pmap_bits[PG_RW_IDX] | |
76f1911e | 2818 | pmap->pmap_bits[PG_V_IDX] | |
9e24b495 MD |
2819 | pmap->pmap_bits[PG_A_IDX]); |
2820 | if (ptepindex < NUPTE_USER) | |
2821 | v |= pmap->pmap_bits[PG_U_IDX]; | |
2822 | if (ptepindex < pmap_pt_pindex(0)) | |
2823 | v |= pmap->pmap_bits[PG_M_IDX]; | |
76f1911e MD |
2824 | if (*ptep != v) { |
2825 | kprintf("mismatched upper level pt %016jx/%016jx\n", | |
2826 | *ptep, v); | |
2827 | } | |
2828 | } | |
701c977e MD |
2829 | if (pvpp) |
2830 | *pvpp = pvp; | |
2831 | else if (pvp) | |
2832 | pv_put(pvp); | |
2833 | return (pv); | |
2834 | } | |
d7f50089 YY |
2835 | |
2836 | /* | |
701c977e MD |
2837 | * Release any resources held by the given physical map. |
2838 | * | |
2839 | * Called when a pmap initialized by pmap_pinit is being released. Should | |
2840 | * only be called if the map contains no valid mappings. | |
d7f50089 | 2841 | */ |
701c977e MD |
2842 | struct pmap_release_info { |
2843 | pmap_t pmap; | |
2844 | int retry; | |
a7a03a5f | 2845 | pv_entry_t pvp; |
701c977e MD |
2846 | }; |
2847 | ||
2848 | static int pmap_release_callback(pv_entry_t pv, void *data); | |
2849 | ||
2850 | void | |
2851 | pmap_release(struct pmap *pmap) | |
c8fe38ae | 2852 | { |
701c977e MD |
2853 | struct pmap_release_info info; |
2854 | ||
c07315c4 MD |
2855 | KASSERT(CPUMASK_TESTZERO(pmap->pm_active), |
2856 | ("pmap still active! %016jx", | |
2857 | (uintmax_t)CPUMASK_LOWMASK(pmap->pm_active))); | |
701c977e | 2858 | |
73b1bfb1 MD |
2859 | /* |
2860 | * There is no longer a pmap_list; if there were, we would remove the
2861 | * pmap from it here. | |
2862 | */ | |
c8fe38ae MD |
2863 | |
2864 | /* | |
701c977e MD |
2865 | * Pull pv's off the RB tree in order from low to high and release |
2866 | * each page. | |
c8fe38ae | 2867 | */ |
701c977e MD |
2868 | info.pmap = pmap; |
2869 | do { | |
2870 | info.retry = 0; | |
a7a03a5f MD |
2871 | info.pvp = NULL; |
2872 | ||
701c977e MD |
2873 | spin_lock(&pmap->pm_spin); |
2874 | RB_SCAN(pv_entry_rb_tree, &pmap->pm_pvroot, NULL, | |
2875 | pmap_release_callback, &info); | |
2876 | spin_unlock(&pmap->pm_spin); | |
a7a03a5f MD |
2877 | |
2878 | if (info.pvp) | |
2879 | pv_put(info.pvp); | |
701c977e MD |
2880 | } while (info.retry); |
2881 | ||
a5fc46c9 MD |
2882 | |
2883 | /* | |
4611d87f MD |
2884 | * One resident page (the pml4 page) should remain. Two if |
2885 | * the pmap has implemented an isolated userland PML4E table. | |
701c977e | 2886 | * No wired pages should remain. |
a5fc46c9 | 2887 | */ |
4611d87f MD |
2888 | int expected_res = 0; |
2889 | ||
2890 | if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0) | |
2891 | ++expected_res; | |
2892 | if (pmap->pm_pmlpv_iso) | |
2893 | ++expected_res; | |
2894 | ||
76f1911e | 2895 | #if 1 |
4611d87f | 2896 | if (pmap->pm_stats.resident_count != expected_res || |
76f1911e MD |
2897 | pmap->pm_stats.wired_count != 0) { |
2898 | kprintf("fatal pmap problem - pmap %p flags %08x " | |
2899 | "rescnt=%jd wirecnt=%jd\n", | |
2900 | pmap, | |
2901 | pmap->pm_flags, | |
2902 | pmap->pm_stats.resident_count, | |
2903 | pmap->pm_stats.wired_count); | |
2904 | tsleep(pmap, 0, "DEAD", 0); | |
2905 | } | |
2906 | #else | |
4611d87f | 2907 | KKASSERT(pmap->pm_stats.resident_count == expected_res); |
701c977e | 2908 | KKASSERT(pmap->pm_stats.wired_count == 0); |
76f1911e | 2909 | #endif |
701c977e MD |
2910 | } |
2911 | ||
a7a03a5f MD |
2912 | /* |
2913 | * Called from low to high. We must cache the proper parent pv so we | |
2914 | * can adjust its wired count. | |
2915 | */ | |
701c977e MD |
2916 | static int |
2917 | pmap_release_callback(pv_entry_t pv, void *data) | |
2918 | { | |
2919 | struct pmap_release_info *info = data; | |
2920 | pmap_t pmap = info->pmap; | |
a7a03a5f | 2921 | vm_pindex_t pindex; |
921c891e | 2922 | int r; |
701c977e | 2923 | |
76f1911e MD |
2924 | /* |
2925 | * Acquire a held and locked pv, check for release race | |
2926 | */ | |
2927 | pindex = pv->pv_pindex; | |
a7a03a5f MD |
2928 | if (info->pvp == pv) { |
2929 | spin_unlock(&pmap->pm_spin); | |
2930 | info->pvp = NULL; | |
2931 | } else if (pv_hold_try(pv)) { | |
701c977e MD |
2932 | spin_unlock(&pmap->pm_spin); |
2933 | } else { | |
2934 | spin_unlock(&pmap->pm_spin); | |
2935 | pv_lock(pv); | |
a3a33e50 | 2936 | pv_put(pv); |
5e78aef9 | 2937 | info->retry = 1; |
07fc7b93 MD |
2938 | spin_lock(&pmap->pm_spin); |
2939 | ||
e989b548 | 2940 | return -1; |
48ffc236 | 2941 | } |
e989b548 | 2942 | KKASSERT(pv->pv_pmap == pmap && pindex == pv->pv_pindex); |
a7a03a5f MD |
2943 | |
2944 | if (pv->pv_pindex < pmap_pt_pindex(0)) { | |
2945 | /* | |
76f1911e | 2946 | * I am PTE, parent is PT |
a7a03a5f MD |
2947 | */ |
2948 | pindex = pv->pv_pindex >> NPTEPGSHIFT; | |
2949 | pindex += NUPTE_TOTAL; | |
2950 | } else if (pv->pv_pindex < pmap_pd_pindex(0)) { | |
2951 | /* | |
76f1911e | 2952 | * I am PT, parent is PD |
a7a03a5f MD |
2953 | */ |
2954 | pindex = (pv->pv_pindex - NUPTE_TOTAL) >> NPDEPGSHIFT; | |
2955 | pindex += NUPTE_TOTAL + NUPT_TOTAL; | |
2956 | } else if (pv->pv_pindex < pmap_pdp_pindex(0)) { | |
2957 | /* | |
76f1911e | 2958 | * I am PD, parent is PDP |
a7a03a5f MD |
2959 | */ |
2960 | pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL) >> | |
2961 | NPDPEPGSHIFT; | |
2962 | pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL; | |
2963 | } else if (pv->pv_pindex < pmap_pml4_pindex()) { | |
2964 | /* | |
4611d87f MD |
2965 | * I am PDP, parent is PML4. We always calculate the |
2966 | * normal PML4 here, not the isolated PML4. | |
a7a03a5f | 2967 | */ |
a7a03a5f MD |
2968 | pindex = pmap_pml4_pindex(); |
2969 | } else { | |
2970 | /* | |
2971 | * parent is NULL | |
2972 | */ | |
2973 | if (info->pvp) { | |
2974 | pv_put(info->pvp); | |
2975 | info->pvp = NULL; | |
2976 | } | |
2977 | pindex = 0; | |
2978 | } | |
2979 | if (pindex) { | |
2980 | if (info->pvp && info->pvp->pv_pindex != pindex) { | |
2981 | pv_put(info->pvp); | |
2982 | info->pvp = NULL; | |
2983 | } | |
2984 | if (info->pvp == NULL) | |
76f1911e | 2985 | info->pvp = pv_get(pmap, pindex, NULL); |
a7a03a5f MD |
2986 | } else { |
2987 | if (info->pvp) { | |
2988 | pv_put(info->pvp); | |
2989 | info->pvp = NULL; | |
2990 | } | |
2991 | } | |
2992 | r = pmap_release_pv(pv, info->pvp, NULL); | |
921c891e | 2993 | spin_lock(&pmap->pm_spin); |
76f1911e | 2994 | |
921c891e MD |
2995 | return(r); |
2996 | } | |
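/*
 * A hedged sketch of the parent-pindex arithmetic used by the callback
 * above.  The four page table levels share one flat pindex space:
 * [0, NUPTE_TOTAL) are terminal PTE indexes, followed by the PT, PD and
 * PDP page indexes, with the single PML4 page on top.  The constants
 * below are illustrative (512 entries per level, 2^35 user pages); the
 * kernel's NUPTE_TOTAL/NUPT_TOTAL/NUPD_TOTAL are defined elsewhere.
 */
#include <stdint.h>
#include <stdio.h>

#define NPESHIFT	9				/* 512 entries/level */
#define NUPTE_TOTAL	(1ULL << 35)			/* terminal pages */
#define NUPT_TOTAL	(NUPTE_TOTAL >> NPESHIFT)	/* PT pages */
#define NUPD_TOTAL	(NUPT_TOTAL >> NPESHIFT)	/* PD pages */
#define NUPDP_TOTAL	(NUPD_TOTAL >> NPESHIFT)	/* PDP pages */

#define PT_BASE		NUPTE_TOTAL
#define PD_BASE		(NUPTE_TOTAL + NUPT_TOTAL)
#define PDP_BASE	(NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL)
#define PML4_PINDEX	(PDP_BASE + NUPDP_TOTAL)

static uint64_t
parent_pindex(uint64_t pindex)
{
	if (pindex < PT_BASE)		/* I am PTE, parent is PT */
		return (PT_BASE + (pindex >> NPESHIFT));
	if (pindex < PD_BASE)		/* I am PT, parent is PD */
		return (PD_BASE + ((pindex - PT_BASE) >> NPESHIFT));
	if (pindex < PDP_BASE)		/* I am PD, parent is PDP */
		return (PDP_BASE + ((pindex - PD_BASE) >> NPESHIFT));
	if (pindex < PML4_PINDEX)	/* I am PDP, parent is PML4 */
		return (PML4_PINDEX);
	return (0);			/* PML4: no parent */
}

int
main(void)
{
	printf("parent of pte pindex 1000: %llu (PT base %llu)\n",
	       (unsigned long long)parent_pindex(1000),
	       (unsigned long long)PT_BASE);
	return (0);
}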
2997 | ||
2998 | /* | |
2999 | * Called with held (i.e. also locked) pv. This function will dispose of | |
3000 | * the lock along with the pv. | |
01d2a79f MD |
3001 | * |
3002 | * If the caller already holds the locked parent page table for pv it | |
3003 | * must pass it as pvp, allowing us to avoid a deadlock, else it can | |
3004 | * pass NULL for pvp. | |
921c891e MD |
3005 | */ |
3006 | static int | |
ccd67bf6 | 3007 | pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk) |
921c891e MD |
3008 | { |
3009 | vm_page_t p; | |
48ffc236 | 3010 | |
701c977e MD |
3011 | /* |
3012 | * The pmap is currently not spinlocked; pv is held+locked.
3013 | * Remove the pv's page from its parent's page table. The | |
3014 | * parent's page table page's wire_count will be decremented. | |
e3e69557 MD |
3015 | * |
3016 | * This will clean out the pte at any level of the page table. | |
79f2da03 | 3017 | * If (bulk) is non-NULL all cpus are affected.
a7a03a5f MD |
3018 | * |
3019 | * Do not tear down recursively; it's faster to just let the
3020 | * release run its course. | |
701c977e | 3021 | */ |
a7a03a5f | 3022 | pmap_remove_pv_pte(pv, pvp, bulk, 0); |
c8fe38ae MD |
3023 | |
3024 | /* | |
701c977e MD |
3025 | * Terminal pvs are unhooked from their vm_pages. Because |
3026 | * terminal pages aren't page table pages they aren't wired | |
3027 | * by us, so we have to be sure not to unwire them either. | |
c2830aa6 | 3028 | * |
c2830aa6 MD |
3029 | * XXX It is unclear if this code ever gets called because we |
3030 | * no longer use pv's to track terminal pages. | |
c8fe38ae | 3031 | */ |
701c977e | 3032 | if (pv->pv_pindex < pmap_pt_pindex(0)) { |
c2830aa6 | 3033 | pmap_remove_pv_page(pv, 0); |
701c977e MD |
3034 | goto skip; |
3035 | } | |
c8fe38ae | 3036 | |
c8fe38ae | 3037 | /* |
701c977e MD |
3038 | * We leave the top-level page table page cached, wired, and |
3039 | * mapped in the pmap until the dtor function (pmap_puninit()) | |
3040 | * gets called. | |
e8510e54 | 3041 | * |
701c977e MD |
3042 | * Since we are leaving the top-level pv intact we need |
3043 | * to break out of what would otherwise be an infinite loop. | |
4611d87f MD |
3044 | * |
3045 | * This covers both the normal and the isolated PML4 page. | |
c8fe38ae | 3046 | */ |
4611d87f | 3047 | if (pv->pv_pindex >= pmap_pml4_pindex()) { |
701c977e | 3048 | pv_put(pv); |
701c977e MD |
3049 | return(-1); |
3050 | } | |
3051 | ||
3052 | /* | |
3053 | * For page table pages (other than the top-level page), | |
3054 | * remove and free the vm_page. The representative mapping
3055 | * removed above by pmap_remove_pv_pte() did not undo the | |
3056 | * last wire_count so we have to do that as well. | |
3057 | */ | |
c2830aa6 | 3058 | p = pmap_remove_pv_page(pv, 1); |
701c977e | 3059 | vm_page_busy_wait(p, FALSE, "pmaprl"); |
701c977e | 3060 | if (p->wire_count != 1) { |
567a6398 MD |
3061 | const char *tstr; |
3062 | ||
3063 | if (pv->pv_pindex >= pmap_pdp_pindex(0)) | |
3064 | tstr = "PDP"; | |
3065 | else if (pv->pv_pindex >= pmap_pd_pindex(0)) | |
3066 | tstr = "PD"; | |
3067 | else if (pv->pv_pindex >= pmap_pt_pindex(0)) | |
3068 | tstr = "PT"; | |
3069 | else | |
3070 | tstr = "PTE"; | |
3071 | ||
3072 | kprintf("p(%s) p->wire_count was %016lx %d\n", | |
3073 | tstr, pv->pv_pindex, p->wire_count); | |
701c977e MD |
3074 | } |
3075 | KKASSERT(p->wire_count == 1); | |
831a8507 | 3076 | KKASSERT(p->flags & PG_UNQUEUED); |
701c977e MD |
3077 | |
3078 | vm_page_unwire(p, 0); | |
3079 | KKASSERT(p->wire_count == 0); | |
921c891e | 3080 | |
701c977e MD |
3081 | vm_page_free(p); |
3082 | skip: | |
76f1911e | 3083 | pv_free(pv, pvp); |
a7a03a5f | 3084 | |
921c891e | 3085 | return 0; |
701c977e MD |
3086 | } |
3087 | ||
3088 | /* | |
3089 | * This function will remove the pte associated with a pv from its parent. | |
a7a03a5f MD |
3090 | * Terminal pv's are supported. All cpus specified by (bulk) are properly |
3091 | * invalidated. | |
701c977e MD |
3092 | * |
3093 | * The wire count will be dropped on the parent page table. The wire | |
3094 | * count on the page being removed (pv->pv_m) from the parent page table | |
3095 | * is NOT touched. Note that terminal pages will not have any additional | |
3096 | * wire counts while page table pages will have at least one representing | |
3097 | * the mapping, plus others representing sub-mappings. | |
3098 | * | |
3099 | * NOTE: Cannot be called on kernel page table pages; only on KVM
3100 | * terminal pages and on user page table and terminal pages.
3101 | * | |
5ee06c6c MD |
3102 | * NOTE: The pte being removed might be unmanaged, and the pv supplied might |
3103 | * be freshly allocated and not imply that the pte is managed. In this | |
3104 | * case pv->pv_m should be NULL. | |
3105 | * | |
a7a03a5f MD |
3106 | * The pv must be locked. The pvp, if supplied, must be locked. All |
3107 | * supplied pv's will remain locked on return. | |
701c977e MD |
3108 | * |
3109 | * XXX must lock parent pv's if they exist to remove pte XXX | |
3110 | */ | |
3111 | static | |
3112 | void | |
a7a03a5f MD |
3113 | pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk, |
3114 | int destroy) | |
701c977e MD |
3115 | { |
3116 | vm_pindex_t ptepindex = pv->pv_pindex; | |
3117 | pmap_t pmap = pv->pv_pmap; | |
3118 | vm_page_t p; | |
3119 | int gotpvp = 0; | |
48ffc236 | 3120 | |
701c977e | 3121 | KKASSERT(pmap); |
48ffc236 | 3122 | |
4611d87f | 3123 | if (ptepindex >= pmap_pml4_pindex()) { |
b12defdc | 3124 | /* |
76f1911e | 3125 | * We are the top level PML4E table; there is no parent.
4611d87f MD |
3126 | * |
3127 | * This is either the normal or isolated PML4E table. | |
3128 | * Only the normal is used in regular operation, the isolated | |
3129 | * is only passed in when breaking down the whole pmap. | |
b12defdc | 3130 | */ |
701c977e | 3131 | p = pmap->pm_pmlpv->pv_m; |
5ee06c6c | 3132 | KKASSERT(pv->pv_m == p); /* debugging */ |
701c977e | 3133 | } else if (ptepindex >= pmap_pdp_pindex(0)) { |
e8510e54 | 3134 | /* |
76f1911e | 3135 | * Remove a PDP page from the PML4E. This can only occur |
701c977e MD |
3136 | * with user page tables. We do not have to lock the |
3137 | * pml4 PV so just ignore pvp. | |
e8510e54 | 3138 | */ |
701c977e MD |
3139 | vm_pindex_t pml4_pindex; |
3140 | vm_pindex_t pdp_index; | |
3141 | pml4_entry_t *pdp; | |
4611d87f | 3142 | pml4_entry_t *pdp_iso; |
701c977e MD |
3143 | |
3144 | pdp_index = ptepindex - pmap_pdp_pindex(0); | |
3145 | if (pvp == NULL) { | |
3146 | pml4_pindex = pmap_pml4_pindex(); | |
76f1911e | 3147 | pvp = pv_get(pv->pv_pmap, pml4_pindex, NULL); |
921c891e | 3148 | KKASSERT(pvp); |
701c977e | 3149 | gotpvp = 1; |
e8510e54 | 3150 | } |
76f1911e | 3151 | |
701c977e | 3152 | pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)]; |
a86ce0cd | 3153 | KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0); |
701c977e | 3154 | p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME); |
ccd67bf6 | 3155 | pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp, 0); |
4611d87f MD |
3156 | |
3157 | /* | |
3158 | * Also remove the PDP from the isolated PML4E if the | |
3159 | * process uses one. | |
3160 | */ | |
3161 | if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso) { | |
3162 | pdp_iso = &pmap->pm_pml4_iso[pdp_index & | |
3163 | ((1ul << NPML4EPGSHIFT) - 1)]; | |
3164 | pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp_iso, 0); | |
3165 | } | |
5ee06c6c | 3166 | KKASSERT(pv->pv_m == p); /* debugging */ |
701c977e | 3167 | } else if (ptepindex >= pmap_pd_pindex(0)) { |
e8510e54 | 3168 | /* |
76f1911e | 3169 | * Remove a PD page from the PDP |
921c891e MD |
3170 | * |
3171 | * SIMPLE PMAP NOTE: Non-existent pvp's are ok in the case
3172 | * of a simple pmap because it stops at | |
3173 | * the PD page. | |
e8510e54 | 3174 | */ |
701c977e MD |
3175 | vm_pindex_t pdp_pindex; |
3176 | vm_pindex_t pd_index; | |
3177 | pdp_entry_t *pd; | |
48ffc236 | 3178 | |
701c977e | 3179 | pd_index = ptepindex - pmap_pd_pindex(0); |
48ffc236 | 3180 | |
701c977e MD |
3181 | if (pvp == NULL) { |
3182 | pdp_pindex = NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + | |
3183 | (pd_index >> NPML4EPGSHIFT); | |
76f1911e | 3184 | pvp = pv_get(pv->pv_pmap, pdp_pindex, NULL); |
a7a03a5f | 3185 | gotpvp = 1; |
921c891e | 3186 | } |
76f1911e | 3187 | |
921c891e MD |
3188 | if (pvp) { |
3189 | pd = pv_pte_lookup(pvp, pd_index & | |
3190 | ((1ul << NPDPEPGSHIFT) - 1)); | |
a86ce0cd | 3191 | KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0); |
921c891e | 3192 | p = PHYS_TO_VM_PAGE(*pd & PG_FRAME); |
ccd67bf6 | 3193 | pmap_inval_bulk(bulk, (vm_offset_t)-1, pd, 0); |
921c891e MD |
3194 | } else { |
3195 | KKASSERT(pmap->pm_flags & PMAP_FLAG_SIMPLE); | |
3196 | p = pv->pv_m; /* degenerate test later */ | |
701c977e | 3197 | } |
5ee06c6c | 3198 | KKASSERT(pv->pv_m == p); /* debugging */ |
701c977e | 3199 | } else if (ptepindex >= pmap_pt_pindex(0)) { |
e8510e54 | 3200 | /* |
76f1911e | 3201 | * Remove a PT page from the PD |
e8510e54 | 3202 | */ |
701c977e MD |
3203 | vm_pindex_t pd_pindex; |
3204 | vm_pindex_t pt_index; | |
3205 | pd_entry_t *pt; | |
b12defdc | 3206 | |
701c977e MD |
3207 | pt_index = ptepindex - pmap_pt_pindex(0); |
3208 | ||
3209 | if (pvp == NULL) { | |
3210 | pd_pindex = NUPTE_TOTAL + NUPT_TOTAL + | |
3211 | (pt_index >> NPDPEPGSHIFT); | |
76f1911e | 3212 | pvp = pv_get(pv->pv_pmap, pd_pindex, NULL); |
921c891e | 3213 | KKASSERT(pvp); |
701c977e MD |
3214 | gotpvp = 1; |
3215 | } | |
76f1911e | 3216 | |
701c977e | 3217 | pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1)); |
76f1911e MD |
3218 | #if 0 |
3219 | KASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0, | |
3220 | ("*pt unexpectedly invalid %016jx " | |
3221 | "gotpvp=%d ptepindex=%ld ptindex=%ld pv=%p pvp=%p", | |
3222 | *pt, gotpvp, ptepindex, pt_index, pv, pvp)); | |
701c977e | 3223 | p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); |
76f1911e MD |
3224 | #else |
3225 | if ((*pt & pmap->pmap_bits[PG_V_IDX]) == 0) { | |
3226 | kprintf("*pt unexpectedly invalid %016jx " | |
3227 | "gotpvp=%d ptepindex=%ld ptindex=%ld " | |
3228 | "pv=%p pvp=%p\n", | |
3229 | *pt, gotpvp, ptepindex, pt_index, pv, pvp); | |
3230 | tsleep(pt, 0, "DEAD", 0); | |
3231 | p = pv->pv_m; | |
3232 | } else { | |
3233 | p = PHYS_TO_VM_PAGE(*pt & PG_FRAME); | |
3234 | } | |
3235 | #endif | |
ccd67bf6 | 3236 | pmap_inval_bulk(bulk, (vm_offset_t)-1, pt, 0); |
5ee06c6c | 3237 | KKASSERT(pv->pv_m == p); /* debugging */ |
701c977e | 3238 | } else { |
e3c330f0 | 3239 | KKASSERT(0); |
c8fe38ae MD |
3240 | } |
3241 | ||
48ffc236 | 3242 | /* |
a7a03a5f MD |
3243 | * If requested, scrap the underlying pv->pv_m and the underlying |
3244 | * pv. If this is a page-table-page we must also free the page. | |
701c977e | 3245 | * |
a7a03a5f | 3246 | * pvp must be returned locked. |
48ffc236 | 3247 | */ |
a7a03a5f MD |
3248 | if (destroy == 1) { |
3249 | /* | |
3250 | * page table page (PT, PD, PDP, PML4), caller was responsible | |
3251 | * for testing wired_count. | |
3252 | */ | |
a7a03a5f | 3253 | KKASSERT(pv->pv_m->wire_count == 1); |
c2830aa6 | 3254 | p = pmap_remove_pv_page(pv, 1); |
76f1911e | 3255 | pv_free(pv, pvp); |
a7a03a5f MD |
3256 | pv = NULL; |
3257 | ||
a7a03a5f MD |
3258 | vm_page_busy_wait(p, FALSE, "pgpun"); |
3259 | vm_page_unwire(p, 0); | |
3260 | vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE); | |
3261 | vm_page_free(p); | |
c2830aa6 | 3262 | } |
a7a03a5f MD |
3263 | |
3264 | /* | |
3265 | * If we acquired pvp ourselves then we are responsible for | |
3266 | * recursively deleting it. | |
3267 | */ | |
3268 | if (pvp && gotpvp) { | |
3269 | /* | |
3270 | * Recursively destroy higher-level page tables. | |
3271 | * | |
3272 | * This is optional. If we do not, they will still | |
3273 | * be destroyed when the process exits. | |
b67dba3c MD |
3274 | * |
3275 | * NOTE: Do not destroy pv_entry's with extra hold refs, | |
3276 | * a caller may have unlocked it and intends to | |
3277 | * continue to use it. | |
a7a03a5f | 3278 | */ |
582f286d MD |
3279 | if (pmap_dynamic_delete && |
3280 | pvp->pv_m && | |
a7a03a5f | 3281 | pvp->pv_m->wire_count == 1 && |
b67dba3c | 3282 | (pvp->pv_hold & PV_HOLD_MASK) == 2 && |
4611d87f | 3283 | pvp->pv_pindex < pmap_pml4_pindex()) { |
c713db65 | 3284 | if (pmap != kernel_pmap) { |
8e9ad8f9 MD |
3285 | pmap_remove_pv_pte(pvp, NULL, bulk, 1); |
3286 | pvp = NULL; /* safety */ | |
3287 | } else { | |
3288 | kprintf("Attempt to remove kernel_pmap pindex " | |
3289 | "%jd\n", pvp->pv_pindex); | |
3290 | pv_put(pvp); | |
3291 | } | |
a7a03a5f MD |
3292 | } else { |
3293 | pv_put(pvp); | |
3294 | } | |
3295 | } | |
c8fe38ae MD |
3296 | } |
3297 | ||
8e2efb11 MD |
3298 | /* |
3299 | * Remove the vm_page association to a pv. The pv must be locked. | |
3300 | */ | |
bfc09ba0 MD |
3301 | static |
3302 | vm_page_t | |
c2830aa6 | 3303 | pmap_remove_pv_page(pv_entry_t pv, int clrpgbits) |
d7f50089 | 3304 | { |
c8fe38ae MD |
3305 | vm_page_t m; |
3306 | ||
701c977e | 3307 | m = pv->pv_m; |
701c977e | 3308 | pv->pv_m = NULL; |
c2830aa6 MD |
3309 | if (clrpgbits) |
3310 | vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE); | |
a7a03a5f | 3311 | |
52bb73bc | 3312 | return(m); |
d7f50089 YY |
3313 | } |
3314 | ||
3315 | /* | |
c8fe38ae | 3316 | * Grow the number of kernel page table entries, if needed. |
a8cf2878 MD |
3317 | * |
3318 | * This routine is always called to validate any address space | |
3319 | * beyond KERNBASE (for kldloads). kernel_vm_end only governs the address | |
3320 | * space below KERNBASE. | |
d95d5e03 MD |
3321 | * |
3322 | * kernel_map must be locked exclusively by the caller. | |
d7f50089 | 3323 | */ |
c8fe38ae | 3324 | void |
a8cf2878 | 3325 | pmap_growkernel(vm_offset_t kstart, vm_offset_t kend) |
d7f50089 | 3326 | { |
48ffc236 | 3327 | vm_paddr_t paddr; |
c8fe38ae MD |
3328 | vm_offset_t ptppaddr; |
3329 | vm_page_t nkpg; | |
701c977e | 3330 | pd_entry_t *pt, newpt; |
8ff9866b | 3331 | pdp_entry_t *pd, newpd; |
a8cf2878 | 3332 | int update_kernel_vm_end; |
c8fe38ae | 3333 | |
a8cf2878 MD |
3334 | /* |
3335 | * bootstrap kernel_vm_end on first real VM use | |
3336 | */ | |
c8fe38ae | 3337 | if (kernel_vm_end == 0) { |
791c6551 | 3338 | kernel_vm_end = VM_MIN_KERNEL_ADDRESS; |
8ff9866b MD |
3339 | |
3340 | for (;;) { | |
c713db65 | 3341 | pt = pmap_pt(kernel_pmap, kernel_vm_end); |
8ff9866b MD |
3342 | if (pt == NULL) |
3343 | break; | |
c713db65 | 3344 | if ((*pt & kernel_pmap->pmap_bits[PG_V_IDX]) == 0) |
8ff9866b | 3345 | break; |
a8cf2878 | 3346 | kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) & |
8ff9866b | 3347 | ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); |
1eeaf6b2 AL |
3348 | if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) { |
3349 | kernel_vm_end = vm_map_max(kernel_map); | |
48ffc236 JG |
3350 | break; |
3351 | } | |
c8fe38ae MD |
3352 | } |
3353 | } | |
a8cf2878 MD |
3354 | |
3355 | /* | |
3356 | * Fill in the gaps. kernel_vm_end is only adjusted for ranges | |
3357 | * below KERNBASE. Ranges above KERNBASE are kldloaded and we | |
3358 | * do not want to force-fill 128G worth of page tables. | |
3359 | */ | |
3360 | if (kstart < KERNBASE) { | |
3361 | if (kstart > kernel_vm_end) | |
3362 | kstart = kernel_vm_end; | |
3363 | KKASSERT(kend <= KERNBASE); | |
3364 | update_kernel_vm_end = 1; | |
3365 | } else { | |
3366 | update_kernel_vm_end = 0; | |
3367 | } | |
3368 | ||
8ff9866b MD |
3369 | kstart = rounddown2(kstart, (vm_offset_t)(PAGE_SIZE * NPTEPG)); |
3370 | kend = roundup2(kend, (vm_offset_t)(PAGE_SIZE * NPTEPG)); | |
a8cf2878 | 3371 | |
1eeaf6b2 AL |
3372 | if (kend - 1 >= vm_map_max(kernel_map)) |
3373 | kend = vm_map_max(kernel_map); | |
a8cf2878 MD |
3374 | |
3375 | while (kstart < kend) { | |
c713db65 | 3376 | pt = pmap_pt(kernel_pmap, kstart); |
701c977e | 3377 | if (pt == NULL) { |
8ff9866b MD |
3378 | /* |
3379 | * We need a new PD entry | |
3380 | */ | |
76f1911e | 3381 | nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, |
a8cf2878 MD |
3382 | VM_ALLOC_NORMAL | |
3383 | VM_ALLOC_SYSTEM | | |
3384 | VM_ALLOC_INTERRUPT); | |
3385 | if (nkpg == NULL) { | |
3386 | panic("pmap_growkernel: no memory to grow " | |
3387 | "kernel"); | |
3388 | } | |
48ffc236 | 3389 | paddr = VM_PAGE_TO_PHYS(nkpg); |
afd2da4d | 3390 | pmap_zero_page(paddr); |
c713db65 | 3391 | pd = pmap_pd(kernel_pmap, kstart); |
8ff9866b | 3392 | |
701c977e | 3393 | newpd = (pdp_entry_t) |
a86ce0cd | 3394 | (paddr | |
c713db65 AL |
3395 | kernel_pmap->pmap_bits[PG_V_IDX] | |
3396 | kernel_pmap->pmap_bits[PG_RW_IDX] | | |
3397 | kernel_pmap->pmap_bits[PG_A_IDX]); | |
8ff9866b MD |
3398 | atomic_swap_long(pd, newpd); |
3399 | ||
3400 | #if 0 | |
3401 | kprintf("NEWPD pd=%p pde=%016jx phys=%016jx\n", | |
3402 | pd, newpd, paddr); | |
3403 | #endif | |
3404 | ||
48ffc236 JG |
3405 | continue; /* try again */ |
3406 | } | |
8ff9866b | 3407 | |
c713db65 | 3408 | if ((*pt & kernel_pmap->pmap_bits[PG_V_IDX]) != 0) { |
a8cf2878 | 3409 | kstart = (kstart + PAGE_SIZE * NPTEPG) & |
8ff9866b | 3410 | ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); |
1eeaf6b2 AL |
3411 | if (kstart - 1 >= vm_map_max(kernel_map)) { |
3412 | kstart = vm_map_max(kernel_map); | |
48ffc236 JG |
3413 | break; |
3414 | } | |
c8fe38ae MD |
3415 | continue; |
3416 | } | |
3417 | ||
3418 | /* | |
8e9ad8f9 MD |
3419 | * We need a new PT |
3420 | * | |
c8fe38ae MD |
3421 | * This index is bogus, but out of the way |
3422 | */ | |
76f1911e | 3423 | nkpg = vm_page_alloc(NULL, mycpu->gd_rand_incr++, |
a8cf2878 MD |
3424 | VM_ALLOC_NORMAL | |
3425 | VM_ALLOC_SYSTEM | | |
3426 | VM_ALLOC_INTERRUPT); | |
c8fe38ae MD |
3427 | if (nkpg == NULL) |
3428 | panic("pmap_growkernel: no memory to grow kernel"); | |
3429 | ||
3430 | vm_page_wire(nkpg); | |
3431 | ptppaddr = VM_PAGE_TO_PHYS(nkpg); | |
3432 | pmap_zero_page(ptppaddr); | |
76f1911e | 3433 | newpt = (pd_entry_t)(ptppaddr | |
c713db65 AL |
3434 | kernel_pmap->pmap_bits[PG_V_IDX] | |
3435 | kernel_pmap->pmap_bits[PG_RW_IDX] | | |
3436 | kernel_pmap->pmap_bits[PG_A_IDX]); | |
8ff9866b | 3437 | atomic_swap_long(pt, newpt); |
c8fe38ae | 3438 | |
a8cf2878 | 3439 | kstart = (kstart + PAGE_SIZE * NPTEPG) & |
8ff9866b | 3440 | ~(vm_offset_t)(PAGE_SIZE * NPTEPG - 1); |
a8cf2878 | 3441 | |
1eeaf6b2 AL |
3442 | if (kstart - 1 >= vm_map_max(kernel_map)) { |
3443 | kstart = vm_map_max(kernel_map); | |
48ffc236 | 3444 | break; |
c8fe38ae | 3445 | } |
c8fe38ae | 3446 | } |
a8cf2878 MD |
3447 | |
3448 | /* | |
3449 | * Only update kernel_vm_end for areas below KERNBASE. | |
3450 | */ | |
3451 | if (update_kernel_vm_end && kernel_vm_end < kstart) | |
3452 | kernel_vm_end = kstart; | |
d7f50089 YY |
3453 | } |
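/*
 * The growth loop above always advances in whole PT-page steps
 * (PAGE_SIZE * NPTEPG = 4K * 512 = 2MB), so each step is covered by one
 * freshly zeroed page table page.  A quick sketch of the
 * rounddown2()/roundup2() arithmetic applied to kstart/kend, with the
 * constants assumed rather than taken from the kernel headers:
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096ULL
#define NPTEPG		512ULL
#define STEP		(PAGE_SIZE * NPTEPG)	/* 2MB per PT page */

/* n must be a power of 2 */
#define rounddown2(x, n)	((x) & ~((n) - 1))
#define roundup2(x, n)		(((x) + (n) - 1) & ~((n) - 1))

int
main(void)
{
	uint64_t kstart = 0xffffffff81234567ULL;
	uint64_t kend   = 0xffffffff8200beefULL;

	/* 0xffffffff81200000: down to the enclosing 2MB boundary */
	printf("kstart -> %#llx\n",
	       (unsigned long long)rounddown2(kstart, STEP));
	/* 0xffffffff82200000: up to the next 2MB boundary */
	printf("kend   -> %#llx\n",
	       (unsigned long long)roundup2(kend, STEP));
	return (0);
}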
3454 | ||
3455 | /* | |
921c891e | 3456 | * Add a reference to the specified pmap. |
d7f50089 | 3457 | */ |
c8fe38ae | 3458 | void |
921c891e | 3459 | pmap_reference(pmap_t pmap) |
d7f50089 | 3460 | { |
76f1911e MD |
3461 | if (pmap != NULL) |
3462 | atomic_add_int(&pmap->pm_count, 1); | |
d7f50089 YY |
3463 | } |
3464 | ||
e3c330f0 MD |
3465 | void |
3466 | pmap_maybethreaded(pmap_t pmap) | |
3467 | { | |
3468 | atomic_set_int(&pmap->pm_flags, PMAP_MULTI); | |
3469 | } | |
3470 | ||
3471 | /* | |
3472 | * Called while page is hard-busied to clear the PG_MAPPED and PG_WRITEABLE | |
c2830aa6 MD |
3473 | * flags if able. This can happen when the pmap code is unable to clear |
3474 | * the bits in prior actions due to not holding the page hard-busied at | |
3475 | * the time. | |
3476 | * | |
b9a6fe08 SW |
3477 | * The clearing of PG_MAPPED/WRITEABLE is an optional optimization done |
3478 | * when the pte is removed and only if the pte has not been multiply-mapped. | |
3479 | * The caller may have to call vm_page_protect() if the bits are still set | |
3480 | * here. | |
c2830aa6 MD |
3481 | * |
3482 | * This function is expected to be quick. | |
e3c330f0 MD |
3483 | */ |
3484 | int | |
3485 | pmap_mapped_sync(vm_page_t m) | |
3486 | { | |
e3c330f0 MD |
3487 | return (m->flags); |
3488 | } | |
3489 | ||
c8fe38ae | 3490 | /*************************************************** |
701c977e | 3491 | * page management routines. |
c8fe38ae | 3492 | ***************************************************/ |
d7f50089 YY |
3493 | |
3494 | /* | |
701c977e | 3495 | * Hold a pv without locking it |
d7f50089 | 3496 | */ |
567a6398 | 3497 | #if 0 |
701c977e MD |
3498 | static void |
3499 | pv_hold(pv_entry_t pv) | |
d7f50089 | 3500 | { |
42909ca4 | 3501 | atomic_add_int(&pv->pv_hold, 1); |
d7f50089 | 3502 | } |
567a6398 | 3503 | #endif |
d7f50089 YY |
3504 | |
3505 | /* | |
701c977e MD |
3506 | * Hold a pv_entry, preventing its destruction. TRUE is returned if the pv |
3507 | * was successfully locked, FALSE if it wasn't. The caller must dispose of | |
3508 | * the pv properly. | |
3509 | * | |
3510 | * Either the pmap->pm_spin or the related vm_page_spin (if traversing a | |
e989b548 MD |
3511 | * pv list via its page) must be held by the caller in order to stabilize |
3512 | * the pv. | |
d7f50089 | 3513 | */ |
701c977e MD |
3514 | static int |
3515 | _pv_hold_try(pv_entry_t pv PMAP_DEBUG_DECL) | |
d7f50089 | 3516 | { |
701c977e MD |
3517 | u_int count; |
3518 | ||
8e2efb11 MD |
3519 | /* |
3520 | * Critical path shortcut expects pv to already have one ref | |
3521 | * (for the pv->pv_pmap). | |
3522 | */ | |
67e78c75 MD |
3523 | count = pv->pv_hold; |
3524 | cpu_ccfence(); | |
701c977e | 3525 | for (;;) { |
701c977e | 3526 | if ((count & PV_HOLD_LOCKED) == 0) { |
67e78c75 | 3527 | if (atomic_fcmpset_int(&pv->pv_hold, &count, |
701c977e MD |
3528 | (count + 1) | PV_HOLD_LOCKED)) { |
3529 | #ifdef PMAP_DEBUG | |
3530 | pv->pv_func = func; | |
3531 | pv->pv_line = lineno; | |
3532 | #endif | |
3533 | return TRUE; | |
3534 | } | |
3535 | } else { | |
67e78c75 | 3536 | if (atomic_fcmpset_int(&pv->pv_hold, &count, count + 1)) |
701c977e MD |
3537 | return FALSE; |
3538 | } | |
3539 | /* retry */ | |
c8fe38ae | 3540 | } |
d7f50089 YY |
3541 | } |
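/*
 * A self-contained userland model of the hold/lock word manipulated
 * above: a single 32-bit value packing a reference (hold) count with a
 * lock bit, updated only by compare-and-swap.  C11 <stdatomic.h> stands
 * in for the kernel's atomic_fcmpset_int(); the names are invented for
 * the sketch.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define HOLD_LOCKED	0x80000000u
#define HOLD_MASK	0x0fffffffu	/* hold (reference) count */

/* Add one hold, and acquire the lock iff it is not already held. */
static bool
hold_try(atomic_uint *word)
{
	unsigned int count = atomic_load(word);

	for (;;) {
		if (count & HOLD_LOCKED) {
			/* already locked: take a hold only, report failure */
			if (atomic_compare_exchange_weak(word, &count,
							 count + 1))
				return (false);
		} else {
			if (atomic_compare_exchange_weak(word, &count,
						(count + 1) | HOLD_LOCKED))
				return (true);
		}
		/* CAS failed; count was reloaded, retry */
	}
}

int
main(void)
{
	atomic_uint w = 1;	/* one base ref, unlocked */
	bool r;

	r = hold_try(&w);	/* locks: true, w becomes 0x80000002 */
	printf("first  try: %d w=%#x\n", r, (unsigned)atomic_load(&w));
	r = hold_try(&w);	/* held only: false, w becomes 0x80000003 */
	printf("second try: %d w=%#x\n", r, (unsigned)atomic_load(&w));
	return (0);
}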
3542 | ||
3543 | /* | |
701c977e MD |
3544 | * Drop a previously held pv_entry which could not be locked, allowing its |
3545 | * destruction. | |
3546 | * | |
3547 | * Must not be called with a spinlock held as we might zfree() the pv if it | |
3548 | * is no longer associated with a pmap and this was the last hold count. | |
d7f50089 | 3549 | */ |
701c977e MD |
3550 | static void |
3551 | pv_drop(pv_entry_t pv) | |
d7f50089 | 3552 | { |
701c977e | 3553 | u_int count; |
c8fe38ae | 3554 | |
701c977e MD |
3555 | for (;;) { |
3556 | count = pv->pv_hold; | |
3557 | cpu_ccfence(); | |
3558 | KKASSERT((count & PV_HOLD_MASK) > 0); | |
3559 | KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) != | |
3560 | (PV_HOLD_LOCKED | 1)); | |
3561 | if (atomic_cmpset_int(&pv->pv_hold, count, count - 1)) { | |
8e2efb11 | 3562 | if ((count & PV_HOLD_MASK) == 1) { |
a44410dd MD |
3563 | #ifdef PMAP_DEBUG2 |
3564 | if (pmap_enter_debug > 0) { | |
3565 | --pmap_enter_debug; | |
3566 | kprintf("pv_drop: free pv %p\n", pv); | |
3567 | } | |
3568 | #endif | |
8e2efb11 MD |
3569 | KKASSERT(count == 1); |
3570 | KKASSERT(pv->pv_pmap == NULL); | |
701c977e | 3571 | zfree(pvzone, pv); |
8e2efb11 | 3572 | } |
701c977e | 3573 | return; |
b12defdc | 3574 | } |
701c977e | 3575 | /* retry */ |
c8fe38ae | 3576 | } |
d7f50089 | 3577 | } |
c8fe38ae | 3578 | |
d7f50089 | 3579 | /* |
8e2efb11 MD |
3580 | * Find or allocate the requested PV entry, returning a locked, held pv. |
3581 | * | |
3582 | * If (*isnew) is non-zero, the returned pv will have two hold counts, one | |
3583 | * for the caller and one representing the pmap and vm_page association. | |
3584 | * | |
3585 | * If (*isnew) is zero, the returned pv will have only one hold count. | |
3586 | * | |
3587 | * Since both associations can only be adjusted while the pv is locked, | |
3588 | * together they represent just one additional hold. | |
d7f50089 | 3589 | */ |
bfc09ba0 | 3590 | static |
701c977e MD |
3591 | pv_entry_t |
3592 | _pv_alloc(pmap_t pmap, vm_pindex_t pindex, int *isnew PMAP_DEBUG_DECL) | |
c8fe38ae | 3593 | { |
08abdbfc | 3594 | struct mdglobaldata *md = mdcpu; |
c8fe38ae | 3595 | pv_entry_t pv; |
07fc7b93 | 3596 | pv_entry_t pnew; |
08abdbfc | 3597 | int pmap_excl = 0; |
07fc7b93 MD |
3598 | |
3599 | pnew = NULL; | |
3600 | if (md->gd_newpv) { | |
bb1339f8 | 3601 | #if 1 |
07fc7b93 MD |
3602 | pnew = atomic_swap_ptr((void *)&md->gd_newpv, NULL); |
3603 | #else | |
3604 | crit_enter(); | |
3605 | pnew = md->gd_newpv; /* might race NULL */ | |
3606 | md->gd_newpv = NULL; | |
3607 | crit_exit(); | |
3608 | #endif | |
3609 | } | |
3610 | if (pnew == NULL) | |
3611 | pnew = zalloc(pvzone); | |
c8fe38ae | 3612 | |
08abdbfc | 3613 | spin_lock_shared(&pmap->pm_spin); |
701c977e | 3614 | for (;;) { |
76f1911e MD |
3615 | /* |
3616 | * Shortcut cache | |
3617 | */ | |
bb1339f8 | 3618 | pv = pv_entry_lookup(pmap, pindex); |
701c977e | 3619 | if (pv == NULL) { |
76f1911e MD |
3620 | vm_pindex_t *pmark; |
3621 | ||
08abdbfc MD |
3622 | /* |
3623 | * Requires exclusive pmap spinlock | |
3624 | */ | |
3625 | if (pmap_excl == 0) { | |
3626 | pmap_excl = 1; | |
3627 | if (!spin_lock_upgrade_try(&pmap->pm_spin)) { | |
3628 | spin_unlock_shared(&pmap->pm_spin); | |
3629 | spin_lock(&pmap->pm_spin); | |
3630 | continue; | |
3631 | } | |
3632 | } | |
3633 | ||
76f1911e | 3634 | /* |
2519e05d MD |
3635 | * We need to block if someone is holding our |
3636 | * placemarker. As long as we determine the | |
3637 | * placemarker has not been acquired we do not
3638 | * need to get it, as acquisition also requires
3639 | * the pmap spin lock. | |
3640 | * | |
3641 | * However, we can race the wakeup. | |
76f1911e MD |
3642 | */ |
3643 | pmark = pmap_placemarker_hash(pmap, pindex); | |
3644 | ||
3645 | if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { | |
2519e05d | 3646 | tsleep_interlock(pmark, 0); |
92414ddf | 3647 | atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); |
2519e05d MD |
3648 | if (((*pmark ^ pindex) & |
3649 | ~PM_PLACEMARK_WAKEUP) == 0) { | |
3650 | spin_unlock(&pmap->pm_spin); | |
3651 | tsleep(pmark, PINTERLOCKED, "pvplc", 0); | |
3652 | spin_lock(&pmap->pm_spin); | |
3653 | } | |
76f1911e MD |
3654 | continue; |
3655 | } | |
3656 | ||
3657 | /* | |
3658 | * Setup the new entry | |
3659 | */ | |
701c977e MD |
3660 | pnew->pv_pmap = pmap; |
3661 | pnew->pv_pindex = pindex; | |
8e2efb11 | 3662 | pnew->pv_hold = PV_HOLD_LOCKED | 2; |
e05899ce | 3663 | pnew->pv_flags = 0; |
701c977e MD |
3664 | #ifdef PMAP_DEBUG |
3665 | pnew->pv_func = func; | |
3666 | pnew->pv_line = lineno; | |
e989b548 MD |
3667 | if (pnew->pv_line_lastfree > 0) { |
3668 | pnew->pv_line_lastfree = | |
3669 | -pnew->pv_line_lastfree; | |
3670 | } | |
701c977e | 3671 | #endif |
76f1911e | 3672 | pv = pv_entry_rb_tree_RB_INSERT(&pmap->pm_pvroot, pnew); |
701c977e MD |
3673 | atomic_add_long(&pmap->pm_stats.resident_count, 1); |
3674 | spin_unlock(&pmap->pm_spin); | |
3675 | *isnew = 1; | |
76f1911e | 3676 | |
67e78c75 | 3677 | KASSERT(pv == NULL, ("pv insert failed %p->%p", pnew, pv)); |
701c977e MD |
3678 | return(pnew); |
3679 | } | |
76f1911e MD |
3680 | |
3681 | /* | |
07fc7b93 MD |
3682 | * We already have an entry, cleanup the staged pnew if |
3683 | * we can get the lock, otherwise block and retry. | |
76f1911e | 3684 | */ |
07fc7b93 | 3685 | if (__predict_true(_pv_hold_try(pv PMAP_DEBUG_COPY))) { |
08abdbfc MD |
3686 | if (pmap_excl) |
3687 | spin_unlock(&pmap->pm_spin); | |
3688 | else | |
3689 | spin_unlock_shared(&pmap->pm_spin); | |
bb1339f8 | 3690 | #if 1 |
07fc7b93 MD |
3691 | pnew = atomic_swap_ptr((void *)&md->gd_newpv, pnew); |
3692 | if (pnew) | |
3693 | zfree(pvzone, pnew); | |
3694 | #else | |
3695 | crit_enter(); | |
3696 | if (md->gd_newpv == NULL) | |
3697 | md->gd_newpv = pnew; | |
3698 | else | |
3699 | zfree(pvzone, pnew); | |
3700 | crit_exit(); | |
3701 | #endif | |
e989b548 MD |
3702 | KKASSERT(pv->pv_pmap == pmap && |
3703 | pv->pv_pindex == pindex); | |
701c977e MD |
3704 | *isnew = 0; |
3705 | return(pv); | |
3706 | } | |
08abdbfc MD |
3707 | if (pmap_excl) { |
3708 | spin_unlock(&pmap->pm_spin); | |
3709 | _pv_lock(pv PMAP_DEBUG_COPY); | |
3710 | pv_put(pv); | |
3711 | spin_lock(&pmap->pm_spin); | |
3712 | } else { | |
3713 | spin_unlock_shared(&pmap->pm_spin); | |
3714 | _pv_lock(pv PMAP_DEBUG_COPY); | |
3715 | pv_put(pv); | |
3716 | spin_lock_shared(&pmap->pm_spin); | |
3717 | } | |
701c977e | 3718 | } |
07fc7b93 | 3719 | /* NOT REACHED */ |
701c977e | 3720 | } |
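/*
 * The placemarker logic above lets a thread reserve a (pmap, pindex)
 * position without allocating a pv_entry, using a small hash of pindex
 * slots plus tsleep_interlock()/wakeup().  The pthreads analogue below
 * models the same claim/wait/release protocol in userland; every name
 * here is invented for the sketch and hash collisions simply serialize.
 */
#include <pthread.h>
#include <stdint.h>

#define NPLACEMARKS	64
#define PM_NOPLACEMARK	((uint64_t)-1)

static uint64_t		pm_slot[NPLACEMARKS];
static pthread_mutex_t	pm_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t	pm_cond = PTHREAD_COND_INITIALIZER;

static uint64_t *
pm_hash(uint64_t pindex)
{
	return (&pm_slot[pindex % NPLACEMARKS]);
}

/* Claim (pindex), blocking while a conflicting claim occupies the slot. */
static void
pm_place(uint64_t pindex)
{
	uint64_t *mark = pm_hash(pindex);

	pthread_mutex_lock(&pm_lock);
	while (*mark != PM_NOPLACEMARK)		/* busy (collisions too) */
		pthread_cond_wait(&pm_cond, &pm_lock);
	*mark = pindex;
	pthread_mutex_unlock(&pm_lock);
}

/* Release (pindex) and wake any waiters hashed to the slot. */
static void
pm_release(uint64_t pindex)
{
	pthread_mutex_lock(&pm_lock);
	*pm_hash(pindex) = PM_NOPLACEMARK;
	pthread_cond_broadcast(&pm_cond);
	pthread_mutex_unlock(&pm_lock);
}

int
main(void)
{
	for (int i = 0; i < NPLACEMARKS; ++i)
		pm_slot[i] = PM_NOPLACEMARK;
	pm_place(12345);
	pm_release(12345);
	return (0);
}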
b12defdc | 3721 | |
701c977e MD |
3722 | /* |
3723 | * Find the requested PV entry, returning a locked+held pv or NULL | |
3724 | */ | |
3725 | static | |
3726 | pv_entry_t | |
76f1911e | 3727 | _pv_get(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp PMAP_DEBUG_DECL) |
701c977e MD |
3728 | { |
3729 | pv_entry_t pv; | |
08abdbfc | 3730 | int pmap_excl = 0; |
5926987a | 3731 | |
08abdbfc | 3732 | spin_lock_shared(&pmap->pm_spin); |
701c977e MD |
3733 | for (;;) { |
3734 | /* | |
3735 | * Shortcut cache | |
3736 | */ | |
bb1339f8 | 3737 | pv = pv_entry_lookup(pmap, pindex); |
86d59af3 | 3738 | if (pv == NULL) { |
76f1911e | 3739 | /* |
2519e05d MD |
3740 | * Block if there is ANY placemarker. If we are to |
3741 | * return it, we must also acquire the spot, so we
3742 | * have to block even if the placemarker is held on | |
3743 | * a different address. | |
3744 | * | |
3745 | * OPTIMIZATION: If pmarkp is passed as NULL the | |
3746 | * caller is just probing (or looking for a real | |
3747 | * pv_entry), and in this case we only need to check | |
3748 | * to see if the placemarker matches pindex. | |
76f1911e MD |
3749 | */ |
3750 | vm_pindex_t *pmark; | |
3751 | ||
08abdbfc MD |
3752 | /* |
3753 | * Requires exclusive pmap spinlock | |
3754 | */ | |
3755 | if (pmap_excl == 0) { | |
3756 | pmap_excl = 1; | |
3757 | if (!spin_lock_upgrade_try(&pmap->pm_spin)) { | |
3758 | spin_unlock_shared(&pmap->pm_spin); | |
3759 | spin_lock(&pmap->pm_spin); | |
3760 | continue; | |
3761 | } | |
3762 | } | |
3763 | ||
76f1911e MD |
3764 | pmark = pmap_placemarker_hash(pmap, pindex); |
3765 | ||
3766 | if ((pmarkp && *pmark != PM_NOPLACEMARK) || | |
3767 | ((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { | |
2519e05d | 3768 | tsleep_interlock(pmark, 0); |
92414ddf | 3769 | atomic_set_long(pmark, PM_PLACEMARK_WAKEUP); |
2519e05d MD |
3770 | if ((pmarkp && *pmark != PM_NOPLACEMARK) || |
3771 | ((*pmark ^ pindex) & | |
3772 | ~PM_PLACEMARK_WAKEUP) == 0) { | |
3773 | spin_unlock(&pmap->pm_spin); | |
3774 | tsleep(pmark, PINTERLOCKED, "pvpld", 0); | |
3775 | spin_lock(&pmap->pm_spin); | |
3776 | } | |
76f1911e MD |
3777 | continue; |
3778 | } | |
3779 | if (pmarkp) { | |
5ee06c6c MD |
3780 | if (atomic_swap_long(pmark, pindex) != |
3781 | PM_NOPLACEMARK) { | |
3782 | panic("_pv_get: pmark race"); | |
3783 | } | |
76f1911e MD |
3784 | *pmarkp = pmark; |
3785 | } | |
701c977e MD |
3786 | spin_unlock(&pmap->pm_spin); |
3787 | return NULL; | |
3788 | } | |
3789 | if (_pv_hold_try(pv PMAP_DEBUG_COPY)) { | |
08abdbfc MD |
3790 | if (pmap_excl) |
3791 | spin_unlock(&pmap->pm_spin); | |
3792 | else | |
3793 | spin_unlock_shared(&pmap->pm_spin); | |
e989b548 MD |
3794 | KKASSERT(pv->pv_pmap == pmap && |
3795 | pv->pv_pindex == pindex); | |
701c977e | 3796 | return(pv); |
5e78aef9 | 3797 | } |
08abdbfc MD |
3798 | if (pmap_excl) { |
3799 | spin_unlock(&pmap->pm_spin); | |
3800 | _pv_lock(pv PMAP_DEBUG_COPY); | |
3801 | pv_put(pv); | |
3802 | spin_lock(&pmap->pm_spin); | |
3803 | } else { | |
3804 | spin_unlock_shared(&pmap->pm_spin); | |
3805 | _pv_lock(pv PMAP_DEBUG_COPY); | |
3806 | pv_put(pv); | |
3807 | spin_lock_shared(&pmap->pm_spin); | |
3808 | } | |
701c977e | 3809 | } |
d7f50089 YY |
3810 | } |
3811 | ||
3812 | /* | |
701c977e MD |
3813 | * Lookup, hold, and attempt to lock (pmap,pindex). |
3814 | * | |
3815 | * If the entry does not exist NULL is returned and *errorp is set to 0 | |
a5fc46c9 | 3816 | * |
701c977e MD |
3817 | * If the entry exists and could be successfully locked it is returned and |
3818 | * errorp is set to 0. | |
3819 | * | |
3820 | * If the entry exists but could NOT be successfully locked it is returned | |
3821 | * held and *errorp is set to 1. | |
76f1911e MD |
3822 | * |
3823 | * If the entry is placemarked by someone else NULL is returned and *errorp | |
3824 | * is set to 1. | |
d7f50089 | 3825 | */ |
bfc09ba0 | 3826 | static |
701c977e | 3827 | pv_entry_t |
76f1911e | 3828 | pv_get_try(pmap_t pmap, vm_pindex_t pindex, vm_pindex_t **pmarkp, int *errorp) |
d7f50089 | 3829 | { |
c8fe38ae MD |
3830 | pv_entry_t pv; |
3831 | ||
a86ce0cd | 3832 | spin_lock_shared(&pmap->pm_spin); |
76f1911e | 3833 | |
bb1339f8 | 3834 | pv = pv_entry_lookup(pmap, pindex); |
701c977e | 3835 | if (pv == NULL) { |
76f1911e MD |
3836 | vm_pindex_t *pmark; |
3837 | ||
3838 | pmark = pmap_placemarker_hash(pmap, pindex); | |
3839 | ||
3840 | if (((*pmark ^ pindex) & ~PM_PLACEMARK_WAKEUP) == 0) { | |
3841 | *errorp = 1; | |
3842 | } else if (pmarkp && | |
3843 | atomic_cmpset_long(pmark, PM_NOPLACEMARK, pindex)) { | |
3844 | *errorp = 0; | |
3845 | } else { | |
3846 | /* | |
5ee06c6c MD |
3847 | * Can't set a placemark with a NULL pmarkp, or if |
3848 | * pmarkp is non-NULL but we failed to set our | |
3849 | * placemark. | |
76f1911e MD |
3850 | */ |
3851 | *errorp = 1; | |
3852 | } | |
3853 | if (pmarkp) | |
3854 | *pmarkp = pmark; | |
a86ce0cd | 3855 | spin_unlock_shared(&pmap->pm_spin); |
76f1911e | 3856 | |
701c977e MD |
3857 | return NULL; |
3858 | } | |
e989b548 MD |
3859 | |
3860 | /* | |
3861 | * XXX This has problems if the lock is shared, why? | |
3862 | */ | |
701c977e | 3863 | if (pv_hold_try(pv)) { |
a86ce0cd | 3864 | spin_unlock_shared(&pmap->pm_spin); |
701c977e | 3865 | *errorp = 0; |
5e78aef9 | 3866 | KKASSERT(pv->pv_pmap == pmap && pv->pv_pindex == pindex); |
701c977e MD |
3867 | return(pv); /* lock succeeded */ |
3868 | } | |
a86ce0cd | 3869 | spin_unlock_shared(&pmap->pm_spin); |
701c977e | 3870 | *errorp = 1; |
e989b548 | 3871 | |
701c977e | 3872 | return (pv); /* lock failed */ |
d7f50089 YY |
3873 | } |
3874 | ||
701c977e MD |
3875 | /* |
3876 | * Lock a held pv, keeping the hold count | |
3877 | */ | |
3878 | static | |
3879 | void | |
3880 | _pv_lock(pv_entry_t pv PMAP_DEBUG_DECL) | |
3881 | { | |
3882 | u_int count; | |
3883 | ||
3884 | for (;;) { | |
3885 | count = pv->pv_hold; | |
3886 | cpu_ccfence(); | |
3887 | if ((count & PV_HOLD_LOCKED) == 0) { | |
3888 | if (atomic_cmpset_int(&pv->pv_hold, count, | |
3889 | count | PV_HOLD_LOCKED)) { | |
3890 | #ifdef PMAP_DEBUG | |
3891 | pv->pv_func = func; | |
3892 | pv->pv_line = lineno; | |
3893 | #endif | |
3894 | return; | |
c8fe38ae | 3895 | } |
701c977e MD |
3896 | continue; |
3897 | } | |
3898 | tsleep_interlock(pv, 0); | |
3899 | if (atomic_cmpset_int(&pv->pv_hold, count, | |
3900 | count | PV_HOLD_WAITING)) { | |
e989b548 MD |
3901 | #ifdef PMAP_DEBUG2 |
3902 | if (pmap_enter_debug > 0) { | |
3903 | --pmap_enter_debug; | |
3904 | kprintf("pv waiting on %s:%d\n", | |
701c977e | 3905 | pv->pv_func, pv->pv_line); |
e989b548 | 3906 | } |
c8fe38ae | 3907 | #endif |
701c977e | 3908 | tsleep(pv, PINTERLOCKED, "pvwait", hz); |
c8fe38ae | 3909 | } |
701c977e | 3910 | /* retry */ |
b12defdc | 3911 | } |
701c977e | 3912 | } |
c8fe38ae | 3913 | |
701c977e MD |
3914 | /* |
3915 | * Unlock a held and locked pv, keeping the hold count. | |
3916 | */ | |
3917 | static | |
3918 | void | |
3919 | pv_unlock(pv_entry_t pv) | |
3920 | { | |
3921 | u_int count; | |
3922 | ||
701c977e MD |
3923 | for (;;) { |
3924 | count = pv->pv_hold; | |
3925 | cpu_ccfence(); | |
8e2efb11 | 3926 | KKASSERT((count & (PV_HOLD_LOCKED | PV_HOLD_MASK)) >= |
701c977e MD |
3927 | (PV_HOLD_LOCKED | 1)); |
3928 | if (atomic_cmpset_int(&pv->pv_hold, count, | |
3929 | count & | |
3930 | ~(PV_HOLD_LOCKED | PV_HOLD_WAITING))) { | |
3931 | if (count & PV_HOLD_WAITING) | |
3932 | wakeup(pv); | |
3933 | break; | |
3934 | } | |
7ab91d55 | 3935 | } |
d7f50089 YY |
3936 | } |
3937 | ||
3938 | /* | |
701c977e MD |
3939 | * Unlock and drop a pv. If the pv is no longer associated with a pmap |
3940 | * and the hold count drops to zero we will free it. | |
d7f50089 | 3941 | * |
701c977e MD |
3942 | * Caller should not hold any spin locks. We are protected from hold races |
3943 | * by virtue of holds occurring only with a pmap_spin or vm_page_spin
3944 | * lock held. A pv cannot be located otherwise. | |
d7f50089 | 3945 | */ |
bfc09ba0 MD |
3946 | static |
3947 | void | |
701c977e | 3948 | pv_put(pv_entry_t pv) |
c8fe38ae | 3949 | { |
a44410dd MD |
3950 | #ifdef PMAP_DEBUG2 |
3951 | if (pmap_enter_debug > 0) { | |
3952 | --pmap_enter_debug; | |
3953 | kprintf("pv_put pv=%p hold=%08x\n", pv, pv->pv_hold); | |
3954 | } | |
3955 | #endif | |
3956 | ||
e989b548 | 3957 | /* |
f76ae5a7 | 3958 | * Normal put-aways must have a pv_m associated with the pv, |
f545b541 MD |
3959 | * but allow the case where the pv has been destructed due |
3960 | * to pmap_dynamic_delete. | |
e989b548 | 3961 | */ |
f76ae5a7 | 3962 | KKASSERT(pv->pv_pmap == NULL || pv->pv_m != NULL); |
e989b548 | 3963 | |
8e2efb11 MD |
3964 | /* |
3965 | * Fast - shortcut most common condition | |
3966 | */ | |
3967 | if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 1)) | |
701c977e | 3968 | return; |
8e2efb11 MD |
3969 | |
3970 | /* | |
3971 | * Slow | |
3972 | */ | |
701c977e MD |
3973 | pv_unlock(pv); |
3974 | pv_drop(pv); | |
3975 | } | |
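/*
 * The fast path above folds pv_unlock() + pv_drop() into a single
 * compare-and-swap: it succeeds only when the word is exactly "locked,
 * two holds, no waiters" and leaves "unlocked, one hold" (the reference
 * representing the pmap association).  Continuing the userland sketch
 * from _pv_hold_try() above, under the same invented names:
 */
#include <stdatomic.h>
#include <stdbool.h>

#define HOLD_LOCKED	0x80000000u

/* Returns true when the uncontended fast path applied. */
static bool
put_fast(atomic_uint *word)
{
	unsigned int expect = HOLD_LOCKED | 2;

	return (atomic_compare_exchange_strong(word, &expect, 1));
}

int
main(void)
{
	atomic_uint w = HOLD_LOCKED | 2;

	return (put_fast(&w) ? 0 : 1);	/* exits 0: fast path applied */
}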
c8fe38ae | 3976 | |
701c977e | 3977 | /* |
8e2efb11 MD |
3978 | * Remove the pmap association from a pv, require that pv_m already be removed, |
3979 | * then unlock and drop the pv. Any pte operations must have already been | |
3980 | * completed. This call may result in a last-drop which will physically free | |
3981 | * the pv. | |
3982 | * | |
3983 | * Removing the pmap association entails an additional drop. | |
3984 | * | |
3985 | * pv must be exclusively locked on call and will be disposed of on return. | |
701c977e MD |
3986 | */ |
3987 | static | |
3988 | void | |
e989b548 | 3989 | _pv_free(pv_entry_t pv, pv_entry_t pvp PMAP_DEBUG_DECL) |
701c977e MD |
3990 | { |
3991 | pmap_t pmap; | |
b12defdc | 3992 | |
e989b548 MD |
3993 | #ifdef PMAP_DEBUG |
3994 | pv->pv_func_lastfree = func; | |
3995 | pv->pv_line_lastfree = lineno; | |
3996 | #endif | |
701c977e | 3997 | KKASSERT(pv->pv_m == NULL); |
e989b548 MD |
3998 | KKASSERT((pv->pv_hold & (PV_HOLD_LOCKED|PV_HOLD_MASK)) >= |
3999 | (PV_HOLD_LOCKED|1)); | |
701c977e MD |
4000 | if ((pmap = pv->pv_pmap) != NULL) { |
4001 | spin_lock(&pmap->pm_spin); | |
76f1911e | 4002 | KKASSERT(pv->pv_pmap == pmap); |
bb1339f8 MD |
4003 | if (pmap->pm_pvhint_pt == pv) |
4004 | pmap->pm_pvhint_pt = NULL; | |
567a6398 MD |
4005 | if (pmap->pm_pvhint_unused == pv) |
4006 | pmap->pm_pvhint_unused = NULL; | |
76f1911e | 4007 | pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv); |
701c977e MD |
4008 | atomic_add_long(&pmap->pm_stats.resident_count, -1); |
4009 | pv->pv_pmap = NULL; | |
4010 | pv->pv_pindex = 0; | |
4011 | spin_unlock(&pmap->pm_spin); | |
8e2efb11 MD |
4012 | |
4013 | /* | |
4014 | * Try to shortcut three atomic ops, otherwise fall through | |
4015 | * and do it normally. Drop two refs and the lock all in | |
4016 | * one go. | |
4017 | */ | |
e3c330f0 MD |
4018 | if (pvp) { |
4019 | if (vm_page_unwire_quick(pvp->pv_m)) | |
4020 | panic("_pv_free: bad wirecount on pvp"); | |
4021 | } | |
76f1911e | 4022 | if (atomic_cmpset_int(&pv->pv_hold, PV_HOLD_LOCKED | 2, 0)) { |
a44410dd MD |
4023 | #ifdef PMAP_DEBUG2 |
4024 | if (pmap_enter_debug > 0) { | |
4025 | --pmap_enter_debug; | |
4026 | kprintf("pv_free: free pv %p\n", pv); | |
4027 | } | |
4028 | #endif | |
8e2efb11 MD |
4029 | zfree(pvzone, pv); |
4030 | return; | |
4031 | } | |
4032 | pv_drop(pv); /* ref for pv_pmap */ | |
701c977e | 4033 | } |
e989b548 MD |
4034 | pv_unlock(pv); |
4035 | pv_drop(pv); | |
701c977e MD |
4036 | } |
4037 | ||
4038 | /* | |
4039 | * This routine is very drastic, but can save the system | |
4040 | * in a pinch. | |
4041 | */ | |
4042 | void | |
4043 | pmap_collect(void) | |
4044 | { | |
4045 | int i; | |
4046 | vm_page_t m; | |
4047 | static int warningdone=0; | |
4048 | ||
4049 | if (pmap_pagedaemon_waken == 0) | |
48ffc236 | 4050 | return; |
701c977e MD |
4051 | pmap_pagedaemon_waken = 0; |
4052 | if (warningdone < 5) { | |
df49ec1e MD |
4053 | kprintf("pmap_collect: pv_entries exhausted -- " |
4054 | "suggest increasing vm.pmap_pv_entries above %ld\n", | |
4055 | vm_pmap_pv_entries); | |
701c977e MD |
4056 | warningdone++; |
4057 | } | |
4058 | ||
4059 | for (i = 0; i < vm_page_array_size; i++) { | |
4060 | m = &vm_page_array[i]; | |
4061 | if (m->wire_count || m->hold_count) | |
4062 | continue; | |
4063 | if (vm_page_busy_try(m, TRUE) == 0) { | |
4064 | if (m->wire_count == 0 && m->hold_count == 0) { | |
4065 | pmap_remove_all(m); | |
4066 | } | |
4067 | vm_page_wakeup(m); | |
4068 | } | |
4069 | } | |
d7f50089 YY |
4070 | } |
4071 | ||
4072 | /* | |
701c977e | 4073 | * Scan the pmap for active page table entries and issue a callback. |
921c891e MD |
4074 | * The callback must dispose of pte_pv, whose PTE entry is at *ptep in
4075 | * its parent page table. | |
d7f50089 | 4076 | * |
fb4ca018 | 4077 | * pte_pv will be NULL if the page or page table is unmanaged. |
921c891e | 4078 | * pt_pv will point to the page table page containing the pte for the page. |
701c977e | 4079 | * |
921c891e MD |
4080 | * NOTE! If we come across an unmanaged page TABLE (versus an unmanaged page),
4081 | * we pass a NULL pte_pv and we pass a pt_pv pointing to the passed | |
4082 | * process pmap's PD and page to the callback function. This can be | |
4083 | * confusing because the pt_pv is really a pd_pv, and the target page | |
4084 | * table page is simply aliased by the pmap and not owned by it. | |
d7f50089 | 4085 | * |
701c977e | 4086 | * It is assumed that the start and end are properly rounded to the page size. |
fb4ca018 MD |
4087 | * |
4088 | * It is assumed that PD pages and above are managed and thus in the RB tree, | |
4089 | * allowing us to use RB_SCAN from the PD pages down for ranged scans. | |
4090 | */ | |
4091 | struct pmap_scan_info { | |
4092 | struct pmap *pmap; | |
4093 | vm_offset_t sva; | |
4094 | vm_offset_t eva; | |
4095 | vm_pindex_t sva_pd_pindex; | |
4096 | vm_pindex_t eva_pd_pindex; | |
9df83100 | 4097 | void (*func)(pmap_t, struct pmap_scan_info *, |
567a6398 | 4098 | vm_pindex_t *, pv_entry_t, vm_offset_t, |
fb4ca018 MD |
4099 | pt_entry_t *, void *); |
4100 | void *arg; | |
ccd67bf6 MD |
4101 | pmap_inval_bulk_t bulk_core; |
4102 | pmap_inval_bulk_t *bulk; | |
e674353b | 4103 | int count; |
a7a03a5f | 4104 | int stop; |
fb4ca018 MD |
4105 | }; |
4106 | ||
4107 | static int pmap_scan_cmp(pv_entry_t pv, void *data); | |
4108 | static int pmap_scan_callback(pv_entry_t pv, void *data); | |
4109 | ||
701c977e | 4110 | static void |
ccd67bf6 | 4111 | pmap_scan(struct pmap_scan_info *info, int smp_inval) |
701c977e | 4112 | { |
fb4ca018 | 4113 | struct pmap *pmap = info->pmap; |
701c977e MD |
4114 | pv_entry_t pt_pv; /* A page table PV */ |
4115 | pv_entry_t pte_pv; /* A page table entry PV */ | |
76f1911e MD |
4116 | vm_pindex_t *pte_placemark; |
4117 | vm_pindex_t *pt_placemark; | |
701c977e | 4118 | pt_entry_t *ptep; |
8e2efb11 | 4119 | pt_entry_t oldpte; |
fb4ca018 | 4120 | struct pv_entry dummy_pv; |
c8fe38ae | 4121 | |
a7a03a5f | 4122 | info->stop = 0; |
c8fe38ae MD |
4123 | if (pmap == NULL) |
4124 | return; | |
95270b7e MD |
4125 | if (info->sva == info->eva) |
4126 | return; | |
ccd67bf6 MD |
4127 | if (smp_inval) { |
4128 | info->bulk = &info->bulk_core; | |
4129 | pmap_inval_bulk_init(&info->bulk_core, pmap); | |
4130 | } else { | |
4131 | info->bulk = NULL; | |
4132 | } | |
c8fe38ae | 4133 | |
701c977e MD |
4134 | /* |
4135 | * Hold the token for stability; if the pmap is empty we have nothing | |
4136 | * to do. | |
4137 | */ | |
701c977e | 4138 | #if 0 |
10d6182e | 4139 | if (pmap->pm_stats.resident_count == 0) { |
c8fe38ae | 4140 | return; |
10d6182e | 4141 | } |
701c977e | 4142 | #endif |
c8fe38ae | 4143 | |
e674353b | 4144 | info->count = 0; |
c8fe38ae MD |
4145 | |
4146 | /* | |
fb4ca018 | 4147 | * Special handling for scanning one page, which is a very common |
701c977e | 4148 | * operation (it is?). |
fb4ca018 | 4149 | * |
701c977e | 4150 | * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4 |
c8fe38ae | 4151 | */ |
fb4ca018 MD |
4152 | if (info->sva + PAGE_SIZE == info->eva) { |
4153 | if (info->sva >= VM_MAX_USER_ADDRESS) { | |
701c977e MD |
4154 | /* |
4155 | * Kernel mappings do not track wire counts on | |
fb4ca018 MD |
4156 | * page table pages and only maintain pd_pv and |
4157 | * pte_pv levels so pmap_scan() works. | |
701c977e MD |
4158 | */ |
4159 | pt_pv = NULL; | |
76f1911e MD |
4160 | pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), |
4161 | &pte_placemark); | |
567a6398 | 4162 | KKASSERT(pte_pv == NULL); |
fb4ca018 | 4163 | ptep = vtopte(info->sva); |
701c977e MD |
4164 | } else { |
4165 | /* | |
76f1911e MD |
4166 | * We hold pte_placemark across the operation for |
4167 | * unmanaged pages. | |
4168 | * | |
4169 | * WARNING! We must hold pt_placemark across the | |
4170 | * *ptep test to prevent misinterpreting
4171 | * a non-zero *ptep as a shared page | |
4172 | * table page. Hold it across the function | |
4173 | * callback as well for SMP safety. | |
701c977e | 4174 | */ |
76f1911e MD |
4175 | pte_pv = pv_get(pmap, pmap_pte_pindex(info->sva), |
4176 | &pte_placemark); | |
567a6398 | 4177 | KKASSERT(pte_pv == NULL); |
76f1911e | 4178 | pt_pv = pv_get(pmap, pmap_pt_pindex(info->sva), |
567a6398 | 4179 | &pt_placemark); |
701c977e | 4180 | if (pt_pv == NULL) { |
567a6398 MD |
4181 | #if 0 |
4182 | KKASSERT(0); | |
76f1911e MD |
4183 | pd_pv = pv_get(pmap, |
4184 | pmap_pd_pindex(info->sva), | |
4185 | NULL); | |
921c891e MD |
4186 | if (pd_pv) { |
4187 | ptep = pv_pte_lookup(pd_pv, | |
fb4ca018 | 4188 | pmap_pt_index(info->sva)); |
921c891e | 4189 | if (*ptep) { |
9df83100 | 4190 | info->func(pmap, info, |
567a6398 | 4191 | pt_placemark, pd_pv, |
fb4ca018 MD |
4192 | info->sva, ptep, |
4193 | info->arg); | |
76f1911e MD |
4194 | } else { |
4195 | pv_placemarker_wakeup(pmap, | |
4196 | pt_placemark); | |
921c891e MD |
4197 | } |
4198 | pv_put(pd_pv); | |
76f1911e MD |
4199 | } else { |
4200 | pv_placemarker_wakeup(pmap, | |
4201 | pt_placemark); | |
921c891e | 4202 | } |
567a6398 MD |
4203 | #else |
4204 | pv_placemarker_wakeup(pmap, pt_placemark); | |
4205 | #endif | |
76f1911e | 4206 | pv_placemarker_wakeup(pmap, pte_placemark); |
701c977e MD |
4207 | goto fast_skip; |
4208 | } | |
fb4ca018 | 4209 | ptep = pv_pte_lookup(pt_pv, pmap_pte_index(info->sva)); |
701c977e | 4210 | } |
8e2efb11 MD |
4211 | |
4212 | /* | |
4213 | * NOTE: *ptep can't be ripped out from under us if we hold | |
76f1911e MD |
4214 | * pte_pv (or pte_placemark) locked, but bits can |
4215 | * change. | |
8e2efb11 MD |
4216 | */ |
4217 | oldpte = *ptep; | |
4218 | cpu_ccfence(); | |
4219 | if (oldpte == 0) { | |
701c977e | 4220 | KKASSERT(pte_pv == NULL); |
76f1911e | 4221 | pv_placemarker_wakeup(pmap, pte_placemark); |
701c977e | 4222 | } else { |
567a6398 MD |
4223 | KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]) == |
4224 | pmap->pmap_bits[PG_V_IDX], | |
76f1911e MD |
4225 | ("badB *ptep %016lx/%016lx sva %016lx pte_pv NULL", |
4226 | *ptep, oldpte, info->sva)); | |
567a6398 | 4227 | info->func(pmap, info, pte_placemark, pt_pv, |
76f1911e | 4228 | info->sva, ptep, info->arg); |
48ffc236 | 4229 | } |
701c977e MD |
4230 | if (pt_pv) |
4231 | pv_put(pt_pv); | |
4232 | fast_skip: | |
ccd67bf6 | 4233 | pmap_inval_bulk_flush(info->bulk); |
701c977e | 4234 | return; |
c8fe38ae MD |
4235 | } |
4236 | ||
fb4ca018 MD |
4237 | /* |
4238 | * Nominal scan case, RB_SCAN() for PD pages and iterate from | |
4239 | * there. | |
95270b7e MD |
4240 | * |
4241 | * WARNING! eva can overflow our standard ((N + mask) >> bits) | |
4242 | * bounds, resulting in a pd_pindex of 0. To solve the | |
4243 | * problem we use an inclusive range. | |
fb4ca018 MD |
4244 | */ |
4245 | info->sva_pd_pindex = pmap_pd_pindex(info->sva); | |
95270b7e | 4246 | info->eva_pd_pindex = pmap_pd_pindex(info->eva - PAGE_SIZE); |
fb4ca018 MD |
4247 | |
4248 | if (info->sva >= VM_MAX_USER_ADDRESS) { | |
4249 | /* | |
4250 | * The kernel does not currently maintain any pv_entry's for | |
4251 | * higher-level page tables. | |
4252 | */ | |
4253 | bzero(&dummy_pv, sizeof(dummy_pv)); | |
4254 | dummy_pv.pv_pindex = info->sva_pd_pindex; | |
4255 | spin_lock(&pmap->pm_spin); | |
95270b7e | 4256 | while (dummy_pv.pv_pindex <= info->eva_pd_pindex) { |
fb4ca018 MD |
4257 | pmap_scan_callback(&dummy_pv, info); |
4258 | ++dummy_pv.pv_pindex; | |
95270b7e MD |
4259 | if (dummy_pv.pv_pindex < info->sva_pd_pindex) /*wrap*/ |
4260 | break; | |
fb4ca018 MD |
4261 | } |
4262 | spin_unlock(&pmap->pm_spin); | |
4263 | } else { | |
4264 | /* | |
567a6398 MD |
4265 | * User page tables maintain local PML4, PDP, PD, and PT |
4266 | * pv_entry's. pv_entry's are not used for PTEs. | |
fb4ca018 MD |
4267 | */ |
4268 | spin_lock(&pmap->pm_spin); | |
76f1911e MD |
4269 | pv_entry_rb_tree_RB_SCAN(&pmap->pm_pvroot, pmap_scan_cmp, |
4270 | pmap_scan_callback, info); | |
fb4ca018 MD |
4271 | spin_unlock(&pmap->pm_spin); |
4272 | } | |
ccd67bf6 | 4273 | pmap_inval_bulk_flush(info->bulk); |
fb4ca018 MD |
4274 | } |
4275 | ||
4276 | /* | |
4277 | * WARNING! pmap->pm_spin held | |
95270b7e MD |
4278 | * |
4279 | * WARNING! eva can overflow our standard ((N + mask) >> bits) | |
4280 | * bounds, resulting in a pd_pindex of 0. To solve the | |
4281 | * problem we use an inclusive range. | |
fb4ca018 MD |
4282 | */ |
4283 | static int | |
4284 | pmap_scan_cmp(pv_entry_t pv, void *data) | |
4285 | { | |
4286 | struct pmap_scan_info *info = data; | |
4287 | if (pv->pv_pindex < info->sva_pd_pindex) | |
4288 | return(-1); | |
95270b7e | 4289 | if (pv->pv_pindex > info->eva_pd_pindex) |
fb4ca018 MD |
4290 | return(1); |
4291 | return(0); | |
4292 | } | |
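/*
 * A minimal sketch of why the comparator above uses an inclusive upper
 * bound.  When eva sits at the very top of the address space, mask-and-
 * shift arithmetic on it wraps to pindex 0 and an exclusive bound would
 * reject every pv.  Backing off one page (eva - PAGE_SIZE) before
 * converting, and comparing with <=, avoids the wrap.  The shift count
 * is an assumption for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096ULL
#define PD_SHIFT	30		/* one PD page maps 1GB here */

static uint64_t
pd_pindex(uint64_t va)
{
	return (va >> PD_SHIFT);
}

int
main(void)
{
	uint64_t sva = 0xffffffffc0000000ULL;
	uint64_t eva = 0;		/* top of space, already wrapped */

	/* exclusive end wraps to 0 and matches nothing */
	printf("exclusive end pindex: %llu\n",
	       (unsigned long long)pd_pindex(eva));

	/* inclusive end backs off one page first and stays in range */
	printf("inclusive end pindex: %llu\n",
	       (unsigned long long)pd_pindex(eva - PAGE_SIZE));
	printf("sva in range: %d\n",
	       pd_pindex(sva) <= pd_pindex(eva - PAGE_SIZE));
	return (0);
}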
4293 | ||
4294 | /* | |
76f1911e MD |
4295 | * pmap_scan() by PDs |
4296 | * | |
fb4ca018 MD |
4297 | * WARNING! pmap->pm_spin held |
4298 | */ | |
4299 | static int | |
4300 | pmap_scan_callback(pv_entry_t pv, void *data) | |
4301 | { | |
4302 | struct pmap_scan_info *info = data; | |
4303 | struct pmap *pmap = info->pmap; | |
4304 | pv_entry_t pd_pv; /* A page directory PV */ | |
4305 | pv_entry_t pt_pv; /* A page table PV */ | |
76f1911e | 4306 | vm_pindex_t *pt_placemark; |
fb4ca018 | 4307 | pt_entry_t *ptep; |
8e2efb11 | 4308 | pt_entry_t oldpte; |
fb4ca018 MD |
4309 | vm_offset_t sva; |
4310 | vm_offset_t eva; | |
4311 | vm_offset_t va_next; | |
4312 | vm_pindex_t pd_pindex; | |
4313 | int error; | |
4314 | ||
a7a03a5f MD |
4315 | /* |
4316 | * Stop if requested | |
4317 | */ | |
4318 | if (info->stop) | |
4319 | return -1; | |
4320 | ||
fb4ca018 MD |
4321 | /* |
4322 | * Pull the PD pindex from the pv before releasing the spinlock. | |
4323 | * | |
4324 | * WARNING: pv is faked for kernel pmap scans. | |
4325 | */ | |
4326 | pd_pindex = pv->pv_pindex; | |
4327 | spin_unlock(&pmap->pm_spin); | |
4328 | pv = NULL; /* invalid after spinlock unlocked */ | |
4329 | ||
4330 | /* | |
4331 | * Calculate the page range within the PD. SIMPLE pmaps are | |
4332 | * direct-mapped for the entire 2^64 address space. Normal pmaps | |
4333 | * reflect the user and kernel address space which requires | |
4334 | * canonicalization with regard to converting pd_pindex's back | |
4335 | * into addresses. | |
4336 | */ | |
76f1911e | 4337 | sva = (pd_pindex - pmap_pd_pindex(0)) << PDPSHIFT; |
fb4ca018 MD |
4338 | if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0 && |
4339 | (sva & PML4_SIGNMASK)) { | |
4340 | sva |= PML4_SIGNMASK; | |
4341 | } | |
4342 | eva = sva + NBPDP; /* can overflow */ | |
4343 | if (sva < info->sva) | |
4344 | sva = info->sva; | |
4345 | if (eva < info->sva || eva > info->eva) | |
4346 | eva = info->eva; | |
4347 | ||
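/*
 * Editor's note: a short sketch of the canonicalization above.  On
 * x86-64 an address is canonical only if the bits covered by
 * PML4_SIGNMASK are all clear or all set, so when a pd_pindex from
 * the kernel half is converted back to a virtual address the high
 * bits must be re-extended.  Assumed values, illustrative only:
 */
#if 0
	vm_offset_t va = (vm_offset_t)pd_pindex << PDPSHIFT;
	if (va & PML4_SIGNMASK)		/* any high bit set? */
		va |= PML4_SIGNMASK;	/* then all of them must be set */
#endif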
701c977e MD |
4348 | /* |
4349 | * NOTE: kernel mappings do not track page table pages, only | |
4350 | * terminal pages. | |
4351 | * | |
4352 | * NOTE: Locks must be ordered bottom-up. pte,pt,pd,pdp,pml4. | |
4353 | * However, for the scan to be efficient we try to | |
4354 | * cache items top-down. | |
4355 | */ | |
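/*
 * Editor's note: the bottom-up lock order in the NOTE above is why
 * the error paths in this loop always release the higher-level pd_pv
 * before blocking on a pt-level lock or placemarker.  A schematic of
 * the recovery pattern used below:
 */
#if 0
	/* holding pd_pv (higher level), failed to lock a pt-level entry */
	pv_put(pd_pv);				 /* release higher level first */
	pd_pv = NULL;
	pv_placemarker_wait(pmap, pt_placemark); /* now safe to block */
	va_next = sva;				 /* retry the same range */
#endif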
701c977e MD |
4356 | pd_pv = NULL; |
4357 | pt_pv = NULL; | |
4358 | ||
48ffc236 | 4359 | for (; sva < eva; sva = va_next) { |
a7a03a5f MD |
4360 | if (info->stop) |
4361 | break; | |
701c977e MD |
4362 | if (sva >= VM_MAX_USER_ADDRESS) { |
4363 | if (pt_pv) { | |
4364 | pv_put(pt_pv); | |
4365 | pt_pv = NULL; | |
4366 | } | |
4367 | goto kernel_skip; | |
4368 | } | |
4369 | ||
4370 | /* | |
76f1911e | 4371 | * PD cache, scan shortcut if it doesn't exist. |
701c977e MD |
4372 | */ |
4373 | if (pd_pv == NULL) { | |
76f1911e MD |
4374 | pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); |
4375 | } else if (pd_pv->pv_pmap != pmap || | |
4376 | pd_pv->pv_pindex != pmap_pd_pindex(sva)) { | |
701c977e | 4377 | pv_put(pd_pv); |
76f1911e | 4378 | pd_pv = pv_get(pmap, pmap_pd_pindex(sva), NULL); |
701c977e MD |
4379 | } |
4380 | if (pd_pv == NULL) { | |
48ffc236 JG |
4381 | va_next = (sva + NBPDP) & ~PDPMASK; |
4382 | if (va_next < sva) | |
4383 | va_next = eva; | |
4384 | continue; | |
4385 | } | |
c8fe38ae MD |
4386 | |
4387 | /* | |
701c977e | 4388 | * PT cache |
a3a33e50 MD |
4389 | * |
4390 | * NOTE: The cached pt_pv can be removed from the pmap when | |
4391 | * pmap_dynamic_delete is enabled. | |
c8fe38ae | 4392 | */ |
76f1911e MD |
4393 | if (pt_pv && (pt_pv->pv_pmap != pmap || |
4394 | pt_pv->pv_pindex != pmap_pt_pindex(sva))) { | |
701c977e | 4395 | pv_put(pt_pv); |
76f1911e MD |
4396 | pt_pv = NULL; |
4397 | } | |
4398 | if (pt_pv == NULL) { | |
4399 | pt_pv = pv_get_try(pmap, pmap_pt_pindex(sva), | |
4400 | &pt_placemark, &error); | |
4401 | if (error) { | |
4402 | pv_put(pd_pv); /* lock order */ | |
4403 | pd_pv = NULL; | |
4404 | if (pt_pv) { | |
4405 | pv_lock(pt_pv); | |
a3a33e50 | 4406 | pv_put(pt_pv); |
76f1911e MD |
4407 | pt_pv = NULL; |
4408 | } else { | |
4409 | pv_placemarker_wait(pmap, pt_placemark); | |
4410 | } | |
4411 | va_next = sva; | |
4412 | continue; | |
4413 | } | |
4414 | /* may have to re-check later if pt_pv is NULL here */ | |
701c977e | 4415 | } |
c8fe38ae MD |
4416 | |
4417 | /* | |
567a6398 MD |
4418 | * If pt_pv is NULL we either have a shared page table |
4419 | * page (NOT IMPLEMENTED XXX) and must issue a callback | |
4420 | * specific to that case, or there is no page table page. | |
921c891e MD |
4421 | * |
4422 | * Either way we can skip the page table page. | |
76f1911e MD |
4423 | * |
4424 | * WARNING! pt_pv can also be NULL due to a pv creation | |
4425 | * race where we find it to be NULL and then | |
4426 | * later see a pte_pv. But it's possible the pt_pv | |
4427 | * got created in between the two operations, so | |
4428 | * we must check. | |
567a6398 MD |
4429 | * |
4430 | * XXX This should no longer be the case because | |
4431 | * we have pt_placemark. | |
c8fe38ae | 4432 | */ |
701c977e | 4433 | if (pt_pv == NULL) { |
567a6398 MD |
4434 | #if 0 |
4435 | /* XXX REMOVED */ | |
921c891e MD |
4436 | /* |
4437 | * Possible unmanaged (shared from another pmap) | |
4438 | * page table page. | |
76f1911e MD |
4439 | * |
4440 | * WARNING! We must hold pt_placemark across the | |
4441 | * *ptep test to prevent misinterpreting | |
4442 | * a non-zero *ptep as a shared page | |
4443 | * table page. Hold it across the function | |
4444 | * callback as well for SMP safety. | |
921c891e | 4445 | */ |
567a6398 | 4446 | KKASSERT(0); |
921c891e | 4447 | ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva)); |
a86ce0cd | 4448 | if (*ptep & pmap->pmap_bits[PG_V_IDX]) { |
567a6398 | 4449 | info->func(pmap, info, pt_placemark, pd_pv, |
fb4ca018 | 4450 | sva, ptep, info->arg); |
76f1911e MD |
4451 | } else { |
4452 | pv_placemarker_wakeup(pmap, pt_placemark); | |
921c891e | 4453 | } |
567a6398 MD |
4454 | #else |
4455 | pv_placemarker_wakeup(pmap, pt_placemark); | |
4456 | #endif | |
921c891e MD |
4457 | |
4458 | /* | |
4459 | * Done, move to next page table page. | |
4460 | */ | |
701c977e MD |
4461 | va_next = (sva + NBPDR) & ~PDRMASK; |
4462 | if (va_next < sva) | |
4463 | va_next = eva; | |
c8fe38ae | 4464 | continue; |
701c977e | 4465 | } |
c8fe38ae | 4466 | |
48ffc236 | 4467 | /* |
701c977e MD |
4468 | * From this point in the loop testing pt_pv for non-NULL |
4469 | * means we are in UVM, else if it is NULL we are in KVM. | |
fb4ca018 MD |
4470 | * |
4471 | * Limit our scan to either the end of the va represented | |
4472 | * by the current page table page, or to the end of the | |
4473 | * range being removed. | |
48ffc236 | 4474 | */ |
701c977e MD |
4475 | kernel_skip: |
4476 | va_next = (sva + NBPDR) & ~PDRMASK; | |
4477 | if (va_next < sva) | |
4478 | va_next = eva; | |
fb4ca018 MD |
4479 | if (va_next > eva) |
4480 | va_next = eva; | |
48ffc236 | 4481 | |
c8fe38ae | 4482 | /* |
701c977e MD |
4483 | * Scan the page table for pages. Some pages may not be |
4484 | * managed (might not have a pv_entry). | |
4485 | * | |
4486 | * There is no page table management for kernel pages so | |
4487 | * pt_pv will be NULL in that case, but otherwise pt_pv | |
4488 | * is non-NULL, locked, and referenced. | |
c8fe38ae | 4489 | */ |
c8fe38ae | 4490 | |
f2c5d4ab MD |
4491 | /* |
4492 | * At this point a non-NULL pt_pv means a UVA, and a NULL | |
4493 | * pt_pv means a KVA. | |
4494 | */ | |
701c977e MD |
4495 | if (pt_pv) |
4496 | ptep = pv_pte_lookup(pt_pv, pmap_pte_index(sva)); | |
4497 | else | |
4498 | ptep = vtopte(sva); | |
4499 | ||
4500 | while (sva < va_next) { | |
76f1911e | 4501 | vm_pindex_t *pte_placemark; |
567a6398 | 4502 | pv_entry_t pte_pv; |
76f1911e | 4503 | |
cfffe7b1 | 4504 | /* |
a7a03a5f | 4505 | * Yield every 64 pages, stop if requested. |
cfffe7b1 MD |
4506 | */ |
4507 | if ((++info->count & 63) == 0) | |
4508 | lwkt_user_yield(); | |
a7a03a5f MD |
4509 | if (info->stop) |
4510 | break; | |
4511 | ||
4512 | /* | |
76f1911e MD |
4513 | * We can shortcut our scan if *ptep == 0. This is |
4514 | * an unlocked check. | |
a7a03a5f | 4515 | */ |
76f1911e MD |
4516 | if (*ptep == 0) { |
4517 | sva += PAGE_SIZE; | |
4518 | ++ptep; | |
4519 | continue; | |
4520 | } | |
5ee06c6c | 4521 | cpu_ccfence(); |
cfffe7b1 | 4522 | |
f2c5d4ab | 4523 | /* |
567a6398 MD |
4524 | * Acquire the pte_placemark. pte_pv's won't exist |
4525 | * for leaf pages. | |
f2c5d4ab | 4526 | * |
567a6398 MD |
4527 | * A multitude of races are possible here, so if we | |
4528 | * cannot lock definite state, we clean out our cache | |
4529 | * and break the inner while() loop to force a loop | |
4530 | * up to the top of the for(). | |
90244566 | 4531 | * |
76f1911e MD |
4532 | * XXX unlock/relock pd_pv, pt_pv, and re-test their |
4533 | * validity instead of looping up? | |
f2c5d4ab | 4534 | */ |
76f1911e MD |
4535 | pte_pv = pv_get_try(pmap, pmap_pte_pindex(sva), |
4536 | &pte_placemark, &error); | |
567a6398 | 4537 | KKASSERT(pte_pv == NULL); |
76f1911e | 4538 | if (error) { |
698ba577 MD |
4539 | if (pd_pv) { |
4540 | pv_put(pd_pv); /* lock order */ | |
4541 | pd_pv = NULL; | |
4542 | } | |
76f1911e MD |
4543 | if (pt_pv) { |
4544 | pv_put(pt_pv); /* lock order */ | |
4545 | pt_pv = NULL; | |
4546 | } | |
567a6398 | 4547 | pv_placemarker_wait(pmap, pte_placemark); |
76f1911e MD |
4548 | va_next = sva; /* retry */ |
4549 | break; | |
701c977e MD |
4550 | } |
4551 | ||
a505393f | 4552 | /* |
e989b548 | 4553 | * Reload *ptep after successfully locking the |
567a6398 | 4554 | * pindex. |
a505393f | 4555 | */ |
e989b548 | 4556 | cpu_ccfence(); |
8e2efb11 MD |
4557 | oldpte = *ptep; |
4558 | if (oldpte == 0) { | |
567a6398 | 4559 | pv_placemarker_wakeup(pmap, pte_placemark); |
90244566 MD |
4560 | sva += PAGE_SIZE; |
4561 | ++ptep; | |
a505393f MD |
4562 | continue; |
4563 | } | |
4564 | ||
76f1911e MD |
4565 | /* |
4566 | * We can't hold pd_pv across the callback (because | |
4567 | * we don't pass it to the callback and the callback | |
4568 | * might deadlock) | |
4569 | */ | |
4570 | if (pd_pv) { | |
4571 | vm_page_wire_quick(pd_pv->pv_m); | |
4572 | pv_unlock(pd_pv); | |
4573 | } | |
4574 | ||
701c977e | 4575 | /* |
567a6398 MD |
4576 | * Ready for the callback. The locked placemarker |
4577 | * is consumed by the callback. | |
701c977e | 4578 | */ |
e989b548 MD |
4579 | if (oldpte & pmap->pmap_bits[PG_MANAGED_IDX]) { |
4580 | /* | |
4581 | * Managed pte | |
4582 | */ | |
567a6398 MD |
4583 | KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]), |
4584 | ("badC *ptep %016lx/%016lx sva %016lx", | |
4585 | *ptep, oldpte, sva)); | |
a7a03a5f MD |
4586 | /* |
4587 | * We must unlock pd_pv across the callback | |
4588 | * to avoid deadlocks on any recursive | |
4589 | * disposal. Re-check that it still exists | |
4590 | * after re-locking. | |
76f1911e | 4591 | * |
567a6398 MD |
4592 | * Call target disposes of pte_placemark |
4593 | * and may destroy but will not dispose | |
4594 | * of pt_pv. | |
a7a03a5f | 4595 | */ |
567a6398 | 4596 | info->func(pmap, info, pte_placemark, pt_pv, |
a7a03a5f | 4597 | sva, ptep, info->arg); |
701c977e | 4598 | } else { |
8e2efb11 | 4599 | /* |
e989b548 MD |
4600 | * Unmanaged pte |
4601 | * | |
a7a03a5f MD |
4602 | * We must unlock pd_pv across the callback |
4603 | * to avoid deadlocks on any recursive | |
4604 | * disposal. Re-check that it still exists | |
4605 | * after re-locking. | |
76f1911e | 4606 | * |
567a6398 MD |
4607 | * Call target disposes of pte_placemark |
4608 | * and may destroy but will not dispose | |
4609 | * of pt_pv. | |
8e2efb11 | 4610 | */ |
567a6398 MD |
4611 | KASSERT((oldpte & pmap->pmap_bits[PG_V_IDX]), |
4612 | ("badD *ptep %016lx/%016lx sva %016lx ", | |
4613 | *ptep, oldpte, sva)); | |
4614 | info->func(pmap, info, pte_placemark, pt_pv, | |
4615 | sva, ptep, info->arg); | |
76f1911e MD |
4616 | } |
4617 | if (pd_pv) { | |
4618 | pv_lock(pd_pv); | |
e3c330f0 MD |
4619 | if (vm_page_unwire_quick(pd_pv->pv_m)) { |
4620 | panic("pmap_scan_callback: " | |
4621 | "bad wirecount on pd_pv"); | |
4622 | } | |
76f1911e MD |
4623 | if (pd_pv->pv_pmap == NULL) { |
4624 | va_next = sva; /* retry */ | |
4625 | break; | |
a7a03a5f | 4626 | } |
701c977e | 4627 | } |
a3a33e50 MD |
4628 | |
4629 | /* | |
4630 | * NOTE: The cached pt_pv can be removed from the | |
4631 | * pmap when pmap_dynamic_delete is enabled, | |
4632 | * which will cause ptep to become stale. | |
4633 | * | |
4634 | * This also means that no pages remain under | |
4635 | * the PT, so we can just break out of the inner | |
4636 | * loop and let the outer loop clean everything | |
4637 | * up. | |
4638 | */ | |
4639 | if (pt_pv && pt_pv->pv_pmap != pmap) | |
4640 | break; | |
701c977e MD |
4641 | sva += PAGE_SIZE; |
4642 | ++ptep; | |
c8fe38ae MD |
4643 | } |
4644 | } | |
701c977e MD |
4645 | if (pd_pv) { |
4646 | pv_put(pd_pv); | |
4647 | pd_pv = NULL; | |
4648 | } | |
4649 | if (pt_pv) { | |
4650 | pv_put(pt_pv); | |
4651 | pt_pv = NULL; | |
4652 | } | |
e674353b MD |
4653 | if ((++info->count & 7) == 0) |
4654 | lwkt_user_yield(); | |
fb4ca018 MD |
4655 | |
4656 | /* | |
4657 | * Relock before returning. | |
4658 | */ | |
4659 | spin_lock(&pmap->pm_spin); | |
4660 | return (0); | |
701c977e MD |
4661 | } |
4662 | ||
4663 | void | |
4664 | pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) | |
4665 | { | |
fb4ca018 MD |
4666 | struct pmap_scan_info info; |
4667 | ||
4668 | info.pmap = pmap; | |
4669 | info.sva = sva; | |
4670 | info.eva = eva; | |
4671 | info.func = pmap_remove_callback; | |
4672 | info.arg = NULL; | |
ccd67bf6 | 4673 | pmap_scan(&info, 1); |
95270b7e MD |
4674 | #if 0 |
4675 | cpu_invltlb(); | |
4676 | if (eva - sva < 1024*1024) { | |
4677 | while (sva < eva) { | |
4678 | cpu_invlpg((void *)sva); | |
4679 | sva += PAGE_SIZE; | |
4680 | } | |
4681 | } | |
4682 | #endif | |
701c977e MD |
4683 | } |
4684 | ||
4685 | static void | |
9df83100 MD |
4686 | pmap_remove_noinval(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva) |
4687 | { | |
4688 | struct pmap_scan_info info; | |
4689 | ||
4690 | info.pmap = pmap; | |
4691 | info.sva = sva; | |
4692 | info.eva = eva; | |
4693 | info.func = pmap_remove_callback; | |
4694 | info.arg = NULL; | |
ccd67bf6 | 4695 | pmap_scan(&info, 0); |
9df83100 MD |
4696 | } |
4697 | ||
4698 | static void | |
4699 | pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info, | |
567a6398 | 4700 | vm_pindex_t *pte_placemark, pv_entry_t pt_pv, |
921c891e | 4701 | vm_offset_t va, pt_entry_t *ptep, void *arg __unused) |
701c977e MD |
4702 | { |
4703 | pt_entry_t pte; | |
c2830aa6 | 4704 | vm_page_t oldm; |
701c977e | 4705 | |
567a6398 MD |
4706 | /* |
4707 | * Managed or unmanaged pte (pte_placemark is non-NULL) | |
4708 | * | |
4709 | * pt_pv's wire_count is still bumped by unmanaged pages | |
4710 | * so we must decrement it manually. | |
4711 | * | |
4712 | * We have to unwire the target page table page. | |
4713 | */ | |
c2830aa6 MD |
4714 | pte = *ptep; |
4715 | if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) { | |
4716 | oldm = PHYS_TO_VM_PAGE(pte & PG_FRAME); | |
4717 | atomic_add_long(&oldm->md.interlock_count, 1); | |
4718 | } else { | |
4719 | oldm = NULL; | |
4720 | } | |
c2830aa6 | 4721 | |
567a6398 MD |
4722 | pte = pmap_inval_bulk(info->bulk, va, ptep, 0); |
4723 | if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) { | |
4724 | vm_page_t p; | |
a7a03a5f | 4725 | |
567a6398 MD |
4726 | p = PHYS_TO_VM_PAGE(pte & PG_FRAME); |
4727 | KKASSERT(pte & pmap->pmap_bits[PG_V_IDX]); | |
4728 | if (pte & pmap->pmap_bits[PG_M_IDX]) | |
4729 | vm_page_dirty(p); | |
4730 | if (pte & pmap->pmap_bits[PG_A_IDX]) | |
4731 | vm_page_flag_set(p, PG_REFERENCED); | |
e3c330f0 MD |
4732 | |
4733 | /* | |
c2830aa6 MD |
4734 | * (p) is not hard-busied. |
4735 | * | |
b9a6fe08 SW |
4736 | * We can safely clear PG_MAPPED and PG_WRITEABLE only |
4737 | * if PG_MAPPEDMULTI is not set, atomically. | |
e3c330f0 | 4738 | */ |
c2830aa6 | 4739 | pmap_removed_pte(p, pte); |
e3c330f0 MD |
4740 | } |
4741 | if (pte & pmap->pmap_bits[PG_V_IDX]) { | |
4742 | atomic_add_long(&pmap->pm_stats.resident_count, -1); | |
4743 | if (pt_pv && vm_page_unwire_quick(pt_pv->pv_m)) | |
4744 | panic("pmap_remove: insufficient wirecount"); | |
567a6398 MD |
4745 | } |
4746 | if (pte & pmap->pmap_bits[PG_W_IDX]) | |
4747 | atomic_add_long(&pmap->pm_stats.wired_count, -1); | |
4748 | if (pte & pmap->pmap_bits[PG_G_IDX]) | |
4749 | cpu_invlpg((void *)va); | |
567a6398 | 4750 | pv_placemarker_wakeup(pmap, pte_placemark); |
c2830aa6 MD |
4751 | if (oldm) { |
4752 | if ((atomic_fetchadd_long(&oldm->md.interlock_count, -1) & | |
4753 | 0x7FFFFFFFFFFFFFFFLU) == 0x4000000000000001LU) { | |
4754 | atomic_clear_long(&oldm->md.interlock_count, | |
4755 | 0x4000000000000000LU); | |
4756 | wakeup(&oldm->md.interlock_count); | |
4757 | } | |
4758 | } | |
d7f50089 YY |
4759 | } |
4760 | ||
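/*
 * Editor's note: the interlock_count manipulation above packs several
 * fields into one 64-bit word.  The layout below is inferred from the
 * constants used in this file and should be treated as an assumption:
 *
 *	bit  63	   - a backing scan bumped the word (scan marker)
 *	bit  62	   - a waiter is sleeping on the count
 *	bits 61:0  - number of outstanding pte-swap interlocks
 *
 * The fetchadd(-1) drops our interlock; if the pre-decrement value,
 * ignoring bit 63, was exactly (waiter-flag | 1) we were the last
 * holder and must clear the waiter bit and wake the sleeper.
 */
#if 0
#define IC_SCAN		0x8000000000000000LU	/* bit 63 (hypothetical name) */
#define IC_WAITER	0x4000000000000000LU	/* bit 62 (hypothetical name) */
#define IC_COUNT	0x3FFFFFFFFFFFFFFFLU	/* bits 61:0 (hypothetical) */
#endif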
4761 | /* | |
b12defdc MD |
4762 | * Removes this physical page from all physical maps in which it resides. |
4763 | * Reflects back modify bits to the pager. | |
d7f50089 | 4764 | * |
b12defdc | 4765 | * This routine may not be called from an interrupt. |
567a6398 MD |
4766 | * |
4767 | * The page must be busied by its caller, preventing new ptes from being | |
4768 | * installed. This allows us to assert that pmap_count is zero and safely | |
4769 | * clear the MAPPED and WRITEABLE bits upon completion. | |
d7f50089 | 4770 | */ |
bfc09ba0 MD |
4771 | static |
4772 | void | |
d7f50089 YY |
4773 | pmap_remove_all(vm_page_t m) |
4774 | { | |
c2830aa6 | 4775 | long icount; |
e3c330f0 MD |
4776 | int retry; |
4777 | ||
427fbc33 | 4778 | if (__predict_false(!pmap_initialized)) |
c8fe38ae | 4779 | return; |
831a8507 MD |
4780 | |
4781 | /* | |
4782 | * pmap_count doesn't cover fictitious pages, but PG_MAPPED does | |
4783 | * (albeit without certain race protections). | |
4784 | */ | |
4785 | #if 0 | |
567a6398 MD |
4786 | if (m->md.pmap_count == 0) |
4787 | return; | |
831a8507 MD |
4788 | #endif |
4789 | if ((m->flags & PG_MAPPED) == 0) | |
4790 | return; | |
e3c330f0 MD |
4791 | |
4792 | retry = ticks + hz * 60; | |
4793 | again: | |
567a6398 MD |
4794 | PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { |
4795 | if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0)) | |
4796 | PMAP_PAGE_BACKING_RETRY; | |
4797 | if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) { | |
4798 | if (ipte & ipmap->pmap_bits[PG_M_IDX]) | |
4799 | vm_page_dirty(m); | |
4800 | if (ipte & ipmap->pmap_bits[PG_A_IDX]) | |
4801 | vm_page_flag_set(m, PG_REFERENCED); | |
e3c330f0 MD |
4802 | |
4803 | /* | |
4804 | * NOTE: m is not hard-busied so it is not safe to | |
4805 | * clear PG_MAPPED and PG_WRITEABLE on the 1->0 | |
4806 | * transition against them being set in | |
4807 | * pmap_enter(). | |
4808 | */ | |
c2830aa6 | 4809 | pmap_removed_pte(m, ipte); |
b12defdc | 4810 | } |
e3e69557 | 4811 | |
b12defdc | 4812 | /* |
567a6398 MD |
4813 | * Cleanup various tracking counters. pt_pv can't go away |
4814 | * due to our wired ref. | |
b12defdc | 4815 | */ |
c713db65 | 4816 | if (ipmap != kernel_pmap) { |
567a6398 | 4817 | pv_entry_t pt_pv; |
67e7cb85 | 4818 | |
567a6398 MD |
4819 | spin_lock_shared(&ipmap->pm_spin); |
4820 | pt_pv = pv_entry_lookup(ipmap, pmap_pt_pindex(iva)); | |
4821 | spin_unlock_shared(&ipmap->pm_spin); | |
4822 | ||
4823 | if (pt_pv) { | |
e3c330f0 MD |
4824 | if (vm_page_unwire_quick(pt_pv->pv_m)) { |
4825 | panic("pmap_remove_all: bad " | |
4826 | "wire_count on pt_pv"); | |
4827 | } | |
567a6398 MD |
4828 | atomic_add_long( |
4829 | &ipmap->pm_stats.resident_count, -1); | |
67e7cb85 | 4830 | } |
67e7cb85 | 4831 | } |
567a6398 MD |
4832 | if (ipte & ipmap->pmap_bits[PG_W_IDX]) |
4833 | atomic_add_long(&ipmap->pm_stats.wired_count, -1); | |
4834 | if (ipte & ipmap->pmap_bits[PG_G_IDX]) | |
4835 | cpu_invlpg((void *)iva); | |
4836 | } PMAP_PAGE_BACKING_DONE; | |
e3c330f0 | 4837 | |
c2830aa6 MD |
4838 | /* |
4839 | * If our scan lost a pte swap race, m->md.interlock_count might |
4840 | * be set from the pmap_enter() code. If so, sleep a little and try |
4841 | * again. | |
4842 | */ | |
4843 | icount = atomic_fetchadd_long(&m->md.interlock_count, | |
4844 | 0x8000000000000000LU) + | |
4845 | 0x8000000000000000LU; | |
4846 | cpu_ccfence(); | |
4847 | while (icount & 0x3FFFFFFFFFFFFFFFLU) { | |
4848 | tsleep_interlock(&m->md.interlock_count, 0); | |
4849 | if (atomic_fcmpset_long(&m->md.interlock_count, &icount, | |
4850 | icount | 0x4000000000000000LU)) { | |
4851 | tsleep(&m->md.interlock_count, PINTERLOCKED, | |
4852 | "pgunm", 1); | |
4853 | icount = m->md.interlock_count; | |
4854 | if (retry - ticks > 0) | |
4855 | goto again; | |
4856 | panic("pmap_remove_all: cannot return interlock_count " | |
4857 | "to 0 (%p, %ld)", | |
4858 | m, m->md.interlock_count); | |
4859 | } | |
4860 | } | |
c2830aa6 | 4861 | vm_page_flag_clear(m, PG_MAPPED | PG_MAPPEDMULTI | PG_WRITEABLE); |
d7f50089 YY |
4862 | } |
4863 | ||
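/*
 * Editor's note: PMAP_PAGE_BACKING_SCAN above walks every pte that
 * currently maps (m), binding (ipmap, iptep, ipte, iva) for the body.
 * PMAP_PAGE_BACKING_RETRY restarts the current entry after a lost
 * cmpset race and PMAP_PAGE_BACKING_DONE closes the scan.  The shape
 * of the idiom, as used throughout this file:
 */
#if 0
	PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) {
		if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0))
			PMAP_PAGE_BACKING_RETRY;    /* pte changed, redo */
		/* ... ipte now holds the value we atomically replaced ... */
	} PMAP_PAGE_BACKING_DONE;
#endif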
a7a03a5f | 4864 | /* |
567a6398 MD |
4865 | * Removes the page from a particular pmap. |
4866 | * | |
4867 | * The page must be busied by the caller. | |
a7a03a5f MD |
4868 | */ |
4869 | void | |
567a6398 | 4870 | pmap_remove_specific(pmap_t pmap_match, vm_page_t m) |
a7a03a5f | 4871 | { |
427fbc33 | 4872 | if (__predict_false(!pmap_initialized)) |
a7a03a5f | 4873 | return; |
831a8507 MD |
4874 | |
4875 | /* | |
4876 | * PG_MAPPED test works for both non-fictitious and fictitious pages. | |
4877 | */ | |
4878 | if ((m->flags & PG_MAPPED) == 0) | |
567a6398 | 4879 | return; |
831a8507 | 4880 | |
567a6398 MD |
4881 | PMAP_PAGE_BACKING_SCAN(m, pmap_match, ipmap, iptep, ipte, iva) { |
4882 | if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, 0)) | |
4883 | PMAP_PAGE_BACKING_RETRY; | |
4884 | if (ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) { | |
4885 | if (ipte & ipmap->pmap_bits[PG_M_IDX]) | |
4886 | vm_page_dirty(m); | |
4887 | if (ipte & ipmap->pmap_bits[PG_A_IDX]) | |
4888 | vm_page_flag_set(m, PG_REFERENCED); | |
e3c330f0 MD |
4889 | |
4890 | /* | |
4891 | * NOTE: m is not hard-busied so it is not safe to | |
4892 | * clear PG_MAPPED and PG_WRITEABLE on the 1->0 | |
4893 | * transition against them being set in | |
4894 | * pmap_enter(). | |
4895 | */ | |
c2830aa6 | 4896 | pmap_removed_pte(m, ipte); |
a7a03a5f MD |
4897 | } |
4898 | ||
4899 | /* | |
567a6398 MD |
4900 | * Cleanup various tracking counters. pt_pv can't go away |
4901 | * due to our wired ref. | |
a7a03a5f | 4902 | */ |
c713db65 | 4903 | if (ipmap != kernel_pmap) { |
567a6398 MD |
4904 | pv_entry_t pt_pv; |
4905 | ||
4906 | spin_lock_shared(&ipmap->pm_spin); | |
4907 | pt_pv = pv_entry_lookup(ipmap, pmap_pt_pindex(iva)); | |
4908 | spin_unlock_shared(&ipmap->pm_spin); | |
4909 | ||
4910 | if (pt_pv) { | |
4911 | atomic_add_long( | |
4912 | &ipmap->pm_stats.resident_count, -1); | |
e3c330f0 MD |
4913 | if (vm_page_unwire_quick(pt_pv->pv_m)) { |
4914 | panic("pmap_remove_specific: bad " | |
4915 | "wire_count on pt_pv"); | |
4916 | } | |
567a6398 MD |
4917 | } |
4918 | } | |
4919 | if (ipte & ipmap->pmap_bits[PG_W_IDX]) | |
4920 | atomic_add_long(&ipmap->pm_stats.wired_count, -1); | |
4921 | if (ipte & ipmap->pmap_bits[PG_G_IDX]) | |
4922 | cpu_invlpg((void *)iva); | |
4923 | } PMAP_PAGE_BACKING_DONE; | |
a7a03a5f MD |
4924 | } |
4925 | ||
d7f50089 | 4926 | /* |
921c891e MD |
4927 | * Set the physical protection on the specified range of this map |
4928 | * as requested. This function is typically only used for debug watchpoints | |
4929 | * and COW pages. | |
d7f50089 | 4930 | * |
921c891e MD |
4931 | * This function may not be called from an interrupt if the map is |
4932 | * not the kernel_pmap. | |
d7f50089 | 4933 | * |
921c891e | 4934 | * NOTE! For shared page table pages we just unmap the page. |
d7f50089 YY |
4935 | */ |
4936 | void | |
4937 | pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot) | |
4938 | { | |
fb4ca018 | 4939 | struct pmap_scan_info info; |
48ffc236 JG |
4940 | /* JG review for NX */ |
4941 | ||
c8fe38ae MD |
4942 | if (pmap == NULL) |
4943 | return; | |
73d64b98 | 4944 | if ((prot & (VM_PROT_READ | VM_PROT_EXECUTE)) == VM_PROT_NONE) { |
c8fe38ae MD |
4945 | pmap_remove(pmap, sva, eva); |
4946 | return; | |
4947 | } | |
c8fe38ae MD |
4948 | if (prot & VM_PROT_WRITE) |
4949 | return; | |
fb4ca018 MD |
4950 | info.pmap = pmap; |
4951 | info.sva = sva; | |
4952 | info.eva = eva; | |
4953 | info.func = pmap_protect_callback; | |
4954 | info.arg = &prot; | |
ccd67bf6 | 4955 | pmap_scan(&info, 1); |
701c977e | 4956 | } |
c8fe38ae | 4957 | |
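/*
 * Editor's note: pmap_remove(), pmap_remove_noinval() and
 * pmap_protect() all drive the same pmap_scan() engine; only the
 * callback and argument differ.  A hedged sketch of wiring up another
 * operation in the same style (pmap_example_callback is hypothetical):
 */
#if 0
	struct pmap_scan_info info;

	info.pmap = pmap;
	info.sva = sva;
	info.eva = eva;
	info.func = pmap_example_callback;	/* hypothetical callback */
	info.arg = NULL;
	pmap_scan(&info, 1);	/* 1 = bulk invalidate, 0 = caller handles */
#endif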
701c977e MD |
4958 | static |
4959 | void | |
9df83100 | 4960 | pmap_protect_callback(pmap_t pmap, struct pmap_scan_info *info, |
567a6398 MD |
4961 | vm_pindex_t *pte_placemark, |
4962 | pv_entry_t pt_pv, vm_offset_t va, | |
4963 | pt_entry_t *ptep, void *arg __unused) | |
701c977e MD |
4964 | { |
4965 | pt_entry_t pbits; | |
4966 | pt_entry_t cbits; | |
4967 | vm_page_t m; | |
c8fe38ae | 4968 | |
c2fb025d | 4969 | again: |
701c977e | 4970 | pbits = *ptep; |
567a6398 | 4971 | cpu_ccfence(); |
701c977e | 4972 | cbits = pbits; |
567a6398 MD |
4973 | if (pbits & pmap->pmap_bits[PG_MANAGED_IDX]) { |
4974 | cbits &= ~pmap->pmap_bits[PG_A_IDX]; | |
4975 | cbits &= ~pmap->pmap_bits[PG_M_IDX]; | |
c8fe38ae | 4976 | } |
921c891e MD |
4977 | /* else unmanaged page, adjust bits, no wire changes */ |
4978 | ||
4979 | if (ptep) { | |
a86ce0cd | 4980 | cbits &= ~pmap->pmap_bits[PG_RW_IDX]; |
a44410dd MD |
4981 | #ifdef PMAP_DEBUG2 |
4982 | if (pmap_enter_debug > 0) { | |
4983 | --pmap_enter_debug; | |
567a6398 | 4984 | kprintf("pmap_protect va=%lx ptep=%p " |
a44410dd | 4985 | "pt_pv=%p cbits=%08lx\n", |
567a6398 | 4986 | va, ptep, pt_pv, cbits |
a44410dd MD |
4987 | ); |
4988 | } | |
4989 | #endif | |
79f2da03 | 4990 | if (pbits != cbits) { |
567a6398 | 4991 | if (!pmap_inval_smp_cmpset(pmap, va, |
79f2da03 MD |
4992 | ptep, pbits, cbits)) { |
4993 | goto again; | |
4994 | } | |
921c891e | 4995 | } |
567a6398 | 4996 | if (pbits & pmap->pmap_bits[PG_MANAGED_IDX]) { |
e3c330f0 MD |
4997 | m = PHYS_TO_VM_PAGE(pbits & PG_FRAME); |
4998 | if (pbits & pmap->pmap_bits[PG_A_IDX]) | |
4999 | vm_page_flag_set(m, PG_REFERENCED); | |
5000 | if (pbits & pmap->pmap_bits[PG_M_IDX]) | |
5001 | vm_page_dirty(m); | |
567a6398 | 5002 | } |
701c977e | 5003 | } |
567a6398 | 5004 | pv_placemarker_wakeup(pmap, pte_placemark); |
d7f50089 YY |
5005 | } |
5006 | ||
5007 | /* | |
701c977e MD |
5008 | * Insert the vm_page (m) at the virtual address (va), replacing any prior |
5009 | * mapping at that address. Set protection and wiring as requested. | |
d7f50089 | 5010 | * |
921c891e MD |
5011 | * If entry is non-NULL we check to see if the SEG_SIZE optimization is |
5012 | * possible. If it is we enter the page into the appropriate shared pmap | |
5013 | * hanging off the related VM object instead of the passed pmap, then we | |
5014 | * share the page table page from the VM object's pmap into the current pmap. | |
5015 | * | |
701c977e MD |
5016 | * NOTE: This routine MUST insert the page into the pmap now, it cannot |
5017 | * lazy-evaluate. | |
d7f50089 YY |
5018 | */ |
5019 | void | |
5020 | pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot, | |
07b67fa6 | 5021 | boolean_t wired, vm_map_entry_t entry) |
701c977e | 5022 | { |
701c977e MD |
5023 | pv_entry_t pt_pv; /* page table */ |
5024 | pv_entry_t pte_pv; /* page table entry */ | |
76f1911e | 5025 | vm_pindex_t *pte_placemark; |
701c977e | 5026 | pt_entry_t *ptep; |
e3c330f0 | 5027 | pt_entry_t origpte; |
c8fe38ae | 5028 | vm_paddr_t opa; |
e3c330f0 MD |
5029 | vm_page_t oldm; |
5030 | pt_entry_t newpte; | |
701c977e | 5031 | vm_paddr_t pa; |
c2830aa6 MD |
5032 | int flags; |
5033 | int nflags; | |
c8fe38ae MD |
5034 | |
5035 | if (pmap == NULL) | |
5036 | return; | |
48ffc236 | 5037 | va = trunc_page(va); |
c8fe38ae MD |
5038 | #ifdef PMAP_DIAGNOSTIC |
5039 | if (va >= KvaEnd) | |
5040 | panic("pmap_enter: toobig"); | |
5041 | if ((va >= UPT_MIN_ADDRESS) && (va < UPT_MAX_ADDRESS)) | |
701c977e MD |
5042 | panic("pmap_enter: invalid to pmap_enter page table " |
5043 | "pages (va: 0x%lx)", va); | |
c8fe38ae | 5044 | #endif |
c713db65 | 5045 | if (va < UPT_MAX_ADDRESS && pmap == kernel_pmap) { |
701c977e MD |
5046 | kprintf("Warning: pmap_enter called on UVA with " |
5047 | "kernel_pmap\n"); | |
48ffc236 | 5048 | #ifdef DDB |
4a3a2ba2 | 5049 | print_backtrace(-1); |
48ffc236 | 5050 | #endif |
c8fe38ae | 5051 | } |
c713db65 | 5052 | if (va >= UPT_MAX_ADDRESS && pmap != kernel_pmap) { |
701c977e MD |
5053 | kprintf("Warning: pmap_enter called on KVA without" |
5054 | "kernel_pmap\n"); | |
48ffc236 | 5055 | #ifdef DDB |
4a3a2ba2 | 5056 | print_backtrace(-1); |
48ffc236 | 5057 | #endif |
c8fe38ae MD |
5058 | } |
5059 | ||
5060 | /* | |
567a6398 MD |
5061 | * Get the locked page table page (pt_pv) for our new page table |
5062 | * entry, allocating it if necessary. | |
701c977e | 5063 | * |
567a6398 MD |
5064 | * There is no pte_pv for a terminal pte so the terminal pte will |
5065 | * be locked via pte_placemark. | |
701c977e | 5066 | * |
567a6398 MD |
5067 | * Only MMU actions by the CPU itself can modify the ptep out from |
5068 | * under us. | |
701c977e MD |
5069 | * |
5070 | * If the pmap is still being initialized we assume existing | |
5071 | * page tables. | |
5072 | * | |
567a6398 MD |
5073 | * NOTE: Kernel mappings do not track page table pages |
5074 | * (i.e. there is no pt_pv structure). |
e989b548 | 5075 | * |
567a6398 MD |
5076 | * NOTE: origpte here is 'tentative', used only to check for |
5077 | * the degenerate case where the entry already exists and | |
5078 | * matches. | |
701c977e | 5079 | */ |
427fbc33 | 5080 | if (__predict_false(pmap_initialized == FALSE)) { |
701c977e MD |
5081 | pte_pv = NULL; |
5082 | pt_pv = NULL; | |
76f1911e | 5083 | pte_placemark = NULL; |
701c977e | 5084 | ptep = vtopte(va); |
8e2efb11 | 5085 | origpte = *ptep; |
567a6398 | 5086 | } else { |
76f1911e MD |
5087 | pte_pv = pv_get(pmap, pmap_pte_pindex(va), &pte_placemark); |
5088 | KKASSERT(pte_pv == NULL); | |
701c977e MD |
5089 | if (va >= VM_MAX_USER_ADDRESS) { |
5090 | pt_pv = NULL; | |
5091 | ptep = vtopte(va); | |
5092 | } else { | |
567a6398 | 5093 | pt_pv = pmap_allocpte(pmap, pmap_pt_pindex(va), NULL); |
701c977e MD |
5094 | ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); |
5095 | } | |
8e2efb11 MD |
5096 | origpte = *ptep; |
5097 | cpu_ccfence(); | |
701c977e | 5098 | } |
c8fe38ae | 5099 | |
48ffc236 | 5100 | pa = VM_PAGE_TO_PHYS(m); |
c8fe38ae | 5101 | |
5ee06c6c | 5102 | /* |
567a6398 | 5103 | * Calculate the new PTE. |
5ee06c6c | 5104 | */ |
a86ce0cd | 5105 | newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | |
8e2efb11 | 5106 | pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]); |
52bb73bc | 5107 | if (wired) |
a86ce0cd | 5108 | newpte |= pmap->pmap_bits[PG_W_IDX]; |
52bb73bc | 5109 | if (va < VM_MAX_USER_ADDRESS) |
a86ce0cd | 5110 | newpte |= pmap->pmap_bits[PG_U_IDX]; |
831a8507 | 5111 | if ((m->flags & PG_FICTITIOUS) == 0) |
a86ce0cd | 5112 | newpte |= pmap->pmap_bits[PG_MANAGED_IDX]; |
c713db65 | 5113 | // if (pmap == kernel_pmap) |
a86ce0cd | 5114 | // newpte |= pgeflag; |
c2ec3418 | 5115 | newpte |= pmap->pmap_cache_bits_pte[m->pat_mode]; |
52bb73bc | 5116 | |
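/*
 * Editor's note: a worked example of the pte assembled above, using
 * the native x86-64 bit values (P=0x001, RW=0x002, U/S=0x004,
 * A=0x020; wired and managed are software-defined bits).  For a
 * wired, writable user page at physical 0x12345000 the result is
 * roughly:
 *
 *	0x12345000 | P | RW | U/S | A | PG_W | PG_MANAGED | cache bits
 *
 * The pmap_bits[] indirection exists because this code can also be
 * asked to emit non-native pte formats, where the bit positions
 * differ.
 */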
c8fe38ae | 5117 | /* |
52bb73bc MD |
5118 | * It is possible for multiple faults to occur in threaded |
5119 | * environments, the existing pte might be correct. | |
c8fe38ae | 5120 | */ |
e989b548 MD |
5121 | if (((origpte ^ newpte) & |
5122 | ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] | | |
5123 | pmap->pmap_bits[PG_A_IDX])) == 0) { | |
52bb73bc | 5124 | goto done; |
e989b548 | 5125 | } |
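/*
 * Editor's note: a concrete instance of the XOR test above, with
 * native x86-64 bit values (P=0x001, RW=0x002, A=0x020, M=0x040);
 * the frame address is an arbitrary illustration:
 */
#if 0
	pt_entry_t a = 0x12345000 | 0x063;	/* frame | P|RW|A|M */
	pt_entry_t b = 0x12345000 | 0x023;	/* frame | P|RW|A   */
	/* (a ^ b) == 0x040, M only; masked out above -> degenerate match */
#endif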
c8fe38ae | 5126 | |
e3c330f0 MD |
5127 | /* |
5128 | * Adjust page flags. The page is soft-busied or hard-busied, we | |
5129 | * should be able to safely set PG_* flag bits even with the (shared) | |
5130 | * soft-busy. | |
5131 | * | |
831a8507 MD |
5132 | * The pmap_count and writeable_count is only tracked for |
5133 | * non-fictitious pages. As a bit of a safety, bump pmap_count | |
5134 | * and set the PG_* bits before mapping the page. If another part | |
5135 | * of the system does not properly hard-busy the page (against our | |
5136 | * soft-busy or hard-busy) in order to remove mappings it might not | |
5137 | * see the pte that we are about to add and thus will not be able to | |
5138 | * drop pmap_count to 0. | |
5139 | * | |
5140 | * The PG_MAPPED and PG_WRITEABLE flags are set for any type of page. | |
e3c330f0 MD |
5141 | * |
5142 | * NOTE! PG_MAPPED and PG_WRITEABLE can only be cleared when | |
5143 | * the page is hard-busied AND pmap_count is 0. This | |
5144 | * interlocks our setting of the flags here. | |
5145 | */ | |
5146 | /*vm_page_spin_lock(m);*/ | |
c2830aa6 MD |
5147 | |
5148 | /* | |
5149 | * In advanced mode we keep track of single mappings verses | |
5150 | * multiple mappings in order to avoid unnecessary vm_page_protect() | |
5151 | * calls (particularly on the kernel_map). | |
5152 | * | |
5153 | * If non-advanced mode we track the mapping count for similar effect. | |
427fbc33 MD |
5154 | * |
5155 | * Avoid modifying the vm_page as much as possible, conditionalize | |
5156 | * updates to reduce cache line ping-ponging. | |
c2830aa6 | 5157 | */ |
c2830aa6 MD |
5158 | flags = m->flags; |
5159 | cpu_ccfence(); | |
5160 | for (;;) { | |
5161 | nflags = PG_MAPPED; | |
5162 | if (newpte & pmap->pmap_bits[PG_RW_IDX]) | |
5163 | nflags |= PG_WRITEABLE; | |
5164 | if (flags & PG_MAPPED) | |
5165 | nflags |= PG_MAPPEDMULTI; | |
427fbc33 MD |
5166 | if (flags == (flags | nflags)) |
5167 | break; | |
c2830aa6 MD |
5168 | if (atomic_fcmpset_int(&m->flags, &flags, flags | nflags)) |
5169 | break; | |
5170 | } | |
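/*
 * Editor's note: the loop above is the standard lock-free
 * read-modify-write idiom.  atomic_fcmpset_int() reloads the local
 * copy with the current value on failure, so every retry recomputes
 * nflags against fresh state, and the equality test skips the atomic
 * entirely when the bits are already set.  A generic sketch (fieldp
 * and wanted are hypothetical):
 */
#if 0
	int oval = *fieldp;			/* unlocked snapshot */

	for (;;) {
		int nval = oval | wanted;
		if (oval == nval)		/* nothing to do */
			break;
		if (atomic_fcmpset_int(fieldp, &oval, nval))
			break;			/* won the race */
		/* lost the race: oval was reloaded, just retry */
	}
#endif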
e3c330f0 | 5171 | /*vm_page_spin_unlock(m);*/ |
e3c330f0 MD |
5172 | |
5173 | /* | |
5174 | * A race can develop when replacing an existing mapping. The new | |
5175 | * page has been busied and the pte is placemark-locked, but the | |
c2830aa6 | 5176 | * old page could be ripped out from under us at any time by |
e3c330f0 MD |
5177 | * a backing scan. |
5178 | * | |
b9a6fe08 SW |
5179 | * If we do nothing, a concurrent backing scan may clear |
5180 | * PG_WRITEABLE and PG_MAPPED before we can act on oldm. | |
e3c330f0 MD |
5181 | */ |
5182 | opa = origpte & PG_FRAME; | |
5183 | if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) { | |
5184 | oldm = PHYS_TO_VM_PAGE(opa); | |
5185 | KKASSERT(opa == oldm->phys_addr); | |
5186 | KKASSERT(entry != NULL); | |
c2830aa6 | 5187 | atomic_add_long(&oldm->md.interlock_count, 1); |
e3c330f0 MD |
5188 | } else { |
5189 | oldm = NULL; | |
5190 | } | |
5191 | ||
c8fe38ae | 5192 | /* |
567a6398 MD |
5193 | * Swap the new and old PTEs and perform any necessary SMP |
5194 | * synchronization. | |
701c977e | 5195 | */ |
567a6398 MD |
5196 | if ((prot & VM_PROT_NOSYNC) || (opa == 0 && pt_pv != NULL)) { |
5197 | /* | |
5198 | * Explicitly permitted to avoid pmap cpu mask synchronization | |
5199 | * or the prior content of a non-kernel-related pmap was | |
5200 | * invalid. | |
5201 | */ | |
5202 | origpte = atomic_swap_long(ptep, newpte); | |
5203 | if (opa) | |
5204 | cpu_invlpg((void *)va); | |
5205 | } else { | |
5206 | /* | |
5207 | * Not permitted to avoid pmap cpu mask synchronization | |
5208 | * or the prior content is being replaced, or this is a | |
5209 | * kernel-related pmap. | |
5210 | * | |
5211 | * Due to other kernel optimizations, we cannot assume a | |
5212 | * 0->non_zero transition of *ptep can be done with a swap. | |
5213 | */ | |
5214 | origpte = pmap_inval_smp(pmap, va, 1, ptep, newpte); | |
c8fe38ae | 5215 | } |
567a6398 | 5216 | opa = origpte & PG_FRAME; |
c8fe38ae | 5217 | |
a44410dd MD |
5218 | #ifdef PMAP_DEBUG2 |
5219 | if (pmap_enter_debug > 0) { | |
5220 | --pmap_enter_debug; | |
5221 | kprintf("pmap_enter: va=%lx m=%p origpte=%lx newpte=%lx ptep=%p" | |
5222 | " pte_pv=%p pt_pv=%p opa=%lx prot=%02x\n", | |
5223 | va, m, | |
5224 | origpte, newpte, ptep, | |
5225 | pte_pv, pt_pv, opa, prot); | |
5226 | } | |
5227 | #endif | |
5228 | ||
567a6398 MD |
5229 | /* |
5230 | * Account for the changes in the pt_pv and pmap. | |
5231 | * | |
5232 | * Retain the same wiring count due to replacing an existing page, | |
5233 | * or bump the wiring count for a new page. | |
5234 | */ | |
567a6398 MD |
5235 | if (pt_pv && opa == 0) { |
5236 | vm_page_wire_quick(pt_pv->pv_m); | |
5237 | atomic_add_long(&pt_pv->pv_pmap->pm_stats.resident_count, 1); | |
5238 | } | |
5239 | if (wired && (origpte & pmap->pmap_bits[PG_W_IDX]) == 0) | |
5240 | atomic_add_long(&pmap->pm_stats.wired_count, 1); | |
e989b548 | 5241 | |
c8fe38ae | 5242 | /* |
567a6398 MD |
5243 | * Account for the removal of the old page. pmap and pt_pv stats |
5244 | * have already been fully adjusted for both. | |
921c891e | 5245 | * |
e3c330f0 MD |
5246 | * WARNING! oldm is not soft or hard-busied. The pte at worst can |
5247 | * only be removed out from under us since we hold the | |
5248 | * placemarker. So if it is still there, it must not have | |
5249 | * changed. | |
c2830aa6 | 5250 | * |
b9a6fe08 SW |
5251 | * WARNING! A backing scan can clear PG_WRITEABLE and/or PG_MAPPED |
5252 | * and rip oldm away from us, possibly even freeing or | |
5253 | * paging it, and not setting our dirtying below. | |
c2830aa6 MD |
5254 | * |
5255 | * To deal with this, oldm->md.interlock_count is bumped | |
5256 | * to indicate that we might (only might) have won the pte | |
5257 | * swap race, and then released below. | |
c8fe38ae | 5258 | */ |
567a6398 | 5259 | if (opa && (origpte & pmap->pmap_bits[PG_MANAGED_IDX])) { |
e3c330f0 | 5260 | KKASSERT(oldm == PHYS_TO_VM_PAGE(opa)); |
e3c330f0 | 5261 | if (origpte & pmap->pmap_bits[PG_M_IDX]) |
567a6398 | 5262 | vm_page_dirty(oldm); |
e3c330f0 MD |
5263 | if (origpte & pmap->pmap_bits[PG_A_IDX]) |
5264 | vm_page_flag_set(oldm, PG_REFERENCED); | |
5265 | ||
5266 | /* | |
5267 | * NOTE: oldm is not hard-busied so it is not safe to | |
5268 | * clear PG_MAPPED and PG_WRITEABLE on the 1->0 | |
5269 | * transition against them being set in | |
5270 | * pmap_enter(). | |
5271 | */ | |
c2830aa6 MD |
5272 | pmap_removed_pte(oldm, origpte); |
5273 | } | |
c2830aa6 MD |
5274 | if (oldm) { |
5275 | if ((atomic_fetchadd_long(&oldm->md.interlock_count, -1) & | |
5276 | 0x7FFFFFFFFFFFFFFFLU) == 0x4000000000000001LU) { | |
5277 | atomic_clear_long(&oldm->md.interlock_count, | |
5278 | 0x4000000000000000LU); | |
5279 | wakeup(&oldm->md.interlock_count); | |
5280 | } | |
79f2da03 | 5281 | } |
c8fe38ae | 5282 | |
52bb73bc | 5283 | done: |
8e2efb11 MD |
5284 | KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 || |
5285 | (m->flags & PG_MAPPED)); | |
701c977e MD |
5286 | |
5287 | /* | |
e989b548 MD |
5288 | * Cleanup the pv entry, allowing other accessors. If the new page |
5289 | * is not managed but we have a pte_pv (which was locking our | |
5290 | * operation), we can free it now. pte_pv->pv_m should be NULL. | |
701c977e | 5291 | */ |
567a6398 | 5292 | if (pte_placemark) |
76f1911e | 5293 | pv_placemarker_wakeup(pmap, pte_placemark); |
701c977e MD |
5294 | if (pt_pv) |
5295 | pv_put(pt_pv); | |
d7f50089 YY |
5296 | } |
5297 | ||
d7f50089 | 5298 | /* |
c8fe38ae MD |
5299 | * Make a temporary mapping for a physical address. This is only intended |
5300 | * to be used for panic dumps. | |
fb8345e6 MD |
5301 | * |
5302 | * The caller is responsible for calling smp_invltlb(). | |
d7f50089 | 5303 | */ |
c8fe38ae | 5304 | void * |
8e5ea5f7 | 5305 | pmap_kenter_temporary(vm_paddr_t pa, long i) |
d7f50089 | 5306 | { |
fb8345e6 | 5307 | pmap_kenter_quick((vm_offset_t)crashdumpmap + (i * PAGE_SIZE), pa); |
c8fe38ae | 5308 | return ((void *)crashdumpmap); |
d7f50089 YY |
5309 | } |
5310 | ||
e3c330f0 | 5311 | #if 0 |
c8fe38ae MD |
5312 | #define MAX_INIT_PT (96) |
5313 | ||
d7f50089 YY |
5314 | /* |
5315 | * This routine preloads the ptes for a given object into the specified pmap. | |
5316 | * This eliminates the blast of soft faults on process startup and | |
5317 | * immediately after an mmap. | |
5318 | */ | |
5319 | static int pmap_object_init_pt_callback(vm_page_t p, void *data); | |
e3c330f0 | 5320 | #endif |
d7f50089 YY |
5321 | |
5322 | void | |
530e94fc MD |
5323 | pmap_object_init_pt(pmap_t pmap, vm_map_entry_t entry, |
5324 | vm_offset_t addr, vm_size_t size, int limit) | |
d7f50089 | 5325 | { |
e3c330f0 | 5326 | #if 0 |
530e94fc MD |
5327 | vm_prot_t prot = entry->protection; |
5328 | vm_object_t object = entry->ba.object; | |
5329 | vm_pindex_t pindex = atop(entry->ba.offset + (addr - entry->ba.start)); | |
c8fe38ae MD |
5330 | struct rb_vm_page_scan_info info; |
5331 | struct lwp *lp; | |
48ffc236 | 5332 | vm_size_t psize; |
c8fe38ae MD |
5333 | |
5334 | /* | |
5335 | * We can't preinit if read access isn't set or there is no pmap | |
5336 | * or object. | |
5337 | */ | |
5338 | if ((prot & VM_PROT_READ) == 0 || pmap == NULL || object == NULL) | |
5339 | return; | |
5340 | ||
5341 | /* | |
5342 | * We can't preinit if the pmap is not the current pmap | |
5343 | */ | |
5344 | lp = curthread->td_lwp; | |
5345 | if (lp == NULL || pmap != vmspace_pmap(lp->lwp_vmspace)) | |
5346 | return; | |
5347 | ||
921c891e MD |
5348 | /* |
5349 | * Misc additional checks | |
5350 | */ | |
b2b3ffcd | 5351 | psize = x86_64_btop(size); |
c8fe38ae MD |
5352 | |
5353 | if ((object->type != OBJT_VNODE) || | |
5354 | ((limit & MAP_PREFAULT_PARTIAL) && (psize > MAX_INIT_PT) && | |
5355 | (object->resident_page_count > MAX_INIT_PT))) { | |
5356 | return; | |
5357 | } | |
5358 | ||
701c977e | 5359 | if (pindex + psize > object->size) { |
c8fe38ae MD |
5360 | if (object->size < pindex) |
5361 | return; | |
5362 | psize = object->size - pindex; | |
5363 | } | |
5364 | ||
5365 | if (psize == 0) | |
5366 | return; | |
5367 | ||
921c891e MD |
5368 | /* |
5369 | * If everything is segment-aligned do not pre-init here. Instead | |
5370 | * allow the normal vm_fault path to pass a segment hint to | |
5371 | * pmap_enter() which will then use an object-referenced shared | |
5372 | * page table page. | |
5373 | */ | |
5374 | if ((addr & SEG_MASK) == 0 && | |
5375 | (ctob(psize) & SEG_MASK) == 0 && | |
5376 | (ctob(pindex) & SEG_MASK) == 0) { | |
5377 | return; | |
5378 | } | |
5379 | ||
c8fe38ae MD |
5380 | /* |
5381 | * Use a red-black scan to traverse the requested range and load | |
5382 | * any valid pages found into the pmap. | |
5383 | * | |
a5fc46c9 MD |
5384 | * We cannot safely scan the object's memq without holding the |
5385 | * object token. | |
c8fe38ae MD |
5386 | */ |
5387 | info.start_pindex = pindex; | |
5388 | info.end_pindex = pindex + psize - 1; | |
5389 | info.limit = limit; | |
5390 | info.mpte = NULL; | |
5391 | info.addr = addr; | |
5392 | info.pmap = pmap; | |
bb1339f8 | 5393 | info.object = object; |
530e94fc | 5394 | info.entry = entry; |
c8fe38ae | 5395 | |
bb1339f8 MD |
5396 | /* |
5397 | * By using the NOLK scan, the callback function must be sure | |
5398 | * to return -1 if the VM page falls out of the object. | |
5399 | */ | |
54341a3b | 5400 | vm_object_hold_shared(object); |
bb1339f8 MD |
5401 | vm_page_rb_tree_RB_SCAN_NOLK(&object->rb_memq, rb_vm_page_scancmp, |
5402 | pmap_object_init_pt_callback, &info); | |
a5fc46c9 | 5403 | vm_object_drop(object); |
e3c330f0 | 5404 | #endif |
d7f50089 YY |
5405 | } |
5406 | ||
e3c330f0 MD |
5407 | #if 0 |
5408 | ||
d7f50089 YY |
5409 | static |
5410 | int | |
5411 | pmap_object_init_pt_callback(vm_page_t p, void *data) | |
5412 | { | |
c8fe38ae MD |
5413 | struct rb_vm_page_scan_info *info = data; |
5414 | vm_pindex_t rel_index; | |
bb1339f8 | 5415 | int hard_busy; |
b12defdc | 5416 | |
c8fe38ae MD |
5417 | /* |
5418 | * don't allow an madvise to blow away our really | |
5419 | * free pages allocating pv entries. | |
5420 | */ | |
5421 | if ((info->limit & MAP_PREFAULT_MADVISE) && | |
5422 | vmstats.v_free_count < vmstats.v_free_reserved) { | |
5423 | return(-1); | |
5424 | } | |
0d987a03 MD |
5425 | |
5426 | /* | |
5427 | * Ignore list markers and ignore pages we cannot instantly | |
5428 | * busy (while holding the object token). | |
5429 | */ | |
5430 | if (p->flags & PG_MARKER) | |
5431 | return 0; | |
bb1339f8 MD |
5432 | hard_busy = 0; |
5433 | again: | |
5434 | if (hard_busy) { | |
5435 | if (vm_page_busy_try(p, TRUE)) | |
5436 | return 0; | |
5437 | } else { | |
5438 | if (vm_page_sbusy_try(p)) | |
5439 | return 0; | |
5440 | } | |
c8fe38ae | 5441 | if (((p->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) && |
b12defdc | 5442 | (p->flags & PG_FICTITIOUS) == 0) { |
bb1339f8 MD |
5443 | if ((p->queue - p->pc) == PQ_CACHE) { |
5444 | if (hard_busy == 0) { | |
5445 | vm_page_sbusy_drop(p); | |
5446 | hard_busy = 1; | |
5447 | goto again; | |
5448 | } | |
c8fe38ae | 5449 | vm_page_deactivate(p); |
bb1339f8 | 5450 | } |
c8fe38ae | 5451 | rel_index = p->pindex - info->start_pindex; |
530e94fc MD |
5452 | pmap_enter(info->pmap, info->addr + x86_64_ptob(rel_index), p, |
5453 | VM_PROT_READ, FALSE, info->entry); | |
c8fe38ae | 5454 | } |
bb1339f8 MD |
5455 | if (hard_busy) |
5456 | vm_page_wakeup(p); | |
5457 | else | |
5458 | vm_page_sbusy_drop(p); | |
5459 | ||
5460 | /* | |
5461 | * We are using an unlocked scan (that is, the scan expects its | |
5462 | * current element to remain in the tree on return). So we have | |
5463 | * to check here and abort the scan if it isn't. | |
5464 | */ | |
5465 | if (p->object != info->object) | |
5466 | return -1; | |
fc9ed34d | 5467 | lwkt_yield(); |
d7f50089 YY |
5468 | return(0); |
5469 | } | |
5470 | ||
e3c330f0 MD |
5471 | #endif |
5472 | ||
d7f50089 | 5473 | /* |
701c977e MD |
5474 | * Return TRUE if the pmap is in shape to trivially pre-fault the specified |
5475 | * address. | |
1b9d3514 | 5476 | * |
701c977e MD |
5477 | * Returns FALSE if it would be non-trivial or if a pte is already loaded |
5478 | * into the slot. | |
54341a3b | 5479 | * |
e3c330f0 MD |
5480 | * The address must reside within a vm_map mapped range to ensure that the |
5481 | * page table doesn't get ripped out from under us. | |
5482 | * | |
54341a3b | 5483 | * XXX This is safe only because page table pages are not freed. |
d7f50089 | 5484 | */ |
1b9d3514 MD |
5485 | int |
5486 | pmap_prefault_ok(pmap_t pmap, vm_offset_t addr) | |
d7f50089 | 5487 | { |
1b9d3514 | 5488 | pt_entry_t *pte; |
c8fe38ae | 5489 | |
54341a3b | 5490 | /*spin_lock(&pmap->pm_spin);*/ |
701c977e | 5491 | if ((pte = pmap_pte(pmap, addr)) != NULL) { |
a86ce0cd | 5492 | if (*pte & pmap->pmap_bits[PG_V_IDX]) { |
54341a3b | 5493 | /*spin_unlock(&pmap->pm_spin);*/ |
701c977e MD |
5494 | return FALSE; |
5495 | } | |
10d6182e | 5496 | } |
54341a3b | 5497 | /*spin_unlock(&pmap->pm_spin);*/ |
701c977e | 5498 | return TRUE; |
d7f50089 YY |
5499 | } |
5500 | ||
5501 | /* | |
701c977e | 5502 | * Change the wiring attribute for a pmap/va pair. The mapping must already |
76f1911e MD |
5503 | * exist in the pmap. The mapping may or may not be managed. The wiring in |
5504 | * the page is not changed, the page is returned so the caller can adjust | |
5505 | * its wiring (the page is not locked in any way). | |
582f286d MD |
5506 | * |
5507 | * Wiring is not a hardware characteristic so there is no need to invalidate | |
5508 | * TLB. However, in an SMP environment we must use a locked bus cycle to | |
5509 | * update the pte (if we are not using the pmap_inval_*() API that is)... | |
5510 | * it's ok to do this for simple wiring changes. | |
d7f50089 | 5511 | */ |
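/*
 * Editor's note: a hedged usage sketch for pmap_unwire() below; the
 * caller owns the page's wiring and typically drops it itself
 * (hypothetical caller code):
 */
#if 0
	vm_page_t m;

	m = pmap_unwire(pmap, va);
	if (m)
		vm_page_unwire(m, 0);	/* drop the wiring the pte held */
#endif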
76f1911e MD |
5512 | vm_page_t |
5513 | pmap_unwire(pmap_t pmap, vm_offset_t va) | |
d7f50089 | 5514 | { |
701c977e | 5515 | pt_entry_t *ptep; |
76f1911e MD |
5516 | pv_entry_t pt_pv; |
5517 | vm_paddr_t pa; | |
5518 | vm_page_t m; | |
c8fe38ae MD |
5519 | |
5520 | if (pmap == NULL) | |
76f1911e | 5521 | return NULL; |
582f286d | 5522 | |
76f1911e MD |
5523 | /* |
5524 | * Assume elements in the kernel pmap are stable | |
5525 | */ | |
c713db65 | 5526 | if (pmap == kernel_pmap) { |
76f1911e MD |
5527 | if (pmap_pt(pmap, va) == 0) |
5528 | return NULL; | |
582f286d | 5529 | ptep = pmap_pte_quick(pmap, va); |
62443eb5 MD |
5530 | if (pmap_pte_v(pmap, ptep)) { |
5531 | if (pmap_pte_w(pmap, ptep)) | |
5532 | atomic_add_long(&pmap->pm_stats.wired_count,-1); | |
5533 | atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); | |
5534 | pa = *ptep & PG_FRAME; | |
5535 | m = PHYS_TO_VM_PAGE(pa); | |
5536 | } else { | |
5537 | m = NULL; | |
5538 | } | |
582f286d MD |
5539 | } else { |
5540 | /* | |
76f1911e MD |
5541 | * We can only [un]wire pmap-local pages (we cannot wire |
5542 | * shared pages) | |
582f286d | 5543 | */ |
76f1911e MD |
5544 | pt_pv = pv_get(pmap, pmap_pt_pindex(va), NULL); |
5545 | if (pt_pv == NULL) | |
5546 | return NULL; | |
5547 | ||
5548 | ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va)); | |
5549 | if ((*ptep & pmap->pmap_bits[PG_V_IDX]) == 0) { | |
5550 | pv_put(pt_pv); | |
5551 | return NULL; | |
5552 | } | |
582f286d | 5553 | |
76f1911e MD |
5554 | if (pmap_pte_w(pmap, ptep)) { |
5555 | atomic_add_long(&pt_pv->pv_pmap->pm_stats.wired_count, | |
5556 | -1); | |
5557 | } | |
5558 | /* XXX else return NULL so caller doesn't unwire m ? */ | |
582f286d | 5559 | |
76f1911e MD |
5560 | atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]); |
5561 | ||
5562 | pa = *ptep & PG_FRAME; | |
5563 | m = PHYS_TO_VM_PAGE(pa); /* held by wired count */ | |
5564 | pv_put(pt_pv); | |
582f286d | 5565 | } |
76f1911e | 5566 | return m; |
d7f50089 YY |
5567 | } |
5568 | ||
5569 | /* | |
a5fc46c9 MD |
5570 | * Copy the range specified by src_addr/len from the source map to |
5571 | * the range dst_addr/len in the destination map. | |
d7f50089 | 5572 | * |
a5fc46c9 | 5573 | * This routine is only advisory and need not do anything. |
d7f50089 YY |
5574 | */ |
5575 | void | |
5576 | pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, | |
bfc09ba0 | 5577 | vm_size_t len, vm_offset_t src_addr) |
d7f50089 YY |
5578 | { |
5579 | } | |
5580 | ||
5581 | /* | |
5582 | * pmap_zero_page: | |
5583 | * | |
48ffc236 | 5584 | * Zero the specified physical page. |
d7f50089 YY |
5585 | * |
5586 | * This function may be called from an interrupt and no locking is | |
5587 | * required. | |
5588 | */ | |
5589 | void | |
5590 | pmap_zero_page(vm_paddr_t phys) | |
5591 | { | |
48ffc236 | 5592 | vm_offset_t va = PHYS_TO_DMAP(phys); |
c8fe38ae | 5593 | |
48ffc236 | 5594 | pagezero((void *)va); |
d7f50089 YY |
5595 | } |
5596 | ||
d7f50089 YY |
5597 | /* |
5598 | * pmap_zero_page: | |
5599 | * | |
5600 | * Zero part of a physical page by mapping it into memory and clearing | |
5601 | * its contents with bzero. | |
5602 | * | |
5603 | * off and size may not cover an area beyond a single hardware page. | |
5604 | */ | |
5605 | void | |
5606 | pmap_zero_page_area(vm_paddr_t phys, int off, int size) | |
5607 | { | |
48ffc236 | 5608 | vm_offset_t virt = PHYS_TO_DMAP(phys); |
bfc09ba0 | 5609 | |
48ffc236 | 5610 | bzero((char *)virt + off, size); |
d7f50089 YY |
5611 | } |
5612 | ||
5613 | /* | |
5614 | * pmap_copy_page: | |
5615 | * | |
5616 | * Copy the physical page from the source PA to the target PA. | |
5617 | * This function may be called from an interrupt. No locking | |
5618 | * is required. | |
5619 | */ | |
5620 | void | |
5621 | pmap_copy_page(vm_paddr_t src, vm_paddr_t dst) | |
5622 | { | |
48ffc236 | 5623 | vm_offset_t src_virt, dst_virt; |
c8fe38ae | 5624 | |
48ffc236 JG |
5625 | src_virt = PHYS_TO_DMAP(src); |
5626 | dst_virt = PHYS_TO_DMAP(dst); | |
bfc09ba0 | 5627 | bcopy((void *)src_virt, (void *)dst_virt, PAGE_SIZE); |
d7f50089 YY |
5628 | } |
5629 | ||
5630 | /* | |
5631 | * pmap_copy_page_frag: | |
5632 | * | |
5633 | * Copy the physical page from the source PA to the target PA. | |
5634 | * This function may be called from an interrupt. No locking | |
5635 | * is required. | |
5636 | */ | |
5637 | void | |
5638 | pmap_copy_page_frag(vm_paddr_t src, vm_paddr_t dst, size_t bytes) | |
5639 | { | |
48ffc236 | 5640 | vm_offset_t src_virt, dst_virt; |
c8fe38ae | 5641 | |
48ffc236 JG |
5642 | src_virt = PHYS_TO_DMAP(src); |
5643 | dst_virt = PHYS_TO_DMAP(dst); | |
bfc09ba0 | 5644 | |
48ffc236 JG |
5645 | bcopy((char *)src_virt + (src & PAGE_MASK), |
5646 | (char *)dst_virt + (dst & PAGE_MASK), | |
c8fe38ae | 5647 | bytes); |
d7f50089 YY |
5648 | } |
5649 | ||
d7f50089 | 5650 | /* |
b12defdc | 5651 | * Remove all pages from the specified address space; this aids process exit |
701c977e MD |
5652 | * speeds. Also, this code may be special cased for the current process |
5653 | * only. | |
d7f50089 YY |
5654 | */ |
5655 | void | |
5656 | pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva) | |
5657 | { | |
9df83100 MD |
5658 | pmap_remove_noinval(pmap, sva, eva); |
5659 | cpu_invltlb(); | |
d7f50089 YY |
5660 | } |
5661 | ||
5662 | /* | |
a5fc46c9 MD |
5663 | * pmap_testbit tests bits in ptes. Note that the testbit/clearbit |
5664 | * routines are inline, and a lot of things compile-time evaluate. | |
c2830aa6 MD |
5665 | * |
5666 | * Currently only used to test the 'M'odified bit. If the page | |
5667 | * is not PG_WRITEABLE, the 'M'odified bit cannot be set and we | |
5668 | * return immediately. Fictitious pages do not track this bit. | |
d7f50089 | 5669 | */ |
bfc09ba0 MD |
5670 | static |
5671 | boolean_t | |
d7f50089 YY |
5672 | pmap_testbit(vm_page_t m, int bit) |
5673 | { | |
567a6398 | 5674 | int res = FALSE; |
c8fe38ae | 5675 | |
427fbc33 | 5676 | if (__predict_false(!pmap_initialized || (m->flags & PG_FICTITIOUS))) |
c8fe38ae | 5677 | return FALSE; |
e3c330f0 MD |
5678 | /* |
5679 | * Nothing to do if all the mappings are already read-only. | |
5680 | * The page's [M]odify bits have already been synchronized | |
5681 | * to the vm_page_t and cleaned out. | |
5682 | */ | |
c2830aa6 MD |
5683 | if (bit == PG_M_IDX && (m->flags & PG_WRITEABLE) == 0) |
5684 | return FALSE; | |
e3c330f0 MD |
5685 | |
5686 | /* | |
5687 | * Iterate the mapping | |
5688 | */ | |
567a6398 MD |
5689 | PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { |
5690 | if (ipte & ipmap->pmap_bits[bit]) { | |
5691 | res = TRUE; | |
5692 | break; | |
701c977e | 5693 | } |
567a6398 MD |
5694 | } PMAP_PAGE_BACKING_DONE; |
5695 | return res; | |
d7f50089 YY |
5696 | } |
5697 | ||
5698 | /* | |
f036b531 | 5699 | * This routine is used to modify bits in ptes. Only one bit should be |
831a8507 MD |
5700 | * specified. PG_RW requires special handling. This call works with |
5701 | * any sort of mapped page. PG_FICTITIOUS pages might not be optimal. | |
b12defdc | 5702 | * |
701c977e | 5703 | * Caller must NOT hold any spin locks |
567a6398 MD |
5704 | * Caller must hold (m) hard-busied |
5705 | * | |
5706 | * NOTE: When clearing PG_M we could also (not implemented) drop | |
5707 | * through to the PG_RW code and clear PG_RW too, forcing | |
5708 | * a fault on write to redetect PG_M for virtual kernels, but | |
5709 | * it isn't necessary since virtual kernels invalidate the | |
5710 | * pte when they clear the VPTE_M bit in their virtual page | |
5711 | * tables. | |
5712 | * | |
5713 | * NOTE: Does not re-dirty the page when clearing only PG_M. | |
5714 | * | |
5715 | * NOTE: Because we do not lock the pv, *pte can be in a state of | |
5716 | * flux. Despite this the value of *pte is still somewhat | |
5717 | * related while we hold the vm_page spin lock. | |
5718 | * | |
5719 | * *pte can be zero due to this race. Since we are clearing | |
5720 | * bits we basically do no harm when this race occurs. | |
d7f50089 | 5721 | */ |
bfc09ba0 MD |
5722 | static __inline |
5723 | void | |
a86ce0cd | 5724 | pmap_clearbit(vm_page_t m, int bit_index) |
d7f50089 | 5725 | { |
567a6398 | 5726 | pt_entry_t npte; |
e3c330f0 | 5727 | int retry; |
c2830aa6 | 5728 | long icount; |
c8fe38ae | 5729 | |
e3c330f0 | 5730 | /* |
831a8507 | 5731 | * Too early in the boot |
e3c330f0 | 5732 | */ |
427fbc33 | 5733 | if (__predict_false(!pmap_initialized)) { |
b443039b MD |
5734 | if (bit_index == PG_RW_IDX) |
5735 | vm_page_flag_clear(m, PG_WRITEABLE); | |
c8fe38ae | 5736 | return; |
b443039b | 5737 | } |
c2830aa6 MD |
5738 | if ((m->flags & (PG_MAPPED | PG_WRITEABLE)) == 0) |
5739 | return; | |
e3c330f0 MD |
5740 | |
5741 | /* | |
5742 | * Being asked to clear other random bits, we don't track them | |
5743 | * so we have to iterate. | |
c2830aa6 | 5744 | * |
b9a6fe08 SW |
5745 | * pmap_clear_reference() is called (into here) with the page |
5746 | * hard-busied to check whether the page is still mapped and | |
5747 | * will clear PG_MAPPED and PG_WRITEABLE if it isn't. | |
e3c330f0 | 5748 | */ |
a86ce0cd | 5749 | if (bit_index != PG_RW_IDX) { |
c2830aa6 | 5750 | #if 0 |
c2830aa6 MD |
5751 | long icount; |
5752 | ||
5753 | icount = 0; | |
c2830aa6 | 5754 | #endif |
567a6398 | 5755 | PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { |
c2830aa6 | 5756 | #if 0 |
c2830aa6 | 5757 | ++icount; |
c2830aa6 | 5758 | #endif |
567a6398 MD |
5759 | if (ipte & ipmap->pmap_bits[bit_index]) { |
5760 | atomic_clear_long(iptep, | |
5761 | ipmap->pmap_bits[bit_index]); | |
f036b531 | 5762 | } |
567a6398 | 5763 | } PMAP_PAGE_BACKING_DONE; |
c2830aa6 | 5764 | #if 0 |
c2830aa6 MD |
5765 | if (icount == 0) { |
5766 | icount = atomic_fetchadd_long(&m->md.interlock_count, | |
5767 | 0x8000000000000000LU); | |
5768 | if ((icount & 0x3FFFFFFFFFFFFFFFLU) == 0) { | |
5769 | vm_page_flag_clear(m, PG_MAPPED | | |
5770 | PG_MAPPEDMULTI | | |
5771 | PG_WRITEABLE); | |
5772 | } | |
5773 | } | |
c2830aa6 | 5774 | #endif |
f036b531 MD |
5775 | return; |
5776 | } | |
5777 | ||
e3c330f0 MD |
5778 | /* |
5779 | * Being asked to clear the RW bit. | |
5780 | * | |
5781 | * Nothing to do if all the mappings are already read-only | |
5782 | */ | |
c2830aa6 MD |
5783 | if ((m->flags & PG_WRITEABLE) == 0) |
5784 | return; | |
e3c330f0 MD |
5785 | |
5786 | /* | |
5787 | * Iterate the mappings and check. | |
5788 | */ | |
5789 | retry = ticks + hz * 60; | |
5790 | again: | |
f036b531 | 5791 | /* |
567a6398 MD |
5792 | * Clear PG_RW. This also clears PG_M and marks the page dirty if |
5793 | * PG_M was set. | |
e3c330f0 MD |
5794 | * |
5795 | * Since the caller holds the page hard-busied we can safely clear | |
5796 | * PG_WRITEABLE, and callers expect us to for the PG_RW_IDX path. | |
f036b531 | 5797 | */ |
567a6398 | 5798 | PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { |
831a8507 | 5799 | #if 0 |
e3c330f0 MD |
5800 | if ((ipte & ipmap->pmap_bits[PG_MANAGED_IDX]) == 0) |
5801 | continue; | |
831a8507 | 5802 | #endif |
567a6398 | 5803 | if ((ipte & ipmap->pmap_bits[PG_RW_IDX]) == 0) |
97c8c48c | 5804 | continue; |
567a6398 MD |
5805 | npte = ipte & ~(ipmap->pmap_bits[PG_RW_IDX] | |
5806 | ipmap->pmap_bits[PG_M_IDX]); | |
5807 | if (!pmap_inval_smp_cmpset(ipmap, iva, iptep, ipte, npte)) | |
5808 | PMAP_PAGE_BACKING_RETRY; | |
5809 | if (ipte & ipmap->pmap_bits[PG_M_IDX]) | |
f036b531 | 5810 | vm_page_dirty(m); |
e3c330f0 MD |
5811 | |
5812 | /* | |
5813 | * NOTE: Because the caller holds (m) hard-busied, clearing | |
5814 | * PG_WRITEABLE (deferred to the end of the scan) can | |
5815 | * no longer race it being set in pmap_enter(). | |
831a8507 MD |
5816 | * |
5817 | * pmap_count and writeable_count are only applicable | |
5818 | * to non-fictitious pages (PG_MANAGED_IDX from pte) | |
e3c330f0 | 5819 | */ |
567a6398 | 5820 | } PMAP_PAGE_BACKING_DONE; |
e3c330f0 | 5821 | |
c2830aa6 MD |
5822 | /* |
5823 | * If our scan lost a pte swap race, m->md.interlock_count might | |
5824 | * be set from the pmap_enter() code. If so sleep a little and try | |
5825 | * again. | |
5826 | * | |
5827 | * Use an atomic op to access interlock_count to ensure ordering. | |
5828 | */ | |
5829 | icount = atomic_fetchadd_long(&m->md.interlock_count, | |
5830 | 0x8000000000000000LU) + | |
5831 | 0x8000000000000000LU; | |
5832 | cpu_ccfence(); | |
5833 | while (icount & 0x3FFFFFFFFFFFFFFFLU) { | |
5834 | tsleep_interlock(&m->md.interlock_count, 0); | |
5835 | if (atomic_fcmpset_long(&m->md.interlock_count, &icount, | |
5836 | icount | 0x4000000000000000LU)) { | |
5837 | tsleep(&m->md.interlock_count, PINTERLOCKED, | |
5838 | "pgunm", 1); | |
5839 | icount = m->md.interlock_count; | |
5840 | if (retry - ticks > 0) | |
5841 | goto again; | |
5842 | panic("pmap_clearbit: cannot return interlock_count " | |
5843 | "to 0 (%p, %ld)", | |
5844 | m, m->md.interlock_count); | |
5845 | } | |
5846 | } | |
567a6398 | 5847 | vm_page_flag_clear(m, PG_WRITEABLE); |
d7f50089 YY |
5848 | } |
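/*
 * Illustration only: the macro names below are hypothetical; the code
 * above uses the literal constants directly.  m->md.interlock_count
 * appears to pack two flag bits over a 62-bit count:
 */
#if 0
#define MD_ILOCK_SCAN	0x8000000000000000LU	/* bit 63: clearing scan */
#define MD_ILOCK_WAKEUP	0x4000000000000000LU	/* bit 62: wakeup requested */
#define MD_ILOCK_COUNT	0x3FFFFFFFFFFFFFFFLU	/* bits 0-61: interlocks */
#endif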
5849 | ||
5850 | /* | |
b12defdc | 5851 | * Lower the permission for all mappings to a given page. |
d7f50089 | 5852 | * |
567a6398 MD |
5853 | * Page must be hard-busied by caller. Because the page is busied by the |
5854 | * caller, this should not be able to race a pmap_enter(). | |
d7f50089 YY |
5855 | */ |
5856 | void | |
5857 | pmap_page_protect(vm_page_t m, vm_prot_t prot) | |
5858 | { | |
48ffc236 | 5859 | /* JG NX support? */ |
c8fe38ae MD |
5860 | if ((prot & VM_PROT_WRITE) == 0) { |
5861 | if (prot & (VM_PROT_READ | VM_PROT_EXECUTE)) { | |
b12defdc MD |
5862 | /* |
5863 | * NOTE: pmap_clearbit(.. PG_RW) also clears | |
5864 | * the PG_WRITEABLE flag in (m). | |
5865 | */ | |
a86ce0cd | 5866 | pmap_clearbit(m, PG_RW_IDX); |
c8fe38ae MD |
5867 | } else { |
5868 | pmap_remove_all(m); | |
5869 | } | |
5870 | } | |
d7f50089 YY |
5871 | } |
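/*
 * Usage sketch (illustration only):
 *
 *	pmap_page_protect(m, VM_PROT_READ);	- write-protect all mappings
 *	pmap_page_protect(m, VM_PROT_NONE);	- remove all mappings of (m)
 */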
5872 | ||
5873 | vm_paddr_t | |
c8fe38ae | 5874 | pmap_phys_address(vm_pindex_t ppn) |
d7f50089 | 5875 | { |
b2b3ffcd | 5876 | return (x86_64_ptob(ppn)); |
d7f50089 YY |
5877 | } |
5878 | ||
5879 | /* | |
b12defdc MD |
5880 | * Return a count of reference bits for a page, clearing those bits. |
5881 | * It is not necessary for every reference bit to be cleared, but it | |
5882 | * is necessary that 0 only be returned when there are truly no | |
5883 | * reference bits set. | |
d7f50089 | 5884 | * |
b12defdc MD |
5885 | * XXX: The exact number of bits to check and clear is a matter that |
5886 | * should be tested and standardized at some point in the future for | |
5887 | * optimal aging of shared pages. | |
d7f50089 | 5888 | * |
b12defdc | 5889 | * This routine may not block. |
d7f50089 YY |
5890 | */ |
5891 | int | |
5892 | pmap_ts_referenced(vm_page_t m) | |
5893 | { | |
567a6398 MD |
5894 | int rval = 0; |
5895 | pt_entry_t npte; | |
c8fe38ae | 5896 | |
427fbc33 | 5897 | if (__predict_false(!pmap_initialized || (m->flags & PG_FICTITIOUS))) |
567a6398 MD |
5898 | return rval; |
5899 | PMAP_PAGE_BACKING_SCAN(m, NULL, ipmap, iptep, ipte, iva) { | |
5900 | if (ipte & ipmap->pmap_bits[PG_A_IDX]) { | |
5901 | npte = ipte & ~ipmap->pmap_bits[PG_A_IDX]; | |
5902 | if (!atomic_cmpset_long(iptep, ipte, npte)) | |
5903 | PMAP_PAGE_BACKING_RETRY; | |
5904 | ++rval; | |
5905 | if (rval > 4) | |
701c977e MD |
5906 | break; |
5907 | } | |
567a6398 MD |
5908 | } PMAP_PAGE_BACKING_DONE; |
5909 | return rval; | |
d7f50089 YY |
5910 | } |
5911 | ||
5912 | /* | |
5913 | * pmap_is_modified: | |
5914 | * | |
5915 | * Return whether or not the specified physical page was modified | |
5916 | * in any physical maps. | |
5917 | */ | |
5918 | boolean_t | |
5919 | pmap_is_modified(vm_page_t m) | |
5920 | { | |
10d6182e MD |
5921 | boolean_t res; |
5922 | ||
a86ce0cd | 5923 | res = pmap_testbit(m, PG_M_IDX); |
10d6182e | 5924 | return (res); |
d7f50089 YY |
5925 | } |
5926 | ||
5927 | /* | |
567a6398 MD |
5928 | * Clear the modify bit on the vm_page. |
5929 | * | |
5930 | * The page must be hard-busied. | |
d7f50089 YY |
5931 | */ |
5932 | void | |
5933 | pmap_clear_modify(vm_page_t m) | |
5934 | { | |
a86ce0cd | 5935 | pmap_clearbit(m, PG_M_IDX); |
d7f50089 YY |
5936 | } |
5937 | ||
5938 | /* | |
5939 | * pmap_clear_reference: | |
5940 | * | |
5941 | * Clear the reference bit on the specified physical page. | |
5942 | */ | |
5943 | void | |
5944 | pmap_clear_reference(vm_page_t m) | |
5945 | { | |
a86ce0cd | 5946 | pmap_clearbit(m, PG_A_IDX); |
d7f50089 YY |
5947 | } |
5948 | ||
d7f50089 YY |
5949 | /* |
5950 | * Miscellaneous support routines follow | |
5951 | */ | |
5952 | ||
bfc09ba0 MD |
5953 | static |
5954 | void | |
d87f4462 | 5955 | x86_64_protection_init(void) |
d7f50089 | 5956 | { |
3e925ec2 MD |
5957 | uint64_t *kp; |
5958 | int prot; | |
d7f50089 | 5959 | |
2620a64f MD |
5960 | /* |
5961 | * NX supported? (boot time loader.conf override only) | |
d92e3890 MD |
5962 | * |
5963 | * -1 Automatic (sets mode 1) | |
5964 | * 0 Disabled | |
5965 | * 1 NX implemented, differentiates PROT_READ vs PROT_READ|PROT_EXEC | |
5966 | * 2 NX implemented for all cases | |
2620a64f MD |
5967 | */ |
5968 | TUNABLE_INT_FETCH("machdep.pmap_nx_enable", &pmap_nx_enable); | |
d92e3890 | 5969 | if ((amd_feature & AMDID_NX) == 0) { |
2620a64f | 5970 | pmap_bits_default[PG_NX_IDX] = 0; |
d92e3890 MD |
5971 | pmap_nx_enable = 0; |
5972 | } else if (pmap_nx_enable < 0) { | |
5973 | pmap_nx_enable = 1; /* default to mode 1 (READ) */ | |
5974 | } | |
2620a64f | 5975 | |
3e925ec2 MD |
5976 | /* |
5977 | * A code of 0 is basically read-only access; we also set the NX | |
5978 | * (no-execute) bit when VM_PROT_EXECUTE is not specified. | |
5979 | */ | |
d7f50089 | 5980 | kp = protection_codes; |
a86ce0cd | 5981 | for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) { |
c8fe38ae MD |
5982 | switch (prot) { |
5983 | case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE: | |
5984 | /* | |
3e925ec2 | 5985 | * This case handled elsewhere |
c8fe38ae | 5986 | */ |
d92e3890 | 5987 | *kp = 0; |
3e925ec2 | 5988 | break; |
c8fe38ae | 5989 | case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE: |
3e925ec2 | 5990 | /* |
d92e3890 | 5991 | * Read-only is 0|NX (pmap_nx_enable mode >= 1) |
3e925ec2 | 5992 | */ |
d92e3890 MD |
5993 | if (pmap_nx_enable >= 1) |
5994 | *kp = pmap_bits_default[PG_NX_IDX]; | |
3e925ec2 | 5995 | break; |
c8fe38ae MD |
5996 | case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE: |
5997 | case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE: | |
3e925ec2 MD |
5998 | /* |
5999 | * Execute requires read access | |
6000 | */ | |
d92e3890 | 6001 | *kp = 0; |
c8fe38ae MD |
6002 | break; |
6003 | case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE: | |
c8fe38ae | 6004 | case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE: |
3e925ec2 MD |
6005 | /* |
6006 | * Write without execute is RW|NX | |
d92e3890 | 6007 | * (pmap_nx_enable mode >= 2) |
3e925ec2 | 6008 | */ |
d92e3890 MD |
6009 | *kp = pmap_bits_default[PG_RW_IDX]; |
6010 | if (pmap_nx_enable >= 2) | |
6011 | *kp |= pmap_bits_default[PG_NX_IDX]; | |
3e925ec2 | 6012 | break; |
c8fe38ae | 6013 | case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE: |
3e925ec2 MD |
6014 | case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE: |
6015 | /* | |
6016 | * Write with execute is RW | |
6017 | */ | |
d92e3890 | 6018 | *kp = pmap_bits_default[PG_RW_IDX]; |
c8fe38ae MD |
6019 | break; |
6020 | } | |
d92e3890 | 6021 | ++kp; |
d7f50089 YY |
6022 | } |
6023 | } | |
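/*
 * Consumption sketch (hypothetical, illustration only): a consumer
 * would fold the table into a new pte roughly as
 *
 *	newpte = pa | protection_codes[prot & 7] |
 *		 pmap->pmap_bits[PG_V_IDX];
 *
 * where (prot & 7) selects the VM_PROT_{READ,WRITE,EXECUTE}
 * combination.  The NX policy can be overridden from loader.conf at
 * boot, e.g.:
 *
 *	machdep.pmap_nx_enable="2"
 */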
6024 | ||
6025 | /* | |
6026 | * Map a set of physical memory pages into the kernel virtual | |
6027 | * address space. Return a pointer to where it is mapped. This | |
6028 | * routine is intended to be used for mapping device memory, | |
6029 | * NOT real memory. | |
6030 | * | |
b524ca76 MD |
6031 | * NOTE: We can't use pgeflag unless we invalidate the pages one at |
6032 | * a time. | |
6033 | * | |
6034 | * NOTE: The PAT attributes {WRITE_BACK, WRITE_THROUGH, UNCACHED, UNCACHEABLE} | |
6035 | * work whether the cpu supports PAT or not. The remaining PAT | |
6036 | * attributes {WRITE_PROTECTED, WRITE_COMBINING} only work if the cpu | |
6037 | * supports PAT. | |
d7f50089 YY |
6038 | */ |
6039 | void * | |
6040 | pmap_mapdev(vm_paddr_t pa, vm_size_t size) | |
6041 | { | |
b524ca76 | 6042 | return(pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); |
057877ac JG |
6043 | } |
6044 | ||
6045 | void * | |
6046 | pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size) | |
b524ca76 MD |
6047 | { |
6048 | return(pmap_mapdev_attr(pa, size, PAT_UNCACHEABLE)); | |
6049 | } | |
6050 | ||
ac9e78e3 FT |
6051 | void * |
6052 | pmap_mapbios(vm_paddr_t pa, vm_size_t size) | |
6053 | { | |
6054 | return (pmap_mapdev_attr(pa, size, PAT_WRITE_BACK)); | |
6055 | } | |
6056 | ||
1d400425 FT |
6057 | /* |
6058 | * Map a set of physical memory pages into the kernel virtual | |
6059 | * address space. Return a pointer to where it is mapped. This | |
6060 | * routine is intended to be used for mapping device memory, | |
6061 | * NOT real memory. | |
6062 | */ | |
b524ca76 MD |
6063 | void * |
6064 | pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode) | |
057877ac JG |
6065 | { |
6066 | vm_offset_t va, tmpva, offset; | |
6067 | pt_entry_t *pte; | |
b524ca76 | 6068 | vm_size_t tmpsize; |
057877ac JG |
6069 | |
6070 | offset = pa & PAGE_MASK; | |
6071 | size = roundup(offset + size, PAGE_SIZE); | |
6072 | ||
1eeaf6b2 | 6073 | va = kmem_alloc_nofault(kernel_map, size, VM_SUBSYS_MAPDEV, PAGE_SIZE); |
057877ac JG |
6074 | if (va == 0) |
6075 | panic("pmap_mapdev: Couldn't alloc kernel virtual memory"); | |
6076 | ||
6077 | pa = pa & ~PAGE_MASK; | |
b524ca76 | 6078 | for (tmpva = va, tmpsize = size; tmpsize > 0;) { |
057877ac | 6079 | pte = vtopte(tmpva); |
a86ce0cd | 6080 | *pte = pa | |
c713db65 AL |
6081 | kernel_pmap->pmap_bits[PG_RW_IDX] | |
6082 | kernel_pmap->pmap_bits[PG_V_IDX] | /* pgeflag | */ | |
6083 | kernel_pmap->pmap_cache_bits_pte[mode]; | |
b524ca76 | 6084 | tmpsize -= PAGE_SIZE; |
057877ac JG |
6085 | tmpva += PAGE_SIZE; |
6086 | pa += PAGE_SIZE; | |
6087 | } | |
c713db65 | 6088 | pmap_invalidate_range(kernel_pmap, va, va + size); |
1d400425 | 6089 | pmap_invalidate_cache_range(va, va + size); |
057877ac | 6090 | |
d7f50089 YY |
6091 | return ((void *)(va + offset)); |
6092 | } | |
6093 | ||
6094 | void | |
6095 | pmap_unmapdev(vm_offset_t va, vm_size_t size) | |
6096 | { | |
6097 | vm_offset_t base, offset; | |
6098 | ||
48ffc236 | 6099 | base = va & ~PAGE_MASK; |
d7f50089 YY |
6100 | offset = va & PAGE_MASK; |
6101 | size = roundup(offset + size, PAGE_SIZE); | |
6102 | pmap_qremove(va, size >> PAGE_SHIFT); | |
1eeaf6b2 | 6103 | kmem_free(kernel_map, base, size); |
d7f50089 YY |
6104 | } |
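/*
 * Usage sketch (hypothetical device, illustration only): map a 4KB
 * MMIO register window uncacheable, touch a register, then tear the
 * mapping down again.
 */
#if 0
static void
example_mmio_probe(vm_paddr_t bar_pa)
{
	volatile uint32_t *regs;

	regs = pmap_mapdev_attr(bar_pa, PAGE_SIZE, PAT_UNCACHEABLE);
	(void)regs[0];			/* read device register 0 */
	pmap_unmapdev((vm_offset_t)regs, PAGE_SIZE);
}
#endif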
6105 | ||
ec1a31dd FT |
6106 | /* |
6107 | * Sets the memory attribute for the specified page. | |
6108 | */ | |
6109 | void | |
6110 | pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma) | |
6111 | { | |
6112 | ||
6113 | m->pat_mode = ma; | |
6114 | ||
6115 | /* | |
6116 | * If "m" is a normal page, update its direct mapping. This update | |
6117 | * can be relied upon to perform any cache operations that are | |
6118 | * required for data coherence. | |
6119 | */ | |
6120 | if ((m->flags & PG_FICTITIOUS) == 0) | |
96acd33e | 6121 | pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), 1, m->pat_mode); |
ec1a31dd FT |
6122 | } |
6123 | ||
b524ca76 MD |
6124 | /* |
6125 | * Change the PAT attribute on an existing kernel memory map. Caller | |
6126 | * must ensure that the virtual memory in question is not accessed | |
6127 | * during the adjustment. | |
c2ec3418 MD |
6128 | * |
6129 | * If the va is within the DMAP we cannot use vtopte() because the DMAP | |
6130 | * utilizes 2MB or 1GB pages.  2MB is currently forced, so calculate | |
6131 | * the pd_entry pointer based on that. | |
b524ca76 MD |
6132 | */ |
6133 | void | |
6134 | pmap_change_attr(vm_offset_t va, vm_size_t count, int mode) | |
6135 | { | |
6136 | pt_entry_t *pte; | |
1d400425 FT |
6137 | vm_offset_t base; |
6138 | int changed = 0; | |
b524ca76 MD |
6139 | |
6140 | if (va == 0) | |
6141 | panic("pmap_change_attr: va is NULL"); | |
1d400425 | 6142 | base = trunc_page(va); |
b524ca76 | 6143 | |
c2ec3418 MD |
6144 | if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) { |
6145 | pd_entry_t *pd; | |
6146 | ||
6147 | KKASSERT(va < DMapMaxAddress); | |
6148 | pd = (pd_entry_t *)PHYS_TO_DMAP(DMPDphys); | |
6149 | pd += (va - DMAP_MIN_ADDRESS) >> PDRSHIFT; | |
6150 | ||
6151 | while ((long)count > 0) { | |
6152 | *pd = | |
c713db65 AL |
6153 | (*pd & ~(pd_entry_t)(kernel_pmap->pmap_cache_mask_pde)) | |
6154 | kernel_pmap->pmap_cache_bits_pde[mode]; | |
c2ec3418 MD |
6155 | count -= NBPDR / PAGE_SIZE; |
6156 | va += NBPDR; | |
6157 | ++pd; | |
6158 | } | |
6159 | } else { | |
6160 | while (count) { | |
6161 | pte = vtopte(va); | |
6162 | *pte = | |
c713db65 AL |
6163 | (*pte & ~(pt_entry_t)(kernel_pmap->pmap_cache_mask_pte)) | |
6164 | kernel_pmap->pmap_cache_bits_pte[mode]; | |
c2ec3418 MD |
6165 | --count; |
6166 | va += PAGE_SIZE; | |
6167 | } | |
b524ca76 | 6168 | } |
1d400425 FT |
6169 | |
6170 | changed = 1; /* XXX: not optimal */ | |
6171 | ||
6172 | /* | |
6173 | * Flush the CPU caches if required, to ensure that no stale data | |
6174 | * remains cached with the old attributes. | |
6175 | */ | |
6176 | if (changed) { | |
c713db65 | 6177 | pmap_invalidate_range(kernel_pmap, base, va); |
1d400425 FT |
6178 | pmap_invalidate_cache_range(base, va); |
6179 | } | |
b524ca76 MD |
6180 | } |
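/*
 * Usage sketch (hypothetical framebuffer, illustration only): flip an
 * already-mapped, currently idle range to write-combining.  The count
 * argument is in PAGE_SIZE pages:
 *
 *	pmap_change_attr(fb_va, fb_size >> PAGE_SHIFT,
 *			 PAT_WRITE_COMBINING);
 */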
6181 | ||
d7f50089 YY |
6182 | /* |
6183 | * perform the pmap work for mincore | |
6184 | */ | |
6185 | int | |
6186 | pmap_mincore(pmap_t pmap, vm_offset_t addr) | |
6187 | { | |
c8fe38ae MD |
6188 | pt_entry_t *ptep, pte; |
6189 | vm_page_t m; | |
6190 | int val = 0; | |
6191 | ||
6192 | ptep = pmap_pte(pmap, addr); | |
d7f50089 | 6193 | |
10d6182e | 6194 | if (ptep && (pte = *ptep) != 0) { |
c8fe38ae MD |
6195 | vm_offset_t pa; |
6196 | ||
6197 | val = MINCORE_INCORE; | |
c8fe38ae | 6198 | pa = pte & PG_FRAME; |
831a8507 | 6199 | if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) |
9e5e1578 | 6200 | m = PHYS_TO_VM_PAGE(pa); |
831a8507 MD |
6201 | else |
6202 | m = NULL; | |
c8fe38ae MD |
6203 | |
6204 | /* | |
6205 | * Modified by us | |
6206 | */ | |
a86ce0cd | 6207 | if (pte & pmap->pmap_bits[PG_M_IDX]) |
c8fe38ae | 6208 | val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER; |
831a8507 | 6209 | |
c8fe38ae MD |
6210 | /* |
6211 | * Modified by someone | |
6212 | */ | |
9e5e1578 | 6213 | else if (m && (m->dirty || pmap_is_modified(m))) |
c8fe38ae | 6214 | val |= MINCORE_MODIFIED_OTHER; |
c8fe38ae MD |
6215 | |
6216 | /* | |
831a8507 | 6217 | * Referenced by us, or someone else. |
c8fe38ae | 6218 | */ |
831a8507 MD |
6219 | if (pte & pmap->pmap_bits[PG_A_IDX]) { |
6220 | val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER; | |
6221 | } else if (m && ((m->flags & PG_REFERENCED) || | |
6222 | pmap_ts_referenced(m))) { | |
c8fe38ae MD |
6223 | val |= MINCORE_REFERENCED_OTHER; |
6224 | vm_page_flag_set(m, PG_REFERENCED); | |
6225 | } | |
6226 | } | |
6227 | return val; | |
6228 | } | |
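/*
 * Flag sketch (illustration): a resident page whose pte has both the
 * M and A bits set yields
 *
 *	MINCORE_INCORE | MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER |
 *	MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER
 */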
6229 | ||
6230 | /* | |
6231 | * Replace p->p_vmspace with a new one. If adjrefs is non-zero the new | |
6232 | * vmspace will be ref'd and the old one will be deref'd. | |
6233 | * | |
6234 | * The vmspace for all lwps associated with the process will be adjusted | |
6235 | * and cr3 will be reloaded if any lwp is the current lwp. | |
a5fc46c9 | 6236 | * |
b12defdc | 6237 | * The process must hold the vmspace->vm_map.token for oldvm and newvm |
c8fe38ae | 6238 | */ |
d7f50089 YY |
6239 | void |
6240 | pmap_replacevm(struct proc *p, struct vmspace *newvm, int adjrefs) | |
6241 | { | |
c8fe38ae MD |
6242 | struct vmspace *oldvm; |
6243 | struct lwp *lp; | |
6244 | ||
c8fe38ae MD |
6245 | oldvm = p->p_vmspace; |
6246 | if (oldvm != newvm) { | |
a5fc46c9 | 6247 | if (adjrefs) |
93f86408 | 6248 | vmspace_ref(newvm); |
c8fe38ae MD |
6249 | p->p_vmspace = newvm; |
6250 | KKASSERT(p->p_nthreads == 1); | |
6251 | lp = RB_ROOT(&p->p_lwp_tree); | |
6252 | pmap_setlwpvm(lp, newvm); | |
a5fc46c9 | 6253 | if (adjrefs) |
93f86408 | 6254 | vmspace_rel(oldvm); |
c8fe38ae | 6255 | } |
d7f50089 YY |
6256 | } |
6257 | ||
c8fe38ae MD |
6258 | /* |
6259 | * Set the vmspace for a LWP. The vmspace is almost universally set the | |
6260 | * same as the process vmspace, but virtual kernels need to swap out contexts | |
6261 | * on a per-lwp basis. | |
a5fc46c9 | 6262 | * |
b12defdc | 6263 | * Caller does not necessarily hold any vmspace tokens. Caller must control |
a5fc46c9 MD |
6264 | * the lwp (typically by being in the context of the lwp). We use a critical | |
6265 | * section to protect against statclock and hardclock (statistics collection). | |
c8fe38ae | 6266 | */ |
d7f50089 YY |
6267 | void |
6268 | pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) | |
6269 | { | |
c8fe38ae MD |
6270 | struct vmspace *oldvm; |
6271 | struct pmap *pmap; | |
4611d87f | 6272 | thread_t td; |
d7f50089 | 6273 | |
c8fe38ae MD |
6274 | oldvm = lp->lwp_vmspace; |
6275 | ||
6276 | if (oldvm != newvm) { | |
a5fc46c9 | 6277 | crit_enter(); |
4611d87f | 6278 | td = curthread; |
a3a33e50 | 6279 | KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0); |
c8fe38ae | 6280 | lp->lwp_vmspace = newvm; |
4611d87f | 6281 | if (td->td_lwp == lp) { |
c8fe38ae | 6282 | pmap = vmspace_pmap(newvm); |
c07315c4 | 6283 | ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid); |
cc694a4a | 6284 | if (pmap->pm_active_lock & CPULOCK_EXCL) |
8c5af5b8 | 6285 | pmap_interlock_wait(newvm); |
c8fe38ae MD |
6286 | #if defined(SWTCH_OPTIM_STATS) |
6287 | tlb_flush_count++; | |
6288 | #endif | |
a86ce0cd | 6289 | if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) { |
4611d87f | 6290 | td->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4); |
94c5f25a | 6291 | if (meltdown_mitigation && pmap->pm_pmlpv_iso) { |
4611d87f MD |
6292 | td->td_pcb->pcb_cr3_iso = |
6293 | vtophys(pmap->pm_pml4_iso); | |
6294 | td->td_pcb->pcb_flags |= PCB_ISOMMU; | |
6295 | } else { | |
6296 | td->td_pcb->pcb_cr3_iso = 0; | |
6297 | td->td_pcb->pcb_flags &= ~PCB_ISOMMU; | |
6298 | } | |
a86ce0cd | 6299 | } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) { |
4611d87f MD |
6300 | td->td_pcb->pcb_cr3 = KPML4phys; |
6301 | td->td_pcb->pcb_cr3_iso = 0; | |
6302 | td->td_pcb->pcb_flags &= ~PCB_ISOMMU; | |
a86ce0cd MD |
6303 | } else { |
6304 | panic("pmap_setlwpvm: unknown pmap type"); | |
6305 | } | |
4611d87f MD |
6306 | |
6307 | /* | |
6308 | * The MMU separation fields need to be updated | |
6309 | * (the pcb can't be accessed directly from the | |
6310 | * restricted user pmap). | |
6311 | */ | |
9e24b495 | 6312 | { |
fc921477 MD |
6313 | struct trampframe *tramp; |
6314 | ||
6315 | tramp = &pscpu->trampoline; | |
6316 | tramp->tr_pcb_cr3 = td->td_pcb->pcb_cr3; | |
6317 | tramp->tr_pcb_cr3_iso = td->td_pcb->pcb_cr3_iso; | |
6318 | tramp->tr_pcb_flags = td->td_pcb->pcb_flags; | |
9e24b495 | 6319 | tramp->tr_pcb_rsp = (register_t)td->td_pcb; |
fc921477 | 6320 | /* td_pcb never moves, so tr_pcb_rsp's value doesn't change */ | |
4611d87f MD |
6321 | } |
6322 | ||
6323 | /* | |
6324 | * In kernel-land we always use the normal PML4E | |
6325 | * so the kernel is fully mapped and can also access | |
6326 | * user memory. | |
6327 | */ | |
6328 | load_cr3(td->td_pcb->pcb_cr3); | |
c8fe38ae | 6329 | pmap = vmspace_pmap(oldvm); |
c07315c4 MD |
6330 | ATOMIC_CPUMASK_NANDBIT(pmap->pm_active, |
6331 | mycpu->gd_cpuid); | |
c8fe38ae | 6332 | } |
a5fc46c9 | 6333 | crit_exit(); |
c8fe38ae | 6334 | } |
c8fe38ae | 6335 | } |
d7f50089 | 6336 | |
f9fa4782 AL |
6337 | /* |
6338 | * Used to control the backing vmspace on the host for a guest VM. | |
6339 | * The cpumask is needed by the host pager to properly invalidate the | |
6340 | * host TLB when paging out the backing memory of a guest VM. | |
6341 | * | |
6342 | * NOTE: The scheduler might sometimes overload multiple vCPUs on the | |
6343 | * same physical cpu, so the operation is not quite as simple as | |
6344 | * calling add_cpu/del_cpu in the core vmrun routines. | |
6345 | */ | |
6346 | void | |
6347 | pmap_add_cpu(struct vmspace *vm, int cpuid) | |
6348 | { | |
6349 | ATOMIC_CPUMASK_ORBIT(vm->vm_pmap.pm_active, mycpu->gd_cpuid); | |
6350 | crit_enter(); | |
6351 | pmap_interlock_wait(vm); | |
6352 | crit_exit(); | |
6353 | } | |
6354 | ||
6355 | void | |
6356 | pmap_del_cpu(struct vmspace *vm, int cpuid) | |
6357 | { | |
6358 | ATOMIC_CPUMASK_NANDBIT(vm->vm_pmap.pm_active, mycpu->gd_cpuid); | |
6359 | } | |
6360 | ||
6361 | void | |
6362 | pmap_del_all_cpus(struct vmspace *vm) | |
6363 | { | |
6364 | CPUMASK_ASSZERO(vm->vm_pmap.pm_active); | |
6365 | } | |
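/*
 * Usage sketch (hypothetical vkernel vCPU loop, illustration only):
 *
 *	pmap_add_cpu(guest_vm, mycpu->gd_cpuid);
 *	... enter and run the guest context ...
 *	pmap_del_cpu(guest_vm, mycpu->gd_cpuid);
 */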
6366 | ||
c2fb025d | 6367 | /* |
9d8625eb AHJ |
6368 | * Called when switching to a locked pmap, used to interlock against pmaps |
6369 | * undergoing modifications to prevent us from activating the MMU for the | |
6370 | * target pmap until all such modifications have completed. We have to do | |
6371 | * this because the thread making the modifications has already set up its | |
6372 | * SMP synchronization mask. | |
6373 | * | |
3cb318dc MD |
6374 | * This function cannot sleep! |
6375 | * | |
9d8625eb | 6376 | * No requirements. |
c2fb025d MD |
6377 | */ |
6378 | void | |
6379 | pmap_interlock_wait(struct vmspace *vm) | |
6380 | { | |
6381 | struct pmap *pmap = &vm->vm_pmap; | |
6382 | ||
cc694a4a | 6383 | if (pmap->pm_active_lock & CPULOCK_EXCL) { |
9d8625eb | 6384 | crit_enter(); |
993bac44 | 6385 | KKASSERT(curthread->td_critcount >= 2); |
b12defdc | 6386 | DEBUG_PUSH_INFO("pmap_interlock_wait"); |
cc694a4a | 6387 | while (pmap->pm_active_lock & CPULOCK_EXCL) { |
c2fb025d MD |
6388 | cpu_ccfence(); |
6389 | lwkt_process_ipiq(); | |
6390 | } | |
cfaeae2a | 6391 | DEBUG_POP_INFO(); |
b12defdc | 6392 | crit_exit(); |
c2fb025d MD |
6393 | } |
6394 | } | |
6395 | ||
d7f50089 YY |
6396 | vm_offset_t |
6397 | pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size) | |
6398 | { | |
c8fe38ae | 6399 | |
f6a0c819 JH |
6400 | if ((obj == NULL) || (size < NBPDR) || |
6401 | ((obj->type != OBJT_DEVICE) && (obj->type != OBJT_MGTDEVICE))) { | |
c8fe38ae MD |
6402 | return addr; |
6403 | } | |
6404 | ||
965b839f | 6405 | addr = roundup2(addr, NBPDR); |
c8fe38ae MD |
6406 | return addr; |
6407 | } | |
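/*
 * Example (illustration only): with 2MB superpages NBPDR is 0x200000,
 * so a device-object hint of 0x10333000 rounds up to 0x10400000,
 * aligning the eventual mapping for possible superpage use.
 */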
722871d3 MD |
6408 | |
6409 | /* | |
6410 | * Used by kmalloc/kfree, page already exists at va | |
6411 | */ | |
6412 | vm_page_t | |
6413 | pmap_kvtom(vm_offset_t va) | |
6414 | { | |
9e5e1578 MD |
6415 | pt_entry_t *ptep = vtopte(va); |
6416 | ||
9e5e1578 | 6417 | return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME)); |
722871d3 | 6418 | } |
921c891e MD |
6419 | |
6420 | /* | |
6421 | * Initialize machine-specific shared page directory support. This | |
6422 | * is executed when a VM object is created. | |
6423 | */ | |
6424 | void | |
6425 | pmap_object_init(vm_object_t object) | |
6426 | { | |
921c891e MD |
6427 | } |
6428 | ||
6429 | /* | |
6430 | * Clean up machine-specific shared page directory support. This | |
6431 | * is executed when a VM object is destroyed. | |
6432 | */ | |
6433 | void | |
6434 | pmap_object_free(vm_object_t object) | |
6435 | { | |
921c891e | 6436 | } |
a7a03a5f MD |
6437 | |
6438 | /* | |
6439 | * pmap_pgscan_callback - Used by pmap_pgscan to acquire the related | |
6440 | * VM page and issue a pginfo->callback. | |
a7a03a5f MD |
6441 | */ |
6442 | static | |
6443 | void | |
6444 | pmap_pgscan_callback(pmap_t pmap, struct pmap_scan_info *info, | |
567a6398 MD |
6445 | vm_pindex_t *pte_placemark, |
6446 | pv_entry_t pt_pv, vm_offset_t va, | |
6447 | pt_entry_t *ptep, void *arg) | |
a7a03a5f MD |
6448 | { |
6449 | struct pmap_pgscan_info *pginfo = arg; | |
6450 | vm_page_t m; | |
567a6398 MD |
6451 | pt_entry_t pte; |
6452 | ||
6453 | pte = *ptep; | |
6454 | cpu_ccfence(); | |
a7a03a5f | 6455 | |
567a6398 | 6456 | if (pte & pmap->pmap_bits[PG_MANAGED_IDX]) { |
a7a03a5f | 6457 | /* |
567a6398 | 6458 | * Try to busy the page while we hold the pte_placemark locked. |
a7a03a5f MD |
6459 | */ |
6460 | m = PHYS_TO_VM_PAGE(*ptep & PG_FRAME); | |
6461 | if (vm_page_busy_try(m, TRUE) == 0) { | |
6462 | if (m == PHYS_TO_VM_PAGE(*ptep & PG_FRAME)) { | |
6463 | /* | |
567a6398 | 6464 | * The callback is issued with the pt_pv |
a7a03a5f MD |
6465 | * unlocked. |
6466 | */ | |
567a6398 | 6467 | pv_placemarker_wakeup(pmap, pte_placemark); |
76f1911e MD |
6468 | if (pt_pv) { |
6469 | vm_page_wire_quick(pt_pv->pv_m); | |
a7a03a5f | 6470 | pv_unlock(pt_pv); |
76f1911e | 6471 | } |
a7a03a5f MD |
6472 | if (pginfo->callback(pginfo, va, m) < 0) |
6473 | info->stop = 1; | |
76f1911e | 6474 | if (pt_pv) { |
a7a03a5f | 6475 | pv_lock(pt_pv); |
e3c330f0 MD |
6476 | if (vm_page_unwire_quick(pt_pv->pv_m)) { |
6477 | panic("pmap_pgscan: bad wire_" | |
6478 | "count on pt_pv"); | |
6479 | } | |
76f1911e | 6480 | } |
a7a03a5f | 6481 | } else { |
da2da420 | 6482 | vm_page_wakeup(m); |
567a6398 | 6483 | pv_placemarker_wakeup(pmap, pte_placemark); |
a7a03a5f MD |
6484 | } |
6485 | } else { | |
6486 | ++pginfo->busycount; | |
567a6398 | 6487 | pv_placemarker_wakeup(pmap, pte_placemark); |
a7a03a5f | 6488 | } |
a7a03a5f | 6489 | } else { |
e989b548 MD |
6490 | /* |
6491 | * Shared page table or unmanaged page (sharept or !sharept) | |
6492 | */ | |
76f1911e | 6493 | pv_placemarker_wakeup(pmap, pte_placemark); |
a7a03a5f MD |
6494 | } |
6495 | } | |
6496 | ||
6497 | void | |
6498 | pmap_pgscan(struct pmap_pgscan_info *pginfo) | |
6499 | { | |
6500 | struct pmap_scan_info info; | |
6501 | ||
6502 | pginfo->offset = pginfo->beg_addr; | |
6503 | info.pmap = pginfo->pmap; | |
6504 | info.sva = pginfo->beg_addr; | |
6505 | info.eva = pginfo->end_addr; | |
6506 | info.func = pmap_pgscan_callback; | |
6507 | info.arg = pginfo; | |
6508 | pmap_scan(&info, 0); | |
6509 | if (info.stop == 0) | |
6510 | pginfo->offset = pginfo->end_addr; | |
6511 | } | |
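/*
 * Usage sketch (hypothetical callback, illustration only).  The
 * callback receives (m) hard-busied and returns a negative value to
 * stop the scan; the assumption here is that the callback itself
 * disposes of the busy.
 */
#if 0
static int
example_pgscan_cb(struct pmap_pgscan_info *pginfo, vm_offset_t va,
		  vm_page_t m)
{
	vm_page_wakeup(m);		/* assumed: callback drops busy */
	return 0;			/* >= 0 continues the scan */
}

static void
example_pgscan(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct pmap_pgscan_info pginfo;

	bzero(&pginfo, sizeof(pginfo));
	pginfo.pmap = pmap;
	pginfo.beg_addr = sva;
	pginfo.end_addr = eva;
	pginfo.callback = example_pgscan_cb;
	pmap_pgscan(&pginfo);
	/* pginfo.offset reaches eva only if the scan was not stopped */
}
#endif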
76f1911e MD |
6512 | |
6513 | /* | |
6514 | * Wait for a placemarker that we do not own to clear. The placemarker | |
2519e05d | 6515 | * in question is not necessarily set to the pindex we want; we may have | |
76f1911e | 6516 | * to wait on the element because we want to reserve it ourselves. |
2519e05d MD |
6517 | * |
6518 | * NOTE: PM_PLACEMARK_WAKEUP sets a bit which is already set in | |
6519 | * PM_NOPLACEMARK, so it does not interfere with placemarks | |
6520 | * which have already been woken up. | |
92414ddf MD |
6521 | * |
6522 | * NOTE: This routine is called without the pmap spin-lock and so can | |
6523 | * race changes to *pmark. Due to the sensitivity of the routine | |
6524 | * to possible MULTIPLE interactions from other cpus, and the | |
6525 | * overloading of the WAKEUP bit on PM_NOPLACEMARK, we have to | |
6526 | * use a cmpset loop to avoid a race that might cause the WAKEUP | |
6527 | * bit to be lost. | |
6528 | * | |
6529 | * Caller is expected to retry its operation upon return. | |
76f1911e MD |
6530 | */ |
6531 | static | |
6532 | void | |
6533 | pv_placemarker_wait(pmap_t pmap, vm_pindex_t *pmark) | |
6534 | { | |
92414ddf MD |
6535 | vm_pindex_t mark; |
6536 | ||
6537 | mark = *pmark; | |
6538 | cpu_ccfence(); | |
6539 | while (mark != PM_NOPLACEMARK) { | |
2519e05d | 6540 | tsleep_interlock(pmark, 0); |
92414ddf MD |
6541 | if (atomic_fcmpset_long(pmark, &mark, |
6542 | mark | PM_PLACEMARK_WAKEUP)) { | |
2519e05d | 6543 | tsleep(pmark, PINTERLOCKED, "pvplw", 0); |
92414ddf MD |
6544 | break; |
6545 | } | |
76f1911e | 6546 | } |
76f1911e MD |
6547 | } |
6548 | ||
6549 | /* | |
6550 | * Wakeup a placemarker that we own. Replace the entry with | |
6551 | * PM_NOPLACEMARK and issue a wakeup() if necessary. | |
6552 | */ | |
6553 | static | |
6554 | void | |
6555 | pv_placemarker_wakeup(pmap_t pmap, vm_pindex_t *pmark) | |
6556 | { | |
6557 | vm_pindex_t pindex; | |
6558 | ||
76f1911e | 6559 | pindex = atomic_swap_long(pmark, PM_NOPLACEMARK); |
76f1911e MD |
6560 | KKASSERT(pindex != PM_NOPLACEMARK); |
6561 | if (pindex & PM_PLACEMARK_WAKEUP) | |
6562 | wakeup(pmark); | |
6563 | } |
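/*
 * Bit sketch (values assumed, illustration only): if PM_NOPLACEMARK
 * is all-ones and PM_PLACEMARK_WAKEUP is a single high bit, then
 * OR'ing the wakeup bit into an already-released slot leaves it at
 * PM_NOPLACEMARK, which is why released placemarks are unaffected.
 */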