2 * Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $
38 * pmap invalidation support code. Certain hardware requirements must
39 * be dealt with when manipulating page table entries and page directory
40 * entries within a pmap. In particular, we cannot safely manipulate
41 * page tables which are in active use by another cpu (even if it is
42 * running in userland) for two reasons: First, TLB writebacks will
43 * race against our own modifications and tests. Second, even if we
44 * were to use bus-locked instructions we can still screw up the
45 * target cpu's instruction pipeline due to Intel cpu errata.
47 * For our virtual page tables, the real kernel will handle SMP interactions
48 * with pmaps that may be active on other cpus. Even so, we have to be
49 * careful about bit setting races particularly when we are trying to clean
50 * a page and test the modified bit to avoid races where the modified bit
51 * might get set after our poll but before we clear the field.
53 #include <sys/param.h>
54 #include <sys/systm.h>
55 #include <sys/kernel.h>
57 #include <sys/vmmeter.h>
58 #include <sys/thread2.h>
59 #include <sys/cdefs.h>
61 #include <sys/vmspace.h>
66 #include <vm/vm_object.h>
68 #include <machine/cputypes.h>
69 #include <machine/md_var.h>
70 #include <machine/specialreg.h>
71 #include <machine/smp.h>
72 #include <machine/globaldata.h>
73 #include <machine/pmap.h>
74 #include <machine/pmap_inval.h>
79 extern int vmm_enabled;
85 /* For VMM mode forces vmmexit/resume */
87 __asm __volatile("syscall;"
94 * Invalidate va in the TLB on the current cpu
98 pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes)
100 if (pmap == &kernel_pmap) {
101 madvise((void *)va, bytes, MADV_INVAL);
103 vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0);
108 * This is a bit of a mess because we don't know what virtual cpus are
109 * mapped to real cpus. Basically try to optimize the degenerate cases
110 * (primarily related to user processes with only one thread or only one
111 * running thread), and shunt all the rest to the host cpu. The host cpu
112 * will invalidate all real cpus the vkernel is running on.
114 * This can't optimize situations where a pmap is only mapped to some of
115 * the virtual cpus, though shunting to the real host will still be faster
116 * if the virtual kernel processes are running on fewer real-host cpus.
117 * (And probably will be faster anyway since there's no round-trip signaling
120 * NOTE: The critical section protects against preemption while the pmap
121 * is locked, which could otherwise result in a deadlock.
125 guest_sync_addr(struct pmap *pmap,
126 volatile vpte_t *dst_ptep, volatile vpte_t *src_ptep)
128 globaldata_t gd = mycpu;
133 if (pmap->pm_active == 0 &&
134 atomic_cmpset_cpumask(&pmap->pm_active, 0, CPUMASK_LOCK)) {
136 * Avoid IPIs if pmap is inactive and we can trivially
139 *dst_ptep = *src_ptep;
141 } else if (pmap->pm_active == gd->gd_cpumask &&
142 atomic_cmpset_cpumask(&pmap->pm_active,
143 gd->gd_cpumask, gd->gd_cpumask | CPUMASK_LOCK)) {
145 * Avoid IPIs if only our cpu is using the pmap and we
146 * can trivially lock it.
148 *dst_ptep = *src_ptep;
155 oactive = pmap->pm_active;
157 if ((oactive & CPUMASK_LOCK) == 0) {
158 nactive = oactive | CPUMASK_LOCK;
159 if (atomic_cmpset_cpumask(&pmap->pm_active,
169 vmm_guest_sync_addr(__DEVOLATILE(void *, dst_ptep),
170 __DEVOLATILE(void *, src_ptep));
172 atomic_clear_cpumask(&pmap->pm_active, CPUMASK_LOCK);
177 * Invalidate a pte in a pmap and synchronize with target cpus
178 * as required. Throw away the modified and access bits. Use
179 * pmap_clean_pte() to do the same thing but also get an interlocked
180 * modified/access status.
182 * Clearing the field first (basically clearing VPTE_V) prevents any
183 * new races from occurring while we invalidate the TLB (i.e. the pmap
184 * on the real cpu), then clear it again to clean out any race that
185 * might have occurred before the invalidation completed.
188 pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
192 if (vmm_enabled == 0) {
194 pmap_inval_cpu(pmap, va, PAGE_SIZE);
197 guest_sync_addr(pmap, ptep, &pte);
202 * Same as pmap_inval_pte() but only synchronize with the current
203 * cpu. For the moment it's the same as the non-quick version.
206 pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
212 pmap_inval_cpu(pmap, va, PAGE_SIZE);
216 * Invalidating page directory entries requires some additional
217 * sophistication. The cachemask must be cleared so the kernel
218 * resynchronizes its temporary page table mappings cache.
221 pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
225 if (vmm_enabled == 0) {
227 pmap_inval_cpu(pmap, va, SEG_SIZE);
228 } else if ((pmap->pm_active & mycpu->gd_other_cpus) == 0) {
233 guest_sync_addr(pmap, ptep, &pte);
238 pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
240 pmap_inval_pde(ptep, pmap, va);
244 * These carefully handle interactions with other cpus and return
245 * the original vpte. Clearing VPTE_RW prevents us from racing the
246 * setting of VPTE_M, allowing us to invalidate the tlb (the real cpu's
247 * pmap) and get good status for VPTE_M.
249 * When messing with page directory entries we have to clear the cpu
250 * mask to force a reload of the kernel's page table mapping cache.
252 * clean: clear VPTE_M and VPTE_RW
253 * setro: clear VPTE_RW
254 * load&clear: clear entire field
258 pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
264 atomic_clear_long(ptep, VPTE_RW); /* XXX */
265 if (vmm_enabled == 0) {
266 pmap_inval_cpu(pmap, va, PAGE_SIZE);
269 guest_sync_addr(pmap, &pte, ptep);
271 atomic_clear_long(ptep, VPTE_RW|VPTE_M);
277 pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
283 atomic_clear_long(ptep, VPTE_RW);
284 if (vmm_enabled == 0) {
285 pmap_inval_cpu(pmap, va, SEG_SIZE);
288 guest_sync_addr(pmap, &pte, ptep);
290 atomic_clear_long(ptep, VPTE_RW|VPTE_M);
296 * This is an odd case and I'm not sure whether it even occurs in normal
297 * operation. Turn off write access to the page, clean out the tlb
298 * (the real cpu's pmap), and deal with any VPTE_M race that may have
299 * occurred. VPTE_M is not cleared.
302 pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
309 atomic_clear_long(ptep, VPTE_RW);
310 if (vmm_enabled == 0) {
311 pmap_inval_cpu(pmap, va, PAGE_SIZE);
312 pte |= *ptep & VPTE_M;
314 guest_sync_addr(pmap, &npte, ptep);
315 pte |= npte & VPTE_M;
322 * This is a combination of pmap_inval_pte() and pmap_clean_pte().
323 * First prevent races with the 'A' and 'M' bits, then clean out
324 * the tlb (the real cpu's pmap), then incorporate any races that
325 * may have occurred in the meantime, and finally zero out the pte.
328 pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap,
337 atomic_clear_long(ptep, VPTE_RW);
338 if (vmm_enabled == 0) {
339 pmap_inval_cpu(pmap, va, PAGE_SIZE);
340 pte |= *ptep & (VPTE_A | VPTE_M);
342 guest_sync_addr(pmap, &npte, ptep);
343 pte |= npte & (VPTE_A | VPTE_M);
351 * Synchronize a kvm mapping originally made for the private use on
352 * some other cpu so it can be used on all cpus.
354 * XXX add MADV_RESYNC to improve performance.
356 * We don't need to do anything because our pmap_inval_pte_quick()
357 * synchronizes it immediately.
360 pmap_kenter_sync(vm_offset_t va __unused)
365 cpu_invlpg(void *addr)
368 vmm_cpu_invltlb(); /* For VMM mode forces vmmexit/resume */
370 madvise(addr, PAGE_SIZE, MADV_INVAL);
377 vmm_cpu_invltlb(); /* For VMM mode forces vmmexit/resume */
379 madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL);
385 /* XXX must invalidate the tlb on all cpus */
386 /* at the moment pmap_inval_pte_quick */