2 * Copyright (c) 2003-2011 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * pmap invalidation support code. Certain hardware requirements must
37 * be dealt with when manipulating page table entries and page directory
38 * entries within a pmap. In particular, we cannot safely manipulate
39 * page tables which are in active use by another cpu (even if it is
40 * running in userland) for two reasons: First, TLB writebacks will
41 * race against our own modifications and tests. Second, even if we
42 * were to use bus-locked instructions we can still screw up the
43 * target cpu's instruction pipeline due to Intel cpu errata.
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
50 #include <sys/vmmeter.h>
51 #include <sys/thread2.h>
52 #include <sys/sysctl.h>
56 #include <vm/vm_object.h>
58 #include <machine/cputypes.h>
59 #include <machine/md_var.h>
60 #include <machine/specialreg.h>
61 #include <machine/smp.h>
62 #include <machine/globaldata.h>
63 #include <machine/pmap.h>
64 #include <machine/pmap_inval.h>
/*
 * Spin-wait diagnostic interval.  The wait loops in this file bump a
 * per-command loop counter (info->xloops) and call loopdebug() each time
 * (loops & LOOPMASK) == 0.
 */
67 #define LOOPMASK (/* 32 * */ 16 * 128 * 1024 - 1)
/*
 * Page-count threshold.  Requests covering more than this many pages are
 * converted to a whole-TLB invalidation instead of one cpu_invlpg() per
 * page.
 */
70 #define MAX_INVAL_PAGES 128
/*
 * Invalidation command block; one instance exists per cpu in invinfo[].
 * Only the 'mode' member is visible in this excerpt.  The code in this
 * file also accesses done, mask, sigmask, va, npgs, ptep, npte, opte,
 * success and xloops.
 *
 * mode:
 *	INVDONE   - no command outstanding.  Asserted before a new command
 *		    is set up and restored by the originator when its
 *		    command completes.
 *	INVSTORE  - unconditional pte store, set by pmap_inval_smp().
 *	INVCMPSET - compare-and-set of the pte, set by
 *		    pmap_inval_smp_cmpset().
 */
72 struct pmap_inval_info {
77 enum { INVDONE, INVSTORE, INVCMPSET } mode;
89 typedef struct pmap_inval_info pmap_inval_info_t;
/* One command block per cpu; originators use invinfo[their gd_cpuid]. */
91 static pmap_inval_info_t invinfo[MAXCPU];
/*
 * Invalidation scan mask.  An originating cpu sets its bit here before
 * publishing a command and clears it once the command has completed.
 */
92 extern cpumask_t smp_invmask;
95 extern cpumask_t smp_in_mask;
/* Printed by loopdebug() as part of its diagnostic output. */
97 extern cpumask_t smp_smurf_mask;
/*
 * Running total of (bulk->count - 1) accumulated by
 * pmap_inval_bulk_flush().  Exported read/write through the sysctl
 * declared below under the _machdep node.
 */
99 static long pmap_inval_bulk_count;
101 SYSCTL_LONG(_machdep, OID_AUTO, pmap_inval_bulk_count, CTLFLAG_RW,
102 &pmap_inval_bulk_count, 0, "");
/*
 * pmap_inval_init() - begin an invalidation sequence on (pmap).
 *
 * Enters the "inval" critical section.  For any pmap other than
 * kernel_pmap it samples pmap->pm_active_lock and uses
 * atomic_cmpset_int() to move it from the sampled value to that value
 * with CPULOCK_EXCL set; pmap->pm_invgen is also incremented.
 *
 * Paired with pmap_inval_done(), which clears CPULOCK_EXCL again and
 * leaves the critical section.
 *
 * NOTE(review): the retry loop around the cmpset and its exit condition
 * are not visible in this excerpt; confirm how contention on
 * CPULOCK_EXCL is handled before relying on this description.
 */
105 pmap_inval_init(pmap_t pmap)
110 crit_enter_id("inval");
112 if (pmap != &kernel_pmap) {
114 olock = pmap->pm_active_lock;
116 nlock = olock | CPULOCK_EXCL;
117 if (olock != nlock &&
118 atomic_cmpset_int(&pmap->pm_active_lock,
125 atomic_add_acq_long(&pmap->pm_invgen, 1);
/*
 * pmap_inval_done() - end an invalidation sequence started with
 * pmap_inval_init().
 *
 * For any pmap other than kernel_pmap, atomically clears CPULOCK_EXCL in
 * pmap->pm_active_lock and increments pmap->pm_invgen.  In all cases it
 * then leaves the "inval" critical section.
 */
130 pmap_inval_done(pmap_t pmap)
132 if (pmap != &kernel_pmap) {
133 atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
134 atomic_add_acq_long(&pmap->pm_invgen, 1);
136 crit_exit_id("inval");
140 * API function - invalidate the pte at (va) and replace *ptep with
141 * npte atomically across the pmap's active cpus.
143 * This is a holy mess.
145 * Returns the previous contents of *ptep.
/*
 * loopdebug() - diagnostic dump used by the spin-wait loops in this file.
 *
 * msg  - tag identifying the calling loop (e.g. "orig_waitA")
 * info - command block being waited on
 *
 * Prints msg, the current cpu id, info->mode and several cpu masks
 * (info->sigmask and smp_smurf_mask among them), followed by the
 * gd_xinvaltlb value of each of the ncpus cpus.
 *
 * NOTE(review): the atomic_add_long() of 0 on smp_smurf_mask presumably
 * exists only for the side effect of a locked access before the mask is
 * printed; confirm.  Some kprintf() arguments are not visible in this
 * excerpt.
 */
149 loopdebug(const char *msg, pmap_inval_info_t *info)
152 int cpu = mycpu->gd_cpuid;
155 atomic_add_long(&smp_smurf_mask.ary[0], 0);
156 kprintf("%s %d mode=%d m=%08jx d=%08jx s=%08jx "
161 msg, cpu, info->mode,
164 info->sigmask.ary[0],
168 smp_smurf_mask.ary[0]);
170 for (p = 0; p < ncpus; ++p)
171 kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
/*
 * CHECKSIGMASK(info) - consistency check of info->sigmask against
 * info->mask, tagged with the caller's __FILE__ and __LINE__.
 *
 * Two definitions appear here: one forwarding to _checksigmask() and an
 * empty one.  The preprocessor conditional that selects between them is
 * not visible in this excerpt.
 *
 * _checksigmask() ANDs a temporary mask with info->sigmask and, when the
 * result differs from info->mask, kprintf()s a "bad sig/mask" line with
 * the low word of both masks.
 *
 * NOTE(review): the initial value of tmp is not visible; presumably it
 * starts out as a copy of info->mask - confirm.
 */
177 #define CHECKSIGMASK(info) _checksigmask(info, __FILE__, __LINE__)
181 _checksigmask(pmap_inval_info_t *info, const char *file, int line)
186 CPUMASK_ANDMASK(tmp, info->sigmask);
187 if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
188 kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
189 file, line, info->sigmask.ary[0], info->mask.ary[0]);
195 #define CHECKSIGMASK(info)
200 * Invalidate the specified va across all cpus associated with the pmap.
201 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg(). The operation
202 * will be done fully synchronously with storing npte into *ptep and returning
205 * If ptep is NULL the operation will execute semi-synchronously.
206 * ptep must be NULL if npgs > 1
/*
 * pmap_inval_smp() parameter and flow summary.
 *
 * pmap - pmap being modified; the whole operation is bracketed by
 *	  pmap_inval_init() / pmap_inval_done()
 * va   - first address to invalidate, or (vm_offset_t)-1 for everything
 * npgs - number of pages starting at va
 * ptep - pte to replace with npte, or NULL for invalidation only
 * npte - new pte contents
 *
 * Flow, as far as it is visible here:
 *  1. If pmap->pm_active equals this cpu's gd_cpumask the work is done
 *     locally: npgs > MAX_INVAL_PAGES converts va to (vm_offset_t)-1,
 *     the pte is exchanged with atomic_swap_long() and the local TLB is
 *     invalidated, then pmap_inval_done() is called.
 *  2. Otherwise spin until this cpu's invinfo[] slot has an empty 'done'
 *     mask.  Whenever (loops & LOOPMASK) == 0 the loop reports through
 *     loopdebug() and forcibly zeroes 'done'.
 *  3. Save rflags, add this cpu to smp_invmask (warning if it was
 *     already set), and build the target mask from pm_active ANDed with
 *     smp_active_mask.  smp_smurf_idleinvlclr() is applied to the mask
 *     (per the comment below, for the ptep == NULL case) and this cpu's
 *     bit is ORed in.  mode becomes INVSTORE; mask and sigmask are
 *     stored, and 'done' is stored last because targets may start
 *     executing as soon as it is visible.
 *  4. smp_invlpg() is called with the local copy of the mask; mode must
 *     be INVDONE again when it returns.
 *  5. Remove this cpu from smp_invmask, restore rflags and call
 *     pmap_inval_done().
 *
 * NOTE(review): the return type, some declarations, the interrupt
 * disable that follows read_rflags(), the stores of va/npgs/ptep/npte
 * into the command block and the return statements are not visible in
 * this excerpt.
 */
209 pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
210 pt_entry_t *ptep, pt_entry_t npte)
212 globaldata_t gd = mycpu;
213 pmap_inval_info_t *info;
215 int cpu = gd->gd_cpuid;
217 unsigned long rflags;
220 * Shortcut single-cpu case if possible.
224 pmap_inval_init(pmap);
225 if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
227 * Convert to invltlb if there are too many pages to
230 if (npgs > MAX_INVAL_PAGES) {
232 va = (vm_offset_t)-1;
236 * Invalidate the specified pages, handle invltlb if requested.
241 opte = atomic_swap_long(ptep, npte);
244 if (va == (vm_offset_t)-1)
246 cpu_invlpg((void *)va);
249 if (va == (vm_offset_t)-1)
251 pmap_inval_done(pmap);
257 * We must wait for other cpus which may still be finishing up a
260 info = &invinfo[cpu];
261 while (CPUMASK_TESTNZERO(info->done)) {
265 loops = ++info->xloops;
266 if ((loops & LOOPMASK) == 0) {
268 loopdebug("orig_waitA", info);
269 /* XXX recover from possible bug */
270 CPUMASK_ASSZERO(info->done);
275 KKASSERT(info->mode == INVDONE);
278 * Must disable interrupts to prevent an Xinvltlb (which ignores
279 * critical sections) from trying to execute our command before we
280 * have managed to send any IPIs to the target cpus.
282 rflags = read_rflags();
286 * Must set our cpu in the invalidation scan mask before
287 * any possibility of [partial] execution (remember, XINVLTLB
288 * can interrupt a critical section).
290 if (CPUMASK_TESTBIT(smp_invmask, cpu)) {
291 kprintf("bcpu %d already in\n", cpu);
293 ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);
303 tmpmask = pmap->pm_active; /* volatile (bits may be cleared) */
305 CPUMASK_ANDMASK(tmpmask, smp_active_mask);
308 * If ptep is NULL the operation can be semi-synchronous, which means
309 * we can improve performance by flagging and removing idle cpus
310 * (see the idleinvlclr function in mp_machdep.c).
312 * Typically kernel page table operation is semi-synchronous.
315 smp_smurf_idleinvlclr(&tmpmask);
316 CPUMASK_ORBIT(tmpmask, cpu);
317 info->mode = INVSTORE;
320 * Command may start executing the moment 'done' is initialized,
321 * disable current cpu interrupt to prevent 'done' field from
322 * changing (other cpus can't clear done bits until the originating
323 * cpu clears its mask bit, but other cpus CAN start clearing their
326 info->mask = tmpmask;
328 info->sigmask = tmpmask;
332 info->done = tmpmask; /* execute can begin here due to races */
335 * Pass our copy of the done bits (so they don't change out from
336 * under us) to generate the Xinvltlb interrupt on the targets.
338 smp_invlpg(&tmpmask);
340 KKASSERT(info->mode == INVDONE);
343 * Target cpus will be in their loop exiting concurrently with our
344 * cleanup. They will not lose the bitmask they obtained before so
345 * we can safely clear this bit.
347 ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
348 write_rflags(rflags);
349 pmap_inval_done(pmap);
355 * API function - invalidate the pte at (va) and replace *ptep with npte
356 * atomically only if *ptep equals opte, across the pmap's active cpus.
358 * Returns 1 on success, 0 on failure (caller typically retries).
/*
 * pmap_inval_smp_cmpset() parameter and flow summary.
 *
 * pmap - pmap being modified; the whole operation is bracketed by
 *	  pmap_inval_init() / pmap_inval_done()
 * va   - address to invalidate, or (vm_offset_t)-1
 * ptep - pte to update
 * opte - value *ptep must currently hold for the update to happen
 * npte - replacement pte contents
 *
 * Flow, as far as it is visible here:
 *  1. If pmap->pm_active equals this cpu's gd_cpumask the update is a
 *     local atomic_cmpset_long().  The local TLB is invalidated only
 *     when the cmpset succeeds, and pmap_inval_done() is called on both
 *     outcomes.
 *  2. Otherwise spin until this cpu's invinfo[] slot has an empty 'done'
 *     mask.  Whenever (loops & LOOPMASK) == 0 the loop reports through
 *     loopdebug() and forcibly zeroes 'done'.
 *  3. Save rflags and add this cpu to smp_invmask (warning if it was
 *     already set).  Fill in the command block with npgs fixed at 1,
 *     and build the target mask from pm_active ANDed with
 *     smp_active_mask plus this cpu.  mode becomes INVCMPSET; mask and
 *     sigmask are stored, then 'done'.
 *  4. smp_invlpg() runs the command.  info->success is read back and
 *     mode must be INVDONE again.
 *  5. Remove this cpu from smp_invmask, restore rflags and call
 *     pmap_inval_done().
 *
 * NOTE(review): the return type, some declarations, the interrupt
 * disable that follows read_rflags(), the stores of va/ptep/opte/npte
 * into the command block and the return statements are not visible in
 * this excerpt.
 */
361 pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
362 pt_entry_t opte, pt_entry_t npte)
364 globaldata_t gd = mycpu;
365 pmap_inval_info_t *info;
367 int cpu = gd->gd_cpuid;
369 unsigned long rflags;
372 * Shortcut single-cpu case if possible.
376 pmap_inval_init(pmap);
377 if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
378 if (atomic_cmpset_long(ptep, opte, npte)) {
379 if (va == (vm_offset_t)-1)
382 cpu_invlpg((void *)va);
383 pmap_inval_done(pmap);
386 pmap_inval_done(pmap);
392 * We must wait for other cpus which may still be finishing
393 * up a prior operation.
395 info = &invinfo[cpu];
396 while (CPUMASK_TESTNZERO(info->done)) {
400 loops = ++info->xloops;
401 if ((loops & LOOPMASK) == 0) {
403 loopdebug("orig_waitB", info);
404 /* XXX recover from possible bug */
405 CPUMASK_ASSZERO(info->done);
410 KKASSERT(info->mode == INVDONE);
413 * Must disable interrupts to prevent an Xinvltlb (which ignores
414 * critical sections) from trying to execute our command before we
415 * have managed to send any IPIs to the target cpus.
417 rflags = read_rflags();
421 * Must set our cpu in the invalidation scan mask before
422 * any possibility of [partial] execution (remember, XINVLTLB
423 * can interrupt a critical section).
425 if (CPUMASK_TESTBIT(smp_invmask, cpu)) {
426 kprintf("acpu %d already in\n", cpu);
428 ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);
431 info->npgs = 1; /* unused */
436 tmpmask = pmap->pm_active; /* volatile */
438 CPUMASK_ANDMASK(tmpmask, smp_active_mask);
439 CPUMASK_ORBIT(tmpmask, cpu);
440 info->mode = INVCMPSET; /* initialize last */
444 * Command may start executing the moment 'done' is initialized,
445 * disable current cpu interrupt to prevent 'done' field from
446 * changing (other cpus can't clear done bits until the originating
447 * cpu clears its mask bit).
450 info->mask = tmpmask;
452 info->sigmask = tmpmask;
455 info->done = tmpmask;
458 * Calling smp_invlpg() will issue the IPIs to XINVLTLB (which can
459 * execute even from inside a critical section), and will call us
460 * back with via pmap_inval_intr() with interrupts disabled.
462 * Unlike smp_invltlb(), this interface causes all cpus to stay
463 * inside XINVLTLB until the whole thing is done. When our cpu
464 * detects that the whole thing is done we execute the requested
465 * operation and return.
467 smp_invlpg(&tmpmask);
468 success = info->success;
469 KKASSERT(info->mode == INVDONE);
471 ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
472 write_rflags(rflags);
473 pmap_inval_done(pmap);
/*
 * pmap_inval_bulk_init() - prepare a bulk invalidation context.
 *
 * bulk - context later passed to pmap_inval_bulk() and
 *	  pmap_inval_bulk_flush()
 * pmap - pmap the batched operations will apply to
 *
 * NOTE(review): the body is not visible in this excerpt.  Presumably it
 * records pmap in bulk->pmap and resets bulk->va_beg, bulk->va_end and
 * bulk->count, which are the members the other bulk functions read -
 * confirm against the full source.
 */
479 pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
/*
 * pmap_inval_bulk() - replace *ptep with npte and arrange for (va) to be
 * invalidated, batching the invalidation for kernel_pmap.
 *
 * bulk - batching context; this function reads bulk->pmap and maintains
 *	  the pending range bulk->va_beg .. bulk->va_end
 * va   - address mapped by the pte, or (vm_offset_t)-1
 * ptep - pte to replace
 * npte - new pte contents
 *
 * Three cases are visible:
 *  - Degenerate: a plain atomic_swap_long() with no invalidation.  The
 *    condition selecting this case is not visible in this excerpt.
 *  - bulk->pmap != &kernel_pmap: the update is performed immediately via
 *    pmap_inval_smp() for a single page.
 *  - kernel_pmap: the pte is exchanged with atomic_swap_long() right
 *    away and va is folded into the pending range.  An empty range
 *    (va_beg == va_end) or a va equal to va_end sets va_end to
 *    va + PAGE_SIZE.  A later branch sets va_beg to (vm_offset_t)-1,
 *    which pmap_inval_bulk_flush() handles with a pmap_inval_smp() call
 *    on va == (vm_offset_t)-1.
 *
 * NOTE(review): several branch bodies, the statements surrounding the
 * pmap_inval_bulk_flush() call near the end (it may sit inside a
 * disabled region) and the return statement are not visible here;
 * presumably the previous pte value held in 'pte' is returned - confirm.
 */
488 pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
489 pt_entry_t *ptep, pt_entry_t npte)
494 * Degenerate case, localized or we don't care (e.g. because we
495 * are jacking the entire page table) or the pmap is not in-use
496 * by anyone. No invalidations are done on any cpu.
499 pte = atomic_swap_long(ptep, npte);
504 * If it isn't the kernel pmap we execute the operation synchronously
505 * on all cpus belonging to the pmap, which avoids concurrency bugs in
506 * the hw related to changing pte's out from under threads.
508 * Eventually I would like to implement streaming pmap invalidation
509 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
512 if (bulk->pmap != &kernel_pmap) {
513 pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
518 * This is the kernel_pmap. All unmap operations presume that there
519 * are no other cpus accessing the addresses in question. Implement
520 * the bulking algorithm. collect the required information and
521 * synchronize once at the end.
523 pte = atomic_swap_long(ptep, npte);
524 if (va == (vm_offset_t)-1) {
526 } else if (bulk->va_beg == bulk->va_end) {
528 bulk->va_end = va + PAGE_SIZE;
529 } else if (va == bulk->va_end) {
530 bulk->va_end = va + PAGE_SIZE;
532 bulk->va_beg = (vm_offset_t)-1;
535 pmap_inval_bulk_flush(bulk);
537 if (va == (vm_offset_t)-1) {
542 bulk->va_end = va + PAGE_SIZE;
/*
 * pmap_inval_bulk_flush() - issue the invalidation accumulated in (bulk).
 *
 * Adds (bulk->count - 1) to the pmap_inval_bulk_count statistic.  If the
 * pending range is non-empty (va_beg != va_end):
 *  - va_beg == (vm_offset_t)-1: one pmap_inval_smp() call with that va,
 *    i.e. a request to invalidate everything;
 *  - otherwise: one pmap_inval_smp() call covering
 *    (va_end - va_beg) >> PAGE_SHIFT pages starting at va_beg.
 * Both calls pass ptep == NULL, so no pte is modified here.
 *
 * NOTE(review): any early-out at the top of the function and the reset
 * of the bulk fields after the flush are not visible in this excerpt.
 */
552 pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
557 pmap_inval_bulk_count += (bulk->count - 1);
558 if (bulk->va_beg != bulk->va_end) {
559 if (bulk->va_beg == (vm_offset_t)-1) {
560 pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
564 n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
565 pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
574 * Called with interrupts hard-disabled.
/*
 * pmap_inval_intr() - service pending invalidation commands on this cpu.
 *
 * Walks the set bits of a local cpumask; each bit (n) names an
 * originating cpu.  A command block whose 'done' mask does not contain
 * this cpu is skipped.  Otherwise the command state machine advances:
 *
 *  Target ("other") cpu:
 *   - our bit is still set in info->mask: clear it, telling the
 *     originator that we have entered the loop (quiesced).
 *   - info->ptep is non-NULL and the originator's bit (n) is still set
 *     in info->mask: keep waiting for the originator to update the pte.
 *   - otherwise: invalidate locally.  va == (vm_offset_t)-1 or
 *     npgs > MAX_INVAL_PAGES selects the whole-TLB path, else
 *     cpu_invlpg() runs once per page.  Then clear our bit in
 *     info->done, after which the command block must not be touched.
 *
 *  Originating cpu:
 *   - our bit is set in info->mask and other cpus are still present:
 *     keep waiting.  Whenever (loops & LOOPMASK) == 0 report through
 *     loopdebug(), zero gd_xinvaltlb and re-issue smp_invlpg() to
 *     smp_active_mask.
 *   - our bit is the only one left in info->mask: perform the pte
 *     update (atomic_swap_long() into info->opte for INVSTORE,
 *     atomic_cmpset_long() otherwise) and clear our mask bit so the
 *     targets can finish.
 *   - remaining case: invalidate locally with the same va/npgs rule,
 *     set mode to INVDONE and clear our bit in info->done.
 *
 * NOTE(review): the initialisation of the local cpumask (presumably from
 * *cpumaskp), the lookup of 'info' (presumably &invinfo[n]), the test
 * separating target from originator, the loop bookkeeping and the return
 * value are not visible in this excerpt, and the function may continue
 * beyond the last line shown.
 */
577 pmap_inval_intr(cpumask_t *cpumaskp)
579 globaldata_t gd = mycpu;
580 pmap_inval_info_t *info;
589 * Check all cpus for invalidations we may need to service.
595 while (CPUMASK_TESTNZERO(cpumask)) {
596 int n = BSFCPUMASK(cpumask);
599 KKASSERT(n >= 0 && n < MAXCPU);
602 CPUMASK_NANDBIT(cpumask, n);
606 * Due to interrupts/races we can catch a new operation
607 * in an older interrupt. A fence is needed once we detect
608 * the (not) done bit.
610 if (!CPUMASK_TESTBIT(info->done, cpu))
615 * info->mask and info->done always contain the originating
616 * cpu until the originator is done. Targets may still be
617 * present in info->done after the originator is done (they
618 * will be finishing up their loops).
620 * Clear info->mask bits on other cpus to indicate that they
621 * have quiesced (entered the loop). Once the other mask bits
622 * are clear we can execute the operation on the original,
623 * then clear the mask and done bits on the originator. The
624 * targets will then finish up their side and clear their
627 * The command is considered 100% done when all done bits have
632 * Command state machine for 'other' cpus.
634 if (CPUMASK_TESTBIT(info->mask, cpu)) {
636 * Other cpu indicate to originator that they
639 ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
641 } else if (info->ptep &&
642 CPUMASK_TESTBIT(info->mask, n)) {
644 * Other cpu must wait for the originator (n)
645 * to complete its command if ptep is not NULL.
650 * Other cpu detects that the originator has
651 * completed its command, or there was no
654 * Now that the page table entry has changed,
655 * we can follow up with our own invalidation.
657 vm_offset_t va = info->va;
660 if (va == (vm_offset_t)-1 ||
661 info->npgs > MAX_INVAL_PAGES) {
664 for (npgs = info->npgs; npgs; --npgs) {
665 cpu_invlpg((void *)va);
669 ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
670 /* info invalid now */
671 /* loopme left alone */
673 } else if (CPUMASK_TESTBIT(info->mask, cpu)) {
675 * Originator is waiting for other cpus
677 if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
679 * Originator waits for other cpus to enter
680 * their loop (aka quiesce).
684 loops = ++info->xloops;
685 if ((loops & LOOPMASK) == 0) {
687 loopdebug("orig_waitC", info);
688 /* XXX recover from possible bug */
689 mdcpu->gd_xinvaltlb = 0;
690 smp_invlpg(&smp_active_mask);
695 * Originator executes operation and clears
696 * mask to allow other cpus to finish.
698 KKASSERT(info->mode != INVDONE);
699 if (info->mode == INVSTORE) {
701 info->opte = atomic_swap_long(info->ptep, info->npte);
703 ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
706 if (atomic_cmpset_long(info->ptep,
707 info->opte, info->npte)) {
713 ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
720 * Originator does not have to wait for the other
721 * cpus to finish. It clears its done bit. A new
722 * command will not be initiated by the originator
723 * until the other cpus have cleared their done bits
726 vm_offset_t va = info->va;
729 if (va == (vm_offset_t)-1 ||
730 info->npgs > MAX_INVAL_PAGES) {
733 for (npgs = info->npgs; npgs; --npgs) {
734 cpu_invlpg((void *)va);
741 /* leave loopme alone */
742 /* other cpus may still be finishing up */
743 /* can't race originator since that's us */
744 info->mode = INVDONE;
745 ATOMIC_CPUMASK_NANDBIT(info->done, cpu);