/*
 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/clock.h>
#define LOOPRECOVER			/* enable watchdog */

/*
 * Watchdog recovery interval = 1.0 / (1 << radix), or 1/16 second
 * for the initial watchdog.  If the initial watchdog fails, further
 * instances occur at 1/2 second intervals.
 *
 * The watchdog value is generous for two reasons.  First, because the
 * situation is not supposed to happen at all (but does), and second,
 * because VMs could be very slow at handling IPIs.
 */
#define LOOPRECOVER_RADIX1	4	/* initial recovery */
#define LOOPRECOVER_RADIX2	1	/* repeated recoveries */
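
/*
 * Illustrative arithmetic (not from the original source): with a 2.0 GHz
 * TSC, tsc_frequency >> LOOPRECOVER_RADIX1 is ~125M ticks (~1/16 second)
 * for the initial watchdog, and tsc_frequency >> LOOPRECOVER_RADIX2 is
 * ~1G ticks (~1/2 second) for repeated recoveries.
 */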
#define MAX_INVAL_PAGES		128

struct pmap_inval_info {
	vm_offset_t	va;
	pt_entry_t	*ptep;
	pt_entry_t	opte;
	pt_entry_t	npte;
	enum { INVDONE, INVSTORE, INVCMPSET } mode;
	int		success;
	int		npgs;
	cpumask_t	done;
	cpumask_t	mask;
	cpumask_t	sigmask;	/* LOOPRECOVER debugging */
	int		failed;		/* LOOPRECOVER debugging */
	long		tsc_target;	/* LOOPRECOVER watchdog deadline */
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t	invinfo[MAXCPU];
extern cpumask_t		smp_invmask;
#ifdef LOOPMASK_IN
extern cpumask_t		smp_in_mask;
#endif
extern cpumask_t		smp_smurf_mask;

static long pmap_inval_bulk_count;
static int pmap_inval_watchdog_print;	/* must always default off */

SYSCTL_LONG(_machdep, OID_AUTO, pmap_inval_bulk_count, CTLFLAG_RW,
	    &pmap_inval_bulk_count, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_watchdog_print, CTLFLAG_RW,
	   &pmap_inval_watchdog_print, 0, "");
void
pmap_inval_init(pmap_t pmap)
{
	cpulock_t olock;
	cpulock_t nlock;

	crit_enter_id("inval");

	if (pmap != &kernel_pmap) {
		for (;;) {
			olock = pmap->pm_active_lock;
			cpu_ccfence();
			nlock = olock | CPULOCK_EXCL;
			if (olock != nlock &&
			    atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
			cpu_pause();
		}
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
}
void
pmap_inval_done(pmap_t pmap)
{
	if (pmap != &kernel_pmap) {
		atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
	crit_exit_id("inval");
}
/*
 * Debugging and lost IPI recovery code.
 */
#ifdef LOOPRECOVER
static
int
loopwdog(struct pmap_inval_info *info)
{
	long tsc;

	tsc = rdtsc();
	if (info->tsc_target - tsc < 0 && tsc_frequency) {
		info->tsc_target = tsc + (tsc_frequency >> LOOPRECOVER_RADIX2);
		return 1;
	}
	return 0;
}
static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	/*
	 * Don't kprintf() anything if the pmap inval watchdog gets hit.
	 * DRM can cause an occasional watchdog hit (at least with a 1/16
	 * second watchdog), and attempting to kprintf to the KVM frame buffer
	 * from Xinvltlb, which ignores critical sections, can implode the
	 * system.
	 */
	if (pmap_inval_watchdog_print == 0)
		return;

	/* dummy locked bus cycle, synchronizes against other cpus */
	atomic_add_long(&smp_smurf_mask.ary[0], 0);
	kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx s=%08jx smurf=%08jx\n",
		msg, cpu, info->mode,
		info->mask.ary[0], info->done.ary[0],
		info->sigmask.ary[0], smp_smurf_mask.ary[0]);
	for (p = 0; p < ncpus; ++p)
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	kprintf("\n");
}
#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static
void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t tmp;

	/* info->mask must remain a subset of the original sigmask */
	tmp = info->mask;
	CPUMASK_ANDMASK(tmp, info->sigmask);
	if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
		kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
			file, line, info->sigmask.ary[0], info->mask.ary[0]);
	}
}

#else

#define CHECKSIGMASK(info)

#endif
/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * will be done fully synchronously with storing npte into *ptep and returning
 * opte.
 *
 * If ptep is NULL the operation will execute semi-synchronously.
 * ptep must be NULL if npgs > 1.
 */
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
	       pt_entry_t *ptep, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = &kernel_pmap;
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		/*
		 * Convert to invltlb if there are too many pages to
		 * invlpg on.
		 */
		if (npgs > MAX_INVAL_PAGES) {
			npgs = 0;
			va = (vm_offset_t)-1;
		}

		/*
		 * Invalidate the specified pages, handle invltlb if requested.
		 */
		while (npgs) {
			--npgs;
			if (ptep) {
				opte = atomic_swap_long(ptep, npte);
				++ptep;
			}
			if (va == (vm_offset_t)-1)
				break;
			cpu_invlpg((void *)va);
			va += PAGE_SIZE;
		}
		if (va == (vm_offset_t)-1)
			cpu_invltlb();
		pmap_inval_done(pmap);

		return opte;
	}
	/*
	 * We need a critical section to prevent getting preempted while
	 * we setup our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 *
	 * tsc_target is our watchdog timeout that will attempt to recover
	 * from a lost IPI.  Set to 1/16 second for now.
	 */
	info = &invinfo[cpu];
	info->tsc_target = rdtsc() + (tsc_frequency >> LOOPRECOVER_RADIX1);

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 *
	 * We do not have to disable interrupts here.  An Xinvltlb can occur
	 * at any time (even within a critical section), but it will not
	 * act on our command until we set our done bits.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("A", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->va = va;
	info->npgs = npgs;
	info->ptep = ptep;
	info->npte = npte;
	info->failed = 0;
	info->mode = INVSTORE;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	/*
	 * If ptep is NULL the operation can be semi-synchronous, which means
	 * we can improve performance by flagging and removing idle cpus
	 * (see the idleinvlclr function in mp_machdep.c).
	 *
	 * Typically, kernel page table operations are semi-synchronous.
	 */
	if (ptep == NULL)
		smp_smurf_idleinvlclr(&tmpmask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * Command may start executing the moment 'done' is initialized,
	 * disable the current cpu's interrupts to prevent the 'done' field
	 * from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit, but other cpus CAN start
	 * clearing their done bits).
	 */
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);

	rflags = read_rflags();
	cpu_disable_intr();
	ATOMIC_CPUMASK_COPY(info->done, tmpmask);
	/* execution can begin here due to races */

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);

	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be in their loop exiting concurrently with our
	 * cleanup.  They will not lose the bitmask they obtained before so
	 * we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return info->opte;
}
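
/*
 * Illustrative use (a sketch, not code from this file): replace a pte and
 * retrieve the old contents across all cpus using the pmap:
 *
 *	opte = pmap_inval_smp(pmap, va, 1, ptep, npte);
 *
 * or issue a pure TLB shootdown with no pte modification (this is the
 * form pmap_inval_bulk_flush() uses below):
 *
 *	pmap_inval_smp(pmap, (vm_offset_t)-1, 1, NULL, 0);
 */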
/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
		      pt_entry_t opte, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int success;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = &kernel_pmap;
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		if (atomic_cmpset_long(ptep, opte, npte)) {
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
			pmap_inval_done(pmap);
			return 1;
		} else {
			pmap_inval_done(pmap);
			return 0;
		}
	}
	/*
	 * We need a critical section to prevent getting preempted while
	 * we setup our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing
	 * up a prior operation.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPRECOVER
		if (loopwdog(info)) {
			info->failed = 1;
			loopdebug("B", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->va = va;
	info->npgs = 1;			/* unused */
	info->ptep = ptep;
	info->npte = npte;
	info->opte = opte;
	info->failed = 0;
	info->mode = INVCMPSET;
	info->success = 0;

	tmpmask = pmap->pm_active;	/* volatile */
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * Command may start executing the moment 'done' is initialized,
	 * disable the current cpu's interrupts to prevent the 'done' field
	 * from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit).
	 */
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);

	rflags = read_rflags();
	cpu_disable_intr();
	ATOMIC_CPUMASK_COPY(info->done, tmpmask);

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	success = info->success;
	KKASSERT(info->mode == INVDONE);

	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return success;
}
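
/*
 * Illustrative caller pattern (a sketch, not code from this file): retry
 * until the compare-and-set invalidation wins against concurrent pte
 * modifications:
 *
 *	do {
 *		opte = *ptep;
 *		cpu_ccfence();
 *	} while (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte) == 0);
 */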
void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
	bulk->pmap = pmap;
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}
pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
		pt_entry_t *ptep, pt_entry_t npte)
{
	pt_entry_t pte;

	/*
	 * Degenerate case, localized or we don't care (e.g. because we
	 * are jacking the entire page table) or the pmap is not in-use
	 * by anyone.  No invalidations are done on any cpu.
	 */
	if (bulk == NULL) {
		pte = atomic_swap_long(ptep, npte);
		return pte;
	}

	/*
	 * If it isn't the kernel pmap we execute the operation synchronously
	 * on all cpus belonging to the pmap, which avoids concurrency bugs in
	 * the hw related to changing pte's out from under threads.
	 *
	 * Eventually I would like to implement streaming pmap invalidation
	 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
	 * systems.
	 */
	if (bulk->pmap != &kernel_pmap) {
		pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
		return pte;
	}

	/*
	 * This is the kernel_pmap.  All unmap operations presume that there
	 * are no other cpus accessing the addresses in question.  Implement
	 * the bulking algorithm: collect the required information and
	 * synchronize once at the end.
	 */
	pte = atomic_swap_long(ptep, npte);
	if (va == (vm_offset_t)-1) {
		bulk->va_beg = va;
	} else if (bulk->va_beg == bulk->va_end) {
		bulk->va_beg = va;
		bulk->va_end = va + PAGE_SIZE;
	} else if (va == bulk->va_end) {
		bulk->va_end = va + PAGE_SIZE;
	} else {
		/* discontiguous, force a full invltlb at flush time */
		bulk->va_beg = (vm_offset_t)-1;
		bulk->va_end = 0;
#if 0
		pmap_inval_bulk_flush(bulk);
		bulk->count = 1;
		if (va == (vm_offset_t)-1) {
			bulk->va_beg = va;
			bulk->va_end = 0;
		} else {
			bulk->va_beg = va;
			bulk->va_end = va + PAGE_SIZE;
		}
#endif
	}
	++bulk->count;
	return pte;
}
void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
	if (bulk == NULL)
		return;

	if (bulk->count > 0)
		pmap_inval_bulk_count += (bulk->count - 1);
	if (bulk->va_beg != bulk->va_end) {
		if (bulk->va_beg == (vm_offset_t)-1) {
			pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
		} else {
			long n;

			n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
			pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
		}
	}
	bulk->va_beg = 0;
	bulk->va_end = 0;
	bulk->count = 0;
}
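
/*
 * Illustrative use of the bulk API (a sketch; sva, eva and ptep are
 * hypothetical caller variables): batch kernel_pmap pte removals so that
 * a contiguous range collapses into a single shootdown at flush time:
 *
 *	pmap_inval_bulk_t bulk;
 *
 *	pmap_inval_bulk_init(&bulk, &kernel_pmap);
 *	for (va = sva; va < eva; va += PAGE_SIZE, ++ptep)
 *		pmap_inval_bulk(&bulk, va, ptep, 0);
 *	pmap_inval_bulk_flush(&bulk);
 */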
/*
 * Called with a critical section held and interrupts enabled.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t cpumask;

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		int n = BSFCPUMASK(cpumask);

#ifdef LOOPRECOVER
		KKASSERT(n >= 0 && n < MAXCPU);
#endif

		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt.  A fence is needed once we detect
		 * the (not) done bit.
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();
#ifdef LOOPRECOVER
		if (toolong) {
			kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
				cpu, n, info->done.ary[0], info->mask.ary[0],
				info->mode);
		}
#endif

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask bits
		 * are clear we can execute the operation on the original,
		 * then clear the mask and done bits on the originator.  The
		 * targets will then finish up their side and clear their
		 * done bits.
		 *
		 * The command is considered 100% done when all done bits have
		 * been cleared.
		 */
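		/*
		 * Handshake summary (illustrative restatement of the above),
		 * with originator O and target T:
		 *
		 *	O: sets mask/done bits, issues Xinvltlb to targets
		 *	T: clears its bit in info->mask (quiesces)
		 *	O: once only its own mask bit remains, executes the
		 *	   pte operation and clears its mask bit
		 *	O: clears its done bit; command done from O's side
		 *	T: performs its local invalidation, clears its done
		 *	   bit
		 */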
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * Other cpus indicate to the originator that
				 * they are quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (info->ptep &&
				   CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * Other cpu must wait for the originator (n)
				 * to complete its command if ptep is not NULL.
				 */
				loopme = 1;
			} else {
				/*
				 * Other cpu detects that the originator has
				 * completed its command, or there was no
				 * command.
				 *
				 * Now that the page table entry has changed,
				 * we can follow up with our own invalidation.
				 */
				vm_offset_t va = info->va;
				int npgs;

				if (va == (vm_offset_t)-1 ||
				    info->npgs > MAX_INVAL_PAGES) {
					cpu_invltlb();
				} else {
					for (npgs = info->npgs; npgs; --npgs) {
						cpu_invlpg((void *)va);
						va += PAGE_SIZE;
					}
				}
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 *
				 * If this bugs out the IPI may have been lost,
				 * try to reissue by resetting our own
				 * reentrancy bit and clearing the smurf mask
				 * for the cpus that did not respond, then
				 * reissuing the IPI.
				 */
				loopme = 1;
#ifdef LOOPRECOVER
				if (loopwdog(info)) {
					info->failed = 1;
					loopdebug("C", info);
					/* XXX recover from possible bug */
					mdcpu->gd_xinvaltlb = 0;
					ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
								info->mask);
					cpu_disable_intr();
					smp_invlpg(&smp_active_mask);

					/*
					 * Force outer-loop retest of Xinvltlb
					 * requests (see mp_machdep.c).
					 */
					mdcpu->gd_xinvaltlb = 2;
					cpu_enable_intr();
				}
#endif
			} else {
				/*
				 * Originator executes operation and clears
				 * mask to allow other cpus to finish.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					if (info->ptep) {
						info->opte =
						    atomic_swap_long(info->ptep,
								     info->npte);
					}
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				} else {
					if (atomic_cmpset_long(info->ptep,
							       info->opte,
							       info->npte)) {
						info->success = 1;
					} else {
						info->success = 0;
					}
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				}
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			vm_offset_t va = info->va;
			int npgs;

			if (va == (vm_offset_t)-1 ||
			    info->npgs > MAX_INVAL_PAGES) {
				cpu_invltlb();
			} else {
				for (npgs = info->npgs; npgs; --npgs) {
					cpu_invlpg((void *)va);
					va += PAGE_SIZE;
				}
			}

			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
		}
	}
	return loopme;
}