/*
 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */
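/*
 * To illustrate the first hazard, consider this interleaving (a
 * hypothetical sketch, not code from this file), where cpu1 has the
 * pmap active and its MMU performs a hardware TLB writeback to set
 * the Modified bit while cpu0 edits the same pte:
 *
 *	cpu0 (modifier)			cpu1 (pmap active)
 *	--------------------------	--------------------------
 *	opte = *ptep;			(accesses the mapping)
 *					MMU sets Modified bit in *ptep
 *	*ptep = npte;			stale TLB entry still live
 *	(Modified bit update lost)	(uses old translation)
 *
 * The IPI-based protocol below quiesces the other cpus first, so the
 * pte modification and each cpu's invlpg cannot race the hardware.
 */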
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>

#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#define LOOPMASK	(/* 32 * */ 16 * 128 * 1024 - 1)
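/*
 * Arithmetic note: 16 * 128 * 1024 = 2^21 (2097152), so LOOPMASK is
 * 0x1FFFFF and the (loops & LOOPMASK) == 0 watchdog tests below fire
 * once every ~2 million spin iterations.  Restoring the commented-out
 * 32 * factor would stretch that to 2^26 iterations.
 */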
struct pmap_inval_info {
	vm_offset_t	va;
	pt_entry_t	*ptep;
	pt_entry_t	opte, npte;
	enum { INVDONE, INVSTORE, INVCMPSET } mode;
	int		success;
	long		xloops;
	cpumask_t	done, mask, sigmask;
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t	invinfo[MAXCPU];
extern cpumask_t		smp_invmask;
extern cpumask_t		smp_in_mask;
extern cpumask_t		smp_smurf_mask;
static void
pmap_inval_init(pmap_t pmap)
{
	cpulock_t olock, nlock;

	crit_enter_id("inval");
	if (pmap != &kernel_pmap) {
		for (;;) {
			olock = pmap->pm_active_lock;
			cpu_ccfence();
			nlock = olock | CPULOCK_EXCL;
			if (olock != nlock &&
			    atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock))
				break;
			cpu_pause();
		}
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
}
static void
pmap_inval_done(pmap_t pmap)
{
	if (pmap != &kernel_pmap) {
		atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
	crit_exit_id("inval");
}
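/*
 * Minimal usage sketch for the two brackets above (hypothetical caller,
 * not code from this file): every invalidation is bracketed by init/done,
 * which hold the pmap's CPULOCK_EXCL interlock and bump pm_invgen on both
 * edges so consumers of pm_invgen can detect that an invalidation ran:
 *
 *	pmap_inval_init(pmap);
 *	... atomically update the pte(s) and invalidate the TLB ...
 *	pmap_inval_done(pmap);
 */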
/*
 * API function - invalidate the pte at (va) and replace *ptep with
 * npte atomically across the pmap's active cpus.
 *
 * This is a holy mess.
 *
 * Returns the previous contents of *ptep.
 */
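/*
 * Hypothetical caller sketch (for illustration only; the names m and
 * PG_M_IDX are assumptions following the x86_64 pmap conventions, not
 * taken from this file): tear down a mapping and harvest the modified
 * bit from the old pte:
 *
 *	pt_entry_t opte;
 *
 *	opte = pmap_inval_smp(pmap, va, ptep, 0);
 *	if (opte & pmap->pmap_bits[PG_M_IDX])
 *		vm_page_dirty(m);
 */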
static void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	cpu_lfence();
	atomic_add_long(&smp_smurf_mask.ary[0], 0);	/* cpu memory fence */
	kprintf("%s %d mode=%d m=%08jx d=%08jx s=%08jx smurf=%08jx\n",
		msg, cpu, info->mode,
		info->mask.ary[0],
		info->done.ary[0],
		info->sigmask.ary[0],
		smp_smurf_mask.ary[0]);
	for (p = 0; p < ncpus; ++p)
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	kprintf("\n");
}
#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t tmp;

	tmp = info->mask;
	CPUMASK_ANDMASK(tmp, info->sigmask);
	if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
		kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
			file, line, info->sigmask.ary[0],
			info->mask.ary[0]);
	}
}
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
	       pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;
	long loops;

	/*
	 * Shortcut single-cpu case if possible.
	 */
	pmap_inval_init(pmap);
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		for (;;) {
			opte = *ptep;
			cpu_ccfence();
			if (atomic_cmpset_long(ptep, opte, npte)) {
				if (va == (vm_offset_t)-1)
					cpu_invltlb();
				else
					cpu_invlpg((void *)va);
				pmap_inval_done(pmap);
				return opte;
			}
			cpu_pause();
		}
	}

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 */
	info = &invinfo[cpu];
	while (CPUMASK_TESTNZERO(info->done)) {
		cpu_pause();
		loops = ++info->xloops;
		if ((loops & LOOPMASK) == 0) {
			loopdebug("orig_waitA", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must disable interrupts to prevent an Xinvltlb (which ignores
	 * critical sections) from trying to execute our command before we
	 * have managed to send any IPIs to the target cpus.
	 */
	rflags = read_rflags();
	cpu_disable_intr();

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	if (CPUMASK_TESTBIT(smp_invmask, cpu)) {
		kprintf("bcpu %d already in\n", cpu);
	}
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->va = va;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = 0;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mode = INVSTORE;

	/*
	 * The command may start executing the moment 'done' is
	 * initialized.  Interrupts are disabled on the current cpu to
	 * keep the 'done' field from changing out from under us (other
	 * cpus can't clear done bits until the originating cpu clears
	 * its mask bit, but other cpus CAN start clearing their done
	 * bits as soon as the originator finishes).
	 */
	info->mask = tmpmask;
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
	cpu_ccfence();
	info->done = tmpmask;

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	opte = info->opte;
	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be in their loop exiting concurrently with our
	 * cleanup.  They will not lose the bitmask they obtained before so
	 * we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return opte;
}
/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (caller typically retries).
 */
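/*
 * Hypothetical caller sketch (not code from this file): the cmpset form
 * is typically driven from a re-read/retry loop, e.g. to set a pte bit;
 * PG_RW_IDX here is an assumption, for illustration only:
 *
 *	do {
 *		opte = *ptep;
 *		cpu_ccfence();
 *		npte = opte | pmap->pmap_bits[PG_RW_IDX];
 *	} while (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte) == 0);
 */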
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
		      pt_entry_t opte, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int success;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;
	long loops;

	/*
	 * Shortcut single-cpu case if possible.
	 */
	pmap_inval_init(pmap);
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		if (atomic_cmpset_long(ptep, opte, npte)) {
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
			pmap_inval_done(pmap);
			return 1;
		}
		pmap_inval_done(pmap);
		return 0;
	}

	/*
	 * We must wait for other cpus which may still be finishing
	 * up a prior operation.
	 */
	info = &invinfo[cpu];
	while (CPUMASK_TESTNZERO(info->done)) {
		cpu_pause();
		loops = ++info->xloops;
		if ((loops & LOOPMASK) == 0) {
			loopdebug("orig_waitB", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must disable interrupts to prevent an Xinvltlb (which ignores
	 * critical sections) from trying to execute our command before we
	 * have managed to send any IPIs to the target cpus.
	 */
	rflags = read_rflags();
	cpu_disable_intr();

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	if (CPUMASK_TESTBIT(smp_invmask, cpu)) {
		kprintf("acpu %d already in\n", cpu);
	}
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->va = va;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = opte;
	info->success = 0;

	tmpmask = pmap->pm_active;	/* volatile */
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mode = INVCMPSET;		/* initialize last */

	/*
	 * The command may start executing the moment 'done' is
	 * initialized.  Interrupts are disabled on the current cpu to
	 * keep the 'done' field from changing out from under us (other
	 * cpus can't clear done bits until the originating cpu clears
	 * its mask bit).
	 */
	info->mask = tmpmask;
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
	cpu_ccfence();
	info->done = tmpmask;

	/*
	 * Calling smp_invlpg() will issue the IPIs to XINVLTLB (which can
	 * execute even from inside a critical section), and will call us
	 * back via pmap_inval_intr() with interrupts disabled.
	 *
	 * Unlike smp_invltlb(), this interface causes all cpus to stay
	 * inside XINVLTLB until the whole thing is done.  When our cpu
	 * detects that the whole thing is done we execute the requested
	 * operation and return.
	 */
	smp_invlpg(&tmpmask);
	success = info->success;
	KKASSERT(info->mode == INVDONE);

	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return success;
}
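/*
 * Sketch of the bitmask handshake that pmap_inval_intr() implements,
 * for an originator O and targets T1/T2 (an informal summary of the
 * state machine below, not normative):
 *
 *	1. O sets mask = sigmask = done = {O,T1,T2} and IPIs the targets.
 *	2. T1/T2 enter pmap_inval_intr() and clear their bits in mask,
 *	   indicating that they have quiesced, then spin on O's mask bit.
 *	3. O sees mask == {O}, performs the pte store/cmpset, then clears
 *	   its own mask and done bits.
 *	4. T1/T2 see O's mask bit clear, issue their local invltlb/invlpg
 *	   and clear their done bits; when done == 0 the command is over.
 */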
/*
 * Called with interrupts hard-disabled.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu = gd->gd_cpuid;
	long loops;
	cpumask_t cpumask;

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpu_ccfence();
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		int n = BSFCPUMASK(cpumask);

		KKASSERT(n >= 0 && n < MAXCPU);
		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt.  A fence is needed once we detect
		 * the (not) done bit.
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask
		 * bits are clear we can execute the operation on the
		 * originator, then clear the mask and done bits on the
		 * originator.  The targets will then finish up their side
		 * and clear their done bits.
		 *
		 * The command is considered 100% done when all done bits
		 * have been cleared.
		 */
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * Other cpus indicate to the originator
				 * that they have quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * Other cpu waits for the originator (n) to
				 * complete the command.
				 */
				loopme = 1;
			} else {
				/*
				 * Other cpu detects that the originator has
				 * completed its command.  Now that the page
				 * table entry has changed, we can follow up
				 * with our own invalidation.
				 */
				if (info->va == (vm_offset_t)-1)
					cpu_invltlb();
				else
					cpu_invlpg((void *)info->va);
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 */
				loopme = 1;
				loops = ++info->xloops;
				if ((loops & LOOPMASK) == 0) {
					loopdebug("orig_waitC", info);
					/* XXX recover from possible bug */
					mdcpu->gd_xinvaltlb = 0;
					smp_invlpg(&smp_active_mask);
				}
			} else {
				/*
				 * Originator executes operation and clears
				 * mask to allow other cpus to finish.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					info->opte = *info->ptep;
					cpu_ccfence();
					if (atomic_cmpset_long(info->ptep,
						    info->opte, info->npte)) {
						CHECKSIGMASK(info);
						ATOMIC_CPUMASK_NANDBIT(
						    info->mask, cpu);
					}
					/* else will loop/retry */
				} else {
					if (atomic_cmpset_long(info->ptep,
						    info->opte, info->npte))
						info->success = 1;
					else
						info->success = 0;
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask,
							       cpu);
				}
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			if (info->va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)info->va);

			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
		}
	}
	return loopme;
}