kernel - Fix some rare pmap races in i386 and x86_64.
author    Matthew Dillon <dillon@apollo.backplane.com>
          Mon, 22 Feb 2010 02:23:13 +0000 (18:23 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
          Mon, 22 Feb 2010 02:23:13 +0000 (18:23 -0800)
* Adjust pmap_inval_init() to enter a critical section and add
  a new pmap_inval_done() function which flushes pending
  invalidations and exits the critical section.

  It was possible for an interrupt or other preemptive action to
  come along during a pmap operation and issue its own pmap operation,
  potentially leading to races which corrupt the pmap.

  This case was tested and could actually occur, though the damage (if
  any) is unknown.  x86_64 machines have had a long-standing and
  difficult-to-reproduce bug where a program would sometimes seg-fault
  for no reason.  It is unknown whether this change fixes that bug.
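
  As a rough sketch, the resulting call pattern around a PTE update
  looks like this (simplified from the pmap_kenter() changes in the
  diff below; the PTE contents are elided):

	pmap_inval_init(&info);			/* enter critical section */
	pmap_inval_interlock(&info, pmap, va);	/* lock pm_active, queue cpusync */
	*pte = npte;				/* modify the PTE while interlocked */
	pmap_inval_deinterlock(&info, pmap);	/* release the pm_active lock bit */
	pmap_inval_done(&info);			/* flush and exit critical section */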

* Interlock the pmap structure when invalidating pages using a bit
  in the pm_active field.

  Check for the interlock in swtch.s when switching into threads
  and print a warning if the lock bit is found set.

  It was possible for one cpu to initiate a pmap-modifying operation
  while another cpu switched into a thread using the pmap the first
  cpu was in the middle of modifying.  The case is extremely rare but
  can occur if the cpu doing the modifying operation receives an SMI
  interrupt, stalling it long enough for the other cpu to switch
  into the thread and resume running in userspace.
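
  The lock bit is acquired with a compare-and-set loop; roughly, as
  taken from the pmap_inval_interlock() code in the diff below:

	for (;;) {
		oactive = pmap->pm_active & ~CPUMASK_LOCK;
		nactive = oactive | CPUMASK_LOCK;
		if (atomic_cmpset_int(&pmap->pm_active, oactive, nactive))
			break;			/* lock bit acquired */
		crit_enter();
		lwkt_process_ipiq();		/* service IPIs while spinning */
		crit_exit();
	}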

* pmap_protect() assumed no races when clearing PG_RW and PG_M due
  to the pmap_inval operations it runs.  This should in fact be
  true with the above fixes.  However, the rest of the pmap code
  uses atomic operations, so adjust pmap_protect() to use atomic
  operations as well.
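
  The conversion follows the compare-and-set retry pattern already
  used elsewhere in the pmap code; schematically (i386 flavor from
  the diff below, with the PG_A/PG_M accounting elided):

again:
	pbits = ptbase[sindex];			/* snapshot the PTE */
	cbits = pbits & ~PG_RW;			/* desired new contents */
	if (pbits != cbits &&
	    !atomic_cmpset_int(ptbase + sindex, pbits, cbits))
		goto again;			/* PTE changed underneath us, retry */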

17 files changed:
sys/platform/pc32/i386/genassym.c
sys/platform/pc32/i386/globals.s
sys/platform/pc32/i386/pmap.c
sys/platform/pc32/i386/pmap_inval.c
sys/platform/pc32/i386/swtch.s
sys/platform/pc32/include/pmap.h
sys/platform/pc32/include/pmap_inval.h
sys/platform/pc64/include/pmap.h
sys/platform/pc64/include/pmap_inval.h
sys/platform/pc64/x86_64/genassym.c
sys/platform/pc64/x86_64/global.s
sys/platform/pc64/x86_64/pmap.c
sys/platform/pc64/x86_64/pmap_inval.c
sys/platform/pc64/x86_64/swtch.s
sys/platform/vkernel/i386/genassym.c
sys/platform/vkernel/i386/global.s
sys/platform/vkernel/platform/pmap.c

sys/platform/pc32/i386/genassym.c
index adeef1e..65e5c4d 100644 (file)
@@ -73,6 +73,7 @@
 #include <machine/sigframe.h>
 #include <machine/vm86.h>
 #include <machine/globaldata.h>
+#include <machine/pmap.h>
 
 ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
 ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
@@ -101,6 +102,9 @@ ASSYM(TD_SAVEFPU, offsetof(struct thread, td_mach) + offsetof(struct md_thread,
 
 ASSYM(TDPRI_CRIT, TDPRI_CRIT);
 ASSYM(TDPRI_INT_SUPPORT, TDPRI_INT_SUPPORT);
+#ifdef SMP
+ASSYM(CPUMASK_LOCK, CPUMASK_LOCK);
+#endif
 
 ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap));
 ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall));
@@ -180,6 +184,7 @@ ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend));
 
 ASSYM(GD_CURTHREAD, offsetof(struct mdglobaldata, mi.gd_curthread));
 ASSYM(GD_CPUID, offsetof(struct mdglobaldata, mi.gd_cpuid));
+ASSYM(GD_CPUMASK, offsetof(struct mdglobaldata, mi.gd_cpumask));
 ASSYM(GD_CNT, offsetof(struct mdglobaldata, mi.gd_cnt));
 ASSYM(GD_PRIVATE_TSS, offsetof(struct mdglobaldata, gd_private_tss));
 ASSYM(GD_INTR_NESTING_LEVEL, offsetof(struct mdglobaldata, mi.gd_intr_nesting_level));

sys/platform/pc32/i386/globals.s
index 2c488d2..2aa2467 100644 (file)
@@ -73,7 +73,7 @@
         * The BSP version of these get setup in locore.s and pmap.c, while
         * the AP versions are setup in mp_machdep.c.
         */
-       .globl  gd_cpuid, gd_other_cpus
+       .globl  gd_cpuid, gd_cpumask, gd_other_cpus
        .globl  gd_ss_eflags, gd_intr_nesting_level
        .globl  gd_CMAP1, gd_CMAP2, gd_CMAP3, gd_PMAP1
        .globl  gd_CADDR1, gd_CADDR2, gd_CADDR3, gd_PADDR1
@@ -81,6 +81,7 @@
        .globl  gd_cnt, gd_private_tss
 
        .set    gd_cpuid,globaldata + GD_CPUID
+       .set    gd_cpumask,globaldata + GD_CPUMASK
        .set    gd_private_tss,globaldata + GD_PRIVATE_TSS
        .set    gd_other_cpus,globaldata + GD_OTHER_CPUS
        .set    gd_ss_eflags,globaldata + GD_SS_EFLAGS

sys/platform/pc32/i386/pmap.c
index 6c79724..2785d81 100644 (file)
@@ -362,7 +362,7 @@ pmap_bootstrap(vm_paddr_t firstaddr, vm_paddr_t loadaddr)
         */
        kernel_pmap.pm_pdir = (pd_entry_t *)(KERNBASE + (u_int)IdlePTD);
        kernel_pmap.pm_count = 1;
-       kernel_pmap.pm_active = (cpumask_t)-1;  /* don't allow deactivation */
+       kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
        TAILQ_INIT(&kernel_pmap.pm_pvlist);
        nkpt = NKPT;
 
@@ -750,9 +750,10 @@ pmap_kenter(vm_offset_t va, vm_paddr_t pa)
        pmap_inval_init(&info);
        npte = pa | PG_RW | PG_V | pgeflag;
        pte = (unsigned *)vtopte(va);
-       pmap_inval_add(&info, &kernel_pmap, va);
+       pmap_inval_interlock(&info, &kernel_pmap, va);
        *pte = npte;
-       pmap_inval_flush(&info);
+       pmap_inval_deinterlock(&info, &kernel_pmap);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -779,8 +780,9 @@ pmap_kenter_sync(vm_offset_t va)
        pmap_inval_info info;
 
        pmap_inval_init(&info);
-       pmap_inval_add(&info, &kernel_pmap, va);
-       pmap_inval_flush(&info);
+       pmap_inval_interlock(&info, &kernel_pmap, va);
+       pmap_inval_deinterlock(&info, &kernel_pmap);
+       pmap_inval_done(&info);
 }
 
 void
@@ -800,9 +802,10 @@ pmap_kremove(vm_offset_t va)
 
        pmap_inval_init(&info);
        pte = (unsigned *)vtopte(va);
-       pmap_inval_add(&info, &kernel_pmap, va);
+       pmap_inval_interlock(&info, &kernel_pmap, va);
        *pte = 0;
-       pmap_inval_flush(&info);
+       pmap_inval_deinterlock(&info, &kernel_pmap);
+       pmap_inval_done(&info);
 }
 
 void
@@ -1026,10 +1029,11 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_page_t m, pmap_inval_info_t info)
                 *       entry.
                 */
                vm_page_busy(m);
-               pmap_inval_add(info, pmap, -1);
+               pmap_inval_interlock(info, pmap, -1);
                KKASSERT(pmap->pm_pdir[m->pindex]);
                pmap->pm_pdir[m->pindex] = 0;
                pmap->pm_cached = 0;
+               pmap_inval_deinterlock(info, pmap);
 
                KKASSERT(pmap->pm_stats.resident_count > 0);
                --pmap->pm_stats.resident_count;
@@ -1734,12 +1738,13 @@ pmap_remove_pte(struct pmap *pmap, unsigned *ptq, vm_offset_t va,
        vm_page_t m;
 
        ptbase_assert(pmap);
-       pmap_inval_add(info, pmap, va);
+       pmap_inval_interlock(info, pmap, va);
        ptbase_assert(pmap);
        oldpte = loadandclear(ptq);
-       KKASSERT(oldpte);
        if (oldpte & PG_W)
                pmap->pm_stats.wired_count -= 1;
+       pmap_inval_deinterlock(info, pmap);
+       KKASSERT(oldpte);
        /*
         * Machines that don't support invlpg, also don't support
         * PG_G.  XXX PG_G is disabled for SMP so don't worry about
@@ -1834,7 +1839,7 @@ pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
        if (((sva + PAGE_SIZE) == eva) && 
                (((unsigned) pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
                pmap_remove_page(pmap, sva, &info);
-               pmap_inval_flush(&info);
+               pmap_inval_done(&info);
                return;
        }
 
@@ -1857,10 +1862,11 @@ pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
 
                pdirindex = sindex / NPDEPG;
                if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
-                       pmap_inval_add(&info, pmap, -1);
+                       pmap_inval_interlock(&info, pmap, -1);
                        pmap->pm_pdir[pdirindex] = 0;
                        pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
                        pmap->pm_cached = 0;
+                       pmap_inval_deinterlock(&info, pmap);
                        continue;
                }
 
@@ -1895,7 +1901,7 @@ pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
                                break;
                }
        }
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -1918,22 +1924,21 @@ pmap_remove_all(vm_page_t m)
                return;
 
        pmap_inval_init(&info);
-       crit_enter();
        while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
                KKASSERT(pv->pv_pmap->pm_stats.resident_count > 0);
                --pv->pv_pmap->pm_stats.resident_count;
 
                pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
-               pmap_inval_add(&info, pv->pv_pmap, pv->pv_va);
+               pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va);
                tpte = loadandclear(pte);
-#ifdef PMAP_DEBUG
-               KKASSERT(PHYS_TO_VM_PAGE(tpte) == m);
-#endif
                if (tpte & PG_W)
                        pv->pv_pmap->pm_stats.wired_count--;
-
+               pmap_inval_deinterlock(&info, pv->pv_pmap);
                if (tpte & PG_A)
                        vm_page_flag_set(m, PG_REFERENCED);
+#ifdef PMAP_DEBUG
+               KKASSERT(PHYS_TO_VM_PAGE(tpte) == m);
+#endif
 
                /*
                 * Update the vm_page_t clean and reference bits.
@@ -1961,9 +1966,8 @@ pmap_remove_all(vm_page_t m)
                pmap_unuse_pt(pv->pv_pmap, pv->pv_va, pv->pv_ptem, &info);
                free_pv_entry(pv);
        }
-       crit_exit();
        KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -2002,16 +2006,16 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
        eindex = i386_btop(eva);
 
        for (; sindex < eindex; sindex = pdnxt) {
-
                unsigned pdirindex;
 
                pdnxt = ((sindex + NPTEPG) & ~(NPTEPG - 1));
 
                pdirindex = sindex / NPDEPG;
                if (((ptpaddr = (unsigned) pmap->pm_pdir[pdirindex]) & PG_PS) != 0) {
-                       pmap_inval_add(&info, pmap, -1);
+                       pmap_inval_interlock(&info, pmap, -1);
                        pmap->pm_pdir[pdirindex] &= ~(PG_M|PG_RW);
                        pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
+                       pmap_inval_deinterlock(&info, pmap);
                        continue;
                }
 
@@ -2027,44 +2031,46 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
                }
 
                for (; sindex != pdnxt; sindex++) {
-
                        unsigned pbits;
+                       unsigned cbits;
                        vm_page_t m;
 
                        /*
                         * XXX non-optimal.  Note also that there can be
                         * no pmap_inval_flush() calls until after we modify
                         * ptbase[sindex] (or otherwise we have to do another
-                        * pmap_inval_add() call).
+                        * pmap_inval_interlock() call).
                         */
-                       pmap_inval_add(&info, pmap, i386_ptob(sindex));
+                       pmap_inval_interlock(&info, pmap, i386_ptob(sindex));
+again:
                        pbits = ptbase[sindex];
+                       cbits = pbits;
 
                        if (pbits & PG_MANAGED) {
                                m = NULL;
                                if (pbits & PG_A) {
                                        m = PHYS_TO_VM_PAGE(pbits);
                                        vm_page_flag_set(m, PG_REFERENCED);
-                                       pbits &= ~PG_A;
+                                       cbits &= ~PG_A;
                                }
                                if (pbits & PG_M) {
                                        if (pmap_track_modified(i386_ptob(sindex))) {
                                                if (m == NULL)
                                                        m = PHYS_TO_VM_PAGE(pbits);
                                                vm_page_dirty(m);
-                                               pbits &= ~PG_M;
+                                               cbits &= ~PG_M;
                                        }
                                }
                        }
-
-                       pbits &= ~PG_RW;
-
-                       if (pbits != ptbase[sindex]) {
-                               ptbase[sindex] = pbits;
+                       cbits &= ~PG_RW;
+                       if (pbits != cbits &&
+                           !atomic_cmpset_int(ptbase + sindex, pbits, cbits)) {
+                               goto again;
                        }
+                       pmap_inval_deinterlock(&info, pmap);
                }
        }
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -2254,16 +2260,17 @@ validate:
         * to update the pte.
         */
        if ((origpte & ~(PG_M|PG_A)) != newpte) {
-               pmap_inval_add(&info, pmap, va);
+               pmap_inval_interlock(&info, pmap, va);
                ptbase_assert(pmap);
                KKASSERT(*pte == 0 ||
                         (*pte & PG_FRAME) == (newpte & PG_FRAME));
                *pte = newpte | PG_A;
+               pmap_inval_deinterlock(&info, pmap);
                if (newpte & PG_RW)
                        vm_page_flag_set(m, PG_WRITEABLE);
        }
        KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED));
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -2350,6 +2357,7 @@ pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
                        pmap_unwire_pte_hold(pmap, mpte, &info);
                pa = VM_PAGE_TO_PHYS(m);
                KKASSERT(((*pte ^ pa) & PG_FRAME) == 0);
+               pmap_inval_done(&info);
                return;
        }
 
@@ -2376,7 +2384,7 @@ pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
        else
                *pte = pa | PG_V | PG_U | PG_MANAGED;
 /*     pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -2700,7 +2708,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
        }
 failed:
        crit_exit();
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 #endif
 }      
 
@@ -2912,7 +2920,6 @@ pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
                iscurrentpmap = 0;
 
        pmap_inval_init(&info);
-       crit_enter();
        for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
                if (pv->pv_va >= eva || pv->pv_va < sva) {
                        npv = TAILQ_NEXT(pv, pv_plist);
@@ -2926,19 +2933,20 @@ pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
                else
                        pte = pmap_pte_quick(pmap, pv->pv_va);
                KKASSERT(*pte);
-               if (pmap->pm_active)
-                       pmap_inval_add(&info, pmap, pv->pv_va);
+               pmap_inval_interlock(&info, pmap, pv->pv_va);
 
                /*
                 * We cannot remove wired pages from a process' mapping
                 * at this time
                 */
                if (*pte & PG_W) {
+                       pmap_inval_deinterlock(&info, pmap);
                        npv = TAILQ_NEXT(pv, pv_plist);
                        continue;
                }
                KKASSERT(*pte);
                tpte = loadandclear(pte);
+               pmap_inval_deinterlock(&info, pmap);
 
                m = PHYS_TO_VM_PAGE(tpte);
                test_m_maps_pv(m, pv);
@@ -2981,8 +2989,7 @@ pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
                        npv = TAILQ_FIRST(&pmap->pm_pvlist);
                }
        }
-       pmap_inval_flush(&info);
-       crit_exit();
+       pmap_inval_done(&info);
 }
 
 /*
@@ -3047,7 +3054,6 @@ pmap_clearbit(vm_page_t m, int bit)
                return;
 
        pmap_inval_init(&info);
-       crit_enter();
 
        /*
         * Loop over all current mappings setting/clearing as appropos If
@@ -3081,7 +3087,7 @@ pmap_clearbit(vm_page_t m, int bit)
                 * entry when/if it needs to resynchronize the Modify bit.
                 */
                if (bit & PG_RW)
-                       pmap_inval_add(&info, pv->pv_pmap, pv->pv_va);
+                       pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va);
                pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
 again:
                pbits = *pte;
@@ -3114,9 +3120,10 @@ again:
                                atomic_clear_int(pte, bit);
                        }
                }
+               if (bit & PG_RW)
+                       pmap_inval_deinterlock(&info, pv->pv_pmap);
        }
-       pmap_inval_flush(&info);
-       crit_exit();
+       pmap_inval_done(&info);
 }
 
 /*
@@ -3414,7 +3421,7 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
                if (curthread->td_lwp == lp) {
                        pmap = vmspace_pmap(newvm);
 #if defined(SMP)
-                       atomic_set_int(&pmap->pm_active, 1 << mycpu->gd_cpuid);
+                       atomic_set_int(&pmap->pm_active, mycpu->gd_cpumask);
 #else
                        pmap->pm_active |= 1;
 #endif
@@ -3425,8 +3432,7 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
                        load_cr3(curthread->td_pcb->pcb_cr3);
                        pmap = vmspace_pmap(oldvm);
 #if defined(SMP)
-                       atomic_clear_int(&pmap->pm_active,
-                                         1 << mycpu->gd_cpuid);
+                       atomic_clear_int(&pmap->pm_active, mycpu->gd_cpumask);
 #else
                        pmap->pm_active &= ~1;
 #endif
@@ -3435,6 +3441,24 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
        crit_exit();
 }
 
+/*
+ * Called when switching to a locked pmap
+ */
+void
+pmap_interlock_wait(struct vmspace *vm)
+{
+       struct pmap *pmap = &vm->vm_pmap;
+
+       if (pmap->pm_active & CPUMASK_LOCK) {
+               kprintf("Warning: pmap_interlock %08x\n", pmap->pm_active);
+               while (pmap->pm_active & CPUMASK_LOCK) {
+                       cpu_pause();
+                       cpu_ccfence();
+                       lwkt_process_ipiq();
+               }
+       }
+}
+
 vm_offset_t
 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
 {

sys/platform/pc32/i386/pmap_inval.c
index b6b68f3..9b26855 100644 (file)
@@ -87,24 +87,40 @@ void
 pmap_inval_init(pmap_inval_info_t info)
 {
     info->pir_flags = 0;
+    crit_enter_id("inval");
 }
 
 /*
  * Add a (pmap, va) pair to the invalidation list and protect access
  * as appropriate.
+ *
+ * CPUMASK_LOCK is used to interlock thread switchins
  */
 void
-pmap_inval_add(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
+pmap_inval_interlock(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
 {
 #ifdef SMP
+    cpumask_t oactive;
+    cpumask_t nactive;
+
+    for (;;) {
+       oactive = pmap->pm_active & ~CPUMASK_LOCK;
+       nactive = oactive | CPUMASK_LOCK;
+       if (atomic_cmpset_int(&pmap->pm_active, oactive, nactive))
+               break;
+       crit_enter();
+       lwkt_process_ipiq();
+       crit_exit();
+    }
+
     if ((info->pir_flags & PIRF_CPUSYNC) == 0) {
        info->pir_flags |= PIRF_CPUSYNC;
        info->pir_cpusync.cs_run_func = NULL;
        info->pir_cpusync.cs_fin1_func = NULL;
        info->pir_cpusync.cs_fin2_func = NULL;
-       lwkt_cpusync_start(pmap->pm_active, &info->pir_cpusync);
+       lwkt_cpusync_start(oactive, &info->pir_cpusync);
     } else if (pmap->pm_active & ~info->pir_cpusync.cs_mask) {
-       lwkt_cpusync_add(pmap->pm_active, &info->pir_cpusync);
+       lwkt_cpusync_add(oactive, &info->pir_cpusync);
     }
 #else
     if (pmap->pm_active == 0)
@@ -131,6 +147,14 @@ pmap_inval_add(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
     }
 }
 
+void
+pmap_inval_deinterlock(pmap_inval_info_t info, pmap_t pmap)
+{
+#ifdef SMP
+    atomic_clear_int(&pmap->pm_active, CPUMASK_LOCK);
+#endif
+}
+
 /*
  * Synchronize changes with target cpus.
  */
@@ -149,3 +173,10 @@ pmap_inval_flush(pmap_inval_info_t info)
     info->pir_flags = 0;
 }
 
+void
+pmap_inval_done(pmap_inval_info_t info)
+{
+    pmap_inval_flush(info);
+    crit_exit_id("inval");
+}
+

sys/platform/pc32/i386/swtch.s
index 11812b8..91a66e0 100644 (file)
@@ -273,7 +273,6 @@ ENTRY(cpu_exit_switch)
 
 ENTRY(cpu_heavy_restore)
        popfl
-       movl    TD_PCB(%eax),%edx               /* EDX = PCB */
        movl    TD_LWP(%eax),%ecx
 
 #if defined(SWTCH_OPTIM_STATS)
@@ -285,8 +284,18 @@ ENTRY(cpu_heavy_restore)
         * pmap (remember, we do not hold the MP lock in the switch code).
         */
        movl    LWP_VMSPACE(%ecx), %ecx         /* ECX = vmspace */
-       movl    PCPU(cpuid), %esi
-       MPLOCKED btsl   %esi, VM_PMAP+PM_ACTIVE(%ecx)
+       movl    PCPU(cpumask), %esi
+       MPLOCKED orl    %esi, VM_PMAP+PM_ACTIVE(%ecx)
+#ifdef SMP
+       testl   $CPUMASK_LOCK,VM_PMAP+PM_ACTIVE(%ecx)
+       jz      1f
+       pushl   %eax
+       pushl   %ecx
+       call    pmap_interlock_wait
+       popl    %ecx
+       popl    %eax
+1:
+#endif
 
        /*
         * Restore the MMU address space.  If it is the same as the last
@@ -294,6 +303,7 @@ ENTRY(cpu_heavy_restore)
         * YYY which naturally also means that the PM_ACTIVE bit had better
         * already have been set before we set it above, check? YYY
         */
+       movl    TD_PCB(%eax),%edx               /* EDX = PCB */
        movl    %cr3,%esi
        movl    PCB_CR3(%edx),%ecx
        cmpl    %esi,%ecx

sys/platform/pc32/include/pmap.h
index 9b635e7..7973fcd 100644 (file)
@@ -207,6 +207,7 @@ pmap_kextract(vm_offset_t va)
 struct pv_entry;
 struct vm_page;
 struct vm_object;
+struct vmspace;
 
 struct md_page {
        int pv_list_count;
@@ -245,6 +246,8 @@ struct pmap {
 
 #define pmap_resident_count(pmap) (pmap)->pm_stats.resident_count
 
+#define CPUMASK_LOCK            (1 << SMP_MAXCPU)
+
 typedef struct pmap    *pmap_t;
 
 #ifdef _KERNEL
@@ -287,7 +290,8 @@ extern vm_offset_t clean_eva;
 extern vm_offset_t clean_sva;
 extern char *ptvmmap;          /* poor name! */
 
-void   pmap_bootstrap ( vm_paddr_t, vm_paddr_t);
+void   pmap_interlock_wait (struct vmspace *);
+void   pmap_bootstrap (vm_paddr_t, vm_paddr_t);
 void   *pmap_mapdev (vm_paddr_t, vm_size_t);
 void   pmap_unmapdev (vm_offset_t, vm_size_t);
 unsigned *pmap_pte (pmap_t, vm_offset_t) __pure2;

sys/platform/pc32/include/pmap_inval.h
index e8cd668..c4a18d4 100644 (file)
@@ -59,8 +59,10 @@ typedef pmap_inval_info *pmap_inval_info_t;
 #endif
 
 void pmap_inval_init(pmap_inval_info_t);
-void pmap_inval_add(pmap_inval_info_t, pmap_t, vm_offset_t);
+void pmap_inval_interlock(pmap_inval_info_t, pmap_t, vm_offset_t);
+void pmap_inval_deinterlock(pmap_inval_info_t, pmap_t);
 void pmap_inval_flush(pmap_inval_info_t);
+void pmap_inval_done(pmap_inval_info_t);
 
 #endif
 
sys/platform/pc64/include/pmap.h
index af2a9f6..e85eabb 100644 (file)
@@ -178,6 +178,7 @@ pte_store(pt_entry_t *ptep, pt_entry_t pte)
 struct pv_entry;
 struct vm_page;
 struct vm_object;
+struct vmspace;
 
 struct md_page {
        int pv_list_count;
@@ -213,6 +214,8 @@ struct pmap {
        int                     pm_generation;  /* detect pvlist deletions */
 };
 
+#define CPUMASK_LOCK           (1 << SMP_MAXCPU)
+
 #define pmap_resident_count(pmap) (pmap)->pm_stats.resident_count
 
 typedef struct pmap    *pmap_t;
@@ -253,7 +256,8 @@ extern vm_offset_t clean_sva;
 extern char *ptvmmap;          /* poor name! */
 
 void   init_paging(vm_paddr_t *);
-void   pmap_bootstrap ( vm_paddr_t *);
+void   pmap_interlock_wait (struct vmspace *);
+void   pmap_bootstrap (vm_paddr_t *);
 void   *pmap_mapdev (vm_paddr_t, vm_size_t);
 void   *pmap_mapdev_uncacheable(vm_paddr_t, vm_size_t);
 void   pmap_unmapdev (vm_offset_t, vm_size_t);

sys/platform/pc64/include/pmap_inval.h
index b352051..c70dbb2 100644 (file)
@@ -59,8 +59,10 @@ typedef pmap_inval_info *pmap_inval_info_t;
 #endif
 
 void pmap_inval_init(pmap_inval_info_t);
-void pmap_inval_add(pmap_inval_info_t, pmap_t, vm_offset_t);
+void pmap_inval_interlock(pmap_inval_info_t, pmap_t, vm_offset_t);
+void pmap_inval_deinterlock(pmap_inval_info_t, pmap_t);
 void pmap_inval_flush(pmap_inval_info_t);
+void pmap_inval_done(pmap_inval_info_t);
 
 #endif
 
sys/platform/pc64/x86_64/genassym.c
index bc47780..116204d 100644 (file)
@@ -72,6 +72,7 @@
 #include <machine/specialreg.h>
 #include <machine/pcb.h>
 #include <machine/smp.h>
+#include <machine/pmap.h>
 
 ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
 ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
@@ -109,6 +110,7 @@ ASSYM(VM_MAX_USER_ADDRESS, VM_MAX_USER_ADDRESS);
 ASSYM(GD_CURTHREAD, offsetof(struct mdglobaldata, mi.gd_curthread));
 ASSYM(GD_CNT, offsetof(struct mdglobaldata, mi.gd_cnt));
 ASSYM(GD_CPUID, offsetof(struct mdglobaldata, mi.gd_cpuid));
+ASSYM(GD_CPUMASK, offsetof(struct mdglobaldata, mi.gd_cpumask));
 
 ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3));
 ASSYM(PCB_R15, offsetof(struct pcb, pcb_r15));
@@ -236,6 +238,9 @@ ASSYM(MACHINTR_INTREN, offsetof(struct machintr_abi, intren));
 
 ASSYM(TDPRI_CRIT, TDPRI_CRIT);
 ASSYM(TDPRI_INT_SUPPORT, TDPRI_INT_SUPPORT);
+#ifdef SMP
+ASSYM(CPUMASK_LOCK, CPUMASK_LOCK);
+#endif
 
 #ifdef SMP
 ASSYM(AIMI_APIC_ADDRESS, offsetof(struct apic_intmapinfo, apic_address));

sys/platform/pc64/x86_64/global.s
index 2d9cd17..7b012a2 100644 (file)
@@ -76,7 +76,7 @@
         * The BSP version of these get setup in locore.s and pmap.c, while
         * the AP versions are setup in mp_machdep.c.
         */
-       .globl  gd_cpuid, gd_other_cpus
+       .globl  gd_cpuid, gd_cpumask, gd_other_cpus
        .globl  gd_ss_eflags, gd_intr_nesting_level
        .globl  gd_CMAP1, gd_CMAP2, gd_CMAP3, gd_PMAP1
        .globl  gd_CADDR1, gd_CADDR2, gd_CADDR3, gd_PADDR1
@@ -86,6 +86,7 @@
        .globl  gd_user_fs, gd_user_gs
 
        .set    gd_cpuid,globaldata + GD_CPUID
+       .set    gd_cpumask,globaldata + GD_CPUMASK
        .set    gd_private_tss,globaldata + GD_PRIVATE_TSS
        .set    gd_other_cpus,globaldata + GD_OTHER_CPUS
        .set    gd_ss_eflags,globaldata + GD_SS_EFLAGS

sys/platform/pc64/x86_64/pmap.c
index 583a7a1..3f76afa 100644 (file)
@@ -594,7 +594,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
         */
        kernel_pmap.pm_pml4 = (pdp_entry_t *) (PTOV_OFFSET + KPML4phys);
        kernel_pmap.pm_count = 1;
-       kernel_pmap.pm_active = (cpumask_t)-1;  /* don't allow deactivation */
+       kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
        TAILQ_INIT(&kernel_pmap.pm_pvlist);
        nkpt = NKPT;
 
@@ -928,9 +928,10 @@ pmap_kenter(vm_offset_t va, vm_paddr_t pa)
        pmap_inval_init(&info);
        npte = pa | PG_RW | PG_V | pgeflag;
        pte = vtopte(va);
-       pmap_inval_add(&info, &kernel_pmap, va);
+       pmap_inval_interlock(&info, &kernel_pmap, va);
        *pte = npte;
-       pmap_inval_flush(&info);
+       pmap_inval_deinterlock(&info, &kernel_pmap);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -957,8 +958,9 @@ pmap_kenter_sync(vm_offset_t va)
        pmap_inval_info info;
 
        pmap_inval_init(&info);
-       pmap_inval_add(&info, &kernel_pmap, va);
-       pmap_inval_flush(&info);
+       pmap_inval_interlock(&info, &kernel_pmap, va);
+       pmap_inval_deinterlock(&info, &kernel_pmap);
+       pmap_inval_done(&info);
 }
 
 void
@@ -978,9 +980,10 @@ pmap_kremove(vm_offset_t va)
 
        pmap_inval_init(&info);
        pte = vtopte(va);
-       pmap_inval_add(&info, &kernel_pmap, va);
+       pmap_inval_interlock(&info, &kernel_pmap, va);
        *pte = 0;
-       pmap_inval_flush(&info);
+       pmap_inval_deinterlock(&info, &kernel_pmap);
+       pmap_inval_done(&info);
 }
 
 void
@@ -1219,7 +1222,7 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
         */
        KKASSERT(m->hold_count == 1);
        vm_page_busy(m);
-       pmap_inval_add(info, pmap, -1);
+       pmap_inval_interlock(info, pmap, -1);
 
        if (m->pindex >= (NUPDE + NUPDPE)) {
                /* PDP page */
@@ -1243,6 +1246,7 @@ _pmap_unwire_pte_hold(pmap_t pmap, vm_offset_t va, vm_page_t m,
 
        if (pmap->pm_ptphint == m)
                pmap->pm_ptphint = NULL;
+       pmap_inval_deinterlock(info, pmap);
 
        if (m->pindex < NUPDE) {
                /* We just released a PT, unhold the matching PD */
@@ -2093,8 +2097,9 @@ pmap_remove_pte(struct pmap *pmap, pt_entry_t *ptq, vm_offset_t va,
        pt_entry_t oldpte;
        vm_page_t m;
 
-       pmap_inval_add(info, pmap, va);
+       pmap_inval_interlock(info, pmap, va);
        oldpte = pte_load_clear(ptq);
+       pmap_inval_deinterlock(info, pmap);
        if (oldpte & PG_W)
                pmap->pm_stats.wired_count -= 1;
        /*
@@ -2189,7 +2194,7 @@ pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
                pde = pmap_pde(pmap, sva);
                if (pde && (*pde & PG_PS) == 0) {
                        pmap_remove_page(pmap, sva, &info);
-                       pmap_inval_flush(&info);
+                       pmap_inval_done(&info);
                        return;
                }
        }
@@ -2232,9 +2237,10 @@ pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
                 */
                if ((ptpaddr & PG_PS) != 0) {
                        /* JG FreeBSD has more complex treatment here */
-                       pmap_inval_add(&info, pmap, -1);
+                       pmap_inval_interlock(&info, pmap, -1);
                        *pde = 0;
                        pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
+                       pmap_inval_deinterlock(&info, pmap);
                        continue;
                }
 
@@ -2257,7 +2263,7 @@ pmap_remove(struct pmap *pmap, vm_offset_t sva, vm_offset_t eva)
                                break;
                }
        }
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -2287,12 +2293,11 @@ pmap_remove_all(vm_page_t m)
                --pv->pv_pmap->pm_stats.resident_count;
 
                pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
-               pmap_inval_add(&info, pv->pv_pmap, pv->pv_va);
+               pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va);
                tpte = pte_load_clear(pte);
-
                if (tpte & PG_W)
                        pv->pv_pmap->pm_stats.wired_count--;
-
+               pmap_inval_deinterlock(&info, pv->pv_pmap);
                if (tpte & PG_A)
                        vm_page_flag_set(m, PG_REFERENCED);
 
@@ -2322,7 +2327,7 @@ pmap_remove_all(vm_page_t m)
        }
        crit_exit();
        KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -2388,9 +2393,10 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
                 * Check for large page.
                 */
                if ((ptpaddr & PG_PS) != 0) {
-                       pmap_inval_add(&info, pmap, -1);
+                       pmap_inval_interlock(&info, pmap, -1);
                        *pde &= ~(PG_M|PG_RW);
                        pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
+                       pmap_inval_deinterlock(&info, pmap);
                        continue;
                }
 
@@ -2405,8 +2411,9 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
                        va_next = eva;
 
                for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
-                   sva += PAGE_SIZE) {
-                       pt_entry_t obits, pbits;
+                    sva += PAGE_SIZE) {
+                       pt_entry_t pbits;
+                       pt_entry_t cbits;
                        vm_page_t m;
 
                        /*
@@ -2415,35 +2422,39 @@ pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
                         * ptbase[sindex] (or otherwise we have to do another
                         * pmap_inval_add() call).
                         */
-                       pmap_inval_add(&info, pmap, sva);
-                       obits = pbits = *pte;
-                       if ((pbits & PG_V) == 0)
+                       pmap_inval_interlock(&info, pmap, sva);
+again:
+                       pbits = *pte;
+                       cbits = pbits;
+                       if ((pbits & PG_V) == 0) {
+                               pmap_inval_deinterlock(&info, pmap);
                                continue;
+                       }
                        if (pbits & PG_MANAGED) {
                                m = NULL;
                                if (pbits & PG_A) {
                                        m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
                                        vm_page_flag_set(m, PG_REFERENCED);
-                                       pbits &= ~PG_A;
+                                       cbits &= ~PG_A;
                                }
                                if (pbits & PG_M) {
                                        if (pmap_track_modified(sva)) {
                                                if (m == NULL)
                                                        m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
                                                vm_page_dirty(m);
-                                               pbits &= ~PG_M;
+                                               cbits &= ~PG_M;
                                        }
                                }
                        }
-
-                       pbits &= ~PG_RW;
-
-                       if (pbits != obits) {
-                               *pte = pbits;
+                       cbits &= ~PG_RW;
+                       if (pbits != cbits &&
+                           !atomic_cmpset_long(pte, pbits, cbits)) {
+                               goto again;
                        }
+                       pmap_inval_deinterlock(&info, pmap);
                }
        }
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -2617,13 +2628,14 @@ validate:
         * to update the pte.
         */
        if ((origpte & ~(PG_M|PG_A)) != newpte) {
-               pmap_inval_add(&info, pmap, va);
+               pmap_inval_interlock(&info, pmap, va);
                *pte = newpte | PG_A;
+               pmap_inval_deinterlock(&info, pmap);
                if (newpte & PG_RW)
                        vm_page_flag_set(m, PG_WRITEABLE);
        }
        KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED));
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -2714,6 +2726,7 @@ pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
                        pmap_unwire_pte_hold(pmap, va, mpte, &info);
                pa = VM_PAGE_TO_PHYS(m);
                KKASSERT(((*pte ^ pa) & PG_FRAME) == 0);
+               pmap_inval_done(&info);
                return;
        }
 
@@ -2740,7 +2753,7 @@ pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m)
        else
                *pte = pa | PG_V | PG_U | PG_MANAGED;
 /*     pmap_inval_add(&info, pmap, va); shouldn't be needed inval->valid */
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 }
 
 /*
@@ -3069,7 +3082,7 @@ pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
        }
 failed:
        crit_exit();
-       pmap_inval_flush(&info);
+       pmap_inval_done(&info);
 #endif
 }      
 
@@ -3217,7 +3230,6 @@ pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
                iscurrentpmap = 0;
 
        pmap_inval_init(&info);
-       crit_enter();
        for (pv = TAILQ_FIRST(&pmap->pm_pvlist); pv; pv = npv) {
                if (pv->pv_va >= eva || pv->pv_va < sva) {
                        npv = TAILQ_NEXT(pv, pv_plist);
@@ -3230,14 +3242,14 @@ pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
                        pte = vtopte(pv->pv_va);
                else
                        pte = pmap_pte_quick(pmap, pv->pv_va);
-               if (pmap->pm_active)
-                       pmap_inval_add(&info, pmap, pv->pv_va);
+               pmap_inval_interlock(&info, pmap, pv->pv_va);
 
                /*
                 * We cannot remove wired pages from a process' mapping
                 * at this time
                 */
                if (*pte & PG_W) {
+                       pmap_inval_deinterlock(&info, pmap);
                        npv = TAILQ_NEXT(pv, pv_plist);
                        continue;
                }
@@ -3250,6 +3262,7 @@ pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 
                KKASSERT(pmap->pm_stats.resident_count > 0);
                --pmap->pm_stats.resident_count;
+               pmap_inval_deinterlock(&info, pmap);
 
                /*
                 * Update the vm_page_t clean and reference bits.
@@ -3279,8 +3292,7 @@ pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
                        npv = TAILQ_FIRST(&pmap->pm_pvlist);
                }
        }
-       pmap_inval_flush(&info);
-       crit_exit();
+       pmap_inval_done(&info);
 }
 
 /*
@@ -3346,7 +3358,6 @@ pmap_clearbit(vm_page_t m, int bit)
                return;
 
        pmap_inval_init(&info);
-       crit_enter();
 
        /*
         * Loop over all current mappings setting/clearing as appropos If
@@ -3379,7 +3390,7 @@ pmap_clearbit(vm_page_t m, int bit)
                 * entry when/if it needs to resynchronize the Modify bit.
                 */
                if (bit & PG_RW)
-                       pmap_inval_add(&info, pv->pv_pmap, pv->pv_va);
+                       pmap_inval_interlock(&info, pv->pv_pmap, pv->pv_va);
                pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
 again:
                pbits = *pte;
@@ -3412,9 +3423,10 @@ again:
                                atomic_clear_long(pte, bit);
                        }
                }
+               if (bit & PG_RW)
+                       pmap_inval_deinterlock(&info, pv->pv_pmap);
        }
-       pmap_inval_flush(&info);
-       crit_exit();
+       pmap_inval_done(&info);
 }
 
 /*
@@ -3742,10 +3754,12 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
                if (curthread->td_lwp == lp) {
                        pmap = vmspace_pmap(newvm);
 #if defined(SMP)
-                       atomic_set_int(&pmap->pm_active, 1 << mycpu->gd_cpuid);
+                       atomic_set_int(&pmap->pm_active, mycpu->gd_cpumask);
 #else
                        pmap->pm_active |= 1;
 #endif
+                       if (pmap->pm_active & CPUMASK_LOCK)
+                               pmap_interlock_wait(newvm);
 #if defined(SWTCH_OPTIM_STATS)
                        tlb_flush_count++;
 #endif
@@ -3753,8 +3767,7 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
                        load_cr3(curthread->td_pcb->pcb_cr3);
                        pmap = vmspace_pmap(oldvm);
 #if defined(SMP)
-                       atomic_clear_int(&pmap->pm_active,
-                                         1 << mycpu->gd_cpuid);
+                       atomic_clear_int(&pmap->pm_active, mycpu->gd_cpumask);
 #else
                        pmap->pm_active &= ~1;
 #endif
@@ -3763,6 +3776,25 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
        crit_exit();
 }
 
+/*
+ * Called when switching to a locked pmap
+ */
+void
+pmap_interlock_wait(struct vmspace *vm)
+{
+       struct pmap *pmap = &vm->vm_pmap;
+
+       if (pmap->pm_active & CPUMASK_LOCK) {
+               kprintf("Warning: pmap_interlock %p %08x\n",
+                       pmap, pmap->pm_active);
+               while (pmap->pm_active & CPUMASK_LOCK) {
+                       cpu_pause();
+                       cpu_ccfence();
+                       lwkt_process_ipiq();
+               }
+       }
+}
+
 vm_offset_t
 pmap_addr_hint(vm_object_t obj, vm_offset_t addr, vm_size_t size)
 {

sys/platform/pc64/x86_64/pmap_inval.c
index 07a9c0a..2aafd0b 100644 (file)
@@ -85,24 +85,40 @@ void
 pmap_inval_init(pmap_inval_info_t info)
 {
     info->pir_flags = 0;
+    crit_enter_id("inval");
 }
 
 /*
  * Add a (pmap, va) pair to the invalidation list and protect access
  * as appropriate.
+ *
+ * CPUMASK_LOCK is used to interlock thread switchins
  */
 void
-pmap_inval_add(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
+pmap_inval_interlock(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
 {
 #ifdef SMP
+    cpumask_t oactive;
+    cpumask_t nactive;
+
+    for (;;) {
+       oactive = pmap->pm_active & ~CPUMASK_LOCK;
+       nactive = oactive | CPUMASK_LOCK;
+       if (atomic_cmpset_int(&pmap->pm_active, oactive, nactive))
+               break;
+       crit_enter();
+       lwkt_process_ipiq();
+       crit_exit();
+    }
+
     if ((info->pir_flags & PIRF_CPUSYNC) == 0) {
        info->pir_flags |= PIRF_CPUSYNC;
        info->pir_cpusync.cs_run_func = NULL;
        info->pir_cpusync.cs_fin1_func = NULL;
        info->pir_cpusync.cs_fin2_func = NULL;
-       lwkt_cpusync_start(pmap->pm_active, &info->pir_cpusync);
+       lwkt_cpusync_start(oactive, &info->pir_cpusync);
     } else if (pmap->pm_active & ~info->pir_cpusync.cs_mask) {
-       lwkt_cpusync_add(pmap->pm_active, &info->pir_cpusync);
+       lwkt_cpusync_add(oactive, &info->pir_cpusync);
     }
 #else
     if (pmap->pm_active == 0)
@@ -129,6 +145,14 @@ pmap_inval_add(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
     }
 }
 
+void
+pmap_inval_deinterlock(pmap_inval_info_t info, pmap_t pmap)
+{
+#ifdef SMP
+       atomic_clear_int(&pmap->pm_active, CPUMASK_LOCK);
+#endif
+}
+
 /*
  * Synchronize changes with target cpus.
  */
@@ -147,3 +171,10 @@ pmap_inval_flush(pmap_inval_info_t info)
     info->pir_flags = 0;
 }
 
+void
+pmap_inval_done(pmap_inval_info_t info)
+{
+    pmap_inval_flush(info);
+    crit_exit_id("inval");
+}
+

sys/platform/pc64/x86_64/swtch.s
index 6882068..87cc24a 100644 (file)
@@ -286,7 +286,6 @@ ENTRY(cpu_exit_switch)
 
 ENTRY(cpu_heavy_restore)
        popfq
-       movq    TD_PCB(%rax),%rdx               /* RDX = PCB */
        movq    TD_LWP(%rax),%rcx
 
 #if defined(SWTCH_OPTIM_STATS)
@@ -298,8 +297,17 @@ ENTRY(cpu_heavy_restore)
         * pmap (remember, we do not hold the MP lock in the switch code).
         */
        movq    LWP_VMSPACE(%rcx), %rcx         /* RCX = vmspace */
-       movl    PCPU(cpuid), %esi
-       MPLOCKED btsl   %esi, VM_PMAP+PM_ACTIVE(%rcx)
+       movl    PCPU(cpumask), %esi
+       MPLOCKED orl    %esi, VM_PMAP+PM_ACTIVE(%rcx)
+#ifdef SMP
+       testl   $CPUMASK_LOCK,VM_PMAP+PM_ACTIVE(%rcx)
+       jz      1f
+       pushq   %rax
+       movq    %rcx,%rdi
+       call    pmap_interlock_wait             /* pmap_interlock_wait(vm) */
+       popq    %rax
+1:
+#endif
 
        /*
         * Restore the MMU address space.  If it is the same as the last
@@ -307,6 +315,7 @@ ENTRY(cpu_heavy_restore)
         * YYY which naturally also means that the PM_ACTIVE bit had better
         * already have been set before we set it above, check? YYY
         */
+       movq    TD_PCB(%rax),%rdx               /* RDX = PCB */
        movq    %cr3,%rsi
        movq    PCB_CR3(%rdx),%rcx
        cmpq    %rsi,%rcx

sys/platform/vkernel/i386/genassym.c
index 743c5d5..cbabc03 100644 (file)
@@ -70,6 +70,7 @@
 #include <machine/sigframe.h>
 #include <machine/vm86.h>
 #include <machine/globaldata.h>
+#include <machine/pmap.h>
 
 ASSYM(P_VMSPACE, offsetof(struct proc, p_vmspace));
 ASSYM(VM_PMAP, offsetof(struct vmspace, vm_pmap));
@@ -95,6 +96,9 @@ ASSYM(TD_SAVEFPU, offsetof(struct thread, td_mach) + offsetof(struct md_thread,
 
 ASSYM(TDPRI_CRIT, TDPRI_CRIT);
 ASSYM(TDPRI_INT_SUPPORT, TDPRI_INT_SUPPORT);
+#ifdef SMP
+ASSYM(CPUMASK_LOCK, CPUMASK_LOCK);
+#endif
 
 ASSYM(V_TRAP, offsetof(struct vmmeter, v_trap));
 ASSYM(V_SYSCALL, offsetof(struct vmmeter, v_syscall));
@@ -171,6 +175,7 @@ ASSYM(BI_KERNEND, offsetof(struct bootinfo, bi_kernend));
 
 ASSYM(GD_CURTHREAD, offsetof(struct mdglobaldata, mi.gd_curthread));
 ASSYM(GD_CPUID, offsetof(struct mdglobaldata, mi.gd_cpuid));
+ASSYM(GD_CPUMASK, offsetof(struct mdglobaldata, mi.gd_cpumask));
 ASSYM(GD_CNT, offsetof(struct mdglobaldata, mi.gd_cnt));
 ASSYM(GD_INTR_NESTING_LEVEL, offsetof(struct mdglobaldata, mi.gd_intr_nesting_level));
 ASSYM(GD_REQFLAGS, offsetof(struct mdglobaldata, mi.gd_reqflags));

sys/platform/vkernel/i386/global.s
index 36fd01d..c81783f 100644 (file)
@@ -70,7 +70,7 @@
         * The BSP version of these get setup in locore.s and pmap.c, while
         * the AP versions are setup in mp_machdep.c.
         */
-       .globl  gd_cpuid, gd_other_cpus
+       .globl  gd_cpuid, gd_cpumask, gd_other_cpus
        .globl  gd_ss_eflags, gd_intr_nesting_level
        .globl  gd_CMAP1, gd_CMAP2, gd_CMAP3, gd_PMAP1
        .globl  gd_CADDR1, gd_CADDR2, gd_CADDR3, gd_PADDR1
@@ -78,6 +78,7 @@
        .globl  gd_cnt
 
        .set    gd_cpuid,globaldata + GD_CPUID
+       .set    gd_cpumask,globaldata + GD_CPUMASK
        .set    gd_other_cpus,globaldata + GD_OTHER_CPUS
        .set    gd_ss_eflags,globaldata + GD_SS_EFLAGS
        .set    gd_intr_nesting_level,globaldata + GD_INTR_NESTING_LEVEL

sys/platform/vkernel/platform/pmap.c
index b742846..0c8594f 100644 (file)
@@ -155,7 +155,7 @@ pmap_bootstrap(void)
        kernel_pmap.pm_pdir = KernelPTD - (KvaStart >> SEG_SHIFT);
        kernel_pmap.pm_pdirpte = KernelPTA[i];
        kernel_pmap.pm_count = 1;
-       kernel_pmap.pm_active = (cpumask_t)-1;
+       kernel_pmap.pm_active = (cpumask_t)-1 & ~CPUMASK_LOCK;
        TAILQ_INIT(&kernel_pmap.pm_pvlist);
        i386_protection_init();
 }
@@ -2963,7 +2963,7 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
                if (curthread->td_lwp == lp) {
                        pmap = vmspace_pmap(newvm);
 #if defined(SMP)
-                       atomic_set_int(&pmap->pm_active, 1 << mycpu->gd_cpuid);
+                       atomic_set_int(&pmap->pm_active, mycpu->gd_cpumask);
 #else
                        pmap->pm_active |= 1;
 #endif
@@ -2972,8 +2972,7 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
 #endif
                        pmap = vmspace_pmap(oldvm);
 #if defined(SMP)
-                       atomic_clear_int(&pmap->pm_active,
-                                         1 << mycpu->gd_cpuid);
+                       atomic_clear_int(&pmap->pm_active, mycpu->gd_cpumask);
 #else
                        pmap->pm_active &= ~1;
 #endif