kernel - Intel user/kernel separation MMU bug fix part 3/3
author Matthew Dillon <dillon@apollo.backplane.com>
Fri, 5 Jan 2018 08:47:46 +0000 (00:47 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
Fri, 5 Jan 2018 18:23:24 +0000 (10:23 -0800)
* Implement the isolated pmap template, iso_pmap.  The pmap code will
  generate a dummy iso_pmap containing only the kernel mappings required
  for userland to transition into the kernel and vice versa.

  The mappings needed are (see the condensed sketch after this list):

  (1) The per-cpu trampoline area for our stack (rsp0)
  (2) The global descriptor table (gdt) for all cpus
  (3) The interrupt descriptor table (idt) for all cpus
  (4) The TSS block for all cpus (we store this in the trampoline page)
  (5) Kernel code addresses for the interrupt vector entry and exit
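
  A condensed sketch of how the template gets populated, drawn from
  pmap_init2_iso_pmap() and pmap_init_iso_range() in the pmap.c diff
  below:

	for (n = 0; n < ncpus; ++n) {
		struct privatespace *ps = CPU_prvspace[n];

		/* (1)+(4): trampoline page, which also holds the TSS now */
		pmap_init_iso_range((vm_offset_t)&ps->trampoline,
				    sizeof(ps->trampoline));
		pmap_init_iso_range((vm_offset_t)&ps->common_tss,
				    sizeof(ps->common_tss));

		/* (3): this cpu's IDT */
		pmap_init_iso_range(r_idt_arr[n].rd_base,
				    r_idt_arr[n].rd_limit + 1);
	}
	/* (2): GDT for all cpus */
	pmap_init_iso_range((register_t)gdt, sizeof(gdt));

	/* (5): kernel text, btext through etext */
	pmap_init_iso_range((vm_offset_t)btext,
			    (vm_offset_t)etext - (vm_offset_t)btext);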

* In this implementation the 'kernel code' addresses are currently just
  btext through etext, i.e. the kernel's primary text area.  Kernel
  data and bss are not part of the isolation map.

  TODO - map only the interrupt vector entry and exit points, not the
  entire kernel text.

* System call performance is reduced when isolation is turned on, from
  roughly 100ns to 350ns per call.  However, typical workloads should
  not lose more than 5% or so.  For example, a workload making 100,000
  system calls per second pays roughly 250ns x 100,000 = 25ms of extra
  overhead per second, around 2.5% of one cpu.  System-call heavy and
  interrupt-heavy workloads (network, database, high-speed storage,
  etc) can lose a lot more performance.

  We leave the trampoline code in place whether isolation is turned on
  or not.  The trampoline overhead, without isolation, is only 5ns or so.

* Fix a missing exec-related trampoline initialization.

* Clean up kernel page table PTEs a bit.  PG_M is ignored on non-terminal
  PTEs, so don't set it.  Also don't set PG_U in non-terminal kernel
  page table pages (PG_U is never set on terminal kernel PTEs, so this
  wasn't a problem, but we should be correct).
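
  In pmap_allocpte(), the bits for a newly-installed page-table-page
  PTE now reduce to the following (condensed from the pmap.c diff
  below): PG_U only when the entry governs user addresses, PG_M only
  when the entry is terminal.

	v = VM_PAGE_TO_PHYS(m) |
	    (pmap->pmap_bits[PG_RW_IDX] |
	     pmap->pmap_bits[PG_V_IDX] |
	     pmap->pmap_bits[PG_A_IDX]);
	if (ptepindex < NUPTE_USER)		/* user-addressable range */
		v |= pmap->pmap_bits[PG_U_IDX];
	if (ptepindex < pmap_pt_pindex(0))	/* terminal PTE */
		v |= pmap->pmap_bits[PG_M_IDX];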

* Fix a bug in fast_syscall's trampoline stack.  The wrong stack
  pointer was being loaded.

* Move mdglobaldata->gd_common_tss to privatespace->common_tss.
  Place common_tss in the same page as the trampoline to reduce
  exposure to globaldata from the isolated MMU context.
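
  The resulting trampoline-page layout in struct privatespace, as
  excerpted from the globaldata.h diff below:

	char		reserved1[PAGE_SIZE -
				  sizeof(struct trampframe) -
				  sizeof(uint64_t) -
				  sizeof(struct x86_64tss)];
	struct trampframe trampoline;
	uint64_t	reserved1b;	/* 16-byte-align trampoline */
	struct x86_64tss common_tss;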

* 16-byte align struct trampframe for convenience.

* Fix a bug in POP_FRAME.  Always cli so we cannot take an interrupt
  just as we reach the iretq instruction, where it might be
  misinterpreted.

sys/cpu/x86_64/include/asmacros.h
sys/cpu/x86_64/include/frame.h
sys/cpu/x86_64/include/segments.h
sys/platform/pc64/include/globaldata.h
sys/platform/pc64/vmm/vmx.c
sys/platform/pc64/x86_64/exception.S
sys/platform/pc64/x86_64/genassym.c
sys/platform/pc64/x86_64/machdep.c
sys/platform/pc64/x86_64/mp_machdep.c
sys/platform/pc64/x86_64/pmap.c

index 2e5e53c..376fb49 100644
        movq    TF_R13(%rsp),%r13 ;                                     \
        movq    TF_R14(%rsp),%r14 ;                                     \
        movq    TF_R15(%rsp),%r15 ;                                     \
+       cli ;                                                           \
        testb   $SEL_RPL_MASK,TF_CS(%rsp) ; /* return to user? */       \
        jz      1f ;                                                    \
-       cli ;                   /* return to user */                    \
        KMMUEXIT ;              /* return to user */                    \
        swapgs ;                /* return to user */                    \
        jmp     2f ;                                                    \
index 0999638..1b15f50 100644
@@ -124,6 +124,7 @@ struct intrframe {
  * of the isolated user pmap.
  */
 struct trampframe {
+       register_t      tr_unused01;
        register_t      tr_cr2;
        register_t      tr_r10;
        register_t      tr_r11;
index 126cab2..0aa64af 100644
@@ -234,10 +234,11 @@ struct region_descriptor {
 #ifndef LOCORE
 
 #ifdef _KERNEL
-extern struct user_segment_descriptor gdt[];
 extern struct soft_segment_descriptor gdt_segs[];
 extern struct gate_descriptor idt_arr[MAXCPU][NIDT];
 extern struct region_descriptor r_idt_arr[];
+extern struct region_descriptor r_gdt;
+extern struct user_segment_descriptor gdt[NGDT * MAXCPU];
 
 void   lgdt(struct region_descriptor *rdp);
 void   sdtossd(struct user_segment_descriptor *sdp,
index dad818a..fcfdcb5 100644
@@ -68,7 +68,6 @@ struct mdglobaldata {
        struct user_segment_descriptor gd_common_tssd;
        struct user_segment_descriptor *gd_tss_gdt;
        struct thread   *gd_npxthread;
-       struct x86_64tss gd_common_tss;
        union savefpu   gd_savefpu;     /* fast bcopy/zero temp fpu save area */
        int             gd_fpu_lock;    /* fast bcopy/zero cpu lock */
        int             gd_xinvaltlb;   /* reentrancy check invaltlb routine */
@@ -109,24 +108,43 @@ struct mdglobaldata {
  * This is the upper (0xff800000) address space layout that is per-cpu.
  * It is setup in locore.s and pmap.c for the BSP and in mp_machdep.c for
  * each AP.  genassym helps export this to the assembler code.
+ *
+ * Most of the major elements in struct privatespace must be
+ * PAGE_SIZE aligned.
  */
 struct privatespace {
-       /* JG TODO: fix comments describing layout */
-       /* page 0 - data page */
+       /*
+        * page 0 - data page
+        */
        struct mdglobaldata mdglobaldata;
        char            __filler0[MDGLOBALDATA_PAD];
 
        /*
-        * page 1 - trap and interrupt trampoline (rsp0 points to top,
-        *          then minus whatever hardware pushes)
+        * page 1 - Unused (unmapped)
+        */
+       char            unused2[PAGE_SIZE];
+
+       /*
+        * page 2 - Trampoline page.  Put the trampoline and common_tss
+        *          in the same page to make them easier to isolate
+        *          from the rest of the kernel map.  See x86_64/pmap.c
+        *
+        *          rsp0 points into trampoline.  Interrupts are always
+        *          disabled for this case but leave reserved1[]
+        *          reserved just in case.
         */
-       char            reserved1[PAGE_SIZE - sizeof(struct trampframe)];
+       char            reserved1[PAGE_SIZE -
+                                 sizeof(struct trampframe) -
+                                 sizeof(uint64_t) -
+                                 sizeof(struct x86_64tss)];
        struct trampframe trampoline;
+       uint64_t        reserved1b;     /* 16-byte-align trampoline */
+       struct x86_64tss common_tss;
 
-       /* page 2, 3, 4 - CPAGE2,CPAGE3,PPAGE1 (unused) */
-       char            unused2[PAGE_SIZE];
-       char            unused3[PAGE_SIZE];
-       char            unused4[PAGE_SIZE];
+       /*
+        * page 3, 4 - Double fault stack
+        */
+       char            dblstack[PAGE_SIZE * 2];
 
        /* page 5..4+UPAGES - idle stack (UPAGES pages) */
        char            idlestack[UPAGES * PAGE_SIZE];
index 7eaeb10..ffa8558 100644
@@ -939,7 +939,7 @@ vmx_vminit(struct vmm_guest_options *options)
         */
        gd = mycpu;
        ERROR_IF(vmwrite(VMCS_HOST_GS_BASE, (uint64_t)gd));
-       ERROR_IF(vmwrite(VMCS_HOST_TR_BASE, (uint64_t)&gd->gd_prvspace->mdglobaldata.gd_common_tss));
+       ERROR_IF(vmwrite(VMCS_HOST_TR_BASE, (uint64_t)&gd->gd_prvspace->common_tss));
 
        ERROR_IF(vmwrite(VMCS_HOST_GDTR_BASE, (uint64_t)&gdt[gd->gd_cpuid * NGDT]));
        ERROR_IF(vmwrite(VMCS_HOST_IDTR_BASE, (uint64_t)r_idt_arr[gd->gd_cpuid].rd_base));
@@ -1070,7 +1070,7 @@ vmx_handle_cpu_migration(void)
 
                /* Host related registers */
                ERROR_IF(vmwrite(VMCS_HOST_GS_BASE, (uint64_t) gd)); /* mycpu points to %gs:0 */
-               ERROR_IF(vmwrite(VMCS_HOST_TR_BASE, (uint64_t) &gd->gd_prvspace->mdglobaldata.gd_common_tss));
+               ERROR_IF(vmwrite(VMCS_HOST_TR_BASE, (uint64_t) &gd->gd_prvspace->common_tss));
 
                ERROR_IF(vmwrite(VMCS_HOST_GDTR_BASE, (uint64_t) &gdt[gd->gd_cpuid * NGDT]));
                ERROR_IF(vmwrite(VMCS_HOST_IDTR_BASE, (uint64_t) r_idt_arr[gd->gd_cpuid].rd_base));
index e901f5d..a229cad 100644
@@ -233,8 +233,9 @@ prot_normal:
 IDTVEC(fast_syscall)
        swapgs                                  /* get kernel %gs */
        movq    %rsp,PCPU(trampoline)+TR_R10    /* save user %rsp */
-       movq    PCPU(trampoline)+TR_PCB_RSP,%rsp
+       movq    PCPU(common_tss)+TSS_RSP0,%rsp
        KMMUENTER_SYSCALL
+       movq    PCPU(trampoline)+TR_PCB_RSP,%rsp
 
        /* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
        subq    $TF_SIZE,%rsp
index 5eab421..8f29ec5 100644
@@ -204,6 +204,7 @@ ASSYM(FIRST_SOFTINT, FIRST_SOFTINT);
 ASSYM(MDGLOBALDATA_BASEALLOC_PAGES, MDGLOBALDATA_BASEALLOC_PAGES);
 
 ASSYM(GD_PRIVATE_TSS, offsetof(struct mdglobaldata, gd_private_tss));
+ASSYM(GD_COMMON_TSS, offsetof(struct privatespace, common_tss));
 ASSYM(GD_TRAMPOLINE, offsetof(struct privatespace, trampoline));
 ASSYM(GD_USER_FS, offsetof(struct mdglobaldata, gd_user_fs));
 ASSYM(GD_USER_GS, offsetof(struct mdglobaldata, gd_user_gs));
@@ -225,7 +226,6 @@ ASSYM(TR_PCB_CR3, offsetof(struct trampframe, tr_pcb_cr3));
 
 ASSYM(GD_IPENDING, offsetof(struct mdglobaldata, gd_ipending));
 ASSYM(GD_SPENDING, offsetof(struct mdglobaldata, gd_spending));
-ASSYM(GD_COMMON_TSS, offsetof(struct mdglobaldata, gd_common_tss));
 ASSYM(GD_COMMON_TSSD, offsetof(struct mdglobaldata, gd_common_tssd));
 ASSYM(GD_TSS_GDT, offsetof(struct mdglobaldata, gd_tss_gdt));
 ASSYM(GD_NPXTHREAD, offsetof(struct mdglobaldata, gd_npxthread));
index b14d3ae..f6fa619 100644
@@ -2315,13 +2315,16 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
        int metadata_missing, off;
 #endif
        struct mdglobaldata *gd;
+       struct privatespace *ps;
        u_int64_t msr;
 
        /*
         * Prevent lowering of the ipl if we call tsleep() early.
         */
        gd = &CPU_prvspace[0]->mdglobaldata;
+       ps = (struct privatespace *)gd;
        bzero(gd, sizeof(*gd));
+       bzero(&ps->common_tss, sizeof(ps->common_tss));
 
        /*
         * Note: on both UP and SMP curthread must be set non-NULL
@@ -2379,7 +2382,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
         * make gdt memory segments
         */
        gdt_segs[GPROC0_SEL].ssd_base =
-               (uintptr_t) &CPU_prvspace[0]->mdglobaldata.gd_common_tss;
+               (uintptr_t) &CPU_prvspace[0]->common_tss;
 
        gd->mi.gd_prvspace = CPU_prvspace[0];
 
@@ -2531,19 +2534,15 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
         * address of tr_pcb_rsp is the same as the desired top of
         * stack.
         */
-       gd->gd_common_tss.tss_rsp0 =
-               (register_t)&((struct privatespace *)gd)->trampoline.tr_pcb_rsp;
-
-       ((struct privatespace *)gd)->trampoline.tr_pcb_rsp =
-               gd->gd_common_tss.tss_rsp0;
+       ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
+       ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;
 
        /* double fault stack */
-       gd->gd_common_tss.tss_ist1 =
-               (long)&gd->mi.gd_prvspace->idlestack[
-                       sizeof(gd->mi.gd_prvspace->idlestack)];
+       ps->common_tss.tss_ist1 = (register_t)ps->dblstack +
+                                 sizeof(ps->dblstack);
 
        /* Set the IO permission bitmap (empty due to tss seg limit) */
-       gd->gd_common_tss.tss_iobase = sizeof(struct x86_64tss);
+       ps->common_tss.tss_iobase = sizeof(struct x86_64tss);
 
        gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
        gd->gd_tss_gdt = &gdt[GPROC0_SEL];
index def008e..adba3ec 100644
 int    current_postcode;
 
 /** XXX FIXME: what system files declare these??? */
-extern struct region_descriptor r_gdt;
 
 extern int naps;
 
@@ -251,8 +250,7 @@ init_secondary(void)
 
        ps = CPU_prvspace[myid];
 
-       gdt_segs[GPROC0_SEL].ssd_base =
-               (long) &ps->mdglobaldata.gd_common_tss;
+       gdt_segs[GPROC0_SEL].ssd_base = (long)&ps->common_tss;
        ps->mdglobaldata.mi.gd_prvspace = ps;
 
        /* We fill the 32-bit segment descriptors */
@@ -297,21 +295,18 @@ init_secondary(void)
         * address of tr_pcb_rsp is the same as the desired top of
         * stack.
         */
-       md->gd_common_tss.tss_rsp0 =
-               (register_t)&((struct privatespace *)md)->trampoline.tr_pcb_rsp;
-       ((struct privatespace *)md)->trampoline.tr_pcb_rsp =
-               md->gd_common_tss.tss_rsp0;
+       ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
+       ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;
 
 #if 0 /* JG XXX */
-       md->gd_common_tss.tss_ioopt = (sizeof md->gd_common_tss) << 16;
+       ps->common_tss.tss_ioopt = (sizeof ps->common_tss) << 16;
 #endif
        md->gd_tss_gdt = &gdt[myid * NGDT + GPROC0_SEL];
        md->gd_common_tssd = *md->gd_tss_gdt;
 
        /* double fault stack */
-       md->gd_common_tss.tss_ist1 =
-               (long)&md->mi.gd_prvspace->idlestack[
-                       sizeof(md->mi.gd_prvspace->idlestack)];
+       ps->common_tss.tss_ist1 = (register_t)ps->dblstack +
+                                 sizeof(ps->dblstack);
 
        ltr(gsel_tss);
 
index dc9833f..8112365 100644
  */
 /*
  * Manage physical address maps for x86-64 systems.
+ *
+ * Some notes:
+ *     - The 'M'odified bit is only applicable to terminal PTEs.
+ *
+ *     - The 'U'ser access bit can be set for higher-level PTEs as
+ *       long as it isn't set for terminal PTEs for pages we don't
+ *       want user access to.
  */
 
 #if 0 /* JG */
@@ -77,6 +84,7 @@
 #include <vm/vm_page2.h>
 
 #include <machine/cputypes.h>
+#include <machine/cpu.h>
 #include <machine/md_var.h>
 #include <machine/specialreg.h>
 #include <machine/smp.h>
 static uint64_t protection_codes[PROTECTION_CODES_SIZE];
 
 struct pmap kernel_pmap;
+struct pmap iso_pmap;
 
 MALLOC_DEFINE(M_OBJPMAP, "objpmap", "pmaps associated with VM objects");
 
@@ -947,7 +956,7 @@ create_pagetables(vm_paddr_t *firstaddr)
                ((pdp_entry_t *)KPDPphys)[NKPML4E * NPDPEPG - NKPDPE + i] |=
                    pmap_bits_default[PG_RW_IDX] |
                    pmap_bits_default[PG_V_IDX] |
-                   pmap_bits_default[PG_U_IDX];
+                   pmap_bits_default[PG_A_IDX];
        }
 
        /*
@@ -960,7 +969,7 @@ create_pagetables(vm_paddr_t *firstaddr)
                ((pdp_entry_t *)KPDPphys)[i + j] |=
                    pmap_bits_default[PG_RW_IDX] |
                    pmap_bits_default[PG_V_IDX] |
-                   pmap_bits_default[PG_U_IDX];
+                   pmap_bits_default[PG_A_IDX];
        }
 
        /*
@@ -971,6 +980,9 @@ create_pagetables(vm_paddr_t *firstaddr)
         * entries are set to zero as we allocated enough PD pages
         */
        if ((amd_feature & AMDID_PAGE1GB) == 0) {
+               /*
+                * Use 2MB pages
+                */
                for (i = 0; i < NPDEPG * ndmpdp; i++) {
                        ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
                        ((pd_entry_t *)DMPDphys)[i] |=
@@ -990,10 +1002,12 @@ create_pagetables(vm_paddr_t *firstaddr)
                                                        (i << PAGE_SHIFT);
                        ((pdp_entry_t *)DMPDPphys)[i] |=
                            pmap_bits_default[PG_RW_IDX] |
-                           pmap_bits_default[PG_V_IDX] |
-                           pmap_bits_default[PG_U_IDX];
+                           pmap_bits_default[PG_V_IDX];
                }
        } else {
+               /*
+                * 1GB pages
+                */
                for (i = 0; i < ndmpdp; i++) {
                        ((pdp_entry_t *)DMPDPphys)[i] =
                                                (vm_paddr_t)i << PDPSHIFT;
@@ -1012,7 +1026,7 @@ create_pagetables(vm_paddr_t *firstaddr)
        ((pdp_entry_t *)KPML4phys)[PML4PML4I] |=
            pmap_bits_default[PG_RW_IDX] |
            pmap_bits_default[PG_V_IDX] |
-           pmap_bits_default[PG_U_IDX];
+           pmap_bits_default[PG_A_IDX];
 
        /*
         * Connect the Direct Map slots up to the PML4
@@ -1022,7 +1036,7 @@ create_pagetables(vm_paddr_t *firstaddr)
                    (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) |
                    pmap_bits_default[PG_RW_IDX] |
                    pmap_bits_default[PG_V_IDX] |
-                   pmap_bits_default[PG_U_IDX];
+                   pmap_bits_default[PG_A_IDX];
        }
 
        /*
@@ -1034,7 +1048,7 @@ create_pagetables(vm_paddr_t *firstaddr)
                ((pdp_entry_t *)KPML4phys)[KPML4I + j] |=
                    pmap_bits_default[PG_RW_IDX] |
                    pmap_bits_default[PG_V_IDX] |
-                   pmap_bits_default[PG_U_IDX];
+                   pmap_bits_default[PG_A_IDX];
        }
        cpu_mfence();
        cpu_invltlb();
@@ -1288,7 +1302,16 @@ pmap_init(void)
  * Initialize the address space (zone) for the pv_entries.  Set a
  * high water mark so that the system can recover from excessive
  * numbers of pv entries.
+ *
+ * Also create the kernel page table template for isolated user
+ * pmaps.
  */
+static void pmap_init_iso_range(vm_offset_t base, size_t bytes);
+static void pmap_init2_iso_pmap(void);
+#if 0
+static void dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base);
+#endif
+
 void
 pmap_init2(void)
 {
@@ -1322,8 +1345,168 @@ pmap_init2(void)
                else
                        pmap_dynamic_delete = 0;
        }
+
+       pmap_init2_iso_pmap();
 }
 
+/*
+ * Create the isolation pmap template.  Once created, the template
+ * is static and its PML4e entries are used to populate the
+ * kernel portion of any isolated user pmaps.
+ *
+ * Our isolation pmap must contain:
+ * (1) trampoline area for all cpus
+ * (2) common_tss area for all cpus (it's part of the trampoline area now)
+ * (3) IDT for all cpus
+ * (4) GDT for all cpus
+ */
+static void
+pmap_init2_iso_pmap(void)
+{
+       int n;
+
+       kprintf("Initialize isolation pmap\n");
+
+       /*
+        * Try to use our normal API calls to make this easier.  We have
+        * to scrap the shadowed kernel PDPs pmap_pinit() creates for our
+        * iso_pmap.
+        */
+       pmap_pinit(&iso_pmap);
+       bzero(iso_pmap.pm_pml4, PAGE_SIZE);
+
+       /*
+        * Install areas needed by the cpu and trampoline.
+        */
+       for (n = 0; n < ncpus; ++n) {
+               struct privatespace *ps;
+
+               ps = CPU_prvspace[n];
+               pmap_init_iso_range((vm_offset_t)&ps->trampoline,
+                                   sizeof(ps->trampoline));
+               pmap_init_iso_range((vm_offset_t)&ps->common_tss,
+                                   sizeof(ps->common_tss));
+               pmap_init_iso_range(r_idt_arr[n].rd_base,
+                                   r_idt_arr[n].rd_limit + 1);
+       }
+       pmap_init_iso_range((register_t)gdt, sizeof(gdt));
+       pmap_init_iso_range((vm_offset_t)(int *)btext,
+                           (vm_offset_t)(int *)etext -
+                            (vm_offset_t)(int *)btext);
+
+#if 0
+       kprintf("Dump iso_pmap:\n");
+       dump_pmap(&iso_pmap, vtophys(iso_pmap.pm_pml4), 0, 0);
+       kprintf("\nDump kernel_pmap:\n");
+       dump_pmap(&kernel_pmap, vtophys(kernel_pmap.pm_pml4), 0, 0);
+#endif
+}
+
+/*
+ * This adds a kernel virtual address range to the isolation pmap.
+ */
+static void
+pmap_init_iso_range(vm_offset_t base, size_t bytes)
+{
+       pv_entry_t pv;
+       pv_entry_t pvp;
+       pt_entry_t *ptep;
+       pt_entry_t pte;
+       vm_offset_t va;
+
+       kprintf("isolate %016jx-%016jx (%zd)\n",
+               base, base + bytes, bytes);
+       va = base & ~(vm_offset_t)PAGE_MASK;
+       while (va < base + bytes) {
+               if ((va & PDRMASK) == 0 && va + NBPDR <= base + bytes &&
+                   (ptep = pmap_pt(&kernel_pmap, va)) != NULL &&
+                   (*ptep & kernel_pmap.pmap_bits[PG_V_IDX]) &&
+                   (*ptep & kernel_pmap.pmap_bits[PG_PS_IDX])) {
+                       /*
+                        * Use 2MB pages if possible
+                        */
+                       pte = *ptep;
+                       pv = pmap_allocpte(&iso_pmap, pmap_pd_pindex(va), &pvp);
+                       ptep = pv_pte_lookup(pv, (va >> PDRSHIFT) & 511);
+                       *ptep = pte;
+                       va += NBPDR;
+               } else {
+                       /*
+                        * Otherwise use 4KB pages
+                        */
+                       pv = pmap_allocpte(&iso_pmap, pmap_pt_pindex(va), &pvp);
+                       ptep = pv_pte_lookup(pv, (va >> PAGE_SHIFT) & 511);
+                       *ptep = vtophys(va) | kernel_pmap.pmap_bits[PG_RW_IDX] |
+                                             kernel_pmap.pmap_bits[PG_V_IDX] |
+                                             kernel_pmap.pmap_bits[PG_A_IDX] |
+                                             kernel_pmap.pmap_bits[PG_M_IDX];
+
+                       va += PAGE_SIZE;
+               }
+               pv_put(pv);
+               pv_put(pvp);
+       }
+}
+
+#if 0
+/*
+ * Useful debugging pmap dumper, do not remove (#if 0 when not in use)
+ */
+static
+void
+dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base)
+{
+       pt_entry_t *ptp;
+       vm_offset_t incr;
+       int i;
+
+       switch(level) {
+       case 0:                                 /* PML4e page, 512G entries */
+               incr = (1LL << 48) / 512;
+               break;
+       case 1:                                 /* PDP page, 1G entries */
+               incr = (1LL << 39) / 512;
+               break;
+       case 2:                                 /* PD page, 2MB entries */
+               incr = (1LL << 30) / 512;
+               break;
+       case 3:                                 /* PT page, 4KB entries */
+               incr = (1LL << 21) / 512;
+               break;
+       default:
+               incr = 0;
+               break;
+       }
+
+       if (level == 0)
+               kprintf("cr3 %016jx @ va=%016jx\n", pte, base);
+       ptp = (void *)PHYS_TO_DMAP(pte & ~(pt_entry_t)PAGE_MASK);
+       for (i = 0; i < 512; ++i) {
+               if (level == 0 && i == 128)
+                       base += 0xFFFF000000000000LLU;
+               if (ptp[i]) {
+                       kprintf("%*.*s ", level * 4, level * 4, "");
+                       if (level == 1 && (ptp[i] & 0x180) == 0x180) {
+                               kprintf("va=%016jx %3d term %016jx (1GB)\n",
+                                       base, i, ptp[i]);
+                       } else if (level == 2 && (ptp[i] & 0x180) == 0x180) {
+                               kprintf("va=%016jx %3d term %016jx (2MB)\n",
+                                       base, i, ptp[i]);
+                       } else if (level == 3) {
+                               kprintf("va=%016jx %3d term %016jx\n",
+                                       base, i, ptp[i]);
+                       } else {
+                               kprintf("va=%016jx %3d deep %016jx\n",
+                                       base, i, ptp[i]);
+                               dump_pmap(pmap, ptp[i], level + 1, base);
+                       }
+               }
+               base += incr;
+       }
+}
+
+#endif
+
 /*
  * Typically used to initialize a fictitious page by vm/device_pager.c
  */
@@ -2034,14 +2217,14 @@ pmap_pinit(struct pmap *pmap)
                            (DMPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) |
                            pmap->pmap_bits[PG_RW_IDX] |
                            pmap->pmap_bits[PG_V_IDX] |
-                           pmap->pmap_bits[PG_U_IDX];
+                           pmap->pmap_bits[PG_A_IDX];
                }
                for (j = 0; j < NKPML4E; ++j) {
                        pmap->pm_pml4[KPML4I + j] =
                            (KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) |
                            pmap->pmap_bits[PG_RW_IDX] |
                            pmap->pmap_bits[PG_V_IDX] |
-                           pmap->pmap_bits[PG_U_IDX];
+                           pmap->pmap_bits[PG_A_IDX];
                }
 
                /*
@@ -2050,8 +2233,7 @@ pmap_pinit(struct pmap *pmap)
                pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) |
                    pmap->pmap_bits[PG_V_IDX] |
                    pmap->pmap_bits[PG_RW_IDX] |
-                   pmap->pmap_bits[PG_A_IDX] |
-                   pmap->pmap_bits[PG_M_IDX];
+                   pmap->pmap_bits[PG_A_IDX];
        } else {
                KKASSERT(pv->pv_m->flags & PG_MAPPED);
                KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
@@ -2063,8 +2245,13 @@ pmap_pinit(struct pmap *pmap)
         * is needed.  We use pmap_pml4_pindex() + 1 for convenience, but
         * note that we do not operate on this table using our API functions
         * so handling of the + 1 case is mostly just to prevent implosions.
+        *
+        * We install an isolated version of the kernel PDPs into this
+        * second PML4e table.  The pmap code will mirror all user PDPs
+        * between the primary and secondary PML4e table.
         */
-       if ((pv = pmap->pm_pmlpv_iso) == NULL && vm_isolated_user_pmap) {
+       if ((pv = pmap->pm_pmlpv_iso) == NULL && vm_isolated_user_pmap &&
+           pmap != &iso_pmap) {
                pv = pmap_allocpte(pmap, pmap_pml4_pindex() + 1, NULL);
                pmap->pm_pmlpv_iso = pv;
                pmap_kenter((vm_offset_t)pmap->pm_pml4_iso,
@@ -2072,17 +2259,13 @@ pmap_pinit(struct pmap *pmap)
                pv_put(pv);
 
                /*
-                * Install just enough KMAP for our trampoline.  DMAP not
-                * needed at all.  XXX
+                * Install an isolated version of the kernel pmap for
+                * user consumption, using PDPs constructed in iso_pmap.
                 */
                for (j = 0; j < NKPML4E; ++j) {
                        pmap->pm_pml4_iso[KPML4I + j] =
-                           (KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) |
-                           pmap->pmap_bits[PG_RW_IDX] |
-                           pmap->pmap_bits[PG_V_IDX] |
-                           pmap->pmap_bits[PG_U_IDX];
+                               iso_pmap.pm_pml4[KPML4I + j];
                }
-               KKASSERT(pmap->pm_pml4_iso[255] == 0);
        } else if (pv) {
                KKASSERT(pv->pv_m->flags & PG_MAPPED);
                KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
@@ -2196,9 +2379,13 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
         * a pt_pv is not being requested for kernel VAs.  The kernel
         * pre-wires all higher-level page tables so don't overload managed
         * higher-level page tables on top of it!
+        *
+        * However, it's convenient for us to allow the case when creating
+        * iso_pmap.  This is a bit of a hack but it simplifies iso_pmap
+        * a lot.
         */
        if (ptepindex < pmap_pt_pindex(0)) {
-               if (ptepindex >= NUPTE_USER) {
+               if (ptepindex >= NUPTE_USER && pmap != &iso_pmap) {
                        /* kernel manages this manually for KVM */
                        KKASSERT(pvpp == NULL);
                } else {
@@ -2330,11 +2517,14 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
         */
        if (pvp) {
                v = VM_PAGE_TO_PHYS(m) |
-                   (pmap->pmap_bits[PG_U_IDX] |
-                    pmap->pmap_bits[PG_RW_IDX] |
+                   (pmap->pmap_bits[PG_RW_IDX] |
                     pmap->pmap_bits[PG_V_IDX] |
-                    pmap->pmap_bits[PG_A_IDX] |
-                    pmap->pmap_bits[PG_M_IDX]);
+                    pmap->pmap_bits[PG_A_IDX]);
+               if (ptepindex < NUPTE_USER)
+                       v |= pmap->pmap_bits[PG_U_IDX];
+               if (ptepindex < pmap_pt_pindex(0))
+                       v |= pmap->pmap_bits[PG_M_IDX];
+
                ptep = pv_pte_lookup(pvp, ptepindex);
                if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso)
                        ptep_iso = pv_pte_lookup(pmap->pm_pmlpv_iso, ptepindex);
@@ -2381,11 +2571,13 @@ notnew:
                KKASSERT(pvp->pv_m != NULL);
                ptep = pv_pte_lookup(pvp, ptepindex);
                v = VM_PAGE_TO_PHYS(pv->pv_m) |
-                   (pmap->pmap_bits[PG_U_IDX] |
-                    pmap->pmap_bits[PG_RW_IDX] |
+                   (pmap->pmap_bits[PG_RW_IDX] |
                     pmap->pmap_bits[PG_V_IDX] |
-                    pmap->pmap_bits[PG_A_IDX] |
-                    pmap->pmap_bits[PG_M_IDX]);
+                    pmap->pmap_bits[PG_A_IDX]);
+               if (ptepindex < NUPTE_USER)
+                       v |= pmap->pmap_bits[PG_U_IDX];
+               if (ptepindex < pmap_pt_pindex(0))
+                       v |= pmap->pmap_bits[PG_M_IDX];
                if (*ptep != v) {
                        kprintf("mismatched upper level pt %016jx/%016jx\n",
                                *ptep, v);
@@ -3324,8 +3516,7 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
                            (paddr |
                            kernel_pmap.pmap_bits[PG_V_IDX] |
                            kernel_pmap.pmap_bits[PG_RW_IDX] |
-                           kernel_pmap.pmap_bits[PG_A_IDX] |
-                           kernel_pmap.pmap_bits[PG_M_IDX]);
+                           kernel_pmap.pmap_bits[PG_A_IDX]);
                        atomic_swap_long(pd, newpd);
 
 #if 0
@@ -3364,8 +3555,7 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
                newpt = (pd_entry_t)(ptppaddr |
                                     kernel_pmap.pmap_bits[PG_V_IDX] |
                                     kernel_pmap.pmap_bits[PG_RW_IDX] |
-                                    kernel_pmap.pmap_bits[PG_A_IDX] |
-                                    kernel_pmap.pmap_bits[PG_M_IDX]);
+                                    kernel_pmap.pmap_bits[PG_A_IDX]);
                atomic_swap_long(pt, newpt);
 
                kstart = (kstart + PAGE_SIZE * NPTEPG) &
@@ -6329,13 +6519,14 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
                         * (it can't access the pcb directly from the
                         * restricted user pmap).
                         */
-                       if (td == curthread) {
+                       {
                                struct trampframe *tramp;
 
                                tramp = &pscpu->trampoline;
                                tramp->tr_pcb_cr3 = td->td_pcb->pcb_cr3;
                                tramp->tr_pcb_cr3_iso = td->td_pcb->pcb_cr3_iso;
                                tramp->tr_pcb_flags = td->td_pcb->pcb_flags;
+                               tramp->tr_pcb_rsp = (register_t)td->td_pcb;
                                /* tr_pcb_rsp doesn't change */
                        }