kernel - Intel user/kernel separation MMU bug fix part 2/3
author Matthew Dillon <dillon@apollo.backplane.com>
Thu, 4 Jan 2018 18:34:51 +0000 (10:34 -0800)
committer Matthew Dillon <dillon@apollo.backplane.com>
Fri, 5 Jan 2018 18:23:24 +0000 (10:23 -0800)
* Cleanup pass.  Throw in some documentation.

* Move the gd_pcb_* fields into the trampoline page to allow
  kernel memory to be further restricted in part 3.

sys/cpu/x86_64/include/asmacros.h
sys/cpu/x86_64/include/frame.h
sys/platform/pc64/include/globaldata.h
sys/platform/pc64/x86_64/exception.S
sys/platform/pc64/x86_64/genassym.c
sys/platform/pc64/x86_64/global.s
sys/platform/pc64/x86_64/machdep.c
sys/platform/pc64/x86_64/mp_machdep.c
sys/platform/pc64/x86_64/pmap.c
sys/platform/pc64/x86_64/swtch.s

index 948151c..2e5e53c 100644 (file)
  * execute a cmp/branch sequence, detect timing.  Iterate cmp $values
  * to suss-out content of speculatively read kernel memory.
  *
+ * We do this by creating a trampoline area for all user->kernel and
+ * kernel->user transitions.  The trampoline area allows us to limit
+ * the reach of the kernel map in the isolated version of the user pmap
+ * to JUST the trampoline area (for all cpus), tss, and vector area.
+ *
+ * It is very important that these transitions not access any memory
+ * outside of the trampoline page while the isolated user process pmap
+ * is active in %cr3.
+ *
+ * The trampoline does not add much overhead when pmap isolation is
+ * disabled, so we just run with it regardless.  Of course, when pmap
+ * isolation is enabled, the %cr3 loads add 150-250ns to every system
+ * call as well as (without PCID) smash the TLB.
+ *
  * KMMUENTER - Executed by the trampoline when a user->kernel transition
  *             is detected.  The stack pointer points into the pcpu
  *             trampoline space and is available for register save/restore.
        subq    $TR_RIP, %rsp ;                                         \
        movq    %r10, TR_R10(%rsp) ;                                    \
        movq    %r11, TR_R11(%rsp) ;                                    \
-       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
        je      40f ;                                                   \
-       movq    PCPU(pcb_cr3),%r10 ;                                    \
+       movq    PCPU(trampoline)+TR_PCB_CR3,%r10 ;                      \
        movq    %r10,%cr3 ;                                             \
 40:                                                                    \
        movq    %rsp, %r10 ;            /* trampoline rsp */            \
-       movq    PCPU(pcb_rsp),%rsp ;    /* kstack rsp */                \
+       movq    PCPU(trampoline)+TR_PCB_RSP,%rsp ; /* kstack rsp */     \
        movq    TR_SS(%r10), %r11 ;                                     \
        pushq   %r11 ;                                                  \
        movq    TR_RSP(%r10), %r11 ;                                    \
        subq    $TR_ERR, %rsp ;                                         \
        movq    %r10, TR_R10(%rsp) ;                                    \
        movq    %r11, TR_R11(%rsp) ;                                    \
-       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
        je      40f ;                                                   \
-       movq    PCPU(pcb_cr3),%r10 ;                                    \
+       movq    PCPU(trampoline)+TR_PCB_CR3,%r10 ;                      \
        movq    %r10,%cr3 ;                                             \
 40:                                                                    \
        movq    %rsp, %r10 ;            /* trampoline rsp */            \
-       movq    PCPU(pcb_rsp),%rsp ;    /* kstack rsp */                \
+       movq    PCPU(trampoline)+TR_PCB_RSP,%rsp ; /* kstack rsp */     \
        movq    TR_SS(%r10), %r11 ;                                     \
        pushq   %r11 ;                                                  \
        movq    TR_RSP(%r10), %r11 ;                                    \
        movq    %r11, TR_R11(%rsp) ;                                    \
        movq    %cr2, %r10 ;                                            \
        movq    %r10, PCPU(trampoline)+TR_CR2 ;                         \
-       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
        je      40f ;                                                   \
-       movq    PCPU(pcb_cr3),%r10 ;                                    \
+       movq    PCPU(trampoline)+TR_PCB_CR3,%r10 ;                      \
        movq    %r10,%cr3 ;                                             \
 40:                                                                    \
        movq    %rsp, %r10 ;            /* trampoline rsp */            \
-       movq    PCPU(pcb_rsp),%rsp ;    /* kstack rsp */                \
+       movq    PCPU(trampoline)+TR_PCB_RSP,%rsp ; /* kstack rsp */     \
        movq    TR_SS(%r10), %r11 ;                                     \
        pushq   %r11 ;                                                  \
        movq    TR_RSP(%r10), %r11 ;                                    \
  * disturbed.
  */
 #define KMMUENTER_SYSCALL                                              \
-       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
        je      40f ;                                                   \
        pushq   %r10 ;                                                  \
-       movq    PCPU(pcb_cr3),%r10 ;                                    \
+       movq    PCPU(trampoline)+TR_PCB_CR3,%r10 ;                      \
        movq    %r10,%cr3 ;                                             \
        popq    %r10 ;                                                  \
 40:                                                                    \
  */
 #define KMMUEXIT                                                       \
        addq    $TF_RIP,%rsp ;                                          \
-       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
        je      40f ;                                                   \
        movq    %r11, PCPU(trampoline)+TR_ERR ; /* save in TR_ERR */    \
        popq    %r11 ;                          /* copy %rip */         \
        movq    %gs:0,%r11 ;                                            \
        addq    $GD_TRAMPOLINE+TR_ERR,%r11 ;                            \
        movq    %r11,%rsp ;                                             \
-       movq    PCPU(pcb_cr3_iso),%r11 ;                                \
+       movq    PCPU(trampoline)+TR_PCB_CR3_ISO,%r11 ;                  \
        movq    %r11,%cr3 ;                                             \
        popq    %r11 ;          /* positioned at TR_RIP after this */   \
 40:                                                                    \
  * point.  We still have the kernel %gs.
  */
 #define KMMUEXIT_SYSCALL                                               \
-       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
        je      40f ;                                                   \
        movq    %r10, PCPU(trampoline)+TR_R10 ;                         \
-       movq    PCPU(pcb_cr3_iso),%r10 ;                                \
+       movq    PCPU(trampoline)+TR_PCB_CR3_ISO,%r10 ;                  \
        movq    %r10,%cr3 ;                                             \
        movq    PCPU(trampoline)+TR_R10, %r10 ;                         \
 40:                                                                    \
index 57dec41..0999638 100644 (file)
@@ -118,6 +118,11 @@ struct intrframe {
        register_t      if_ss;
 };
 
+/*
+ * The trampframe is placed at the top of the trampoline page and
+ * contains all the information needed to trampoline into and out
+ * of the isolated user pmap.
+ */
 struct trampframe {
        register_t      tr_cr2;
        register_t      tr_r10;
@@ -128,6 +133,16 @@ struct trampframe {
        register_t      tr_rflags;
        register_t      tr_rsp;
        register_t      tr_ss;
+
+       /*
+        * Top of hw stack in TSS is &tr_pcb_rsp (first push is tr_ss).
+        * Make sure this is at least 16-byte aligned, so be sure the
+        * fields below are in multiples of 16 bytes.
+        */
+       register_t      tr_pcb_rsp;     /* hw frame tramp top of stack */
+       register_t      tr_pcb_flags;   /* copy of pcb control flags */
+       register_t      tr_pcb_cr3_iso; /* copy of isolated pml4e */
+       register_t      tr_pcb_cr3;     /* copy of primary pml4e */
 };
 
 int    kdb_trap(int, int, struct trapframe *);
index 73f072f..dad818a 100644 (file)
@@ -80,10 +80,10 @@ struct mdglobaldata {
        u_int           gd_unused002;
        u_int           gd_unused003;
        u_int           gd_ss_eflags;
-       char            *gd_pcb_rsp;    /* transfer trampoline to td stack */
-       long            gd_pcb_flags;   /* pcb control flags */
-       long            gd_pcb_cr3_iso; /* pcb isolated mmu cr3 */
-       long            gd_pcb_cr3;     /* pcb normal mmu cr3 */
+       long            gd_lunused0;
+       long            gd_lunused1;
+       long            gd_lunused2;
+       long            gd_lunusde3;
        caddr_t         gd_aunused0;
        caddr_t         gd_aunused1;
        caddr_t         gd_aunused2;
@@ -131,7 +131,9 @@ struct privatespace {
        /* page 5..4+UPAGES - idle stack (UPAGES pages) */
        char            idlestack[UPAGES * PAGE_SIZE];
 } __packed;
+
 #define mdcpu                  ((struct mdglobaldata *)_get_mycpu())
+#define pscpu                  ((struct privatespace *)_get_mycpu())
 
 #endif
 
index 2e05274..e901f5d 100644 (file)
@@ -233,7 +233,7 @@ prot_normal:
 IDTVEC(fast_syscall)
        swapgs                                  /* get kernel %gs */
        movq    %rsp,PCPU(trampoline)+TR_R10    /* save user %rsp */
-       movq    PCPU(pcb_rsp),%rsp
+       movq    PCPU(trampoline)+TR_PCB_RSP,%rsp
        KMMUENTER_SYSCALL
 
        /* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
index 6b2c638..5eab421 100644 (file)
@@ -218,15 +218,15 @@ ASSYM(TR_CS, offsetof(struct trampframe, tr_cs));
 ASSYM(TR_RFLAGS, offsetof(struct trampframe, tr_rflags));
 ASSYM(TR_RSP, offsetof(struct trampframe, tr_rsp));
 ASSYM(TR_SS, offsetof(struct trampframe, tr_ss));
+ASSYM(TR_PCB_RSP, offsetof(struct trampframe, tr_pcb_rsp));
+ASSYM(TR_PCB_FLAGS, offsetof(struct trampframe, tr_pcb_flags));
+ASSYM(TR_PCB_CR3_ISO, offsetof(struct trampframe, tr_pcb_cr3_iso));
+ASSYM(TR_PCB_CR3, offsetof(struct trampframe, tr_pcb_cr3));
 
 ASSYM(GD_IPENDING, offsetof(struct mdglobaldata, gd_ipending));
 ASSYM(GD_SPENDING, offsetof(struct mdglobaldata, gd_spending));
 ASSYM(GD_COMMON_TSS, offsetof(struct mdglobaldata, gd_common_tss));
 ASSYM(GD_COMMON_TSSD, offsetof(struct mdglobaldata, gd_common_tssd));
-ASSYM(GD_PCB_RSP, offsetof(struct mdglobaldata, gd_pcb_rsp));
-ASSYM(GD_PCB_FLAGS, offsetof(struct mdglobaldata, gd_pcb_flags));
-ASSYM(GD_PCB_CR3_ISO, offsetof(struct mdglobaldata, gd_pcb_cr3_iso));
-ASSYM(GD_PCB_CR3, offsetof(struct mdglobaldata, gd_pcb_cr3));
 ASSYM(GD_TSS_GDT, offsetof(struct mdglobaldata, gd_tss_gdt));
 ASSYM(GD_NPXTHREAD, offsetof(struct mdglobaldata, gd_npxthread));
 ASSYM(GD_FPU_LOCK, offsetof(struct mdglobaldata, gd_fpu_lock));
index ea9bc0c..d1f4098 100644 (file)
         * the per-cpu address space, otherwise it's in the data segment.
         */
        .globl  gd_trampoline
-       .globl  gd_pcb_rsp, gd_pcb_flags, gd_pcb_cr3_iso, gd_pcb_cr3
        .globl  gd_curthread, gd_npxthread, gd_reqflags, gd_common_tss
        .set    gd_trampoline,globaldata + GD_TRAMPOLINE
-       .set    gd_pcb_rsp,globaldata + GD_PCB_RSP
-       .set    gd_pcb_flags,globaldata + GD_PCB_FLAGS
-       .set    gd_pcb_cr3_iso,globaldata + GD_PCB_CR3_ISO
-       .set    gd_pcb_cr3,globaldata + GD_PCB_CR3
        .set    gd_curthread,globaldata + GD_CURTHREAD
        .set    gd_npxthread,globaldata + GD_NPXTHREAD
        .set    gd_reqflags,globaldata + GD_REQFLAGS
index 634825d..b14d3ae 100644 (file)
@@ -2521,14 +2521,21 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 
        /*
         * TSS entry point for interrupts, traps, and exceptions
-        * (sans NMI).  This will always go to the top of the pcpu
+        * (sans NMI).  This will always go to near the top of the pcpu
         * trampoline area.  Hardware-pushed data will be copied into
         * the trap-frame on entry, and (if necessary) returned to the
         * trampoline on exit.
+        *
+        * We store some pcb data for the trampoline code above the
+        * stack the cpu hw pushes into, and arrange things so the
+        * address of tr_pcb_rsp is the same as the desired top of
+        * stack.
         */
        gd->gd_common_tss.tss_rsp0 =
-               (register_t)(&CPU_prvspace[0]->trampoline + 1);
-       gd->gd_pcb_rsp = (void *)gd->gd_common_tss.tss_rsp0;
+               (register_t)&((struct privatespace *)gd)->trampoline.tr_pcb_rsp;
+
+       ((struct privatespace *)gd)->trampoline.tr_pcb_rsp =
+               gd->gd_common_tss.tss_rsp0;
 
        /* double fault stack */
        gd->gd_common_tss.tss_ist1 =
index 888de18..def008e 100644 (file)
@@ -286,12 +286,22 @@ init_secondary(void)
        md = mdcpu;     /* loaded through %gs:0 (mdglobaldata.mi.gd_prvspace)*/
 
        /*
-        * Each cpu gets its own trampoline area for interrupts, traps, and
-        * exceptions.
+        * TSS entry point for interrupts, traps, and exceptions
+        * (sans NMI).  This will always go to near the top of the pcpu
+        * trampoline area.  Hardware-pushed data will be copied into
+        * the trap-frame on entry, and (if necessary) returned to the
+        * trampoline on exit.
+        *
+        * We store some pcb data for the trampoline code above the
+        * stack the cpu hw pushes into, and arrange things so the
+        * address of tr_pcb_rsp is the same as the desired top of
+        * stack.
         */
        md->gd_common_tss.tss_rsp0 =
-               (register_t)(&CPU_prvspace[md->mi.gd_cpuid]->trampoline + 1);
-       md->gd_pcb_rsp = (void *)md->gd_common_tss.tss_rsp0;
+               (register_t)&((struct privatespace *)md)->trampoline.tr_pcb_rsp;
+       ((struct privatespace *)md)->trampoline.tr_pcb_rsp =
+               md->gd_common_tss.tss_rsp0;
+
 #if 0 /* JG XXX */
        md->gd_common_tss.tss_ioopt = (sizeof md->gd_common_tss) << 16;
 #endif
index 44de7b5..dc9833f 100644 (file)
@@ -6330,10 +6330,13 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
                         * restricted user pmap).
                         */
                        if (td == curthread) {
-                               mdcpu->gd_pcb_cr3 = td->td_pcb->pcb_cr3;
-                               mdcpu->gd_pcb_cr3_iso = td->td_pcb->pcb_cr3_iso;
-                               mdcpu->gd_pcb_flags = td->td_pcb->pcb_flags;
-                               /* gd_pcb_rsp doesn't change */
+                               struct trampframe *tramp;
+
+                               tramp = &pscpu->trampoline;
+                               tramp->tr_pcb_cr3 = td->td_pcb->pcb_cr3;
+                               tramp->tr_pcb_cr3_iso = td->td_pcb->pcb_cr3_iso;
+                               tramp->tr_pcb_flags = td->td_pcb->pcb_flags;
+                               /* tr_pcb_rsp doesn't change */
                        }
 
                        /*
index 799b53b..ee6b379 100644 (file)
@@ -362,13 +362,13 @@ END(cpu_exit_switch)
 
 ENTRY(cpu_heavy_restore)
        movq    TD_PCB(%rax),%rdx               /* RDX = PCB */
-       movq    %rdx, PCPU(pcb_rsp)
+       movq    %rdx, PCPU(trampoline)+TR_PCB_RSP
        movq    PCB_FLAGS(%rdx), %rcx
-       movq    %rcx, PCPU(pcb_flags)
+       movq    %rcx, PCPU(trampoline)+TR_PCB_FLAGS
        movq    PCB_CR3_ISO(%rdx), %rcx
-       movq    %rcx, PCPU(pcb_cr3_iso)
+       movq    %rcx, PCPU(trampoline)+TR_PCB_CR3_ISO
        movq    PCB_CR3(%rdx), %rcx
-       movq    %rcx, PCPU(pcb_cr3)
+       movq    %rcx, PCPU(trampoline)+TR_PCB_CR3
        popfq
 
 #if defined(SWTCH_OPTIM_STATS)
@@ -473,13 +473,13 @@ ENTRY(cpu_heavy_restore)
         * Set the top of the supervisor stack for the new thread
         * in tr_pcb_rsp so the trampoline code can load it into %rsp.
         */
-       movq    %rdx, PCPU(pcb_rsp)
+       movq    %rdx, PCPU(trampoline)+TR_PCB_RSP
        movq    PCB_FLAGS(%rdx), %rcx
-       movq    %rcx, PCPU(pcb_flags)
+       movq    %rcx, PCPU(trampoline)+TR_PCB_FLAGS
        movq    PCB_CR3_ISO(%rdx), %rcx
-       movq    %rcx, PCPU(pcb_cr3_iso)
+       movq    %rcx, PCPU(trampoline)+TR_PCB_CR3_ISO
        movq    PCB_CR3(%rdx), %rcx
-       movq    %rcx, PCPU(pcb_cr3)
+       movq    %rcx, PCPU(trampoline)+TR_PCB_CR3
 #endif
 
 #if 0 /* JG */