kernel - Intel user/kernel separation MMU bug fix part 1/3
author: Matthew Dillon <dillon@apollo.backplane.com>
Wed, 3 Jan 2018 20:45:17 +0000 (12:45 -0800)
committer: Matthew Dillon <dillon@apollo.backplane.com>
Fri, 5 Jan 2018 18:23:24 +0000 (10:23 -0800)
* Part 1/3 of the fix for the Intel user/kernel separation MMU bug.
  It appears that it is possible to discern the contents of kernel
  memory with careful timing measurements of instructions due to
  speculative memory reads and speculative instruction execution
  by Intel cpus.  This can happen because Intel will allow both to
  occur even when the memory access is later disallowed due to
  privilege separation in the PTE.

  Even though the execution is always aborted, the speculative
  reads and speculative execution result in timing artifacts which
  can be measured.  A speculative compare/branch can lead to timing
  artifacts that allow the actual contents of kernel memory to be
  discerned.

  While there are multiple speculative attacks possible, the Intel
  bug is particularly bad because it allows a user program to more
  or less effortlessly access kernel memory (and if a DMAP is
  present, all of physical memory).

* Part 1 implements all the logic required to load an 'isolated'
  version of the user process's PML4e into %cr3 on all user
  transitions, and to load the 'normal' U+K version into %cr3 on
  all transitions from user to kernel.

* Part 1 fully allocates, copies, and implements the %cr3 loads for
  the 'isolated' version of the user process PML4e.

* Part 1 does not yet actually adjust the contents of this isolated
  version to replace the kernel map with just a trampoline map in
  kernel space.  It does remove the DMAP as a test, though.  The
  full separation will be done in part 3.

17 files changed:
sys/cpu/x86_64/include/asmacros.h
sys/cpu/x86_64/include/frame.h
sys/platform/pc64/apic/apic_vector.s
sys/platform/pc64/icu/icu_vector.s
sys/platform/pc64/include/globaldata.h
sys/platform/pc64/include/pcb.h
sys/platform/pc64/include/pmap.h
sys/platform/pc64/x86_64/exception.S
sys/platform/pc64/x86_64/genassym.c
sys/platform/pc64/x86_64/global.s
sys/platform/pc64/x86_64/ipl.s
sys/platform/pc64/x86_64/machdep.c
sys/platform/pc64/x86_64/mp_machdep.c
sys/platform/pc64/x86_64/msi_vector.s
sys/platform/pc64/x86_64/pmap.c
sys/platform/pc64/x86_64/swtch.s
sys/platform/pc64/x86_64/vm_machdep.c

index 9cf2504..948151c 100644 (file)
                        .type __CONCAT(X,name),@function; __CONCAT(X,name):
 
 /*
- * Macros to create and destroy a trap frame.
+ * stack frame macro support - supports mmu isolation, swapgs, and
+ * stack frame pushing and popping.
+ */
+
+/*
+ * Kernel pmap isolation to work-around the massive Intel mmu bug
+ * that allows kernel memory to be sussed out due to speculative memory
+ * reads and instruction execution creating timing differences that can
+ * be detected by userland.  e.g. force speculative read, speculatively
+ * execute a cmp/branch sequence, detect timing.  Iterate cmp $values
+ * to suss-out content of speculatively read kernel memory.
+ *
+ * KMMUENTER - Executed by the trampoline when a user->kernel transition
+ *             is detected.  The stack pointer points into the pcpu
+ *             trampoline space and is available for register save/restore.
+ *             Other registers have not yet been saved.  %gs points at
+ *             the kernel pcpu structure.
+ *
+ *             Caller has already determined that a transition is in
+ *             progress and has already issued the swapgs.  hwtf indicates
+ *             how much hardware has already pushed.
+ *
+ * KMMUEXIT  - Executed when a kernel->user transition is made.  The stack
+ *             pointer points into the pcpu trampoline space and we are
+ *             almost ready to iretq.  %gs still points at the kernel pcpu
+ *             structure.
+ *
+ *             Caller has already determined that a transition is in
+ *             progress.  hwtf indicates how much hardware has already
+ *             pushed.
+ */
+#define KMMUENTER_TFRIP                                                        \
+       subq    $TR_RIP, %rsp ;                                         \
+       movq    %r10, TR_R10(%rsp) ;                                    \
+       movq    %r11, TR_R11(%rsp) ;                                    \
+       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       je      40f ;                                                   \
+       movq    PCPU(pcb_cr3),%r10 ;                                    \
+       movq    %r10,%cr3 ;                                             \
+40:                                                                    \
+       movq    %rsp, %r10 ;            /* trampoline rsp */            \
+       movq    PCPU(pcb_rsp),%rsp ;    /* kstack rsp */                \
+       movq    TR_SS(%r10), %r11 ;                                     \
+       pushq   %r11 ;                                                  \
+       movq    TR_RSP(%r10), %r11 ;                                    \
+       pushq   %r11 ;                                                  \
+       movq    TR_RFLAGS(%r10), %r11 ;                                 \
+       pushq   %r11 ;                                                  \
+       movq    TR_CS(%r10), %r11 ;                                     \
+       pushq   %r11 ;                                                  \
+       movq    TR_RIP(%r10), %r11 ;                                    \
+       pushq   %r11 ;                                                  \
+       movq    TR_R11(%r10), %r11 ;                                    \
+       movq    TR_R10(%r10), %r10                                      \
+
+#define KMMUENTER_TFERR                                                        \
+       subq    $TR_ERR, %rsp ;                                         \
+       movq    %r10, TR_R10(%rsp) ;                                    \
+       movq    %r11, TR_R11(%rsp) ;                                    \
+       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       je      40f ;                                                   \
+       movq    PCPU(pcb_cr3),%r10 ;                                    \
+       movq    %r10,%cr3 ;                                             \
+40:                                                                    \
+       movq    %rsp, %r10 ;            /* trampoline rsp */            \
+       movq    PCPU(pcb_rsp),%rsp ;    /* kstack rsp */                \
+       movq    TR_SS(%r10), %r11 ;                                     \
+       pushq   %r11 ;                                                  \
+       movq    TR_RSP(%r10), %r11 ;                                    \
+       pushq   %r11 ;                                                  \
+       movq    TR_RFLAGS(%r10), %r11 ;                                 \
+       pushq   %r11 ;                                                  \
+       movq    TR_CS(%r10), %r11 ;                                     \
+       pushq   %r11 ;                                                  \
+       movq    TR_RIP(%r10), %r11 ;                                    \
+       pushq   %r11 ;                                                  \
+       movq    TR_ERR(%r10), %r11 ;                                    \
+       pushq   %r11 ;                                                  \
+       movq    TR_R11(%r10), %r11 ;                                    \
+       movq    TR_R10(%r10), %r10                                      \
+
+#define KMMUENTER_TFERR_SAVECR2                                                \
+       subq    $TR_ERR, %rsp ;                                         \
+       movq    %r10, TR_R10(%rsp) ;                                    \
+       movq    %r11, TR_R11(%rsp) ;                                    \
+       movq    %cr2, %r10 ;                                            \
+       movq    %r10, PCPU(trampoline)+TR_CR2 ;                         \
+       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       je      40f ;                                                   \
+       movq    PCPU(pcb_cr3),%r10 ;                                    \
+       movq    %r10,%cr3 ;                                             \
+40:                                                                    \
+       movq    %rsp, %r10 ;            /* trampoline rsp */            \
+       movq    PCPU(pcb_rsp),%rsp ;    /* kstack rsp */                \
+       movq    TR_SS(%r10), %r11 ;                                     \
+       pushq   %r11 ;                                                  \
+       movq    TR_RSP(%r10), %r11 ;                                    \
+       pushq   %r11 ;                                                  \
+       movq    TR_RFLAGS(%r10), %r11 ;                                 \
+       pushq   %r11 ;                                                  \
+       movq    TR_CS(%r10), %r11 ;                                     \
+       pushq   %r11 ;                                                  \
+       movq    TR_RIP(%r10), %r11 ;                                    \
+       pushq   %r11 ;                                                  \
+       movq    TR_ERR(%r10), %r11 ;                                    \
+       pushq   %r11 ;                                                  \
+       movq    TR_R11(%r10), %r11 ;                                    \
+       movq    TR_R10(%r10), %r10                                      \
+
+/*
+ * Set %cr3 if necessary on syscall entry.  No registers may be
+ * disturbed.
+ */
+#define KMMUENTER_SYSCALL                                              \
+       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       je      40f ;                                                   \
+       pushq   %r10 ;                                                  \
+       movq    PCPU(pcb_cr3),%r10 ;                                    \
+       movq    %r10,%cr3 ;                                             \
+       popq    %r10 ;                                                  \
+40:                                                                    \
+
+/*
+ * We are positioned at the base of the trapframe.  Advance the trapframe
+ * and handle MMU isolation.  MMU isolation requires us to copy the
+ * hardware frame to the trampoline area before setting %cr3 to the
+ * isolated map.  We then set the %rsp for iretq to TR_RIP in the
+ * trampoline area (after restoring the register we saved in TR_ERR).
+ */
+#define KMMUEXIT                                                       \
+       addq    $TF_RIP,%rsp ;                                          \
+       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       je      40f ;                                                   \
+       movq    %r11, PCPU(trampoline)+TR_ERR ; /* save in TR_ERR */    \
+       popq    %r11 ;                          /* copy %rip */         \
+       movq    %r11, PCPU(trampoline)+TR_RIP ;                         \
+       popq    %r11 ;                          /* copy %cs */          \
+       movq    %r11, PCPU(trampoline)+TR_CS ;                          \
+       popq    %r11 ;                          /* copy %rflags */      \
+       movq    %r11, PCPU(trampoline)+TR_RFLAGS ;                      \
+       popq    %r11 ;                          /* copy %rsp */         \
+       movq    %r11, PCPU(trampoline)+TR_RSP ;                         \
+       popq    %r11 ;                          /* copy %ss */          \
+       movq    %r11, PCPU(trampoline)+TR_SS ;                          \
+       movq    %gs:0,%r11 ;                                            \
+       addq    $GD_TRAMPOLINE+TR_ERR,%r11 ;                            \
+       movq    %r11,%rsp ;                                             \
+       movq    PCPU(pcb_cr3_iso),%r11 ;                                \
+       movq    %r11,%cr3 ;                                             \
+       popq    %r11 ;          /* positioned at TR_RIP after this */   \
+40:                                                                    \
+
+/*
+ * Warning: user stack pointer already loaded into %rsp at this
+ * point.  We still have the kernel %gs.
+ */
+#define KMMUEXIT_SYSCALL                                               \
+       testq   $PCB_ISOMMU,PCPU(pcb_flags) ;                           \
+       je      40f ;                                                   \
+       movq    %r10, PCPU(trampoline)+TR_R10 ;                         \
+       movq    PCPU(pcb_cr3_iso),%r10 ;                                \
+       movq    %r10,%cr3 ;                                             \
+       movq    PCPU(trampoline)+TR_R10, %r10 ;                         \
+40:                                                                    \
+
+/*
+ * Macros to create and destroy a trap frame.  rsp has already been shifted
+ * to the base of the trapframe in the thread structure.
  */
 #define PUSH_FRAME_REGS                                                        \
        movq    %rdi,TF_RDI(%rsp) ;                                     \
        movq    %r14,TF_R14(%rsp) ;                                     \
        movq    %r15,TF_R15(%rsp)
 
-#define PUSH_FRAME                                                     \
-       subq    $TF_RIP,%rsp ;  /* extend hardware frame to trapframe */ \
-       testb   $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */     \
-       jz      1f ;            /* Yes, dont swapgs again */            \
-       swapgs ;                                                        \
+/*
+ * PUSH_FRAME is the first thing executed upon interrupt entry.  We are
+ * responsible for swapgs execution and the KMMUENTER dispatch.
+ */
+#define PUSH_FRAME_TFRIP                                               \
+       testb   $SEL_RPL_MASK,TF_CS-TF_RIP(%rsp) ; /* from userland? */ \
+       jz      1f ;                                                    \
+       swapgs ;                /* from userland */                     \
+       KMMUENTER_TFRIP ;       /* from userland */                     \
 1:                                                                     \
-       PUSH_FRAME_REGS                                                 \
+       subq    $TF_RIP,%rsp ;                                          \
+       PUSH_FRAME_REGS                                                 \
+
+#define PUSH_FRAME_TFERR                                               \
+       testb   $SEL_RPL_MASK,TF_CS-TF_ERR(%rsp) ; /* from userland? */ \
+       jz      1f ;                                                    \
+       swapgs ;                /* from userland */                     \
+       KMMUENTER_TFERR ;       /* from userland */                     \
+1:                                                                     \
+       subq    $TF_ERR,%rsp ;                                          \
+       PUSH_FRAME_REGS                                                 \
+
+#define PUSH_FRAME_TFERR_SAVECR2                                       \
+       testb   $SEL_RPL_MASK,TF_CS-TF_ERR(%rsp) ;                      \
+       jz      1f ;                                                    \
+       swapgs ;                /* from userland */                     \
+       KMMUENTER_TFERR_SAVECR2 ;/* from userland */                    \
+       subq    $TF_ERR,%rsp ;                                          \
+       PUSH_FRAME_REGS ;                                               \
+       movq    PCPU(trampoline)+TR_CR2, %r10 ;                         \
+       jmp 2f ;                                                        \
+1:                                                                     \
+       subq    $TF_ERR,%rsp ;                                          \
+       PUSH_FRAME_REGS ;                                               \
+       movq    %cr2, %r10 ;                                            \
+2:                                                                     \
+       movq    %r10, TF_ADDR(%rsp)
 
+/*
+ * Called when the iretq in doreti_iret faults.  XXX
+ */
 #define PUSH_FRAME_NOSWAP                                              \
-       subq    $TF_RIP,%rsp ;  /* extend hardware frame to trapframe */ \
+       KMMUENTER_TFRIP ;                                               \
        PUSH_FRAME_REGS                                                 \
 
-#define POP_FRAME                                                      \
+/*
+ * POP_FRAME is issued just prior to the iretq, or just prior to a
+ * jmp doreti_iret.  These must be passed in to the macro.
+ */
+#define POP_FRAME(lastinsn)                                            \
        movq    TF_RDI(%rsp),%rdi ;                                     \
        movq    TF_RSI(%rsp),%rsi ;                                     \
        movq    TF_RDX(%rsp),%rdx ;                                     \
        movq    TF_R13(%rsp),%r13 ;                                     \
        movq    TF_R14(%rsp),%r14 ;                                     \
        movq    TF_R15(%rsp),%r15 ;                                     \
-       testb   $SEL_RPL_MASK,TF_CS(%rsp) ; /* come from kernel? */     \
-       jz      1f ;            /* keep kernel GS.base */               \
-       cli ;                                                           \
-       swapgs ;                                                        \
-1:     addq    $TF_RIP,%rsp    /* skip over tf_err, tf_trapno, tf_xflags */
+       testb   $SEL_RPL_MASK,TF_CS(%rsp) ; /* return to user? */       \
+       jz      1f ;                                                    \
+       cli ;                   /* return to user */                    \
+       KMMUEXIT ;              /* return to user */                    \
+       swapgs ;                /* return to user */                    \
+       jmp     2f ;                                                    \
+1:                                                                     \
+       addq    $TF_RIP,%rsp ;  /* setup for iretq */                   \
+2:                                                                     \
+       lastinsn
 
 /*
  * Access per-CPU data.
index eb10720..57dec41 100644 (file)
@@ -118,6 +118,18 @@ struct intrframe {
        register_t      if_ss;
 };
 
+struct trampframe {
+       register_t      tr_cr2;
+       register_t      tr_r10;
+       register_t      tr_r11;
+       register_t      tr_err;
+       register_t      tr_rip;
+       register_t      tr_cs;
+       register_t      tr_rflags;
+       register_t      tr_rsp;
+       register_t      tr_ss;
+};
+
 int    kdb_trap(int, int, struct trapframe *);
 
 #endif /* _CPU_FRAME_H_ */
index 60fea63..c7ef77e 100644 (file)
@@ -34,8 +34,8 @@
 
 #define MPLOCKED     lock ;
 
-#define APIC_PUSH_FRAME                                                        \
-       PUSH_FRAME ;            /* 15 regs + space for 5 extras */      \
+#define APIC_PUSH_FRAME_TFRIP                                          \
+       PUSH_FRAME_TFRIP ;      /* 15 regs + space for 5 extras */      \
        movq $0,TF_XFLAGS(%rsp) ;                                       \
        movq $0,TF_TRAPNO(%rsp) ;                                       \
        movq $0,TF_ADDR(%rsp) ;                                         \
@@ -48,8 +48,8 @@
  * segment register being changed (e.g. by procfs), which is why syscalls
  * have to use doreti.
  */
-#define APIC_POP_FRAME                                                 \
-       POP_FRAME ;                                                     \
+#define APIC_POP_FRAME(lastinsn)                                       \
+       POP_FRAME(lastinsn)                                             \
 
 #define IOAPICADDR(irq_num) \
        CNAME(ioapic_irqs) + IOAPIC_IRQI_SIZE * (irq_num) + IOAPIC_IRQI_ADDR
        .text ;                                                         \
        SUPERALIGN_TEXT ;                                               \
 IDTVEC(ioapic_intr##irq_num) ;                                         \
-       APIC_PUSH_FRAME ;                                               \
+       APIC_PUSH_FRAME_TFRIP ;                                         \
        FAKE_MCOUNT(TF_RIP(%rsp)) ;                                     \
        MASK_LEVEL_IRQ(irq_num) ;                                       \
        movq    lapic, %rax ;                                           \
@@ -172,12 +172,11 @@ IDTVEC(ioapic_intr##irq_num) ;                                            \
        SUPERALIGN_TEXT
        .globl Xspuriousint
 Xspuriousint:
-       APIC_PUSH_FRAME
+       APIC_PUSH_FRAME_TFRIP
        /* No EOI cycle used here */
        FAKE_MCOUNT(TF_RIP(%rsp))
        MEXITCOUNT
-       APIC_POP_FRAME
-       jmp     doreti_iret
+       APIC_POP_FRAME(jmp doreti_iret)
 
 /*
  * Handle TLB shootdowns.
@@ -188,7 +187,7 @@ Xspuriousint:
        SUPERALIGN_TEXT
        .globl  Xinvltlb
 Xinvltlb:
-       APIC_PUSH_FRAME
+       APIC_PUSH_FRAME_TFRIP
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
        FAKE_MCOUNT(TF_RIP(%rsp))
@@ -213,7 +212,7 @@ Xinvltlb:
        SUPERALIGN_TEXT
        .globl  Xsniff
 Xsniff:
-       APIC_PUSH_FRAME
+       APIC_PUSH_FRAME_TFRIP
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
        FAKE_MCOUNT(TF_RIP(%rsp))
@@ -223,8 +222,7 @@ Xsniff:
        movq    TF_RSP(%rsp),%rax
        movq    %rax,PCPU(sample_sp)
        MEXITCOUNT
-       APIC_POP_FRAME
-       jmp     doreti_iret
+       APIC_POP_FRAME(jmp doreti_iret)
 
 /*
  * Executed by a CPU when it receives an Xcpustop IPI from another CPU,
@@ -240,7 +238,7 @@ Xsniff:
        SUPERALIGN_TEXT
        .globl Xcpustop
 Xcpustop:
-       APIC_PUSH_FRAME
+       APIC_PUSH_FRAME_TFRIP
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
 
@@ -338,7 +336,7 @@ Xcpustop:
        SUPERALIGN_TEXT
        .globl Xipiq
 Xipiq:
-       APIC_PUSH_FRAME
+       APIC_PUSH_FRAME_TFRIP
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
        FAKE_MCOUNT(TF_RIP(%rsp))
@@ -364,14 +362,13 @@ Xipiq:
 1:
        orl     $RQF_IPIQ,PCPU(reqflags)
        MEXITCOUNT
-       APIC_POP_FRAME
-       jmp     doreti_iret
+       APIC_POP_FRAME(jmp doreti_iret)
 
        .text
        SUPERALIGN_TEXT
        .globl Xtimer
 Xtimer:
-       APIC_PUSH_FRAME
+       APIC_PUSH_FRAME_TFRIP
        movq    lapic, %rax
        movl    $0, LA_EOI(%rax)        /* End Of Interrupt to APIC */
        FAKE_MCOUNT(TF_RIP(%rsp))
@@ -404,8 +401,7 @@ Xtimer:
 1:
        orl     $RQF_TIMER,PCPU(reqflags)
        MEXITCOUNT
-       APIC_POP_FRAME
-       jmp     doreti_iret
+       APIC_POP_FRAME(jmp doreti_iret)
 
 MCOUNT_LABEL(bintr)
        INTR_HANDLER(0)
index f0a9d4e..d2c8b3b 100644 (file)
@@ -85,7 +85,7 @@
  * Macro helpers
  */
 #define ICU_PUSH_FRAME                                                 \
-       PUSH_FRAME ;            /* 15 regs + space for 5 extras */      \
+       PUSH_FRAME_TFRIP ;      /* 15 regs + space for 5 extras */      \
        movl $0,TF_XFLAGS(%rsp) ;                                       \
        movl $0,TF_TRAPNO(%rsp) ;                                       \
        movl $0,TF_ADDR(%rsp) ;                                         \
index 376a434..73f072f 100644 (file)
@@ -80,18 +80,18 @@ struct mdglobaldata {
        u_int           gd_unused002;
        u_int           gd_unused003;
        u_int           gd_ss_eflags;
-       pt_entry_t      *gd_cunused0;
-       pt_entry_t      *gd_cunused1;
-       pt_entry_t      *gd_cunused2;
-       pt_entry_t      *gd_cunused3;
+       char            *gd_pcb_rsp;    /* transfer trampoline to td stack */
+       long            gd_pcb_flags;   /* pcb control flags */
+       long            gd_pcb_cr3_iso; /* pcb isolated mmu cr3 */
+       long            gd_pcb_cr3;     /* pcb normal mmu cr3 */
        caddr_t         gd_aunused0;
        caddr_t         gd_aunused1;
        caddr_t         gd_aunused2;
        struct pv_entry *gd_newpv;
        u_int           gd_acpi_id;
        u_int           gd_apic_id;
-       register_t      gd_scratch_rsp;
-       register_t      unused004;
+       register_t      gd_unused004;
+       register_t      gd_unused005;
        register_t      gd_user_fs;     /* current user fs in MSR */
        register_t      gd_user_gs;     /* current user gs in MSR */
        cpumask_t       gd_unused006;
@@ -116,15 +116,21 @@ struct privatespace {
        struct mdglobaldata mdglobaldata;
        char            __filler0[MDGLOBALDATA_PAD];
 
-       /* page 1..4 - CPAGE1,CPAGE2,CPAGE3,PPAGE1 (unused) */
-       char            unused1[PAGE_SIZE];
+       /*
+        * page 1 - trap and interrupt trampoline (rsp0 points to top,
+        *          then minus whatever hardware pushes)
+        */
+       char            reserved1[PAGE_SIZE - sizeof(struct trampframe)];
+       struct trampframe trampoline;
+
+       /* page 2, 3, 4 - CPAGE2,CPAGE3,PPAGE1 (unused) */
        char            unused2[PAGE_SIZE];
        char            unused3[PAGE_SIZE];
        char            unused4[PAGE_SIZE];
 
        /* page 5..4+UPAGES - idle stack (UPAGES pages) */
        char            idlestack[UPAGES * PAGE_SIZE];
-};
+} __packed;
 #define mdcpu                  ((struct mdglobaldata *)_get_mycpu())
 
 #endif
index 3ae8c25..6264e32 100644 (file)
@@ -48,8 +48,9 @@
 #include <machine/npx.h>
 
 struct pcb {
-       register_t      padxx[8];
-       register_t      pcb_cr3;
+       register_t      padxx[7];
+       register_t      pcb_cr3_iso;    /* isolated U (+minimal K) PML4e */
+       register_t      pcb_cr3;        /* U+K PML4e */
        register_t      pcb_r15;
        register_t      pcb_r14;
        register_t      pcb_r13;
@@ -83,10 +84,11 @@ struct pcb {
        struct  pcb_ext *pcb_ext;       /* optional pcb extension */
 };
 
-#define        PCB_DBREGS      0x02    /* process using debug registers */
-#define        PCB_FPUINITDONE 0x08    /* fpu state is initialized */
-#define FP_SOFTFP       0x01    /* process using software fltng pnt emulator */
-#define        FP_VIRTFP       0x04    /* virtual kernel wants exception */
+#define        PCB_DBREGS      0x00000002      /* process using debug registers */
+#define        PCB_FPUINITDONE 0x00000008      /* fpu state is initialized */
+#define PCB_ISOMMU     0x00000010      /* isolated mmu context active */
+#define FP_SOFTFP       0x01           /* process using soft flt emulator */
+#define        FP_VIRTFP       0x04            /* vkernel wants exception */
 
 #ifdef _KERNEL
 void   savectx(struct pcb *);
index 057d1ea..5d657a9 100644 (file)
@@ -287,7 +287,9 @@ RB_PROTOTYPE2(pv_entry_rb_tree, pv_entry, pv_entry,
 
 struct pmap {
        pml4_entry_t            *pm_pml4;       /* KVA of level 4 page table */
+       pml4_entry_t            *pm_pml4_iso;   /* (isolated version) */
        struct pv_entry         *pm_pmlpv;      /* PV entry for pml4 */
+       struct pv_entry         *pm_pmlpv_iso;  /* (isolated version) */
        TAILQ_ENTRY(pmap)       pm_pmnode;      /* list of pmaps */
        RB_HEAD(pv_entry_rb_tree, pv_entry) pm_pvroot;
        int                     pm_count;       /* reference count */
index 40312e6..2e05274 100644 (file)
@@ -81,27 +81,27 @@ MCOUNT_LABEL(user)
 MCOUNT_LABEL(btrap)
 
 /*
- * Interrupts are enabled for all traps, otherwise horrible livelocks
- * can occur with the smp_invltlb and cpusync ode.
+ * Interrupts must be disabled for all traps, otherwise horrible %gs
+ * issues will occur.
  */
-#if 0
-#define        TRAP_NOEN(a)    \
-       subq $TF_RIP,%rsp; \
-       movq $0,TF_XFLAGS(%rsp) ; \
-       movq $(a),TF_TRAPNO(%rsp) ; \
-       movq $0,TF_ADDR(%rsp) ; \
-       movq $0,TF_ERR(%rsp) ; \
-       jmp alltraps_noen
-#endif
 
 /* Regular traps; The cpu does not supply tf_err for these. */
 #define        TRAP(a)  \
-       subq $TF_RIP,%rsp; \
-       movq $0,TF_XFLAGS(%rsp) ; \
-       movq $(a),TF_TRAPNO(%rsp) ; \
-       movq $0,TF_ADDR(%rsp) ; \
-       movq $0,TF_ERR(%rsp) ; \
+       PUSH_FRAME_TFRIP ;                      \
+       movq $0,TF_XFLAGS(%rsp) ;               \
+       movq $(a),TF_TRAPNO(%rsp) ;             \
+       movq $0,TF_ADDR(%rsp) ;                 \
+       movq $0,TF_ERR(%rsp) ;                  \
+       jmp alltraps
+
+/* This group of traps have tf_err already pushed by the cpu */
+#define        TRAP_ERR(a)                             \
+       PUSH_FRAME_TFERR ;                      \
+       movq $(a),TF_TRAPNO(%rsp) ;             \
+       movq $0,TF_ADDR(%rsp) ;                 \
+       movq $0,TF_XFLAGS(%rsp) ;               \
        jmp alltraps
+
 IDTVEC(dbg)
        TRAP(T_TRCTRAP)
 IDTVEC(bpt)
@@ -127,13 +127,6 @@ IDTVEC(fpu)
 IDTVEC(xmm)
        TRAP(T_XMMFLT)
 
-/* This group of traps have tf_err already pushed by the cpu */
-#define        TRAP_ERR(a)     \
-       subq $TF_ERR,%rsp; \
-       movq $(a),TF_TRAPNO(%rsp) ; \
-       movq $0,TF_ADDR(%rsp) ; \
-       movq $0,TF_XFLAGS(%rsp) ; \
-       jmp alltraps
 IDTVEC(tss)
        TRAP_ERR(T_TSSFLT)
 IDTVEC(missing)
@@ -147,20 +140,15 @@ IDTVEC(align)
         * alltraps entry point.  Use swapgs if this is the first time in the
         * kernel from userland.  Reenable interrupts if they were enabled
         * before the trap.
+        *
+        * WARNING!  %gs not available until after our swapgs code
         */
-
        SUPERALIGN_TEXT
        .globl  alltraps
        .type   alltraps,@function
 alltraps:
-       /* Fixup %gs if coming from userland */
-       testb   $SEL_RPL_MASK,TF_CS(%rsp)
-       jz      alltraps_testi
-       swapgs
-alltraps_testi:
-       testq   $PSL_I,TF_RFLAGS(%rsp)
-       jz      alltraps_pushregs
-       sti
+
+#if 0
 alltraps_pushregs:
        movq    %rdi,TF_RDI(%rsp)
 alltraps_pushregs_no_rdi:
@@ -178,6 +166,8 @@ alltraps_pushregs_no_rdi:
        movq    %r13,TF_R13(%rsp)
        movq    %r14,TF_R14(%rsp)
        movq    %r15,TF_R15(%rsp)
+#endif
+       sti
        FAKE_MCOUNT(TF_RIP(%rsp))
        .globl  calltrap
        .type   calltrap,@function
@@ -188,66 +178,27 @@ calltrap:
        MEXITCOUNT
        jmp     doreti                  /* Handle any pending ASTs */
 
-       /*
-        * alltraps_noen entry point.  Unlike alltraps above, we want to
-        * leave the interrupts disabled.
-        */
-       SUPERALIGN_TEXT
-       .globl  alltraps_noen
-       .type   alltraps_noen,@function
-alltraps_noen:
-       /* Fixup %gs if coming from userland */
-       testb   $SEL_RPL_MASK,TF_CS(%rsp)
-       jz      alltraps_pushregs
-       swapgs
-       jmp     alltraps_pushregs
-
 IDTVEC(dblfault)
-       subq    $TF_ERR,%rsp
+       PUSH_FRAME_TFERR
        movq    $T_DOUBLEFLT,TF_TRAPNO(%rsp)
        movq    $0,TF_ADDR(%rsp)
-       movq    $0,TF_ERR(%rsp)
        movq    $0,TF_XFLAGS(%rsp)
-       movq    %rdi,TF_RDI(%rsp)
-       movq    %rsi,TF_RSI(%rsp)
-       movq    %rdx,TF_RDX(%rsp)
-       movq    %rcx,TF_RCX(%rsp)
-       movq    %r8,TF_R8(%rsp)
-       movq    %r9,TF_R9(%rsp)
-       movq    %rax,TF_RAX(%rsp)
-       movq    %rbx,TF_RBX(%rsp)
-       movq    %rbp,TF_RBP(%rsp)
-       movq    %r10,TF_R10(%rsp)
-       movq    %r11,TF_R11(%rsp)
-       movq    %r12,TF_R12(%rsp)
-       movq    %r13,TF_R13(%rsp)
-       movq    %r14,TF_R14(%rsp)
-       movq    %r15,TF_R15(%rsp)
-       testb   $SEL_RPL_MASK,TF_CS(%rsp)
-       jz      1f
-       swapgs
-1:     movq    %rsp, %rdi
+
        cld
+       movq    %rsp, %rdi
        call    dblfault_handler
 2:     hlt
        jmp     2b
 
+       /*
+        * We need to save the contents of %cr2 before PUSH_FRAME* messes
+        * with %cr3.
+        */
 IDTVEC(page)
-       subq    $TF_ERR,%rsp
+       PUSH_FRAME_TFERR_SAVECR2
        movq    $T_PAGEFLT,TF_TRAPNO(%rsp)
-       /* Fixup %gs if coming from userland */
-       testb   $SEL_RPL_MASK,TF_CS(%rsp)
-       jz      1f
-       swapgs
-1:
-       movq    %rdi,TF_RDI(%rsp)       /* free up a GP register */
-       movq    %cr2,%rdi               /* preserve %cr2 before ..  */
-       movq    %rdi,TF_ADDR(%rsp)      /* enabling interrupts. */
        movq    $0,TF_XFLAGS(%rsp)
-       testq   $PSL_I,TF_RFLAGS(%rsp)
-       jz      alltraps_pushregs_no_rdi
-       sti
-       jmp     alltraps_pushregs_no_rdi
+       jmp     alltraps
 
        /*
         * We have to special-case this one.  If we get a trap in doreti() at
@@ -256,48 +207,43 @@ IDTVEC(page)
         * XXX linux has a trap handler for their equivalent of load_gs().
         */
 IDTVEC(prot)
-       subq    $TF_ERR,%rsp
+       pushq   %r10
+       leaq    doreti_iret(%rip),%r10
+       cmpq    %r10,TF_RIP-TF_ERR+8(%rsp)              /* +8 due to pushq */
+       jne     prot_normal
+       testb   $SEL_RPL_MASK,TF_CS-TF_ERR+8(%rsp)      /* +8 due to pushq */
+       jnz     prot_normal
+       swapgs                  /* doreti_iret fault from kernel mode */
+prot_normal:
+       popq    %r10
+       PUSH_FRAME_TFERR
        movq    $T_PROTFLT,TF_TRAPNO(%rsp)
        movq    $0,TF_ADDR(%rsp)
        movq    $0,TF_XFLAGS(%rsp)
-       movq    %rdi,TF_RDI(%rsp)       /* free up a GP register */
-
-       /*
-        * Fixup %gs if coming from userland.  Handle the special case where
-        * %fs faults in doreti at the iretq instruction itself.
-        */
-       leaq    doreti_iret(%rip),%rdi
-       cmpq    %rdi,TF_RIP(%rsp)               /* special iretq fault case */
-       je      2f
-       testb   $SEL_RPL_MASK,TF_CS(%rsp)       /* check if from userland */
-       jz      1f
-2:
-       swapgs
-1:
-       testq   $PSL_I,TF_RFLAGS(%rsp)
-       jz      alltraps_pushregs_no_rdi
-       sti
-       jmp     alltraps_pushregs_no_rdi
+       jmp     alltraps
 
 /*
  * Fast syscall entry point.  We enter here with just our new %cs/%ss set,
  * and the new privilige level.  We are still running on the old user stack
  * pointer.  We have to juggle a few things around to find our stack etc.
  * swapgs gives us access to our PCPU space only.
+ *
+ * We use GD_TRAMPOLINE+TR_R10
  */
 IDTVEC(fast_syscall)
-       swapgs
-       movq    %rsp,PCPU(scratch_rsp)
-       movq    PCPU(common_tss) + TSS_RSP0, %rsp
+       swapgs                                  /* get kernel %gs */
+       movq    %rsp,PCPU(trampoline)+TR_R10    /* save user %rsp */
+       movq    PCPU(pcb_rsp),%rsp
+       KMMUENTER_SYSCALL
+
        /* Now emulate a trapframe. Make the 8 byte alignment odd for call. */
        subq    $TF_SIZE,%rsp
        /* defer TF_RSP till we have a spare register */
        movq    %r11,TF_RFLAGS(%rsp)
        movq    %rcx,TF_RIP(%rsp)       /* %rcx original value is in %r10 */
-       movq    PCPU(scratch_rsp),%r11  /* %r11 already saved */
+       movq    PCPU(trampoline)+TR_R10,%r11    /* %r11 already saved */
        movq    %r11,TF_RSP(%rsp)       /* user stack pointer */
        orl     $RQF_QUICKRET,PCPU(reqflags)
-       sti
        movq    $KUDSEL,TF_SS(%rsp)
        movq    $KUCSEL,TF_CS(%rsp)
        movq    $2,TF_ERR(%rsp)
@@ -316,6 +262,7 @@ IDTVEC(fast_syscall)
        movq    %r13,TF_R13(%rsp)       /* C preserved */
        movq    %r14,TF_R14(%rsp)       /* C preserved */
        movq    %r15,TF_R15(%rsp)       /* C preserved */
+       sti
        FAKE_MCOUNT(TF_RIP(%rsp))
        movq    %rsp, %rdi
        call    syscall2
@@ -336,8 +283,10 @@ IDTVEC(fast_syscall)
        movq    TF_RFLAGS(%rsp),%r11
        movq    TF_RIP(%rsp),%rcx
        movq    TF_RSP(%rsp),%rsp
+       KMMUEXIT_SYSCALL
        swapgs
        sysretq
+
        /*
         * Normal slow / full iret
         */
@@ -376,64 +325,19 @@ IDTVEC(fast_syscall32)
  */
 
 IDTVEC(nmi)
-       subq    $TF_RIP,%rsp
-       movq    $(T_NMI),TF_TRAPNO(%rsp)
+       PUSH_FRAME_TFRIP
+       movq    $0,TF_XFLAGS(%rsp)
+       movq    $T_NMI,TF_TRAPNO(%rsp)
        movq    $0,TF_ADDR(%rsp)
        movq    $0,TF_ERR(%rsp)
-       movq    $0,TF_XFLAGS(%rsp)
-       movq    %rdi,TF_RDI(%rsp)
-       movq    %rsi,TF_RSI(%rsp)
-       movq    %rdx,TF_RDX(%rsp)
-       movq    %rcx,TF_RCX(%rsp)
-       movq    %r8,TF_R8(%rsp)
-       movq    %r9,TF_R9(%rsp)
-       movq    %rax,TF_RAX(%rsp)
-       movq    %rbx,TF_RBX(%rsp)
-       movq    %rbp,TF_RBP(%rsp)
-       movq    %r10,TF_R10(%rsp)
-       movq    %r11,TF_R11(%rsp)
-       movq    %r12,TF_R12(%rsp)
-       movq    %r13,TF_R13(%rsp)
-       movq    %r14,TF_R14(%rsp)
-       movq    %r15,TF_R15(%rsp)
-       xorl    %ebx,%ebx
-       testb   $SEL_RPL_MASK,TF_CS(%rsp)
-       jnz     nmi_needswapgs          /* we came from userland */
-       movl    $MSR_GSBASE,%ecx
-       rdmsr
-       cmpl    $VM_MAX_USER_ADDRESS >> 32,%edx
-       jae     nmi_calltrap            /* GS.base holds a kernel VA */
-nmi_needswapgs:
-       incl    %ebx
-       swapgs
-/* Note: this label is also used by ddb and gdb: */
-nmi_calltrap:
+
        FAKE_MCOUNT(TF_RIP(%rsp))
        cld
        movq    %rsp, %rdi
        call    trap
        MEXITCOUNT
-       testl   %ebx,%ebx
-       jz      nmi_restoreregs
-       swapgs
-nmi_restoreregs:
-       movq    TF_RDI(%rsp),%rdi
-       movq    TF_RSI(%rsp),%rsi
-       movq    TF_RDX(%rsp),%rdx
-       movq    TF_RCX(%rsp),%rcx
-       movq    TF_R8(%rsp),%r8
-       movq    TF_R9(%rsp),%r9
-       movq    TF_RAX(%rsp),%rax
-       movq    TF_RBX(%rsp),%rbx
-       movq    TF_RBP(%rsp),%rbp
-       movq    TF_R10(%rsp),%r10
-       movq    TF_R11(%rsp),%r11
-       movq    TF_R12(%rsp),%r12
-       movq    TF_R13(%rsp),%r13
-       movq    TF_R14(%rsp),%r14
-       movq    TF_R15(%rsp),%r15
-       addq    $TF_RIP,%rsp
-       iretq
+
+       POP_FRAME(jmp doreti_iret)
 
 /*
  * This function is what cpu_heavy_restore jumps to after a new process
index 0ae3f04..6b2c638 100644 (file)
@@ -115,6 +115,7 @@ ASSYM(GD_CPUMASK_SIMPLE, offsetof(struct mdglobaldata, mi.gd_cpumask_simple));
 ASSYM(GD_CPUMASK_OFFSET, offsetof(struct mdglobaldata, mi.gd_cpumask_offset));
 ASSYM(GD_IRESERVED, offsetof(struct mdglobaldata, mi.gd_ireserved[0]));
 
+ASSYM(PCB_CR3_ISO, offsetof(struct pcb, pcb_cr3_iso));
 ASSYM(PCB_CR3, offsetof(struct pcb, pcb_cr3));
 ASSYM(PCB_R15, offsetof(struct pcb, pcb_r15));
 ASSYM(PCB_R14, offsetof(struct pcb, pcb_r14));
@@ -133,7 +134,10 @@ ASSYM(PCB_DR2, offsetof(struct pcb, pcb_dr2));
 ASSYM(PCB_DR3, offsetof(struct pcb, pcb_dr3));
 ASSYM(PCB_DR6, offsetof(struct pcb, pcb_dr6));
 ASSYM(PCB_DR7, offsetof(struct pcb, pcb_dr7));
+
 ASSYM(PCB_DBREGS, PCB_DBREGS);
+ASSYM(PCB_ISOMMU, PCB_ISOMMU);
+
 ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext));
 ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
 ASSYM(PCB_ONFAULT, offsetof(struct pcb, pcb_onfault));
@@ -200,15 +204,29 @@ ASSYM(FIRST_SOFTINT, FIRST_SOFTINT);
 ASSYM(MDGLOBALDATA_BASEALLOC_PAGES, MDGLOBALDATA_BASEALLOC_PAGES);
 
 ASSYM(GD_PRIVATE_TSS, offsetof(struct mdglobaldata, gd_private_tss));
-ASSYM(GD_SCRATCH_RSP, offsetof(struct mdglobaldata, gd_scratch_rsp));
+ASSYM(GD_TRAMPOLINE, offsetof(struct privatespace, trampoline));
 ASSYM(GD_USER_FS, offsetof(struct mdglobaldata, gd_user_fs));
 ASSYM(GD_USER_GS, offsetof(struct mdglobaldata, gd_user_gs));
 ASSYM(GD_INTR_NESTING_LEVEL, offsetof(struct mdglobaldata, mi.gd_intr_nesting_level));
 
+ASSYM(TR_CR2, offsetof(struct trampframe, tr_cr2));
+ASSYM(TR_R10, offsetof(struct trampframe, tr_r10));
+ASSYM(TR_R11, offsetof(struct trampframe, tr_r11));
+ASSYM(TR_ERR, offsetof(struct trampframe, tr_err));
+ASSYM(TR_RIP, offsetof(struct trampframe, tr_rip));
+ASSYM(TR_CS, offsetof(struct trampframe, tr_cs));
+ASSYM(TR_RFLAGS, offsetof(struct trampframe, tr_rflags));
+ASSYM(TR_RSP, offsetof(struct trampframe, tr_rsp));
+ASSYM(TR_SS, offsetof(struct trampframe, tr_ss));
+
 ASSYM(GD_IPENDING, offsetof(struct mdglobaldata, gd_ipending));
 ASSYM(GD_SPENDING, offsetof(struct mdglobaldata, gd_spending));
 ASSYM(GD_COMMON_TSS, offsetof(struct mdglobaldata, gd_common_tss));
 ASSYM(GD_COMMON_TSSD, offsetof(struct mdglobaldata, gd_common_tssd));
+ASSYM(GD_PCB_RSP, offsetof(struct mdglobaldata, gd_pcb_rsp));
+ASSYM(GD_PCB_FLAGS, offsetof(struct mdglobaldata, gd_pcb_flags));
+ASSYM(GD_PCB_CR3_ISO, offsetof(struct mdglobaldata, gd_pcb_cr3_iso));
+ASSYM(GD_PCB_CR3, offsetof(struct mdglobaldata, gd_pcb_cr3));
 ASSYM(GD_TSS_GDT, offsetof(struct mdglobaldata, gd_tss_gdt));
 ASSYM(GD_NPXTHREAD, offsetof(struct mdglobaldata, gd_npxthread));
 ASSYM(GD_FPU_LOCK, offsetof(struct mdglobaldata, gd_fpu_lock));
index ce612a5..ea9bc0c 100644 (file)
         * Define layout of the global data.  On SMP this lives in
         * the per-cpu address space, otherwise it's in the data segment.
         */
+       .globl  gd_trampoline
+       .globl  gd_pcb_rsp, gd_pcb_flags, gd_pcb_cr3_iso, gd_pcb_cr3
        .globl  gd_curthread, gd_npxthread, gd_reqflags, gd_common_tss
+       .set    gd_trampoline,globaldata + GD_TRAMPOLINE
+       .set    gd_pcb_rsp,globaldata + GD_PCB_RSP
+       .set    gd_pcb_flags,globaldata + GD_PCB_FLAGS
+       .set    gd_pcb_cr3_iso,globaldata + GD_PCB_CR3_ISO
+       .set    gd_pcb_cr3,globaldata + GD_PCB_CR3
        .set    gd_curthread,globaldata + GD_CURTHREAD
        .set    gd_npxthread,globaldata + GD_NPXTHREAD
        .set    gd_reqflags,globaldata + GD_REQFLAGS
@@ -77,7 +84,6 @@
        .globl  gd_ss_eflags, gd_intr_nesting_level
        .globl  gd_spending, gd_ipending
        .globl  gd_cnt, gd_private_tss
-       .globl  gd_scratch_rsp
        .globl  gd_user_fs, gd_user_gs
        .globl  gd_sample_pc
        .globl  gd_sample_sp
        .set    gd_ipending,globaldata + GD_IPENDING
        .set    gd_spending,globaldata + GD_SPENDING
        .set    gd_cnt,globaldata + GD_CNT
-       .set    gd_scratch_rsp,globaldata + GD_SCRATCH_RSP
        .set    gd_user_fs,globaldata + GD_USER_FS
        .set    gd_user_gs,globaldata + GD_USER_GS
        .set    gd_sample_pc,globaldata + GD_SAMPLE_PC
index a6d6093..cd32c27 100644 (file)
@@ -206,7 +206,7 @@ doreti_next:
        .globl  doreti_iret
        .globl  doreti_syscall_ret
 doreti_syscall_ret:
-       POP_FRAME               /* registers and %gs (+cli) */
+       POP_FRAME()             /* registers and %gs (+cli) */
        /* WARNING: special global doreti_iret is  also used by exception.S */
 doreti_iret:
        iretq
index afc338a..634825d 100644 (file)
@@ -2519,12 +2519,16 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
                        ioapic_enable = 1;
        }
 
-       /* make an initial tss so cpu can get interrupt stack on syscall! */
+       /*
+        * TSS entry point for interrupts, traps, and exceptions
+        * (sans NMI).  This will always go to the top of the pcpu
+        * trampoline area.  Hardware-pushed data will be copied into
+        * the trap-frame on entry, and (if necessary) returned to the
+        * trampoline on exit.
+        */
        gd->gd_common_tss.tss_rsp0 =
-               (register_t)(thread0.td_kstack +
-                            KSTACK_PAGES * PAGE_SIZE - sizeof(struct pcb));
-       /* Ensure the stack is aligned to 16 bytes */
-       gd->gd_common_tss.tss_rsp0 &= ~(register_t)0xF;
+               (register_t)(&CPU_prvspace[0]->trampoline + 1);
+       gd->gd_pcb_rsp = (void *)gd->gd_common_tss.tss_rsp0;
 
        /* double fault stack */
        gd->gd_common_tss.tss_ist1 =
index 9432311..888de18 100644 (file)
@@ -285,7 +285,13 @@ init_secondary(void)
 
        md = mdcpu;     /* loaded through %gs:0 (mdglobaldata.mi.gd_prvspace)*/
 
-       md->gd_common_tss.tss_rsp0 = 0; /* not used until after switch */
+       /*
+        * Each cpu gets its own trampoline area for interrupts, traps, and
+        * exceptions.
+        */
+       md->gd_common_tss.tss_rsp0 =
+               (register_t)(&CPU_prvspace[md->mi.gd_cpuid]->trampoline + 1);
+       md->gd_pcb_rsp = (void *)md->gd_common_tss.tss_rsp0;
 #if 0 /* JG XXX */
        md->gd_common_tss.tss_ioopt = (sizeof md->gd_common_tss) << 16;
 #endif
index a1cc604..853b975 100644 (file)
@@ -22,7 +22,7 @@
 #define IRQ_LIDX(irq_num)      ((irq_num) >> 6)
 
 #define MSI_PUSH_FRAME                                                 \
-       PUSH_FRAME ;            /* 15 regs + space for 5 extras */      \
+       PUSH_FRAME_TFRIP ;      /* 15 regs + space for 5 extras */      \
        movq $0,TF_XFLAGS(%rsp) ;                                       \
        movq $0,TF_TRAPNO(%rsp) ;                                       \
        movq $0,TF_ADDR(%rsp) ;                                         \
index 6cd4e71..44de7b5 100644 (file)
@@ -251,6 +251,9 @@ SYSCTL_INT(_machdep, OID_AUTO, pmap_dynamic_delete, CTLFLAG_RW,
 int pmap_lock_delay = 100;
 SYSCTL_INT(_machdep, OID_AUTO, pmap_lock_delay, CTLFLAG_RW,
     &pmap_lock_delay, 0, "Spin loops");
+static int vm_isolated_user_pmap = 0;
+SYSCTL_INT(_vm, OID_AUTO, isolated_user_pmap, CTLFLAG_RW,
+    &vm_isolated_user_pmap, 0, "Userland pmap isolation");
 
 static int pmap_nx_enable = 0;
 /* needs manual TUNABLE in early probe, see below */
@@ -2005,15 +2008,16 @@ pmap_pinit(struct pmap *pmap)
        if (pmap->pm_pml4 == NULL) {
                pmap->pm_pml4 =
                    (pml4_entry_t *)kmem_alloc_pageable(&kernel_map,
-                                                       PAGE_SIZE,
+                                                       PAGE_SIZE * 2,
                                                        VM_SUBSYS_PML4);
+               pmap->pm_pml4_iso = (void *)((char *)pmap->pm_pml4 + PAGE_SIZE);
        }
 
        /*
-        * Allocate the page directory page, which wires it even though
-        * it isn't being entered into some higher level page table (it
-        * being the highest level).  If one is already cached we don't
-        * have to do anything.
+        * Allocate the PML4e table, which wires it even though it isn't
+        * being entered into some higher level page table (it being the
+        * highest level).  If one is already cached we don't have to do
+        * anything.
         */
        if ((pv = pmap->pm_pmlpv) == NULL) {
                pv = pmap_allocpte(pmap, pmap_pml4_pindex(), NULL);
@@ -2053,9 +2057,36 @@ pmap_pinit(struct pmap *pmap)
                KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
        }
        KKASSERT(pmap->pm_pml4[255] == 0);
-       KKASSERT(RB_ROOT(&pmap->pm_pvroot) == pv);
-       KKASSERT(pv->pv_entry.rbe_left == NULL);
-       KKASSERT(pv->pv_entry.rbe_right == NULL);
+
+       /*
+        * When implementing an isolated userland pmap, a second PML4e table
+        * is needed.  We use pmap_pml4_pindex() + 1 for convenience, but
+        * note that we do not operate on this table using our API functions
+        * so handling of the + 1 case is mostly just to prevent implosions.
+        */
+       if ((pv = pmap->pm_pmlpv_iso) == NULL && vm_isolated_user_pmap) {
+               pv = pmap_allocpte(pmap, pmap_pml4_pindex() + 1, NULL);
+               pmap->pm_pmlpv_iso = pv;
+               pmap_kenter((vm_offset_t)pmap->pm_pml4_iso,
+                           VM_PAGE_TO_PHYS(pv->pv_m));
+               pv_put(pv);
+
+               /*
+                * Install just enough KMAP for our trampoline.  DMAP not
+                * needed at all.  XXX
+                */
+               for (j = 0; j < NKPML4E; ++j) {
+                       pmap->pm_pml4_iso[KPML4I + j] =
+                           (KPDPphys + ((vm_paddr_t)j << PAGE_SHIFT)) |
+                           pmap->pmap_bits[PG_RW_IDX] |
+                           pmap->pmap_bits[PG_V_IDX] |
+                           pmap->pmap_bits[PG_U_IDX];
+               }
+               KKASSERT(pmap->pm_pml4_iso[255] == 0);
+       } else if (pv) {
+               KKASSERT(pv->pv_m->flags & PG_MAPPED);
+               KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
+       }
 }
 
 /*
@@ -2083,18 +2114,30 @@ pmap_puninit(pmap_t pmap)
                KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED));
                vm_page_unwire(p, 0);
                vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
-
-               /*
-                * XXX eventually clean out PML4 static entries and
-                * use vm_page_free_zero()
-                */
                vm_page_free(p);
                pmap->pm_pmlpv = NULL;
        }
+       if ((pv = pmap->pm_pmlpv_iso) != NULL) {
+               if (pv_hold_try(pv) == 0)
+                       pv_lock(pv);
+               KKASSERT(pv == pmap->pm_pmlpv_iso);
+               p = pmap_remove_pv_page(pv);
+               pv_free(pv, NULL);
+               pv = NULL;      /* safety */
+               pmap_kremove((vm_offset_t)pmap->pm_pml4_iso);
+               vm_page_busy_wait(p, FALSE, "pgpun");
+               KKASSERT(p->flags & (PG_FICTITIOUS|PG_UNMANAGED));
+               vm_page_unwire(p, 0);
+               vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
+               vm_page_free(p);
+               pmap->pm_pmlpv_iso = NULL;
+       }
        if (pmap->pm_pml4) {
                KKASSERT(pmap->pm_pml4 != (void *)(PTOV_OFFSET + KPML4phys));
-               kmem_free(&kernel_map, (vm_offset_t)pmap->pm_pml4, PAGE_SIZE);
+               kmem_free(&kernel_map,
+                         (vm_offset_t)pmap->pm_pml4, PAGE_SIZE * 2);
                pmap->pm_pml4 = NULL;
+               pmap->pm_pml4_iso = NULL;
        }
        KKASSERT(pmap->pm_stats.resident_count == 0);
        KKASSERT(pmap->pm_stats.wired_count == 0);
@@ -2123,6 +2166,7 @@ pv_entry_t
 pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
 {
        pt_entry_t *ptep;
+       pt_entry_t *ptep_iso;
        pv_entry_t pv;
        pv_entry_t pvp;
        pt_entry_t v;
@@ -2292,6 +2336,10 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
                     pmap->pmap_bits[PG_A_IDX] |
                     pmap->pmap_bits[PG_M_IDX]);
                ptep = pv_pte_lookup(pvp, ptepindex);
+               if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso)
+                       ptep_iso = pv_pte_lookup(pmap->pm_pmlpv_iso, ptepindex);
+               else
+                       ptep_iso  = NULL;
                if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
                        pt_entry_t pte;
 
@@ -2299,7 +2347,12 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
                                panic("pmap_allocpte: unexpected pte %p/%d",
                                      pvp, (int)ptepindex);
                        }
-                       pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1, ptep, v);
+                       pte = pmap_inval_smp(pmap, (vm_offset_t)-1, 1,
+                                            ptep, v);
+                       if (ptep_iso) {
+                               pmap_inval_smp(pmap, (vm_offset_t)-1, 1,
+                                              ptep_iso, v);
+                       }
                        if (vm_page_unwire_quick(
                                        PHYS_TO_VM_PAGE(pte & PG_FRAME))) {
                                panic("pmap_allocpte: shared pgtable "
@@ -2309,6 +2362,8 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
                        pt_entry_t pte;
 
                        pte = atomic_swap_long(ptep, v);
+                       if (ptep_iso)
+                               atomic_swap_long(ptep_iso, v);
                        if (pte != 0) {
                                kprintf("install pgtbl mixup 0x%016jx "
                                        "old/new 0x%016jx/0x%016jx\n",
@@ -2674,12 +2729,19 @@ pmap_release(struct pmap *pmap)
 
 
        /*
-        * One resident page (the pml4 page) should remain.
+        * One resident page (the pml4 page) should remain.  Two if
+        * the pmap has implemented an isolated userland PML4E table.
         * No wired pages should remain.
         */
+       int expected_res = 0;
+
+       if ((pmap->pm_flags & PMAP_FLAG_SIMPLE) == 0)
+               ++expected_res;
+       if (pmap->pm_pmlpv_iso)
+               ++expected_res;
+
 #if 1
-       if (pmap->pm_stats.resident_count !=
-           ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1) ||
+       if (pmap->pm_stats.resident_count != expected_res ||
            pmap->pm_stats.wired_count != 0) {
                kprintf("fatal pmap problem - pmap %p flags %08x "
                        "rescnt=%jd wirecnt=%jd\n",
@@ -2690,8 +2752,7 @@ pmap_release(struct pmap *pmap)
                tsleep(pmap, 0, "DEAD", 0);
        }
 #else
-       KKASSERT(pmap->pm_stats.resident_count ==
-                ((pmap->pm_flags & PMAP_FLAG_SIMPLE) ? 0 : 1));
+       KKASSERT(pmap->pm_stats.resident_count == expected_res);
        KKASSERT(pmap->pm_stats.wired_count == 0);
 #endif
 }
@@ -2749,13 +2810,9 @@ pmap_release_callback(pv_entry_t pv, void *data)
                pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL;
        } else if (pv->pv_pindex < pmap_pml4_pindex()) {
                /*
-                * I am PDP, parent is PML4 (there's only one)
+                * I am PDP, parent is PML4.  We always calculate the
+                * normal PML4 here, not the isolated PML4.
                 */
-#if 0
-               pindex = (pv->pv_pindex - NUPTE_TOTAL - NUPT_TOTAL -
-                          NUPD_TOTAL) >> NPML4EPGSHIFT;
-               pindex += NUPTE_TOTAL + NUPT_TOTAL + NUPD_TOTAL + NUPDP_TOTAL;
-#endif
                pindex = pmap_pml4_pindex();
        } else {
                /*
@@ -2829,8 +2886,10 @@ pmap_release_pv(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk)
         *
         * Since we are leaving the top-level pv intact we need
         * to break out of what would otherwise be an infinite loop.
+        *
+        * This covers both the normal and the isolated PML4 page.
         */
-       if (pv->pv_pindex == pmap_pml4_pindex()) {
+       if (pv->pv_pindex >= pmap_pml4_pindex()) {
                pv_put(pv);
                return(-1);
        }
@@ -2895,9 +2954,13 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk,
 
        KKASSERT(pmap);
 
-       if (ptepindex == pmap_pml4_pindex()) {
+       if (ptepindex >= pmap_pml4_pindex()) {
                /*
                 * We are the top level PML4E table, there is no parent.
+                *
+                * This is either the normal or isolated PML4E table.
+                * Only the normal is used in regular operation, the isolated
+                * Only the normal is used in regular operation; the isolated
                 */
                p = pmap->pm_pmlpv->pv_m;
                KKASSERT(pv->pv_m == p);        /* debugging */
@@ -2910,6 +2973,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk,
                vm_pindex_t pml4_pindex;
                vm_pindex_t pdp_index;
                pml4_entry_t *pdp;
+               pml4_entry_t *pdp_iso;
 
                pdp_index = ptepindex - pmap_pdp_pindex(0);
                if (pvp == NULL) {
@@ -2923,6 +2987,16 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk,
                KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0);
                p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
                pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp, 0);
+
+               /*
+                * Also remove the PDP from the isolated PML4E if the
+                * process uses one.
+                */
+               if (pvp == pmap->pm_pmlpv && pmap->pm_pmlpv_iso) {
+                       pdp_iso = &pmap->pm_pml4_iso[pdp_index &
+                                               ((1ul << NPML4EPGSHIFT) - 1)];
+                       pmap_inval_bulk(bulk, (vm_offset_t)-1, pdp_iso, 0);
+               }
                KKASSERT(pv->pv_m == p);        /* debugging */
        } else if (ptepindex >= pmap_pd_pindex(0)) {
                /*
@@ -3129,7 +3203,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, pmap_inval_bulk_t *bulk,
                    pvp->pv_m &&
                    pvp->pv_m->wire_count == 1 &&
                    (pvp->pv_hold & PV_HOLD_MASK) == 2 &&
-                   pvp->pv_pindex != pmap_pml4_pindex()) {
+                   pvp->pv_pindex < pmap_pml4_pindex()) {
                        if (pmap_dynamic_delete == 2)
                                kprintf("A %jd %08x\n", pvp->pv_pindex, pvp->pv_hold);
                        if (pmap != &kernel_pmap) {
@@ -4606,7 +4680,7 @@ pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
                    pt_pv->pv_m &&
                    pt_pv->pv_m->wire_count == 1 &&
                    (pt_pv->pv_hold & PV_HOLD_MASK) == 2 &&
-                   pt_pv->pv_pindex != pmap_pml4_pindex()) {
+                   pt_pv->pv_pindex < pmap_pml4_pindex()) {
                        if (pmap_dynamic_delete == 2)
                                kprintf("B %jd %08x\n", pt_pv->pv_pindex, pt_pv->pv_hold);
                        pv_hold(pt_pv); /* extra hold */
@@ -6214,14 +6288,16 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
 {
        struct vmspace *oldvm;
        struct pmap *pmap;
+       thread_t td;
 
        oldvm = lp->lwp_vmspace;
 
        if (oldvm != newvm) {
                crit_enter();
+               td = curthread;
                KKASSERT((newvm->vm_refcnt & VM_REF_DELETED) == 0);
                lp->lwp_vmspace = newvm;
-               if (curthread->td_lwp == lp) {
+               if (td->td_lwp == lp) {
                        pmap = vmspace_pmap(newvm);
                        ATOMIC_CPUMASK_ORBIT(pmap->pm_active, mycpu->gd_cpuid);
                        if (pmap->pm_active_lock & CPULOCK_EXCL)
@@ -6230,13 +6306,42 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
                        tlb_flush_count++;
 #endif
                        if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) {
-                               curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
+                               td->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
+                               if (vm_isolated_user_pmap &&
+                                   pmap->pm_pmlpv_iso) {
+                                       td->td_pcb->pcb_cr3_iso =
+                                               vtophys(pmap->pm_pml4_iso);
+                                       td->td_pcb->pcb_flags |= PCB_ISOMMU;
+                               } else {
+                                       td->td_pcb->pcb_cr3_iso = 0;
+                                       td->td_pcb->pcb_flags &= ~PCB_ISOMMU;
+                               }
                        } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) {
-                               curthread->td_pcb->pcb_cr3 = KPML4phys;
+                               td->td_pcb->pcb_cr3 = KPML4phys;
+                               td->td_pcb->pcb_cr3_iso = 0;
+                               td->td_pcb->pcb_flags &= ~PCB_ISOMMU;
                        } else {
                                panic("pmap_setlwpvm: unknown pmap type\n");
                        }
-                       load_cr3(curthread->td_pcb->pcb_cr3);
+
+                       /*
+                        * The MMU separation fields need to be updated.
+                        * (it can't access the pcb directly from the
+                        * restricted user pmap).
+                        */
+                       if (td == curthread) {
+                               mdcpu->gd_pcb_cr3 = td->td_pcb->pcb_cr3;
+                               mdcpu->gd_pcb_cr3_iso = td->td_pcb->pcb_cr3_iso;
+                               mdcpu->gd_pcb_flags = td->td_pcb->pcb_flags;
+                               /* gd_pcb_rsp doesn't change */
+                       }
+
+                       /*
+                        * In kernel-land we always use the normal PML4E
+                        * so the kernel is fully mapped and can also access
+                        * user memory.
+                        */
+                       load_cr3(td->td_pcb->pcb_cr3);
                        pmap = vmspace_pmap(oldvm);
                        ATOMIC_CPUMASK_NANDBIT(pmap->pm_active,
                                               mycpu->gd_cpuid);
index a7781ef..799b53b 100644 (file)
@@ -362,7 +362,13 @@ END(cpu_exit_switch)
 
 ENTRY(cpu_heavy_restore)
        movq    TD_PCB(%rax),%rdx               /* RDX = PCB */
-       movq    %rdx, PCPU(common_tss) + TSS_RSP0
+       movq    %rdx, PCPU(pcb_rsp)
+       movq    PCB_FLAGS(%rdx), %rcx
+       movq    %rcx, PCPU(pcb_flags)
+       movq    PCB_CR3_ISO(%rdx), %rcx
+       movq    %rcx, PCPU(pcb_cr3_iso)
+       movq    PCB_CR3(%rdx), %rcx
+       movq    %rcx, PCPU(pcb_cr3)
        popfq
 
 #if defined(SWTCH_OPTIM_STATS)
@@ -459,15 +465,22 @@ ENTRY(cpu_heavy_restore)
        jnz     2f
 #endif
 
+#if 0
        /*
-        * Going back to the common_tss.  We may need to update TSS_RSP0
-        * which sets the top of the supervisor stack when entering from
-        * usermode.  The PCB is at the top of the stack but we need another
-        * 16 bytes to take vm86 into account.
-        */
-       movq    %rdx,%rcx
-       /*leaq  -TF_SIZE(%rdx),%rcx*/
-       movq    %rcx, PCPU(common_tss) + TSS_RSP0
+        * Going back to the common_tss.  (this was already executed at
+        * the top).
+        *
+        * Set the top of the supervisor stack for the new thread
+        * in gd_pcb_rsp so the trampoline code can load it into %rsp.
+        */
+       movq    %rdx, PCPU(pcb_rsp)
+       movq    PCB_FLAGS(%rdx), %rcx
+       movq    %rcx, PCPU(pcb_flags)
+       movq    PCB_CR3_ISO(%rdx), %rcx
+       movq    %rcx, PCPU(pcb_cr3_iso)
+       movq    PCB_CR3(%rdx), %rcx
+       movq    %rcx, PCPU(pcb_cr3)
+#endif
 
 #if 0 /* JG */
        cmpl    $0,PCPU(private_tss)    /* don't have to reload if      */
@@ -855,15 +868,6 @@ ENTRY(cpu_lwkt_restore)
        je      1f
        movq    %rcx,%cr3
 1:
-       /*
-        * Safety, clear RSP0 in the tss so it isn't pointing at the
-        * previous thread's kstack (if a heavy weight user thread).
-        * RSP0 should only be used in ring 3 transitions and kernel
-        * threads run in ring 0 so there should be none.
-        */
-       xorq    %rdx,%rdx
-       movq    %rdx, PCPU(common_tss) + TSS_RSP0
-
        /*
         * NOTE: %rbx is the previous thread and %rax is the new thread.
         *       %rbx is retained throughout so we can return it.
index 98d6395..5c644de 100644 (file)
@@ -88,6 +88,7 @@ void
 cpu_fork(struct lwp *lp1, struct lwp *lp2, int flags)
 {
        struct pcb *pcb2;
+       struct pmap *pmap2;
 
        if ((flags & RFPROC) == 0) {
                if ((flags & RFMEM) == 0) {
@@ -146,8 +147,16 @@ cpu_fork(struct lwp *lp1, struct lwp *lp2, int flags)
        /*
         * Set registers for trampoline to user mode.  Leave space for the
         * return address on stack.  These are the kernel mode register values.
+        *
+        * Set the new pmap CR3.  If the new process uses isolated VM spaces,
+        * also set the isolated CR3.
         */
-       pcb2->pcb_cr3 = vtophys(vmspace_pmap(lp2->lwp_proc->p_vmspace)->pm_pml4);
+       pmap2 = vmspace_pmap(lp2->lwp_proc->p_vmspace);
+       pcb2->pcb_cr3 = vtophys(pmap2->pm_pml4);
+       if (pcb2->pcb_flags & PCB_ISOMMU)
+               pcb2->pcb_cr3_iso = vtophys(pmap2->pm_pml4_iso);
+       else
+               pcb2->pcb_cr3_iso = 0;
        pcb2->pcb_rbx = (unsigned long)fork_return;     /* fork_trampoline argument */
        pcb2->pcb_rbp = 0;
        pcb2->pcb_rsp = (unsigned long)lp2->lwp_md.md_regs - sizeof(void *);
@@ -164,7 +173,7 @@ cpu_fork(struct lwp *lp1, struct lwp *lp2, int flags)
        /*
         * pcb2->pcb_ldt:       duplicated below, if necessary.
         * pcb2->pcb_savefpu:   cloned above.
-        * pcb2->pcb_flags:     cloned above (always 0 here?).
+        * pcb2->pcb_flags:     cloned above
         * pcb2->pcb_onfault:   cloned above (always NULL here).
         * pcb2->pcb_onfault_sp:cloned above (dont care)
         */