From fc92147724be4a0985994579fe0e42c08c03d861 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Thu, 4 Jan 2018 10:34:51 -0800 Subject: [PATCH] kernel - Intel user/kernel separation MMU bug fix part 2/3 * Cleanup pass. Throw in some documentation. * Move the gd_pcb_* fields into the trampoline page to allow kernel memory to be further restricted in part 3. --- sys/cpu/x86_64/include/asmacros.h | 44 +++++++++++++++++--------- sys/cpu/x86_64/include/frame.h | 15 +++++++++ sys/platform/pc64/include/globaldata.h | 10 +++--- sys/platform/pc64/x86_64/exception.S | 2 +- sys/platform/pc64/x86_64/genassym.c | 8 ++--- sys/platform/pc64/x86_64/global.s | 5 --- sys/platform/pc64/x86_64/machdep.c | 13 ++++++-- sys/platform/pc64/x86_64/mp_machdep.c | 18 ++++++++--- sys/platform/pc64/x86_64/pmap.c | 11 ++++--- sys/platform/pc64/x86_64/swtch.s | 16 +++++----- 10 files changed, 94 insertions(+), 48 deletions(-) diff --git a/sys/cpu/x86_64/include/asmacros.h b/sys/cpu/x86_64/include/asmacros.h index 948151c0f2..2e5e53cdbb 100644 --- a/sys/cpu/x86_64/include/asmacros.h +++ b/sys/cpu/x86_64/include/asmacros.h @@ -153,6 +153,20 @@ * execute a cmp/branch sequence, detect timing. Iterate cmp $values * to suss-out content of speculatively read kernel memory. * + * We do this by creating a trampoline area for all user->kernel and + * kernel->user transitions. The trampoline area allows us to limit + * the reach of the kernel map in the isolated version of the user pmap + * to JUST the trampoline area (for all cpus), tss, and vector area. + * + * It is very important that these transitions not access any memory + * outside of the trampoline page while the isolated user process pmap + * is active in %cr3. + * + * The trampoline does not add much overhead when pmap isolation is + * disabled, so we just run with it regardless. Of course, when pmap + * isolation is enabled, the %cr3 loads add 150-250ns to every system + * call as well as (without PCID) smash the TLB. 
+ * * KMMUENTER - Executed by the trampoline when a user->kernel transition * is detected. The stack pointer points into the pcpu * trampoline space and is available for register save/restore. @@ -176,13 +190,13 @@ subq $TR_RIP, %rsp ; \ movq %r10, TR_R10(%rsp) ; \ movq %r11, TR_R11(%rsp) ; \ - testq $PCB_ISOMMU,PCPU(pcb_flags) ; \ + testq $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ; \ je 40f ; \ - movq PCPU(pcb_cr3),%r10 ; \ + movq PCPU(trampoline)+TR_PCB_CR3,%r10 ; \ movq %r10,%cr3 ; \ 40: \ movq %rsp, %r10 ; /* trampoline rsp */ \ - movq PCPU(pcb_rsp),%rsp ; /* kstack rsp */ \ + movq PCPU(trampoline)+TR_PCB_RSP,%rsp ; /* kstack rsp */ \ movq TR_SS(%r10), %r11 ; \ pushq %r11 ; \ movq TR_RSP(%r10), %r11 ; \ @@ -200,13 +214,13 @@ subq $TR_ERR, %rsp ; \ movq %r10, TR_R10(%rsp) ; \ movq %r11, TR_R11(%rsp) ; \ - testq $PCB_ISOMMU,PCPU(pcb_flags) ; \ + testq $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ; \ je 40f ; \ - movq PCPU(pcb_cr3),%r10 ; \ + movq PCPU(trampoline)+TR_PCB_CR3,%r10 ; \ movq %r10,%cr3 ; \ 40: \ movq %rsp, %r10 ; /* trampoline rsp */ \ - movq PCPU(pcb_rsp),%rsp ; /* kstack rsp */ \ + movq PCPU(trampoline)+TR_PCB_RSP,%rsp ; /* kstack rsp */ \ movq TR_SS(%r10), %r11 ; \ pushq %r11 ; \ movq TR_RSP(%r10), %r11 ; \ @@ -228,13 +242,13 @@ movq %r11, TR_R11(%rsp) ; \ movq %cr2, %r10 ; \ movq %r10, PCPU(trampoline)+TR_CR2 ; \ - testq $PCB_ISOMMU,PCPU(pcb_flags) ; \ + testq $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ; \ je 40f ; \ - movq PCPU(pcb_cr3),%r10 ; \ + movq PCPU(trampoline)+TR_PCB_CR3,%r10 ; \ movq %r10,%cr3 ; \ 40: \ movq %rsp, %r10 ; /* trampoline rsp */ \ - movq PCPU(pcb_rsp),%rsp ; /* kstack rsp */ \ + movq PCPU(trampoline)+TR_PCB_RSP,%rsp ; /* kstack rsp */ \ movq TR_SS(%r10), %r11 ; \ pushq %r11 ; \ movq TR_RSP(%r10), %r11 ; \ @@ -255,10 +269,10 @@ * disturbed. 
*/ #define KMMUENTER_SYSCALL \ - testq $PCB_ISOMMU,PCPU(pcb_flags) ; \ + testq $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ; \ je 40f ; \ pushq %r10 ; \ - movq PCPU(pcb_cr3),%r10 ; \ + movq PCPU(trampoline)+TR_PCB_CR3,%r10 ; \ movq %r10,%cr3 ; \ popq %r10 ; \ 40: \ @@ -272,7 +286,7 @@ */ #define KMMUEXIT \ addq $TF_RIP,%rsp ; \ - testq $PCB_ISOMMU,PCPU(pcb_flags) ; \ + testq $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ; \ je 40f ; \ movq %r11, PCPU(trampoline)+TR_ERR ; /* save in TR_ERR */ \ popq %r11 ; /* copy %rip */ \ @@ -288,7 +302,7 @@ movq %gs:0,%r11 ; \ addq $GD_TRAMPOLINE+TR_ERR,%r11 ; \ movq %r11,%rsp ; \ - movq PCPU(pcb_cr3_iso),%r11 ; \ + movq PCPU(trampoline)+TR_PCB_CR3_ISO,%r11 ; \ movq %r11,%cr3 ; \ popq %r11 ; /* positioned at TR_RIP after this */ \ 40: \ @@ -298,10 +312,10 @@ * point. We still have the kernel %gs. */ #define KMMUEXIT_SYSCALL \ - testq $PCB_ISOMMU,PCPU(pcb_flags) ; \ + testq $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ; \ je 40f ; \ movq %r10, PCPU(trampoline)+TR_R10 ; \ - movq PCPU(pcb_cr3_iso),%r10 ; \ + movq PCPU(trampoline)+TR_PCB_CR3_ISO,%r10 ; \ movq %r10,%cr3 ; \ movq PCPU(trampoline)+TR_R10, %r10 ; \ 40: \ diff --git a/sys/cpu/x86_64/include/frame.h b/sys/cpu/x86_64/include/frame.h index 57dec412cd..09996384ac 100644 --- a/sys/cpu/x86_64/include/frame.h +++ b/sys/cpu/x86_64/include/frame.h @@ -118,6 +118,11 @@ struct intrframe { register_t if_ss; }; +/* + * The trampframe is placed at the top of the trampoline page and + * contains all the information needed to trampoline into and out + * of the isolated user pmap. + */ struct trampframe { register_t tr_cr2; register_t tr_r10; @@ -128,6 +133,16 @@ struct trampframe { register_t tr_rflags; register_t tr_rsp; register_t tr_ss; + + /* + * Top of hw stack in TSS is &tr_pcb_rsp (first push is tr_ss). + * Make sure this is at least 16-byte aligned, so be sure the + * fields below are in multiples of 16 bytes. 
+ */ + register_t tr_pcb_rsp; /* hw frame tramp top of stack */ + register_t tr_pcb_flags; /* copy of pcb control flags */ + register_t tr_pcb_cr3_iso; /* copy of isolated pml4e */ + register_t tr_pcb_cr3; /* copy of primary pml4e */ }; int kdb_trap(int, int, struct trapframe *); diff --git a/sys/platform/pc64/include/globaldata.h b/sys/platform/pc64/include/globaldata.h index 73f072f3c7..dad818ab21 100644 --- a/sys/platform/pc64/include/globaldata.h +++ b/sys/platform/pc64/include/globaldata.h @@ -80,10 +80,10 @@ struct mdglobaldata { u_int gd_unused002; u_int gd_unused003; u_int gd_ss_eflags; - char *gd_pcb_rsp; /* transfer trampoline to td stack */ - long gd_pcb_flags; /* pcb control flags */ - long gd_pcb_cr3_iso; /* pcb isolated mmu cr3 */ - long gd_pcb_cr3; /* pcb normal mmu cr3 */ + long gd_lunused0; + long gd_lunused1; + long gd_lunused2; + long gd_lunusde3; caddr_t gd_aunused0; caddr_t gd_aunused1; caddr_t gd_aunused2; @@ -131,7 +131,9 @@ struct privatespace { /* page 5..4+UPAGES - idle stack (UPAGES pages) */ char idlestack[UPAGES * PAGE_SIZE]; } __packed; + #define mdcpu ((struct mdglobaldata *)_get_mycpu()) +#define pscpu ((struct privatespace *)_get_mycpu()) #endif diff --git a/sys/platform/pc64/x86_64/exception.S b/sys/platform/pc64/x86_64/exception.S index 2e05274349..e901f5d519 100644 --- a/sys/platform/pc64/x86_64/exception.S +++ b/sys/platform/pc64/x86_64/exception.S @@ -233,7 +233,7 @@ prot_normal: IDTVEC(fast_syscall) swapgs /* get kernel %gs */ movq %rsp,PCPU(trampoline)+TR_R10 /* save user %rsp */ - movq PCPU(pcb_rsp),%rsp + movq PCPU(trampoline)+TR_PCB_RSP,%rsp KMMUENTER_SYSCALL /* Now emulate a trapframe. Make the 8 byte alignment odd for call. 
*/ diff --git a/sys/platform/pc64/x86_64/genassym.c b/sys/platform/pc64/x86_64/genassym.c index 6b2c638f90..5eab421875 100644 --- a/sys/platform/pc64/x86_64/genassym.c +++ b/sys/platform/pc64/x86_64/genassym.c @@ -218,15 +218,15 @@ ASSYM(TR_CS, offsetof(struct trampframe, tr_cs)); ASSYM(TR_RFLAGS, offsetof(struct trampframe, tr_rflags)); ASSYM(TR_RSP, offsetof(struct trampframe, tr_rsp)); ASSYM(TR_SS, offsetof(struct trampframe, tr_ss)); +ASSYM(TR_PCB_RSP, offsetof(struct trampframe, tr_pcb_rsp)); +ASSYM(TR_PCB_FLAGS, offsetof(struct trampframe, tr_pcb_flags)); +ASSYM(TR_PCB_CR3_ISO, offsetof(struct trampframe, tr_pcb_cr3_iso)); +ASSYM(TR_PCB_CR3, offsetof(struct trampframe, tr_pcb_cr3)); ASSYM(GD_IPENDING, offsetof(struct mdglobaldata, gd_ipending)); ASSYM(GD_SPENDING, offsetof(struct mdglobaldata, gd_spending)); ASSYM(GD_COMMON_TSS, offsetof(struct mdglobaldata, gd_common_tss)); ASSYM(GD_COMMON_TSSD, offsetof(struct mdglobaldata, gd_common_tssd)); -ASSYM(GD_PCB_RSP, offsetof(struct mdglobaldata, gd_pcb_rsp)); -ASSYM(GD_PCB_FLAGS, offsetof(struct mdglobaldata, gd_pcb_flags)); -ASSYM(GD_PCB_CR3_ISO, offsetof(struct mdglobaldata, gd_pcb_cr3_iso)); -ASSYM(GD_PCB_CR3, offsetof(struct mdglobaldata, gd_pcb_cr3)); ASSYM(GD_TSS_GDT, offsetof(struct mdglobaldata, gd_tss_gdt)); ASSYM(GD_NPXTHREAD, offsetof(struct mdglobaldata, gd_npxthread)); ASSYM(GD_FPU_LOCK, offsetof(struct mdglobaldata, gd_fpu_lock)); diff --git a/sys/platform/pc64/x86_64/global.s b/sys/platform/pc64/x86_64/global.s index ea9bc0c265..d1f40980d2 100644 --- a/sys/platform/pc64/x86_64/global.s +++ b/sys/platform/pc64/x86_64/global.s @@ -50,13 +50,8 @@ * the per-cpu address space, otherwise it's in the data segment. 
*/ .globl gd_trampoline - .globl gd_pcb_rsp, gd_pcb_flags, gd_pcb_cr3_iso, gd_pcb_cr3 .globl gd_curthread, gd_npxthread, gd_reqflags, gd_common_tss .set gd_trampoline,globaldata + GD_TRAMPOLINE - .set gd_pcb_rsp,globaldata + GD_PCB_RSP - .set gd_pcb_flags,globaldata + GD_PCB_FLAGS - .set gd_pcb_cr3_iso,globaldata + GD_PCB_CR3_ISO - .set gd_pcb_cr3,globaldata + GD_PCB_CR3 .set gd_curthread,globaldata + GD_CURTHREAD .set gd_npxthread,globaldata + GD_NPXTHREAD .set gd_reqflags,globaldata + GD_REQFLAGS diff --git a/sys/platform/pc64/x86_64/machdep.c b/sys/platform/pc64/x86_64/machdep.c index 634825de41..b14d3ae59c 100644 --- a/sys/platform/pc64/x86_64/machdep.c +++ b/sys/platform/pc64/x86_64/machdep.c @@ -2521,14 +2521,21 @@ hammer_time(u_int64_t modulep, u_int64_t physfree) /* * TSS entry point for interrupts, traps, and exceptions - * (sans NMI). This will always go to the top of the pcpu + * (sans NMI). This will always go to near the top of the pcpu * trampoline area. Hardware-pushed data will be copied into * the trap-frame on entry, and (if necessary) returned to the * trampoline on exit. + * + * We store some pcb data for the trampoline code above the + * stack the cpu hw pushes into, and arrange things so the + * address of tr_pcb_rsp is the same as the desired top of + * stack. 
*/ gd->gd_common_tss.tss_rsp0 = - (register_t)(&CPU_prvspace[0]->trampoline + 1); - gd->gd_pcb_rsp = (void *)gd->gd_common_tss.tss_rsp0; + (register_t)&((struct privatespace *)gd)->trampoline.tr_pcb_rsp; + + ((struct privatespace *)gd)->trampoline.tr_pcb_rsp = + gd->gd_common_tss.tss_rsp0; /* double fault stack */ gd->gd_common_tss.tss_ist1 = diff --git a/sys/platform/pc64/x86_64/mp_machdep.c b/sys/platform/pc64/x86_64/mp_machdep.c index 888de1816f..def008e170 100644 --- a/sys/platform/pc64/x86_64/mp_machdep.c +++ b/sys/platform/pc64/x86_64/mp_machdep.c @@ -286,12 +286,22 @@ init_secondary(void) md = mdcpu; /* loaded through %gs:0 (mdglobaldata.mi.gd_prvspace)*/ /* - * Each cpu gets its own trampoline area for interrupts, traps, and - * exceptions. + * TSS entry point for interrupts, traps, and exceptions + * (sans NMI). This will always go to near the top of the pcpu + * trampoline area. Hardware-pushed data will be copied into + * the trap-frame on entry, and (if necessary) returned to the + * trampoline on exit. + * + * We store some pcb data for the trampoline code above the + * stack the cpu hw pushes into, and arrange things so the + * address of tr_pcb_rsp is the same as the desired top of + * stack. */ md->gd_common_tss.tss_rsp0 = - (register_t)(&CPU_prvspace[md->mi.gd_cpuid]->trampoline + 1); - md->gd_pcb_rsp = (void *)md->gd_common_tss.tss_rsp0; + (register_t)&((struct privatespace *)md)->trampoline.tr_pcb_rsp; + ((struct privatespace *)md)->trampoline.tr_pcb_rsp = + md->gd_common_tss.tss_rsp0; + #if 0 /* JG XXX */ md->gd_common_tss.tss_ioopt = (sizeof md->gd_common_tss) << 16; #endif diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c index 44de7b58c4..dc9833f409 100644 --- a/sys/platform/pc64/x86_64/pmap.c +++ b/sys/platform/pc64/x86_64/pmap.c @@ -6330,10 +6330,13 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm) * restricted user pmap). 
*/ if (td == curthread) { - mdcpu->gd_pcb_cr3 = td->td_pcb->pcb_cr3; - mdcpu->gd_pcb_cr3_iso = td->td_pcb->pcb_cr3_iso; - mdcpu->gd_pcb_flags = td->td_pcb->pcb_flags; - /* gd_pcb_rsp doesn't change */ + struct trampframe *tramp; + + tramp = &pscpu->trampoline; + tramp->tr_pcb_cr3 = td->td_pcb->pcb_cr3; + tramp->tr_pcb_cr3_iso = td->td_pcb->pcb_cr3_iso; + tramp->tr_pcb_flags = td->td_pcb->pcb_flags; + /* tr_pcb_rsp doesn't change */ } /* diff --git a/sys/platform/pc64/x86_64/swtch.s b/sys/platform/pc64/x86_64/swtch.s index 799b53b948..ee6b3790d6 100644 --- a/sys/platform/pc64/x86_64/swtch.s +++ b/sys/platform/pc64/x86_64/swtch.s @@ -362,13 +362,13 @@ END(cpu_exit_switch) ENTRY(cpu_heavy_restore) movq TD_PCB(%rax),%rdx /* RDX = PCB */ - movq %rdx, PCPU(pcb_rsp) + movq %rdx, PCPU(trampoline)+TR_PCB_RSP movq PCB_FLAGS(%rdx), %rcx - movq %rcx, PCPU(pcb_flags) + movq %rcx, PCPU(trampoline)+TR_PCB_FLAGS movq PCB_CR3_ISO(%rdx), %rcx - movq %rcx, PCPU(pcb_cr3_iso) + movq %rcx, PCPU(trampoline)+TR_PCB_CR3_ISO movq PCB_CR3(%rdx), %rcx - movq %rcx, PCPU(pcb_cr3) + movq %rcx, PCPU(trampoline)+TR_PCB_CR3 popfq #if defined(SWTCH_OPTIM_STATS) @@ -473,13 +473,13 @@ ENTRY(cpu_heavy_restore) * Set the top of the supervisor stack for the new thread * in gd_thread_pcb so the trampoline code can load it into %rsp. */ - movq %rdx, PCPU(pcb_rsp) + movq %rdx, PCPU(trampoline)+TR_PCB_RSP movq PCB_FLAGS(%rdx), %rcx - movq %rcx, PCPU(pcb_flags) + movq %rcx, PCPU(trampoline)+TR_PCB_FLAGS movq PCB_CR3_ISO(%rdx), %rcx - movq %rcx, PCPU(pcb_cr3_iso) + movq %rcx, PCPU(trampoline)+TR_PCB_CR3_ISO movq PCB_CR3(%rdx), %rcx - movq %rcx, PCPU(pcb_cr3) + movq %rcx, PCPU(trampoline)+TR_PCB_CR3 #endif #if 0 /* JG */ -- 2.41.0