From: Matthew Dillon
Date: Mon, 11 Jun 2018 20:43:22 +0000 (-0700)
Subject: Kernel - Additional cpu bug hardening part 1/2
X-Git-Tag: v5.5.0~504
X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/e5aace14a443f92cdfe7f6d36df9f7dc6f86b76b

Kernel - Additional cpu bug hardening part 1/2

* OpenBSD recently made a commit that scraps the use of delayed FP state
  saving due to a rumor that the content of FP registers owned by another
  process can be speculatively detected when they are present for the
  current process, even when the TS bit is used to force a DNA trap.

  This rumor has been circulating for a while.  OpenBSD felt that the lack
  of responsiveness from Intel forced their hand.  Since they have gone
  ahead and pushed a fix for this potential problem, we are going to do so
  as well.

* DragonFlyBSD already synchronously saves FP state on switch-out.
  However, it only cleans the state up afterwards by calling fninit, and
  this is not enough to actually erase the content in the %xmm registers.
  We want to continue to use delayed FP state restores because doing so
  saves a considerable amount of switching time when we do not have to do
  a FP restore.  Most programs touch the FP registers at startup due to
  rtld linking, and more and more programs use the %xmm registers as
  general purpose registers.

  OpenBSD's solution of always proactively saving and restoring FP state
  is a reasonable one.  DragonFlyBSD is going to take a slightly different
  tack in order to try to retain more optimal switching behavior when the
  FP unit is not in continuous use.

* Our first fix is to issue a FP restore from dummy state after our FP
  save to guarantee that all FP registers are zeroed after FP state is
  saved.  This allows us to continue to support delayed FP state loads
  with zero chance of leakage between processes.

* Our second fix is to add a heuristic which allows the kernel to detect
  when the FP unit is being used seriously (versus just during program
  startup).  We have added a machdep.npx_fpu_heuristic sysctl for this
  purpose, which defaults to the value 32 (an illustrative usage sketch
  follows this list).  Values can be:

  0 - Proactive FPU state loading disabled (old behavior retained).  Note
      that the first fix remains active; the FP register state is still
      cleared after saving, so no leakage can occur.  All processes will
      take a DNA trap after a thread switch when they access the FP state.

  1 - Proactive FPU state loading is enabled at all times for a thread
      after the first FP access made by that thread.  Upon returning from
      a thread switch, the FPU state will already be ready to go and a DNA
      trap will not occur.

  N - Proactive FPU state loading enabled for N context switches, then
      disabled.  The next DNA fault after disablement then re-enables
      proactive loading for the next N context switches.  The default
      value is 32.  This ensures that program startup can use the FP unit,
      but integer-centric programs which do not use it afterwards will not
      incur the FP switching overhead (for either switch-away or
      switch-back).
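* Usage note (illustrative only, not part of the diff below): the knob can
  be inspected or tuned at runtime with sysctl(8), e.g.
  "sysctl machdep.npx_fpu_heuristic=1" to keep proactive FP restores on at
  all times.  The sysctl name and its 0/1/N meanings come from this
  change; everything else in the following minimal userland C sketch
  (the sysctlbyname(3) helper program, its file name, and its error
  handling) is an assumption for illustration, not something this commit
  installs:

    /*
     * npx_fpu_heur.c -- hypothetical helper, not part of this commit.
     * Reads machdep.npx_fpu_heuristic and optionally sets a new value
     * (0=never, 1=always, N=after-N).  Setting the value requires root.
     */
    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <err.h>
    #include <stdio.h>
    #include <stdlib.h>

    int
    main(int argc, char **argv)
    {
            int val;
            size_t len = sizeof(val);

            /* read the current heuristic value */
            if (sysctlbyname("machdep.npx_fpu_heuristic",
                             &val, &len, NULL, 0) < 0)
                    err(1, "sysctlbyname(read)");
            printf("machdep.npx_fpu_heuristic = %d\n", val);

            /* optionally write a new value, e.g. "./npx_fpu_heur 1" */
            if (argc > 1) {
                    int newval = atoi(argv[1]);

                    if (sysctlbyname("machdep.npx_fpu_heuristic",
                                     NULL, NULL, &newval,
                                     sizeof(newval)) < 0)
                            err(1, "sysctlbyname(write)");
            }
            return (0);
    }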
---

diff --git a/sys/platform/pc64/include/globaldata.h b/sys/platform/pc64/include/globaldata.h
index c2873c2006..182eb29ff1 100644
--- a/sys/platform/pc64/include/globaldata.h
+++ b/sys/platform/pc64/include/globaldata.h
@@ -69,6 +69,7 @@ struct mdglobaldata {
         struct user_segment_descriptor *gd_tss_gdt;
         struct thread *gd_npxthread;
         union savefpu gd_savefpu;       /* fast bcopy/zero temp fpu save area */
+        union savefpu gd_zerofpu;       /* xrstor/fxrstor/frstor zero regs */
         int gd_fpu_lock;                /* fast bcopy/zero cpu lock */
         int gd_xinvaltlb;               /* reentrancy check invaltlb routine */
         int gd_unused001;
diff --git a/sys/platform/pc64/include/md_var.h b/sys/platform/pc64/include/md_var.h
index 7b22058f3f..e5c34c5594 100644
--- a/sys/platform/pc64/include/md_var.h
+++ b/sys/platform/pc64/include/md_var.h
@@ -93,6 +93,7 @@ int user_dbreg_trap(void);
 
 void fpstate_drop(struct thread *td);
 int npxdna(void);
+void npxdna_quick(struct thread *td);
 void npxpush(struct __mcontext *mctx);
 void npxpop(struct __mcontext *mctx);
 
diff --git a/sys/platform/pc64/x86_64/genassym.c b/sys/platform/pc64/x86_64/genassym.c
index d559453bfb..6ef29bb32d 100644
--- a/sys/platform/pc64/x86_64/genassym.c
+++ b/sys/platform/pc64/x86_64/genassym.c
@@ -198,12 +198,14 @@ ASSYM(TD_NEST_COUNT, offsetof(struct thread, td_nest_count));
 ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
 ASSYM(TD_TYPE, offsetof(struct thread, td_type));
 ASSYM(TD_PREEMPTED, offsetof(struct thread, td_preempted));
+ASSYM(TD_FPU_HEUR, offsetof(struct thread, td_fpu_heur));
 ASSYM(TD_SAVEFPU, offsetof(struct thread, td_savefpu));
 
 ASSYM(TDF_RUNNING, TDF_RUNNING);
 ASSYM(TDF_USINGFP, TDF_USINGFP);
 ASSYM(TDF_KERNELFP, TDF_KERNELFP);
 ASSYM(TDF_PREEMPT_DONE, TDF_PREEMPT_DONE);
+ASSYM(TDF_FPU_HEUR, TDF_FPU_HEUR);
 
 ASSYM(FIRST_SOFTINT, FIRST_SOFTINT);
 ASSYM(MDGLOBALDATA_BASEALLOC_PAGES, MDGLOBALDATA_BASEALLOC_PAGES);
diff --git a/sys/platform/pc64/x86_64/npx.c b/sys/platform/pc64/x86_64/npx.c
index 3e575953b7..add3a7b79b 100644
--- a/sys/platform/pc64/x86_64/npx.c
+++ b/sys/platform/pc64/x86_64/npx.c
@@ -90,6 +90,11 @@ static void fpurstor (union savefpu *);
 
 uint32_t npx_mxcsr_mask = 0xFFBF;       /* this is the default */
 
+static int npx_fpu_heuristic = 32;
+SYSCTL_INT(_machdep, OID_AUTO, npx_fpu_heuristic, CTLFLAG_RW,
+        &npx_fpu_heuristic, 0, "FPU active restore 0=never 1=always N=after-N");
+
+
 /*
  * Probe the npx_mxcsr_mask as described in the intel document
  * "Intel processor identification and the CPUID instruction" Section 7
@@ -128,7 +133,6 @@ void npxinit(void)
          * fnsave to throw away any junk in the fpu.  npxsave() initializes
          * the fpu and sets npxthread = NULL as important side effects.
          */
-        npxsave(&dummy);
         crit_enter();
         stop_emulating();
 
@@ -335,12 +339,24 @@ static char fpetable[128] = {
 int
 npxdna(void)
 {
-        thread_t td = curthread;
+        struct mdglobaldata *md = mdcpu;
+        thread_t td;
         int didinit = 0;
 
-        if (mdcpu->gd_npxthread != NULL) {
+        td = md->mi.gd_curthread;
+
+        /*
+         * npxthread is almost always NULL.  When it isn't NULL it can
+         * only be exactly equal to 'td'.  This case occurs when the switch
+         * code pro-actively restores the FPU state due to the trap() code
+         * being interruptable (e.g. such as by an interrupt thread).
+         */
+        if (__predict_false(md->gd_npxthread != NULL)) {
+                if (md->gd_npxthread == td) {
+                        return 1;
+                }
                 kprintf("npxdna: npxthread = %p, curthread = %p\n",
-                        mdcpu->gd_npxthread, curthread);
+                        md->gd_npxthread, td);
                 panic("npxdna");
         }
 
@@ -356,6 +372,15 @@ npxdna(void)
                 didinit = 1;
         }
 
+        /*
+         * Actively restore the fpu state after N npxdna faults instead of
+         * soaking the npxdna fault overhead on each switch.
+         */
+        if (npx_fpu_heuristic && ++td->td_fpu_heur >= npx_fpu_heuristic) {
+                td->td_fpu_heur = npx_fpu_heuristic;
+                td->td_flags |= TDF_FPU_HEUR;
+        }
+
         /*
          * The setting of gd_npxthread and the call to fpurstor() must not
          * be preempted by an interrupt thread or we will take an npxdna
@@ -364,10 +389,12 @@ npxdna(void)
          * fpstate.
          */
         stop_emulating();
+
         /*
         * Record new context early in case frstor causes an IRQ13.
         */
-        mdcpu->gd_npxthread = td;
+        md->gd_npxthread = td;
+
         /*
          * The following frstor may cause an IRQ13 when the state being
          * restored has a pending error.  The error will appear to have been
@@ -387,7 +414,7 @@ npxdna(void)
                             td->td_comm, td->td_savefpu->sv_xmm.sv_env.en_mxcsr,
                             didinit);
                 td->td_savefpu->sv_xmm.sv_env.en_mxcsr &= npx_mxcsr_mask;
-                lwpsignal(curproc, curthread->td_lwp, SIGFPE);
+                lwpsignal(td->td_proc, td->td_lwp, SIGFPE);
         }
         fpurstor(td->td_savefpu);
         crit_exit();
@@ -395,6 +422,41 @@ npxdna(void)
         return (1);
 }
 
+/*
+ * From cpu heavy restore (already in critical section, gd_npxthread is NULL),
+ * and TDF_USINGFP is already set.  Actively restore the FPU state to avoid
+ * excessive npxdna traps.
+ */
+void
+npxdna_quick(thread_t newtd)
+{
+        stop_emulating();
+        mdcpu->gd_npxthread = newtd;
+        if ((newtd->td_savefpu->sv_xmm.sv_env.en_mxcsr & ~npx_mxcsr_mask) &&
+            cpu_fxsr) {
+                krateprintf(&badfprate,
+                            "%s: FXRSTR: illegal FP MXCSR %08x\n",
+                            newtd->td_comm,
+                            newtd->td_savefpu->sv_xmm.sv_env.en_mxcsr);
+                newtd->td_savefpu->sv_xmm.sv_env.en_mxcsr &= npx_mxcsr_mask;
+                lwpsignal(newtd->td_proc, newtd->td_lwp, SIGFPE);
+        }
+        fpurstor(newtd->td_savefpu);
+
+        /*
+         * If npx_fpu_heuristic is larger than 1 we reset the heuristic
+         * after N switches and shift to probe mode.  Any npxdna trap will
+         * retrigger active fpu state loading, then probe again after N
+         * switches.
+         *
+         * If npx_fpu_heuristic is 1 active mode is simply left on forever.
+         */
+        if (npx_fpu_heuristic > 1 && --newtd->td_fpu_heur <= 0) {
+                newtd->td_fpu_heur = npx_fpu_heuristic - 1;
+                newtd->td_flags &= ~TDF_FPU_HEUR;
+        }
+}
+
 /*
  * Wrapper for the fnsave instruction to handle h/w bugs.  If there is an error
  * pending, then fnsave generates a bogus IRQ13 on some systems.  Force
@@ -418,11 +480,15 @@ npxdna(void)
 void
 npxsave(union savefpu *addr)
 {
+        struct mdglobaldata *md;
+
+        md = mdcpu;
         crit_enter();
         stop_emulating();
         fpusave(addr);
-        mdcpu->gd_npxthread = NULL;
+        md->gd_npxthread = NULL;
         fninit();
+        fpurstor(&md->gd_zerofpu);      /* security wipe */
         start_emulating();
         crit_exit();
 }
diff --git a/sys/platform/pc64/x86_64/swtch.s b/sys/platform/pc64/x86_64/swtch.s
index ee6b3790d6..068924854c 100644
--- a/sys/platform/pc64/x86_64/swtch.s
+++ b/sys/platform/pc64/x86_64/swtch.s
@@ -238,7 +238,6 @@ ENTRY(cpu_heavy_switch)
         movq    %rax,PCB_DR0(%rdx)
 1:
 
-#if 1
         /*
          * Save the FP state if we have used the FP.  Note that calling
          * npxsave will NULL out PCPU(npxthread).
@@ -250,7 +249,6 @@ ENTRY(cpu_heavy_switch)
         call    npxsave                 /* do it in a big C function */
         movq    %r12,%rdi               /* restore %rdi */
 1:
-#endif
 
         /*
          * Switch to the next thread, which was passed as an argument
@@ -338,6 +336,10 @@ END(cpu_exit_switch)
 /*
  * cpu_heavy_restore() (current thread in %rax on entry, old thread in %rbx)
  *
+ * We immediately move %rax to %r12.  %rbx is retained throughout, and
+ * we nominally use %r14 for TD_PCB(%r12) until near the end where we
+ * switch to %rdx for that.
+ *
  * Restore the thread after an LWKT switch.  This entry is normally
  * called via the LWKT switch restore function, which was pulled
  * off the thread stack and jumped to.
@@ -361,13 +363,14 @@ END(cpu_exit_switch)
  */
 ENTRY(cpu_heavy_restore)
 
-        movq    TD_PCB(%rax),%rdx               /* RDX = PCB */
-        movq    %rdx, PCPU(trampoline)+TR_PCB_RSP
-        movq    PCB_FLAGS(%rdx), %rcx
+        movq    %rax,%r12                       /* R12 = newtd */
+        movq    TD_PCB(%rax),%r14               /* R14 = PCB */
+        movq    %r14, PCPU(trampoline)+TR_PCB_RSP
+        movq    PCB_FLAGS(%r14), %rcx
         movq    %rcx, PCPU(trampoline)+TR_PCB_FLAGS
-        movq    PCB_CR3_ISO(%rdx), %rcx
+        movq    PCB_CR3_ISO(%r14), %rcx
         movq    %rcx, PCPU(trampoline)+TR_PCB_CR3_ISO
-        movq    PCB_CR3(%rdx), %rcx
+        movq    PCB_CR3(%r14), %rcx
         movq    %rcx, PCPU(trampoline)+TR_PCB_CR3
 
         popfq
@@ -380,7 +383,7 @@ ENTRY(cpu_heavy_restore)
          * us, our %cr3 and pmap were borrowed and are being returned to
          * us and no further action on those items need be taken.
          */
-        testl   $TDF_PREEMPT_DONE, TD_FLAGS(%rax)
+        testl   $TDF_PREEMPT_DONE, TD_FLAGS(%r12)
         jne     4f
 #endif
 
@@ -399,27 +402,24 @@ ENTRY(cpu_heavy_restore)
          * avoid checking for the interlock via CPULOCK_EXCL.  We currently
          * do not perform this optimization.
          */
-        movq    TD_LWP(%rax),%rcx
+        movq    TD_LWP(%r12),%rcx
         movq    LWP_VMSPACE(%rcx),%rcx          /* RCX = vmspace */
         movq    PCPU(cpumask_simple),%rsi
-        movq    PCPU(cpumask_offset),%r12
-        MPLOCKED orq    %rsi, VM_PMAP+PM_ACTIVE(%rcx, %r12, 1)
+        movq    PCPU(cpumask_offset),%r13
+        MPLOCKED orq    %rsi, VM_PMAP+PM_ACTIVE(%rcx, %r13, 1)
 
         movl    VM_PMAP+PM_ACTIVE_LOCK(%rcx),%esi
         testl   $CPULOCK_EXCL,%esi
         jz      1f
 
-        movq    %rax,%r12               /* save newthread ptr */
         movq    %rcx,%rdi               /* (found to be set) */
         call    pmap_interlock_wait     /* pmap_interlock_wait(%rdi:vm) */
-        movq    %r12,%rax
 
         /*
          * Need unconditional load cr3
          */
-        movq    TD_PCB(%rax),%rdx       /* RDX = PCB */
-        movq    PCB_CR3(%rdx),%rcx      /* RCX = desired CR3 */
+        movq    PCB_CR3(%r14),%rcx      /* RCX = desired CR3 */
         jmp     2f                      /* unconditional reload */
 
 1:
         /*
@@ -435,9 +435,8 @@ ENTRY(cpu_heavy_restore)
          * When that happens, and we don't invltlb (by loading %cr3), we
          * wind up with a stale TLB.
          */
-        movq    TD_PCB(%rax),%rdx       /* RDX = PCB */
         movq    %cr3,%rsi               /* RSI = current CR3 */
-        movq    PCB_CR3(%rdx),%rcx      /* RCX = desired CR3 */
+        movq    PCB_CR3(%r14),%rcx      /* RCX = desired CR3 */
         cmpq    %rsi,%rcx
         /*je 4f*/
 2:
@@ -449,7 +448,7 @@ ENTRY(cpu_heavy_restore)
 4:
 
         /*
-         * NOTE: %rbx is the previous thread and %rax is the new thread.
+         * NOTE: %rbx is the previous thread and %r12 is the new thread.
          *       %rbx is retained throughout so we can return it.
          *
          * lwkt_switch[_return] is responsible for handling TDF_RUNNING.
@@ -458,7 +457,7 @@ ENTRY(cpu_heavy_restore)
         /*
          * Deal with the PCB extension, restore the private tss
          */
-        movq    PCB_EXT(%rdx),%rdi      /* check for a PCB extension */
+        movq    PCB_EXT(%r14),%rdi      /* check for a PCB extension */
         movq    $1,%rcx                 /* maybe mark use of a private tss */
         testq   %rdi,%rdi
 #if 0 /* JG */
@@ -473,12 +472,12 @@ ENTRY(cpu_heavy_restore)
          * Set the top of the supervisor stack for the new thread
          * in gd_thread_pcb so the trampoline code can load it into %rsp.
          */
-        movq    %rdx, PCPU(trampoline)+TR_PCB_RSP
-        movq    PCB_FLAGS(%rdx), %rcx
+        movq    %r14, PCPU(trampoline)+TR_PCB_RSP
+        movq    PCB_FLAGS(%r14), %rcx
         movq    %rcx, PCPU(trampoline)+TR_PCB_FLAGS
-        movq    PCB_CR3_ISO(%rdx), %rcx
+        movq    PCB_CR3_ISO(%r14), %rcx
         movq    %rcx, PCPU(trampoline)+TR_PCB_CR3_ISO
-        movq    PCB_CR3(%rdx), %rcx
+        movq    PCB_CR3(%r14), %rcx
         movq    %rcx, PCPU(trampoline)+TR_PCB_CR3
 
 #endif
@@ -518,32 +517,45 @@ ENTRY(cpu_heavy_restore)
         /*
          * Restore the user %gs and %fs
         */
-        movq    PCB_FSBASE(%rdx),%r9
+        movq    PCB_FSBASE(%r14),%r9
         cmpq    PCPU(user_fs),%r9
         je      4f
-        movq    %rdx,%r10
         movq    %r9,PCPU(user_fs)
         movl    $MSR_FSBASE,%ecx
-        movl    PCB_FSBASE(%r10),%eax
-        movl    PCB_FSBASE+4(%r10),%edx
+        movl    PCB_FSBASE(%r14),%eax
+        movl    PCB_FSBASE+4(%r14),%edx
         wrmsr
-        movq    %r10,%rdx
 4:
 
-        movq    PCB_GSBASE(%rdx),%r9
+        movq    PCB_GSBASE(%r14),%r9
         cmpq    PCPU(user_gs),%r9
         je      5f
-        movq    %rdx,%r10
         movq    %r9,PCPU(user_gs)
         movl    $MSR_KGSBASE,%ecx       /* later swapgs moves it to GSBASE */
-        movl    PCB_GSBASE(%r10),%eax
-        movl    PCB_GSBASE+4(%r10),%edx
+        movl    PCB_GSBASE(%r14),%eax
+        movl    PCB_GSBASE+4(%r14),%edx
         wrmsr
-        movq    %r10,%rdx
 5:
+        /*
+         * Actively restore FP state
+         */
+        movq    PCPU(npxthread),%r13
+        testq   %r13,%r13
+        jnz     6f
+        movl    TD_FLAGS(%r12),%r13d
+        andq    $TDF_FPU_HEUR|TDF_USINGFP,%r13
+        cmpq    $TDF_FPU_HEUR|TDF_USINGFP,%r13
+        jne     6f
+        movq    %r12,%rdi               /* npxdna_quick(newtd) */
+        call    npxdna_quick
+6:
         /*
          * Restore general registers.  %rbx is restored later.
+         *
+         * Switch our PCB register from %r14 to %rdx so we can restore
+         * %r14.
          */
+        movq    %r14,%rdx
         movq    PCB_RSP(%rdx), %rsp
         movq    PCB_RBP(%rdx), %rbp
         movq    PCB_R12(%rdx), %r12
diff --git a/sys/platform/pc64/x86_64/trap.c b/sys/platform/pc64/x86_64/trap.c
index b80dabcfc5..d9603bdb48 100644
--- a/sys/platform/pc64/x86_64/trap.c
+++ b/sys/platform/pc64/x86_64/trap.c
@@ -603,8 +603,10 @@ trap(struct trapframe *frame)
                  * when it tries to use the FP unit.  Restore the
                  * state here
                  */
-                if (npxdna())
+                if (npxdna()) {
+                        gd->gd_cnt.v_trap++;
                         goto out;
+                }
                 i = SIGFPE;
                 ucode = FPE_FPU_NP_TRAP;
                 break;
@@ -633,8 +635,10 @@ trap(struct trapframe *frame)
                  * XXX this should be fatal unless the kernel has
                  * registered such use.
                  */
-                if (npxdna())
+                if (npxdna()) {
+                        gd->gd_cnt.v_trap++;
                         goto out2;
+                }
                 break;
 
         case T_STKFLT:          /* stack fault */
@@ -705,6 +709,7 @@ trap(struct trapframe *frame)
                  * entry (if not shortcutted in exception.s via
                  * DIRECT_DISALLOW_SS_CPUBUG).
                  */
+                gd->gd_cnt.v_trap++;
                 if (frame->tf_rip == (register_t)IDTVEC(fast_syscall)) {
                         krateprintf(&sscpubugrate,
                                     "Caught #DB at syscall cpu artifact\n");
@@ -788,6 +793,7 @@ trap(struct trapframe *frame)
         if (*p->p_sysent->sv_transtrap)
                 i = (*p->p_sysent->sv_transtrap)(i, type);
 
+        gd->gd_cnt.v_trap++;
         trapsignal(lp, i, ucode);
 
 #ifdef DEBUG
diff --git a/sys/platform/vkernel64/x86_64/trap.c b/sys/platform/vkernel64/x86_64/trap.c
index 21ed565270..709f4c29b2 100644
--- a/sys/platform/vkernel64/x86_64/trap.c
+++ b/sys/platform/vkernel64/x86_64/trap.c
@@ -537,8 +537,10 @@ user_trap(struct trapframe *frame)
                  * when it tries to use the FP unit.  Restore the
                  * state here
                  */
-                if (npxdna(frame))
+                if (npxdna(frame)) {
+                        gd->gd_cnt.v_trap++;
                         goto out;
+                }
                 if (!pmath_emulate) {
                         i = SIGFPE;
                         ucode = FPE_FPU_NP_TRAP;
@@ -778,6 +780,7 @@ kernel_trap:
         if (*p->p_sysent->sv_transtrap)
                 i = (*p->p_sysent->sv_transtrap)(i, type);
 
+        gd->gd_cnt.v_trap++;
         trapsignal(lp, i, ucode);
 
 #ifdef DEBUG
diff --git a/sys/sys/thread.h b/sys/sys/thread.h
index a5d7dfc445..f77b3157b1 100644
--- a/sys/sys/thread.h
+++ b/sys/sys/thread.h
@@ -280,7 +280,8 @@ struct thread {
         int     td_type;        /* thread type, TD_TYPE_ */
         int     td_tracker;     /* for callers to debug lock counts */
         int     td_fdcache_lru;
-        int     td_unused03[3]; /* for future fields */
+        int     td_fpu_heur;    /* active restore on switch heuristic */
+        int     td_unused03[2]; /* for future fields */
         struct iosched_data td_iosdata; /* Dynamic I/O scheduling data */
         struct timeval td_start;        /* start time for a thread/process */
         char    td_comm[MAXCOMLEN+1];   /* typ 16+1 bytes */
@@ -352,7 +353,7 @@ struct thread {
 #define TDF_SYSTHREAD           0x00000100      /* reserve memory may be used */
 #define TDF_ALLOCATED_THREAD    0x00000200      /* objcache allocated thread */
 #define TDF_ALLOCATED_STACK     0x00000400      /* objcache allocated stack */
-#define TDF_UNUSED0800          0x00000800
+#define TDF_FPU_HEUR            0x00000800      /* active restore on switch */
 #define TDF_DEADLKTREAT         0x00001000      /* special lockmgr treatment */
 #define TDF_MARKER              0x00002000      /* tdallq list scan marker */
 #define TDF_TIMEOUT_RUNNING     0x00004000      /* tsleep timeout race */