kernel - Implement spectre mitigations part 2
authorMatthew Dillon <dillon@apollo.backplane.com>
Wed, 10 Jan 2018 22:59:47 +0000 (14:59 -0800)
committerMatthew Dillon <dillon@apollo.backplane.com>
Wed, 10 Jan 2018 23:13:43 +0000 (15:13 -0800)
* NOTE: The last few commits may have said 'IBPB' but they really
  meant 'IBRS'.  The last few commits added IBRS support, this one
  cleans that up and adds IBPB support.

* Intel says for IBRS always-on mode (mode 2), SPEC_CTRL still has
  to be poked on every user->kernel entry as a barrier, even though
  the value is not being changed.  So make this change.  This actually
  somewhat improves performance a little on Skylake and later versus
  when I just set it globally and left it that way.

* Implement IBPB detection and support on Intel.  At the moment
  we default to turning it off because the performance hit is pretty
  massive.  Currently the work on linux is only using IBPB for
  VMM related operations and not for user->kernel entry.

* Enhance the machdep.spectre_mitigation sysctl to print out
  what the mode matrix is whenever you change it, in human
  readable terms.

  0 IBRS disabled IBPB disabled
  1 IBRS mode 1 (kernel-only) IBPB disabled
  2 IBRS mode 2 (at all times) IBPB disabled

  4 IBRS disabled IBPB enabled
  5 IBRS mode 1 (kernel-only) IBPB enabled
  6 IBRS mode 2 (at all times) IBPB enabled

  Currently we default to (1) instead of (5) when we detect that
  the microcode supports both features.  IBPB is not turned on by default
  (you can see why below).

* Haswell and Skylake performance loss matrix using the following
  test.  This tests a high-concurrency compile, which is approximately
  a 5:1 user:kernel test with high concurrency.

  The haswell box is:  i3-4130 CPU @ 3.40GHz  (2-core/4-thread)
  The skylake box is:  i5-6500 CPU @ 3.20GHz  (4-core/4-thread)

  This does not include MMU isolation losses, which will add another
  3-4% or so in losses.

  (/usr/obj on tmpfs)
  time make -j 8 nativekernel NO_MODULES=TRUE

    PERFORMANCE LOSS MATRIX
       HASWELL                 SKYLAKE
    IBPB=0  IBPB=1          IBPB=0  IBPB=1
    IBRS=0     0%    12%              0%     17%
    IBRS=1   >12%<   21%           >2.4%<    15%
    IBRS=2    58%    60%             23%     32%

  Note that the automatic default when microcode support is detected
  is IBRS=1, IBPB=0 (12% loss on Haswell and 2.4% loss on Skylake
  for this test).  If we add 3-4% or so for MMU isolation, a Haswell
  cpu loses around 16% and a Skylake cpu loses around 6% or so in
  performance.

    PERFORMANCE LOSS MATRIX
      (including 3% MMU isolation losses)
       HASWELL                 SKYLAKE
    IBPB=0  IBPB=1          IBPB=0  IBPB=1
    IBRS=0     3%    15%              3%     20%
    IBRS=1   >15%<   24%           >5.4%<    18%
    IBRS=2    61%    63%             26%     35%

sys/cpu/x86_64/include/asmacros.h
sys/cpu/x86_64/include/specialreg.h
sys/platform/pc64/include/pcb.h
sys/platform/pc64/x86_64/exception.S
sys/platform/pc64/x86_64/genassym.c
sys/platform/pc64/x86_64/vm_machdep.c

index 53cd2d0..80b6c0c 100644 (file)
@@ -34,6 +34,7 @@
 #define _CPU_ASMACROS_H_
 
 #include <sys/cdefs.h>
+#include <machine/specialreg.h>
 
 /* XXX too much duplication in various asm*.h's. */
 
  *             progress.  hwtf indicates how much hardware has already
  *             pushed.
  */
-#define KMMUENTER_TFRIP                                                        \
-       subq    $TR_RIP, %rsp ;                                         \
-       movq    %rcx, TR_RCX(%rsp) ;                                    \
-       movq    %rdx, TR_RDX(%rsp) ;                                    \
+
+/*
+ * KMMUENTER_CORE - Handles ISOMMU, IBRS, and IBPB.  Caller has already
+ *                 saved %rcx and %rdx.  We have to deal with %rax.
+ *
+ *                 XXX If IBPB is not supported, try to clear the
+ *                 call return hw cache w/ many x chained call sequence?
+ *
+ *     IBRS2 note - We are leaving IBRS on full-time.  However, Intel
+ *     believes it is not safe unless the MSR is poked on each user->kernel
+ *     transition, so poke the MSR for both IBRS1 and IBRS2.
+ */
+#define KMMUENTER_CORE                                                 \
        testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
        je      40f ;                                                   \
        movq    PCPU(trampoline)+TR_PCB_CR3,%rcx ;                      \
        movq    %rcx,%cr3 ;                                             \
-40:    testq   $PCB_IBRS1,PCPU(trampoline)+TR_PCB_GFLAGS ;     \
-       je      41f ;                                                   \
+40:    testq   $PCB_IBRS1|PCB_IBRS2|PCB_IBPB,PCPU(trampoline)+TR_PCB_GFLAGS ;\
+       je      43f ;                                                   \
        movq    %rax, TR_RAX(%rsp) ;                                    \
-       movl    $0x48,%ecx ;                                            \
-       movl    $1,%eax ;                                               \
+       testq   $PCB_IBRS1|PCB_IBRS2,PCPU(trampoline)+TR_PCB_GFLAGS ;   \
+       je      41f ;                                                   \
+       movl    $MSR_SPEC_CTRL,%ecx ;                                   \
+       movl    $MSR_IBRS_ENABLE,%eax ;                                 \
        xorl    %edx,%edx ;                                             \
        wrmsr ;                                                         \
-       movq    TR_RAX(%rsp), %rax ;                                    \
-41:                                                                    \
+41:    testq   $PCB_IBPB,PCPU(trampoline)+TR_PCB_GFLAGS ;              \
+       je      42f ;                                                   \
+       movl    $MSR_PRED_CMD,%ecx ;                                    \
+       movl    $MSR_IBPB_BARRIER,%eax ;                                \
+       xorl    %edx,%edx ;                                             \
+       wrmsr ;                                                         \
+42:    movq    TR_RAX(%rsp), %rax ;                                    \
+43:                                                                    \
+
+
+/*
+ * Enter with trampoline, hardware pushed up to %rip
+ */
+#define KMMUENTER_TFRIP                                                        \
+       subq    $TR_RIP, %rsp ;                                         \
+       movq    %rcx, TR_RCX(%rsp) ;                                    \
+       movq    %rdx, TR_RDX(%rsp) ;                                    \
+       KMMUENTER_CORE ;                                                \
        movq    %rsp, %rcx ;            /* trampoline rsp */            \
        movq    PCPU(trampoline)+TR_PCB_RSP,%rsp ; /* kstack rsp */     \
        movq    TR_SS(%rcx), %rdx ;                                     \
        movq    TR_RDX(%rcx), %rdx ;                                    \
        movq    TR_RCX(%rcx), %rcx                                      \
 
+/*
+ * Enter with trampoline, hardware pushed up to ERR
+ */
 #define KMMUENTER_TFERR                                                        \
        subq    $TR_ERR, %rsp ;                                         \
        movq    %rcx, TR_RCX(%rsp) ;                                    \
        movq    %rdx, TR_RDX(%rsp) ;                                    \
-       testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
-       je      40f ;                                                   \
-       movq    PCPU(trampoline)+TR_PCB_CR3,%rcx ;                      \
-       movq    %rcx,%cr3 ;                                             \
-40:    testq   $PCB_IBRS1,PCPU(trampoline)+TR_PCB_GFLAGS ;     \
-       je      41f ;                                                   \
-       movq    %rax, TR_RAX(%rsp) ;                                    \
-       movl    $0x48,%ecx ;                                            \
-       movl    $1,%eax ;                                               \
-       xorl    %edx,%edx ;                                             \
-       wrmsr ;                                                         \
-       movq    TR_RAX(%rsp), %rax ;                                    \
-41:                                                                    \
+       KMMUENTER_CORE ;                                                \
        movq    %rsp, %rcx ;            /* trampoline rsp */            \
        movq    PCPU(trampoline)+TR_PCB_RSP,%rsp ; /* kstack rsp */     \
        movq    TR_SS(%rcx), %rdx ;                                     \
        movq    TR_RDX(%rcx), %rdx ;                                    \
        movq    TR_RCX(%rcx), %rcx                                      \
 
+/*
+ * Enter with trampoline, hardware pushed up to ERR and
+ * we need to save %cr2 early (before potentially reloading %cr3).
+ */
 #define KMMUENTER_TFERR_SAVECR2                                                \
        subq    $TR_ERR, %rsp ;                                         \
        movq    %rcx, TR_RCX(%rsp) ;                                    \
        movq    %rdx, TR_RDX(%rsp) ;                                    \
        movq    %cr2, %rcx ;                                            \
        movq    %rcx, PCPU(trampoline)+TR_CR2 ;                         \
-       testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
-       je      40f ;                                                   \
-       movq    PCPU(trampoline)+TR_PCB_CR3,%rcx ;                      \
-       movq    %rcx,%cr3 ;                                             \
-40:    testq   $PCB_IBRS1,PCPU(trampoline)+TR_PCB_GFLAGS ;     \
-       je      41f ;                                                   \
-       movq    %rax, TR_RAX(%rsp) ;                                    \
-       movl    $0x48,%ecx ;                                            \
-       movl    $1,%eax ;                                               \
-       xorl    %edx,%edx ;                                             \
-       wrmsr ;                                                         \
-       movq    TR_RAX(%rsp), %rax ;                                    \
-41:                                                                    \
+       KMMUENTER_CORE ;                                                \
        movq    %rsp, %rcx ;            /* trampoline rsp */            \
        movq    PCPU(trampoline)+TR_PCB_RSP,%rsp ; /* kstack rsp */     \
        movq    TR_SS(%rcx), %rdx ;                                     \
  * Set %cr3 if necessary on syscall entry.  No registers may be
  * disturbed.
  *
- * NOTE: TR_RCX is used by the caller, we cannot use it here
+ * NOTE: TR_CR2 is used by the caller to save %rsp, we cannot use it here.
  */
 #define KMMUENTER_SYSCALL                                              \
-       testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
-       je      40f ;                                                   \
-       pushq   %rcx ;                                                  \
-       movq    PCPU(trampoline)+TR_PCB_CR3,%rcx ;                      \
-       movq    %rcx,%cr3 ;                                             \
-       popq    %rcx ;                                                  \
-40:    testq   $PCB_IBRS1,PCPU(trampoline)+TR_PCB_GFLAGS ;     \
-       je      41f ;                                                   \
-       pushq   %rax ;                                                  \
-       pushq   %rcx ;                                                  \
-       pushq   %rdx ;                                                  \
-       movl    $0x48,%ecx ;                                            \
-       movl    $1,%eax ;                                               \
-       xorl    %edx,%edx ;                                             \
-       wrmsr ;                                                         \
-       popq    %rdx ;                                                  \
-       popq    %rcx ;                                                  \
-       popq    %rax ;                                                  \
-41:                                                                    \
+       movq    %rcx, PCPU(trampoline)+TR_RCX ;                         \
+       movq    %rdx, PCPU(trampoline)+TR_RDX ;                         \
+       KMMUENTER_CORE ;                                                \
+       movq    PCPU(trampoline)+TR_RDX, %rdx ;                         \
+       movq    PCPU(trampoline)+TR_RCX, %rcx                           \
 
 /*
- * We are positioned at the base of the trapframe.  Advance the trapframe
- * and handle MMU isolation.  MMU isolation requires us to copy the
- * hardware frame to the trampoline area before setting %cr3 to the
- * isolated map.  We then set the %rsp for iretq to TR_RIP in the
- * trampoline area (after restoring the register we saved in TR_ERR).
+ * KMMUEXIT_CORE handles IBRS and IBPB, but not ISOMMU
+ *
+ * We don't re-execute the IBPB barrier on exit atm.
  */
-#define KMMUEXIT                                                       \
-       addq    $TF_RIP,%rsp ;                                          \
+#define KMMUEXIT_CORE                                                  \
        testq   $PCB_IBRS1,PCPU(trampoline)+TR_PCB_GFLAGS ;             \
        je      41f ;                                                   \
        movq    %rax, PCPU(trampoline)+TR_RAX ;                         \
        movq    %rcx, PCPU(trampoline)+TR_RCX ;                         \
        movq    %rdx, PCPU(trampoline)+TR_RDX ;                         \
-       movl    $0x48,%ecx ;                                            \
-       movl    $0,%eax ;                                               \
+       movl    $MSR_SPEC_CTRL,%ecx ;                                   \
+       movl    $MSR_IBRS_DISABLE,%eax ;                                \
        xorl    %edx,%edx ;                                             \
        wrmsr ;                                                         \
        movq    PCPU(trampoline)+TR_RDX, %rdx ;                         \
        movq    PCPU(trampoline)+TR_RCX, %rcx ;                         \
        movq    PCPU(trampoline)+TR_RAX, %rax ;                         \
-41:    testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
+41:
+
+/*
+ * We are positioned at the base of the trapframe.  Advance the trapframe
+ * and handle MMU isolation.  MMU isolation requires us to copy the
+ * hardware frame to the trampoline area before setting %cr3 to the
+ * isolated map.  We then set the %rsp for iretq to TR_RIP in the
+ * trampoline area (after restoring the register we saved in TR_ERR).
+ */
+#define KMMUEXIT                                                       \
+       addq    $TF_RIP,%rsp ;                                          \
+       KMMUEXIT_CORE ;                                                 \
+       testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
        je      40f ;                                                   \
        movq    %rcx, PCPU(trampoline)+TR_ERR ; /* save in TR_ERR */    \
        popq    %rcx ;                          /* copy %rip */         \
 /*
  * Warning: user stack pointer already loaded into %rsp at this
  * point.  We still have the kernel %gs.
+ *
+ * Caller will sysexit, we do not have to copy anything to the
+ * trampoline area.
  */
 #define KMMUEXIT_SYSCALL                                               \
-       testq   $PCB_IBRS1,PCPU(trampoline)+TR_PCB_GFLAGS ;             \
-       je      41f ;                                                   \
-       movq    %rax, PCPU(trampoline)+TR_RAX ;                         \
-       movq    %rcx, PCPU(trampoline)+TR_RCX ;                         \
-       movq    %rdx, PCPU(trampoline)+TR_RDX ;                         \
-       movl    $0x48,%ecx ;                                            \
-       movl    $0,%eax ;                                               \
-       xorl    %edx,%edx ;                                             \
-       wrmsr ;                                                         \
-       movq    PCPU(trampoline)+TR_RDX, %rdx ;                         \
-       movq    PCPU(trampoline)+TR_RCX, %rcx ;                         \
-       movq    PCPU(trampoline)+TR_RAX, %rax ;                         \
-41:    testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
+       KMMUEXIT_CORE ;                                                 \
+       testq   $PCB_ISOMMU,PCPU(trampoline)+TR_PCB_FLAGS ;             \
        je      40f ;                                                   \
        movq    %rcx, PCPU(trampoline)+TR_RCX ;                         \
        movq    PCPU(trampoline)+TR_PCB_CR3_ISO,%rcx ;                  \
index 48bc0de..9ef1213 100644 (file)
 #define        MSR_APICBASE            0x01b
 #define        MSR_EBL_CR_POWERON      0x02a
 #define        MSR_TEST_CTL            0x033
+#define MSR_SPEC_CTRL          0x048   /* IBRS Spectre mitigation */
+#define MSR_PRED_CMD           0x049   /* IBPB Spectre mitigation */
 #define        MSR_BIOS_UPDT_TRIG      0x079
 #define        MSR_BBL_CR_D0           0x088
 #define        MSR_BBL_CR_D1           0x089
 #define        APICBASE_ENABLED        0x00000800
 #define        APICBASE_ADDRESS        0xfffff000
 
+/*
+ * IBRS and IBPB Spectre mitigation
+ *
+ * NOTE: Either CPUID_80000008_I1_IBPB_SUPPORT or CPUID_7_0_I3_SPEC_CTRL
+ *      indicates IBPB support.  However, note that MSR_PRED_CMD is
+ *      a command register that may only be written, not read.
+ *
+ *      MSR_IBPB_BARRIER is written to MSR_PRED_CMD unconditionally.
+ *      Writing 0 has no effect.
+ */
+#define MSR_IBRS_DISABLE               0       /* MSR_SPEC_CTRL (bit 0) */
+#define MSR_IBRS_ENABLE                        1
+#define MSR_IBPB_BARRIER               1       /* MSR_PRED_CMD */
+
+#define CPUID_7_0_I3_SPEC_CTRL         0x04000000      /* in EDX (index 3) */
+#define CPUID_7_0_I3_STIBP             0x08000000      /* in EDX (index 3) */
+#define CPUID_80000008_I1_IBPB_SUPPORT 0x00001000      /* in EBX (index 1) */
+
 /*
  * PAT modes.
  */
index 104aa8c..123ee41 100644 (file)
@@ -89,6 +89,7 @@ struct pcb {
 #define PCB_ISOMMU     0x00000010      /* isolated mmu context active */
 #define PCB_IBRS1      0x00000020      /* IBRS mode 1 (kernel only) */
 #define PCB_IBRS2      0x00000040      /* IBRS mode 2 (always) */
+#define PCB_IBPB       0x00000080      /* IBPB barrier user->kernel */
 #define FP_SOFTFP       0x01           /* process using soft flt emulator */
 #define        FP_VIRTFP       0x04            /* vkernel wants exception */
 
index 8a9376c..a29b660 100644 (file)
@@ -241,11 +241,11 @@ prot_normal:
  * pointer.  We have to juggle a few things around to find our stack etc.
  * swapgs gives us access to our PCPU space only.
  *
- * We use GD_TRAMPOLINE+TR_RCX
+ * We use GD_TRAMPOLINE+TR_CR2 to save the user stack pointer temporarily.
  */
 IDTVEC(fast_syscall)
        swapgs                                  /* get kernel %gs */
-       movq    %rsp,PCPU(trampoline)+TR_RCX    /* save user %rsp */
+       movq    %rsp,PCPU(trampoline)+TR_CR2    /* save user %rsp */
        movq    PCPU(common_tss)+TSS_RSP0,%rsp
        KMMUENTER_SYSCALL
        movq    PCPU(trampoline)+TR_PCB_RSP,%rsp
@@ -255,7 +255,7 @@ IDTVEC(fast_syscall)
        /* defer TF_RSP till we have a spare register */
        movq    %r11,TF_RFLAGS(%rsp)
        movq    %rcx,TF_RIP(%rsp)       /* %rcx original value is in %r10 */
-       movq    PCPU(trampoline)+TR_RCX,%r11    /* %r11 already saved */
+       movq    PCPU(trampoline)+TR_CR2,%r11    /* %r11 already saved */
        movq    %r11,TF_RSP(%rsp)       /* user stack pointer */
        orl     $RQF_QUICKRET,PCPU(reqflags)
        movq    $KUDSEL,TF_SS(%rsp)
index bed0c41..567692e 100644 (file)
@@ -139,6 +139,7 @@ ASSYM(PCB_DBREGS, PCB_DBREGS);
 ASSYM(PCB_ISOMMU, PCB_ISOMMU);
 ASSYM(PCB_IBRS1, PCB_IBRS1);
 ASSYM(PCB_IBRS2, PCB_IBRS2);
+ASSYM(PCB_IBPB, PCB_IBPB);
 
 ASSYM(PCB_EXT, offsetof(struct pcb, pcb_ext));
 ASSYM(PCB_FLAGS, offsetof(struct pcb, pcb_flags));
index 1e4e557..5c73b7c 100644 (file)
@@ -63,6 +63,7 @@
 #include <machine/pcb_ext.h>
 #include <machine/segments.h>
 #include <machine/globaldata.h>        /* npxthread */
+#include <machine/specialreg.h>
 #include <machine/vmm.h>
 
 #include <vm/vm.h>
@@ -82,6 +83,19 @@ static void  cpu_reset_real (void);
 
 int spectre_mitigation = -1;
 
+static int spectre_ibrs_mode = 0;
+SYSCTL_INT(_machdep, OID_AUTO, spectre_ibrs_mode, CTLFLAG_RD,
+       &spectre_ibrs_mode, 0, "current IBRS mode");
+static int spectre_ibpb_mode = 0;
+SYSCTL_INT(_machdep, OID_AUTO, spectre_ibpb_mode, CTLFLAG_RD,
+       &spectre_ibpb_mode, 0, "current IBPB mode");
+static int spectre_ibrs_supported = 0;
+SYSCTL_INT(_machdep, OID_AUTO, spectre_ibrs_supported, CTLFLAG_RD,
+       &spectre_ibrs_supported, 0, "IBRS mode supported");
+static int spectre_ibpb_supported = 0;
+SYSCTL_INT(_machdep, OID_AUTO, spectre_ibpb_supported, CTLFLAG_RD,
+       &spectre_ibpb_supported, 0, "IBPB mode supported");
+
 /*
  * Finish a fork operation, with lwp lp2 nearly set up.
  * Copy and update the pcb, set up the stack so that the child
@@ -409,35 +423,49 @@ SYSINIT(swi_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY, swi_vm_setup, NULL);
 void spectre_vm_setup(void *arg);
 
 /*
- * Check for IBRS support
+ * Check for IBPB and IBRS support
+ *
+ * Returns a mask:     0x1     IBRS supported
+ *                     0x2     IBPB supported
  */
 static
 int
 spectre_check_support(void)
 {
        uint32_t p[4];
+       int rv = 0;
 
+       /*
+        * SPEC_CTRL (bit 26) and STIBP support (bit 27)
+        *
+        * XXX Not sure what the STIBP flag is meant to be used for.
+        *
+        * SPEC_CTRL indicates IBRS and IBPB support.
+        */
        p[0] = 0;
        p[1] = 0;
        p[2] = 0;
        p[3] = 0;
        cpuid_count(7, 0, p);
-       if ((p[3] & 0x0C000000U) == 0x0C000000U) {
+       if (p[3] & CPUID_7_0_I3_SPEC_CTRL)
+               rv |= 3;
 
-               /*
-                * SPEC_CTRL (bit 26) and STIBP support (bit 27)
-                *
-                * 0x80000008 p[0] bit 12 indicates IBPB support
-                */
+       /*
+        * 0x80000008 p[1] bit 12 indicates IBPB support
+        *
+        * This bit might be set even though SPEC_CTRL is not set.
+        */
+       if (cpu_vendor_id == CPU_VENDOR_INTEL) {
                p[0] = 0;
                p[1] = 0;
                p[2] = 0;
                p[3] = 0;
                do_cpuid(0x80000008U, p);
-               if (p[0] & 0x00001000)
-                       return 1;
+               if (p[1] & CPUID_80000008_I1_IBPB_SUPPORT)
+                       rv |= 2;
        }
-       return 0;
+
+       return rv;
 }
 
 /*
@@ -446,31 +474,107 @@ spectre_check_support(void)
  */
 static
 void
-spectre_sysctl_changed(int old_value)
+spectre_sysctl_changed(void)
 {
        globaldata_t save_gd;
+       struct trampframe *tr;
        int n;
 
+       /*
+        * Console message on mitigation mode change
+        */
+       kprintf("machdep.spectre_mitigation=%d: ", spectre_mitigation);
+
+       if (spectre_ibrs_supported == 0) {
+               kprintf("IBRS=NOSUPPORT, ");
+       } else {
+               switch(spectre_mitigation & 3) {
+               case 0:
+                       kprintf("IBRS=0 (disabled), ");
+                       break;
+               case 1:
+                       kprintf("IBRS=1 (kern-only), ");
+                       break;
+               case 2:
+                       kprintf("IBRS=2 (always-on), ");
+                       break;
+               case 3:
+                       kprintf("IBRS=?, ");
+                       break;
+               }
+       }
+
+       if (spectre_ibpb_supported == 0) {
+               kprintf("IBPB=NOSUPPORT\n");
+       } else {
+               switch(spectre_mitigation & 4) {
+               case 0:
+                       kprintf("IBPB=0 (disabled)\n");
+                       break;
+               case 4:
+                       kprintf("IBPB=1 (enabled)\n");
+                       break;
+               }
+       }
+
+       /*
+        * Fixup state
+        */
        save_gd = mycpu;
        for (n = 0; n < ncpus; ++n) {
                lwkt_setcpu_self(globaldata_find(n));
+               cpu_ccfence();
+               tr = &pscpu->trampoline;
 
-               pscpu->trampoline.tr_pcb_gflags &= ~(PCB_IBRS1 | PCB_IBRS2);
+               tr->tr_pcb_gflags &= ~(PCB_IBRS1 | PCB_IBRS2 | PCB_IBPB);
+               spectre_ibrs_mode = 0;
+               spectre_ibpb_mode = 0;
 
-               switch(spectre_mitigation) {
+               /*
+                * IBRS mode
+                */
+               switch(spectre_mitigation & 3) {
                case 0:
-                       if (old_value >= 0)
-                               wrmsr(0x48, 0);
+                       /*
+                        * Disable IBRS
+                        *
+                        * Make sure IBRS is turned off in case we were in
+                        * a global mode before.
+                        */
+                       if (spectre_ibrs_supported)
+                               wrmsr(MSR_SPEC_CTRL, 0);
                        break;
                case 1:
-                       pscpu->trampoline.tr_pcb_gflags |= PCB_IBRS1;
-                       wrmsr(0x48, 1);
+                       /*
+                        * IBRS in kernel
+                        */
+                       if (spectre_ibrs_supported) {
+                               tr->tr_pcb_gflags |= PCB_IBRS1;
+                               wrmsr(MSR_SPEC_CTRL, 1);
+                               spectre_ibrs_mode = 1;
+                       }
                        break;
                case 2:
-                       pscpu->trampoline.tr_pcb_gflags |= PCB_IBRS2;
-                       wrmsr(0x48, 1);
+                       /*
+                        * IBRS at all times
+                        */
+                       if (spectre_ibrs_supported) {
+                               tr->tr_pcb_gflags |= PCB_IBRS2;
+                               wrmsr(MSR_SPEC_CTRL, 1);
+                               spectre_ibrs_mode = 2;
+                       }
                        break;
                }
+
+               /*
+                * IBPB mode
+                */
+               if (spectre_mitigation & 4) {
+                       if (spectre_ibpb_supported) {
+                               tr->tr_pcb_gflags |= PCB_IBPB;
+                               spectre_ibpb_mode = 1;
+                       }
+               }
        }
        if (save_gd != mycpu)
                lwkt_setcpu_self(save_gd);
@@ -482,17 +586,15 @@ spectre_sysctl_changed(int old_value)
 static int
 sysctl_spectre_mitigation(SYSCTL_HANDLER_ARGS)
 {
-       int new_spectre;
-       int old_spectre;
+       int spectre;
        int error;
 
-       old_spectre = spectre_mitigation;
-       new_spectre = old_spectre;
-       error = sysctl_handle_int(oidp, &new_spectre, 0, req);
+       spectre = spectre_mitigation;
+       error = sysctl_handle_int(oidp, &spectre, 0, req);
        if (error || req->newptr == NULL)
                return error;
-       spectre_mitigation = new_spectre;
-       spectre_sysctl_changed(old_spectre);
+       spectre_mitigation = spectre;
+       spectre_sysctl_changed();
 
        return 0;
 }
@@ -500,89 +602,113 @@ sysctl_spectre_mitigation(SYSCTL_HANDLER_ARGS)
 SYSCTL_PROC(_machdep, OID_AUTO, spectre_mitigation, CTLTYPE_INT | CTLFLAG_RW,
        0, 0, sysctl_spectre_mitigation, "I", "Spectre exploit mitigation");
 
+/*
+ * NOTE: Called at SI_BOOT2_MACHDEP and also when the microcode is
+ *      updated.  Microcode updates must be applied to all cpus
+ *      for support to be recognized.
+ */
 void
 spectre_vm_setup(void *arg)
 {
        int inconsistent = 0;
-       int old_value = spectre_mitigation;
+       int supmask;
 
+       /*
+        * Fetch tunable in auto mode
+        */
        if (spectre_mitigation < 0) {
                TUNABLE_INT_FETCH("machdep.spectre_mitigation",
                                  &spectre_mitigation);
        }
 
-       if (cpu_vendor_id == CPU_VENDOR_INTEL) {
-               if (spectre_check_support()) {
-                       /*
-                        * Must be supported on all cpus before we
-                        * can enable it.  Returns silently if it
-                        * isn't.
-                        *
-                        * NOTE! arg != NULL indicates we were called
-                        *       from cpuctl after a successful microcode
-                        *       update.
-                        */
-                       if (arg != NULL) {
-                               globaldata_t save_gd;
-                               int n;
-
-                               save_gd = mycpu;
-                               for (n = 0; n < ncpus; ++n) {
-                                       lwkt_setcpu_self(globaldata_find(n));
-                                       if (spectre_check_support() == 0) {
-                                               inconsistent = 1;
-                                               break;
-                                       }
+       if ((supmask = spectre_check_support()) != 0) {
+               /*
+                * Must be supported on all cpus before we
+                * can enable it.  Returns silently if it
+                * isn't.
+                *
+                * NOTE! arg != NULL indicates we were called
+                *       from cpuctl after a successful microcode
+                *       update.
+                */
+               if (arg != NULL) {
+                       globaldata_t save_gd;
+                       int n;
+
+                       save_gd = mycpu;
+                       for (n = 0; n < ncpus; ++n) {
+                               lwkt_setcpu_self(globaldata_find(n));
+                               if (spectre_check_support() !=
+                                   supmask) {
+                                       inconsistent = 1;
+                                       break;
                                }
-                               if (save_gd != mycpu)
-                                       lwkt_setcpu_self(save_gd);
-                       }
-                       if (inconsistent == 0) {
-                               if (spectre_mitigation < 0)
-                                       spectre_mitigation = 1;
-                       } else {
-                               spectre_mitigation = -1;
                        }
-               } else {
-                       spectre_mitigation = -1;
+                       if (save_gd != mycpu)
+                               lwkt_setcpu_self(save_gd);
                }
-       } else {
-                spectre_mitigation = -1;                /* no support */
        }
 
+       /*
+        * IBRS support
+        */
+       if (supmask & 1)
+               spectre_ibrs_supported = 1;
+       else
+               spectre_ibrs_supported = 0;
+
+       /*
+        * IBPB support.
+        */
+       if (supmask & 2)
+               spectre_ibpb_supported = 1;
+       else
+               spectre_ibpb_supported = 0;
+
        /*
         * Be silent while microcode is being loaded on various CPUs,
         * until all done.
         */
-       if (inconsistent)
+       if (inconsistent) {
+               spectre_mitigation = -1;
                return;
+       }
+
+       /*
+        * Enable spectre_mitigation, set defaults if -1, adjust
+        * tuned value according to support if not.
+        *
+        * NOTE!  We do not enable IBPB for user->kernel transitions
+        *        by default, so this code is commented out for now.
+        */
+       if (spectre_ibrs_supported || spectre_ibpb_supported) {
+               if (spectre_mitigation < 0) {
+                       spectre_mitigation = 0;
+                       if (spectre_ibrs_supported)
+                               spectre_mitigation |= 1;
+#if 0
+                       if (spectre_ibpb_supported)
+                               spectre_mitigation |= 4;
+#endif
+               }
+               if (spectre_ibrs_supported == 0)
+                       spectre_mitigation &= ~3;
+               if (spectre_ibpb_supported == 0)
+                       spectre_mitigation &= ~4;
+       } else {
+               spectre_mitigation = -1;
+       }
 
        /*
         * Disallow sysctl changes when there is no support (otherwise
         * the wrmsr will cause a protection fault).
         */
-       switch(spectre_mitigation) {
-       case 0:
-               sysctl___machdep_spectre_mitigation.oid_kind |= CTLFLAG_WR;
-               kprintf("machdep.spectre_mitigation available but disabled\n");
-               break;
-       case 1:
-               sysctl___machdep_spectre_mitigation.oid_kind |= CTLFLAG_WR;
-               kprintf("machdep.spectre_mitigation available, system call\n"
-                       "performance and kernel operation will be impacted\n");
-               break;
-       case 2:
-               sysctl___machdep_spectre_mitigation.oid_kind |= CTLFLAG_WR;
-               kprintf("machdep.spectre_mitigation available, whole machine\n"
-                       "performance will be impacted\n");
-               break;
-       default:
+       if (spectre_mitigation < 0)
                sysctl___machdep_spectre_mitigation.oid_kind &= ~CTLFLAG_WR;
-               if (cpu_vendor_id == CPU_VENDOR_INTEL)
-                       kprintf("no microcode spectre mitigation available\n");
-               break;
-       }
-       spectre_sysctl_changed(old_value);
+       else
+               sysctl___machdep_spectre_mitigation.oid_kind |= CTLFLAG_WR;
+
+       spectre_sysctl_changed();
 }
 
 SYSINIT(spectre_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY,