2 * Copyright (c) 1982, 1986 The Regents of the University of California.
3 * Copyright (c) 1989, 1990 William Jolitz
4 * Copyright (c) 1994 John Dyson
5 * Copyright (c) 2008-2018 The DragonFly Project.
8 * This code is derived from software contributed to Berkeley by
9 * the Systems Programming Group of the University of Utah Computer
10 * Science Department, and William Jolitz.
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * from: @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91
41 * Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
42 * $FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.132.2.9 2003/01/25 19:02:23 dillon Exp $
45 #include "opt_reset.h"
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/malloc.h>
52 #include <sys/interrupt.h>
53 #include <sys/vnode.h>
54 #include <sys/vmmeter.h>
55 #include <sys/kernel.h>
56 #include <sys/sysctl.h>
57 #include <sys/unistd.h>
60 #include <machine/clock.h>
61 #include <machine/cpu.h>
62 #include <machine/md_var.h>
63 #include <machine/smp.h>
64 #include <machine/pcb.h>
65 #include <machine/pcb_ext.h>
66 #include <machine/segments.h>
67 #include <machine/globaldata.h> /* npxthread */
68 #include <machine/specialreg.h>
71 #include <vm/vm_param.h>
73 #include <vm/vm_kern.h>
74 #include <vm/vm_page.h>
75 #include <vm/vm_map.h>
76 #include <vm/vm_extern.h>
78 #include <sys/thread2.h>
80 #include <bus/isa/isa.h>
/*
 * Mitigation state globals (NOTE(review): this listing is a partial,
 * line-numbered excerpt; some original lines are missing between entries).
 *
 * *_mitigation: requested mode; -1 means "auto / not yet configured"
 *               (see *_vm_setup, which fetches the tunable when < 0).
 * *_support:    feature bits detected on the cpus.
 * *_mode:       currently operating enablements, exported read-only
 *               via the machdep.*_mode sysctls below.
 */
82 static void cpu_reset_real (void);
84 static int spectre_mitigation = -1;
85 static int spectre_support = 0;
86 static int spectre_mode = 0;
87 SYSCTL_INT(_machdep, OID_AUTO, spectre_mode, CTLFLAG_RD,
88 &spectre_mode, 0, "current Spectre enablements");
90 static int mds_mitigation = -1;
91 static int mds_support = 0;
92 static int mds_mode = 0;
93 SYSCTL_INT(_machdep, OID_AUTO, mds_mode, CTLFLAG_RD,
94 &mds_mode, 0, "current MDS enablements");
/*
 * cpu_fork: machine-dependent tail of fork. Copies lp1's PCB and trap
 * frame into lp2, builds lp2's kernel stack so that cpu_heavy_restore /
 * fork_trampoline can launch it, and duplicates LDT/TLS state.
 *
 * NOTE(review): partial numbered excerpt — interior lines (braces, local
 * declarations, some else/case arms) are missing from this view.
 */
97 * Finish a fork operation, with lwp lp2 nearly set up.
98 * Copy and update the pcb, set up the stack so that the child
99 * ready to run and return to user mode.
102 cpu_fork(struct lwp *lp1, struct lwp *lp2, int flags)
107 if ((flags & RFPROC) == 0) {
108 if ((flags & RFMEM) == 0) {
110 * Unshare user LDT. > 1 test is MPSAFE. While
111 * it can potentially race a 2->1 transition, the
112 * worst that happens is that we do an unnecessary
115 struct pcb *pcb1 = lp1->lwp_thread->td_pcb;
116 struct pcb_ldt *pcb_ldt = pcb1->pcb_ldt;
118 if (pcb_ldt && pcb_ldt->ldt_refcnt > 1) {
119 pcb_ldt = user_ldt_alloc(pcb1,pcb_ldt->ldt_len);
121 pcb1->pcb_ldt = pcb_ldt;
/* Flush live FPU state into lp1's save area before it is cloned below. */
128 /* Ensure that lp1's pcb is up to date. */
129 if (mdcpu->gd_npxthread == lp1->lwp_thread)
130 npxsave(lp1->lwp_thread->td_savefpu);
133 * Copy lp1's PCB. This really only applies to the
134 * debug registers and FP state, but its faster to just copy the
135 * whole thing. Because we only save the PCB at switchout time,
136 * the register state may not be current.
138 pcb2 = lp2->lwp_thread->td_pcb;
139 *pcb2 = *lp1->lwp_thread->td_pcb;
142 * Create a new fresh stack for the new process.
143 * Copy the trap frame for the return to user mode as if from a
144 * syscall. This copies the user mode register values.
146 * pcb_rsp must allocate an additional call-return pointer below
147 * the trap frame which will be restored by cpu_heavy_restore from
148 * PCB_RIP, and the thread's td_sp pointer must allocate an
149 * additonal two quadwords below the pcb_rsp call-return pointer to
150 * hold the LWKT restore function pointer and rflags.
152 * The LWKT restore function pointer must be set to cpu_heavy_restore,
153 * which is our standard heavy-weight process switch-in function.
154 * YYY eventually we should shortcut fork_return and fork_trampoline
155 * to use the LWKT restore function directly so we can get rid of
156 * all the extra crap we are setting up.
/* Trap frame sits immediately below the PCB at the top of the kstack. */
158 lp2->lwp_md.md_regs = (struct trapframe *)pcb2 - 1;
159 bcopy(lp1->lwp_md.md_regs, lp2->lwp_md.md_regs, sizeof(*lp2->lwp_md.md_regs));
162 * Set registers for trampoline to user mode. Leave space for the
163 * return address on stack. These are the kernel mode register values.
165 * Set the new pmap CR3. If the new process uses isolated VM spaces,
166 * also set the isolated CR3.
168 pmap2 = vmspace_pmap(lp2->lwp_proc->p_vmspace);
169 pcb2->pcb_cr3 = vtophys(pmap2->pm_pml4);
170 if ((pcb2->pcb_flags & PCB_ISOMMU) && pmap2->pm_pmlpv_iso) {
171 pcb2->pcb_cr3_iso = vtophys(pmap2->pm_pml4_iso);
/* (else arm, presumably) no isolated MMU: clear the iso CR3 state. */
173 pcb2->pcb_flags &= ~PCB_ISOMMU;
174 pcb2->pcb_cr3_iso = 0;
179 * Per-process spectre mitigation (future)
181 pcb2->pcb_flags &= ~(PCB_IBRS1 | PCB_IBRS2);
182 switch (spectre_mitigation) {
/* case labels for the switch are missing from this excerpt */
184 pcb2->pcb_flags |= PCB_IBRS1;
187 pcb2->pcb_flags |= PCB_IBRS2;
194 pcb2->pcb_rbx = (unsigned long)fork_return; /* fork_trampoline argument */
196 pcb2->pcb_rsp = (unsigned long)lp2->lwp_md.md_regs - sizeof(void *);
197 pcb2->pcb_r12 = (unsigned long)lp2; /* fork_trampoline argument */
201 pcb2->pcb_rip = (unsigned long)fork_trampoline;
/* td_sp layout: PSL_USER (rflags), then the LWKT restore function. */
202 lp2->lwp_thread->td_sp = (char *)(pcb2->pcb_rsp - sizeof(void *));
203 *(u_int64_t *)lp2->lwp_thread->td_sp = PSL_USER;
204 lp2->lwp_thread->td_sp -= sizeof(void *);
205 *(void **)lp2->lwp_thread->td_sp = (void *)cpu_heavy_restore;
208 * pcb2->pcb_ldt: duplicated below, if necessary.
209 * pcb2->pcb_savefpu: cloned above.
210 * pcb2->pcb_flags: cloned above
211 * pcb2->pcb_onfault: cloned above (always NULL here).
212 * pcb2->pcb_onfault_sp:cloned above (dont care)
216 * XXX don't copy the i/o pages. this should probably be fixed.
218 pcb2->pcb_ext = NULL;
/*
 * Shared-memory fork bumps the LDT refcount; otherwise (missing branch)
 * a private copy is allocated — TODO confirm against full source.
 */
220 /* Copy the LDT, if necessary. */
221 if (pcb2->pcb_ldt != NULL) {
223 atomic_add_int(&pcb2->pcb_ldt->ldt_refcnt, 1);
225 pcb2->pcb_ldt = user_ldt_alloc(pcb2,
226 pcb2->pcb_ldt->ldt_len);
229 bcopy(&lp1->lwp_thread->td_tls, &lp2->lwp_thread->td_tls,
230 sizeof(lp2->lwp_thread->td_tls));
232 * Now, cpu_switch() can schedule the new lwp.
233 * pcb_rsp is loaded pointing to the cpu_switch() stack frame
234 * containing the return address when exiting cpu_switch.
235 * This will normally be to fork_trampoline(), which will have
236 * %rbx loaded with the new lwp's pointer. fork_trampoline()
237 * will set up a stack to call fork_return(lp, frame); to complete
238 * the return to user-mode.
/*
 * cpu_prepare_lwp: point a new lwp's user trap frame at the function,
 * stack and argument supplied in params (lwp_create path), and push a
 * NULL fake return address so a returning lwp faults with SIGSEGV.
 *
 * NOTE(review): partial excerpt — return type, braces and the final
 * return/error handling lines are missing from this view.
 */
243 * Prepare new lwp to return to the address specified in params.
246 cpu_prepare_lwp(struct lwp *lp, struct lwp_params *params)
248 struct trapframe *regs = lp->lwp_md.md_regs;
249 void *bad_return = NULL;
252 regs->tf_rip = (long)params->lwp_func;
253 regs->tf_rsp = (long)params->lwp_stack;
254 /* Set up argument for function call */
255 regs->tf_rdi = (long)params->lwp_arg;   /* first argument in %rdi (SysV ABI) */
258 * Set up fake return address. As the lwp function may never return,
259 * we simply copy out a NULL pointer and force the lwp to receive
260 * a SIGSEGV if it returns anyways.
262 regs->tf_rsp -= sizeof(void *);
263 error = copyout(&bad_return, (void *)regs->tf_rsp, sizeof(bad_return));
/* Route first scheduling through generic_lwp_return. */
267 cpu_set_fork_handler(lp,
268 (void (*)(void *, struct trapframe *))generic_lwp_return, lp);
/*
 * cpu_set_fork_handler: install func/arg into the not-yet-scheduled
 * lwp's PCB (%rbx/%r12), so fork_trampoline calls func(arg, frame).
 * NOTE(review): partial excerpt — arg parameter line and braces missing.
 */
273 * Intercept the return address from a freshly forked process that has NOT
274 * been scheduled yet.
276 * This is needed to make kernel threads stay in kernel mode.
279 cpu_set_fork_handler(struct lwp *lp, void (*func)(void *, struct trapframe *),
283 * Note that the trap frame follows the args, so the function
284 * is really called like this: func(arg, frame);
286 lp->lwp_thread->td_pcb->pcb_rbx = (long)func; /* function */
287 lp->lwp_thread->td_pcb->pcb_r12 = (long)arg; /* first arg */
/*
 * cpu_set_thread_handler: arm a pure LWKT kernel thread. func/arg go in
 * the PCB, the thread switches via cpu_lwkt_switch, and td_sp is seeded
 * with rfunc (run if func returns) under cpu_kthread_restore.
 * NOTE(review): partial excerpt — return type and braces missing.
 */
291 cpu_set_thread_handler(thread_t td, void (*rfunc)(void), void *func, void *arg)
293 td->td_pcb->pcb_rbx = (long)func;
294 td->td_pcb->pcb_r12 = (long)arg;
295 td->td_switch = cpu_lwkt_switch;
296 td->td_sp -= sizeof(void *);
297 *(void **)td->td_sp = rfunc; /* exit function on return */
298 td->td_sp -= sizeof(void *);
299 *(void **)td->td_sp = cpu_kthread_restore;
/*
 * NOTE(review): function header is missing from this excerpt — from the
 * body this appears to be the lwp-exit path (presumably cpu_lwp_exit):
 * it clears hardware-breakpoint state, counts a switch, and deschedules
 * the current thread. Confirm name/signature against the full source.
 */
305 struct thread *td = curthread;
310 /* Some x86 functionality was dropped */
311 KKASSERT(pcb->pcb_ext == NULL);
314 * disable all hardware breakpoints
316 if (pcb->pcb_flags & PCB_DBREGS) {
318 pcb->pcb_flags &= ~PCB_DBREGS;
/* Account the context switch, then pull ourselves off the run queues. */
320 td->td_gd->gd_cnt.v_swtch++;
322 crit_enter_quick(td);
323 if (td->td_flags & TDF_TSLEEPQ)
325 lwkt_deschedule_self(td);
326 lwkt_remove_tdallq(td);
/*
 * cpu_thread_exit: final switch-out of a dying thread. Installs
 * cpu_exit_switch as the switch function and marks TDF_EXITING; the
 * subsequent switch (line missing from excerpt) must never return.
 */
331 * Terminate the current thread. The caller must have already acquired
332 * the thread's rwlock and placed it on a reap list or otherwise notified
333 * a reaper of its existance. We set a special assembly switch function which
334 * releases td_rwlock after it has cleaned up the MMU state and switched
337 * Must be caller from a critical section and with the thread descheduled.
340 cpu_thread_exit(void)
343 curthread->td_switch = cpu_exit_switch;
344 curthread->td_flags |= TDF_EXITING;
346 panic("cpu_thread_exit: lwkt_switch() unexpectedly returned");
/*
 * NOTE(review): body fragment of (presumably) cpu_reset_real, declared
 * above — pulse the keyboard controller's reset line (0xFE to port
 * IO_KBD+4), and if the machine survives that, force a triple-fault-style
 * shutdown by zeroing the page directory.
 */
359 * Attempt to do a CPU reset via the keyboard controller,
360 * do not turn off the GateA20, as any machine that fails
361 * to do the reset here would then end up in no man's land.
364 #if !defined(BROKEN_KEYBOARD_RESET)
365 outb(IO_KBD + 4, 0xFE);
366 DELAY(500000); /* wait 0.5 sec to see if that did it */
367 kprintf("Keyboard reset did not work, attempting CPU shutdown\n");
368 DELAY(1000000); /* wait 1 sec for kprintf to complete */
371 /* force a shutdown by unmapping entire address space ! */
372 bzero((caddr_t) PTD, PAGE_SIZE);
375 /* "good night, sweet prince .... <THUNK!>" */
/*
 * swi_vm: software interrupt handler that drains pending busdma work
 * (call target line missing from this excerpt). swi_vm_setup registers
 * it as SWI_VM at boot via the SYSINIT below.
 */
382 swi_vm(void *arg, void *frame)
384 if (busdma_swi_pending != 0)
389 swi_vm_setup(void *arg)
391 register_swi_mp(SWI_VM, swi_vm, NULL, "swi_vm", NULL, 0);
394 SYSINIT(swi_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY, swi_vm_setup, NULL);
/*
 * spectre_check_support: probe CPUID for Spectre-mitigation hardware
 * features on the current cpu and return a bitmask of the *_SUPPORTED
 * defines below. Intel uses CPUID leaf 7 (STDEXT3 bits) plus leaf
 * 0x80000008; AMD uses the 0x80000008 CAPEX bits.
 * NOTE(review): partial excerpt — locals, braces and the final return
 * are missing from this view.
 */
397 * NOTE: This routine is also called after a successful microcode
400 void mitigation_vm_setup(void *arg);
403 * Check for IBPB and IBRS support
405 * This bits also specify desired modes in the spectre_mitigation sysctl.
407 #define IBRS_SUPPORTED 0x0001
408 #define STIBP_SUPPORTED 0x0002
409 #define IBPB_SUPPORTED 0x0004
410 #define IBRS_AUTO_SUPPORTED 0x0008
411 #define STIBP_AUTO_SUPPORTED 0x0010
412 #define IBRS_PREFERRED_REQUEST 0x0020
416 spectre_check_support(void)
422 * Spectre mitigation hw bits
424 * IBRS Indirect Branch Restricted Speculation (isolation)
425 * STIBP Single Thread Indirect Branch Prediction (isolation)
426 * IBPB Branch Prediction Barrier (barrier)
428 * IBRS and STIBP must be toggled (enabled on entry to kernel,
429 * disabled on exit, as well as disabled during any MWAIT/HLT).
430 * When *_AUTO bits are available, IBRS and STIBP may be left
431 * turned on and do not have to be toggled on kernel entry/exit.
432 * Be sure to clear before going idle (else hyperthread performance
435 * All this shit has enormous overhead. IBPB in particular, and
436 * non-auto modes are disabled by default.
438 if (cpu_vendor_id == CPU_VENDOR_INTEL) {
443 cpuid_count(7, 0, p);
444 if (p[3] & CPUID_STDEXT3_IBPB)
445 rv |= IBRS_SUPPORTED | IBPB_SUPPORTED;
446 if (p[3] & CPUID_STDEXT3_STIBP)
447 rv |= STIBP_SUPPORTED;
450 * 0x80000008 p[1] bit 12 indicates IBPB support
452 * This bit might be set even though STDEXT3_IBPB is not set.
458 do_cpuid(0x80000008U, p);
459 if (p[1] & CPUID_CAPEX_IBPB)
460 rv |= IBPB_SUPPORTED;
461 } else if (cpu_vendor_id == CPU_VENDOR_AMD) {
464 * p[1] bit 12 indicates IBPB support
465 * p[1] bit 14 indicates IBRS support
466 * p[1] bit 15 indicates STIBP support
468 * p[1] bit 16 indicates IBRS auto support
469 * p[1] bit 17 indicates STIBP auto support
470 * p[1] bit 18 indicates processor prefers using
471 * IBRS instead of retpoline.
477 do_cpuid(0x80000008U, p);
478 if (p[1] & CPUID_CAPEX_IBPB)
479 rv |= IBPB_SUPPORTED;
480 if (p[1] & CPUID_CAPEX_IBRS)
481 rv |= IBRS_SUPPORTED;
482 if (p[1] & CPUID_CAPEX_STIBP)
483 rv |= STIBP_SUPPORTED;
485 if (p[1] & CPUID_CAPEX_IBRS_ALWAYSON)
486 rv |= IBRS_AUTO_SUPPORTED;
487 if (p[1] & CPUID_CAPEX_STIBP_ALWAYSON)
488 rv |= STIBP_AUTO_SUPPORTED;
489 if (p[1] & CPUID_CAPEX_PREFER_IBRS)
490 rv |= IBRS_PREFERRED_REQUEST;
/*
 * spectre_sysctl_changed: apply the requested spectre_mitigation mode.
 * Migrates itself to each cpu in turn (lwkt_setcpu_self), clears then
 * re-populates the per-cpu trampoline's tr_pcb_spec_ctrl[] words
 * ([0]=kernel entry, [1]=kernel exit), updates the SPEC_CTRL MSR where
 * supported, records the result in spectre_mode, and logs a summary.
 * NOTE(review): partial excerpt — locals, braces, some kprintf lines and
 * the wrmsr call line are missing from this view.
 */
497 * Iterate CPUs and adjust MSR for global operations, since
498 * the KMMU* code won't do it if spectre_mitigation is 0 or 2.
500 #define CHECK(flag) (spectre_mitigation & spectre_support & (flag))
504 spectre_sysctl_changed(void)
506 globaldata_t save_gd;
507 struct trampframe *tr;
513 spec_mask = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP |
514 SPEC_CTRL_DUMMY_ENABLE | SPEC_CTRL_DUMMY_IBPB;
521 for (n = 0; n < ncpus; ++n) {
522 lwkt_setcpu_self(globaldata_find(n));
524 tr = &pscpu->trampoline;
527 * Make sure we are cleaned out.
529 * XXX cleanup, reusing globals inside the loop (they get
530 * set to the same thing each loop)
532 * [0] kernel entry (idle exit)
533 * [1] kernel exit (idle entry)
535 tr->tr_pcb_spec_ctrl[0] &= ~spec_mask;
536 tr->tr_pcb_spec_ctrl[1] &= ~spec_mask;
539 * Don't try to parse if not available
541 if (spectre_mitigation < 0)
545 * IBRS mode. Auto overrides toggling.
547 * Only set the ENABLE flag if we have to toggle something
551 if (CHECK(IBRS_AUTO_SUPPORTED)) {
552 spec_ctrl |= SPEC_CTRL_IBRS;
553 mode |= IBRS_AUTO_SUPPORTED;
554 } else if (CHECK(IBRS_SUPPORTED)) {
555 spec_ctrl |= SPEC_CTRL_IBRS | SPEC_CTRL_DUMMY_ENABLE;
556 mode |= IBRS_SUPPORTED;
558 if (CHECK(STIBP_AUTO_SUPPORTED)) {
559 spec_ctrl |= SPEC_CTRL_STIBP;
560 mode |= STIBP_AUTO_SUPPORTED;
561 } else if (CHECK(STIBP_SUPPORTED)) {
562 spec_ctrl |= SPEC_CTRL_STIBP | SPEC_CTRL_DUMMY_ENABLE;
563 mode |= STIBP_SUPPORTED;
567 * IBPB requested and supported.
569 if (CHECK(IBPB_SUPPORTED)) {
570 spec_ctrl |= SPEC_CTRL_DUMMY_IBPB;
571 mode |= IBPB_SUPPORTED;
575 * Update the MSR if the cpu supports the modes to ensure
576 * proper disablement if the user disabled the mode.
578 if (spectre_support & (IBRS_SUPPORTED | IBRS_AUTO_SUPPORTED |
579 STIBP_SUPPORTED | STIBP_AUTO_SUPPORTED)) {
581 spec_ctrl & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
585 * Update spec_ctrl fields in the trampoline.
587 * [0] on-kernel-entry (on-idle-exit)
588 * [1] on-kernel-exit (on-idle-entry)
590 * When auto mode is supported we leave the bit set, otherwise
593 tr->tr_pcb_spec_ctrl[0] |= spec_ctrl;
594 if (CHECK(IBRS_AUTO_SUPPORTED) == 0)
595 spec_ctrl &= ~SPEC_CTRL_IBRS;
596 if (CHECK(STIBP_AUTO_SUPPORTED) == 0)
597 spec_ctrl &= ~SPEC_CTRL_STIBP;
598 tr->tr_pcb_spec_ctrl[1] |= spec_ctrl;
601 * Make sure we set this on the first loop. It will be
602 * the same value on remaining loops.
/* Return to the cpu we started on. */
606 lwkt_setcpu_self(save_gd);
610 * Console message on mitigation mode change
612 kprintf("Spectre: support=(");
613 if (spectre_support == 0) {
616 if (spectre_support & IBRS_SUPPORTED)
618 if (spectre_support & STIBP_SUPPORTED)
620 if (spectre_support & IBPB_SUPPORTED)
622 if (spectre_support & IBRS_AUTO_SUPPORTED)
623 kprintf(" IBRS_AUTO");
624 if (spectre_support & STIBP_AUTO_SUPPORTED)
625 kprintf(" STIBP_AUTO");
626 if (spectre_support & IBRS_PREFERRED_REQUEST)
627 kprintf(" IBRS_REQUESTED");
629 kprintf(" ) req=%04x operating=(", (uint16_t)spectre_mitigation);
630 if (spectre_mode == 0) {
633 if (spectre_mode & IBRS_SUPPORTED)
635 if (spectre_mode & STIBP_SUPPORTED)
637 if (spectre_mode & IBPB_SUPPORTED)
639 if (spectre_mode & IBRS_AUTO_SUPPORTED)
640 kprintf(" IBRS_AUTO");
641 if (spectre_mode & STIBP_AUTO_SUPPORTED)
642 kprintf(" STIBP_AUTO");
643 if (spectre_mode & IBRS_PREFERRED_REQUEST)
644 kprintf(" IBRS_REQUESTED");
/*
 * sysctl_spectre_mitigation: string sysctl handler shared by the RW
 * machdep.spectre_mitigation and RO machdep.spectre_support nodes
 * (distinguished via oidp->oid_kind & CTLFLAG_WR). Read: emits the
 * current mode or support set as space-separated keywords. Write:
 * parses keywords (strsep), sets spectre_mitigation and re-applies via
 * spectre_sysctl_changed().
 * NOTE(review): partial excerpt — locals, braces and early-return lines
 * are missing from this view.
 */
652 * User changes sysctl value
655 sysctl_spectre_mitigation(SYSCTL_HANDLER_ARGS)
666 * Return current operating mode or support.
668 if (oidp->oid_kind & CTLFLAG_WR)
669 spectre = spectre_mode;
671 spectre = spectre_support;
673 spectre &= (IBRS_SUPPORTED | IBRS_AUTO_SUPPORTED |
674 STIBP_SUPPORTED | STIBP_AUTO_SUPPORTED |
680 error = SYSCTL_OUT(req, " ", 1);
684 if (spectre & IBRS_SUPPORTED) {
685 spectre &= ~IBRS_SUPPORTED;
686 error = SYSCTL_OUT(req, "IBRS", 4);
688 if (spectre & IBRS_AUTO_SUPPORTED) {
689 spectre &= ~IBRS_AUTO_SUPPORTED;
690 error = SYSCTL_OUT(req, "IBRS_AUTO", 9);
692 if (spectre & STIBP_SUPPORTED) {
693 spectre &= ~STIBP_SUPPORTED;
694 error = SYSCTL_OUT(req, "STIBP", 5);
696 if (spectre & STIBP_AUTO_SUPPORTED) {
697 spectre &= ~STIBP_AUTO_SUPPORTED;
698 error = SYSCTL_OUT(req, "STIBP_AUTO", 10);
700 if (spectre & IBPB_SUPPORTED) {
701 spectre &= ~IBPB_SUPPORTED;
702 error = SYSCTL_OUT(req, "IBPB", 4);
706 error = SYSCTL_OUT(req, "NONE", 4);
/* No new value supplied, or node is read-only: stop here. */
709 if (error || req->newptr == NULL)
711 if ((oidp->oid_kind & CTLFLAG_WR) == 0)
715 * Change current operating mode
717 len = req->newlen - req->newidx;
718 if (len >= sizeof(buf)) {
722 error = SYSCTL_IN(req, buf, len);
728 while (error == 0 && iter) {
729 ptr = strsep(&iter, " ,\t\r\n");
732 if (strcasecmp(ptr, "NONE") == 0)
734 else if (strcasecmp(ptr, "IBRS") == 0)
735 spectre |= IBRS_SUPPORTED;
736 else if (strcasecmp(ptr, "IBRS_AUTO") == 0)
737 spectre |= IBRS_AUTO_SUPPORTED;
738 else if (strcasecmp(ptr, "STIBP") == 0)
739 spectre |= STIBP_SUPPORTED;
740 else if (strcasecmp(ptr, "STIBP_AUTO") == 0)
741 spectre |= STIBP_AUTO_SUPPORTED;
742 else if (strcasecmp(ptr, "IBPB") == 0)
743 spectre |= IBPB_SUPPORTED;
748 spectre_mitigation = spectre;
749 spectre_sysctl_changed();
754 SYSCTL_PROC(_machdep, OID_AUTO, spectre_mitigation,
755 CTLTYPE_STRING | CTLFLAG_RW,
756 0, 0, sysctl_spectre_mitigation, "A", "Spectre exploit mitigation");
757 SYSCTL_PROC(_machdep, OID_AUTO, spectre_support,
758 CTLTYPE_STRING | CTLFLAG_RD,
759 0, 0, sysctl_spectre_mitigation, "A", "Spectre supported features");
/*
 * spectre_vm_setup: boot-time (and post-microcode-update) Spectre setup.
 * Fetches the machdep.spectre_mitigation tunable, verifies every cpu
 * reports the same support mask, picks safe defaults (auto IBRS/STIBP
 * only; no IBPB), toggles CTLFLAG_WR on the sysctl according to
 * support, and applies the result via spectre_sysctl_changed().
 * NOTE(review): partial excerpt — braces, some else arms and the
 * inconsistency handling are missing from this view.
 */
762 * NOTE: Called at SI_BOOT2_MACHDEP and also when the microcode is
763 * updated. Microcode updates must be applied to all cpus
764 * for support to be recognized.
767 spectre_vm_setup(void *arg)
769 int inconsistent = 0;
773 * Fetch tunable in auto mode
775 if (spectre_mitigation < 0) {
776 TUNABLE_INT_FETCH("machdep.spectre_mitigation",
777 &spectre_mitigation);
780 if ((supmask = spectre_check_support()) != 0) {
782 * Must be supported on all cpus before we
783 * can enable it. Returns silently if it
786 * NOTE! arg != NULL indicates we were called
787 * from cpuctl after a successful microcode
791 globaldata_t save_gd;
795 for (n = 0; n < ncpus; ++n) {
796 lwkt_setcpu_self(globaldata_find(n));
798 if (spectre_check_support() !=
804 lwkt_setcpu_self(save_gd);
810 * Be silent while microcode is being loaded on various CPUs,
814 spectre_mitigation = -1;
822 spectre_support = supmask;
825 * Enable spectre_mitigation, set defaults if -1, adjust
826 * tuned value according to support if not.
828 * NOTE! We do not enable IBPB for user->kernel transitions
829 * by default, so this code is commented out for now.
831 if (spectre_support) {
832 if (spectre_mitigation < 0) {
833 spectre_mitigation = 0;
836 * IBRS toggling not currently recommended as a
839 if (spectre_support & IBRS_AUTO_SUPPORTED)
840 spectre_mitigation |= IBRS_AUTO_SUPPORTED;
841 else if (spectre_support & IBRS_SUPPORTED)
842 spectre_mitigation |= 0;   /* deliberate no-op: toggling not enabled by default */
845 * STIBP toggling not currently recommended as a
848 if (spectre_support & STIBP_AUTO_SUPPORTED)
849 spectre_mitigation |= STIBP_AUTO_SUPPORTED;
850 else if (spectre_support & STIBP_SUPPORTED)
851 spectre_mitigation |= 0;   /* deliberate no-op */
854 * IBPB adds enormous (~2uS) overhead to system
855 * calls etc, we do not enable it by default.
857 if (spectre_support & IBPB_SUPPORTED)
858 spectre_mitigation |= 0;   /* deliberate no-op */
861 spectre_mitigation = -1;
865 * Disallow sysctl changes when there is no support (otherwise
866 * the wrmsr will cause a protection fault).
868 if (spectre_mitigation < 0)
869 sysctl___machdep_spectre_mitigation.oid_kind &= ~CTLFLAG_WR;
871 sysctl___machdep_spectre_mitigation.oid_kind |= CTLFLAG_WR;
873 spectre_sysctl_changed();
/*
 * mds_check_support: probe CPUID/MSRs for MDS mitigation features on
 * the current cpu and return a bitmask of the MDS_* defines. Intel
 * only; IA32_ARCH_CAPABILITIES.MDS_NO (when readable) or a non-Intel
 * vendor yields MDS_NOT_REQUIRED.
 * NOTE(review): partial excerpt — locals, braces and the final return
 * are missing from this view.
 */
876 #define MDS_AVX512_4VNNIW_SUPPORTED 0x0001
877 #define MDS_AVX512_4FMAPS_SUPPORTED 0x0002
878 #define MDS_MD_CLEAR_SUPPORTED 0x0004
879 #define MDS_TSX_FORCE_ABORT_SUPPORTED 0x0008
880 #define MDS_NOT_REQUIRED 0x8000
884 mds_check_support(void)
891 * MDS mitigation hw bits
893 * MD_CLEAR Use microcode-supported verf insn. This is the
894 * only mode we really support.
896 if (cpu_vendor_id == CPU_VENDOR_INTEL) {
901 cpuid_count(7, 0, p);
904 * Some hypervisors fail to implement
905 * MSR_IA32_ARCH_CAPABILITIES.
907 if (p[3] & CPUID_STDEXT3_ARCH_CAP) {
909 if (rdmsr_safe(MSR_IA32_ARCH_CAPABILITIES, &msr)) {
910 kprintf("Warning: MSR_IA32_ARCH_CAPABILITIES "
911 "cannot be accessed\n");
913 if (msr & IA32_ARCH_CAP_MDS_NO)
914 rv = MDS_NOT_REQUIRED;
916 if (p[3] & CPUID_STDEXT3_AVX5124VNNIW)
917 rv |= MDS_AVX512_4VNNIW_SUPPORTED;
918 if (p[3] & CPUID_STDEXT3_AVX5124FMAPS)
919 rv |= MDS_AVX512_4FMAPS_SUPPORTED;
920 if (p[3] & CPUID_STDEXT3_MD_CLEAR)
921 rv |= MDS_MD_CLEAR_SUPPORTED;
922 if (p[3] & CPUID_STDEXT3_TSXFA)
923 rv |= MDS_TSX_FORCE_ABORT_SUPPORTED;
/* (else arm, presumably non-Intel): MDS mitigation not required. */
925 rv = MDS_NOT_REQUIRED;
/*
 * mds_sysctl_changed: apply the requested mds_mitigation mode. Same
 * per-cpu trampoline walk as spectre_sysctl_changed, but only
 * SPEC_CTRL_MDS_ENABLE is managed, and only on the kernel-exit /
 * idle-entry slot ([1]) — MD_CLEAR (VERW) is needed on the way out of
 * the kernel, not on the way in. Logs support/operating summary.
 * NOTE(review): partial excerpt — locals, braces and some kprintf
 * lines are missing from this view. CHECK is redefined here for the
 * mds_* globals.
 */
932 * Iterate CPUs and adjust MSR for global operations, since
933 * the KMMU* code won't do it if spectre_mitigation is 0 or 2.
935 #define CHECK(flag) (mds_mitigation & mds_support & (flag))
939 mds_sysctl_changed(void)
941 globaldata_t save_gd;
942 struct trampframe *tr;
948 spec_mask = SPEC_CTRL_MDS_ENABLE;
955 for (n = 0; n < ncpus; ++n) {
956 lwkt_setcpu_self(globaldata_find(n));
958 tr = &pscpu->trampoline;
961 * Make sure we are cleaned out.
963 * XXX cleanup, reusing globals inside the loop (they get
964 * set to the same thing each loop)
966 * [0] kernel entry (idle exit)
967 * [1] kernel exit (idle entry)
969 tr->tr_pcb_spec_ctrl[0] &= ~spec_mask;
970 tr->tr_pcb_spec_ctrl[1] &= ~spec_mask;
973 * Don't try to parse if not available
975 if (mds_mitigation < 0)
979 if (CHECK(MDS_MD_CLEAR_SUPPORTED)) {
980 spec_ctrl |= SPEC_CTRL_MDS_ENABLE;
981 mode |= MDS_MD_CLEAR_SUPPORTED;
985 * Update spec_ctrl fields in the trampoline.
987 * [0] on-kernel-entry (on-idle-exit)
988 * [1] on-kernel-exit (on-idle-entry)
990 * The MDS stuff is only needed on kernel-exit or idle-entry
992 /* tr->tr_pcb_spec_ctrl[0] |= spec_ctrl; */
993 tr->tr_pcb_spec_ctrl[1] |= spec_ctrl;
996 * Make sure we set this on the first loop. It will be
997 * the same value on remaining loops.
1001 lwkt_setcpu_self(save_gd);
1005 * Console message on mitigation mode change
1007 kprintf("MDS: support=(");
1008 if (mds_support == 0) {
1011 if (mds_support & MDS_AVX512_4VNNIW_SUPPORTED)
1012 kprintf(" AVX512_4VNNIW");
1013 if (mds_support & MDS_AVX512_4FMAPS_SUPPORTED)
1014 kprintf(" AVX512_4FMAPS");
1015 if (mds_support & MDS_MD_CLEAR_SUPPORTED)
1016 kprintf(" MD_CLEAR");
1017 if (mds_support & MDS_TSX_FORCE_ABORT_SUPPORTED)
1018 kprintf(" TSX_FORCE_ABORT");
1019 if (mds_support & MDS_NOT_REQUIRED)
1020 kprintf(" MDS_NOT_REQUIRED");
1022 kprintf(" ) req=%04x operating=(", (uint16_t)mds_mitigation);
1023 if (mds_mode == 0) {
1026 if (mds_mode & MDS_AVX512_4VNNIW_SUPPORTED)
1027 kprintf(" AVX512_4VNNIW");
1028 if (mds_mode & MDS_AVX512_4FMAPS_SUPPORTED)
1029 kprintf(" AVX512_4FMAPS");
1030 if (mds_mode & MDS_MD_CLEAR_SUPPORTED)
1031 kprintf(" MD_CLEAR");
1032 if (mds_mode & MDS_TSX_FORCE_ABORT_SUPPORTED)
1033 kprintf(" TSX_FORCE_ABORT");
1034 if (mds_mode & MDS_NOT_REQUIRED)
1035 kprintf(" MDS_NOT_REQUIRED");
/*
 * sysctl_mds_mitigation: string sysctl handler shared by the RW
 * machdep.mds_mitigation and RO machdep.mds_support nodes (selected by
 * oidp->oid_kind & CTLFLAG_WR). Mirrors sysctl_spectre_mitigation:
 * emit keywords on read, parse keywords and call mds_sysctl_changed()
 * on write.
 * NOTE(review): partial excerpt — locals, braces and early-return
 * lines are missing from this view.
 */
1043 * User changes sysctl value
1046 sysctl_mds_mitigation(SYSCTL_HANDLER_ARGS)
1057 * Return current operating mode or support.
1059 if (oidp->oid_kind & CTLFLAG_WR)
1064 mds &= MDS_AVX512_4VNNIW_SUPPORTED |
1065 MDS_AVX512_4FMAPS_SUPPORTED |
1066 MDS_MD_CLEAR_SUPPORTED |
1067 MDS_TSX_FORCE_ABORT_SUPPORTED |
1074 error = SYSCTL_OUT(req, " ", 1);
1078 if (mds & MDS_AVX512_4VNNIW_SUPPORTED) {
1079 mds &= ~MDS_AVX512_4VNNIW_SUPPORTED;
1080 error = SYSCTL_OUT(req, "AVX512_4VNNIW", 13);
1082 if (mds & MDS_AVX512_4FMAPS_SUPPORTED) {
1083 mds &= ~MDS_AVX512_4FMAPS_SUPPORTED;
1084 error = SYSCTL_OUT(req, "AVX512_4FMAPS", 13);
1086 if (mds & MDS_MD_CLEAR_SUPPORTED) {
1087 mds &= ~MDS_MD_CLEAR_SUPPORTED;
1088 error = SYSCTL_OUT(req, "MD_CLEAR", 8);
1090 if (mds & MDS_TSX_FORCE_ABORT_SUPPORTED) {
1091 mds &= ~MDS_TSX_FORCE_ABORT_SUPPORTED;
1092 error = SYSCTL_OUT(req, "TSX_FORCE_ABORT", 15);
1094 if (mds & MDS_NOT_REQUIRED) {
1095 mds &= ~MDS_NOT_REQUIRED;
1096 error = SYSCTL_OUT(req, "MDS_NOT_REQUIRED", 16);
1100 error = SYSCTL_OUT(req, "NONE", 4);
/* No new value supplied, or node is read-only: stop here. */
1103 if (error || req->newptr == NULL)
1105 if ((oidp->oid_kind & CTLFLAG_WR) == 0)
1109 * Change current operating mode
1111 len = req->newlen - req->newidx;
1112 if (len >= sizeof(buf)) {
1116 error = SYSCTL_IN(req, buf, len);
1122 while (error == 0 && iter) {
1123 ptr = strsep(&iter, " ,\t\r\n");
1126 if (strcasecmp(ptr, "NONE") == 0)
1128 else if (strcasecmp(ptr, "AVX512_4VNNIW") == 0)
1129 mds |= MDS_AVX512_4VNNIW_SUPPORTED;
1130 else if (strcasecmp(ptr, "AVX512_4FMAPS") == 0)
1131 mds |= MDS_AVX512_4FMAPS_SUPPORTED;
1132 else if (strcasecmp(ptr, "MD_CLEAR") == 0)
1133 mds |= MDS_MD_CLEAR_SUPPORTED;
1134 else if (strcasecmp(ptr, "TSX_FORCE_ABORT") == 0)
1135 mds |= MDS_TSX_FORCE_ABORT_SUPPORTED;
1136 else if (strcasecmp(ptr, "MDS_NOT_REQUIRED") == 0)
1137 mds |= MDS_NOT_REQUIRED;
1142 mds_mitigation = mds;
1143 mds_sysctl_changed();
1148 SYSCTL_PROC(_machdep, OID_AUTO, mds_mitigation,
1149 CTLTYPE_STRING | CTLFLAG_RW,
1150 0, 0, sysctl_mds_mitigation, "A", "MDS exploit mitigation");
1151 SYSCTL_PROC(_machdep, OID_AUTO, mds_support,
1152 CTLTYPE_STRING | CTLFLAG_RD,
1153 0, 0, sysctl_mds_mitigation, "A", "MDS supported features");
/*
 * mds_vm_setup: boot-time (and post-microcode-update) MDS setup,
 * paralleling spectre_vm_setup: fetch tunable, verify identical
 * support across all cpus, leave MDS disabled by default (note the
 * commented-out MD_CLEAR enable), gate the sysctl's CTLFLAG_WR on
 * support, and apply via mds_sysctl_changed().
 * NOTE(review): partial excerpt — braces and inconsistency handling
 * are missing from this view.
 */
1156 * NOTE: Called at SI_BOOT2_MACHDEP and also when the microcode is
1157 * updated. Microcode updates must be applied to all cpus
1158 * for support to be recognized.
1161 mds_vm_setup(void *arg)
1163 int inconsistent = 0;
1167 * Fetch tunable in auto mode
1169 if (mds_mitigation < 0) {
1170 TUNABLE_INT_FETCH("machdep.mds_mitigation", &mds_mitigation);
1173 if ((supmask = mds_check_support()) != 0) {
1175 * Must be supported on all cpus before we
1176 * can enable it. Returns silently if it
1179 * NOTE! arg != NULL indicates we were called
1180 * from cpuctl after a successful microcode
1184 globaldata_t save_gd;
1188 for (n = 0; n < ncpus; ++n) {
1189 lwkt_setcpu_self(globaldata_find(n));
1191 if (mds_check_support() != supmask) {
1196 lwkt_setcpu_self(save_gd);
1202 * Be silent while microcode is being loaded on various CPUs,
1206 mds_mitigation = -1;
1214 mds_support = supmask;
1217 * Enable mds_mitigation, set defaults if -1, adjust
1218 * tuned value according to support if not.
1220 * NOTE! MDS is not enabled by default.
1223 if (mds_mitigation < 0) {
1226 if ((mds_support & MDS_NOT_REQUIRED) == 0 &&
1227 (mds_support & MDS_MD_CLEAR_SUPPORTED)) {
1228 /* mds_mitigation |= MDS_MD_CLEAR_SUPPORTED; */
1232 mds_mitigation = -1;
1236 * Disallow sysctl changes when there is no support (otherwise
1237 * the wrmsr will cause a protection fault).
1239 if (mds_mitigation < 0)
1240 sysctl___machdep_mds_mitigation.oid_kind &= ~CTLFLAG_WR;
1242 sysctl___machdep_mds_mitigation.oid_kind |= CTLFLAG_WR;
1244 mds_sysctl_changed();
/*
 * mitigation_vm_setup: umbrella entry run at SI_BOOT2_MACHDEP (and
 * from cpuctl after microcode updates) — calls spectre_vm_setup (and,
 * presumably, mds_vm_setup on a line missing from this excerpt).
 */
1248 * NOTE: Called at SI_BOOT2_MACHDEP and also when the microcode is
1249 * updated. Microcode updates must be applied to all cpus
1250 * for support to be recognized.
1253 mitigation_vm_setup(void *arg)
1255 spectre_vm_setup(arg);
1259 SYSINIT(mitigation_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY,
1260 mitigation_vm_setup, NULL);
/*
 * cpu_vmspace_alloc/free: no-op hooks on x86_64 (bodies empty/elided).
 *
 * kvm_access_check: validate that [saddr, eaddr) lies inside the kernel
 * VA window, is fully mapped (pmap_kextract per page), and passes
 * kernacc for the requested prot. Return values are on lines missing
 * from this excerpt.
 */
1263 * platform-specific vmspace initialization (nothing for x86_64)
1266 cpu_vmspace_alloc(struct vmspace *vm __unused)
1271 cpu_vmspace_free(struct vmspace *vm __unused)
1276 kvm_access_check(vm_offset_t saddr, vm_offset_t eaddr, int prot)
1280 if (saddr < KvaStart)
1282 if (eaddr >= KvaEnd)
1284 for (addr = saddr; addr < eaddr; addr += PAGE_SIZE) {
1285 if (pmap_kextract(addr) == 0)
1288 if (!kernacc((caddr_t)saddr, eaddr - saddr, prot))
/*
 * _test_frame_enter: debug assertion hook run on kernel entry. For
 * user-mode frames it checks the frame matches lwp_md.md_regs and
 * records user rsp/rip for cross-checking in _test_frame_exit; it also
 * panics if the frame is not within the thread's kernel stack.
 * NOTE(review): the KASSERT/panic strings say "_test_frame_exit" even
 * though this is the enter hook — looks like a copy/paste in the
 * original; left untouched here (runtime strings).
 */
1295 void _test_frame_enter(struct trapframe *frame);
1296 void _test_frame_exit(struct trapframe *frame);
1299 _test_frame_enter(struct trapframe *frame)
1301 thread_t td = curthread;
1303 if (ISPL(frame->tf_cs) == SEL_UPL) {
1304 KKASSERT(td->td_lwp);
1305 KASSERT(td->td_lwp->lwp_md.md_regs == frame,
1306 ("_test_frame_exit: Frame mismatch %p %p",
1307 td->td_lwp->lwp_md.md_regs, frame));
1308 td->td_lwp->lwp_saveusp = (void *)frame->tf_rsp;
1309 td->td_lwp->lwp_saveupc = (void *)frame->tf_rip;
1311 if ((char *)frame < td->td_kstack ||
1312 (char *)frame > td->td_kstack + td->td_kstack_size) {
1313 panic("_test_frame_exit: frame not on kstack %p kstack=%p",
1314 frame, td->td_kstack);
1319 _test_frame_exit(struct trapframe *frame)
1321 thread_t td = curthread;
1323 if (ISPL(frame->tf_cs) == SEL_UPL) {
1324 KKASSERT(td->td_lwp);
1325 KASSERT(td->td_lwp->lwp_md.md_regs == frame,
1326 ("_test_frame_exit: Frame mismatch %p %p",
1327 td->td_lwp->lwp_md.md_regs, frame));
1328 if (td->td_lwp->lwp_saveusp != (void *)frame->tf_rsp) {
1329 kprintf("_test_frame_exit: %s:%d usp mismatch %p/%p\n",
1330 td->td_comm, td->td_proc->p_pid,
1331 td->td_lwp->lwp_saveusp,
1332 (void *)frame->tf_rsp);
1334 if (td->td_lwp->lwp_saveupc != (void *)frame->tf_rip) {
1335 kprintf("_test_frame_exit: %s:%d upc mismatch %p/%p\n",
1336 td->td_comm, td->td_proc->p_pid,
1337 td->td_lwp->lwp_saveupc,
1338 (void *)frame->tf_rip);
1342 * adulterate the fields to catch entries that
1343 * don't run through test_frame_enter
1345 td->td_lwp->lwp_saveusp =
1346 (void *)~(intptr_t)td->td_lwp->lwp_saveusp;
1347 td->td_lwp->lwp_saveupc =
1348 (void *)~(intptr_t)td->td_lwp->lwp_saveupc;
1350 if ((char *)frame < td->td_kstack ||
1351 (char *)frame > td->td_kstack + td->td_kstack_size) {
1352 panic("_test_frame_exit: frame not on kstack %p kstack=%p",
1353 frame, td->td_kstack);