/*-
 * Copyright (c) 1982, 1986 The Regents of the University of California.
 * Copyright (c) 1989, 1990 William Jolitz
 * Copyright (c) 1994 John Dyson
 * Copyright (c) 2008-2018 The DragonFly Project.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department, and William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      from: @(#)vm_machdep.c  7.3 (Berkeley) 5/13/91
 *      Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
 * $FreeBSD: src/sys/i386/i386/vm_machdep.c,v 1.132.2.9 2003/01/25 19:02:23 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/interrupt.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/lwp.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/md_var.h>
#include <machine/smp.h>
#include <machine/pcb.h>
#include <machine/pcb_ext.h>
#include <machine/segments.h>
#include <machine/globaldata.h> /* npxthread */
#include <machine/specialreg.h>
#include <machine/vmm.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/mplock2.h>

#include <bus/isa/isa.h>

static void     cpu_reset_real(void);

static int spectre_mitigation = -1;
static int spectre_support = 0;

static int spectre_mode = 0;
SYSCTL_INT(_machdep, OID_AUTO, spectre_mode, CTLFLAG_RD,
        &spectre_mode, 0, "current Spectre enablements");

/*
 * Finish a fork operation, with lwp lp2 nearly set up.
 * Copy and update the pcb, and set up the stack so that the child is
 * ready to run and return to user mode.
 */
void
cpu_fork(struct lwp *lp1, struct lwp *lp2, int flags)
{
        struct pcb *pcb2;
        struct pmap *pmap2;

        if ((flags & RFPROC) == 0) {
                if ((flags & RFMEM) == 0) {
                        /*
                         * Unshare user LDT.  The > 1 test is MPSAFE.  While
                         * it can potentially race a 2->1 transition, the
                         * worst that happens is that we do an unnecessary
                         * ldt replacement.
                         */
                        struct pcb *pcb1 = lp1->lwp_thread->td_pcb;
                        struct pcb_ldt *pcb_ldt = pcb1->pcb_ldt;

                        if (pcb_ldt && pcb_ldt->ldt_refcnt > 1) {
                                pcb_ldt = user_ldt_alloc(pcb1,
                                                         pcb_ldt->ldt_len);
                                user_ldt_free(pcb1);
                                pcb1->pcb_ldt = pcb_ldt;
                                set_user_ldt(pcb1);
                        }
                }
                return;
        }

        /* Ensure that lp1's pcb is up to date. */
        if (mdcpu->gd_npxthread == lp1->lwp_thread)
                npxsave(lp1->lwp_thread->td_savefpu);

        /*
         * Copy lp1's PCB.  This really only applies to the
         * debug registers and FP state, but it's faster to just copy the
         * whole thing.  Because we only save the PCB at switchout time,
         * the register state may not be current.
         */
        pcb2 = lp2->lwp_thread->td_pcb;
        *pcb2 = *lp1->lwp_thread->td_pcb;

        /*
         * Create a new fresh stack for the new process.
         * Copy the trap frame for the return to user mode as if from a
         * syscall.  This copies the user mode register values.
         *
         * pcb_rsp must allocate an additional call-return pointer below
         * the trap frame which will be restored by cpu_heavy_restore from
         * PCB_RIP, and the thread's td_sp pointer must allocate an
         * additional two quadwords below the pcb_rsp call-return pointer
         * to hold the LWKT restore function pointer and rflags.
         *
         * The LWKT restore function pointer must be set to
         * cpu_heavy_restore, which is our standard heavy-weight process
         * switch-in function.
         * YYY eventually we should shortcut fork_return and fork_trampoline
         * to use the LWKT restore function directly so we can get rid of
         * all the extra crap we are setting up.
         */
        lp2->lwp_md.md_regs = (struct trapframe *)pcb2 - 1;
        bcopy(lp1->lwp_md.md_regs, lp2->lwp_md.md_regs,
              sizeof(*lp2->lwp_md.md_regs));

        /*
         * Set registers for trampoline to user mode.  Leave space for the
         * return address on stack.  These are the kernel mode register values.
         *
         * Set the new pmap CR3.  If the new process uses isolated VM spaces,
         * also set the isolated CR3.
         */
        pmap2 = vmspace_pmap(lp2->lwp_proc->p_vmspace);
        pcb2->pcb_cr3 = vtophys(pmap2->pm_pml4);
        if ((pcb2->pcb_flags & PCB_ISOMMU) && pmap2->pm_pmlpv_iso) {
                pcb2->pcb_cr3_iso = vtophys(pmap2->pm_pml4_iso);
        } else {
                pcb2->pcb_flags &= ~PCB_ISOMMU;
                pcb2->pcb_cr3_iso = 0;
        }

#if 0
        /*
         * Per-process spectre mitigation (future)
         */
        pcb2->pcb_flags &= ~(PCB_IBRS1 | PCB_IBRS2);
        switch (spectre_mitigation) {
        case 1:
                pcb2->pcb_flags |= PCB_IBRS1;
                break;
        case 2:
                pcb2->pcb_flags |= PCB_IBRS2;
                break;
        default:
                break;
        }
#endif

        pcb2->pcb_rbx = (unsigned long)fork_return;     /* fork_trampoline argument */
        pcb2->pcb_rbp = 0;
        pcb2->pcb_rsp = (unsigned long)lp2->lwp_md.md_regs - sizeof(void *);
        pcb2->pcb_r12 = (unsigned long)lp2;             /* fork_trampoline argument */
        pcb2->pcb_r13 = 0;
        pcb2->pcb_r14 = 0;
        pcb2->pcb_r15 = 0;
        pcb2->pcb_rip = (unsigned long)fork_trampoline;
        lp2->lwp_thread->td_sp = (char *)(pcb2->pcb_rsp - sizeof(void *));
        *(u_int64_t *)lp2->lwp_thread->td_sp = PSL_USER;
        lp2->lwp_thread->td_sp -= sizeof(void *);
        *(void **)lp2->lwp_thread->td_sp = (void *)cpu_heavy_restore;

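        /*
         * Illustrative sketch (not authoritative) of the kernel stack
         * image constructed above, highest address first, as implied by
         * the assignments:
         *
         *      pcb2                    top of the new thread's pcb
         *      trapframe               lp2->lwp_md.md_regs (user registers)
         *      call-return slot        <- pcb_rsp, consumed via PCB_RIP
         *                                 by cpu_heavy_restore
         *      rflags image            PSL_USER
         *      restore function ptr    <- td_sp, points at cpu_heavy_restore
         */
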
        /*
         * pcb2->pcb_ldt:       duplicated below, if necessary.
         * pcb2->pcb_savefpu:   cloned above.
         * pcb2->pcb_flags:     cloned above.
         * pcb2->pcb_onfault:   cloned above (always NULL here).
         * pcb2->pcb_onfault_sp: cloned above (don't care).
         */

        /*
         * XXX don't copy the i/o pages.  This should probably be fixed.
         */
        pcb2->pcb_ext = NULL;

        /* Copy the LDT, if necessary. */
        if (pcb2->pcb_ldt != NULL) {
                if (flags & RFMEM) {
                        atomic_add_int(&pcb2->pcb_ldt->ldt_refcnt, 1);
                } else {
                        pcb2->pcb_ldt = user_ldt_alloc(pcb2,
                                                       pcb2->pcb_ldt->ldt_len);
                }
        }
        bcopy(&lp1->lwp_thread->td_tls, &lp2->lwp_thread->td_tls,
              sizeof(lp2->lwp_thread->td_tls));

        /*
         * Now, cpu_switch() can schedule the new lwp.
         * pcb_rsp is loaded pointing to the cpu_switch() stack frame
         * containing the return address when exiting cpu_switch.
         * This will normally be to fork_trampoline(), which will have
         * %rbx loaded with the new lwp's pointer.  fork_trampoline()
         * will set up a stack to call fork_return(lp, frame); to complete
         * the return to user-mode.
         */
}

/*
 * Prepare new lwp to return to the address specified in params.
 */
int
cpu_prepare_lwp(struct lwp *lp, struct lwp_params *params)
{
        struct trapframe *regs = lp->lwp_md.md_regs;
        void *bad_return = NULL;
        int error;

        regs->tf_rip = (long)params->lwp_func;
        regs->tf_rsp = (long)params->lwp_stack;
        /* Set up argument for function call */
        regs->tf_rdi = (long)params->lwp_arg;

        /*
         * Set up a fake return address.  As the lwp function may never
         * return, we simply copy out a NULL pointer and force the lwp to
         * receive a SIGSEGV if it returns anyway.
         */
        regs->tf_rsp -= sizeof(void *);
        error = copyout(&bad_return, (void *)regs->tf_rsp, sizeof(bad_return));
        if (error)
                return (error);

        if (lp->lwp_proc->p_vmm) {
                lp->lwp_thread->td_pcb->pcb_cr3 = KPML4phys;
                cpu_set_fork_handler(lp,
                    (void (*)(void *, struct trapframe *))vmm_lwp_return, lp);
        } else {
                cpu_set_fork_handler(lp,
                    (void (*)(void *, struct trapframe *))generic_lwp_return, lp);
        }
        return (0);
}

/*
 * Intercept the return address from a freshly forked process that has NOT
 * been scheduled yet.
 *
 * This is needed to make kernel threads stay in kernel mode.
 */
void
cpu_set_fork_handler(struct lwp *lp, void (*func)(void *, struct trapframe *),
                     void *arg)
{
        /*
         * Note that the trap frame follows the args, so the function
         * is really called like this:  func(arg, frame);
         */
        lp->lwp_thread->td_pcb->pcb_rbx = (long)func;   /* function */
        lp->lwp_thread->td_pcb->pcb_r12 = (long)arg;    /* first arg */
}

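/*
 * Set up the switch-in state for a pure LWKT kernel thread.  A rough
 * sketch of the resulting td_sp image, inferred from the stores below
 * (illustration only, not a normative layout):
 *
 *      td_sp+8:  rfunc                 exit function, "returned" into
 *                                      when func finishes
 *      td_sp:    cpu_kthread_restore   first switch-in target, which
 *                                      presumably dispatches func (%rbx)
 *                                      with arg (%r12)
 */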
void
cpu_set_thread_handler(thread_t td, void (*rfunc)(void), void *func, void *arg)
{
        td->td_pcb->pcb_rbx = (long)func;
        td->td_pcb->pcb_r12 = (long)arg;
        td->td_switch = cpu_lwkt_switch;
        td->td_sp -= sizeof(void *);
        *(void **)td->td_sp = rfunc;    /* exit function on return */
        td->td_sp -= sizeof(void *);
        *(void **)td->td_sp = cpu_kthread_restore;
}

void
cpu_lwp_exit(void)
{
        struct thread *td = curthread;
        struct pcb *pcb;

        pcb = td->td_pcb;

        /* Some x86 functionality was dropped */
        KKASSERT(pcb->pcb_ext == NULL);

        /*
         * Disable all hardware breakpoints.
         */
        if (pcb->pcb_flags & PCB_DBREGS) {
                reset_dbregs();
                pcb->pcb_flags &= ~PCB_DBREGS;
        }
        td->td_gd->gd_cnt.v_swtch++;

        crit_enter_quick(td);
        if (td->td_flags & TDF_TSLEEPQ)
                tsleep_remove(td);
        lwkt_deschedule_self(td);
        lwkt_remove_tdallq(td);
        cpu_thread_exit();
}

/*
 * Terminate the current thread.  The caller must have already acquired
 * the thread's rwlock and placed it on a reap list or otherwise notified
 * a reaper of its existence.  We set a special assembly switch function
 * which releases td_rwlock after it has cleaned up the MMU state and
 * switched out the stack.
 *
 * Must be called from a critical section with the thread descheduled.
 */
void
cpu_thread_exit(void)
{
        npxexit();
        curthread->td_switch = cpu_exit_switch;
        curthread->td_flags |= TDF_EXITING;
        lwkt_switch();
        panic("cpu_thread_exit: lwkt_switch() unexpectedly returned");
}

void
cpu_reset(void)
{
        cpu_reset_real();
}

static void
cpu_reset_real(void)
{
        /*
         * Attempt to do a CPU reset via the keyboard controller.  Do not
         * turn off GateA20, as any machine that fails to do the reset
         * here would then end up in no man's land.
         */

#if !defined(BROKEN_KEYBOARD_RESET)
        outb(IO_KBD + 4, 0xFE);
        DELAY(500000);  /* wait 0.5 sec to see if that did it */
        kprintf("Keyboard reset did not work, attempting CPU shutdown\n");
        DELAY(1000000); /* wait 1 sec for kprintf to complete */
#endif
#if 0 /* JG */
        /* force a shutdown by unmapping entire address space ! */
        bzero((caddr_t)PTD, PAGE_SIZE);
#endif

        /* "good night, sweet prince .... <THUNK!>" */
        cpu_invltlb();
        /* NOTREACHED */
        while (1)
                ;
}

/*
 * Convert kernel VA to physical address
 */
vm_paddr_t
kvtop(void *addr)
{
        vm_paddr_t pa;

        pa = pmap_kextract((vm_offset_t)addr);
        if (pa == 0)
                panic("kvtop: zero page frame");
        return (pa);
}

static void
swi_vm(void *arg, void *frame)
{
        if (busdma_swi_pending != 0)
                busdma_swi();
}

static void
swi_vm_setup(void *arg)
{
        register_swi_mp(SWI_VM, swi_vm, NULL, "swi_vm", NULL, 0);
}

SYSINIT(swi_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY, swi_vm_setup, NULL);

/*
 * NOTE: This routine is also called after a successful microcode
 *       reload on cpu 0.
 */
void spectre_vm_setup(void *arg);

/*
 * Check for IBPB and IBRS support
 *
 * These bits also specify desired modes in the spectre_mitigation sysctl.
 */
#define IBRS_SUPPORTED          0x0001
#define STIBP_SUPPORTED         0x0002
#define IBPB_SUPPORTED          0x0004
#define IBRS_AUTO_SUPPORTED     0x0008
#define STIBP_AUTO_SUPPORTED    0x0010
#define IBRS_PREFERRED_REQUEST  0x0020

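/*
 * Illustrative loader.conf usage (assumed syntax, not taken from this
 * file): the boot-time tunable fetched in spectre_vm_setup() takes the
 * raw bit mask above, e.g. IBRS_AUTO_SUPPORTED (0x08) plus
 * IBPB_SUPPORTED (0x04):
 *
 *      machdep.spectre_mitigation="12"
 */
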
static
int
spectre_check_support(void)
{
        uint32_t p[4];
        int rv = 0;

        /*
         * Spectre mitigation hw bits
         *
         * IBRS         Indirect Branch Restricted Speculation   (isolation)
         * STIBP        Single Thread Indirect Branch Prediction (isolation)
         * IBPB         Branch Prediction Barrier                (barrier)
         *
         * IBRS and STIBP must be toggled (enabled on entry to the kernel,
         * disabled on exit, as well as disabled during any MWAIT/HLT).
         * When the *_AUTO bits are available, IBRS and STIBP may be left
         * turned on and do not have to be toggled on kernel entry/exit.
         *
         * All this shit has enormous overhead.  IBPB in particular, and
         * the non-auto modes, are disabled by default.
         */
        if (cpu_vendor_id == CPU_VENDOR_INTEL) {
                p[0] = 0;
                p[1] = 0;
                p[2] = 0;
                p[3] = 0;
                cpuid_count(7, 0, p);
                if (p[3] & CPUID_7_0_I3_SPEC_CTRL)
                        rv |= IBRS_SUPPORTED | IBPB_SUPPORTED;
                if (p[3] & CPUID_7_0_I3_STIBP)
                        rv |= STIBP_SUPPORTED;

                /*
                 * 0x80000008 p[1] bit 12 indicates IBPB support
                 *
                 * This bit might be set even though SPEC_CTRL is not set.
                 */
                p[0] = 0;
                p[1] = 0;
                p[2] = 0;
                p[3] = 0;
                do_cpuid(0x80000008U, p);
                if (p[1] & CPUID_INTEL_80000008_I1_IBPB_SUPPORT)
                        rv |= IBPB_SUPPORTED;
        } else if (cpu_vendor_id == CPU_VENDOR_AMD) {
                /*
                 * 0x80000008 p[1] bit 12 indicates IBPB support
                 *            p[1] bit 14 indicates IBRS support
                 *            p[1] bit 15 indicates STIBP support
                 *
                 *            p[1] bit 16 indicates IBRS auto support
                 *            p[1] bit 17 indicates STIBP auto support
                 *            p[1] bit 18 indicates processor prefers using
                 *              IBRS instead of retpoline.
                 */
                p[0] = 0;
                p[1] = 0;
                p[2] = 0;
                p[3] = 0;
                do_cpuid(0x80000008U, p);
                if (p[1] & CPUID_AMD_80000008_I1_IBPB_SUPPORT)
                        rv |= IBPB_SUPPORTED;
                if (p[1] & CPUID_AMD_80000008_I1_IBRS_SUPPORT)
                        rv |= IBRS_SUPPORTED;
                if (p[1] & CPUID_AMD_80000008_I1_STIBP_SUPPORT)
                        rv |= STIBP_SUPPORTED;

                if (p[1] & CPUID_AMD_80000008_I1_IBRS_AUTO)
                        rv |= IBRS_AUTO_SUPPORTED;
                if (p[1] & CPUID_AMD_80000008_I1_STIBP_AUTO)
                        rv |= STIBP_AUTO_SUPPORTED;
                if (p[1] & CPUID_AMD_80000008_I1_IBRS_REQUESTED)
                        rv |= IBRS_PREFERRED_REQUEST;
        }

        return rv;
}

/*
 * Iterate CPUs and adjust MSR for global operations, since
 * the KMMU* code won't do it if spectre_mitigation is 0 or 2.
 */
#define CHECK(flag)     (spectre_mitigation & spectre_support & (flag))
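
/*
 * For example, CHECK(IBRS_SUPPORTED) is non-zero only when IBRS is both
 * requested via spectre_mitigation and reported by the hardware in
 * spectre_support.
 */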

static
void
spectre_sysctl_changed(void)
{
        globaldata_t save_gd;
        struct trampframe *tr;
        int spec_ctrl;
        int mode;
        int n;

        /*
         * Fixup state
         */
        mode = 0;
        save_gd = mycpu;
        for (n = 0; n < ncpus; ++n) {
                lwkt_setcpu_self(globaldata_find(n));
                cpu_ccfence();
                tr = &pscpu->trampoline;

                /*
                 * Make sure we are cleaned out.
                 *
                 * XXX cleanup, reusing globals inside the loop (they get
                 * set to the same thing each loop)
                 */
                tr->tr_pcb_spec_ctrl[0] = 0;    /* kernel entry (idle exit) */
                tr->tr_pcb_spec_ctrl[1] = 0;    /* kernel exit  (idle entry) */

                /*
                 * Don't try to parse if not available
                 */
                if (spectre_mitigation < 0)
                        continue;

                /*
                 * IBRS mode.  Auto overrides toggling.
                 *
                 * Only set the ENABLE flag if we have to toggle something
                 * on entry and exit.
                 */
                spec_ctrl = 0;
                if (CHECK(IBRS_AUTO_SUPPORTED)) {
                        spec_ctrl |= SPEC_CTRL_IBRS;
                        mode |= IBRS_AUTO_SUPPORTED;
                } else if (CHECK(IBRS_SUPPORTED)) {
                        spec_ctrl |= SPEC_CTRL_IBRS | SPEC_CTRL_DUMMY_ENABLE;
                        mode |= IBRS_SUPPORTED;
                }
                if (CHECK(STIBP_AUTO_SUPPORTED)) {
                        spec_ctrl |= SPEC_CTRL_STIBP;
                        mode |= STIBP_AUTO_SUPPORTED;
                } else if (CHECK(STIBP_SUPPORTED)) {
                        spec_ctrl |= SPEC_CTRL_STIBP | SPEC_CTRL_DUMMY_ENABLE;
                        mode |= STIBP_SUPPORTED;
                }

                /*
                 * IBPB requested and supported.
                 */
                if (CHECK(IBPB_SUPPORTED)) {
                        spec_ctrl |= SPEC_CTRL_DUMMY_IBPB;
                        mode |= IBPB_SUPPORTED;
                }

                /*
                 * Update the MSR if the cpu supports the modes to ensure
                 * proper disablement if the user disabled the mode.
                 */
                if (spectre_support & (IBRS_SUPPORTED | IBRS_AUTO_SUPPORTED |
                                    STIBP_SUPPORTED | STIBP_AUTO_SUPPORTED)) {
                        wrmsr(MSR_SPEC_CTRL,
                              spec_ctrl & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
                }

                /*
                 * Update spec_ctrl fields in the trampoline.
                 *
                 * [0] on-kernel-entry (on-idle-exit)
                 * [1] on-kernel-exit  (on-idle-entry)
                 *
                 * When auto mode is supported we leave the bit set, otherwise
                 * we clear the bits.
                 */
                tr->tr_pcb_spec_ctrl[0] = spec_ctrl;
                if (CHECK(IBRS_AUTO_SUPPORTED) == 0)
                        spec_ctrl &= ~SPEC_CTRL_IBRS;
                if (CHECK(STIBP_AUTO_SUPPORTED) == 0)
                        spec_ctrl &= ~SPEC_CTRL_STIBP;
                tr->tr_pcb_spec_ctrl[1] = spec_ctrl;

                /*
                 * Make sure we set this on the first loop.  It will be
                 * the same value on remaining loops.
                 */
                spectre_mode = mode;
        }
        lwkt_setcpu_self(save_gd);
        cpu_ccfence();

        /*
         * Console message on mitigation mode change
         */
        kprintf("Spectre: support=(");
        if (spectre_support == 0) {
                kprintf(" none");
        } else {
                if (spectre_support & IBRS_SUPPORTED)
                        kprintf(" IBRS");
                if (spectre_support & STIBP_SUPPORTED)
                        kprintf(" STIBP");
                if (spectre_support & IBPB_SUPPORTED)
                        kprintf(" IBPB");
                if (spectre_support & IBRS_AUTO_SUPPORTED)
                        kprintf(" IBRS_AUTO");
                if (spectre_support & STIBP_AUTO_SUPPORTED)
                        kprintf(" STIBP_AUTO");
                if (spectre_support & IBRS_PREFERRED_REQUEST)
                        kprintf(" IBRS_REQUESTED");
        }
        kprintf(" ) req=%04x operating=(", (uint16_t)spectre_mitigation);
        if (spectre_mode == 0) {
                kprintf(" none");
        } else {
                if (spectre_mode & IBRS_SUPPORTED)
                        kprintf(" IBRS");
                if (spectre_mode & STIBP_SUPPORTED)
                        kprintf(" STIBP");
                if (spectre_mode & IBPB_SUPPORTED)
                        kprintf(" IBPB");
                if (spectre_mode & IBRS_AUTO_SUPPORTED)
                        kprintf(" IBRS_AUTO");
                if (spectre_mode & STIBP_AUTO_SUPPORTED)
                        kprintf(" STIBP_AUTO");
                if (spectre_mode & IBRS_PREFERRED_REQUEST)
                        kprintf(" IBRS_REQUESTED");
        }
        kprintf(" )\n");
}
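
/*
 * Illustrative console output from the above (hypothetical machine;
 * printed as a single line, wrapped here for the comment):
 *
 *      Spectre: support=( IBRS STIBP IBPB IBRS_AUTO ) req=000c
 *          operating=( IBRS_AUTO IBPB )
 */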

/*
 * User changes sysctl value
 */
static int
sysctl_spectre_mitigation(SYSCTL_HANDLER_ARGS)
{
        char buf[128];
        char *ptr;
        char *iter;
        size_t len;
        int spectre;
        int error = 0;
        int loop = 0;

        /*
         * Return current operating mode or support.
         */
        if (oidp->oid_kind & CTLFLAG_WR)
                spectre = spectre_mode;
        else
                spectre = spectre_support;

        spectre &= (IBRS_SUPPORTED | IBRS_AUTO_SUPPORTED |
                    STIBP_SUPPORTED | STIBP_AUTO_SUPPORTED |
                    IBPB_SUPPORTED);
        while (spectre) {
                if (error)
                        break;
                if (loop++) {
                        error = SYSCTL_OUT(req, " ", 1);
                        if (error)
                                break;
                }
                if (spectre & IBRS_SUPPORTED) {
                        spectre &= ~IBRS_SUPPORTED;
                        error = SYSCTL_OUT(req, "IBRS", 4);
                } else
                if (spectre & IBRS_AUTO_SUPPORTED) {
                        spectre &= ~IBRS_AUTO_SUPPORTED;
                        error = SYSCTL_OUT(req, "IBRS_AUTO", 9);
                } else
                if (spectre & STIBP_SUPPORTED) {
                        spectre &= ~STIBP_SUPPORTED;
                        error = SYSCTL_OUT(req, "STIBP", 5);
                } else
                if (spectre & STIBP_AUTO_SUPPORTED) {
                        spectre &= ~STIBP_AUTO_SUPPORTED;
                        error = SYSCTL_OUT(req, "STIBP_AUTO", 10);
                } else
                if (spectre & IBPB_SUPPORTED) {
                        spectre &= ~IBPB_SUPPORTED;
                        error = SYSCTL_OUT(req, "IBPB", 4);
                }
        }
        if (loop == 0) {
                error = SYSCTL_OUT(req, "NONE", 4);
        }

        if (error || req->newptr == NULL)
                return error;
        if ((oidp->oid_kind & CTLFLAG_WR) == 0)
                return error;

        /*
         * Change current operating mode
         */
        len = req->newlen - req->newidx;
        if (len >= sizeof(buf)) {
                error = EINVAL;
                len = 0;
        } else {
                error = SYSCTL_IN(req, buf, len);
        }
        buf[len] = 0;
        iter = &buf[0];
        spectre = 0;

        while (error == 0 && iter) {
                ptr = strsep(&iter, " ,\t\r\n");
                if (*ptr == 0)
                        continue;
                if (strcasecmp(ptr, "NONE") == 0)
                        spectre |= 0;
                else if (strcasecmp(ptr, "IBRS") == 0)
                        spectre |= IBRS_SUPPORTED;
                else if (strcasecmp(ptr, "IBRS_AUTO") == 0)
                        spectre |= IBRS_AUTO_SUPPORTED;
                else if (strcasecmp(ptr, "STIBP") == 0)
                        spectre |= STIBP_SUPPORTED;
                else if (strcasecmp(ptr, "STIBP_AUTO") == 0)
                        spectre |= STIBP_AUTO_SUPPORTED;
                else if (strcasecmp(ptr, "IBPB") == 0)
                        spectre |= IBPB_SUPPORTED;
                else
                        error = ENOENT;
        }
        if (error == 0) {
                spectre_mitigation = spectre;
                spectre_sysctl_changed();
        }
        return error;
}

SYSCTL_PROC(_machdep, OID_AUTO, spectre_mitigation,
        CTLTYPE_STRING | CTLFLAG_RW,
        0, 0, sysctl_spectre_mitigation, "A", "Spectre exploit mitigation");
SYSCTL_PROC(_machdep, OID_AUTO, spectre_support,
        CTLTYPE_STRING | CTLFLAG_RD,
        0, 0, sysctl_spectre_mitigation, "A", "Spectre supported features");
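
/*
 * Illustrative run-time interaction with the two nodes above
 * (hypothetical machine; tokens follow the handler's output format):
 *
 *      $ sysctl machdep.spectre_support
 *      machdep.spectre_support: IBRS STIBP IBPB IBRS_AUTO
 *      $ sysctl machdep.spectre_mitigation="IBRS_AUTO IBPB"
 */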

/*
 * NOTE: Called at SI_BOOT2_MACHDEP and also when the microcode is
 *       updated.  Microcode updates must be applied to all cpus
 *       for support to be recognized.
 */
void
spectre_vm_setup(void *arg)
{
        int inconsistent = 0;
        int supmask;

        /*
         * Fetch tunable in auto mode
         */
        if (spectre_mitigation < 0) {
                TUNABLE_INT_FETCH("machdep.spectre_mitigation",
                                  &spectre_mitigation);
        }

        if ((supmask = spectre_check_support()) != 0) {
                /*
                 * Must be supported on all cpus before we
                 * can enable it.  Returns silently if it
                 * isn't.
                 *
                 * NOTE! arg != NULL indicates we were called
                 *       from cpuctl after a successful microcode
                 *       update.
                 */
                if (arg != NULL) {
                        globaldata_t save_gd;
                        int n;

                        save_gd = mycpu;
                        for (n = 0; n < ncpus; ++n) {
                                lwkt_setcpu_self(globaldata_find(n));
                                cpu_ccfence();
                                if (spectre_check_support() != supmask) {
                                        inconsistent = 1;
                                        break;
                                }
                        }
                        lwkt_setcpu_self(save_gd);
                        cpu_ccfence();
                }
        }

        /*
         * Be silent while microcode is being loaded on various CPUs,
         * until all done.
         */
        if (inconsistent) {
                spectre_mitigation = -1;
                spectre_support = 0;
                return;
        }

        /*
         * IBRS support
         */
        spectre_support = supmask;

        /*
         * Enable spectre_mitigation: set defaults if the tunable is -1,
         * otherwise trim the tuned value down to what is actually
         * supported.
         *
         * NOTE!  We do not enable IBPB for user->kernel transitions
         *        by default, so the corresponding cases below are
         *        intentionally no-ops (|= 0).
         */
        if (spectre_support) {
                if (spectre_mitigation < 0) {
                        spectre_mitigation = 0;

                        /*
                         * IBRS toggling not currently recommended as a
                         * default.
                         */
                        if (spectre_support & IBRS_AUTO_SUPPORTED)
                                spectre_mitigation |= IBRS_AUTO_SUPPORTED;
                        else if (spectre_support & IBRS_SUPPORTED)
                                spectre_mitigation |= 0;

                        /*
                         * STIBP toggling not currently recommended as a
                         * default.
                         */
                        if (spectre_support & STIBP_AUTO_SUPPORTED)
                                spectre_mitigation |= STIBP_AUTO_SUPPORTED;
                        else if (spectre_support & STIBP_SUPPORTED)
                                spectre_mitigation |= 0;

                        /*
                         * IBPB adds enormous (~2uS) overhead to system
                         * calls etc, so we do not enable it by default.
                         */
                        if (spectre_support & IBPB_SUPPORTED)
                                spectre_mitigation |= 0;
                }
        } else {
                spectre_mitigation = -1;
        }

        /*
         * Disallow sysctl changes when there is no support (otherwise
         * the wrmsr will cause a protection fault).
         */
        if (spectre_mitigation < 0)
                sysctl___machdep_spectre_mitigation.oid_kind &= ~CTLFLAG_WR;
        else
                sysctl___machdep_spectre_mitigation.oid_kind |= CTLFLAG_WR;

        spectre_sysctl_changed();
}

SYSINIT(spectre_vm_setup, SI_BOOT2_MACHDEP, SI_ORDER_ANY,
        spectre_vm_setup, NULL);

/*
 * platform-specific vmspace initialization (nothing for x86_64)
 */
void
cpu_vmspace_alloc(struct vmspace *vm __unused)
{
}

void
cpu_vmspace_free(struct vmspace *vm __unused)
{
}

int
kvm_access_check(vm_offset_t saddr, vm_offset_t eaddr, int prot)
{
        vm_offset_t addr;

        if (saddr < KvaStart)
                return EFAULT;
        if (eaddr >= KvaEnd)
                return EFAULT;
        for (addr = saddr; addr < eaddr; addr += PAGE_SIZE) {
                if (pmap_kextract(addr) == 0)
                        return EFAULT;
        }
        if (!kernacc((caddr_t)saddr, eaddr - saddr, prot))
                return EFAULT;
        return 0;
}
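
/*
 * Hypothetical caller sketch (illustrative only; assumes VM_PROT_READ
 * is an acceptable prot value to hand through to kernacc()):
 *
 *      if (kvm_access_check(va, va + len, VM_PROT_READ) == 0)
 *              ...every page in [va, va + len) is mapped...
 */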

#if 0

void _test_frame_enter(struct trapframe *frame);
void _test_frame_exit(struct trapframe *frame);

void
_test_frame_enter(struct trapframe *frame)
{
        thread_t td = curthread;

        if (ISPL(frame->tf_cs) == SEL_UPL) {
                KKASSERT(td->td_lwp);
                KASSERT(td->td_lwp->lwp_md.md_regs == frame,
                        ("_test_frame_enter: Frame mismatch %p %p",
                        td->td_lwp->lwp_md.md_regs, frame));
                td->td_lwp->lwp_saveusp = (void *)frame->tf_rsp;
                td->td_lwp->lwp_saveupc = (void *)frame->tf_rip;
        }
        if ((char *)frame < td->td_kstack ||
            (char *)frame > td->td_kstack + td->td_kstack_size) {
                panic("_test_frame_enter: frame not on kstack %p kstack=%p",
                        frame, td->td_kstack);
        }
}

void
_test_frame_exit(struct trapframe *frame)
{
        thread_t td = curthread;

        if (ISPL(frame->tf_cs) == SEL_UPL) {
                KKASSERT(td->td_lwp);
                KASSERT(td->td_lwp->lwp_md.md_regs == frame,
                        ("_test_frame_exit: Frame mismatch %p %p",
                        td->td_lwp->lwp_md.md_regs, frame));
                if (td->td_lwp->lwp_saveusp != (void *)frame->tf_rsp) {
                        kprintf("_test_frame_exit: %s:%d usp mismatch %p/%p\n",
                                td->td_comm, td->td_proc->p_pid,
                                td->td_lwp->lwp_saveusp,
                                (void *)frame->tf_rsp);
                }
                if (td->td_lwp->lwp_saveupc != (void *)frame->tf_rip) {
                        kprintf("_test_frame_exit: %s:%d upc mismatch %p/%p\n",
                                td->td_comm, td->td_proc->p_pid,
                                td->td_lwp->lwp_saveupc,
                                (void *)frame->tf_rip);
                }

                /*
                 * Adulterate the fields to catch entries that
                 * don't run through _test_frame_enter.
                 */
                td->td_lwp->lwp_saveusp =
                        (void *)~(intptr_t)td->td_lwp->lwp_saveusp;
                td->td_lwp->lwp_saveupc =
                        (void *)~(intptr_t)td->td_lwp->lwp_saveupc;
        }
        if ((char *)frame < td->td_kstack ||
            (char *)frame > td->td_kstack + td->td_kstack_size) {
                panic("_test_frame_exit: frame not on kstack %p kstack=%p",
                        frame, td->td_kstack);
        }
}

#endif