nvmm: Rename a few things for clarity
[dragonfly.git] / sys / dev / virtual / nvmm / x86 / nvmm_x86_svm.c
1 /*      $NetBSD: nvmm_x86_svm.c,v 1.83 2021/03/26 15:59:53 reinoud Exp $        */
2
3 /*
4  * Copyright (c) 2018-2020 Maxime Villard, m00nbsd.net
5  * All rights reserved.
6  *
7  * This code is part of the NVMM hypervisor.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
19  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
20  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
21  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
22  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
23  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
25  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30
31 #include <sys/param.h>
32 #include <sys/systm.h>
33
34 #include <sys/bitops.h>
35 #include <sys/globaldata.h>
36 #include <sys/kernel.h>
37 #include <sys/malloc.h> /* contigmalloc, contigfree */
38 #include <sys/thread2.h> /* lwkt_send_ipiq, lwkt_send_ipiq_mask */
39
40 #include <vm/pmap.h> /* pmap_npt_transform() */
41 #include <vm/vm_map.h>
42
43 #include <machine/cputypes.h> /* CPU_VENDOR_* */
44 #include <machine/md_var.h> /* cpu_*, amd_feature2 */
45 #include <machine/pmap_inval.h> /* pmap_inval_smp() */
46 #include <machine/specialreg.h>
47
48 #include <dev/virtual/nvmm/nvmm_compat.h>
49 #include <dev/virtual/nvmm/nvmm.h>
50 #include <dev/virtual/nvmm/nvmm_internal.h>
51 #include <dev/virtual/nvmm/x86/nvmm_x86.h>
52
53 void svm_vmrun(paddr_t, uint64_t *);
54
55 static inline void
56 svm_clgi(void)
57 {
58         asm volatile ("clgi" ::: "memory");
59 }
60
61 static inline void
62 svm_stgi(void)
63 {
64         asm volatile ("stgi" ::: "memory");
65 }
66
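/*
 * CLGI/STGI clear and set the Global Interrupt Flag (GIF). While GIF is
 * clear, physical interrupts, NMIs and SMIs are held pending on the host
 * CPU, so the world switch into the guest cannot be interrupted.
 *
 * MSR_VM_HSAVE_PA holds the physical address of the per-CPU host save
 * area used by VMRUN/#VMEXIT (see hsave[] below).
 */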
67 #define MSR_VM_HSAVE_PA 0xC0010117
68
69 /* -------------------------------------------------------------------------- */
70
71 #define VMCB_EXITCODE_CR0_READ          0x0000
72 #define VMCB_EXITCODE_CR1_READ          0x0001
73 #define VMCB_EXITCODE_CR2_READ          0x0002
74 #define VMCB_EXITCODE_CR3_READ          0x0003
75 #define VMCB_EXITCODE_CR4_READ          0x0004
76 #define VMCB_EXITCODE_CR5_READ          0x0005
77 #define VMCB_EXITCODE_CR6_READ          0x0006
78 #define VMCB_EXITCODE_CR7_READ          0x0007
79 #define VMCB_EXITCODE_CR8_READ          0x0008
80 #define VMCB_EXITCODE_CR9_READ          0x0009
81 #define VMCB_EXITCODE_CR10_READ         0x000A
82 #define VMCB_EXITCODE_CR11_READ         0x000B
83 #define VMCB_EXITCODE_CR12_READ         0x000C
84 #define VMCB_EXITCODE_CR13_READ         0x000D
85 #define VMCB_EXITCODE_CR14_READ         0x000E
86 #define VMCB_EXITCODE_CR15_READ         0x000F
87 #define VMCB_EXITCODE_CR0_WRITE         0x0010
88 #define VMCB_EXITCODE_CR1_WRITE         0x0011
89 #define VMCB_EXITCODE_CR2_WRITE         0x0012
90 #define VMCB_EXITCODE_CR3_WRITE         0x0013
91 #define VMCB_EXITCODE_CR4_WRITE         0x0014
92 #define VMCB_EXITCODE_CR5_WRITE         0x0015
93 #define VMCB_EXITCODE_CR6_WRITE         0x0016
94 #define VMCB_EXITCODE_CR7_WRITE         0x0017
95 #define VMCB_EXITCODE_CR8_WRITE         0x0018
96 #define VMCB_EXITCODE_CR9_WRITE         0x0019
97 #define VMCB_EXITCODE_CR10_WRITE        0x001A
98 #define VMCB_EXITCODE_CR11_WRITE        0x001B
99 #define VMCB_EXITCODE_CR12_WRITE        0x001C
100 #define VMCB_EXITCODE_CR13_WRITE        0x001D
101 #define VMCB_EXITCODE_CR14_WRITE        0x001E
102 #define VMCB_EXITCODE_CR15_WRITE        0x001F
103 #define VMCB_EXITCODE_DR0_READ          0x0020
104 #define VMCB_EXITCODE_DR1_READ          0x0021
105 #define VMCB_EXITCODE_DR2_READ          0x0022
106 #define VMCB_EXITCODE_DR3_READ          0x0023
107 #define VMCB_EXITCODE_DR4_READ          0x0024
108 #define VMCB_EXITCODE_DR5_READ          0x0025
109 #define VMCB_EXITCODE_DR6_READ          0x0026
110 #define VMCB_EXITCODE_DR7_READ          0x0027
111 #define VMCB_EXITCODE_DR8_READ          0x0028
112 #define VMCB_EXITCODE_DR9_READ          0x0029
113 #define VMCB_EXITCODE_DR10_READ         0x002A
114 #define VMCB_EXITCODE_DR11_READ         0x002B
115 #define VMCB_EXITCODE_DR12_READ         0x002C
116 #define VMCB_EXITCODE_DR13_READ         0x002D
117 #define VMCB_EXITCODE_DR14_READ         0x002E
118 #define VMCB_EXITCODE_DR15_READ         0x002F
119 #define VMCB_EXITCODE_DR0_WRITE         0x0030
120 #define VMCB_EXITCODE_DR1_WRITE         0x0031
121 #define VMCB_EXITCODE_DR2_WRITE         0x0032
122 #define VMCB_EXITCODE_DR3_WRITE         0x0033
123 #define VMCB_EXITCODE_DR4_WRITE         0x0034
124 #define VMCB_EXITCODE_DR5_WRITE         0x0035
125 #define VMCB_EXITCODE_DR6_WRITE         0x0036
126 #define VMCB_EXITCODE_DR7_WRITE         0x0037
127 #define VMCB_EXITCODE_DR8_WRITE         0x0038
128 #define VMCB_EXITCODE_DR9_WRITE         0x0039
129 #define VMCB_EXITCODE_DR10_WRITE        0x003A
130 #define VMCB_EXITCODE_DR11_WRITE        0x003B
131 #define VMCB_EXITCODE_DR12_WRITE        0x003C
132 #define VMCB_EXITCODE_DR13_WRITE        0x003D
133 #define VMCB_EXITCODE_DR14_WRITE        0x003E
134 #define VMCB_EXITCODE_DR15_WRITE        0x003F
135 #define VMCB_EXITCODE_EXCP0             0x0040
136 #define VMCB_EXITCODE_EXCP1             0x0041
137 #define VMCB_EXITCODE_EXCP2             0x0042
138 #define VMCB_EXITCODE_EXCP3             0x0043
139 #define VMCB_EXITCODE_EXCP4             0x0044
140 #define VMCB_EXITCODE_EXCP5             0x0045
141 #define VMCB_EXITCODE_EXCP6             0x0046
142 #define VMCB_EXITCODE_EXCP7             0x0047
143 #define VMCB_EXITCODE_EXCP8             0x0048
144 #define VMCB_EXITCODE_EXCP9             0x0049
145 #define VMCB_EXITCODE_EXCP10            0x004A
146 #define VMCB_EXITCODE_EXCP11            0x004B
147 #define VMCB_EXITCODE_EXCP12            0x004C
148 #define VMCB_EXITCODE_EXCP13            0x004D
149 #define VMCB_EXITCODE_EXCP14            0x004E
150 #define VMCB_EXITCODE_EXCP15            0x004F
151 #define VMCB_EXITCODE_EXCP16            0x0050
152 #define VMCB_EXITCODE_EXCP17            0x0051
153 #define VMCB_EXITCODE_EXCP18            0x0052
154 #define VMCB_EXITCODE_EXCP19            0x0053
155 #define VMCB_EXITCODE_EXCP20            0x0054
156 #define VMCB_EXITCODE_EXCP21            0x0055
157 #define VMCB_EXITCODE_EXCP22            0x0056
158 #define VMCB_EXITCODE_EXCP23            0x0057
159 #define VMCB_EXITCODE_EXCP24            0x0058
160 #define VMCB_EXITCODE_EXCP25            0x0059
161 #define VMCB_EXITCODE_EXCP26            0x005A
162 #define VMCB_EXITCODE_EXCP27            0x005B
163 #define VMCB_EXITCODE_EXCP28            0x005C
164 #define VMCB_EXITCODE_EXCP29            0x005D
165 #define VMCB_EXITCODE_EXCP30            0x005E
166 #define VMCB_EXITCODE_EXCP31            0x005F
167 #define VMCB_EXITCODE_INTR              0x0060
168 #define VMCB_EXITCODE_NMI               0x0061
169 #define VMCB_EXITCODE_SMI               0x0062
170 #define VMCB_EXITCODE_INIT              0x0063
171 #define VMCB_EXITCODE_VINTR             0x0064
172 #define VMCB_EXITCODE_CR0_SEL_WRITE     0x0065
173 #define VMCB_EXITCODE_IDTR_READ         0x0066
174 #define VMCB_EXITCODE_GDTR_READ         0x0067
175 #define VMCB_EXITCODE_LDTR_READ         0x0068
176 #define VMCB_EXITCODE_TR_READ           0x0069
177 #define VMCB_EXITCODE_IDTR_WRITE        0x006A
178 #define VMCB_EXITCODE_GDTR_WRITE        0x006B
179 #define VMCB_EXITCODE_LDTR_WRITE        0x006C
180 #define VMCB_EXITCODE_TR_WRITE          0x006D
181 #define VMCB_EXITCODE_RDTSC             0x006E
182 #define VMCB_EXITCODE_RDPMC             0x006F
183 #define VMCB_EXITCODE_PUSHF             0x0070
184 #define VMCB_EXITCODE_POPF              0x0071
185 #define VMCB_EXITCODE_CPUID             0x0072
186 #define VMCB_EXITCODE_RSM               0x0073
187 #define VMCB_EXITCODE_IRET              0x0074
188 #define VMCB_EXITCODE_SWINT             0x0075
189 #define VMCB_EXITCODE_INVD              0x0076
190 #define VMCB_EXITCODE_PAUSE             0x0077
191 #define VMCB_EXITCODE_HLT               0x0078
192 #define VMCB_EXITCODE_INVLPG            0x0079
193 #define VMCB_EXITCODE_INVLPGA           0x007A
194 #define VMCB_EXITCODE_IOIO              0x007B
195 #define VMCB_EXITCODE_MSR               0x007C
196 #define VMCB_EXITCODE_TASK_SWITCH       0x007D
197 #define VMCB_EXITCODE_FERR_FREEZE       0x007E
198 #define VMCB_EXITCODE_SHUTDOWN          0x007F
199 #define VMCB_EXITCODE_VMRUN             0x0080
200 #define VMCB_EXITCODE_VMMCALL           0x0081
201 #define VMCB_EXITCODE_VMLOAD            0x0082
202 #define VMCB_EXITCODE_VMSAVE            0x0083
203 #define VMCB_EXITCODE_STGI              0x0084
204 #define VMCB_EXITCODE_CLGI              0x0085
205 #define VMCB_EXITCODE_SKINIT            0x0086
206 #define VMCB_EXITCODE_RDTSCP            0x0087
207 #define VMCB_EXITCODE_ICEBP             0x0088
208 #define VMCB_EXITCODE_WBINVD            0x0089
209 #define VMCB_EXITCODE_MONITOR           0x008A
210 #define VMCB_EXITCODE_MWAIT             0x008B
211 #define VMCB_EXITCODE_MWAIT_CONDITIONAL 0x008C
212 #define VMCB_EXITCODE_XSETBV            0x008D
213 #define VMCB_EXITCODE_RDPRU             0x008E
214 #define VMCB_EXITCODE_EFER_WRITE_TRAP   0x008F
215 #define VMCB_EXITCODE_CR0_WRITE_TRAP    0x0090
216 #define VMCB_EXITCODE_CR1_WRITE_TRAP    0x0091
217 #define VMCB_EXITCODE_CR2_WRITE_TRAP    0x0092
218 #define VMCB_EXITCODE_CR3_WRITE_TRAP    0x0093
219 #define VMCB_EXITCODE_CR4_WRITE_TRAP    0x0094
220 #define VMCB_EXITCODE_CR5_WRITE_TRAP    0x0095
221 #define VMCB_EXITCODE_CR6_WRITE_TRAP    0x0096
222 #define VMCB_EXITCODE_CR7_WRITE_TRAP    0x0097
223 #define VMCB_EXITCODE_CR8_WRITE_TRAP    0x0098
224 #define VMCB_EXITCODE_CR9_WRITE_TRAP    0x0099
225 #define VMCB_EXITCODE_CR10_WRITE_TRAP   0x009A
226 #define VMCB_EXITCODE_CR11_WRITE_TRAP   0x009B
227 #define VMCB_EXITCODE_CR12_WRITE_TRAP   0x009C
228 #define VMCB_EXITCODE_CR13_WRITE_TRAP   0x009D
229 #define VMCB_EXITCODE_CR14_WRITE_TRAP   0x009E
230 #define VMCB_EXITCODE_CR15_WRITE_TRAP   0x009F
231 #define VMCB_EXITCODE_INVLPGB           0x00A0
232 #define VMCB_EXITCODE_INVLPGB_ILLEGAL   0x00A1
233 #define VMCB_EXITCODE_INVPCID           0x00A2
234 #define VMCB_EXITCODE_MCOMMIT           0x00A3
235 #define VMCB_EXITCODE_TLBSYNC           0x00A4
236 #define VMCB_EXITCODE_NPF               0x0400
237 #define VMCB_EXITCODE_AVIC_INCOMP_IPI   0x0401
238 #define VMCB_EXITCODE_AVIC_NOACCEL      0x0402
239 #define VMCB_EXITCODE_VMGEXIT           0x0403
240 #define VMCB_EXITCODE_BUSY              -2ULL
241 #define VMCB_EXITCODE_INVALID           -1ULL
242
243 /* -------------------------------------------------------------------------- */
244
245 struct vmcb_ctrl {
246         uint32_t intercept_cr;
247 #define VMCB_CTRL_INTERCEPT_RCR(x)      __BIT( 0 + x)
248 #define VMCB_CTRL_INTERCEPT_WCR(x)      __BIT(16 + x)
249
250         uint32_t intercept_dr;
251 #define VMCB_CTRL_INTERCEPT_RDR(x)      __BIT( 0 + x)
252 #define VMCB_CTRL_INTERCEPT_WDR(x)      __BIT(16 + x)
253
254         uint32_t intercept_vec;
255 #define VMCB_CTRL_INTERCEPT_VEC(x)      __BIT(x)
256
257         uint32_t intercept_misc1;
258 #define VMCB_CTRL_INTERCEPT_INTR        __BIT(0)
259 #define VMCB_CTRL_INTERCEPT_NMI         __BIT(1)
260 #define VMCB_CTRL_INTERCEPT_SMI         __BIT(2)
261 #define VMCB_CTRL_INTERCEPT_INIT        __BIT(3)
262 #define VMCB_CTRL_INTERCEPT_VINTR       __BIT(4)
263 #define VMCB_CTRL_INTERCEPT_CR0_SPEC    __BIT(5)
264 #define VMCB_CTRL_INTERCEPT_RIDTR       __BIT(6)
265 #define VMCB_CTRL_INTERCEPT_RGDTR       __BIT(7)
266 #define VMCB_CTRL_INTERCEPT_RLDTR       __BIT(8)
267 #define VMCB_CTRL_INTERCEPT_RTR         __BIT(9)
268 #define VMCB_CTRL_INTERCEPT_WIDTR       __BIT(10)
269 #define VMCB_CTRL_INTERCEPT_WGDTR       __BIT(11)
270 #define VMCB_CTRL_INTERCEPT_WLDTR       __BIT(12)
271 #define VMCB_CTRL_INTERCEPT_WTR         __BIT(13)
272 #define VMCB_CTRL_INTERCEPT_RDTSC       __BIT(14)
273 #define VMCB_CTRL_INTERCEPT_RDPMC       __BIT(15)
274 #define VMCB_CTRL_INTERCEPT_PUSHF       __BIT(16)
275 #define VMCB_CTRL_INTERCEPT_POPF        __BIT(17)
276 #define VMCB_CTRL_INTERCEPT_CPUID       __BIT(18)
277 #define VMCB_CTRL_INTERCEPT_RSM         __BIT(19)
278 #define VMCB_CTRL_INTERCEPT_IRET        __BIT(20)
279 #define VMCB_CTRL_INTERCEPT_INTN        __BIT(21)
280 #define VMCB_CTRL_INTERCEPT_INVD        __BIT(22)
281 #define VMCB_CTRL_INTERCEPT_PAUSE       __BIT(23)
282 #define VMCB_CTRL_INTERCEPT_HLT         __BIT(24)
283 #define VMCB_CTRL_INTERCEPT_INVLPG      __BIT(25)
284 #define VMCB_CTRL_INTERCEPT_INVLPGA     __BIT(26)
285 #define VMCB_CTRL_INTERCEPT_IOIO_PROT   __BIT(27)
286 #define VMCB_CTRL_INTERCEPT_MSR_PROT    __BIT(28)
287 #define VMCB_CTRL_INTERCEPT_TASKSW      __BIT(29)
288 #define VMCB_CTRL_INTERCEPT_FERR_FREEZE __BIT(30)
289 #define VMCB_CTRL_INTERCEPT_SHUTDOWN    __BIT(31)
290
291         uint32_t intercept_misc2;
292 #define VMCB_CTRL_INTERCEPT_VMRUN       __BIT(0)
293 #define VMCB_CTRL_INTERCEPT_VMMCALL     __BIT(1)
294 #define VMCB_CTRL_INTERCEPT_VMLOAD      __BIT(2)
295 #define VMCB_CTRL_INTERCEPT_VMSAVE      __BIT(3)
296 #define VMCB_CTRL_INTERCEPT_STGI        __BIT(4)
297 #define VMCB_CTRL_INTERCEPT_CLGI        __BIT(5)
298 #define VMCB_CTRL_INTERCEPT_SKINIT      __BIT(6)
299 #define VMCB_CTRL_INTERCEPT_RDTSCP      __BIT(7)
300 #define VMCB_CTRL_INTERCEPT_ICEBP       __BIT(8)
301 #define VMCB_CTRL_INTERCEPT_WBINVD      __BIT(9)
302 #define VMCB_CTRL_INTERCEPT_MONITOR     __BIT(10)
303 #define VMCB_CTRL_INTERCEPT_MWAIT       __BIT(11)
304 #define VMCB_CTRL_INTERCEPT_MWAIT_ARMED __BIT(12)
305 #define VMCB_CTRL_INTERCEPT_XSETBV      __BIT(13)
306 #define VMCB_CTRL_INTERCEPT_RDPRU       __BIT(14)
307 #define VMCB_CTRL_INTERCEPT_EFER_SPEC   __BIT(15)
308 #define VMCB_CTRL_INTERCEPT_WCR_SPEC(x) __BIT(16 + x)
309
310         uint32_t intercept_misc3;
311 #define VMCB_CTRL_INTERCEPT_INVLPGB_ALL __BIT(0)
312 #define VMCB_CTRL_INTERCEPT_INVLPGB_ILL __BIT(1)
313 #define VMCB_CTRL_INTERCEPT_PCID        __BIT(2)
314 #define VMCB_CTRL_INTERCEPT_MCOMMIT     __BIT(3)
315 #define VMCB_CTRL_INTERCEPT_TLBSYNC     __BIT(4)
316
317         uint8_t  rsvd1[36];
318         uint16_t pause_filt_thresh;
319         uint16_t pause_filt_cnt;
320         uint64_t iopm_base_pa;
321         uint64_t msrpm_base_pa;
322         uint64_t tsc_offset;
323         uint32_t guest_asid;
324
325         uint32_t tlb_ctrl;
326 #define VMCB_CTRL_TLB_CTRL_FLUSH_ALL                    0x01
327 #define VMCB_CTRL_TLB_CTRL_FLUSH_GUEST                  0x03
328 #define VMCB_CTRL_TLB_CTRL_FLUSH_GUEST_NONGLOBAL        0x07
329
330         uint64_t v;
331 #define VMCB_CTRL_V_TPR                 __BITS(3,0)
332 #define VMCB_CTRL_V_IRQ                 __BIT(8)
333 #define VMCB_CTRL_V_VGIF                __BIT(9)
334 #define VMCB_CTRL_V_INTR_PRIO           __BITS(19,16)
335 #define VMCB_CTRL_V_IGN_TPR             __BIT(20)
336 #define VMCB_CTRL_V_INTR_MASKING        __BIT(24)
337 #define VMCB_CTRL_V_GUEST_VGIF          __BIT(25)
338 #define VMCB_CTRL_V_AVIC_EN             __BIT(31)
339 #define VMCB_CTRL_V_INTR_VECTOR         __BITS(39,32)
340
341         uint64_t intr;
342 #define VMCB_CTRL_INTR_SHADOW           __BIT(0)
343 #define VMCB_CTRL_INTR_MASK             __BIT(1)
344
345         uint64_t exitcode;
346         uint64_t exitinfo1;
347         uint64_t exitinfo2;
348
349         uint64_t exitintinfo;
350 #define VMCB_CTRL_EXITINTINFO_VECTOR    __BITS(7,0)
351 #define VMCB_CTRL_EXITINTINFO_TYPE      __BITS(10,8)
352 #define VMCB_CTRL_EXITINTINFO_EV        __BIT(11)
353 #define VMCB_CTRL_EXITINTINFO_V         __BIT(31)
354 #define VMCB_CTRL_EXITINTINFO_ERRORCODE __BITS(63,32)
355
356         uint64_t enable1;
357 #define VMCB_CTRL_ENABLE_NP             __BIT(0)
358 #define VMCB_CTRL_ENABLE_SEV            __BIT(1)
359 #define VMCB_CTRL_ENABLE_ES_SEV         __BIT(2)
360 #define VMCB_CTRL_ENABLE_GMET           __BIT(3)
361 #define VMCB_CTRL_ENABLE_SSS            __BIT(4)
362 #define VMCB_CTRL_ENABLE_VTE            __BIT(5)
363
364         uint64_t avic;
365 #define VMCB_CTRL_AVIC_APIC_BAR         __BITS(51,0)
366
367         uint64_t ghcb;
368
369         uint64_t eventinj;
370 #define VMCB_CTRL_EVENTINJ_VECTOR       __BITS(7,0)
371 #define VMCB_CTRL_EVENTINJ_TYPE         __BITS(10,8)
372 #define VMCB_CTRL_EVENTINJ_EV           __BIT(11)
373 #define VMCB_CTRL_EVENTINJ_V            __BIT(31)
374 #define VMCB_CTRL_EVENTINJ_ERRORCODE    __BITS(63,32)
375
376         uint64_t n_cr3;
377
378         uint64_t enable2;
379 #define VMCB_CTRL_ENABLE_LBR            __BIT(0)
380 #define VMCB_CTRL_ENABLE_VVMSAVE        __BIT(1)
381
382         uint32_t vmcb_clean;
383 #define VMCB_CTRL_VMCB_CLEAN_I          __BIT(0)
384 #define VMCB_CTRL_VMCB_CLEAN_IOPM       __BIT(1)
385 #define VMCB_CTRL_VMCB_CLEAN_ASID       __BIT(2)
386 #define VMCB_CTRL_VMCB_CLEAN_TPR        __BIT(3)
387 #define VMCB_CTRL_VMCB_CLEAN_NP         __BIT(4)
388 #define VMCB_CTRL_VMCB_CLEAN_CR         __BIT(5)
389 #define VMCB_CTRL_VMCB_CLEAN_DR         __BIT(6)
390 #define VMCB_CTRL_VMCB_CLEAN_DT         __BIT(7)
391 #define VMCB_CTRL_VMCB_CLEAN_SEG        __BIT(8)
392 #define VMCB_CTRL_VMCB_CLEAN_CR2        __BIT(9)
393 #define VMCB_CTRL_VMCB_CLEAN_LBR        __BIT(10)
394 #define VMCB_CTRL_VMCB_CLEAN_AVIC       __BIT(11)
395 #define VMCB_CTRL_VMCB_CLEAN_CET        __BIT(12)
396
397         uint32_t rsvd2;
398         uint64_t nrip;
399         uint8_t inst_len;
400         uint8_t inst_bytes[15];
401         uint64_t avic_abpp;
402         uint64_t rsvd3;
403         uint64_t avic_ltp;
404
405         uint64_t avic_phys;
406 #define VMCB_CTRL_AVIC_PHYS_TABLE_PTR   __BITS(51,12)
407 #define VMCB_CTRL_AVIC_PHYS_MAX_INDEX   __BITS(7,0)
408
409         uint64_t rsvd4;
410         uint64_t vmsa_ptr;
411
412         uint8_t pad[752];
413 } __packed;
414
415 CTASSERT(sizeof(struct vmcb_ctrl) == 1024);
416
417 struct vmcb_segment {
418         uint16_t selector;
419         uint16_t attrib;        /* hidden */
420         uint32_t limit;         /* hidden */
421         uint64_t base;          /* hidden */
422 } __packed;
423
424 CTASSERT(sizeof(struct vmcb_segment) == 16);
425
426 struct vmcb_state {
427         struct   vmcb_segment es;
428         struct   vmcb_segment cs;
429         struct   vmcb_segment ss;
430         struct   vmcb_segment ds;
431         struct   vmcb_segment fs;
432         struct   vmcb_segment gs;
433         struct   vmcb_segment gdt;
434         struct   vmcb_segment ldt;
435         struct   vmcb_segment idt;
436         struct   vmcb_segment tr;
437         uint8_t  rsvd1[43];
438         uint8_t  cpl;
439         uint8_t  rsvd2[4];
440         uint64_t efer;
441         uint8_t  rsvd3[112];
442         uint64_t cr4;
443         uint64_t cr3;
444         uint64_t cr0;
445         uint64_t dr7;
446         uint64_t dr6;
447         uint64_t rflags;
448         uint64_t rip;
449         uint8_t  rsvd4[88];
450         uint64_t rsp;
451         uint64_t s_cet;
452         uint64_t ssp;
453         uint64_t isst_addr;
454         uint64_t rax;
455         uint64_t star;
456         uint64_t lstar;
457         uint64_t cstar;
458         uint64_t sfmask;
459         uint64_t kernelgsbase;
460         uint64_t sysenter_cs;
461         uint64_t sysenter_esp;
462         uint64_t sysenter_eip;
463         uint64_t cr2;
464         uint8_t  rsvd6[32];
465         uint64_t g_pat;
466         uint64_t dbgctl;
467         uint64_t br_from;
468         uint64_t br_to;
469         uint64_t int_from;
470         uint64_t int_to;
471         uint8_t  pad[2408];
472 } __packed;
473
474 CTASSERT(sizeof(struct vmcb_state) == 0xC00);
475
476 struct vmcb {
477         struct vmcb_ctrl ctrl;
478         struct vmcb_state state;
479 } __packed;
480
481 CTASSERT(sizeof(struct vmcb) == PAGE_SIZE);
482 CTASSERT(offsetof(struct vmcb, state) == 0x400);
483
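/*
 * The VMCB is a single page: the 1024-byte control area (intercepts,
 * exit information, event injection, ...) is followed by the guest
 * state save area at offset 0x400, as the CTASSERTs above verify.
 */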
484 /* -------------------------------------------------------------------------- */
485
486 static void svm_vcpu_state_provide(struct nvmm_cpu *, uint64_t);
487 static void svm_vcpu_state_commit(struct nvmm_cpu *);
488
489 /*
490  * These host values are static: they do not change at runtime and are the same
491  * on all CPUs. We save them here because they are not saved in the VMCB.
492  */
493 static struct {
494         uint64_t xcr0;
495         uint64_t star;
496         uint64_t lstar;
497         uint64_t cstar;
498         uint64_t sfmask;
499 } svm_global_hstate __cacheline_aligned;
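/*
 * xcr0 is restored by svm_vcpu_guest_fpu_leave(), and the syscall MSRs
 * (STAR/LSTAR/CSTAR/SFMASK) by svm_vcpu_guest_misc_leave(), after each
 * guest run.
 */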
500
501 struct svm_hsave {
502         paddr_t pa;
503 };
504
505 static struct svm_hsave hsave[MAXCPUS];
506
507 static uint8_t *svm_asidmap __read_mostly;
508 static uint32_t svm_maxasid __read_mostly;
509 static kmutex_t svm_asidlock __cacheline_aligned;
510
511 static bool svm_decode_assist __read_mostly;
512 static uint32_t svm_ctrl_tlb_flush __read_mostly;
513
514 #define SVM_XCR0_MASK_DEFAULT   (XCR0_X87|XCR0_SSE)
515 static uint64_t svm_xcr0_mask __read_mostly;
516
517 static int svm_change_cpu_count;
518
519 #define SVM_NCPUIDS     32
520
521 #define VMCB_NPAGES     1
522
523 #define MSRBM_NPAGES    2
524 #define MSRBM_SIZE      (MSRBM_NPAGES * PAGE_SIZE)
525
526 #define IOBM_NPAGES     3
527 #define IOBM_SIZE       (IOBM_NPAGES * PAGE_SIZE)
528
529 /* Does not include EFER_LMSLE. */
530 #define EFER_VALID \
531         (EFER_SCE|EFER_LME|EFER_LMA|EFER_NXE|EFER_SVME|EFER_FFXSR|EFER_TCE)
532
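/*
 * Bits of EFER/CR0/CR4 that affect address translation: if any of them
 * changes (e.g. the guest EFER write handled in svm_inkernel_handle_msr()),
 * the guest TLB must be flushed.
 */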
533 #define EFER_TLB_FLUSH \
534         (EFER_NXE|EFER_LMA|EFER_LME)
535 #define CR0_TLB_FLUSH \
536         (CR0_PG|CR0_WP|CR0_CD|CR0_NW)
537 #define CR4_TLB_FLUSH \
538         (CR4_PSE|CR4_PAE|CR4_PGE|CR4_PCIDE|CR4_SMEP)
539
540 /* -------------------------------------------------------------------------- */
541
542 struct svm_machdata {
543         volatile uint64_t mach_htlb_gen;
544 };
545
546 static const size_t svm_vcpu_conf_sizes[NVMM_X86_VCPU_NCONF] = {
547         [NVMM_VCPU_CONF_MD(NVMM_VCPU_CONF_CPUID)] =
548             sizeof(struct nvmm_vcpu_conf_cpuid),
549         [NVMM_VCPU_CONF_MD(NVMM_VCPU_CONF_TPR)] =
550             sizeof(struct nvmm_vcpu_conf_tpr)
551 };
552
553 struct svm_cpudata {
554         /* General. */
555         bool shared_asid;
556         bool gtlb_want_flush;
557         bool htlb_want_flush;
558         bool gtsc_want_update;
559         uint64_t vcpu_htlb_gen;
560
561         /* VMCB. */
562         struct vmcb *vmcb;
563         paddr_t vmcb_pa;
564
565         /* I/O bitmap. */
566         uint8_t *iobm;
567         paddr_t iobm_pa;
568
569         /* MSR bitmap. */
570         uint8_t *msrbm;
571         paddr_t msrbm_pa;
572
573         /* Percpu host state, absent from VMCB. */
574         struct {
575                 uint64_t fsbase;
576                 uint64_t kernelgsbase;
577 #ifdef __DragonFly__
578                 mcontext_t hmctx;  /* TODO: remove this, as NetBSD did */
579 #endif
580         } hstate;
581
582         /* Intr state. */
583         bool int_window_exit;
584         bool nmi_window_exit;
585         bool evt_pending;
586
587         /* Guest state. */
588         uint64_t gxcr0;
589         uint64_t gprs[NVMM_X64_NGPR];
590         uint64_t drs[NVMM_X64_NDR];
591         uint64_t gtsc_offset;
592         uint64_t gtsc_match;
593         struct nvmm_x86_xsave gxsave __aligned(64);
594
595         /* VCPU configuration. */
596         bool cpuidpresent[SVM_NCPUIDS];
597         struct nvmm_vcpu_conf_cpuid cpuid[SVM_NCPUIDS];
598 };
599
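/*
 * VMCB clean bits: a set bit tells the CPU that the corresponding VMCB
 * fields have not been modified since the last VMRUN, so the cached copy
 * may be reused. svm_vmcb_cache_default() marks everything clean; any code
 * that modifies a cached field must clear the matching bit with
 * svm_vmcb_cache_flush(), or all of them with svm_vmcb_cache_flush_all().
 */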
600 static void
601 svm_vmcb_cache_default(struct vmcb *vmcb)
602 {
603         vmcb->ctrl.vmcb_clean =
604             VMCB_CTRL_VMCB_CLEAN_I |
605             VMCB_CTRL_VMCB_CLEAN_IOPM |
606             VMCB_CTRL_VMCB_CLEAN_ASID |
607             VMCB_CTRL_VMCB_CLEAN_TPR |
608             VMCB_CTRL_VMCB_CLEAN_NP |
609             VMCB_CTRL_VMCB_CLEAN_CR |
610             VMCB_CTRL_VMCB_CLEAN_DR |
611             VMCB_CTRL_VMCB_CLEAN_DT |
612             VMCB_CTRL_VMCB_CLEAN_SEG |
613             VMCB_CTRL_VMCB_CLEAN_CR2 |
614             VMCB_CTRL_VMCB_CLEAN_LBR |
615             VMCB_CTRL_VMCB_CLEAN_AVIC;
616 }
617
618 static void
619 svm_vmcb_cache_update(struct vmcb *vmcb, uint64_t flags)
620 {
621         if (flags & NVMM_X64_STATE_SEGS) {
622                 vmcb->ctrl.vmcb_clean &=
623                     ~(VMCB_CTRL_VMCB_CLEAN_SEG | VMCB_CTRL_VMCB_CLEAN_DT);
624         }
625         if (flags & NVMM_X64_STATE_CRS) {
626                 vmcb->ctrl.vmcb_clean &=
627                     ~(VMCB_CTRL_VMCB_CLEAN_CR | VMCB_CTRL_VMCB_CLEAN_CR2 |
628                       VMCB_CTRL_VMCB_CLEAN_TPR);
629         }
630         if (flags & NVMM_X64_STATE_DRS) {
631                 vmcb->ctrl.vmcb_clean &= ~VMCB_CTRL_VMCB_CLEAN_DR;
632         }
633         if (flags & NVMM_X64_STATE_MSRS) {
634                 /* CR for EFER, NP for PAT. */
635                 vmcb->ctrl.vmcb_clean &=
636                     ~(VMCB_CTRL_VMCB_CLEAN_CR | VMCB_CTRL_VMCB_CLEAN_NP);
637         }
638 }
639
640 static inline void
641 svm_vmcb_cache_flush(struct vmcb *vmcb, uint64_t flags)
642 {
643         vmcb->ctrl.vmcb_clean &= ~flags;
644 }
645
646 static inline void
647 svm_vmcb_cache_flush_all(struct vmcb *vmcb)
648 {
649         vmcb->ctrl.vmcb_clean = 0;
650 }
651
652 #define SVM_EVENT_TYPE_HW_INT   0
653 #define SVM_EVENT_TYPE_NMI      2
654 #define SVM_EVENT_TYPE_EXC      3
655 #define SVM_EVENT_TYPE_SW_INT   4
656
657 static void
658 svm_event_waitexit_enable(struct nvmm_cpu *vcpu, bool nmi)
659 {
660         struct svm_cpudata *cpudata = vcpu->cpudata;
661         struct vmcb *vmcb = cpudata->vmcb;
662
663         if (nmi) {
664                 vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_IRET;
665                 cpudata->nmi_window_exit = true;
666         } else {
667                 vmcb->ctrl.intercept_misc1 |= VMCB_CTRL_INTERCEPT_VINTR;
668                 vmcb->ctrl.v |= (VMCB_CTRL_V_IRQ | VMCB_CTRL_V_IGN_TPR);
669                 svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_TPR);
670                 cpudata->int_window_exit = true;
671         }
672
673         svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_I);
674 }
675
676 static void
677 svm_event_waitexit_disable(struct nvmm_cpu *vcpu, bool nmi)
678 {
679         struct svm_cpudata *cpudata = vcpu->cpudata;
680         struct vmcb *vmcb = cpudata->vmcb;
681
682         if (nmi) {
683                 vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_IRET;
684                 cpudata->nmi_window_exit = false;
685         } else {
686                 vmcb->ctrl.intercept_misc1 &= ~VMCB_CTRL_INTERCEPT_VINTR;
687                 vmcb->ctrl.v &= ~(VMCB_CTRL_V_IRQ | VMCB_CTRL_V_IGN_TPR);
688                 svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_TPR);
689                 cpudata->int_window_exit = false;
690         }
691
692         svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_I);
693 }
694
695 static inline bool
696 svm_excp_has_rf(uint8_t vector)
697 {
698         switch (vector) {
699         case 1:         /* #DB */
700         case 4:         /* #OF */
701         case 8:         /* #DF */
702         case 18:        /* #MC */
703                 return false;
704         default:
705                 return true;
706         }
707 }
708
709 static inline int
710 svm_excp_has_error(uint8_t vector)
711 {
712         switch (vector) {
713         case 8:         /* #DF */
714         case 10:        /* #TS */
715         case 11:        /* #NP */
716         case 12:        /* #SS */
717         case 13:        /* #GP */
718         case 14:        /* #PF */
719         case 17:        /* #AC */
720         case 21:        /* #CP */
721         case 30:        /* #SX */
722                 return 1;
723         default:
724                 return 0;
725         }
726 }
727
728 static int
729 svm_vcpu_inject(struct nvmm_cpu *vcpu)
730 {
731         struct nvmm_comm_page *comm = vcpu->comm;
732         struct svm_cpudata *cpudata = vcpu->cpudata;
733         struct vmcb *vmcb = cpudata->vmcb;
734         u_int evtype;
735         uint8_t vector;
736         uint64_t error;
737         int type = 0, err = 0;
738
739         evtype = comm->event.type;
740         vector = comm->event.vector;
741         error = comm->event.u.excp.error;
742         __insn_barrier();
743
744         switch (evtype) {
745         case NVMM_VCPU_EVENT_EXCP:
746                 type = SVM_EVENT_TYPE_EXC;
747                 if (vector == 2 || vector >= 32)
748                         return EINVAL;
749                 if (vector == 3 || vector == 0)
750                         return EINVAL;
751                 if (svm_excp_has_rf(vector)) {
752                         vmcb->state.rflags |= PSL_RF;
753                 }
754                 err = svm_excp_has_error(vector);
755                 break;
756         case NVMM_VCPU_EVENT_INTR:
757                 type = SVM_EVENT_TYPE_HW_INT;
758                 if (vector == 2) {
759                         type = SVM_EVENT_TYPE_NMI;
760                         svm_event_waitexit_enable(vcpu, true);
761                 }
762                 err = 0;
763                 break;
764         default:
765                 return EINVAL;
766         }
767
768         vmcb->ctrl.eventinj =
769             __SHIFTIN((uint64_t)vector, VMCB_CTRL_EVENTINJ_VECTOR) |
770             __SHIFTIN((uint64_t)type, VMCB_CTRL_EVENTINJ_TYPE) |
771             __SHIFTIN((uint64_t)err, VMCB_CTRL_EVENTINJ_EV) |
772             __SHIFTIN((uint64_t)1, VMCB_CTRL_EVENTINJ_V) |
773             __SHIFTIN((uint64_t)error, VMCB_CTRL_EVENTINJ_ERRORCODE);
774
775         cpudata->evt_pending = true;
776
777         return 0;
778 }
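/*
 * For example, injecting #GP(0) (see svm_inject_gp() below) encodes
 * eventinj as vector 13, type SVM_EVENT_TYPE_EXC, EV=1 (error code
 * valid), V=1 (injection valid) and error code 0; the CPU delivers the
 * event on the next VMRUN.
 */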
779
780 static void
781 svm_inject_ud(struct nvmm_cpu *vcpu)
782 {
783         struct nvmm_comm_page *comm = vcpu->comm;
784         int ret __diagused;
785
786         comm->event.type = NVMM_VCPU_EVENT_EXCP;
787         comm->event.vector = 6;
788         comm->event.u.excp.error = 0;
789
790         ret = svm_vcpu_inject(vcpu);
791         KASSERT(ret == 0);
792 }
793
794 static void
795 svm_inject_gp(struct nvmm_cpu *vcpu)
796 {
797         struct nvmm_comm_page *comm = vcpu->comm;
798         int ret __diagused;
799
800         comm->event.type = NVMM_VCPU_EVENT_EXCP;
801         comm->event.vector = 13;
802         comm->event.u.excp.error = 0;
803
804         ret = svm_vcpu_inject(vcpu);
805         KASSERT(ret == 0);
806 }
807
808 static inline int
809 svm_vcpu_event_commit(struct nvmm_cpu *vcpu)
810 {
811         if (__predict_true(!vcpu->comm->event_commit)) {
812                 return 0;
813         }
814         vcpu->comm->event_commit = false;
815         return svm_vcpu_inject(vcpu);
816 }
817
818 static inline void
819 svm_inkernel_advance(struct vmcb *vmcb)
820 {
821         /*
822          * Maybe we should also apply single-stepping and debug exceptions.
823          * This matters for guest ring-3 code, because it can execute 'cpuid'
824          * under a debugger.
825          */
826         vmcb->state.rip = vmcb->ctrl.nrip;
827         vmcb->state.rflags &= ~PSL_RF;
828         vmcb->ctrl.intr &= ~VMCB_CTRL_INTR_SHADOW;
829 }
830
831 #define SVM_CPUID_MAX_BASIC             0xD
832 #define SVM_CPUID_MAX_HYPERVISOR        0x40000000
833 #define SVM_CPUID_MAX_EXTENDED          0x8000001F
834 static uint32_t svm_cpuid_max_basic __read_mostly;
835 static uint32_t svm_cpuid_max_extended __read_mostly;
836
837 static void
838 svm_inkernel_exec_cpuid(struct svm_cpudata *cpudata, uint32_t eax, uint32_t ecx)
839 {
840         u_int descs[4];
841
842         x86_cpuid2(eax, ecx, descs);
843         cpudata->vmcb->state.rax = descs[0];
844         cpudata->gprs[NVMM_X64_GPR_RBX] = descs[1];
845         cpudata->gprs[NVMM_X64_GPR_RCX] = descs[2];
846         cpudata->gprs[NVMM_X64_GPR_RDX] = descs[3];
847 }
848
849 static void
850 svm_inkernel_handle_cpuid(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
851     uint32_t eax, uint32_t ecx)
852 {
853         struct svm_cpudata *cpudata = vcpu->cpudata;
854         unsigned int ncpus;
855         uint64_t cr4;
856
857         if (eax < 0x40000000) {
858                 if (__predict_false(eax > svm_cpuid_max_basic)) {
859                         eax = svm_cpuid_max_basic;
860                         svm_inkernel_exec_cpuid(cpudata, eax, ecx);
861                 }
862         } else if (eax < 0x80000000) {
863                 if (__predict_false(eax > SVM_CPUID_MAX_HYPERVISOR)) {
864                         eax = svm_cpuid_max_basic;
865                         svm_inkernel_exec_cpuid(cpudata, eax, ecx);
866                 }
867         } else {
868                 if (__predict_false(eax > svm_cpuid_max_extended)) {
869                         eax = svm_cpuid_max_basic;
870                         svm_inkernel_exec_cpuid(cpudata, eax, ecx);
871                 }
872         }
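        /*
         * Out-of-range leaves were clamped above to the highest basic leaf
         * we expose, rather than letting the guest query arbitrary leaf
         * numbers.
         */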
873
874         switch (eax) {
875         case 0x00000000:
876                 cpudata->vmcb->state.rax = svm_cpuid_max_basic;
877                 break;
878         case 0x00000001:
879                 cpudata->vmcb->state.rax &= nvmm_cpuid_00000001.eax;
880
881                 cpudata->gprs[NVMM_X64_GPR_RBX] &= ~CPUID_0_01_EBX_LOCAL_APIC_ID;
882                 cpudata->gprs[NVMM_X64_GPR_RBX] |= __SHIFTIN(vcpu->cpuid,
883                     CPUID_0_01_EBX_LOCAL_APIC_ID);
884
885                 ncpus = atomic_load_acq_int(&mach->ncpus);
886                 cpudata->gprs[NVMM_X64_GPR_RBX] &= ~CPUID_0_01_EBX_HTT_CORES;
887                 cpudata->gprs[NVMM_X64_GPR_RBX] |= __SHIFTIN(ncpus,
888                     CPUID_0_01_EBX_HTT_CORES);
889
890                 cpudata->gprs[NVMM_X64_GPR_RCX] &= nvmm_cpuid_00000001.ecx;
891                 cpudata->gprs[NVMM_X64_GPR_RCX] |= CPUID_0_01_ECX_RAZ;
892
893                 cpudata->gprs[NVMM_X64_GPR_RDX] &= nvmm_cpuid_00000001.edx;
894
895                 /* CPUID_0_01_ECX_OSXSAVE depends on CR4. */
896                 cr4 = cpudata->vmcb->state.cr4;
897                 if (!(cr4 & CR4_OSXSAVE)) {
898                         cpudata->gprs[NVMM_X64_GPR_RCX] &= ~CPUID_0_01_ECX_OSXSAVE;
899                 }
900                 break;
901         case 0x00000002: /* Empty */
902         case 0x00000003: /* Empty */
903         case 0x00000004: /* Empty */
904         case 0x00000005: /* Monitor/MWait */
905         case 0x00000006: /* Power Management Related Features */
906                 cpudata->vmcb->state.rax = 0;
907                 cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
908                 cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
909                 cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
910                 break;
911         case 0x00000007: /* Structured Extended Features */
912                 switch (ecx) {
913                 case 0:
914                         cpudata->vmcb->state.rax = 0;
915                         cpudata->gprs[NVMM_X64_GPR_RBX] &= nvmm_cpuid_00000007.ebx;
916                         cpudata->gprs[NVMM_X64_GPR_RCX] &= nvmm_cpuid_00000007.ecx;
917                         cpudata->gprs[NVMM_X64_GPR_RDX] &= nvmm_cpuid_00000007.edx;
918                         break;
919                 default:
920                         cpudata->vmcb->state.rax = 0;
921                         cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
922                         cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
923                         cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
924                         break;
925                 }
926                 break;
927         case 0x00000008: /* Empty */
928         case 0x00000009: /* Empty */
929         case 0x0000000A: /* Empty */
930         case 0x0000000B: /* Empty */
931         case 0x0000000C: /* Empty */
932                 cpudata->vmcb->state.rax = 0;
933                 cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
934                 cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
935                 cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
936                 break;
937         case 0x0000000D: /* Processor Extended State Enumeration */
938                 if (svm_xcr0_mask == 0) {
939                         break;
940                 }
941                 switch (ecx) {
942                 case 0:
943                         /* Supported XCR0 bits. */
944                         cpudata->vmcb->state.rax = svm_xcr0_mask & 0xFFFFFFFF;
945                         cpudata->gprs[NVMM_X64_GPR_RDX] = svm_xcr0_mask >> 32;
946                         /* XSAVE size for currently enabled XCR0 features. */
947                         cpudata->gprs[NVMM_X64_GPR_RBX] = nvmm_x86_xsave_size(cpudata->gxcr0);
948                         /* XSAVE size for all supported XCR0 features. */
949                         cpudata->gprs[NVMM_X64_GPR_RCX] = nvmm_x86_xsave_size(svm_xcr0_mask);
950                         break;
951                 case 1:
952                         cpudata->vmcb->state.rax &=
953                             (CPUID_0_0D_ECX1_EAX_XSAVEOPT |
954                              CPUID_0_0D_ECX1_EAX_XSAVEC |
955                              CPUID_0_0D_ECX1_EAX_XGETBV);
956                         cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
957                         cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
958                         cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
959                         break;
960                 default:
961                         cpudata->vmcb->state.rax = 0;
962                         cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
963                         cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
964                         cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
965                         break;
966                 }
967                 break;
968
969         case 0x40000000: /* Hypervisor Information */
970                 cpudata->vmcb->state.rax = SVM_CPUID_MAX_HYPERVISOR;
971                 cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
972                 cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
973                 cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
974                 memcpy(&cpudata->gprs[NVMM_X64_GPR_RBX], "___ ", 4);
975                 memcpy(&cpudata->gprs[NVMM_X64_GPR_RCX], "NVMM", 4);
976                 memcpy(&cpudata->gprs[NVMM_X64_GPR_RDX], " ___", 4);
977                 break;
978
979         case 0x80000000:
980                 cpudata->vmcb->state.rax = svm_cpuid_max_extended;
981                 break;
982         case 0x80000001:
983                 cpudata->vmcb->state.rax &= nvmm_cpuid_80000001.eax;
984                 cpudata->gprs[NVMM_X64_GPR_RBX] &= nvmm_cpuid_80000001.ebx;
985                 cpudata->gprs[NVMM_X64_GPR_RCX] &= nvmm_cpuid_80000001.ecx;
986                 cpudata->gprs[NVMM_X64_GPR_RDX] &= nvmm_cpuid_80000001.edx;
987                 break;
988         case 0x80000002: /* Extended Processor Name String */
989         case 0x80000003: /* Extended Processor Name String */
990         case 0x80000004: /* Extended Processor Name String */
991         case 0x80000005: /* L1 Cache and TLB Information */
992         case 0x80000006: /* L2 Cache and TLB and L3 Cache Information */
993                 break;
994         case 0x80000007: /* Processor Power Management and RAS Capabilities */
995                 cpudata->vmcb->state.rax &= nvmm_cpuid_80000007.eax;
996                 cpudata->gprs[NVMM_X64_GPR_RBX] &= nvmm_cpuid_80000007.ebx;
997                 cpudata->gprs[NVMM_X64_GPR_RCX] &= nvmm_cpuid_80000007.ecx;
998                 cpudata->gprs[NVMM_X64_GPR_RDX] &= nvmm_cpuid_80000007.edx;
999                 break;
1000         case 0x80000008: /* Processor Capacity Parameters and Ext Feat Ident */
1001                 ncpus = atomic_load_acq_int(&mach->ncpus);
1002                 cpudata->vmcb->state.rax &= nvmm_cpuid_80000008.eax;
1003                 cpudata->gprs[NVMM_X64_GPR_RBX] &= nvmm_cpuid_80000008.ebx;
1004                 cpudata->gprs[NVMM_X64_GPR_RCX] =
1005                     __SHIFTIN(ncpus - 1, CPUID_8_08_ECX_NC) |
1006                     __SHIFTIN(ilog2(NVMM_MAX_VCPUS), CPUID_8_08_ECX_ApicIdSize);
1007                 cpudata->gprs[NVMM_X64_GPR_RDX] &= nvmm_cpuid_80000008.edx;
1008                 break;
1009         case 0x80000009: /* Empty */
1010         case 0x8000000A: /* SVM Features */
1011         case 0x8000000B: /* Empty */
1012         case 0x8000000C: /* Empty */
1013         case 0x8000000D: /* Empty */
1014         case 0x8000000E: /* Empty */
1015         case 0x8000000F: /* Empty */
1016         case 0x80000010: /* Empty */
1017         case 0x80000011: /* Empty */
1018         case 0x80000012: /* Empty */
1019         case 0x80000013: /* Empty */
1020         case 0x80000014: /* Empty */
1021         case 0x80000015: /* Empty */
1022         case 0x80000016: /* Empty */
1023         case 0x80000017: /* Empty */
1024         case 0x80000018: /* Empty */
1025                 cpudata->vmcb->state.rax = 0;
1026                 cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
1027                 cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
1028                 cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
1029                 break;
1030         case 0x80000019: /* TLB Characteristics for 1GB pages */
1031         case 0x8000001A: /* Instruction Optimizations */
1032                 break;
1033         case 0x8000001B: /* Instruction-Based Sampling Capabilities */
1034         case 0x8000001C: /* Lightweight Profiling Capabilities */
1035                 cpudata->vmcb->state.rax = 0;
1036                 cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
1037                 cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
1038                 cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
1039                 break;
1040         case 0x8000001D: /* Cache Topology Information */
1041         case 0x8000001E: /* Processor Topology Information */
1042                 break; /* TODO? */
1043         case 0x8000001F: /* Encrypted Memory Capabilities */
1044                 cpudata->vmcb->state.rax = 0;
1045                 cpudata->gprs[NVMM_X64_GPR_RBX] = 0;
1046                 cpudata->gprs[NVMM_X64_GPR_RCX] = 0;
1047                 cpudata->gprs[NVMM_X64_GPR_RDX] = 0;
1048                 break;
1049
1050         default:
1051                 break;
1052         }
1053 }
1054
1055 static void
1056 svm_exit_insn(struct vmcb *vmcb, struct nvmm_vcpu_exit *exit, uint64_t reason)
1057 {
1058         exit->u.insn.npc = vmcb->ctrl.nrip;
1059         exit->reason = reason;
1060 }
1061
1062 static void
1063 svm_exit_cpuid(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1064     struct nvmm_vcpu_exit *exit)
1065 {
1066         struct svm_cpudata *cpudata = vcpu->cpudata;
1067         struct nvmm_vcpu_conf_cpuid *cpuid;
1068         uint32_t eax, ecx;
1069         size_t i;
1070
1071         eax = (cpudata->vmcb->state.rax & 0xFFFFFFFF);
1072         ecx = (cpudata->gprs[NVMM_X64_GPR_RCX] & 0xFFFFFFFF);
1073         svm_inkernel_exec_cpuid(cpudata, eax, ecx);
1074         svm_inkernel_handle_cpuid(mach, vcpu, eax, ecx);
1075
1076         for (i = 0; i < SVM_NCPUIDS; i++) {
1077                 if (!cpudata->cpuidpresent[i]) {
1078                         continue;
1079                 }
1080                 cpuid = &cpudata->cpuid[i];
1081                 if (cpuid->leaf != eax) {
1082                         continue;
1083                 }
1084
1085                 if (cpuid->exit) {
1086                         svm_exit_insn(cpudata->vmcb, exit, NVMM_VCPU_EXIT_CPUID);
1087                         return;
1088                 }
1089                 KASSERT(cpuid->mask);
1090
1091                 /* del */
1092                 cpudata->vmcb->state.rax &= ~cpuid->u.mask.del.eax;
1093                 cpudata->gprs[NVMM_X64_GPR_RBX] &= ~cpuid->u.mask.del.ebx;
1094                 cpudata->gprs[NVMM_X64_GPR_RCX] &= ~cpuid->u.mask.del.ecx;
1095                 cpudata->gprs[NVMM_X64_GPR_RDX] &= ~cpuid->u.mask.del.edx;
1096
1097                 /* set */
1098                 cpudata->vmcb->state.rax |= cpuid->u.mask.set.eax;
1099                 cpudata->gprs[NVMM_X64_GPR_RBX] |= cpuid->u.mask.set.ebx;
1100                 cpudata->gprs[NVMM_X64_GPR_RCX] |= cpuid->u.mask.set.ecx;
1101                 cpudata->gprs[NVMM_X64_GPR_RDX] |= cpuid->u.mask.set.edx;
1102
1103                 break;
1104         }
1105
1106         svm_inkernel_advance(cpudata->vmcb);
1107         exit->reason = NVMM_VCPU_EXIT_NONE;
1108 }
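/*
 * CPUID is thus handled in three stages: the instruction is executed on
 * the host (svm_inkernel_exec_cpuid), the result is filtered down to what
 * NVMM exposes (svm_inkernel_handle_cpuid), and finally any matching
 * per-VCPU CPUID configuration is applied as a set/del mask, or the exit
 * is forwarded to userland if the leaf is configured with 'exit'.
 */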
1109
1110 static void
1111 svm_exit_hlt(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1112     struct nvmm_vcpu_exit *exit)
1113 {
1114         struct svm_cpudata *cpudata = vcpu->cpudata;
1115         struct vmcb *vmcb = cpudata->vmcb;
1116
1117         if (cpudata->int_window_exit && (vmcb->state.rflags & PSL_I)) {
1118                 svm_event_waitexit_disable(vcpu, false);
1119         }
1120
1121         svm_inkernel_advance(cpudata->vmcb);
1122         exit->reason = NVMM_VCPU_EXIT_HALTED;
1123 }
1124
1125 #define SVM_EXIT_IO_PORT        __BITS(31,16)
1126 #define SVM_EXIT_IO_SEG         __BITS(12,10)
1127 #define SVM_EXIT_IO_A64         __BIT(9)
1128 #define SVM_EXIT_IO_A32         __BIT(8)
1129 #define SVM_EXIT_IO_A16         __BIT(7)
1130 #define SVM_EXIT_IO_SZ32        __BIT(6)
1131 #define SVM_EXIT_IO_SZ16        __BIT(5)
1132 #define SVM_EXIT_IO_SZ8         __BIT(4)
1133 #define SVM_EXIT_IO_REP         __BIT(3)
1134 #define SVM_EXIT_IO_STR         __BIT(2)
1135 #define SVM_EXIT_IO_IN          __BIT(0)
1136
1137 static void
1138 svm_exit_io(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1139     struct nvmm_vcpu_exit *exit)
1140 {
1141         struct svm_cpudata *cpudata = vcpu->cpudata;
1142         uint64_t info = cpudata->vmcb->ctrl.exitinfo1;
1143         uint64_t nextpc = cpudata->vmcb->ctrl.exitinfo2;
1144
1145         exit->reason = NVMM_VCPU_EXIT_IO;
1146
1147         exit->u.io.in = (info & SVM_EXIT_IO_IN) != 0;
1148         exit->u.io.port = __SHIFTOUT(info, SVM_EXIT_IO_PORT);
1149
1150         if (svm_decode_assist) {
1151                 KASSERT(__SHIFTOUT(info, SVM_EXIT_IO_SEG) < 6);
1152                 exit->u.io.seg = __SHIFTOUT(info, SVM_EXIT_IO_SEG);
1153         } else {
1154                 exit->u.io.seg = -1;
1155         }
1156
1157         if (info & SVM_EXIT_IO_A64) {
1158                 exit->u.io.address_size = 8;
1159         } else if (info & SVM_EXIT_IO_A32) {
1160                 exit->u.io.address_size = 4;
1161         } else if (info & SVM_EXIT_IO_A16) {
1162                 exit->u.io.address_size = 2;
1163         }
1164
1165         if (info & SVM_EXIT_IO_SZ32) {
1166                 exit->u.io.operand_size = 4;
1167         } else if (info & SVM_EXIT_IO_SZ16) {
1168                 exit->u.io.operand_size = 2;
1169         } else if (info & SVM_EXIT_IO_SZ8) {
1170                 exit->u.io.operand_size = 1;
1171         }
1172
1173         exit->u.io.rep = (info & SVM_EXIT_IO_REP) != 0;
1174         exit->u.io.str = (info & SVM_EXIT_IO_STR) != 0;
1175         exit->u.io.npc = nextpc;
1176
1177         svm_vcpu_state_provide(vcpu,
1178             NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
1179             NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
1180 }
1181
1182 static const uint64_t msr_ignore_list[] = {
1183         0xc0010055, /* MSR_CMPHALT */
1184         MSR_DE_CFG,
1185         MSR_IC_CFG,
1186         MSR_UCODE_AMD_PATCHLEVEL
1187 };
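/*
 * MSRs in msr_ignore_list are handled in-kernel as read-as-zero /
 * write-ignored (see svm_inkernel_handle_msr() below), so guests poking
 * them do not cause an exit to userland.
 */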
1188
1189 static bool
1190 svm_inkernel_handle_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1191     struct nvmm_vcpu_exit *exit)
1192 {
1193         struct svm_cpudata *cpudata = vcpu->cpudata;
1194         struct vmcb *vmcb = cpudata->vmcb;
1195         uint64_t val;
1196         size_t i;
1197
1198         if (exit->reason == NVMM_VCPU_EXIT_RDMSR) {
1199                 if (exit->u.rdmsr.msr == MSR_EFER) {
1200                         val = vmcb->state.efer & ~EFER_SVME;
1201                         vmcb->state.rax = (val & 0xFFFFFFFF);
1202                         cpudata->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
1203                         goto handled;
1204                 }
1205                 if (exit->u.rdmsr.msr == MSR_NB_CFG) {
1206                         val = NB_CFG_INITAPICCPUIDLO;
1207                         vmcb->state.rax = (val & 0xFFFFFFFF);
1208                         cpudata->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
1209                         goto handled;
1210                 }
1211                 for (i = 0; i < __arraycount(msr_ignore_list); i++) {
1212                         if (msr_ignore_list[i] != exit->u.rdmsr.msr)
1213                                 continue;
1214                         val = 0;
1215                         vmcb->state.rax = (val & 0xFFFFFFFF);
1216                         cpudata->gprs[NVMM_X64_GPR_RDX] = (val >> 32);
1217                         goto handled;
1218                 }
1219         } else {
1220                 if (exit->u.wrmsr.msr == MSR_EFER) {
1221                         if (__predict_false(exit->u.wrmsr.val & ~EFER_VALID)) {
1222                                 goto error;
1223                         }
1224                         if ((vmcb->state.efer ^ exit->u.wrmsr.val) &
1225                              EFER_TLB_FLUSH) {
1226                                 cpudata->gtlb_want_flush = true;
1227                         }
1228                         vmcb->state.efer = exit->u.wrmsr.val | EFER_SVME;
1229                         svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_CR);
1230                         goto handled;
1231                 }
1232
1233                 /* All bets are off if MSR_TSC is actually written to. */
1234                 if (exit->u.wrmsr.msr == MSR_TSC) {
1235                         cpudata->gtsc_offset = exit->u.wrmsr.val - rdtsc();
1236                         cpudata->gtsc_want_update = true;
1237                         goto handled;
1238                 }
1239                 for (i = 0; i < __arraycount(msr_ignore_list); i++) {
1240                         if (msr_ignore_list[i] != exit->u.wrmsr.msr)
1241                                 continue;
1242                         goto handled;
1243                 }
1244         }
1245
1246         return false;
1247
1248 handled:
1249         svm_inkernel_advance(cpudata->vmcb);
1250         return true;
1251
1252 error:
1253         svm_inject_gp(vcpu);
1254         return true;
1255 }
1256
1257 static inline void
1258 svm_exit_rdmsr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1259     struct nvmm_vcpu_exit *exit)
1260 {
1261         struct svm_cpudata *cpudata = vcpu->cpudata;
1262
1263         exit->reason = NVMM_VCPU_EXIT_RDMSR;
1264         exit->u.rdmsr.msr = (cpudata->gprs[NVMM_X64_GPR_RCX] & 0xFFFFFFFF);
1265         exit->u.rdmsr.npc = cpudata->vmcb->ctrl.nrip;
1266
1267         if (svm_inkernel_handle_msr(mach, vcpu, exit)) {
1268                 exit->reason = NVMM_VCPU_EXIT_NONE;
1269                 return;
1270         }
1271
1272         svm_vcpu_state_provide(vcpu, NVMM_X64_STATE_GPRS);
1273 }
1274
1275 static inline void
1276 svm_exit_wrmsr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1277     struct nvmm_vcpu_exit *exit)
1278 {
1279         struct svm_cpudata *cpudata = vcpu->cpudata;
1280         uint64_t rdx, rax;
1281
1282         rdx = cpudata->gprs[NVMM_X64_GPR_RDX];
1283         rax = cpudata->vmcb->state.rax;
1284
1285         exit->reason = NVMM_VCPU_EXIT_WRMSR;
1286         exit->u.wrmsr.msr = (cpudata->gprs[NVMM_X64_GPR_RCX] & 0xFFFFFFFF);
1287         exit->u.wrmsr.val = (rdx << 32) | (rax & 0xFFFFFFFF);
1288         exit->u.wrmsr.npc = cpudata->vmcb->ctrl.nrip;
1289
1290         if (svm_inkernel_handle_msr(mach, vcpu, exit)) {
1291                 exit->reason = NVMM_VCPU_EXIT_NONE;
1292                 return;
1293         }
1294
1295         svm_vcpu_state_provide(vcpu, NVMM_X64_STATE_GPRS);
1296 }
1297
1298 static void
1299 svm_exit_msr(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1300     struct nvmm_vcpu_exit *exit)
1301 {
1302         struct svm_cpudata *cpudata = vcpu->cpudata;
1303         uint64_t info = cpudata->vmcb->ctrl.exitinfo1;
1304
1305         if (info == 0) {
1306                 svm_exit_rdmsr(mach, vcpu, exit);
1307         } else {
1308                 svm_exit_wrmsr(mach, vcpu, exit);
1309         }
1310 }
1311
1312 static void
1313 svm_exit_npf(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1314     struct nvmm_vcpu_exit *exit)
1315 {
1316         struct svm_cpudata *cpudata = vcpu->cpudata;
1317         gpaddr_t gpa = cpudata->vmcb->ctrl.exitinfo2;
1318
1319         exit->reason = NVMM_VCPU_EXIT_MEMORY;
1320         if (cpudata->vmcb->ctrl.exitinfo1 & PGEX_W)
1321                 exit->u.mem.prot = PROT_WRITE;
1322         else if (cpudata->vmcb->ctrl.exitinfo1 & PGEX_I)
1323                 exit->u.mem.prot = PROT_EXEC;
1324         else
1325                 exit->u.mem.prot = PROT_READ;
1326         exit->u.mem.gpa = gpa;
1327         exit->u.mem.inst_len = cpudata->vmcb->ctrl.inst_len;
1328         memcpy(exit->u.mem.inst_bytes, cpudata->vmcb->ctrl.inst_bytes,
1329             sizeof(exit->u.mem.inst_bytes));
1330
1331         svm_vcpu_state_provide(vcpu,
1332             NVMM_X64_STATE_GPRS | NVMM_X64_STATE_SEGS |
1333             NVMM_X64_STATE_CRS | NVMM_X64_STATE_MSRS);
1334 }
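/*
 * For nested page faults, exitinfo2 holds the faulting guest-physical
 * address and exitinfo1 holds page-fault-style error bits (write,
 * instruction fetch). The copied instruction bytes, when provided by
 * decode assist, let the userland emulator decode the access without
 * fetching it from guest memory again.
 */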
1335
1336 static void
1337 svm_exit_xsetbv(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1338     struct nvmm_vcpu_exit *exit)
1339 {
1340         struct svm_cpudata *cpudata = vcpu->cpudata;
1341         struct vmcb *vmcb = cpudata->vmcb;
1342         uint64_t val;
1343
1344         exit->reason = NVMM_VCPU_EXIT_NONE;
1345
1346         val = (cpudata->gprs[NVMM_X64_GPR_RDX] << 32) |
1347             (vmcb->state.rax & 0xFFFFFFFF);
1348
1349         if (__predict_false(cpudata->gprs[NVMM_X64_GPR_RCX] != 0)) {
1350                 goto error;
1351         } else if (__predict_false(vmcb->state.cpl != 0)) {
1352                 goto error;
1353         } else if (__predict_false((val & ~svm_xcr0_mask) != 0)) {
1354                 goto error;
1355         } else if (__predict_false((val & XCR0_X87) == 0)) {
1356                 goto error;
1357         }
1358
1359         cpudata->gxcr0 = val;
1360
1361         svm_inkernel_advance(cpudata->vmcb);
1362         return;
1363
1364 error:
1365         svm_inject_gp(vcpu);
1366 }
1367
1368 static void
1369 svm_exit_invalid(struct nvmm_vcpu_exit *exit, uint64_t code)
1370 {
1371         exit->u.inv.hwcode = code;
1372         exit->reason = NVMM_VCPU_EXIT_INVALID;
1373 }
1374
1375 /* -------------------------------------------------------------------------- */
1376
1377 static void
1378 svm_vcpu_guest_fpu_enter(struct nvmm_cpu *vcpu)
1379 {
1380         struct svm_cpudata *cpudata = vcpu->cpudata;
1381
1382 #ifdef __NetBSD__
1383         fpu_kern_enter();
1384         fpu_area_restore(&cpudata->gxsave, svm_xcr0_mask, true);
1385 #else /* DragonFly */
1386         /*
1387          * NOTE: Host FPU state depends on whether the user program used the
1388          *       FPU or not.  Need to use npxpush()/npxpop() to handle this.
1389          */
1390         npxpush(&cpudata->hstate.hmctx);
1391         clts();
1392         fpurstor((union savefpu *)&cpudata->gxsave, svm_xcr0_mask);
1393 #endif
1394
1395         if (svm_xcr0_mask != 0) {
1396                 wrxcr(0, cpudata->gxcr0);
1397         }
1398 }
1399
1400 static void
1401 svm_vcpu_guest_fpu_leave(struct nvmm_cpu *vcpu)
1402 {
1403         struct svm_cpudata *cpudata = vcpu->cpudata;
1404
1405         if (svm_xcr0_mask != 0) {
1406                 wrxcr(0, svm_global_hstate.xcr0);
1407         }
1408
1409 #ifdef __NetBSD__
1410         fpu_area_save(&cpudata->gxsave, svm_xcr0_mask, true);
1411         fpu_kern_leave();
1412 #else /* DragonFly */
1413         fpusave((union savefpu *)&cpudata->gxsave, svm_xcr0_mask);
1414         stts();
1415         npxpop(&cpudata->hstate.hmctx);
1416 #endif
1417 }
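/*
 * Between svm_vcpu_guest_fpu_enter() and svm_vcpu_guest_fpu_leave() the
 * FPU holds the guest's state (loaded from gxsave) and, when XSAVE is
 * supported (svm_xcr0_mask != 0), XCR0 holds the guest's value; both are
 * switched back to the host values before returning to the kernel proper.
 */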
1418
1419 static void
1420 svm_vcpu_guest_dbregs_enter(struct nvmm_cpu *vcpu)
1421 {
1422         struct svm_cpudata *cpudata = vcpu->cpudata;
1423
1424         x86_dbregs_save(curlwp);
1425
1426         ldr7(0);
1427
1428         ldr0(cpudata->drs[NVMM_X64_DR_DR0]);
1429         ldr1(cpudata->drs[NVMM_X64_DR_DR1]);
1430         ldr2(cpudata->drs[NVMM_X64_DR_DR2]);
1431         ldr3(cpudata->drs[NVMM_X64_DR_DR3]);
1432 }
1433
1434 static void
1435 svm_vcpu_guest_dbregs_leave(struct nvmm_cpu *vcpu)
1436 {
1437         struct svm_cpudata *cpudata = vcpu->cpudata;
1438
1439         cpudata->drs[NVMM_X64_DR_DR0] = rdr0();
1440         cpudata->drs[NVMM_X64_DR_DR1] = rdr1();
1441         cpudata->drs[NVMM_X64_DR_DR2] = rdr2();
1442         cpudata->drs[NVMM_X64_DR_DR3] = rdr3();
1443
1444         x86_dbregs_restore(curlwp);
1445 }
1446
1447 static void
1448 svm_vcpu_guest_misc_enter(struct nvmm_cpu *vcpu)
1449 {
1450         struct svm_cpudata *cpudata = vcpu->cpudata;
1451
1452         /* Save the percpu host state. */
1453         cpudata->hstate.fsbase = rdmsr(MSR_FSBASE);
1454         cpudata->hstate.kernelgsbase = rdmsr(MSR_KERNELGSBASE);
1455 }
1456
1457 static void
1458 svm_vcpu_guest_misc_leave(struct nvmm_cpu *vcpu)
1459 {
1460         struct svm_cpudata *cpudata = vcpu->cpudata;
1461
1462         /* Restore the global host state. */
1463         wrmsr(MSR_STAR, svm_global_hstate.star);
1464         wrmsr(MSR_LSTAR, svm_global_hstate.lstar);
1465         wrmsr(MSR_CSTAR, svm_global_hstate.cstar);
1466         wrmsr(MSR_SFMASK, svm_global_hstate.sfmask);
1467
1468         /* Restore the percpu host state. */
1469         wrmsr(MSR_FSBASE, cpudata->hstate.fsbase);
1470         wrmsr(MSR_KERNELGSBASE, cpudata->hstate.kernelgsbase);
1471 }
1472
1473 /* -------------------------------------------------------------------------- */
1474
1475 static inline void
1476 svm_gtlb_catchup(struct nvmm_cpu *vcpu, int hcpu)
1477 {
1478         struct svm_cpudata *cpudata = vcpu->cpudata;
1479
1480         if (vcpu->hcpu_last != hcpu || cpudata->shared_asid) {
1481                 cpudata->gtlb_want_flush = true;
1482         }
1483 }
1484
1485 static inline void
1486 svm_htlb_catchup(struct nvmm_cpu *vcpu, int hcpu)
1487 {
1488         /*
1489          * Nothing to do. If an hTLB flush was needed, either the VCPU was
1490          * executing on this hCPU and the hTLB already got flushed, or it
1491          * was executing on another hCPU in which case the catchup is done
1492          * was executing on another hCPU, in which case the catchup is done
1493          */
1494 }
1495
1496 static inline uint64_t
1497 svm_htlb_flush(struct nvmm_machine *mach, struct svm_cpudata *cpudata)
1498 {
1499         struct vmcb *vmcb = cpudata->vmcb;
1500         uint64_t machgen;
1501
1502         clear_xinvltlb();
1503         machgen = mach->vm->vm_pmap.pm_invgen;
1504         if (__predict_true(machgen == cpudata->vcpu_htlb_gen)) {
1505                 return machgen;
1506         }
1507
1508         cpudata->htlb_want_flush = true;
1509         vmcb->ctrl.tlb_ctrl = svm_ctrl_tlb_flush;
1510         return machgen;
1511 }
1512
1513 static inline void
1514 svm_htlb_flush_ack(struct svm_cpudata *cpudata, uint64_t machgen)
1515 {
1516         struct vmcb *vmcb = cpudata->vmcb;
1517
1518         if (__predict_true(vmcb->ctrl.exitcode != VMCB_EXITCODE_INVALID)) {
1519                 cpudata->vcpu_htlb_gen = machgen;
1520                 cpudata->htlb_want_flush = false;
1521         }
1522 }
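
/*
 * Illustrative sketch, kept under "#if 0" so it is not compiled into the
 * driver: the hTLB handling above boils down to a generation counter.
 * The machine bumps an invalidation generation on every host-side unmap;
 * each VCPU remembers the last generation it flushed for and flushes
 * whenever the two differ, acknowledging only after a successful VMRUN.
 * All names here are local to the example.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

struct example_machine {
	uint64_t inval_gen;	/* bumped on every host-side unmap */
};

struct example_vcpu {
	uint64_t htlb_gen;	/* last generation this VCPU flushed for */
};

static bool
example_htlb_need_flush(const struct example_machine *m,
    const struct example_vcpu *v)
{
	return m->inval_gen != v->htlb_gen;
}

static void
example_htlb_ack(struct example_vcpu *v, uint64_t machgen, bool entry_ok)
{
	if (entry_ok)		/* only ack if the VMRUN actually executed */
		v->htlb_gen = machgen;
}
#endif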
1523
1524 static inline void
1525 svm_exit_evt(struct svm_cpudata *cpudata, struct vmcb *vmcb)
1526 {
1527         cpudata->evt_pending = false;
1528
1529         if (__predict_false(vmcb->ctrl.exitintinfo & VMCB_CTRL_EXITINTINFO_V)) {
1530                 vmcb->ctrl.eventinj = vmcb->ctrl.exitintinfo;
1531                 cpudata->evt_pending = true;
1532         }
1533 }
1534
1535 static int
1536 svm_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
1537     struct nvmm_vcpu_exit *exit)
1538 {
1539         struct nvmm_comm_page *comm = vcpu->comm;
1540         struct svm_cpudata *cpudata = vcpu->cpudata;
1541         struct vmcb *vmcb = cpudata->vmcb;
1542         struct globaldata *gd;
1543         uint64_t machgen;
1544         int hcpu;
1545
1546         svm_vcpu_state_commit(vcpu);
1547         comm->state_cached = 0;
1548
1549         if (__predict_false(svm_vcpu_event_commit(vcpu) != 0)) {
1550                 return EINVAL;
1551         }
1552
1553         kpreempt_disable();
1554         gd = mycpu;
1555         hcpu = gd->gd_cpuid;
1556
1557         svm_gtlb_catchup(vcpu, hcpu);
1558         svm_htlb_catchup(vcpu, hcpu);
1559
1560         if (vcpu->hcpu_last != hcpu) {
1561                 svm_vmcb_cache_flush_all(vmcb);
1562                 cpudata->gtsc_want_update = true;
1563
1564 #ifdef __DragonFly__
1565                 /*
1566                  * XXX: We aren't tracking overloaded CPUs (multiple vCPUs
1567                  *      scheduled on the same physical CPU) yet, so there are
1568                  *      currently no calls to pmap_del_cpu().
1569                  */
1570                 pmap_add_cpu(mach->vm, hcpu);
1571 #endif
1572         }
1573
1574         svm_vcpu_guest_dbregs_enter(vcpu);
1575         svm_vcpu_guest_misc_enter(vcpu);
1576
1577         while (1) {
1578                 if (__predict_false(cpudata->gtlb_want_flush ||
1579                                     cpudata->htlb_want_flush))
1580                 {
1581                         vmcb->ctrl.tlb_ctrl = svm_ctrl_tlb_flush;
1582                 } else {
1583                         vmcb->ctrl.tlb_ctrl = 0;
1584                 }
1585
1586                 if (__predict_false(cpudata->gtsc_want_update)) {
1587                         vmcb->ctrl.tsc_offset = cpudata->gtsc_offset;
1588                         svm_vmcb_cache_flush(vmcb, VMCB_CTRL_VMCB_CLEAN_I);
1589                 }
1590
1591                 svm_clgi();
1592                 svm_vcpu_guest_fpu_enter(vcpu);
1593                 machgen = svm_htlb_flush(mach, cpudata);
1594
1595 #ifdef __DragonFly__
1596                 /*
1597                  * Check for pending host events (e.g., interrupts, ASTs)
1598                  * that must be handled before it is safe to enter the guest.
1599                  */
1600                 if (__predict_false(gd->gd_reqflags & RQF_HVM_MASK)) {
1601                         /* No hTLB flush ack, because it's not executed. */
1602                         svm_vcpu_guest_fpu_leave(vcpu);
1603                         svm_stgi();
1604                         exit->reason = NVMM_VCPU_EXIT_NONE;
1605                         break;
1606                 }
1607 #endif
1608
1609                 svm_vmrun(cpudata->vmcb_pa, cpudata->gprs);
1610                 svm_htlb_flush_ack(cpudata, machgen);
1611                 svm_vcpu_guest_fpu_leave(vcpu);
1612                 svm_stgi();
1613
1614                 svm_vmcb_cache_default(vmcb);
1615
1616                 if (vmcb->ctrl.exitcode != VMCB_EXITCODE_INVALID) {
1617                         cpudata->gtlb_want_flush = false;
1618                         cpudata->gtsc_want_update = false;
1619                         vcpu->hcpu_last = hcpu;
1620                 }
1621                 svm_exit_evt(cpudata, vmcb);
1622
1623                 switch (vmcb->ctrl.exitcode) {
1624                 case VMCB_EXITCODE_INTR:
1625                 case VMCB_EXITCODE_NMI:
1626                         exit->reason = NVMM_VCPU_EXIT_NONE;
1627                         break;
1628                 case VMCB_EXITCODE_VINTR:
1629                         svm_event_waitexit_disable(vcpu, false);
1630                         exit->reason = NVMM_VCPU_EXIT_INT_READY;
1631                         break;
1632                 case VMCB_EXITCODE_IRET:
1633                         svm_event_waitexit_disable(vcpu, true);
1634                         exit->reason = NVMM_VCPU_EXIT_NMI_READY;
1635                         break;
1636                 case VMCB_EXITCODE_CPUID:
1637                         svm_exit_cpuid(mach, vcpu, exit);
1638                         break;
1639                 case VMCB_EXITCODE_HLT:
1640                         svm_exit_hlt(mach, vcpu, exit);
1641                         break;
1642                 case VMCB_EXITCODE_IOIO:
1643                         svm_exit_io(mach, vcpu, exit);
1644                         break;
1645                 case VMCB_EXITCODE_MSR:
1646                         svm_exit_msr(mach, vcpu, exit);
1647                         break;
1648                 case VMCB_EXITCODE_SHUTDOWN:
1649                         exit->reason = NVMM_VCPU_EXIT_SHUTDOWN;
1650                         break;
1651                 case VMCB_EXITCODE_RDPMC:
1652                 case VMCB_EXITCODE_RSM:
1653                 case VMCB_EXITCODE_INVLPGA:
1654                 case VMCB_EXITCODE_VMRUN:
1655                 case VMCB_EXITCODE_VMMCALL:
1656                 case VMCB_EXITCODE_VMLOAD:
1657                 case VMCB_EXITCODE_VMSAVE:
1658                 case VMCB_EXITCODE_STGI:
1659                 case VMCB_EXITCODE_CLGI:
1660                 case VMCB_EXITCODE_SKINIT:
1661                 case VMCB_EXITCODE_RDTSCP:
1662                 case VMCB_EXITCODE_RDPRU:
1663                 case VMCB_EXITCODE_INVLPGB:
1664                 case VMCB_EXITCODE_INVPCID:
1665                 case VMCB_EXITCODE_MCOMMIT:
1666                 case VMCB_EXITCODE_TLBSYNC:
1667                         svm_inject_ud(vcpu);
1668                         exit->reason = NVMM_VCPU_EXIT_NONE;
1669                         break;
1670                 case VMCB_EXITCODE_MONITOR:
1671                         svm_exit_insn(vmcb, exit, NVMM_VCPU_EXIT_MONITOR);
1672                         break;
1673                 case VMCB_EXITCODE_MWAIT:
1674                 case VMCB_EXITCODE_MWAIT_CONDITIONAL:
1675                         svm_exit_insn(vmcb, exit, NVMM_VCPU_EXIT_MWAIT);
1676                         break;
1677                 case VMCB_EXITCODE_XSETBV:
1678                         svm_exit_xsetbv(mach, vcpu, exit);
1679                         break;
1680                 case VMCB_EXITCODE_NPF:
1681                         svm_exit_npf(mach, vcpu, exit);
1682                         break;
1683                 case VMCB_EXITCODE_FERR_FREEZE: /* ? */
1684                 default:
1685                         svm_exit_invalid(exit, vmcb->ctrl.exitcode);
1686                         break;
1687                 }
1688
1689                 /* If no reason to return to userland, keep rolling. */
1690                 if (nvmm_return_needed()) {
1691                         break;
1692                 }
1693                 if (exit->reason != NVMM_VCPU_EXIT_NONE) {
1694                         break;
1695                 }
1696         }
1697
1698         svm_vcpu_guest_misc_leave(vcpu);
1699         svm_vcpu_guest_dbregs_leave(vcpu);
1700
1701         kpreempt_enable();
1702
1703         exit->exitstate.rflags = vmcb->state.rflags;
1704         exit->exitstate.cr8 = __SHIFTOUT(vmcb->ctrl.v, VMCB_CTRL_V_TPR);
1705         exit->exitstate.int_shadow =
1706             ((vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0);
1707         exit->exitstate.int_window_exiting = cpudata->int_window_exit;
1708         exit->exitstate.nmi_window_exiting = cpudata->nmi_window_exit;
1709         exit->exitstate.evt_pending = cpudata->evt_pending;
1710
1711         return 0;
1712 }
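
/*
 * Illustrative sketch, kept under "#if 0" so it is not compiled into the
 * driver: how a userland VMM typically consumes the exit reasons produced
 * by svm_vcpu_run(), through libnvmm.  This assumes the usual libnvmm(3)
 * entry points (nvmm_machine_create, nvmm_vcpu_create, nvmm_vcpu_run and
 * the vcpu exit pointer); check the library headers for the exact
 * prototypes before relying on them.
 */
#if 0
#include <nvmm.h>

static int
example_run_loop(void)
{
	struct nvmm_machine mach;
	struct nvmm_vcpu vcpu;

	if (nvmm_machine_create(&mach) == -1)
		return -1;
	if (nvmm_vcpu_create(&mach, 0, &vcpu) == -1)
		return -1;

	/* ... map guest memory and set the initial VCPU state here ... */

	for (;;) {
		if (nvmm_vcpu_run(&mach, &vcpu) == -1)
			return -1;
		switch (vcpu.exit->reason) {
		case NVMM_VCPU_EXIT_NONE:
			continue;	/* nothing to emulate, re-enter */
		case NVMM_VCPU_EXIT_MEMORY:
			/* emulate the MMIO access (e.g. nvmm_assist_mem()) */
			break;
		case NVMM_VCPU_EXIT_IO:
			/* emulate the port I/O access (e.g. nvmm_assist_io()) */
			break;
		case NVMM_VCPU_EXIT_SHUTDOWN:
			return 0;	/* guest shut down */
		default:
			break;
		}
	}
}
#endif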
1713
1714 /* -------------------------------------------------------------------------- */
1715
1716 static int
1717 svm_memalloc(paddr_t *pa, vaddr_t *va, size_t npages)
1718 {
1719 #ifdef __NetBSD__
1720         struct pglist pglist;
1721         paddr_t _pa;
1722         vaddr_t _va;
1723         size_t i;
1724         int ret;
1725
1726         ret = uvm_pglistalloc(npages * PAGE_SIZE, 0, ~0UL, PAGE_SIZE, 0,
1727             &pglist, 1, 0);
1728         if (ret != 0)
1729                 return ENOMEM;
1730         _pa = VM_PAGE_TO_PHYS(TAILQ_FIRST(&pglist));
1731         _va = uvm_km_alloc(kernel_map, npages * PAGE_SIZE, 0,
1732             UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
1733         if (_va == 0)
1734                 goto error;
1735
1736         for (i = 0; i < npages; i++) {
1737                 pmap_kenter_pa(_va + i * PAGE_SIZE, _pa + i * PAGE_SIZE,
1738                     VM_PROT_READ | VM_PROT_WRITE, PMAP_WRITE_BACK);
1739         }
1740         pmap_update(pmap_kernel());
1741
1742         memset((void *)_va, 0, npages * PAGE_SIZE);
1743
1744         *pa = _pa;
1745         *va = _va;
1746         return 0;
1747
1748 error:
1749         for (i = 0; i < npages; i++) {
1750                 uvm_pagefree(PHYS_TO_VM_PAGE(_pa + i * PAGE_SIZE));
1751         }
1752         return ENOMEM;
1753
1754 #else /* DragonFly */
1755         void *addr;
1756
1757         addr = contigmalloc(npages * PAGE_SIZE, M_NVMM, M_WAITOK | M_ZERO,
1758             0, ~0UL, PAGE_SIZE, 0);
1759         if (addr == NULL)
1760                 return ENOMEM;
1761
1762         *va = (vaddr_t)addr;
1763         *pa = vtophys(addr);
1764         return 0;
1765 #endif /* __NetBSD__ */
1766 }
1767
1768 static void
1769 svm_memfree(paddr_t pa __unused, vaddr_t va, size_t npages)
1770 {
1771 #ifdef __NetBSD__
1772         size_t i;
1773
1774         pmap_kremove(va, npages * PAGE_SIZE);
1775         pmap_update(pmap_kernel());
1776         uvm_km_free(kernel_map, va, npages * PAGE_SIZE, UVM_KMF_VAONLY);
1777         for (i = 0; i < npages; i++) {
1778                 uvm_pagefree(PHYS_TO_VM_PAGE(pa + i * PAGE_SIZE));
1779         }
1780 #else /* DragonFly */
1781         contigfree((void *)va, npages * PAGE_SIZE, M_NVMM);
1782 #endif /* __NetBSD__ */
1783 }
1784
1785 /* -------------------------------------------------------------------------- */
1786
1787 #define SVM_MSRBM_READ  __BIT(0)
1788 #define SVM_MSRBM_WRITE __BIT(1)
1789
1790 static void
1791 svm_vcpu_msr_allow(uint8_t *bitmap, uint64_t msr, bool read, bool write)
1792 {
1793         uint64_t byte;
1794         uint8_t bitoff;
1795
1796         if (msr < 0x00002000) {
1797                 /* Range 1 */
1798                 byte = ((msr - 0x00000000) >> 2UL) + 0x0000;
1799         } else if (msr >= 0xC0000000 && msr < 0xC0002000) {
1800                 /* Range 2 */
1801                 byte = ((msr - 0xC0000000) >> 2UL) + 0x0800;
1802         } else if (msr >= 0xC0010000 && msr < 0xC0012000) {
1803                 /* Range 3 */
1804                 byte = ((msr - 0xC0010000) >> 2UL) + 0x1000;
1805         } else {
1806                 panic("%s: wrong range", __func__);
1807         }
1808
1809         bitoff = (msr & 0x3) << 1;
1810
1811         if (read) {
1812                 bitmap[byte] &= ~(SVM_MSRBM_READ << bitoff);
1813         }
1814         if (write) {
1815                 bitmap[byte] &= ~(SVM_MSRBM_WRITE << bitoff);
1816         }
1817 }
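
/*
 * Illustrative sketch, kept under "#if 0" so it is not compiled into the
 * driver: the MSR permission map covers three MSR ranges with two bits
 * (read/write) per MSR, i.e. four MSRs per byte.  This small program
 * prints the byte and bit offsets for a couple of well-known MSRs, using
 * the same range arithmetic as svm_vcpu_msr_allow() above.  All names are
 * local to the example.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static int
example_msrbm_locate(uint64_t msr, uint64_t *bytep, uint8_t *bitoffp)
{
	if (msr < 0x00002000)					/* range 1 */
		*bytep = ((msr - 0x00000000) >> 2) + 0x0000;
	else if (msr >= 0xC0000000 && msr < 0xC0002000)		/* range 2 */
		*bytep = ((msr - 0xC0000000) >> 2) + 0x0800;
	else if (msr >= 0xC0010000 && msr < 0xC0012000)		/* range 3 */
		*bytep = ((msr - 0xC0010000) >> 2) + 0x1000;
	else
		return -1;					/* not covered */
	*bitoffp = (msr & 0x3) << 1;	/* read bit; the write bit is +1 */
	return 0;
}

int
main(void)
{
	const uint64_t msrs[] = { 0x00000277 /* PAT */, 0xC0000081 /* STAR */ };
	for (size_t i = 0; i < sizeof(msrs) / sizeof(msrs[0]); i++) {
		uint64_t byte;
		uint8_t bitoff;
		if (example_msrbm_locate(msrs[i], &byte, &bitoff) == 0)
			printf("MSR %#llx -> byte %#llx, bit %u\n",
			    (unsigned long long)msrs[i],
			    (unsigned long long)byte, (unsigned)bitoff);
	}
	return 0;
}
#endif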
1818
1819 #define SVM_SEG_ATTRIB_TYPE             __BITS(3,0)
1820 #define SVM_SEG_ATTRIB_S                __BIT(4)
1821 #define SVM_SEG_ATTRIB_DPL              __BITS(6,5)
1822 #define SVM_SEG_ATTRIB_P                __BIT(7)
1823 #define SVM_SEG_ATTRIB_AVL              __BIT(8)
1824 #define SVM_SEG_ATTRIB_L                __BIT(9)
1825 #define SVM_SEG_ATTRIB_DEF              __BIT(10)
1826 #define SVM_SEG_ATTRIB_G                __BIT(11)
1827
1828 static void
1829 svm_vcpu_setstate_seg(const struct nvmm_x64_state_seg *seg,
1830     struct vmcb_segment *vseg)
1831 {
1832         vseg->selector = seg->selector;
1833         vseg->attrib =
1834             __SHIFTIN(seg->attrib.type, SVM_SEG_ATTRIB_TYPE) |
1835             __SHIFTIN(seg->attrib.s, SVM_SEG_ATTRIB_S) |
1836             __SHIFTIN(seg->attrib.dpl, SVM_SEG_ATTRIB_DPL) |
1837             __SHIFTIN(seg->attrib.p, SVM_SEG_ATTRIB_P) |
1838             __SHIFTIN(seg->attrib.avl, SVM_SEG_ATTRIB_AVL) |
1839             __SHIFTIN(seg->attrib.l, SVM_SEG_ATTRIB_L) |
1840             __SHIFTIN(seg->attrib.def, SVM_SEG_ATTRIB_DEF) |
1841             __SHIFTIN(seg->attrib.g, SVM_SEG_ATTRIB_G);
1842         vseg->limit = seg->limit;
1843         vseg->base = seg->base;
1844 }
1845
1846 static void
1847 svm_vcpu_getstate_seg(struct nvmm_x64_state_seg *seg, struct vmcb_segment *vseg)
1848 {
1849         seg->selector = vseg->selector;
1850         seg->attrib.type = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_TYPE);
1851         seg->attrib.s = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_S);
1852         seg->attrib.dpl = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_DPL);
1853         seg->attrib.p = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_P);
1854         seg->attrib.avl = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_AVL);
1855         seg->attrib.l = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_L);
1856         seg->attrib.def = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_DEF);
1857         seg->attrib.g = __SHIFTOUT(vseg->attrib, SVM_SEG_ATTRIB_G);
1858         seg->limit = vseg->limit;
1859         seg->base = vseg->base;
1860 }
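
/*
 * Illustrative sketch, kept under "#if 0" so it is not compiled into the
 * driver: the VMCB "attrib" field packs the segment attribute bits
 * contiguously (type 3:0, s 4, dpl 6:5, p 7, avl 8, l 9, def 10, g 11),
 * which is exactly what the __SHIFTIN()/__SHIFTOUT() pairs above do.
 * A round trip with plain shifts, using names local to the example:
 */
#if 0
#include <assert.h>
#include <stdint.h>

struct example_seg_attrib {
	unsigned type:4, s:1, dpl:2, p:1, avl:1, l:1, def:1, g:1;
};

static uint16_t
example_attrib_pack(const struct example_seg_attrib *a)
{
	return (a->type << 0) | (a->s << 4) | (a->dpl << 5) | (a->p << 7) |
	    (a->avl << 8) | (a->l << 9) | (a->def << 10) | (a->g << 11);
}

static struct example_seg_attrib
example_attrib_unpack(uint16_t v)
{
	struct example_seg_attrib a = {
		.type = (v >> 0) & 0xF, .s = (v >> 4) & 1, .dpl = (v >> 5) & 3,
		.p = (v >> 7) & 1, .avl = (v >> 8) & 1, .l = (v >> 9) & 1,
		.def = (v >> 10) & 1, .g = (v >> 11) & 1,
	};
	return a;
}

int
main(void)
{
	/* A 64-bit code segment: type=0xB, s=1, dpl=0, p=1, l=1, g=1. */
	struct example_seg_attrib cs = {
		.type = 0xB, .s = 1, .p = 1, .l = 1, .g = 1,
	};
	uint16_t v = example_attrib_pack(&cs);
	assert(example_attrib_unpack(v).type == 0xB);
	assert(example_attrib_unpack(v).l == 1);
	return 0;
}
#endif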
1861
1862 static inline bool
1863 svm_state_gtlb_flush(const struct vmcb *vmcb, const struct nvmm_x64_state *state,
1864     uint64_t flags)
1865 {
1866         if (flags & NVMM_X64_STATE_CRS) {
1867                 if ((vmcb->state.cr0 ^
1868                      state->crs[NVMM_X64_CR_CR0]) & CR0_TLB_FLUSH) {
1869                         return true;
1870                 }
1871                 if (vmcb->state.cr3 != state->crs[NVMM_X64_CR_CR3]) {
1872                         return true;
1873                 }
1874                 if ((vmcb->state.cr4 ^
1875                      state->crs[NVMM_X64_CR_CR4]) & CR4_TLB_FLUSH) {
1876                         return true;
1877                 }
1878         }
1879
1880         if (flags & NVMM_X64_STATE_MSRS) {
1881                 if ((vmcb->state.efer ^
1882                      state->msrs[NVMM_X64_MSR_EFER]) & EFER_TLB_FLUSH) {
1883                         return true;
1884                 }
1885         }
1886
1887         return false;
1888 }
1889
1890 static void
1891 svm_vcpu_setstate(struct nvmm_cpu *vcpu)
1892 {
1893         struct nvmm_comm_page *comm = vcpu->comm;
1894         const struct nvmm_x64_state *state = &comm->state;
1895         struct svm_cpudata *cpudata = vcpu->cpudata;
1896         struct vmcb *vmcb = cpudata->vmcb;
1897         struct nvmm_x64_state_fpu *fpustate;
1898         uint64_t flags;
1899
1900         flags = comm->state_wanted;
1901
1902         if (svm_state_gtlb_flush(vmcb, state, flags)) {
1903                 cpudata->gtlb_want_flush = true;
1904         }
1905
1906         if (flags & NVMM_X64_STATE_SEGS) {
1907                 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_CS],
1908                     &vmcb->state.cs);
1909                 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_DS],
1910                     &vmcb->state.ds);
1911                 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_ES],
1912                     &vmcb->state.es);
1913                 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_FS],
1914                     &vmcb->state.fs);
1915                 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_GS],
1916                     &vmcb->state.gs);
1917                 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_SS],
1918                     &vmcb->state.ss);
1919                 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_GDT],
1920                     &vmcb->state.gdt);
1921                 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_IDT],
1922                     &vmcb->state.idt);
1923                 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_LDT],
1924                     &vmcb->state.ldt);
1925                 svm_vcpu_setstate_seg(&state->segs[NVMM_X64_SEG_TR],
1926                     &vmcb->state.tr);
1927
1928                 vmcb->state.cpl = state->segs[NVMM_X64_SEG_SS].attrib.dpl;
1929         }
1930
1931         CTASSERT(sizeof(cpudata->gprs) == sizeof(state->gprs));
1932         if (flags & NVMM_X64_STATE_GPRS) {
1933                 memcpy(cpudata->gprs, state->gprs, sizeof(state->gprs));
1934
1935                 vmcb->state.rip = state->gprs[NVMM_X64_GPR_RIP];
1936                 vmcb->state.rsp = state->gprs[NVMM_X64_GPR_RSP];
1937                 vmcb->state.rax = state->gprs[NVMM_X64_GPR_RAX];
1938                 vmcb->state.rflags = state->gprs[NVMM_X64_GPR_RFLAGS];
1939         }
1940
1941         if (flags & NVMM_X64_STATE_CRS) {
1942                 vmcb->state.cr0 = state->crs[NVMM_X64_CR_CR0];
1943                 vmcb->state.cr2 = state->crs[NVMM_X64_CR_CR2];
1944                 vmcb->state.cr3 = state->crs[NVMM_X64_CR_CR3];
1945                 vmcb->state.cr4 = state->crs[NVMM_X64_CR_CR4];
1946
1947                 vmcb->ctrl.v &= ~VMCB_CTRL_V_TPR;
1948                 vmcb->ctrl.v |= __SHIFTIN(state->crs[NVMM_X64_CR_CR8],
1949                     VMCB_CTRL_V_TPR);
1950
1951                 if (svm_xcr0_mask != 0) {
1952                         /* Clear illegal XCR0 bits, set mandatory X87 bit. */
1953                         cpudata->gxcr0 = state->crs[NVMM_X64_CR_XCR0];
1954                         cpudata->gxcr0 &= svm_xcr0_mask;
1955                         cpudata->gxcr0 |= XCR0_X87;
1956                 }
1957         }
1958
1959         CTASSERT(sizeof(cpudata->drs) == sizeof(state->drs));
1960         if (flags & NVMM_X64_STATE_DRS) {
1961                 memcpy(cpudata->drs, state->drs, sizeof(state->drs));
1962
1963                 vmcb->state.dr6 = state->drs[NVMM_X64_DR_DR6];
1964                 vmcb->state.dr7 = state->drs[NVMM_X64_DR_DR7];
1965         }
1966
1967         if (flags & NVMM_X64_STATE_MSRS) {
1968                 /*
1969                  * EFER_SVME is mandatory.
1970                  */
1971                 vmcb->state.efer = state->msrs[NVMM_X64_MSR_EFER] | EFER_SVME;
1972                 vmcb->state.star = state->msrs[NVMM_X64_MSR_STAR];
1973                 vmcb->state.lstar = state->msrs[NVMM_X64_MSR_LSTAR];
1974                 vmcb->state.cstar = state->msrs[NVMM_X64_MSR_CSTAR];
1975                 vmcb->state.sfmask = state->msrs[NVMM_X64_MSR_SFMASK];
1976                 vmcb->state.kernelgsbase =
1977                     state->msrs[NVMM_X64_MSR_KERNELGSBASE];
1978                 vmcb->state.sysenter_cs =
1979                     state->msrs[NVMM_X64_MSR_SYSENTER_CS];
1980                 vmcb->state.sysenter_esp =
1981                     state->msrs[NVMM_X64_MSR_SYSENTER_ESP];
1982                 vmcb->state.sysenter_eip =
1983                     state->msrs[NVMM_X64_MSR_SYSENTER_EIP];
1984                 vmcb->state.g_pat = state->msrs[NVMM_X64_MSR_PAT];
1985
1986                 /*
1987                  * The emulator (QEMU, for instance) most likely did NOT intend
1988                  * to change the TSC, since doing so would destroy the TSC
1989                  * MP-synchronization across logical CPUs.  Try to infer what it
1990                  * actually meant to do (a small offset sketch follows after
1991                  * this function).
1992                  *
1993                  * If the value written is the last TSC value we reported via
1994                  * getstate, assume that the emulator does not intend to modify
1995                  * the TSC.  QEMU also appears to issue a setstate with the
1996                  * value 0 after a 'reboot', so for now ignore that case as
1997                  * well.
1998                  */
1999                 if (state->msrs[NVMM_X64_MSR_TSC] != cpudata->gtsc_match &&
2000                     state->msrs[NVMM_X64_MSR_TSC] != 0) {
2001                         cpudata->gtsc_offset =
2002                             state->msrs[NVMM_X64_MSR_TSC] - rdtsc();
2003                         cpudata->gtsc_want_update = true;
2004                 }
2005         }
2006
2007         if (flags & NVMM_X64_STATE_INTR) {
2008                 if (state->intr.int_shadow) {
2009                         vmcb->ctrl.intr |= VMCB_CTRL_INTR_SHADOW;
2010                 } else {
2011                         vmcb->ctrl.intr &= ~VMCB_CTRL_INTR_SHADOW;
2012                 }
2013
2014                 if (state->intr.int_window_exiting) {
2015                         svm_event_waitexit_enable(vcpu, false);
2016                 } else {
2017                         svm_event_waitexit_disable(vcpu, false);
2018                 }
2019
2020                 if (state->intr.nmi_window_exiting) {
2021                         svm_event_waitexit_enable(vcpu, true);
2022                 } else {
2023                         svm_event_waitexit_disable(vcpu, true);
2024                 }
2025         }
2026
2027         CTASSERT(sizeof(cpudata->gxsave.fpu) == sizeof(state->fpu));
2028         if (flags & NVMM_X64_STATE_FPU) {
2029                 memcpy(&cpudata->gxsave.fpu, &state->fpu, sizeof(state->fpu));
2030
2031                 fpustate = (struct nvmm_x64_state_fpu *)&cpudata->gxsave.fpu;
2032                 fpustate->fx_mxcsr_mask &= x86_fpu_mxcsr_mask;
2033                 fpustate->fx_mxcsr &= fpustate->fx_mxcsr_mask;
2034
2035                 if (svm_xcr0_mask != 0) {
2036                         /* Reset XSTATE_BV, to force a reload. */
2037                         cpudata->gxsave.xstate_bv = svm_xcr0_mask;
2038                 }
2039         }
2040
2041         svm_vmcb_cache_update(vmcb, flags);
2042
2043         comm->state_wanted = 0;
2044         comm->state_cached |= flags;
2045 }
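
/*
 * Illustrative sketch, kept under "#if 0" so it is not compiled into the
 * driver: the guest TSC is virtualized purely through an offset,
 * guest_tsc = host_tsc + offset.  To make the guest observe a target
 * value, the offset is recomputed as target - host_tsc, and writes that
 * merely echo the last getstate value (or write 0) are ignored, as in the
 * MSR handling above.  All names are local to the example.
 */
#if 0
#include <stdint.h>

struct example_tsc_state {
	uint64_t offset;	/* added to the host TSC by the hardware */
	uint64_t last_reported;	/* last value handed out by getstate */
};

/* Getstate side: report the current guest view of the TSC. */
static uint64_t
example_tsc_get(struct example_tsc_state *t, uint64_t host_tsc)
{
	t->last_reported = host_tsc + t->offset;
	return t->last_reported;
}

/* Setstate side: only honor writes that are not echoes of getstate or 0. */
static void
example_tsc_set(struct example_tsc_state *t, uint64_t host_tsc, uint64_t val)
{
	if (val != t->last_reported && val != 0)
		t->offset = val - host_tsc;
}
#endif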
2046
2047 static void
2048 svm_vcpu_getstate(struct nvmm_cpu *vcpu)
2049 {
2050         struct nvmm_comm_page *comm = vcpu->comm;
2051         struct nvmm_x64_state *state = &comm->state;
2052         struct svm_cpudata *cpudata = vcpu->cpudata;
2053         struct vmcb *vmcb = cpudata->vmcb;
2054         uint64_t flags;
2055
2056         flags = comm->state_wanted;
2057
2058         if (flags & NVMM_X64_STATE_SEGS) {
2059                 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_CS],
2060                     &vmcb->state.cs);
2061                 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_DS],
2062                     &vmcb->state.ds);
2063                 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_ES],
2064                     &vmcb->state.es);
2065                 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_FS],
2066                     &vmcb->state.fs);
2067                 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_GS],
2068                     &vmcb->state.gs);
2069                 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_SS],
2070                     &vmcb->state.ss);
2071                 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_GDT],
2072                     &vmcb->state.gdt);
2073                 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_IDT],
2074                     &vmcb->state.idt);
2075                 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_LDT],
2076                     &vmcb->state.ldt);
2077                 svm_vcpu_getstate_seg(&state->segs[NVMM_X64_SEG_TR],
2078                     &vmcb->state.tr);
2079
2080                 state->segs[NVMM_X64_SEG_SS].attrib.dpl = vmcb->state.cpl;
2081         }
2082
2083         CTASSERT(sizeof(cpudata->gprs) == sizeof(state->gprs));
2084         if (flags & NVMM_X64_STATE_GPRS) {
2085                 memcpy(state->gprs, cpudata->gprs, sizeof(state->gprs));
2086
2087                 state->gprs[NVMM_X64_GPR_RIP] = vmcb->state.rip;
2088                 state->gprs[NVMM_X64_GPR_RSP] = vmcb->state.rsp;
2089                 state->gprs[NVMM_X64_GPR_RAX] = vmcb->state.rax;
2090                 state->gprs[NVMM_X64_GPR_RFLAGS] = vmcb->state.rflags;
2091         }
2092
2093         if (flags & NVMM_X64_STATE_CRS) {
2094                 state->crs[NVMM_X64_CR_CR0] = vmcb->state.cr0;
2095                 state->crs[NVMM_X64_CR_CR2] = vmcb->state.cr2;
2096                 state->crs[NVMM_X64_CR_CR3] = vmcb->state.cr3;
2097                 state->crs[NVMM_X64_CR_CR4] = vmcb->state.cr4;
2098                 state->crs[NVMM_X64_CR_CR8] = __SHIFTOUT(vmcb->ctrl.v,
2099                     VMCB_CTRL_V_TPR);
2100                 state->crs[NVMM_X64_CR_XCR0] = cpudata->gxcr0;
2101         }
2102
2103         CTASSERT(sizeof(cpudata->drs) == sizeof(state->drs));
2104         if (flags & NVMM_X64_STATE_DRS) {
2105                 memcpy(state->drs, cpudata->drs, sizeof(state->drs));
2106
2107                 state->drs[NVMM_X64_DR_DR6] = vmcb->state.dr6;
2108                 state->drs[NVMM_X64_DR_DR7] = vmcb->state.dr7;
2109         }
2110
2111         if (flags & NVMM_X64_STATE_MSRS) {
2112                 state->msrs[NVMM_X64_MSR_EFER] = vmcb->state.efer;
2113                 state->msrs[NVMM_X64_MSR_STAR] = vmcb->state.star;
2114                 state->msrs[NVMM_X64_MSR_LSTAR] = vmcb->state.lstar;
2115                 state->msrs[NVMM_X64_MSR_CSTAR] = vmcb->state.cstar;
2116                 state->msrs[NVMM_X64_MSR_SFMASK] = vmcb->state.sfmask;
2117                 state->msrs[NVMM_X64_MSR_KERNELGSBASE] =
2118                     vmcb->state.kernelgsbase;
2119                 state->msrs[NVMM_X64_MSR_SYSENTER_CS] =
2120                     vmcb->state.sysenter_cs;
2121                 state->msrs[NVMM_X64_MSR_SYSENTER_ESP] =
2122                     vmcb->state.sysenter_esp;
2123                 state->msrs[NVMM_X64_MSR_SYSENTER_EIP] =
2124                     vmcb->state.sysenter_eip;
2125                 state->msrs[NVMM_X64_MSR_PAT] = vmcb->state.g_pat;
2126                 state->msrs[NVMM_X64_MSR_TSC] = rdtsc() + cpudata->gtsc_offset;
2127
2128                 /* Hide SVME. */
2129                 state->msrs[NVMM_X64_MSR_EFER] &= ~EFER_SVME;
2130
2131                 /* Save the reported TSC value for the setstate heuristic. */
2132                 cpudata->gtsc_match = state->msrs[NVMM_X64_MSR_TSC];
2133         }
2134
2135         if (flags & NVMM_X64_STATE_INTR) {
2136                 state->intr.int_shadow =
2137                     (vmcb->ctrl.intr & VMCB_CTRL_INTR_SHADOW) != 0;
2138                 state->intr.int_window_exiting = cpudata->int_window_exit;
2139                 state->intr.nmi_window_exiting = cpudata->nmi_window_exit;
2140                 state->intr.evt_pending = cpudata->evt_pending;
2141         }
2142
2143         CTASSERT(sizeof(cpudata->gxsave.fpu) == sizeof(state->fpu));
2144         if (flags & NVMM_X64_STATE_FPU) {
2145                 memcpy(&state->fpu, &cpudata->gxsave.fpu, sizeof(state->fpu));
2146         }
2147
2148         comm->state_wanted = 0;
2149         comm->state_cached |= flags;
2150 }
2151
2152 static void
2153 svm_vcpu_state_provide(struct nvmm_cpu *vcpu, uint64_t flags)
2154 {
2155         vcpu->comm->state_wanted = flags;
2156         svm_vcpu_getstate(vcpu);
2157 }
2158
2159 static void
2160 svm_vcpu_state_commit(struct nvmm_cpu *vcpu)
2161 {
2162         vcpu->comm->state_wanted = vcpu->comm->state_commit;
2163         vcpu->comm->state_commit = 0;
2164         svm_vcpu_setstate(vcpu);
2165 }
2166
2167 /* -------------------------------------------------------------------------- */
2168
2169 static void
2170 svm_asid_alloc(struct nvmm_cpu *vcpu)
2171 {
2172         struct svm_cpudata *cpudata = vcpu->cpudata;
2173         struct vmcb *vmcb = cpudata->vmcb;
2174         size_t i, oct, bit;
2175
2176         mutex_enter(&svm_asidlock);
2177
2178         for (i = 0; i < svm_maxasid; i++) {
2179                 oct = i / 8;
2180                 bit = i % 8;
2181
2182                 if (svm_asidmap[oct] & __BIT(bit)) {
2183                         continue;
2184                 }
2185
2186                 svm_asidmap[oct] |= __BIT(bit);
2187                 vmcb->ctrl.guest_asid = i;
2188                 mutex_exit(&svm_asidlock);
2189                 return;
2190         }
2191
2192         /*
2193          * No free ASID. Use the last one, which is shared and requires
2194          * special TLB handling.
2195          */
2196         cpudata->shared_asid = true;
2197         vmcb->ctrl.guest_asid = svm_maxasid - 1;
2198         mutex_exit(&svm_asidlock);
2199 }
2200
2201 static void
2202 svm_asid_free(struct nvmm_cpu *vcpu)
2203 {
2204         struct svm_cpudata *cpudata = vcpu->cpudata;
2205         struct vmcb *vmcb = cpudata->vmcb;
2206         size_t oct, bit;
2207
2208         if (cpudata->shared_asid) {
2209                 return;
2210         }
2211
2212         oct = vmcb->ctrl.guest_asid / 8;
2213         bit = vmcb->ctrl.guest_asid % 8;
2214
2215         mutex_enter(&svm_asidlock);
2216         svm_asidmap[oct] &= ~__BIT(bit);
2217         mutex_exit(&svm_asidlock);
2218 }
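
/*
 * Illustrative sketch, kept under "#if 0" so it is not compiled into the
 * driver: the ASID allocator above is a byte-array bitmap with two
 * reserved entries (ASID 0 for the host, ASID n-1 as the shared fallback
 * once the pool is exhausted).  A minimal user-space version of the same
 * idea, with names local to the example:
 */
#if 0
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

struct example_asid_pool {
	uint8_t *map;		/* one bit per ASID */
	uint32_t maxasid;
};

static bool
example_asid_pool_init(struct example_asid_pool *p, uint32_t maxasid)
{
	p->map = calloc((maxasid + 7) / 8, 1);
	if (p->map == NULL)
		return false;
	p->maxasid = maxasid;
	p->map[0] |= 0x01;					/* host */
	p->map[(maxasid - 1) / 8] |= 1U << ((maxasid - 1) % 8);	/* shared */
	return true;
}

/* Returns a private ASID, or maxasid-1 (the shared one) when exhausted. */
static uint32_t
example_asid_alloc(struct example_asid_pool *p, bool *sharedp)
{
	for (uint32_t i = 0; i < p->maxasid; i++) {
		if (p->map[i / 8] & (1U << (i % 8)))
			continue;
		p->map[i / 8] |= 1U << (i % 8);
		*sharedp = false;
		return i;
	}
	*sharedp = true;
	return p->maxasid - 1;
}

static void
example_asid_free(struct example_asid_pool *p, uint32_t asid, bool shared)
{
	if (!shared)
		p->map[asid / 8] &= ~(1U << (asid % 8));
}
#endif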
2219
2220 static void
2221 svm_vcpu_init(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
2222 {
2223         struct svm_cpudata *cpudata = vcpu->cpudata;
2224         struct vmcb *vmcb = cpudata->vmcb;
2225
2226         /* Allow reads/writes of Control Registers. */
2227         vmcb->ctrl.intercept_cr = 0;
2228
2229         /* Allow reads/writes of Debug Registers. */
2230         vmcb->ctrl.intercept_dr = 0;
2231
2232         /* Allow exceptions 0 to 31. */
2233         vmcb->ctrl.intercept_vec = 0;
2234
2235         /*
2236          * Allow:
2237          *  - SMI [smm interrupts]
2238          *  - VINTR [virtual interrupts]
2239          *  - CR0_SPEC [CR0 writes that change bits other than CR0.TS or CR0.MP]
2240          *  - RIDTR [reads of IDTR]
2241          *  - RGDTR [reads of GDTR]
2242          *  - RLDTR [reads of LDTR]
2243          *  - RTR [reads of TR]
2244          *  - WIDTR [writes of IDTR]
2245          *  - WGDTR [writes of GDTR]
2246          *  - WLDTR [writes of LDTR]
2247          *  - WTR [writes of TR]
2248          *  - RDTSC [rdtsc instruction]
2249          *  - PUSHF [pushf instruction]
2250          *  - POPF [popf instruction]
2251          *  - IRET [iret instruction]
2252          *  - INTN [int $n instructions]
2253          *  - PAUSE [pause instruction]
2254          *  - INVLPG [invlpg instruction]
2255          *  - TASKSW [task switches]
2256          *
2257          * Intercept the rest below.
2258          */
2259         vmcb->ctrl.intercept_misc1 =
2260             VMCB_CTRL_INTERCEPT_INTR |
2261             VMCB_CTRL_INTERCEPT_NMI |
2262             VMCB_CTRL_INTERCEPT_INIT |
2263             VMCB_CTRL_INTERCEPT_RDPMC |
2264             VMCB_CTRL_INTERCEPT_CPUID |
2265             VMCB_CTRL_INTERCEPT_RSM |
2266             VMCB_CTRL_INTERCEPT_INVD |
2267             VMCB_CTRL_INTERCEPT_HLT |
2268             VMCB_CTRL_INTERCEPT_INVLPGA |
2269             VMCB_CTRL_INTERCEPT_IOIO_PROT |
2270             VMCB_CTRL_INTERCEPT_MSR_PROT |
2271             VMCB_CTRL_INTERCEPT_FERR_FREEZE |
2272             VMCB_CTRL_INTERCEPT_SHUTDOWN;
2273
2274         /*
2275          * Allow:
2276          *  - ICEBP [icebp instruction]
2277          *  - WBINVD [wbinvd instruction]
2278          *  - WCR_SPEC(0..15) [writes of CR0-15, received after instruction]
2279          *
2280          * Intercept the rest below.
2281          */
2282         vmcb->ctrl.intercept_misc2 =
2283             VMCB_CTRL_INTERCEPT_VMRUN |
2284             VMCB_CTRL_INTERCEPT_VMMCALL |
2285             VMCB_CTRL_INTERCEPT_VMLOAD |
2286             VMCB_CTRL_INTERCEPT_VMSAVE |
2287             VMCB_CTRL_INTERCEPT_STGI |
2288             VMCB_CTRL_INTERCEPT_CLGI |
2289             VMCB_CTRL_INTERCEPT_SKINIT |
2290             VMCB_CTRL_INTERCEPT_RDTSCP |
2291             VMCB_CTRL_INTERCEPT_MONITOR |
2292             VMCB_CTRL_INTERCEPT_MWAIT |
2293             VMCB_CTRL_INTERCEPT_XSETBV |
2294             VMCB_CTRL_INTERCEPT_RDPRU;
2295
2296         /*
2297          * Intercept everything.
2298          */
2299         vmcb->ctrl.intercept_misc3 =
2300             VMCB_CTRL_INTERCEPT_INVLPGB_ALL |
2301             VMCB_CTRL_INTERCEPT_PCID |
2302             VMCB_CTRL_INTERCEPT_MCOMMIT |
2303             VMCB_CTRL_INTERCEPT_TLBSYNC;
2304
2305         /* Intercept all I/O accesses. */
2306         memset(cpudata->iobm, 0xFF, IOBM_SIZE);
2307         vmcb->ctrl.iopm_base_pa = cpudata->iobm_pa;
2308
2309         /* Allow direct access to certain MSRs. */
2310         memset(cpudata->msrbm, 0xFF, MSRBM_SIZE);
2311         svm_vcpu_msr_allow(cpudata->msrbm, MSR_STAR, true, true);
2312         svm_vcpu_msr_allow(cpudata->msrbm, MSR_LSTAR, true, true);
2313         svm_vcpu_msr_allow(cpudata->msrbm, MSR_CSTAR, true, true);
2314         svm_vcpu_msr_allow(cpudata->msrbm, MSR_SFMASK, true, true);
2315         svm_vcpu_msr_allow(cpudata->msrbm, MSR_KERNELGSBASE, true, true);
2316         svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_CS, true, true);
2317         svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_ESP, true, true);
2318         svm_vcpu_msr_allow(cpudata->msrbm, MSR_SYSENTER_EIP, true, true);
2319         svm_vcpu_msr_allow(cpudata->msrbm, MSR_FSBASE, true, true);
2320         svm_vcpu_msr_allow(cpudata->msrbm, MSR_GSBASE, true, true);
2321         svm_vcpu_msr_allow(cpudata->msrbm, MSR_CR_PAT, true, true);
2322         svm_vcpu_msr_allow(cpudata->msrbm, MSR_TSC, true, false);
2323         vmcb->ctrl.msrpm_base_pa = cpudata->msrbm_pa;
2324
2325         /* Generate ASID. */
2326         svm_asid_alloc(vcpu);
2327
2328         /* Virtual TPR. */
2329         vmcb->ctrl.v = VMCB_CTRL_V_INTR_MASKING;
2330
2331         /* Enable Nested Paging. */
2332         vmcb->ctrl.enable1 = VMCB_CTRL_ENABLE_NP;
2333         vmcb->ctrl.n_cr3 = vtophys(vmspace_pmap(mach->vm)->pm_pml4);
2334
2335         /* Init XSAVE header. */
2336         cpudata->gxsave.xstate_bv = svm_xcr0_mask;
2337         cpudata->gxsave.xcomp_bv = 0;
2338
2339         /* Install the RESET state. */
2340         memcpy(&vcpu->comm->state, &nvmm_x86_reset_state,
2341             sizeof(nvmm_x86_reset_state));
2342         vcpu->comm->state_wanted = NVMM_X64_STATE_ALL;
2343         vcpu->comm->state_cached = 0;
2344         svm_vcpu_setstate(vcpu);
2345 }
2346
2347 static int
2348 svm_vcpu_create(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
2349 {
2350         struct svm_cpudata *cpudata;
2351         int error;
2352
2353         /* Allocate the SVM cpudata. */
2354         cpudata = (struct svm_cpudata *)uvm_km_alloc(kernel_map,
2355             roundup(sizeof(*cpudata), PAGE_SIZE), 0,
2356             UVM_KMF_WIRED|UVM_KMF_ZERO);
2357         if (cpudata == NULL)
2358                 return ENOMEM;
2359
2360         vcpu->cpudata = cpudata;
2361
2362         /* VMCB */
2363         error = svm_memalloc(&cpudata->vmcb_pa, (vaddr_t *)&cpudata->vmcb,
2364             VMCB_NPAGES);
2365         if (error)
2366                 goto error;
2367
2368         /* I/O Bitmap */
2369         error = svm_memalloc(&cpudata->iobm_pa, (vaddr_t *)&cpudata->iobm,
2370             IOBM_NPAGES);
2371         if (error)
2372                 goto error;
2373
2374         /* MSR Bitmap */
2375         error = svm_memalloc(&cpudata->msrbm_pa, (vaddr_t *)&cpudata->msrbm,
2376             MSRBM_NPAGES);
2377         if (error)
2378                 goto error;
2379
2380         /* Init the VCPU info. */
2381         svm_vcpu_init(mach, vcpu);
2382
2383         return 0;
2384
2385 error:
2386         if (cpudata->vmcb_pa) {
2387                 svm_memfree(cpudata->vmcb_pa, (vaddr_t)cpudata->vmcb,
2388                     VMCB_NPAGES);
2389         }
2390         if (cpudata->iobm_pa) {
2391                 svm_memfree(cpudata->iobm_pa, (vaddr_t)cpudata->iobm,
2392                     IOBM_NPAGES);
2393         }
2394         if (cpudata->msrbm_pa) {
2395                 svm_memfree(cpudata->msrbm_pa, (vaddr_t)cpudata->msrbm,
2396                     MSRBM_NPAGES);
2397         }
2398         uvm_km_free(kernel_map, (vaddr_t)cpudata,
2399             roundup(sizeof(*cpudata), PAGE_SIZE), UVM_KMF_WIRED);
2400         return error;
2401 }
2402
2403 static void
2404 svm_vcpu_destroy(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
2405 {
2406         struct svm_cpudata *cpudata = vcpu->cpudata;
2407
2408         svm_asid_free(vcpu);
2409
2410         svm_memfree(cpudata->vmcb_pa, (vaddr_t)cpudata->vmcb, VMCB_NPAGES);
2411         svm_memfree(cpudata->iobm_pa, (vaddr_t)cpudata->iobm, IOBM_NPAGES);
2412         svm_memfree(cpudata->msrbm_pa, (vaddr_t)cpudata->msrbm, MSRBM_NPAGES);
2413
2414         uvm_km_free(kernel_map, (vaddr_t)cpudata,
2415             roundup(sizeof(*cpudata), PAGE_SIZE), UVM_KMF_WIRED);
2416 }
2417
2418 /* -------------------------------------------------------------------------- */
2419
2420 static int
2421 svm_vcpu_configure_cpuid(struct svm_cpudata *cpudata, void *data)
2422 {
2423         struct nvmm_vcpu_conf_cpuid *cpuid = data;
2424         size_t i;
2425
2426         if (__predict_false(cpuid->mask && cpuid->exit)) {
2427                 return EINVAL;
2428         }
2429         if (__predict_false(cpuid->mask &&
2430             ((cpuid->u.mask.set.eax & cpuid->u.mask.del.eax) ||
2431              (cpuid->u.mask.set.ebx & cpuid->u.mask.del.ebx) ||
2432              (cpuid->u.mask.set.ecx & cpuid->u.mask.del.ecx) ||
2433              (cpuid->u.mask.set.edx & cpuid->u.mask.del.edx)))) {
2434                 return EINVAL;
2435         }
2436
2437         /* If unset, delete, to restore the default behavior. */
2438         if (!cpuid->mask && !cpuid->exit) {
2439                 for (i = 0; i < SVM_NCPUIDS; i++) {
2440                         if (!cpudata->cpuidpresent[i]) {
2441                                 continue;
2442                         }
2443                         if (cpudata->cpuid[i].leaf == cpuid->leaf) {
2444                                 cpudata->cpuidpresent[i] = false;
2445                         }
2446                 }
2447                 return 0;
2448         }
2449
2450         /* If already here, replace. */
2451         for (i = 0; i < SVM_NCPUIDS; i++) {
2452                 if (!cpudata->cpuidpresent[i]) {
2453                         continue;
2454                 }
2455                 if (cpudata->cpuid[i].leaf == cpuid->leaf) {
2456                         memcpy(&cpudata->cpuid[i], cpuid,
2457                             sizeof(struct nvmm_vcpu_conf_cpuid));
2458                         return 0;
2459                 }
2460         }
2461
2462         /* Not here, insert. */
2463         for (i = 0; i < SVM_NCPUIDS; i++) {
2464                 if (!cpudata->cpuidpresent[i]) {
2465                         cpudata->cpuidpresent[i] = true;
2466                         memcpy(&cpudata->cpuid[i], cpuid,
2467                             sizeof(struct nvmm_vcpu_conf_cpuid));
2468                         return 0;
2469                 }
2470         }
2471
2472         return ENOBUFS;
2473 }
2474
2475 static int
2476 svm_vcpu_configure(struct nvmm_cpu *vcpu, uint64_t op, void *data)
2477 {
2478         struct svm_cpudata *cpudata = vcpu->cpudata;
2479
2480         switch (op) {
2481         case NVMM_VCPU_CONF_MD(NVMM_VCPU_CONF_CPUID):
2482                 return svm_vcpu_configure_cpuid(cpudata, data);
2483         default:
2484                 return EINVAL;
2485         }
2486 }
2487
2488 /* -------------------------------------------------------------------------- */
2489
2490 #ifdef __NetBSD__
2491 static void
2492 svm_tlb_flush(struct pmap *pm)
2493 {
2494         struct nvmm_machine *mach = pm->pm_data;
2495         struct svm_machdata *machdata = mach->machdata;
2496
2497         atomic_inc_64(&machdata->mach_htlb_gen);
2498
2499         /* Generates IPIs, which cause #VMEXITs. */
2500         pmap_tlb_shootdown(pmap_kernel(), -1, PTE_G, TLBSHOOT_NVMM);
2501 }
2502 #endif /* __NetBSD__ */
2503
2504 static void
2505 svm_machine_create(struct nvmm_machine *mach)
2506 {
2507         struct pmap *pmap = vmspace_pmap(mach->vm);
2508         struct svm_machdata *machdata;
2509
2510         /* Transform pmap. */
2511         pmap_npt_transform(pmap, 0);
2512
2513 #ifdef __NetBSD__
2514         /* Fill in pmap info. */
2515         pmap->pm_data = (void *)mach;
2516         pmap->pm_tlb_flush = svm_tlb_flush;
2517 #endif /* __NetBSD__ */
2518
2519         machdata = kmem_zalloc(sizeof(struct svm_machdata), KM_SLEEP);
2520         mach->machdata = machdata;
2521
2522         /* Start with an hTLB flush everywhere. */
2523         machdata->mach_htlb_gen = 1;
2524 }
2525
2526 static void
2527 svm_machine_destroy(struct nvmm_machine *mach)
2528 {
2529         kmem_free(mach->machdata, sizeof(struct svm_machdata));
2530 }
2531
2532 static int
2533 svm_machine_configure(struct nvmm_machine *mach, uint64_t op, void *data)
2534 {
2535         panic("%s: impossible", __func__);
2536 }
2537
2538 /* -------------------------------------------------------------------------- */
2539
2540 static bool
2541 svm_ident(void)
2542 {
2543         u_int descs[4];
2544         uint64_t msr;
2545
2546         if (cpu_vendor_id != CPU_VENDOR_AMD) {
2547                 return false;
2548         }
2549         if (!(amd_feature2 & CPUID_8_01_ECX_SVM)) {
2550                 printf("NVMM: SVM not supported\n");
2551                 return false;
2552         }
2553
2554         if (cpu_exthigh < 0x8000000a) {
2555                 printf("NVMM: CPUID leaf not available\n");
2556                 return false;
2557         }
2558         x86_cpuid(0x8000000a, descs);
2559
2560         /* Expect revision 1. */
2561         if (__SHIFTOUT(descs[0], CPUID_8_0A_EAX_SvmRev) != 1) {
2562                 printf("NVMM: SVM revision not supported\n");
2563                 return false;
2564         }
2565
2566         /* Want Nested Paging. */
2567         if (!(descs[3] & CPUID_8_0A_EDX_NP)) {
2568                 printf("NVMM: SVM-NP not supported\n");
2569                 return false;
2570         }
2571
2572         /* Want nRIP. */
2573         if (!(descs[3] & CPUID_8_0A_EDX_NRIPS)) {
2574                 printf("NVMM: SVM-NRIPS not supported\n");
2575                 return false;
2576         }
2577
2578         svm_decode_assist = (descs[3] & CPUID_8_0A_EDX_DecodeAssists) != 0;
2579
2580         msr = rdmsr(MSR_VMCR);
2581         if ((msr & VMCR_SVMED) && (msr & VMCR_LOCK)) {
2582                 printf("NVMM: SVM disabled in BIOS\n");
2583                 return false;
2584         }
2585
2586         return true;
2587 }
2588
2589 static void
2590 svm_init_asid(uint32_t maxasid)
2591 {
2592         size_t i, j, allocsz;
2593
2594         mutex_init(&svm_asidlock, MUTEX_DEFAULT, IPL_NONE);
2595
2596         /* Arbitrarily limit the number of ASIDs we manage. */
2597         maxasid = uimin(maxasid, 8192);
2598
2599         svm_maxasid = maxasid;
2600         allocsz = roundup(maxasid, 8) / 8;
2601         svm_asidmap = kmem_zalloc(allocsz, KM_SLEEP);
2602
2603         /* ASID 0 is reserved for the host. */
2604         svm_asidmap[0] |= __BIT(0);
2605
2606         /* ASID n-1 is special, we share it. */
2607         i = (maxasid - 1) / 8;
2608         j = (maxasid - 1) % 8;
2609         svm_asidmap[i] |= __BIT(j);
2610 }
2611
2612 static void
2613 svm_change_cpu(void *arg1)
2614 {
2615         bool enable = arg1 != NULL;
2616         uint64_t msr;
2617
2618         msr = rdmsr(MSR_VMCR);
2619         if (msr & VMCR_SVMED) {
2620                 wrmsr(MSR_VMCR, msr & ~VMCR_SVMED);
2621         }
2622
2623         if (!enable) {
2624                 wrmsr(MSR_VM_HSAVE_PA, 0);
2625         }
2626
2627         msr = rdmsr(MSR_EFER);
2628         if (enable) {
2629                 msr |= EFER_SVME;
2630         } else {
2631                 msr &= ~EFER_SVME;
2632         }
2633         wrmsr(MSR_EFER, msr);
2634
2635         if (enable) {
2636                 wrmsr(MSR_VM_HSAVE_PA, hsave[mycpuid].pa);
2637         }
2638
2639 #ifdef __DragonFly__
2640         if (atomic_fetchadd_int(&svm_change_cpu_count, -1) == 1)
2641                 wakeup(&svm_change_cpu_count);
2642 #endif /* __DragonFly__ */
2643 }
2644
2645 static void
2646 svm_init(void)
2647 {
2648         struct vm_page *pg;
2649         u_int descs[4];
2650         int i;
2651
2652         x86_cpuid(0x8000000a, descs);
2653
2654         /* The guest TLB flush command. */
2655         if (descs[3] & CPUID_8_0A_EDX_FlushByASID) {
2656                 svm_ctrl_tlb_flush = VMCB_CTRL_TLB_CTRL_FLUSH_GUEST;
2657         } else {
2658                 svm_ctrl_tlb_flush = VMCB_CTRL_TLB_CTRL_FLUSH_ALL;
2659         }
2660
2661         /* Init the ASID. */
2662         svm_init_asid(descs[1]);
2663
2664         /* Init the XCR0 mask. */
2665         svm_xcr0_mask = SVM_XCR0_MASK_DEFAULT & x86_xsave_features;
2666
2667         /* Init the max basic CPUID leaf. */
2668         svm_cpuid_max_basic = uimin(cpuid_level, SVM_CPUID_MAX_BASIC);
2669
2670         /* Init the max extended CPUID leaf. */
2671         x86_cpuid(0x80000000, descs);
2672         svm_cpuid_max_extended = uimin(descs[0], SVM_CPUID_MAX_EXTENDED);
2673
2674         /* Init the global host state. */
2675         if (svm_xcr0_mask != 0) {
2676                 svm_global_hstate.xcr0 = rdxcr(0);
2677         }
2678         svm_global_hstate.star = rdmsr(MSR_STAR);
2679         svm_global_hstate.lstar = rdmsr(MSR_LSTAR);
2680         svm_global_hstate.cstar = rdmsr(MSR_CSTAR);
2681         svm_global_hstate.sfmask = rdmsr(MSR_SFMASK);
2682
2683         memset(hsave, 0, sizeof(hsave));
2684         for (i = 0; i < ncpus; i++) {
2685                 pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
2686                 hsave[i].pa = VM_PAGE_TO_PHYS(pg);
2687         }
2688
2689 #ifdef __NetBSD__
2690         uint64_t xc;
2691         xc = xc_broadcast(0, svm_change_cpu, (void *)true, NULL);
2692         xc_wait(xc);
2693 #else /* DragonFly */
2694         atomic_swap_int(&svm_change_cpu_count, ncpus);
2695         lwkt_send_ipiq_mask(smp_active_mask, svm_change_cpu, (void *)true);
2696         do {
2697                 cpu_ccfence();
2698                 tsleep_interlock(&svm_change_cpu_count, 0);
2699                 if (svm_change_cpu_count)
2700                         tsleep(&svm_change_cpu_count, PINTERLOCKED, "svm", hz);
2701         } while (svm_change_cpu_count != 0);
2702 #endif /* __NetBSD__ */
2703 }
2704
2705 static void
2706 svm_fini_asid(void)
2707 {
2708         size_t allocsz;
2709
2710         allocsz = roundup(svm_maxasid, 8) / 8;
2711         kmem_free(svm_asidmap, allocsz);
2712
2713         mutex_destroy(&svm_asidlock);
2714 }
2715
2716 static void
2717 svm_fini(void)
2718 {
2719         size_t i;
2720
2721 #ifdef __NetBSD__
2722         uint64_t xc;
2723         xc = xc_broadcast(0, svm_change_cpu, (void *)false, NULL);
2724         xc_wait(xc);
2725 #else /* DragonFly */
2726         atomic_swap_int(&svm_change_cpu_count, ncpus);
2727         lwkt_send_ipiq_mask(smp_active_mask, svm_change_cpu, (void *)false);
2728         do {
2729                 cpu_ccfence();
2730                 tsleep_interlock(&svm_change_cpu_count, 0);
2731                 if (svm_change_cpu_count)
2732                         tsleep(&svm_change_cpu_count, PINTERLOCKED, "svm", hz);
2733         } while (svm_change_cpu_count != 0);
2734 #endif /* __NetBSD__ */
2735
2736         for (i = 0; i < MAXCPUS; i++) {
2737                 if (hsave[i].pa != 0)
2738                         uvm_pagefree(PHYS_TO_VM_PAGE(hsave[i].pa));
2739         }
2740
2741         svm_fini_asid();
2742 }
2743
2744 static void
2745 svm_capability(struct nvmm_capability *cap)
2746 {
2747         cap->arch.mach_conf_support = 0;
2748         cap->arch.vcpu_conf_support =
2749             NVMM_CAP_ARCH_VCPU_CONF_CPUID;
2750         cap->arch.xcr0_mask = svm_xcr0_mask;
2751         cap->arch.mxcsr_mask = x86_fpu_mxcsr_mask;
2752         cap->arch.conf_cpuid_maxops = SVM_NCPUIDS;
2753 }
2754
2755 const struct nvmm_impl nvmm_x86_svm = {
2756         .name = "x86-svm",
2757         .ident = svm_ident,
2758         .init = svm_init,
2759         .fini = svm_fini,
2760         .capability = svm_capability,
2761         .mach_conf_max = NVMM_X86_MACH_NCONF,
2762         .mach_conf_sizes = NULL,
2763         .vcpu_conf_max = NVMM_X86_VCPU_NCONF,
2764         .vcpu_conf_sizes = svm_vcpu_conf_sizes,
2765         .state_size = sizeof(struct nvmm_x64_state),
2766         .machine_create = svm_machine_create,
2767         .machine_destroy = svm_machine_destroy,
2768         .machine_configure = svm_machine_configure,
2769         .vcpu_create = svm_vcpu_create,
2770         .vcpu_destroy = svm_vcpu_destroy,
2771         .vcpu_configure = svm_vcpu_configure,
2772         .vcpu_setstate = svm_vcpu_setstate,
2773         .vcpu_getstate = svm_vcpu_getstate,
2774         .vcpu_inject = svm_vcpu_inject,
2775         .vcpu_run = svm_vcpu_run
2776 };