kernel/vmx: Fix sysctl types and descriptions.
[dragonfly.git] / sys / platform / pc64 / vmm / vmx.c
/*
 * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Mihai Carabas <mihai.carabas@gmail.com>
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
35
36 #include <sys/malloc.h>
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/types.h>
40 #include <sys/thread.h>
41 #include <sys/thread2.h>
42 #include <sys/sysctl.h>
43 #include <sys/vmm.h>
44 #include <sys/proc.h>
45 #include <sys/syscall.h>
46 #include <sys/wait.h>
47 #include <sys/vkernel.h>
48 #include <sys/mplock2.h>
49 #include <ddb/ddb.h>
50
51 #include <cpu/cpu.h>
52
53 #include <machine/cpufunc.h>
54 #include <machine/cputypes.h>
55 #include <machine/smp.h>
56 #include <machine/globaldata.h>
57 #include <machine/trap.h>
58 #include <machine/pmap.h>
59 #include <machine/md_var.h>
60
61 #include <vm/vm_map.h>
62 #include <vm/vm_extern.h>
63 #include <vm/vm_param.h>
64
65 #include "vmm.h"
66 #include "vmm_utils.h"
67
68 #include "vmx.h"
69 #include "vmx_instr.h"
70 #include "vmx_vmcs.h"
71
72 #include "ept.h"
73
74 extern void trap(struct trapframe *frame);
75
76 static int vmx_check_cpu_migration(void);
77 static int execute_vmptrld(struct vmx_thread_info *vti);
78
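/*
 * 0x0F 0x05 is the two-byte opcode of the SYSCALL instruction.  The
 * VMM_DEBUG path in vmx_handle_vmexit() matches captured guest bytes
 * against this pattern to check whether a #UD really came from "syscall".
 */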
struct instr_decode syscall_asm = {
        .opcode_bytes = 2,
        .opcode.byte1 = 0x0F,
        .opcode.byte2 = 0x05,
};

struct vmx_ctl_info vmx_pinbased = {
        .msr_addr = IA32_VMX_PINBASED_CTLS,
        .msr_true_addr = IA32_VMX_TRUE_PINBASED_CTLS,
};

struct vmx_ctl_info vmx_procbased = {
        .msr_addr = IA32_VMX_PROCBASED_CTLS,
        .msr_true_addr = IA32_VMX_TRUE_PROCBASED_CTLS,
};

struct vmx_ctl_info vmx_procbased2 = {
        .msr_addr = IA32_VMX_PROCBASED_CTLS2,
        .msr_true_addr = IA32_VMX_PROCBASED_CTLS2,
};

struct vmx_ctl_info vmx_exit = {
        .msr_addr = IA32_VMX_EXIT_CTLS,
        .msr_true_addr = IA32_VMX_TRUE_EXIT_CTLS,
};

struct vmx_ctl_info vmx_entry = {
        .msr_addr = IA32_VMX_ENTRY_CTLS,
        .msr_true_addr = IA32_VMX_TRUE_ENTRY_CTLS,
};

/* Declared in generic vmm.c - SYSCTL parent */
extern struct sysctl_oid *vmm_sysctl_tree;

/* SYSCTL tree and context */
static struct sysctl_oid *vmx_sysctl_tree;
static struct sysctl_ctx_list vmx_sysctl_ctx;

/* Per cpu info */
struct vmx_pcpu_info *pcpu_info;

/* VMX BASIC INFO */
uint32_t vmx_revision;
uint32_t vmx_region_size;
uint8_t vmx_width_addr;

/* IA32_VMX_EPT_VPID_CAP */
uint64_t vmx_ept_vpid_cap;

/* VMX fixed bits */
uint64_t cr0_fixed_to_0;
uint64_t cr4_fixed_to_0;
uint64_t cr0_fixed_to_1;
uint64_t cr4_fixed_to_1;

/* VMX status */
static uint8_t vmx_enabled = 0;
static uint8_t vmx_initialized = 0;

/* VMX set control setting
 * Intel System Programming Guide, Part 3, Order Number 326019
 * 31.5.1 Algorithms for Determining VMX Capabilities
 * Implements Algorithm 3
 */
static int
vmx_set_ctl_setting(struct vmx_ctl_info *vmx_ctl, uint32_t bit_no, setting_t value) {
        uint64_t vmx_basic;
        uint64_t ctl_val;

        /* Check if it's branch b. or c. */
        vmx_basic = rdmsr(IA32_VMX_BASIC);
        if (IS_TRUE_CTL_AVAIL(vmx_basic))
                ctl_val = rdmsr(vmx_ctl->msr_true_addr);
        else
                ctl_val = rdmsr(vmx_ctl->msr_addr);

        /* Check if the value is known by the VMM or set on DEFAULT */
        switch (value) {
                case DEFAULT:
                        /* Both settings are allowed
                         * - step b.iii)
                         *   or
                         * - c.iii), c.iv)
                         */
                        if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)
                            && IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) {

                                /* For c.iii) and c.iv) */
                                if (IS_TRUE_CTL_AVAIL(vmx_basic))
                                        ctl_val = rdmsr(vmx_ctl->msr_addr);

                                if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no))
                                        vmx_ctl->ctls &= ~BIT(bit_no);
                                else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no))
                                        vmx_ctl->ctls |= BIT(bit_no);

                        } else if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)) {
                                /* b.i), c.i) */
                                vmx_ctl->ctls &= ~BIT(bit_no);

                        } else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) {
                                /* b.i), c.i) */
                                vmx_ctl->ctls |= BIT(bit_no);

                        } else {
                                return (EINVAL);
                        }
                        break;
                case ZERO:
                        /* For b.ii) or c.ii) */
                        if (!IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no))
                                return (EINVAL);

                        vmx_ctl->ctls &= ~BIT(bit_no);

                        break;
                case ONE:
                        /* For b.ii) or c.ii) */
                        if (!IS_ONE_SETTING_ALLOWED(ctl_val, bit_no))
                                return (EINVAL);

                        vmx_ctl->ctls |= BIT(bit_no);

                        break;
        }
        return 0;
}
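
/*
 * Typical use, as in vmx_init() below: request a control bit and treat a
 * nonzero return as "not supported by this CPU", e.g.
 *
 *	if (vmx_set_ctl_setting(&vmx_pinbased, PINBASED_NMI_EXITING, ONE))
 *		return (ENODEV);
 */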

static void
vmx_set_default_settings(struct vmx_ctl_info *vmx_ctl)
{
        int i;

        for (i = 0; i < 32; i++) {
                vmx_set_ctl_setting(vmx_ctl, i, DEFAULT);
        }
}

static void
alloc_vmxon_regions(void)
{
        int cpu;

        pcpu_info = kmalloc(ncpus * sizeof(struct vmx_pcpu_info), M_TEMP, M_WAITOK | M_ZERO);

        for (cpu = 0; cpu < ncpus; cpu++) {

                /* The address must be aligned to 4K - alloc extra */
                pcpu_info[cpu].vmxon_region_na = kmalloc(vmx_region_size + VMXON_REGION_ALIGN_SIZE,
                    M_TEMP,
                    M_WAITOK | M_ZERO);
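
                /*
                 * Keep the unaligned pointer in vmxon_region_na; it is the
                 * address that kfree() must be given in free_vmxon_regions().
                 */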

                /* Align address */
                pcpu_info[cpu].vmxon_region = (unsigned char*) VMXON_REGION_ALIGN(pcpu_info[cpu].vmxon_region_na);

                /* In the first 31 bits put the vmx revision */
                *((uint32_t *) pcpu_info[cpu].vmxon_region) = vmx_revision;
        }
}

static void
free_vmxon_regions(void)
{
        int i;

        for (i = 0; i < ncpus; i++) {
                pcpu_info[i].vmxon_region = NULL;

                kfree(pcpu_info[i].vmxon_region_na, M_TEMP);
        }

        kfree(pcpu_info, M_TEMP);
}

static void
build_vmx_sysctl(void)
{
        sysctl_ctx_init(&vmx_sysctl_ctx);
        vmx_sysctl_tree = SYSCTL_ADD_NODE(&vmx_sysctl_ctx,
                    SYSCTL_CHILDREN(vmm_sysctl_tree),
                    OID_AUTO, "vmx",
                    CTLFLAG_RD, 0, "VMX options");

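        /*
         * Read-only capability information.  Assuming the parent tree
         * created in vmm.c is rooted at hw.vmm, these can be inspected
         * from userland with e.g. "sysctl hw.vmm.vmx.revision".
         */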
        SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
            SYSCTL_CHILDREN(vmx_sysctl_tree),
            OID_AUTO, "revision", CTLFLAG_RD,
            &vmx_revision, 0,
            "VMX revision");
        SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
            SYSCTL_CHILDREN(vmx_sysctl_tree),
            OID_AUTO, "region_size", CTLFLAG_RD,
            &vmx_region_size, 0,
            "VMX region size");
        SYSCTL_ADD_INT(&vmx_sysctl_ctx,
            SYSCTL_CHILDREN(vmx_sysctl_tree),
            OID_AUTO, "width_addr", CTLFLAG_RD,
            &vmx_width_addr, 0,
            "VMX width address");
        SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
            SYSCTL_CHILDREN(vmx_sysctl_tree),
            OID_AUTO, "pinbased_ctls", CTLFLAG_RD,
            &vmx_pinbased.ctls, 0,
            "VMX pin-based controls");
        SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
            SYSCTL_CHILDREN(vmx_sysctl_tree),
            OID_AUTO, "procbased_ctls", CTLFLAG_RD,
            &vmx_procbased.ctls, 0,
            "VMX primary processor-based controls");
        SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
            SYSCTL_CHILDREN(vmx_sysctl_tree),
            OID_AUTO, "procbased2_ctls", CTLFLAG_RD,
            &vmx_procbased2.ctls, 0,
            "VMX secondary processor-based controls");
        SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
            SYSCTL_CHILDREN(vmx_sysctl_tree),
            OID_AUTO, "vmexit_ctls", CTLFLAG_RD,
            &vmx_exit.ctls, 0,
            "VMX exit controls");
        SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
            SYSCTL_CHILDREN(vmx_sysctl_tree),
            OID_AUTO, "vmentry_ctls", CTLFLAG_RD,
            &vmx_entry.ctls, 0,
            "VMX entry controls");
        SYSCTL_ADD_ULONG(&vmx_sysctl_ctx,
            SYSCTL_CHILDREN(vmx_sysctl_tree),
            OID_AUTO, "ept_vpid_cap", CTLFLAG_RD,
            &vmx_ept_vpid_cap,
            "VMX EPT VPID CAP");
}

static int
vmx_init(void)
{
        uint64_t feature_control;
        uint64_t vmx_basic_value;
        uint64_t cr0_fixed_bits_to_1;
        uint64_t cr0_fixed_bits_to_0;
        uint64_t cr4_fixed_bits_to_0;
        uint64_t cr4_fixed_bits_to_1;

        int err;

        /*
         * The ability of a processor to support VMX operation
         * and related instructions is indicated by:
         * CPUID.1:ECX.VMX[bit 5] = 1
         */
        if (!(cpu_feature2 & CPUID2_VMX)) {
                kprintf("VMM: VMX is not supported by this Intel CPU\n");
                return (ENODEV);
        }

        vmx_set_default_settings(&vmx_pinbased);

        vmx_set_default_settings(&vmx_procbased);
        /* Enable the secondary processor-based controls */
        err = vmx_set_ctl_setting(&vmx_procbased,
            PROCBASED_ACTIVATE_SECONDARY_CONTROLS,
            ONE);
        if (err) {
                kprintf("VMM: PROCBASED_ACTIVATE_SECONDARY_CONTROLS not "
                        "supported by this CPU\n");
                return (ENODEV);
        }
        vmx_set_default_settings(&vmx_procbased2);

        vmx_set_default_settings(&vmx_exit);
        vmx_set_default_settings(&vmx_entry);

        /* Enable external-interrupt exiting */
        err = vmx_set_ctl_setting(&vmx_pinbased,
            PINBASED_EXTERNAL_INTERRUPT_EXITING,
            ONE);
        if (err) {
                kprintf("VMM: PINBASED_EXTERNAL_INTERRUPT_EXITING not "
                        "supported by this CPU\n");
                return (ENODEV);
        }

        /* Enable non-maskable interrupt exiting */
        err = vmx_set_ctl_setting(&vmx_pinbased, PINBASED_NMI_EXITING, ONE);
        if (err) {
                kprintf("VMM: PINBASED_NMI_EXITING not "
                        "supported by this CPU\n");
                return (ENODEV);
        }

        /* Set 64-bit mode for the GUEST */
        err = vmx_set_ctl_setting(&vmx_entry, VMENTRY_IA32e_MODE_GUEST, ONE);
        if (err) {
                kprintf("VMM: VMENTRY_IA32e_MODE_GUEST not "
                        "supported by this CPU\n");
                return (ENODEV);
        }

        /* Load MSR EFER on entry */
        err = vmx_set_ctl_setting(&vmx_entry,
                                  VMENTRY_LOAD_IA32_EFER, ONE);
        if (err) {
                kprintf("VMM: VMENTRY_LOAD_IA32_EFER not "
                        "supported by this CPU\n");
                return (ENODEV);
        }

        /* Set 64-bit mode for the HOST */
        err = vmx_set_ctl_setting(&vmx_exit,
                                  VMEXIT_HOST_ADDRESS_SPACE_SIZE, ONE);
        if (err) {
                kprintf("VMM: VMEXIT_HOST_ADDRESS_SPACE_SIZE not "
                        "supported by this CPU\n");
                return (ENODEV);
        }

        /* Save EFER on exit */
        err = vmx_set_ctl_setting(&vmx_exit,
            VMEXIT_SAVE_IA32_EFER,
            ONE);
        if (err) {
                kprintf("VMM: VMEXIT_SAVE_IA32_EFER not "
                        "supported by this CPU\n");
                return (ENODEV);
        }

        /* Load EFER on exit */
        err = vmx_set_ctl_setting(&vmx_exit,
            VMEXIT_LOAD_IA32_EFER,
            ONE);
        if (err) {
                kprintf("VMM: VMEXIT_LOAD_IA32_EFER not "
                        "supported by this CPU\n");
                return (ENODEV);
        }

        /* Enable the EPT feature */
        err = vmx_set_ctl_setting(&vmx_procbased2,
            PROCBASED2_ENABLE_EPT,
            ONE);
        if (err) {
                kprintf("VMM: PROCBASED2_ENABLE_EPT not "
                        "supported by this CPU\n");
                return (ENODEV);
        }

        if (vmx_ept_init()) {
                kprintf("VMM: vmx_ept_init failed\n");
                return (ENODEV);
        }
#if 0
        /* XXX - to implement in the future */
        /* Enable the VPID feature */
        err = vmx_set_ctl_setting(&vmx_procbased2,
            PROCBASED2_ENABLE_VPID,
            ONE);
        if (err) {
                kprintf("VMM: PROCBASED2_ENABLE_VPID not "
                        "supported by this CPU\n");
                return (ENODEV);
        }
#endif

        /* Check the feature control status */
        feature_control = rdmsr(IA32_FEATURE_CONTROL);
        if (!(feature_control & BIT(FEATURE_CONTROL_LOCKED))) {
                kprintf("VMM: IA32_FEATURE_CONTROL is not locked\n");
                return (EINVAL);
        }
        if (!(feature_control & BIT(FEATURE_CONTROL_VMX_BIOS_ENABLED))) {
                kprintf("VMM: VMX is disabled by the BIOS\n");
                return (EINVAL);
        }

        vmx_basic_value = rdmsr(IA32_VMX_BASIC);
        vmx_width_addr = (uint8_t) VMX_WIDTH_ADDR(vmx_basic_value);
        vmx_region_size = (uint32_t) VMX_REGION_SIZE(vmx_basic_value);
        vmx_revision = (uint32_t) VMX_REVISION(vmx_basic_value);

        /* A.7 VMX-FIXED BITS IN CR0 */
        cr0_fixed_bits_to_1 = rdmsr(IA32_VMX_CR0_FIXED0);
        cr0_fixed_bits_to_0 = rdmsr(IA32_VMX_CR0_FIXED1);
        cr0_fixed_to_1 = cr0_fixed_bits_to_1 & cr0_fixed_bits_to_0;
        cr0_fixed_to_0 = ~cr0_fixed_bits_to_1 & ~cr0_fixed_bits_to_0;

        /* A.8 VMX-FIXED BITS IN CR4 */
        cr4_fixed_bits_to_1 = rdmsr(IA32_VMX_CR4_FIXED0);
        cr4_fixed_bits_to_0 = rdmsr(IA32_VMX_CR4_FIXED1);
        cr4_fixed_to_1 = cr4_fixed_bits_to_1 & cr4_fixed_bits_to_0;
        cr4_fixed_to_0 = ~cr4_fixed_bits_to_1 & ~cr4_fixed_bits_to_0;
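
        /*
         * Net effect (SDM A.7/A.8): a bit set in both FIXED0 and FIXED1
         * must be 1 in CR0/CR4 while in VMX operation, a bit clear in
         * both must be 0, and all other bits are left to the OS.
         */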

        build_vmx_sysctl();

        vmx_initialized = 1;
        return 0;
}

static void
execute_vmxon(void *perr)
{
        unsigned char *vmxon_region;
        int *err = (int*) perr;

        /* A.7 VMX-FIXED BITS IN CR0 */
        load_cr0((rcr0() | cr0_fixed_to_1) & ~cr0_fixed_to_0);

        /* A.8 VMX-FIXED BITS IN CR4 */
        load_cr4((rcr4() | cr4_fixed_to_1) & ~cr4_fixed_to_0);

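        /*
         * CR4.VMXE must be set before executing VMXON; otherwise the
         * instruction raises #UD.
         */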
        /* Enable VMX */
        load_cr4(rcr4() | CR4_VMXE);

        vmxon_region = pcpu_info[mycpuid].vmxon_region;
        *err = vmxon(vmxon_region);
        if (*err) {
                kprintf("VMM: vmxon failed on cpu%d\n", mycpuid);
        }
}

static void
execute_vmxoff(void *dummy)
{
        invept_desc_t desc = { 0 };

        if (invept(INVEPT_TYPE_ALL_CONTEXTS, (uint64_t*) &desc))
                kprintf("VMM: execute_vmxoff: invept failed on cpu%d\n", mycpu->gd_cpuid);

        vmxoff();

        /* Disable VMX */
        load_cr4(rcr4() & ~CR4_VMXE);
}

static void
execute_vmclear(void *data)
{
        struct vmx_thread_info *vti = data;
        int err;
        globaldata_t gd = mycpu;

        if (pcpu_info[gd->gd_cpuid].loaded_vmx == vti) {
                /*
                 * Must set vti->launched to zero after vmclear'ing to
                 * force a vmlaunch the next time.
                 *
                 * Must not clear the loaded_vmx field until after we call
                 * vmclear on the region.  This field triggers the interlocked
                 * cpusync from another cpu trying to destroy or reuse
                 * the vti.  If we clear the field first, the other cpu will
                 * not interlock and may race our vmclear() on the underlying
                 * memory.
                 */
                ERROR_IF(vmclear(vti->vmcs_region));
error:
                pcpu_info[gd->gd_cpuid].loaded_vmx = NULL;
                vti->launched = 0;
        }
        return;
}

static int
execute_vmptrld(struct vmx_thread_info *vti)
{
        globaldata_t gd = mycpu;

        /*
         * Must vmclear the previously active VMCS if it is different.
         */
        if (pcpu_info[gd->gd_cpuid].loaded_vmx &&
            pcpu_info[gd->gd_cpuid].loaded_vmx != vti)
                execute_vmclear(pcpu_info[gd->gd_cpuid].loaded_vmx);

        /*
         * Make this the current VMCS.  Must set the loaded_vmx field
         * before calling vmptrld() to avoid races against cpusync.
         *
         * Must set vti->launched to zero after the vmptrld to force
         * a vmlaunch.
         */
        if (pcpu_info[gd->gd_cpuid].loaded_vmx != vti) {
                vti->launched = 0;
                pcpu_info[gd->gd_cpuid].loaded_vmx = vti;
                return (vmptrld(vti->vmcs_region));
        } else {
                return (0);
        }
}

static int
vmx_enable(void)
{
        int err;
        int cpu;

        if (!vmx_initialized) {
                kprintf("VMM: vmx_enable - not allowed; vmx not initialized\n");
                return (EINVAL);
        }

        if (vmx_enabled) {
                kprintf("VMM: vmx_enable - already enabled\n");
                return (EINVAL);
        }

        alloc_vmxon_regions();
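
        /*
         * lwkt_cpusync_simple() runs execute_vmxon() synchronously on
         * each cpu in the mask, so VMXON gets executed on every cpu
         * with that cpu's vmxon region.
         */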
        for (cpu = 0; cpu < ncpus; cpu++) {
                cpumask_t mask;

                err = 0;
                CPUMASK_ASSBIT(mask, cpu);
                lwkt_cpusync_simple(mask, execute_vmxon, &err);
                if (err) {
                        kprintf("VMM: vmx_enable error %d on cpu%d\n", err, cpu);
                        return err;
                }
        }
        vmx_enabled = 1;
        return 0;
}

static int
vmx_disable(void)
{
        int cpu;

        if (!vmx_enabled) {
                kprintf("VMM: vmx_disable not allowed; vmx wasn't enabled\n");
                return (EINVAL);
        }

        for (cpu = 0; cpu < ncpus; cpu++) {
                cpumask_t mask;

                CPUMASK_ASSBIT(mask, cpu);
                lwkt_cpusync_simple(mask, execute_vmxoff, NULL);
        }

        free_vmxon_regions();

        vmx_enabled = 0;

        return 0;
}

static int vmx_set_guest_descriptor(descriptor_t type,
                uint16_t selector,
                uint32_t rights,
                uint64_t base,
                uint32_t limit)
{
        int err;
        int selector_enc;
        int rights_enc;
        int base_enc;
        int limit_enc;

        /*
         * Intel Manual Vol 3C. - page 60
         * If any bit in the limit field in the range 11:0 is 0, G must be 0.
         * If any bit in the limit field in the range 31:20 is 1, G must be 1.
         */
        if ((~rights & VMCS_SEG_UNUSABLE) || (type == CS)) {
                if ((limit & 0xfff) != 0xfff)
                        rights &= ~VMCS_G;
                else if ((limit & 0xfff00000) != 0)
                        rights |= VMCS_G;
        }

        switch (type) {
                case ES:
                        selector_enc = VMCS_GUEST_ES_SELECTOR;
                        rights_enc = VMCS_GUEST_ES_ACCESS_RIGHTS;
                        base_enc = VMCS_GUEST_ES_BASE;
                        limit_enc = VMCS_GUEST_ES_LIMIT;
                        break;
                case CS:
                        selector_enc = VMCS_GUEST_CS_SELECTOR;
                        rights_enc = VMCS_GUEST_CS_ACCESS_RIGHTS;
                        base_enc = VMCS_GUEST_CS_BASE;
                        limit_enc = VMCS_GUEST_CS_LIMIT;
                        break;
                case SS:
                        selector_enc = VMCS_GUEST_SS_SELECTOR;
                        rights_enc = VMCS_GUEST_SS_ACCESS_RIGHTS;
                        base_enc = VMCS_GUEST_SS_BASE;
                        limit_enc = VMCS_GUEST_SS_LIMIT;
                        break;
                case DS:
                        selector_enc = VMCS_GUEST_DS_SELECTOR;
                        rights_enc = VMCS_GUEST_DS_ACCESS_RIGHTS;
                        base_enc = VMCS_GUEST_DS_BASE;
                        limit_enc = VMCS_GUEST_DS_LIMIT;
                        break;
                case FS:
                        selector_enc = VMCS_GUEST_FS_SELECTOR;
                        rights_enc = VMCS_GUEST_FS_ACCESS_RIGHTS;
                        base_enc = VMCS_GUEST_FS_BASE;
                        limit_enc = VMCS_GUEST_FS_LIMIT;
                        break;
                case GS:
                        selector_enc = VMCS_GUEST_GS_SELECTOR;
                        rights_enc = VMCS_GUEST_GS_ACCESS_RIGHTS;
                        base_enc = VMCS_GUEST_GS_BASE;
                        limit_enc = VMCS_GUEST_GS_LIMIT;
                        break;
                case LDTR:
                        selector_enc = VMCS_GUEST_LDTR_SELECTOR;
                        rights_enc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
                        base_enc = VMCS_GUEST_LDTR_BASE;
                        limit_enc = VMCS_GUEST_LDTR_LIMIT;
                        break;
                case TR:
                        selector_enc = VMCS_GUEST_TR_SELECTOR;
                        rights_enc = VMCS_GUEST_TR_ACCESS_RIGHTS;
                        base_enc = VMCS_GUEST_TR_BASE;
                        limit_enc = VMCS_GUEST_TR_LIMIT;
                        break;
                default:
                        kprintf("VMM: vmx_set_guest_descriptor: unknown descriptor\n");
                        err = -1;
                        goto error;
        }

        ERROR_IF(vmwrite(selector_enc, selector));
        ERROR_IF(vmwrite(rights_enc, rights));
        ERROR_IF(vmwrite(base_enc, base));
        ERROR_IF(vmwrite(limit_enc, limit));

        return 0;
error:
        kprintf("VMM: vmx_set_guest_descriptor failed\n");
        return err;
}

/*
 * Called by the first thread of the VMM process
 * - create a new vmspace
 * - init the vmspace with EPT PG_* bits and
 *   EPT copyin/copyout functions
 * - replace the vmspace of the current proc
 * - remove the old vmspace
 */
static int
vmx_vminit_master(struct vmm_guest_options *options)
{
        struct vmspace *oldvmspace;
        struct vmspace *newvmspace;
        struct proc *p = curthread->td_proc;
        struct vmm_proc *p_vmm;

        oldvmspace = curthread->td_lwp->lwp_vmspace;
        newvmspace = vmspace_fork(oldvmspace);

        vmx_ept_pmap_pinit(vmspace_pmap(newvmspace));
        bzero(vmspace_pmap(newvmspace)->pm_pml4, PAGE_SIZE);

        lwkt_gettoken(&oldvmspace->vm_map.token);
        lwkt_gettoken(&newvmspace->vm_map.token);

        pmap_pinit2(vmspace_pmap(newvmspace));
        pmap_replacevm(curthread->td_proc, newvmspace, 0);

        lwkt_reltoken(&newvmspace->vm_map.token);
        lwkt_reltoken(&oldvmspace->vm_map.token);

        vmspace_rel(oldvmspace);

        options->vmm_cr3 = vtophys(vmspace_pmap(newvmspace)->pm_pml4);

        p_vmm = kmalloc(sizeof(struct vmm_proc), M_TEMP, M_WAITOK | M_ZERO);
        p_vmm->guest_cr3 = options->guest_cr3;
        p_vmm->vmm_cr3 = options->vmm_cr3;
        p->p_vmm = (void *)p_vmm;

        if (p->p_vkernel) {
                p->p_vkernel->vkernel_cr3 = options->guest_cr3;
                dkprintf("PROCESS CR3 %016jx\n", (intmax_t)options->guest_cr3);
        }

        return 0;
}

static int
vmx_vminit(struct vmm_guest_options *options)
{
        struct vmx_thread_info *vti;
        int err;
        struct tls_info guest_fs = curthread->td_tls.info[0];
        struct tls_info guest_gs = curthread->td_tls.info[1];

        vti = kmalloc(sizeof(struct vmx_thread_info), M_TEMP, M_WAITOK | M_ZERO);
        curthread->td_vmm = (void*) vti;

        if (options->master) {
                vmx_vminit_master(options);
        }

        bcopy(&options->tf, &vti->guest, sizeof(struct trapframe));

        /*
         * Be sure we return success if the VMM hook enters
         */
        vti->guest.tf_rax = 0;
        vti->guest.tf_rflags &= ~PSL_C;

        vti->vmcs_region_na = kmalloc(vmx_region_size + VMXON_REGION_ALIGN_SIZE,
                    M_TEMP,
                    M_WAITOK | M_ZERO);

        /* Align address */
        vti->vmcs_region = (unsigned char*) VMXON_REGION_ALIGN(vti->vmcs_region_na);
        vti->last_cpu = -1;

        vti->guest_cr3 = options->guest_cr3;
        vti->vmm_cr3 = options->vmm_cr3;

        /* In the first 31 bits put the vmx revision */
        *((uint32_t *)vti->vmcs_region) = vmx_revision;

        /*
         * vmclear the vmcs to initialize it.
         */
        ERROR_IF(vmclear(vti->vmcs_region));

        crit_enter();

        ERROR_IF(execute_vmptrld(vti));

        /* Load the VMX controls */
        ERROR_IF(vmwrite(VMCS_PINBASED_CTLS, vmx_pinbased.ctls));
        ERROR_IF(vmwrite(VMCS_PROCBASED_CTLS, vmx_procbased.ctls));
        ERROR_IF(vmwrite(VMCS_PROCBASED2_CTLS, vmx_procbased2.ctls));
        ERROR_IF(vmwrite(VMCS_VMEXIT_CTLS, vmx_exit.ctls));
        ERROR_IF(vmwrite(VMCS_VMENTRY_CTLS, vmx_entry.ctls));

        /* Load HOST CRs */
        ERROR_IF(vmwrite(VMCS_HOST_CR0, rcr0()));
        ERROR_IF(vmwrite(VMCS_HOST_CR4, rcr4()));

        /* Load HOST EFER and PAT */
//      ERROR_IF(vmwrite(VMCS_HOST_IA32_PAT, rdmsr(MSR_PAT)));
        ERROR_IF(vmwrite(VMCS_HOST_IA32_EFER, rdmsr(MSR_EFER)));

        /* Load HOST selectors */
        ERROR_IF(vmwrite(VMCS_HOST_ES_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
        ERROR_IF(vmwrite(VMCS_HOST_SS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
        ERROR_IF(vmwrite(VMCS_HOST_FS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
        ERROR_IF(vmwrite(VMCS_HOST_GS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
        ERROR_IF(vmwrite(VMCS_HOST_CS_SELECTOR, GSEL(GCODE_SEL, SEL_KPL)));
        ERROR_IF(vmwrite(VMCS_HOST_TR_SELECTOR, GSEL(GPROC0_SEL, SEL_KPL)));

        /*
         * The BASE addresses are written on each VMRUN in case
         * the CPU changes, because they are per-CPU values.
         */

        /*
         * Call vmx_vmexit on the VM_EXIT condition.
         * The RSP will point to the vmx_thread_info.
         */
        ERROR_IF(vmwrite(VMCS_HOST_RIP, (uint64_t) vmx_vmexit));
        ERROR_IF(vmwrite(VMCS_HOST_RSP, (uint64_t) vti));
        ERROR_IF(vmwrite(VMCS_HOST_CR3, (uint64_t) KPML4phys));
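
        /*
         * Because the host RSP is loaded with vti on every vmexit, the
         * exit stub (vmx_vmexit, in vmx_trap.s) finds the
         * vmx_thread_info via %rsp and can save the guest registers
         * relative to it.
         */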

        /*
         * GUEST initialization
         * - set the descriptors according to the conditions from Intel
         *   manual "26.3.1.2 Checks on Guest Segment Registers"
         * - set the privilege to SEL_UPL (the vkernel will run
         *   in userspace context)
         */
        ERROR_IF(vmx_set_guest_descriptor(ES, GSEL(GUDATA_SEL, SEL_UPL),
            VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
            0, 0));

        ERROR_IF(vmx_set_guest_descriptor(SS, GSEL(GUDATA_SEL, SEL_UPL),
            VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
            0, 0));

        ERROR_IF(vmx_set_guest_descriptor(DS, GSEL(GUDATA_SEL, SEL_UPL),
            VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
            0, 0));

        ERROR_IF(vmx_set_guest_descriptor(FS, GSEL(GUDATA_SEL, SEL_UPL),
            VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
            (uint64_t) guest_fs.base, (uint32_t) guest_fs.size));

        ERROR_IF(vmx_set_guest_descriptor(GS, GSEL(GUDATA_SEL, SEL_UPL),
            VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
            (uint64_t) guest_gs.base, (uint32_t) guest_gs.size));

        ERROR_IF(vmx_set_guest_descriptor(CS, GSEL(GUCODE_SEL, SEL_UPL),
            VMCS_SEG_TYPE(11) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P | VMCS_L,
            0, 0));

        ERROR_IF(vmx_set_guest_descriptor(TR, GSEL(GPROC0_SEL, SEL_UPL),
                        VMCS_SEG_TYPE(11) | VMCS_P,
                        0, 0));

        ERROR_IF(vmx_set_guest_descriptor(LDTR, 0, VMCS_SEG_UNUSABLE, 0, 0));

        /* Set the CR0/CR4 registers, removing the unsupported bits */
        ERROR_IF(vmwrite(VMCS_GUEST_CR0, (CR0_PE | CR0_PG |
            cr0_fixed_to_1) & ~cr0_fixed_to_0));
        ERROR_IF(vmwrite(VMCS_GUEST_CR4, (CR4_PAE | CR4_FXSR | CR4_XMM | CR4_XSAVE |
            cr4_fixed_to_1) & ~cr4_fixed_to_0));

        /*
         * Don't set EFER_SCE, so the guest's "syscall" instructions
         * raise #UD and can be caught.
         */
        ERROR_IF(vmwrite(VMCS_GUEST_IA32_EFER, (EFER_LME | EFER_LMA)));

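        /* Bit 1 of RFLAGS is reserved and must always be set. */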
        vti->guest.tf_rflags = PSL_I | 0x02;
        ERROR_IF(vmwrite(VMCS_GUEST_RFLAGS, vti->guest.tf_rflags));

        /* The Guest CR3 indicating the CR3 pagetable */
        ERROR_IF(vmwrite(VMCS_GUEST_CR3, (uint64_t) vti->guest_cr3));

        /* Intercept all possible exceptions */
        ERROR_IF(vmwrite(VMCS_EXCEPTION_BITMAP, (uint64_t) 0xFFFFFFFF));

        /* Guest RIP and RSP */
        ERROR_IF(vmwrite(VMCS_GUEST_RIP, options->tf.tf_rip));
        ERROR_IF(vmwrite(VMCS_GUEST_RSP, options->tf.tf_rsp));

        /*
         * This field is included for future expansion.
         * Software should set this field to FFFFFFFF_FFFFFFFFH
         * to avoid VM-entry failures (see Section 26.3.1.5).
         */
        ERROR_IF(vmwrite(VMCS_LINK_POINTER, ~0ULL));

        /* The pointer to the EPT pagetable */
        ERROR_IF(vmwrite(VMCS_EPTP, vmx_eptp(vti->vmm_cr3)));

        vti->invept_desc.eptp = vmx_eptp(vti->vmm_cr3);

        crit_exit();

        return 0;
error:
        crit_exit();

        kprintf("VMM: vmx_vminit failed\n");
        execute_vmclear(vti);

        kfree(vti->vmcs_region_na, M_TEMP);
        kfree(vti, M_TEMP);
        return err;
}

static int
vmx_vmdestroy(void)
{
        struct vmx_thread_info *vti = curthread->td_vmm;
        struct proc *p = curproc;
        int error = -1;

        if (vti != NULL) {
                vmx_check_cpu_migration();
                if (vti->vmcs_region &&
                    pcpu_info[mycpu->gd_cpuid].loaded_vmx == vti)
                        execute_vmclear(vti);

                if (vti->vmcs_region_na != NULL) {
                        kfree(vti->vmcs_region_na, M_TEMP);
                        kfree(vti, M_TEMP);
                        error = 0;
                }
                curthread->td_vmm = NULL;
                lwkt_gettoken(&p->p_token);
                if (p->p_nthreads == 1) {
                        kfree(p->p_vmm, M_TEMP);
                        p->p_vmm = NULL;
                }
                lwkt_reltoken(&p->p_token);
        }
        return error;
}

/*
 * Checks if we migrated to another cpu
 *
 * No locks are required
 */
static int
vmx_check_cpu_migration(void)
{
        struct vmx_thread_info *vti;
        struct globaldata *gd;
        cpumask_t mask;
        int err;

        gd = mycpu;
        vti = (struct vmx_thread_info *) curthread->td_vmm;
        ERROR_IF(vti == NULL);

        if (vti->last_cpu != -1 && vti->last_cpu != gd->gd_cpuid &&
            pcpu_info[vti->last_cpu].loaded_vmx == vti) {
                /*
                 * Do not reset last_cpu to -1 here, leave it caching
                 * the cpu whose per-cpu fields the VMCS is synchronized
                 * with.  The pcpu_info[] check prevents unnecessary extra
                 * cpusyncs.
                 */
                dkprintf("VMM: cpusync from %d to %d\n",
                         gd->gd_cpuid, vti->last_cpu);

                /* Clear the VMCS area if it ran on another CPU */
                CPUMASK_ASSBIT(mask, vti->last_cpu);
                lwkt_cpusync_simple(mask, execute_vmclear, (void *)vti);
        }
        return 0;
error:
        kprintf("VMM: vmx_check_cpu_migration failed\n");
        return err;
}

/* Handle CPU migration
 *
 * We have to enter with interrupts disabled/in a critical section
 * to be sure that another VMCS won't steal our CPU.
 */
static inline int
vmx_handle_cpu_migration(void)
{
        struct vmx_thread_info *vti;
        struct globaldata *gd;
        int err;

        gd = mycpu;
        vti = (struct vmx_thread_info *) curthread->td_vmm;
        ERROR_IF(vti == NULL);

        if (vti->last_cpu != gd->gd_cpuid) {
                /*
                 * We need to synchronize the per-cpu fields after changing
                 * cpus.
                 */
                dkprintf("VMM: vmx_handle_cpu_migration init per CPU data\n");

                ERROR_IF(execute_vmptrld(vti));

                /* Host related registers */
                ERROR_IF(vmwrite(VMCS_HOST_GS_BASE, (uint64_t) gd)); /* mycpu points to %gs:0 */
                ERROR_IF(vmwrite(VMCS_HOST_TR_BASE, (uint64_t) &gd->gd_prvspace->mdglobaldata.gd_common_tss));

                ERROR_IF(vmwrite(VMCS_HOST_GDTR_BASE, (uint64_t) &gdt[gd->gd_cpuid * NGDT]));
                ERROR_IF(vmwrite(VMCS_HOST_IDTR_BASE, (uint64_t) r_idt_arr[gd->gd_cpuid].rd_base));

                /* Guest related registers */
                ERROR_IF(vmwrite(VMCS_GUEST_GDTR_BASE, (uint64_t) &gdt[gd->gd_cpuid * NGDT]));
                ERROR_IF(vmwrite(VMCS_GUEST_GDTR_LIMIT, (uint64_t) (NGDT * sizeof(gdt[0]) - 1)));

                /*
                 * Indicates which cpu the per-cpu fields are synchronized
                 * with.  Does not indicate whether the vmcs is active on
                 * that particular cpu.
                 */
                vti->last_cpu = gd->gd_cpuid;
        } else if (pcpu_info[gd->gd_cpuid].loaded_vmx != vti) {
                /*
                 * We only need to vmptrld
                 */
                dkprintf("VMM: vmx_handle_cpu_migration: vmcs is not loaded\n");

                ERROR_IF(execute_vmptrld(vti));

        } /* else we don't need to do anything */
        return 0;
error:
        kprintf("VMM: vmx_handle_cpu_migration failed\n");
        return err;
}

/* Load information about the VMEXIT
 *
 * We are still running with interrupts disabled/in a critical section
 * because we must operate with the VMCS on the CPU.
 */
static inline int
vmx_vmexit_loadinfo(void)
{
        struct vmx_thread_info *vti;
        int err;

        vti = (struct vmx_thread_info *) curthread->td_vmm;
        ERROR_IF(vti == NULL);

        ERROR_IF(vmread(VMCS_VMEXIT_REASON, &vti->vmexit_reason));
        ERROR_IF(vmread(VMCS_EXIT_QUALIFICATION, &vti->vmexit_qualification));
        ERROR_IF(vmread(VMCS_VMEXIT_INTERRUPTION_INFO, &vti->vmexit_interruption_info));
        ERROR_IF(vmread(VMCS_VMEXIT_INTERRUPTION_ERROR, &vti->vmexit_interruption_error));
        ERROR_IF(vmread(VMCS_VMEXIT_INSTRUCTION_LENGTH, &vti->vmexit_instruction_length));
        ERROR_IF(vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &vti->guest_physical_address));
        ERROR_IF(vmread(VMCS_GUEST_RIP, &vti->guest.tf_rip));
        ERROR_IF(vmread(VMCS_GUEST_CS_SELECTOR, &vti->guest.tf_cs));
        ERROR_IF(vmread(VMCS_GUEST_RFLAGS, &vti->guest.tf_rflags));
        ERROR_IF(vmread(VMCS_GUEST_RSP, &vti->guest.tf_rsp));
        ERROR_IF(vmread(VMCS_GUEST_SS_SELECTOR, &vti->guest.tf_ss));

        return 0;
error:
        kprintf("VMM: vmx_vmexit_loadinfo failed\n");
        return err;
}

static int
vmx_set_tls_area(void)
{
        struct tls_info *guest_fs = &curthread->td_tls.info[0];
        struct tls_info *guest_gs = &curthread->td_tls.info[1];

        int err;

        dkprintf("VMM: vmx_set_tls_area hook\n");

        crit_enter();

        ERROR_IF(vmx_check_cpu_migration());
        ERROR_IF(vmx_handle_cpu_migration());

        /* set %fs */
        ERROR_IF(vmx_set_guest_descriptor(FS, GSEL(GUDATA_SEL, SEL_UPL),
            VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
            (uint64_t) guest_fs->base, (uint32_t) guest_fs->size));

        /* set %gs */
        ERROR_IF(vmx_set_guest_descriptor(GS, GSEL(GUDATA_SEL, SEL_UPL),
            VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
            (uint64_t) guest_gs->base, (uint32_t) guest_gs->size));

        crit_exit();
        return 0;

error:
        crit_exit();
        return err;
}

static int
vmx_handle_vmexit(void)
{
        struct vmx_thread_info *vti;
        int exit_reason;
        int exception_type;
        int exception_number;
        int err;
        int func, regs[4];
        int fault_type, rv;
        int fault_flags = 0;
        struct lwp *lp = curthread->td_lwp;

        dkprintf("VMM: handle_vmx_vmexit\n");
        vti = (struct vmx_thread_info *) curthread->td_vmm;
        ERROR_IF(vti == NULL);

        exit_reason = VMCS_BASIC_EXIT_REASON(vti->vmexit_reason);
        switch (exit_reason) {
                case EXIT_REASON_EXCEPTION:
                        dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EXCEPTION with qualification "
                            "%llx, interruption info %llx, interruption error %llx, instruction "
                            "length %llx\n",
                            (long long) vti->vmexit_qualification,
                            (long long) vti->vmexit_interruption_info,
                            (long long) vti->vmexit_interruption_error,
                            (long long) vti->vmexit_instruction_length);

                        dkprintf("VMM: handle_vmx_vmexit: rax: %llx, rip: %llx, "
                            "rsp: %llx,  rdi: %llx, rsi: %llx, %d, vti: %p, master: %p\n",
                            (long long)vti->guest.tf_rax,
                            (long long)vti->guest.tf_rip,
                            (long long)vti->guest.tf_rsp,
                            (long long)vti->guest.tf_rdi,
                            (long long)vti->guest.tf_rsi, exit_reason, vti, curproc->p_vmm);

                        exception_type = VMCS_EXCEPTION_TYPE(vti->vmexit_interruption_info);
                        exception_number = VMCS_EXCEPTION_NUMBER(vti->vmexit_interruption_info);

                        if (exception_type == VMCS_EXCEPTION_HARDWARE) {
                                switch (exception_number) {
                                        case IDT_UD:
                                                /*
                                                 * The "syscall" instruction is disabled, so
                                                 * we catch its #UD here to emulate it.
                                                 */
                                                dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE IDT_UD\n");
#ifdef VMM_DEBUG
                                                /* Check to see if it's the syscall asm instruction */
                                                uint8_t instr[INSTRUCTION_MAX_LENGTH];
                                                if (copyin((const void *) vti->guest.tf_rip, instr, vti->vmexit_instruction_length) &&
                                                    instr_check(&syscall_asm,(void *) instr, (uint8_t) vti->vmexit_instruction_length)) {
                                                        kprintf("VMM: handle_vmx_vmexit: UD different from syscall: ");
                                                        db_disasm((db_addr_t) instr, FALSE, NULL);
                                                }
#endif
                                                /* Called to force a VMEXIT and invalidate the TLB */
                                                if (vti->guest.tf_rax == -1) {
                                                        vti->guest.tf_rip += vti->vmexit_instruction_length;
                                                        break;
                                                }

                                                vti->guest.tf_err = 2;
                                                vti->guest.tf_trapno = T_FAST_SYSCALL;
                                                vti->guest.tf_xflags = 0;

                                                vti->guest.tf_rip += vti->vmexit_instruction_length;

                                                syscall2(&vti->guest);

                                                break;
                                        case IDT_PF:
                                                dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE IDT_PF at %llx\n",
                                                    (long long) vti->guest.tf_rip);

                                                if (vti->guest.tf_rip == 0) {
                                                        kprintf("VMM: handle_vmx_vmexit: Terminating...\n");
                                                        err = -1;
                                                        goto error;
                                                }

                                                vti->guest.tf_err = vti->vmexit_interruption_error;
                                                vti->guest.tf_addr = vti->vmexit_qualification;
                                                vti->guest.tf_xflags = 0;
                                                vti->guest.tf_trapno = T_PAGEFLT;

                                                /*
                                                 * If we are a user process in the vkernel,
                                                 * pass the PF to the vkernel; it will
                                                 * trigger user_trap().
                                                 *
                                                 * If we are the vkernel itself, send us a
                                                 * SIGSEGV signal that will trigger the
                                                 * execution of kern_trap().
                                                 */
                                                if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
                                                        vkernel_trap(lp, &vti->guest);
                                                } else {
                                                        trapsignal(lp, SIGSEGV, SEGV_MAPERR);
                                                }

                                                break;
                                        default:
                                                kprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE unknown "
                                                    "number %d rip: %llx, rsp: %llx\n", exception_number,
                                                    (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp);
                                                err = -1;
                                                goto error;
                                }
                        } else if (exception_type == VMCS_EXCEPTION_SOFTWARE) {
                                switch (exception_number) {
                                        case 3:
                                                dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_SOFTWARE "
                                                    "number %d rip: %llx, rsp: %llx\n", exception_number,
                                                    (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp);

                                                vti->guest.tf_trapno = T_BPTFLT;
                                                vti->guest.tf_xflags = 0;
                                                vti->guest.tf_err = 0;
                                                vti->guest.tf_addr = 0;

                                                vti->guest.tf_rip += vti->vmexit_instruction_length;

                                                trap(&vti->guest);

                                                break;
                                        default:
                                                kprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_SOFTWARE unknown "
                                                    "number %d rip: %llx, rsp: %llx\n", exception_number,
                                                    (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp);
                                                err = -1;
                                                goto error;
                                }
                        } else {
                                kprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_ %d unknown\n", exception_type);
                                err = -1;
                                goto error;
                        }
                        break;
                case EXIT_REASON_EXT_INTR:
                        dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EXT_INTR\n");
                        break;
                case EXIT_REASON_CPUID:
                        dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_CPUID\n");

                        /*
                         * Execute the CPUID instruction and pass
                         * the result to the vkernel
                         */
                        func = vti->guest.tf_rax;
                        do_cpuid(func, regs);

                        vti->guest.tf_rax = regs[0];
                        vti->guest.tf_rbx = regs[1];
                        vti->guest.tf_rcx = regs[2];
                        vti->guest.tf_rdx = regs[3];

                        vti->guest.tf_rip += vti->vmexit_instruction_length;

                        break;
                case EXIT_REASON_EPT_FAULT:
                        /*
                         * EPT faults are resolved like normal PFs.  Nothing special:
                         * - get the fault type
                         * - get the fault address (which is a GPA)
                         * - execute vm_fault on the vm_map
                         */
                        fault_type = vmx_ept_fault_type(vti->vmexit_qualification);

                        dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EPT_FAULT with qualification %lld,"
                            "GPA: %llx, fault_type: %d\n", (long long) vti->vmexit_qualification,
                            (unsigned long long) vti->guest_physical_address, fault_type);

                        if (fault_type & VM_PROT_WRITE)
                                fault_flags = VM_FAULT_DIRTY;
                        else
                                fault_flags = VM_FAULT_NORMAL;

                        rv = vm_fault(&curthread->td_lwp->lwp_vmspace->vm_map,
                            trunc_page(vti->guest_physical_address), fault_type, fault_flags);

                        if (rv != KERN_SUCCESS) {
                                kprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EPT_FAULT couldn't resolve %llx\n",
                                    (unsigned long long) vti->guest_physical_address);
                                err = -1;
                                goto error;
                        }
                        break;
                default:
                        kprintf("VMM: handle_vmx_vmexit: unknown exit reason: %d with qualification %lld\n",
                            exit_reason, (long long) vti->vmexit_qualification);
                        err = -1;
                        goto error;
        }
        return 0;
error:
        return err;
}
1322
1323 static int
1324 vmx_vmrun(void)
1325 {
1326         struct vmx_thread_info * vti;
1327         struct globaldata *gd;
1328         int err;
1329         int ret;
1330         int sticks = 0;
1331         uint64_t val;
1332         cpulock_t olock;
1333         cpulock_t nlock;
1334         struct trapframe *save_frame;
1335         thread_t td = curthread;
1336
1337         vti = (struct vmx_thread_info *) td->td_vmm;
1338         save_frame = td->td_lwp->lwp_md.md_regs;
1339         td->td_lwp->lwp_md.md_regs = &vti->guest;
1340 restart:
1341         crit_enter();
1342
1343         /*
1344          * This can change the cpu we are running on.
1345          */
1346         trap_handle_userexit(&vti->guest, sticks);
1347         gd = mycpu;
1348
1349         ERROR2_IF(vti == NULL);
1350         ERROR2_IF(vmx_check_cpu_migration());
1351         ERROR2_IF(vmx_handle_cpu_migration());
1352
1353         /*
1354          * Make the state safe to VMENTER
1355          * - disable interrupts and check if there were any pending
1356          * - check for ASTFLTs
1357          * - loop again until there are no ASTFLTs
1358          */
1359         cpu_disable_intr();
1360         splz();
1361         if (gd->gd_reqflags & RQF_AST_MASK) {
1362                 atomic_clear_int(&gd->gd_reqflags, RQF_AST_SIGNAL);
1363                 cpu_enable_intr();
1364                 crit_exit();
1365                 vti->guest.tf_trapno = T_ASTFLT;
1366                 trap(&vti->guest);
1367                 /* CURRENT CPU CAN CHANGE */
1368                 goto restart;
1369         }
1370         if (vti->last_cpu != gd->gd_cpuid) {
1371                 cpu_enable_intr();
1372                 crit_exit();
1373                 kprintf("VMM: vmx_vmrun: vti unexpectedly "
1374                         "changed cpus %d->%d\n",
1375                         gd->gd_cpuid, vti->last_cpu);
1376                 goto restart;
1377         }
1378
1379         /*
1380          * Add us to the list of cpus running vkernel operations, interlock
1381          * against anyone trying to do an invalidation.
1382          *
1383          * We must set the cpumask first to ensure that we interlock another
1384          * cpu that may desire to IPI us after we have successfully
1385          * incremented the cpulock counter.
1386          */
1387         ATOMIC_CPUMASK_ORBIT(td->td_proc->p_vmm_cpumask, gd->gd_cpuid);
1388
1389         for (;;) {
1390                 olock = td->td_proc->p_vmm_cpulock;
1391                 cpu_ccfence();
1392                 if ((olock & CPULOCK_EXCL) == 0) {
1393                         nlock = olock + CPULOCK_INCR;
1394                         if (atomic_cmpset_int(&td->td_proc->p_vmm_cpulock,
1395                                               olock, nlock)) {
1396                                 /* fast path */
1397                                 break;
1398                         }
1399                         /* cmpset race */
1400                         cpu_pause();
1401                         continue;
1402                 }
1403
1404                 /*
1405                  * Slow path, an invalidator holds CPULOCK_EXCL.
1406                  * After sleeping we have to re-test everything.
1407                  */
1408                 ATOMIC_CPUMASK_NANDBIT(td->td_proc->p_vmm_cpumask,
1409                                        gd->gd_cpuid);
1410                 cpu_enable_intr();
1411                 tsleep_interlock(&td->td_proc->p_vmm_cpulock, 0);
1412                 if (td->td_proc->p_vmm_cpulock & CPULOCK_EXCL) {
1413                         tsleep(&td->td_proc->p_vmm_cpulock, PINTERLOCKED,
1414                                "vmminvl", hz);
1415                 }
1416                 crit_exit();
1417                 goto restart;
1418         }
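
        /*
         * Fast path succeeded: we hold a shared count on p_vmm_cpulock
         * and our bit is set in p_vmm_cpumask, so any cpu wanting to do
         * an invalidation must first force us to VMEXIT and release.
         */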
1419
1420         /*
1421          * Load specific Guest registers
1422          * GP registers will be loaded in vmx_launch/resume
1423          */
1424         ERROR_IF(vmwrite(VMCS_GUEST_RIP, vti->guest.tf_rip));
1425         ERROR_IF(vmwrite(VMCS_GUEST_CS_SELECTOR, vti->guest.tf_cs));
1426         ERROR_IF(vmwrite(VMCS_GUEST_RFLAGS, vti->guest.tf_rflags));
1427         ERROR_IF(vmwrite(VMCS_GUEST_RSP, vti->guest.tf_rsp));
1428         ERROR_IF(vmwrite(VMCS_GUEST_SS_SELECTOR, vti->guest.tf_ss));
1429         ERROR_IF(vmwrite(VMCS_GUEST_CR3, (uint64_t) vti->guest_cr3));
1430
1431         /*
1432          * FPU - take ownership of the FPU before VMENTER, saving any other thread's state first
1433          */
1434         if (mdcpu->gd_npxthread != td) {
1435                 if (mdcpu->gd_npxthread)
1436                         npxsave(mdcpu->gd_npxthread->td_savefpu);
1437                 npxdna();
1438         }
1439
1440         /*
1441          * The kernel caches the MSR_FSBASE value in mdcpu->gd_user_fs.
1442          * A vmexit loads this unconditionally from the VMCS so make
1443          * sure it loads the correct value.
1444          */
1445         ERROR_IF(vmwrite(VMCS_HOST_FS_BASE, mdcpu->gd_user_fs));
1446
1447         /*
1448          * EPT mappings can't be invalidated with the normal invlpg/invltlb
1449          * instructions. We have to execute a special instruction that
1450          * invalidates all cached EPT translations ("invept").
1451          *
1452          * pm_invgen is a generation number incremented by
1453          * pmap_inval_interlock before it does any invalidations. The
1454          * pmap_inval_interlock causes all CPUs using the EPT to VMEXIT
1455          * and wait for the interlock to complete. When they VMENTER
1456          * again they will see that the generation number has changed
1457          * and execute an invept.
1458          */
1459         if (vti->eptgen != td->td_proc->p_vmspace->vm_pmap.pm_invgen) {
1460                 vti->eptgen = td->td_proc->p_vmspace->vm_pmap.pm_invgen;
1461
1462                 ERROR_IF(invept(INVEPT_TYPE_SINGLE_CONTEXT,
1463                     (uint64_t*)&vti->invept_desc));
1464         }
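
        /*
         * (The invept descriptor above is assumed to have been loaded
         * with this guest's EPTP when the VMCS was initialized.)
         */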
1465
1466         if (vti->launched) { /* vmresume called from vmx_trap.s */
1467                 dkprintf("\n\nVMM: vmx_vmrun: vmx_resume\n");
1468                 ret = vmx_resume(vti);
1469
1470         } else { /* vmlaunch called from vmx_trap.s */
1471                 dkprintf("\n\nVMM: vmx_vmrun: vmx_launch\n");
1472                 vti->launched = 1;
1473                 ret = vmx_launch(vti);
1474         }
1475
1476         /*
1477          * This is our return point from vmlaunch/vmresume.
1478          * There are two situations:
1479          * - vmlaunch/vmresume executed successfully and returned
1480          *   through "vmx_vmexit", which restored the register
1481          *   state and landed here with ret set to VM_EXIT
1482          *   (ret is actually %rax)
1483          * - vmlaunch/vmresume failed to execute and returned
1484          *   immediately with ret set to the error code
1485          */
1486         if (ret == VM_EXIT) {
1487                 ERROR_IF(vmx_vmexit_loadinfo());
1488
1489                 ATOMIC_CPUMASK_NANDBIT(td->td_proc->p_vmm_cpumask,
1490                                        gd->gd_cpuid);
1491                 atomic_add_int(&td->td_proc->p_vmm_cpulock,
1492                                -CPULOCK_INCR);
1493                 /* WARNING: don't adjust cpulock twice! */
1494
1495                 cpu_enable_intr();
1496                 trap_handle_userenter(td);
1497                 sticks = td->td_sticks;
1498                 crit_exit();
1499
1500                 /*
1501                  * Handle the VMEXIT reason
1502                  * - if successful we VMENTER again
1503                  * - if not, we exit
1504                  */
1505                 if (vmx_handle_vmexit())
1506                         goto done;
1507
1508                 /*
1509                  * We handled the VMEXIT reason and continue with
1510                  * VM execution
1511                  */
1512                 goto restart;
1513
1514         } else {
1515                 vti->launched = 0;
1516
1517                 /*
1518                  * Two types of error:
1519                  * - VM_FAIL_VALID - the host state was ok but the
1520                  *   guest state probably was not
1521                  * - VM_FAIL_INVALID - the parameters or the host
1522                  *   state were not ok
1523                  */
1524                 if (ret == VM_FAIL_VALID) {
1525                         vmread(VMCS_INSTR_ERR, &val);
1526                         err = (int) val;
1527                         kprintf("VMM: vmx_vmrun: vmenter failed with "
1528                                 "VM_FAIL_VALID, error code %d\n",
1529                                 err);
1530                 } else {
1531                         kprintf("VMM: vmx_vmrun: vmenter failed with "
1532                                 "VM_FAIL_INVALID\n");
1533                 }
1534                 goto error;
1535         }
1536 done:
1537         kprintf("VMM: vmx_vmrun: returning with success\n");
1538         return 0;
1539 error:
1540         ATOMIC_CPUMASK_NANDBIT(td->td_proc->p_vmm_cpumask, gd->gd_cpuid);
1541         atomic_add_int(&td->td_proc->p_vmm_cpulock, -CPULOCK_INCR);
1542         cpu_enable_intr();
1543 error2:
1544         trap_handle_userenter(td);
1545         td->td_lwp->lwp_md.md_regs = save_frame;
1546         KKASSERT(CPUMASK_TESTMASK(td->td_proc->p_vmm_cpumask,
1547                                   gd->gd_cpumask) == 0);
1548         /*atomic_clear_cpumask(&td->td_proc->p_vmm_cpumask, gd->gd_cpumask);*/
1549         crit_exit();
1550         kprintf("VMM: vmx_vmrun failed\n");
1551         return err;
1552 }
1553
1554 /*
1555  * Called when returning to user-space
1556  * after executing lwp_fork.
1557  */
1558 static void
1559 vmx_lwp_return(struct lwp *lp, struct trapframe *frame)
1560 {
1561         struct vmm_guest_options options;
1562         int vmrun_err;
1563         struct vmm_proc *p_vmm = (struct vmm_proc *)curproc->p_vmm;
1564
1565         dkprintf("VMM: vmx_lwp_return\n");
1566
1567         bzero(&options, sizeof(struct vmm_guest_options));
1568
1569         bcopy(frame, &options.tf, sizeof(struct trapframe));
1570
1571         options.guest_cr3 = p_vmm->guest_cr3;
1572         options.vmm_cr3 = p_vmm->vmm_cr3;
1573
1574         vmx_vminit(&options);
1575         generic_lwp_return(lp, frame);
1576
1577         vmrun_err = vmx_vmrun();
1578
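        /*
         * vmx_vmrun() returns only when the VM stops running or fails;
         * propagate its status as the exit code.  exit1() does not return.
         */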
1579         exit1(W_EXITCODE(vmrun_err, 0));
1580 }
1581
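/*
 * Record the guest %cr3 for the current thread; vmx_vmrun() loads it
 * into VMCS_GUEST_CR3 on the next VMENTER.
 */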
1582 static void
1583 vmx_set_guest_cr3(register_t guest_cr3)
1584 {
1585         struct vmx_thread_info *vti = (struct vmx_thread_info *) curthread->td_vmm;
1586         vti->guest_cr3 = guest_cr3;
1587 }
1588
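/*
 * Translate a vkernel user address into a guest physical address by
 * walking the guest page tables rooted at vkernel_cr3.
 */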
1589 static int
1590 vmx_vm_get_gpa(struct proc *p, register_t *gpa, register_t uaddr)
1591 {
1592         return guest_phys_addr(p->p_vmspace, gpa, p->p_vkernel->vkernel_cr3, uaddr);
1593 }
1594
1595 static struct vmm_ctl ctl_vmx = {
1596         .name = "VMX from Intel",
1597         .init = vmx_init,
1598         .enable = vmx_enable,
1599         .disable = vmx_disable,
1600         .vminit = vmx_vminit,
1601         .vmdestroy = vmx_vmdestroy,
1602         .vmrun = vmx_vmrun,
1603         .vm_set_tls_area = vmx_set_tls_area,
1604         .vm_lwp_return = vmx_lwp_return,
1605         .vm_set_guest_cr3 = vmx_set_guest_cr3,
1606         .vm_get_gpa = vmx_vm_get_gpa,
1607 };
1608
1609 struct vmm_ctl *
1610 get_ctl_intel(void)
1611 {
1612         return &ctl_vmx;
1613 }
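
/*
 * A minimal usage sketch (hypothetical caller - the real dispatch
 * lives in the generic vmm layer):
 *
 *	struct vmm_ctl *ctl = get_ctl_intel();
 *
 *	kprintf("VMM: using %s\n", ctl->name);
 *	if (ctl->init() == 0)
 *		ctl->enable();
 */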