2 * Copyright (c) 2003-2013 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Mihai Carabas <mihai.carabas@gmail.com>
6 * by Matthew Dillon <dillon@backplane.com>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
18 * 3. Neither the name of The DragonFly Project nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific, prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 #include <sys/malloc.h>
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/types.h>
40 #include <sys/thread.h>
41 #include <sys/thread2.h>
42 #include <sys/sysctl.h>
45 #include <sys/syscall.h>
47 #include <sys/vkernel.h>
48 #include <sys/mplock2.h>
53 #include <machine/cpufunc.h>
54 #include <machine/cputypes.h>
55 #include <machine/smp.h>
56 #include <machine/globaldata.h>
57 #include <machine/trap.h>
58 #include <machine/pmap.h>
59 #include <machine/md_var.h>
61 #include <vm/vm_map.h>
62 #include <vm/vm_extern.h>
63 #include <vm/vm_param.h>
66 #include "vmm_utils.h"
69 #include "vmx_instr.h"
74 extern void trap(struct trapframe *frame);
76 static int vmx_check_cpu_migration(void);
77 static int execute_vmptrld(struct vmx_thread_info *vti);
79 struct instr_decode syscall_asm = {
/*
 * Per-category VMX control-word state.  Each vmx_ctl_info names the
 * capability MSR and its "TRUE" variant; the TRUE MSR is consulted when
 * IA32_VMX_BASIC advertises TRUE controls (see vmx_set_ctl_setting()).
 */
85 struct vmx_ctl_info vmx_pinbased = {
86 .msr_addr = IA32_VMX_PINBASED_CTLS,
87 .msr_true_addr = IA32_VMX_TRUE_PINBASED_CTLS,
90 struct vmx_ctl_info vmx_procbased = {
91 .msr_addr = IA32_VMX_PROCBASED_CTLS,
92 .msr_true_addr = IA32_VMX_TRUE_PROCBASED_CTLS,
95 struct vmx_ctl_info vmx_procbased2 = {
96 .msr_addr = IA32_VMX_PROCBASED_CTLS2,
/* Secondary controls have no TRUE variant MSR; reuse the base MSR. */
97 .msr_true_addr = IA32_VMX_PROCBASED_CTLS2,
100 struct vmx_ctl_info vmx_exit = {
101 .msr_addr = IA32_VMX_EXIT_CTLS,
102 .msr_true_addr = IA32_VMX_TRUE_EXIT_CTLS,
105 struct vmx_ctl_info vmx_entry = {
106 .msr_addr = IA32_VMX_ENTRY_CTLS,
107 .msr_true_addr = IA32_VMX_TRUE_ENTRY_CTLS,
110 /* Declared in generic vmm.c - SYSCTL parent */
111 extern struct sysctl_oid *vmm_sysctl_tree;
113 /* SYSCTL tree and context */
114 static struct sysctl_oid *vmx_sysctl_tree;
115 static struct sysctl_ctx_list vmx_sysctl_ctx;
/* Per-cpu VMXON region bookkeeping, allocated in alloc_vmxon_regions(). */
118 struct vmx_pcpu_info *pcpu_info;
/* Values decoded from IA32_VMX_BASIC (see the init path). */
121 uint32_t vmx_revision;
122 uint32_t vmx_region_size;
123 uint8_t vmx_width_addr;
125 /* IA32_VMX_EPT_VPID_CAP */
126 uint64_t vmx_ept_vpid_cap;
/*
 * CR0/CR4 bits that must be forced to 0 / to 1 while in VMX operation,
 * computed from the IA32_VMX_CR{0,4}_FIXED{0,1} MSRs (SDM Vol 3, A.7/A.8).
 */
129 uint64_t cr0_fixed_to_0;
130 uint64_t cr4_fixed_to_0;
131 uint64_t cr0_fixed_to_1;
132 uint64_t cr4_fixed_to_1;
/* Module state flags: set by the init and enable paths respectively. */
135 static uint8_t vmx_enabled = 0;
136 static uint8_t vmx_initialized = 0;
138 /* VMX set control setting
139 * Intel System Programming Guide, Part 3, Order Number 326019
140 * 31.5.1 Algorithms for Determining VMX Capabilities
141 * Implement Algorithm 3
 *
 * Set or clear bit "bit_no" in vmx_ctl->ctls according to "value"
 * (ZERO / ONE / DEFAULT), honoring what the capability MSR allows.
 * Callers treat a non-zero result as "setting not supported" --
 * see the init path's err checks.
144 vmx_set_ctl_setting(struct vmx_ctl_info *vmx_ctl, uint32_t bit_no, setting_t value) {
148 /* Check if its branch b. or c. */
149 vmx_basic = rdmsr(IA32_VMX_BASIC);
150 if (IS_TRUE_CTL_AVAIL(vmx_basic))
151 ctl_val = rdmsr(vmx_ctl->msr_true_addr);
153 ctl_val = rdmsr(vmx_ctl->msr_addr);
155 /* Check if the value is known by VMM or set on DEFAULT */
158 /* Both settings are allowed
163 if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)
164 && IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) {
166 /* For c.iii) and c.iv) */
167 if(IS_TRUE_CTL_AVAIL(vmx_basic))
168 ctl_val = rdmsr(vmx_ctl->msr_addr)
170 if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no))
171 vmx_ctl->ctls &= ~BIT(bit_no);
172 else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no))
173 vmx_ctl->ctls |= BIT(bit_no);
175 } else if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)) {
177 vmx_ctl->ctls &= ~BIT(bit_no);
179 } else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) {
181 vmx_ctl->ctls |= BIT(bit_no);
188 /* For b.ii) or c.ii) */
/* Caller asked for ZERO: fail unless the MSR permits a zero setting. */
189 if (!IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no))
192 vmx_ctl->ctls &= ~BIT(bit_no);
196 /* For b.ii) or c.ii) */
/* Caller asked for ONE: fail unless the MSR permits a one setting. */
197 if (!IS_ONE_SETTING_ALLOWED(ctl_val, bit_no))
200 vmx_ctl->ctls |= BIT(bit_no);
/* Apply the DEFAULT policy to all 32 bits of one VMX control word. */
208 vmx_set_default_settings(struct vmx_ctl_info *vmx_ctl)
212 for(i = 0; i < 32; i++) {
213 vmx_set_ctl_setting(vmx_ctl, i, DEFAULT);
/*
 * Allocate one VMXON region per cpu.  Each region is over-allocated by
 * VMXON_REGION_ALIGN_SIZE so the usable pointer can be aligned to a 4K
 * boundary, and is stamped with the VMX revision id before use.
 */
218 alloc_vmxon_regions(void)
221 pcpu_info = kmalloc(ncpus * sizeof(struct vmx_pcpu_info), M_TEMP, M_WAITOK | M_ZERO);
223 for (cpu = 0; cpu < ncpus; cpu++) {
225 /* The address must be aligned to 4K - alloc extra */
226 pcpu_info[cpu].vmxon_region_na = kmalloc(vmx_region_size + VMXON_REGION_ALIGN_SIZE,
231 pcpu_info[cpu].vmxon_region = (unsigned char*) VMXON_REGION_ALIGN(pcpu_info[cpu].vmxon_region_na);
233 /* In the first 31 bits put the vmx revision*/
234 *((uint32_t *) pcpu_info[cpu].vmxon_region) = vmx_revision;
/*
 * Release the per-cpu VMXON regions allocated by alloc_vmxon_regions().
 * The unaligned (_na) pointer is the one that was kmalloc'd, so that is
 * what gets freed; the aligned alias is just NULLed first.
 */
239 free_vmxon_regions(void)
243 for (i = 0; i < ncpus; i++) {
244 pcpu_info[i].vmxon_region = NULL;
246 kfree(pcpu_info[i].vmxon_region_na, M_TEMP);
249 kfree(pcpu_info, M_TEMP);
/*
 * Publish the detected VMX capabilities read-only under the generic
 * vmm sysctl tree (revision, region size, address width, and the
 * resolved control words).
 */
253 build_vmx_sysctl(void)
255 sysctl_ctx_init(&vmx_sysctl_ctx);
256 vmx_sysctl_tree = SYSCTL_ADD_NODE(&vmx_sysctl_ctx,
257 SYSCTL_CHILDREN(vmm_sysctl_tree),
259 CTLFLAG_RD, 0, "VMX options");
261 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
262 SYSCTL_CHILDREN(vmx_sysctl_tree),
263 OID_AUTO, "revision", CTLFLAG_RD,
266 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
267 SYSCTL_CHILDREN(vmx_sysctl_tree),
268 OID_AUTO, "region_size", CTLFLAG_RD,
271 SYSCTL_ADD_INT(&vmx_sysctl_ctx,
272 SYSCTL_CHILDREN(vmx_sysctl_tree),
273 OID_AUTO, "width_addr", CTLFLAG_RD,
275 "VMX width address");
276 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
277 SYSCTL_CHILDREN(vmx_sysctl_tree),
278 OID_AUTO, "pinbased_ctls", CTLFLAG_RD,
279 &vmx_pinbased.ctls, 0,
280 "VMX pin-based controls");
281 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
282 SYSCTL_CHILDREN(vmx_sysctl_tree),
283 OID_AUTO, "procbased_ctls", CTLFLAG_RD,
284 &vmx_procbased.ctls, 0,
285 "VMX primary processor-based controls");
286 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
287 SYSCTL_CHILDREN(vmx_sysctl_tree),
288 OID_AUTO, "procbased2_ctls", CTLFLAG_RD,
289 &vmx_procbased2.ctls, 0,
290 "VMX secondary processor-based controls");
291 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
292 SYSCTL_CHILDREN(vmx_sysctl_tree),
293 OID_AUTO, "vmexit_ctls", CTLFLAG_RD,
295 "VMX exit controls");
296 SYSCTL_ADD_UINT(&vmx_sysctl_ctx,
297 SYSCTL_CHILDREN(vmx_sysctl_tree),
298 OID_AUTO, "vmentry_ctls", CTLFLAG_RD,
300 "VMX entry controls");
301 SYSCTL_ADD_ULONG(&vmx_sysctl_ctx,
302 SYSCTL_CHILDREN(vmx_sysctl_tree),
303 OID_AUTO, "ept_vpid_cap", CTLFLAG_RD,
/*
 * NOTE(review): the function header is not visible in this chunk --
 * presumably this is the VMX module init path.  It probes CPUID for VMX,
 * resolves every control word, verifies BIOS/feature-control state, and
 * caches the IA32_VMX_BASIC and CR fixed-bit values in the globals above.
 */
313 uint64_t feature_control;
314 uint64_t vmx_basic_value;
315 uint64_t cr0_fixed_bits_to_1;
316 uint64_t cr0_fixed_bits_to_0;
317 uint64_t cr4_fixed_bits_to_0;
318 uint64_t cr4_fixed_bits_to_1;
324 * The ability of a processor to support VMX operation
325 * and related instructions is indicated by:
326 * CPUID.1:ECX.VMX[bit 5] = 1
328 if (!(cpu_feature2 & CPUID2_VMX)) {
329 kprintf("VMM: VMX is not supported by this Intel CPU\n");
333 vmx_set_default_settings(&vmx_pinbased);
335 vmx_set_default_settings(&vmx_procbased);
336 /* Enable second level for procbased */
337 err = vmx_set_ctl_setting(&vmx_procbased,
338 PROCBASED_ACTIVATE_SECONDARY_CONTROLS,
341 kprintf("VMM: PROCBASED_ACTIVATE_SECONDARY_CONTROLS not "
342 "supported by this CPU\n");
345 vmx_set_default_settings(&vmx_procbased2);
347 vmx_set_default_settings(&vmx_exit);
348 vmx_set_default_settings(&vmx_entry);
350 /* Enable external interrupts exiting */
351 err = vmx_set_ctl_setting(&vmx_pinbased,
352 PINBASED_EXTERNAL_INTERRUPT_EXITING,
355 kprintf("VMM: PINBASED_EXTERNAL_INTERRUPT_EXITING not "
356 "supported by this CPU\n");
360 /* Enable non-maskable interrupts exiting */
361 err = vmx_set_ctl_setting(&vmx_pinbased, PINBASED_NMI_EXITING, ONE);
363 kprintf("VMM: PINBASED_NMI_EXITING not "
364 "supported by this CPU\n");
369 /* Set 64bits mode for GUEST */
370 err = vmx_set_ctl_setting(&vmx_entry, VMENTRY_IA32e_MODE_GUEST, ONE);
372 kprintf("VMM: VMENTRY_IA32e_MODE_GUEST not "
373 "supported by this CPU\n");
377 /* Load MSR EFER on entry */
378 err = vmx_set_ctl_setting(&vmx_entry,
379 VMENTRY_LOAD_IA32_EFER, ONE);
381 kprintf("VMM: VMENTRY_LOAD_IA32_EFER not "
382 "supported by this CPU\n");
386 /* Set 64bits mode */
387 err = vmx_set_ctl_setting(&vmx_exit,
388 VMEXIT_HOST_ADDRESS_SPACE_SIZE, ONE);
390 kprintf("VMM: VMEXIT_HOST_ADDRESS_SPACE_SIZE not "
391 "supported by this CPU\n");
395 /* Save/Load Efer on exit */
396 err = vmx_set_ctl_setting(&vmx_exit,
397 VMEXIT_SAVE_IA32_EFER,
400 kprintf("VMM: VMEXIT_SAVE_IA32_EFER not "
401 "supported by this CPU\n");
405 /* Load Efer on exit */
406 err = vmx_set_ctl_setting(&vmx_exit,
407 VMEXIT_LOAD_IA32_EFER,
410 kprintf("VMM: VMEXIT_LOAD_IA32_EFER not "
411 "supported by this CPU\n");
415 /* Enable EPT feature */
416 err = vmx_set_ctl_setting(&vmx_procbased2,
417 PROCBASED2_ENABLE_EPT,
420 kprintf("VMM: PROCBASED2_ENABLE_EPT not "
421 "supported by this CPU\n");
425 if (vmx_ept_init()) {
426 kprintf("VMM: vmx_ept_init failed\n");
430 /* XXX - to implement in the future */
431 /* Enable VPID feature */
432 err = vmx_set_ctl_setting(&vmx_procbased2,
433 PROCBASED2_ENABLE_VPID,
436 kprintf("VMM: PROCBASED2_ENABLE_VPID not "
437 "supported by this CPU\n");
442 /* Check for the feature control status */
443 feature_control = rdmsr(IA32_FEATURE_CONTROL);
444 if (!(feature_control & BIT(FEATURE_CONTROL_LOCKED))) {
445 kprintf("VMM: IA32_FEATURE_CONTROL is not locked\n");
448 if (!(feature_control & BIT(FEATURE_CONTROL_VMX_BIOS_ENABLED))) {
449 kprintf("VMM: VMX is disabled by the BIOS\n");
/* Decode region size / address width / revision from IA32_VMX_BASIC. */
453 vmx_basic_value = rdmsr(IA32_VMX_BASIC);
454 vmx_width_addr = (uint8_t) VMX_WIDTH_ADDR(vmx_basic_value);
455 vmx_region_size = (uint32_t) VMX_REGION_SIZE(vmx_basic_value);
456 vmx_revision = (uint32_t) VMX_REVISION(vmx_basic_value);
458 /* A.7 VMX-FIXED BITS IN CR0 */
/*
 * A bit is fixed to 1 if it reads as 1 in both FIXED0 and FIXED1;
 * fixed to 0 if it reads as 0 in both (Intel SDM Vol 3, App. A.7/A.8).
 */
459 cr0_fixed_bits_to_1 = rdmsr(IA32_VMX_CR0_FIXED0);
460 cr0_fixed_bits_to_0 = rdmsr(IA32_VMX_CR0_FIXED1);
461 cr0_fixed_to_1 = cr0_fixed_bits_to_1 & cr0_fixed_bits_to_0;
462 cr0_fixed_to_0 = ~cr0_fixed_bits_to_1 & ~cr0_fixed_bits_to_0;
464 /* A.8 VMX-FIXED BITS IN CR4 */
465 cr4_fixed_bits_to_1 = rdmsr(IA32_VMX_CR4_FIXED0);
466 cr4_fixed_bits_to_0 = rdmsr(IA32_VMX_CR4_FIXED1);
467 cr4_fixed_to_1 = cr4_fixed_bits_to_1 & cr4_fixed_bits_to_0;
468 cr4_fixed_to_0 = ~cr4_fixed_bits_to_1 & ~cr4_fixed_bits_to_0;
/*
 * cpusync callback: enter VMX operation on the current cpu.
 * Applies the CR0/CR4 fixed-bit constraints, sets CR4.VMXE, then
 * executes VMXON on this cpu's region.  The result is reported back
 * through *perr.
 */
477 execute_vmxon(void *perr)
479 unsigned char *vmxon_region;
480 int *err = (int*) perr;
482 /* A.7 VMX-FIXED BITS IN CR0 */
483 load_cr0((rcr0() | cr0_fixed_to_1) & ~cr0_fixed_to_0);
485 /* A.8 VMX-FIXED BITS IN CR4 */
486 load_cr4((rcr4() | cr4_fixed_to_1) & ~cr4_fixed_to_0);
/* CR4.VMXE must be set before VMXON is legal. */
489 load_cr4(rcr4() | CR4_VMXE);
491 vmxon_region = pcpu_info[mycpuid].vmxon_region;
492 *err = vmxon(vmxon_region);
494 kprintf("VMM: vmxon failed on cpu%d\n", mycpuid);
/*
 * cpusync callback: leave VMX operation on the current cpu.
 * Invalidates all EPT contexts first, then clears CR4.VMXE.
 * (The "invet" spelling in the message below is a pre-existing typo in
 * a runtime string, left as-is.)
 */
499 execute_vmxoff(void *dummy)
501 invept_desc_t desc = { 0 };
503 if (invept(INVEPT_TYPE_ALL_CONTEXTS, (uint64_t*) &desc))
504 kprintf("VMM: execute_vmxoff: invet failed on cpu%d\n", mycpu->gd_cpuid);
509 load_cr4(rcr4() & ~CR4_VMXE);
/*
 * cpusync callback: vmclear the given thread's VMCS if it is the one
 * currently loaded on this cpu, then drop the loaded_vmx claim.
 */
513 execute_vmclear(void *data)
515 struct vmx_thread_info *vti = data;
517 globaldata_t gd = mycpu;
519 if (pcpu_info[gd->gd_cpuid].loaded_vmx == vti) {
521 * Must set vti->launched to zero after vmclear'ing to
522 * force a vmlaunch the next time.
524 * Must not clear the loaded_vmx field until after we call
525 * vmclear on the region. This field triggers the interlocked
526 * cpusync from another cpu trying to destroy or reuse
527 * the vti. If we clear the field first, the other cpu will
528 * not interlock and may race our vmclear() on the underlying
531 ERROR_IF(vmclear(vti->vmcs_region));
533 pcpu_info[gd->gd_cpuid].loaded_vmx = NULL;
/*
 * Make vti's VMCS the current one on this cpu, vmclear'ing any other
 * VMCS that is still loaded here first.
 */
540 execute_vmptrld(struct vmx_thread_info *vti)
542 globaldata_t gd = mycpu;
545 * Must vmclear previous active vmcs if it is different.
547 if (pcpu_info[gd->gd_cpuid].loaded_vmx &&
548 pcpu_info[gd->gd_cpuid].loaded_vmx != vti)
549 execute_vmclear(pcpu_info[gd->gd_cpuid].loaded_vmx);
552 * Make this the current VMCS. Must set loaded_vmx field
553 * before calling vmptrld() to avoid races against cpusync.
555 * Must set vti->launched to zero after the vmptrld to force
558 if (pcpu_info[gd->gd_cpuid].loaded_vmx != vti) {
560 pcpu_info[gd->gd_cpuid].loaded_vmx = vti;
561 return (vmptrld(vti->vmcs_region));
/*
 * NOTE(review): function header not visible in this chunk -- presumably
 * vmx_enable().  Allocates the per-cpu VMXON regions and performs VMXON
 * on every cpu via cpusync.
 */
573 if (!vmx_initialized) {
574 kprintf("VMM: vmx_enable - not allowed; vmx not initialized\n");
579 kprintf("VMM: vmx_enable - already enabled\n");
583 alloc_vmxon_regions();
584 for (cpu = 0; cpu < ncpus; cpu++) {
/* Run execute_vmxon() on exactly one target cpu at a time. */
588 CPUMASK_ASSBIT(mask, cpu);
589 lwkt_cpusync_simple(mask, execute_vmxon, &err);
591 kprintf("VMM: vmx_enable error %d on cpu%d\n", err, cpu);
/*
 * NOTE(review): function header not visible in this chunk -- presumably
 * vmx_disable().  Performs VMXOFF on every cpu, then frees the regions.
 */
605 kprintf("VMM: vmx_disable not allowed; vmx wasn't enabled\n");
608 for (cpu = 0; cpu < ncpus; cpu++) {
611 CPUMASK_ASSBIT(mask, cpu);
612 lwkt_cpusync_simple(mask, execute_vmxoff, NULL);
615 free_vmxon_regions();
/*
 * Write one guest segment descriptor (selector, access rights, base,
 * limit) into the current VMCS, selecting the VMCS field encodings by
 * descriptor type.  Returns non-zero (via the error path) on an unknown
 * descriptor or a failed vmwrite.
 */
622 static int vmx_set_guest_descriptor(descriptor_t type,
636 * Intel Manual Vol 3C. - page 60
637 * If any bit in the limit field in the range 11:0 is 0, G must be 0.
638 * If any bit in the limit field in the range 31:20 is 1, G must be 1.
640 if ((~rights & VMCS_SEG_UNUSABLE) || (type == CS)) {
641 if ((limit & 0xfff) != 0xfff)
643 else if ((limit & 0xfff00000) != 0)
/* Map the descriptor type to its four VMCS field encodings. */
649 selector_enc = VMCS_GUEST_ES_SELECTOR;
650 rights_enc = VMCS_GUEST_ES_ACCESS_RIGHTS;
651 base_enc = VMCS_GUEST_ES_BASE;
652 limit_enc = VMCS_GUEST_ES_LIMIT;
655 selector_enc = VMCS_GUEST_CS_SELECTOR;
656 rights_enc = VMCS_GUEST_CS_ACCESS_RIGHTS;
657 base_enc = VMCS_GUEST_CS_BASE;
658 limit_enc = VMCS_GUEST_CS_LIMIT;
661 selector_enc = VMCS_GUEST_SS_SELECTOR;
662 rights_enc = VMCS_GUEST_SS_ACCESS_RIGHTS;
663 base_enc = VMCS_GUEST_SS_BASE;
664 limit_enc = VMCS_GUEST_SS_LIMIT;
667 selector_enc = VMCS_GUEST_DS_SELECTOR;
668 rights_enc = VMCS_GUEST_DS_ACCESS_RIGHTS;
669 base_enc = VMCS_GUEST_DS_BASE;
670 limit_enc = VMCS_GUEST_DS_LIMIT;
673 selector_enc = VMCS_GUEST_FS_SELECTOR;
674 rights_enc = VMCS_GUEST_FS_ACCESS_RIGHTS;
675 base_enc = VMCS_GUEST_FS_BASE;
676 limit_enc = VMCS_GUEST_FS_LIMIT;
679 selector_enc = VMCS_GUEST_GS_SELECTOR;
680 rights_enc = VMCS_GUEST_GS_ACCESS_RIGHTS;
681 base_enc = VMCS_GUEST_GS_BASE;
682 limit_enc = VMCS_GUEST_GS_LIMIT;
685 selector_enc = VMCS_GUEST_LDTR_SELECTOR;
686 rights_enc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
687 base_enc = VMCS_GUEST_LDTR_BASE;
688 limit_enc = VMCS_GUEST_LDTR_LIMIT;
691 selector_enc = VMCS_GUEST_TR_SELECTOR;
692 rights_enc = VMCS_GUEST_TR_ACCESS_RIGHTS;
693 base_enc = VMCS_GUEST_TR_BASE;
694 limit_enc = VMCS_GUEST_TR_LIMIT;
697 kprintf("VMM: vmx_set_guest_descriptor: unknown descriptor\n");
703 ERROR_IF(vmwrite(selector_enc, selector));
704 ERROR_IF(vmwrite(rights_enc, rights));
705 ERROR_IF(vmwrite(base_enc, base));
706 ERROR_IF(vmwrite(limit_enc, limit));
710 kprintf("VMM: vmx_set_guest_descriptor failed\n");
715 * Called by the first thread of the VMM process
716 * - create a new vmspace
717 * - init the vmspace with EPT PG_* bits and
718 * EPT copyin/copyout functions
719 * - replace the vmspace of the current proc
720 * - remove the old vmspace
723 vmx_vminit_master(struct vmm_guest_options *options)
725 struct vmspace *oldvmspace;
726 struct vmspace *newvmspace;
727 struct proc *p = curthread->td_proc;
728 struct vmm_proc *p_vmm;
730 oldvmspace = curthread->td_lwp->lwp_vmspace;
731 newvmspace = vmspace_fork(oldvmspace);
/* Re-initialize the forked pmap for EPT and start with an empty PML4. */
733 vmx_ept_pmap_pinit(vmspace_pmap(newvmspace));
734 bzero(vmspace_pmap(newvmspace)->pm_pml4, PAGE_SIZE);
/* Hold both map tokens while swapping the process vmspace. */
736 lwkt_gettoken(&oldvmspace->vm_map.token);
737 lwkt_gettoken(&newvmspace->vm_map.token);
739 pmap_pinit2(vmspace_pmap(newvmspace));
740 pmap_replacevm(curthread->td_proc, newvmspace, 0);
742 lwkt_reltoken(&newvmspace->vm_map.token);
743 lwkt_reltoken(&oldvmspace->vm_map.token);
745 vmspace_rel(oldvmspace);
/* Physical address of the new (EPT) PML4 becomes the vmm_cr3. */
747 options->vmm_cr3 = vtophys(vmspace_pmap(newvmspace)->pm_pml4);
749 p_vmm = kmalloc(sizeof(struct vmm_proc), M_TEMP, M_WAITOK | M_ZERO);
750 p_vmm->guest_cr3 = options->guest_cr3;
751 p_vmm->vmm_cr3 = options->vmm_cr3;
752 p->p_vmm = (void *)p_vmm;
755 p->p_vkernel->vkernel_cr3 = options->guest_cr3;
756 dkprintf("PROCESS CR3 %016jx\n", (intmax_t)options->guest_cr3);
/*
 * Per-thread VMX initialization: allocate and initialize this thread's
 * vmx_thread_info and VMCS, program the host and guest state, and hand
 * the master thread off to vmx_vminit_master() for vmspace replacement.
 * On error, the VMCS is vmcleared and the allocation freed.
 */
763 vmx_vminit(struct vmm_guest_options *options)
765 struct vmx_thread_info * vti;
767 struct tls_info guest_fs = curthread->td_tls.info[0];
768 struct tls_info guest_gs = curthread->td_tls.info[1];
771 vti = kmalloc(sizeof(struct vmx_thread_info), M_TEMP, M_WAITOK | M_ZERO);
772 curthread->td_vmm = (void*) vti;
774 if (options->master) {
775 vmx_vminit_master(options);
/* Seed the guest register state from the caller-supplied trapframe. */
778 bcopy(&options->tf, &vti->guest, sizeof(struct trapframe));
781 * Be sure we return success if the VMM hook enters
783 vti->guest.tf_rax = 0;
784 vti->guest.tf_rflags &= ~PSL_C;
/* VMCS region: over-allocate so it can be 4K-aligned, like VMXON. */
786 vti->vmcs_region_na = kmalloc(vmx_region_size + VMXON_REGION_ALIGN_SIZE,
791 vti->vmcs_region = (unsigned char*) VMXON_REGION_ALIGN(vti->vmcs_region_na);
794 vti->guest_cr3 = options->guest_cr3;
795 vti->vmm_cr3 = options->vmm_cr3;
797 /* In the first 31 bits put the vmx revision*/
798 *((uint32_t *)vti->vmcs_region) = vmx_revision;
801 * vmclear the vmcs to initialize it.
803 ERROR_IF(vmclear(vti->vmcs_region));
807 ERROR_IF(execute_vmptrld(vti));
809 /* Load the VMX controls */
810 ERROR_IF(vmwrite(VMCS_PINBASED_CTLS, vmx_pinbased.ctls));
811 ERROR_IF(vmwrite(VMCS_PROCBASED_CTLS, vmx_procbased.ctls));
812 ERROR_IF(vmwrite(VMCS_PROCBASED2_CTLS, vmx_procbased2.ctls));
813 ERROR_IF(vmwrite(VMCS_VMEXIT_CTLS, vmx_exit.ctls));
814 ERROR_IF(vmwrite(VMCS_VMENTRY_CTLS, vmx_entry.ctls));
817 ERROR_IF(vmwrite(VMCS_HOST_CR0, rcr0()));
818 ERROR_IF(vmwrite(VMCS_HOST_CR4, rcr4()));
820 /* Load HOST EFER and PAT */
821 // ERROR_IF(vmwrite(VMCS_HOST_IA32_PAT, rdmsr(MSR_PAT)));
822 ERROR_IF(vmwrite(VMCS_HOST_IA32_EFER, rdmsr(MSR_EFER)));
824 /* Load HOST selectors */
825 ERROR_IF(vmwrite(VMCS_HOST_ES_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
826 ERROR_IF(vmwrite(VMCS_HOST_SS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
827 ERROR_IF(vmwrite(VMCS_HOST_FS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
828 ERROR_IF(vmwrite(VMCS_HOST_GS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
829 ERROR_IF(vmwrite(VMCS_HOST_CS_SELECTOR, GSEL(GCODE_SEL, SEL_KPL)));
830 ERROR_IF(vmwrite(VMCS_HOST_TR_SELECTOR, GSEL(GPROC0_SEL, SEL_KPL)));
833 * The BASE addresses are written on each VMRUN in case
834 * the CPU changes because are per-CPU values
838 * Call vmx_vmexit on VM_EXIT condition
839 * The RSP will point to the vmx_thread_info
841 ERROR_IF(vmwrite(VMCS_HOST_RIP, (uint64_t) vmx_vmexit));
842 ERROR_IF(vmwrite(VMCS_HOST_RSP, (uint64_t) vti));
843 ERROR_IF(vmwrite(VMCS_HOST_CR3, (uint64_t) KPML4phys));
846 * GUEST initialization
847 * - set the descriptors according the conditions from Intel
848 * manual "26.3.1.2 Checks on Guest Segment Registers
849 * - set the privilege to SEL_UPL (the vkernel will run
850 * in userspace context)
852 ERROR_IF(vmx_set_guest_descriptor(ES, GSEL(GUDATA_SEL, SEL_UPL),
853 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
856 ERROR_IF(vmx_set_guest_descriptor(SS, GSEL(GUDATA_SEL, SEL_UPL),
857 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
860 ERROR_IF(vmx_set_guest_descriptor(DS, GSEL(GUDATA_SEL, SEL_UPL),
861 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
/* FS/GS bases come from the thread's TLS descriptors. */
864 ERROR_IF(vmx_set_guest_descriptor(FS, GSEL(GUDATA_SEL, SEL_UPL),
865 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
866 (uint64_t) guest_fs.base, (uint32_t) guest_fs.size));
868 ERROR_IF(vmx_set_guest_descriptor(GS, GSEL(GUDATA_SEL, SEL_UPL),
869 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
870 (uint64_t) guest_gs.base, (uint32_t) guest_gs.size));
872 ERROR_IF(vmx_set_guest_descriptor(CS, GSEL(GUCODE_SEL, SEL_UPL),
873 VMCS_SEG_TYPE(11) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P | VMCS_L,
876 ERROR_IF(vmx_set_guest_descriptor(TR, GSEL(GPROC0_SEL, SEL_UPL),
877 VMCS_SEG_TYPE(11) | VMCS_P,
880 ERROR_IF(vmx_set_guest_descriptor(LDTR, 0, VMCS_SEG_UNUSABLE, 0, 0));
882 /* Set the CR0/CR4 registers, removing the unsupported bits */
883 ERROR_IF(vmwrite(VMCS_GUEST_CR0, (CR0_PE | CR0_PG |
884 cr0_fixed_to_1) & ~cr0_fixed_to_0));
885 ERROR_IF(vmwrite(VMCS_GUEST_CR4, (CR4_PAE | CR4_FXSR | CR4_XMM | CR4_XSAVE |
886 cr4_fixed_to_1) & ~ cr4_fixed_to_0));
888 /* Don't set EFER_SCE for catching "syscall" instructions */
889 ERROR_IF(vmwrite(VMCS_GUEST_IA32_EFER, (EFER_LME | EFER_LMA)));
/* PSL_I + the always-one reserved flag bit (0x02). */
891 vti->guest.tf_rflags = PSL_I | 0x02;
892 ERROR_IF(vmwrite(VMCS_GUEST_RFLAGS, vti->guest.tf_rflags));
894 /* The Guest CR3 indicating CR3 pagetable */
895 ERROR_IF(vmwrite(VMCS_GUEST_CR3, (uint64_t) vti->guest_cr3));
897 /* Throw all possible exceptions */
898 ERROR_IF(vmwrite(VMCS_EXCEPTION_BITMAP,(uint64_t) 0xFFFFFFFF));
900 /* Guest RIP and RSP */
901 ERROR_IF(vmwrite(VMCS_GUEST_RIP, options->tf.tf_rip));
902 ERROR_IF(vmwrite(VMCS_GUEST_RSP, options->tf.tf_rsp));
905 * This field is included for future expansion.
906 * Software should set this field to FFFFFFFF_FFFFFFFFH
907 * to avoid VM-entry failures (see Section 26.3.1.5).
909 ERROR_IF(vmwrite(VMCS_LINK_POINTER, ~0ULL));
911 /* The pointer to the EPT pagetable */
912 ERROR_IF(vmwrite(VMCS_EPTP, vmx_eptp(vti->vmm_cr3)));
914 vti->invept_desc.eptp = vmx_eptp(vti->vmm_cr3);
/* Error path: tear down the partially initialized VMCS. */
922 kprintf("VMM: vmx_vminit failed\n");
923 execute_vmclear(vti);
925 kfree(vti->vmcs_region_na, M_TEMP);
/*
 * NOTE(review): function header not visible in this chunk -- presumably
 * the per-thread destroy path.  Clears the VMCS if loaded on this cpu,
 * frees the VMCS region, and frees the per-process vmm state when the
 * last thread exits.
 */
933 struct vmx_thread_info *vti = curthread->td_vmm;
934 struct proc *p = curproc;
938 vmx_check_cpu_migration();
939 if (vti->vmcs_region &&
940 pcpu_info[mycpu->gd_cpuid].loaded_vmx == vti)
941 execute_vmclear(vti);
943 if (vti->vmcs_region_na != NULL) {
944 kfree(vti->vmcs_region_na, M_TEMP);
948 curthread->td_vmm = NULL;
/* p_token guards p_nthreads / p_vmm teardown. */
949 lwkt_gettoken(&p->p_token);
950 if (p->p_nthreads == 1) {
951 kfree(p->p_vmm, M_TEMP);
959 * Checks if we migrated to another cpu
 *
 * If our VMCS is still loaded on the previous cpu, cpusync over there
 * and vmclear it so it can be loaded here.
961 * No locks are required
964 vmx_check_cpu_migration(void)
966 struct vmx_thread_info * vti;
967 struct globaldata *gd;
972 vti = (struct vmx_thread_info *) curthread->td_vmm;
973 ERROR_IF(vti == NULL);
975 if (vti->last_cpu != -1 && vti->last_cpu != gd->gd_cpuid &&
976 pcpu_info[vti->last_cpu].loaded_vmx == vti) {
978 * Do not reset last_cpu to -1 here, leave it caching
979 * the cpu whose per-cpu fields the VMCS is synchronized
980 * with. The pcpu_info[] check prevents unnecessary extra
983 dkprintf("VMM: cpusync from %d to %d\n",
984 gd->gd_cpuid, vti->last_cpu);
986 /* Clear the VMCS area if ran on another CPU */
987 CPUMASK_ASSBIT(mask, vti->last_cpu);
988 lwkt_cpusync_simple(mask, execute_vmclear, (void *)vti);
992 kprintf("VMM: vmx_check_cpu_migration failed\n");
996 /* Handle CPU migration
998 * We have to enter with interrupts disabled/critical section
999 * to be sure that another VMCS won't steal our CPU.
 *
 * Re-synchronizes the per-cpu VMCS fields (host GS/TR/GDT/IDT bases and
 * the guest GDT) after a cpu change, or just vmptrld's the VMCS if it
 * merely got unloaded on the current cpu.
1002 vmx_handle_cpu_migration(void)
1004 struct vmx_thread_info * vti;
1005 struct globaldata *gd;
1009 vti = (struct vmx_thread_info *) curthread->td_vmm;
1010 ERROR_IF(vti == NULL);
1012 if (vti->last_cpu != gd->gd_cpuid) {
1014 * We need to synchronize the per-cpu fields after changing
1017 dkprintf("VMM: vmx_handle_cpu_migration init per CPU data\n");
1019 ERROR_IF(execute_vmptrld(vti));
1021 /* Host related registers */
1022 ERROR_IF(vmwrite(VMCS_HOST_GS_BASE, (uint64_t) gd)); /* mycpu points to %gs:0 */
1023 ERROR_IF(vmwrite(VMCS_HOST_TR_BASE, (uint64_t) &gd->gd_prvspace->mdglobaldata.gd_common_tss));
1025 ERROR_IF(vmwrite(VMCS_HOST_GDTR_BASE, (uint64_t) &gdt[gd->gd_cpuid * NGDT]));
1026 ERROR_IF(vmwrite(VMCS_HOST_IDTR_BASE, (uint64_t) r_idt_arr[gd->gd_cpuid].rd_base));
1029 /* Guest related register */
1030 ERROR_IF(vmwrite(VMCS_GUEST_GDTR_BASE, (uint64_t) &gdt[gd->gd_cpuid * NGDT]));
1031 ERROR_IF(vmwrite(VMCS_GUEST_GDTR_LIMIT, (uint64_t) (NGDT * sizeof(gdt[0]) - 1)));
1034 * Indicates which cpu the per-cpu fields are synchronized
1035 * with. Does not indicate whether the vmcs is active on
1036 * that particular cpu.
1038 vti->last_cpu = gd->gd_cpuid;
1039 } else if (pcpu_info[gd->gd_cpuid].loaded_vmx != vti) {
1041 * We only need to vmptrld
1043 dkprintf("VMM: vmx_handle_cpu_migration: vmcs is not loaded\n");
1045 ERROR_IF(execute_vmptrld(vti));
1047 } /* else we don't need to do anything */
1050 kprintf("VMM: vmx_handle_cpu_migration failed\n");
1054 /* Load information about VMexit
1056 * We still are with interrupts disabled/critical section
1057 * because we must operate with the VMCS on the CPU
 *
 * Snapshots the exit reason/qualification and guest register state out
 * of the current VMCS into the thread's vmx_thread_info.
1060 vmx_vmexit_loadinfo(void)
1062 struct vmx_thread_info *vti;
1065 vti = (struct vmx_thread_info *) curthread->td_vmm;
1066 ERROR_IF(vti == NULL);
1068 ERROR_IF(vmread(VMCS_VMEXIT_REASON, &vti->vmexit_reason));
1069 ERROR_IF(vmread(VMCS_EXIT_QUALIFICATION, &vti->vmexit_qualification));
1070 ERROR_IF(vmread(VMCS_VMEXIT_INTERRUPTION_INFO, &vti->vmexit_interruption_info));
1071 ERROR_IF(vmread(VMCS_VMEXIT_INTERRUPTION_ERROR, &vti->vmexit_interruption_error));
1072 ERROR_IF(vmread(VMCS_VMEXIT_INSTRUCTION_LENGTH, &vti->vmexit_instruction_length));
1073 ERROR_IF(vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &vti->guest_physical_address));
1074 ERROR_IF(vmread(VMCS_GUEST_RIP, &vti->guest.tf_rip));
1075 ERROR_IF(vmread(VMCS_GUEST_CS_SELECTOR, &vti->guest.tf_cs));
1076 ERROR_IF(vmread(VMCS_GUEST_RFLAGS, &vti->guest.tf_rflags));
1077 ERROR_IF(vmread(VMCS_GUEST_RSP, &vti->guest.tf_rsp));
1078 ERROR_IF(vmread(VMCS_GUEST_SS_SELECTOR, &vti->guest.tf_ss));
1082 kprintf("VMM: vmx_vmexit_loadinfo failed\n");
/*
 * Hook: refresh the guest FS/GS descriptors from the current thread's
 * TLS info.  Must make sure the VMCS is loaded on this cpu first
 * (migration check + handling).
 */
1088 vmx_set_tls_area(void)
1090 struct tls_info *guest_fs = &curthread->td_tls.info[0];
1091 struct tls_info *guest_gs = &curthread->td_tls.info[1];
1095 dkprintf("VMM: vmx_set_tls_area hook\n");
1099 ERROR_IF(vmx_check_cpu_migration());
1100 ERROR_IF(vmx_handle_cpu_migration());
1103 ERROR_IF(vmx_set_guest_descriptor(FS, GSEL(GUDATA_SEL, SEL_UPL),
1104 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
1105 (uint64_t) guest_fs->base, (uint32_t) guest_fs->size));
1108 ERROR_IF(vmx_set_guest_descriptor(GS, GSEL(GUDATA_SEL, SEL_UPL),
1109 VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
1110 (uint64_t) guest_gs->base, (uint32_t) guest_gs->size));
/*
 * Dispatch on the basic VM-exit reason recorded by vmx_vmexit_loadinfo():
 * - EXCEPTION: #UD is used to emulate "syscall" (EFER.SCE is kept off),
 *   #PF is forwarded to the vkernel or raised as SIGSEGV, #BP becomes
 *   T_BPTFLT.
 * - EXT_INTR: nothing to do here beyond the debug print.
 * - CPUID: executed on behalf of the guest, result copied back.
 * - EPT_FAULT: resolved like a normal page fault against the vmspace map.
 */
1122 vmx_handle_vmexit(void)
1124 struct vmx_thread_info * vti;
1127 int exception_number;
1131 int fault_flags = 0;
1132 struct lwp *lp = curthread->td_lwp;
1134 dkprintf("VMM: handle_vmx_vmexit\n");
1135 vti = (struct vmx_thread_info *) curthread->td_vmm;
1136 ERROR_IF(vti == NULL);
1138 exit_reason = VMCS_BASIC_EXIT_REASON(vti->vmexit_reason);
1139 switch (exit_reason) {
1140 case EXIT_REASON_EXCEPTION:
1141 dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EXCEPTION with qualification "
1142 "%llx, interruption info %llx, interruption error %llx, instruction "
1144 (long long) vti->vmexit_qualification,
1145 (long long) vti->vmexit_interruption_info,
1146 (long long) vti->vmexit_interruption_error,
1147 (long long) vti->vmexit_instruction_length);
1149 dkprintf("VMM: handle_vmx_vmexit: rax: %llx, rip: %llx, "
1150 "rsp: %llx, rdi: %llx, rsi: %llx, %d, vti: %p, master: %p\n",
1151 (long long)vti->guest.tf_rax,
1152 (long long)vti->guest.tf_rip,
1153 (long long)vti->guest.tf_rsp,
1154 (long long)vti->guest.tf_rdi,
1155 (long long)vti->guest.tf_rsi, exit_reason, vti, curproc->p_vmm);
1157 exception_type = VMCS_EXCEPTION_TYPE(vti->vmexit_interruption_info);
1158 exception_number = VMCS_EXCEPTION_NUMBER(vti->vmexit_interruption_info);
1160 if (exception_type == VMCS_EXCEPTION_HARDWARE) {
1161 switch (exception_number) {
1164 * Disabled "syscall" instruction and
1165 * now we catch it for executing
1167 dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE IDT_UD\n");
1169 /* Check to see if its syscall asm instruction */
1170 uint8_t instr[INSTRUCTION_MAX_LENGTH];
/*
 * NOTE(review): copyin() returns non-zero on FAILURE, so this &&
 * only reaches instr_check() when the copyin failed, which would
 * then read an uninitialized instr[] buffer.  Looks inverted --
 * confirm intended logic against the original source.
 */
1171 if (copyin((const void *) vti->guest.tf_rip, instr, vti->vmexit_instruction_length) &&
1172 instr_check(&syscall_asm,(void *) instr, (uint8_t) vti->vmexit_instruction_length)) {
1173 kprintf("VMM: handle_vmx_vmexit: UD different from syscall: ");
1174 db_disasm((db_addr_t) instr, FALSE, NULL);
1177 /* Called to force a VMEXIT and invalidate TLB */
1178 if (vti->guest.tf_rax == -1) {
1179 vti->guest.tf_rip += vti->vmexit_instruction_length;
/* err=2 marks the frame as a syscall-style entry for syscall2(). */
1183 vti->guest.tf_err = 2;
1184 vti->guest.tf_trapno = T_FAST_SYSCALL;
1185 vti->guest.tf_xflags = 0;
1187 vti->guest.tf_rip += vti->vmexit_instruction_length;
1189 syscall2(&vti->guest);
1193 dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE IDT_PF at %llx\n",
1194 (long long) vti->guest.tf_rip);
1196 if (vti->guest.tf_rip == 0) {
1197 kprintf("VMM: handle_vmx_vmexit: Terminating...\n");
/* Build a T_PAGEFLT trapframe from the exit qualification. */
1202 vti->guest.tf_err = vti->vmexit_interruption_error;
1203 vti->guest.tf_addr = vti->vmexit_qualification;
1204 vti->guest.tf_xflags = 0;
1205 vti->guest.tf_trapno = T_PAGEFLT;
1208 * If we are a user process in the vkernel
1209 * pass the PF to the vkernel and will trigger
1212 * If we are the vkernel, send a SIGSEGV signal
1213 * to us that will trigger the execution of
1218 if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
1219 vkernel_trap(lp, &vti->guest);
1221 trapsignal(lp, SIGSEGV, SEGV_MAPERR);
1226 kprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE unknown "
1227 "number %d rip: %llx, rsp: %llx\n", exception_number,
1228 (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp);
1232 } else if (exception_type == VMCS_EXCEPTION_SOFTWARE) {
1233 switch (exception_number) {
1235 dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_SOFTWARE "
1236 "number %d rip: %llx, rsp: %llx\n", exception_number,
1237 (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp);
1239 vti->guest.tf_trapno = T_BPTFLT;
1240 vti->guest.tf_xflags = 0;
1241 vti->guest.tf_err = 0;
1242 vti->guest.tf_addr = 0;
1244 vti->guest.tf_rip += vti->vmexit_instruction_length;
1250 kprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_SOFTWARE unknown "
1251 "number %d rip: %llx, rsp: %llx\n", exception_number,
1252 (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp);
1257 kprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_ %d unknown\n", exception_type);
1262 case EXIT_REASON_EXT_INTR:
1263 dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EXT_INTR\n");
1265 case EXIT_REASON_CPUID:
1266 dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_CPUID\n");
1269 * Execute CPUID instruction and pass
1270 * the result to the vkernel
1273 func = vti->guest.tf_rax;
1274 do_cpuid(func, regs);
1276 vti->guest.tf_rax = regs[0];
1277 vti->guest.tf_rbx = regs[1];
1278 vti->guest.tf_rcx = regs[2];
1279 vti->guest.tf_rdx = regs[3];
1281 vti->guest.tf_rip += vti->vmexit_instruction_length;
1284 case EXIT_REASON_EPT_FAULT:
1286 * EPT_FAULT are resolved like normal PFs. Nothing special
1287 * - get the fault type
1288 * - get the fault address (which is a GPA)
1289 * - execute vm_fault on the vm_map
/*
 * NOTE(review): fault_type is printed here but only assigned below --
 * this debug output reads an uninitialized value.  Move the dkprintf
 * after the vmx_ept_fault_type() call when fixing.
 */
1291 dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EPT_FAULT with qualification %lld,"
1292 "GPA: %llx, fault_Type: %d\n",(long long) vti->vmexit_qualification,
1293 (unsigned long long) vti->guest_physical_address, fault_type);
1295 fault_type = vmx_ept_fault_type(vti->vmexit_qualification);
1297 if (fault_type & VM_PROT_WRITE)
1298 fault_flags = VM_FAULT_DIRTY;
1300 fault_flags = VM_FAULT_NORMAL;
1302 rv = vm_fault(&curthread->td_lwp->lwp_vmspace->vm_map,
1303 trunc_page(vti->guest_physical_address), fault_type, fault_flags);
1305 if (rv != KERN_SUCCESS) {
1306 kprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EPT_FAULT couldn't resolve %llx\n",
1307 (unsigned long long) vti->guest_physical_address);
1313 kprintf("VMM: handle_vmx_vmexit: unknown exit reason: %d with qualification %lld\n",
1314 exit_reason, (long long) vti->vmexit_qualification);
/*
 * NOTE(review): body fragment of vmx_vmrun(), the VMENTER/VMEXIT main
 * loop — swap the lwp's trapframe for the guest frame, interlock against
 * EPT invalidations, enter the guest via vmlaunch/vmresume, and dispatch
 * VMEXITs. The function header and many interior lines (loop labels,
 * braces, ERROR_IF/ERROR2_IF targets) are missing from this extract, so
 * only comments were added.
 */
1326 struct vmx_thread_info * vti;
1327 struct globaldata *gd;
1334 struct trapframe *save_frame;
1335 thread_t td = curthread;
/* Redirect the lwp's md_regs at the guest trapframe for the duration of
 * the run; restored from save_frame on the exit path below. */
1337 vti = (struct vmx_thread_info *) td->td_vmm;
1338 save_frame = td->td_lwp->lwp_md.md_regs;
1339 td->td_lwp->lwp_md.md_regs = &vti->guest;
1344 * This can change the cpu we are running on.
1346 trap_handle_userexit(&vti->guest, sticks);
/* ERROR2_IF() presumably bails to a cleanup label on failure — the macro
 * definition is outside this extract; TODO confirm. */
1349 ERROR2_IF(vti == NULL);
1350 ERROR2_IF(vmx_check_cpu_migration());
1351 ERROR2_IF(vmx_handle_cpu_migration());
1354 * Make the state safe to VMENTER
1355 * - disable interrupts and check if there were any pending
1356 * - check for ASTFLTs
1357 * - loop again until there are no ASTFLTs
/* Pending AST: reflect it to the vkernel as a T_ASTFLT instead of
 * entering the guest. */
1361 if (gd->gd_reqflags & RQF_AST_MASK) {
1362 atomic_clear_int(&gd->gd_reqflags, RQF_AST_SIGNAL);
1365 vti->guest.tf_trapno = T_ASTFLT;
1367 /* CURRENT CPU CAN CHANGE */
/* Sanity check: after the migration handlers above, vti must already be
 * bound to the cpu we are on. */
1370 if (vti->last_cpu != gd->gd_cpuid) {
1373 kprintf("VMM: vmx_vmrun: vti unexpectedly "
1374 "changed cpus %d->%d\n",
1375 gd->gd_cpuid, vti->last_cpu);
1380 * Add us to the list of cpus running vkernel operations, interlock
1381 * against anyone trying to do an invalidation.
1383 * We must set the cpumask first to ensure that we interlock another
1384 * cpu that may desire to IPI us after we have successfully
1385 * incremented the cpulock counter.
1387 ATOMIC_CPUMASK_ORBIT(td->td_proc->p_vmm_cpumask, gd->gd_cpuid);
/* Take a shared count on p_vmm_cpulock; fast path succeeds when no
 * exclusive holder is present. */
1390 olock = td->td_proc->p_vmm_cpulock;
1392 if ((olock & CPULOCK_EXCL) == 0) {
1393 nlock = olock + CPULOCK_INCR;
1394 if (atomic_cmpset_int(&td->td_proc->p_vmm_cpulock,
1405 * More complex. After sleeping we have to re-test
/* Exclusive holder present: back out of the cpumask and sleep on the
 * cpulock, interlocked so the wakeup cannot be missed, then retry. */
1408 ATOMIC_CPUMASK_NANDBIT(td->td_proc->p_vmm_cpumask,
1411 tsleep_interlock(&td->td_proc->p_vmm_cpulock, 0);
1412 if (td->td_proc->p_vmm_cpulock & CPULOCK_EXCL) {
1413 tsleep(&td->td_proc->p_vmm_cpulock, PINTERLOCKED,
1421 * Load specific Guest registers
1422 * GP registers will be loaded in vmx_launch/resume
1424 ERROR_IF(vmwrite(VMCS_GUEST_RIP, vti->guest.tf_rip));
1425 ERROR_IF(vmwrite(VMCS_GUEST_CS_SELECTOR, vti->guest.tf_cs));
1426 ERROR_IF(vmwrite(VMCS_GUEST_RFLAGS, vti->guest.tf_rflags));
1427 ERROR_IF(vmwrite(VMCS_GUEST_RSP, vti->guest.tf_rsp));
1428 ERROR_IF(vmwrite(VMCS_GUEST_SS_SELECTOR, vti->guest.tf_ss));
1429 ERROR_IF(vmwrite(VMCS_GUEST_CR3, (uint64_t) vti->guest_cr3));
/* Save another thread's FPU state if it owns the npx unit, so the guest
 * gets a clean FPU. */
1434 if (mdcpu->gd_npxthread != td) {
1435 if (mdcpu->gd_npxthread)
1436 npxsave(mdcpu->gd_npxthread->td_savefpu);
1441 * The kernel caches the MSR_FSBASE value in mdcpu->gd_user_fs.
1442 * A vmexit loads this unconditionally from the VMCS so make
1443 * sure it loads the correct value.
1445 ERROR_IF(vmwrite(VMCS_HOST_FS_BASE, mdcpu->gd_user_fs));
1448 * EPT mappings can't be invalidated with normal invlpg/invltlb
1449 * instructions. We have to execute a special instruction that
1450 * invalidates all EPT cache ("invept").
1452 * pm_invgen is a generation number which is incremented in
1453 * pmap_inval_smp*(), before doing any invalidates. This will
1454 * cause all CPUs that are using the EPT to VMEXIT and wait for
1455 * the interlock to complete. When they VMENTER they will see that
1456 * the generation number had changed from their current and do a
/* Generation changed since our last VMENTER: flush this EPT context. */
1459 if (vti->eptgen != td->td_proc->p_vmspace->vm_pmap.pm_invgen) {
1460 vti->eptgen = td->td_proc->p_vmspace->vm_pmap.pm_invgen;
1462 ERROR_IF(invept(INVEPT_TYPE_SINGLE_CONTEXT,
1463 (uint64_t*)&vti->invept_desc));
/* First entry uses vmlaunch; subsequent entries use vmresume. */
1466 if (vti->launched) { /* vmresume called from vmx_trap.s */
1467 dkprintf("\n\nVMM: vmx_vmrun: vmx_resume\n");
1468 ret = vmx_resume(vti);
1470 } else { /* vmlaunch called from vmx_trap.s */
1471 dkprintf("\n\nVMM: vmx_vmrun: vmx_launch\n");
1473 ret = vmx_launch(vti);
1477 * This is our return point from the vmlaunch/vmresume
1478 * There are two situations:
1479 * - the vmlaunch/vmresume executed successfully and they
1480 * would return through "vmx_vmexit" which will restore
1481 * the state (registers) and return here with the ret
1482 * set to VM_EXIT (ret is actually %rax)
1483 * - the vmlaunch/vmresume failed to execute and will return
1484 * immediately with ret set to the error code
1486 if (ret == VM_EXIT) {
1487 ERROR_IF(vmx_vmexit_loadinfo());
/* Drop the invalidation interlock taken above before handling the
 * exit reason. */
1489 ATOMIC_CPUMASK_NANDBIT(td->td_proc->p_vmm_cpumask,
1491 atomic_add_int(&td->td_proc->p_vmm_cpulock,
1493 /* WARNING: don't adjust cpulock twice! */
1496 trap_handle_userenter(td);
1497 sticks = td->td_sticks;
1501 * Handle the VMEXIT reason
1502 * - if successful we VMENTER again
1505 if (vmx_handle_vmexit())
1509 * We handled the VMEXIT reason and continue with
1518 * Two types of error:
1519 * - VM_FAIL_VALID - the host state was ok,
1520 * but probably the guest state was not
1521 * - VM_FAIL_INVALID - the parameters or the host state
/* vmlaunch/vmresume itself failed: report the VMCS instruction error
 * when the VMCS was at least readable. */
1524 if (ret == VM_FAIL_VALID) {
1525 vmread(VMCS_INSTR_ERR, &val);
1527 kprintf("VMM: vmx_vmrun: vmenter failed with "
1528 "VM_FAIL_VALID, error code %d\n",
1531 kprintf("VMM: vmx_vmrun: vmenter failed with "
1532 "VM_FAIL_INVALID\n");
1537 kprintf("VMM: vmx_vmrun: returning with success\n");
/* Error/exit cleanup: leave the vkernel cpumask/cpulock interlock and
 * restore the lwp's original trapframe pointer. */
1540 ATOMIC_CPUMASK_NANDBIT(td->td_proc->p_vmm_cpumask, gd->gd_cpuid);
1541 atomic_add_int(&td->td_proc->p_vmm_cpulock, -CPULOCK_INCR);
1544 trap_handle_userenter(td);
1545 td->td_lwp->lwp_md.md_regs = save_frame;
1546 KKASSERT(CPUMASK_TESTMASK(td->td_proc->p_vmm_cpumask,
1547 gd->gd_cpumask) == 0);
1548 /*atomic_clear_cpumask(&td->td_proc->p_vmm_cpumask, gd->gd_cpumask);*/
1550 kprintf("VMM: vmx_vmrun failed\n");
1555 * Called when returning to user-space
1556 * after executing lwp_fork.
1559 vmx_lwp_return(struct lwp *lp, struct trapframe *frame)
1561 struct vmm_guest_options options;
1563 struct vmm_proc *p_vmm = (struct vmm_proc *)curproc->p_vmm;
1565 dkprintf("VMM: vmx_lwp_return \n");
/* Build vmm_guest_options from the fork-return trapframe plus the
 * process-wide guest/vmm CR3 values, then initialize this new lwp as a
 * VMX guest thread. */
1567 bzero(&options, sizeof(struct vmm_guest_options));
1569 bcopy(frame, &options.tf, sizeof(struct trapframe));
1571 options.guest_cr3 = p_vmm->guest_cr3;
1572 options.vmm_cr3 = p_vmm->vmm_cr3;
1574 vmx_vminit(&options);
1575 generic_lwp_return(lp, frame);
/* Enter the guest run loop; it returns only on error, and the lwp then
 * exits carrying that error as its exit code. */
1577 vmrun_err = vmx_vmrun();
1579 exit1(W_EXITCODE(vmrun_err, 0));
/*
 * Record the guest %cr3 in the current thread's VMX context. The value
 * is written to VMCS_GUEST_CR3 on the next VMENTER (see vmx_vmrun()).
 */
1583 vmx_set_guest_cr3(register_t guest_cr3)
1585 struct vmx_thread_info *vti = (struct vmx_thread_info *) curthread->td_vmm;
1586 vti->guest_cr3 = guest_cr3;
/*
 * Translate a vkernel user address (uaddr) into a guest physical
 * address, returned through *gpa, by walking the vkernel's page tables
 * (vkernel_cr3) via guest_phys_addr(). Returns that helper's status.
 */
1590 vmx_vm_get_gpa(struct proc *p, register_t *gpa, register_t uaddr)
1592 return guest_phys_addr(p->p_vmspace, gpa, p->p_vkernel->vkernel_cr3, uaddr);
/*
 * VMX backend registration: the vmm_ctl method table handed to the
 * generic vmm layer, binding the Intel VT-x implementations above.
 * NOTE(review): some initializer lines and the closing brace fall
 * outside this extract.
 */
1595 static struct vmm_ctl ctl_vmx = {
1596 .name = "VMX from Intel",
1598 .enable = vmx_enable,
1599 .disable = vmx_disable,
1600 .vminit = vmx_vminit,
1601 .vmdestroy = vmx_vmdestroy,
1603 .vm_set_tls_area = vmx_set_tls_area,
1604 .vm_lwp_return = vmx_lwp_return,
1605 .vm_set_guest_cr3 = vmx_set_guest_cr3,
1606 .vm_get_gpa = vmx_vm_get_gpa,