kernel - Merge Mihai Carabas's VKERNEL/VMM GSOC project into the main tree
author     Matthew Dillon <dillon@apollo.backplane.com>
           Fri, 20 Sep 2013 23:15:43 +0000 (16:15 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Fri, 20 Sep 2013 23:15:43 +0000 (16:15 -0700)
* This merge contains work primarily by Mihai Carabas, with some misc
  fixes also by Matthew Dillon.

* Special note on GSOC core

  This is, needless to say, a huge amount of work compressed down into a
  few paragraphs of comments.  Adds the pc64/vmm subdirectory and tons
  of stuff to support hardware virtualization in guest-user mode, plus
  the ability for programs (vkernels) running in this mode to make normal
  system calls to the host.

* Add system call infrastructure for VMM mode operations in kern/sys_vmm.c
  which vectors through a structure to machine-specific implementations.

  vmm_guest_ctl_args()
  vmm_guest_sync_addr_args()

  vmm_guest_ctl_args() - bootstrap VMM and EPT modes.  Copydown the original
  user stack for EPT (since EPT 'physical' addresses cannot reach that far
  into the backing store represented by the process's original VM space).
  Also installs the GUEST_CR3 for the guest using parameters supplied by
  the guest.

  vmm_guest_sync_addr_args() - A host helper function that the vkernel can
  use to invalidate page tables on multiple real cpus.  This is a lot more
  efficient than having the vkernel try to do it itself with IPI signals
  via cpusync*().
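
  To make the intent concrete, here is a minimal sketch of vkernel-side
  usage (the helper name is hypothetical and the userland prototype is
  assumed; only the syscall itself is added by this commit):

	#include <sys/vmm.h>	/* vmm_guest_sync_addr() prototype (assumed) */

	/*
	 * Update a guest pte and let the host lock out the other real
	 * cpus and perform the store, rather than IPIing every virtual
	 * cpu from inside the guest.
	 */
	static void
	vkernel_update_pte(long *ptep, long newpte)
	{
		vmm_guest_sync_addr(ptep, &newpte);
	}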

* Add Intel VMX support to the host infrastructure.  Again, tons of work
  compressed down into a one-paragraph commit message.  AMD SVM support
  is not part of this GSOC and is not yet supported by DragonFly.
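
  For orientation only, the per-cpu enable path reduces to roughly the
  following sketch.  vmxon() stands in for the inline assembly added in
  vmx_instr.h and IA32_VMX_BASIC is assumed to be defined in vmx.h; the
  committed logic lives in pc64/vmm/vmx.c:

	#include <machine/cpufunc.h>
	#include <machine/specialreg.h>

	static int
	vmx_enable_cpu(uint32_t *vmxon_region)	/* 4K, page-aligned */
	{
		/* The VMXON region must begin with the VMCS revision id. */
		vmxon_region[0] = (uint32_t)rdmsr(IA32_VMX_BASIC);

		/* CR4.VMXE must be set before VMXON will succeed. */
		load_cr4(rcr4() | CR4_VMXE);

		return (vmxon(vmxon_region));
	}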

* Remove the PG_* defines for PTEs and related MMU operations.  Replace
  them with a table lookup so the same pmap code can be used for normal
  page tables and also for EPT tables.
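
  The shape of the lookup, roughly (pmap_bits[] and the PG_*_IDX indexes
  are added to struct pmap below; the helper name here is illustrative):

	static __inline int
	pte_is_valid(pmap_t pmap, pt_entry_t pte)
	{
		/*
		 * The valid bit comes from a per-pmap table instead of a
		 * compile-time PG_V constant, so one code path serves
		 * normal page tables and EPT tables alike.
		 */
		return ((pte & pmap->pmap_bits[PG_V_IDX]) != 0);
	}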

* Also include X86_PG_V defines specific to normal page tables for a few
  situations outside the pmap code.

* Adjust DDB to disassemble VMX-related (Intel) instructions.

* Add infrastructure to exit1() to tear down the related VMM structures.

* Optimize pfind() and pfindn() to remove the global token when looking
  up the current process's PID (Matt)

* Add support for EPT (double layer page tables).  This primarily required
  adjusting the pmap code to use a table lookup to get the PG_* bits.

  Add an indirect vector for copyin, copyout, and other user address space
  copy operations to support manual walks when EPT is in use.

  A multitude of system calls which manually looked up user addresses via
  the vm_map now need a VMM-layer call to translate the guest address
  when EPT is in use.
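
  A sketch of the indirection (the wrapper name is hypothetical; the
  EPT-aware implementations appear in pc64/vmm/ept.c below):

	static int
	copyin_indirect(const void *udaddr, void *kaddr, size_t len)
	{
		pmap_t pmap = &curproc->p_vmspace->vm_pmap;

		/*
		 * A normal pmap's copyin vector points at the standard
		 * copyin; an EPT pmap's points at ept_copyin(), which
		 * walks the guest page tables manually.
		 */
		return (pmap->copyin(udaddr, kaddr, len));
	}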

* Remove the MP lock from trapsignal() use cases in trap().

* (Matt) Add pthread_yield()s in most spin loops to help in situations
  where the vkernel is running on more cpus than the host has, and to
  help with scheduler edge cases on the host.

* (Matt) Add a pmap_fault_page_quick() infrastructure that vm_fault_page()
  uses to try to shortcut operations and avoid locks.  Implement it for
  pc64.  This function checks whether the page is already faulted in with
  the requested protections by looking up the PTE.  If not, it returns
  NULL and the full-blown vm_fault_page() code continues running.
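
  In outline, the quick path looks something like this (simplified
  sketch, not the committed pc64 code; pmap_pte() is assumed to return
  the pte pointer or NULL):

	vm_page_t
	pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
	{
		pt_entry_t *ptep;
		pt_entry_t pte;
		vm_page_t m;

		ptep = pmap_pte(pmap, va);
		if (ptep == NULL)
			return (NULL);
		pte = *ptep;

		/* Page must be present with the requested protections */
		if ((pte & pmap->pmap_bits[PG_V_IDX]) == 0)
			return (NULL);
		if ((prot & VM_PROT_WRITE) &&
		    (pte & pmap->pmap_bits[PG_RW_IDX]) == 0)
			return (NULL);

		m = PHYS_TO_VM_PAGE(pte & PG_FRAME);
		vm_page_hold(m);
		return (m);
	}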

* (Matt) Remove the MP lock from most of the vkernel's trap() code.

* (Matt) Use a shared spinlock where possible on certain critical paths
  related to copyin/copyout.
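
  Roughly (illustrative fragment; pm_spin is the spinlock already
  embedded in struct pmap):

	spin_lock_shared(&pmap->pm_spin);
	/* ... read-only pte lookup on behalf of copyin/copyout ... */
	spin_unlock_shared(&pmap->pm_spin);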

106 files changed:
include/sysexits.h
lib/libkvm/kvm_minidump_x86_64.c
lib/libkvm/kvm_x86_64.c
sys/conf/files
sys/config/MYKERNEL64 [new symlink]
sys/cpu/i386/include/frame.h
sys/cpu/x86_64/include/frame.h
sys/cpu/x86_64/include/pmap.h
sys/cpu/x86_64/include/specialreg.h
sys/cpu/x86_64/misc/db_disasm.c
sys/kern/init_sysent.c
sys/kern/kern_exec.c
sys/kern/kern_exit.c
sys/kern/kern_proc.c
sys/kern/kern_prot.c
sys/kern/kern_slaballoc.c
sys/kern/kern_umtx.c
sys/kern/lwkt_ipiq.c
sys/kern/lwkt_thread.c
sys/kern/sys_vmm.c [new file with mode: 0644]
sys/kern/syscalls.c
sys/kern/syscalls.master
sys/kern/uipc_syscalls.c
sys/kern/vfs_helper.c
sys/kern/vfs_vm.c
sys/platform/pc32/i386/pmap.c
sys/platform/pc32/include/pmap.h
sys/platform/pc32/include/vmm.h [copied from sys/platform/vkernel64/include/pmap_inval.h with 53% similarity]
sys/platform/pc64/conf/files
sys/platform/pc64/include/md_var.h
sys/platform/pc64/include/pmap.h
sys/platform/pc64/include/vmm.h [copied from sys/platform/vkernel64/include/pmap_inval.h with 53% similarity]
sys/platform/pc64/vmm/ept.c [new file with mode: 0644]
sys/platform/pc64/vmm/ept.h [new file with mode: 0644]
sys/platform/pc64/vmm/svm.c [new file with mode: 0644]
sys/platform/pc64/vmm/svm.h [new file with mode: 0644]
sys/platform/pc64/vmm/vmm.c [new file with mode: 0644]
sys/platform/pc64/vmm/vmm.h [copied from sys/platform/vkernel64/include/pmap_inval.h with 53% similarity]
sys/platform/pc64/vmm/vmm_utils.c [new file with mode: 0644]
sys/platform/pc64/vmm/vmm_utils.h [copied from sys/platform/vkernel64/include/pmap_inval.h with 53% similarity]
sys/platform/pc64/vmm/vmx.c [new file with mode: 0644]
sys/platform/pc64/vmm/vmx.h [new file with mode: 0644]
sys/platform/pc64/vmm/vmx_genassym.c [new file with mode: 0644]
sys/platform/pc64/vmm/vmx_instr.h [new file with mode: 0644]
sys/platform/pc64/vmm/vmx_trap.s [new file with mode: 0644]
sys/platform/pc64/vmm/vmx_vmcs.h [new file with mode: 0644]
sys/platform/pc64/x86_64/db_interface.c
sys/platform/pc64/x86_64/genassym.c
sys/platform/pc64/x86_64/machdep.c
sys/platform/pc64/x86_64/minidump_machdep.c
sys/platform/pc64/x86_64/mp_machdep.c
sys/platform/pc64/x86_64/pmap.c
sys/platform/pc64/x86_64/pmap_inval.c
sys/platform/pc64/x86_64/support.s
sys/platform/pc64/x86_64/tls.c
sys/platform/pc64/x86_64/trap.c
sys/platform/pc64/x86_64/uwrapper.c [new file with mode: 0644]
sys/platform/pc64/x86_64/vm_machdep.c
sys/platform/vkernel/i386/db_interface.c
sys/platform/vkernel/include/pmap.h
sys/platform/vkernel/include/vmm.h [copied from sys/platform/vkernel64/include/pmap_inval.h with 53% similarity]
sys/platform/vkernel/platform/init.c
sys/platform/vkernel/platform/pmap.c
sys/platform/vkernel/platform/pmap_inval.c
sys/platform/vkernel64/include/pmap.h
sys/platform/vkernel64/include/pmap_inval.h
sys/platform/vkernel64/include/vmm.h [copied from sys/platform/vkernel64/include/pmap_inval.h with 53% similarity]
sys/platform/vkernel64/include/vmparam.h
sys/platform/vkernel64/platform/cothread.c
sys/platform/vkernel64/platform/init.c
sys/platform/vkernel64/platform/ipl_funcs.c
sys/platform/vkernel64/platform/machintr.c
sys/platform/vkernel64/platform/pmap.c
sys/platform/vkernel64/platform/pmap_inval.c
sys/platform/vkernel64/x86_64/db_interface.c
sys/platform/vkernel64/x86_64/genassym.c
sys/platform/vkernel64/x86_64/global.s
sys/platform/vkernel64/x86_64/mp.c
sys/platform/vkernel64/x86_64/swtch.s
sys/platform/vkernel64/x86_64/trap.c
sys/sys/proc.h
sys/sys/syscall-hide.h
sys/sys/syscall.h
sys/sys/syscall.mk
sys/sys/sysproto.h
sys/sys/sysunion.h
sys/sys/thread.h
sys/sys/vkernel.h
sys/sys/vmm.h [copied from sys/platform/vkernel64/include/pmap_inval.h with 53% similarity]
sys/sys/wait.h
sys/vfs/procfs/procfs_mem.c
sys/vm/device_pager.c
sys/vm/phys_pager.c
sys/vm/pmap.h
sys/vm/vm_fault.c
sys/vm/vm_glue.c
sys/vm/vm_map.c
sys/vm/vm_mmap.c
sys/vm/vm_object.c
sys/vm/vm_page.h
sys/vm/vm_page2.h
sys/vm/vm_vmspace.c
sys/vm/vm_zeroidle.c
sys/vm/vm_zone.c
test/vmm/Makefile [new file with mode: 0644]
test/vmm/vmm_test.c [new file with mode: 0644]

diff --git a/include/sysexits.h b/include/sysexits.h
index 464cb11..48c0520 100644 (file)
@@ -94,6 +94,7 @@
  */
 
 #define EX_OK          0       /* successful termination */
+#define EX_REBOOT      32      /* vkernel specific - reboot */
 
 #define EX__BASE       64      /* base value for error messages */
 
diff --git a/lib/libkvm/kvm_minidump_x86_64.c b/lib/libkvm/kvm_minidump_x86_64.c
index ad87c85..e37d9b7 100644 (file)
@@ -209,7 +209,7 @@ _kvm_minidump_vatop(kvm_t *kd, u_long va, off_t *pa)
        if (va >= vm->hdr.kernbase) {
                pteindex = (va - vm->hdr.kernbase) >> PAGE_SHIFT;
                pte = vm->ptemap[pteindex];
-               if (((u_long)pte & PG_V) == 0) {
+               if (((u_long)pte & X86_PG_V) == 0) {
                        _kvm_err(kd, kd->program, "_kvm_vatop: pte not valid");
                        goto invalid;
                }
diff --git a/lib/libkvm/kvm_x86_64.c b/lib/libkvm/kvm_x86_64.c
index 233fa8a..0a5b121 100644 (file)
@@ -258,7 +258,7 @@ _kvm_vatop(kvm_t *kd, u_long va, off_t *pa)
 
        pml4eindex = (va >> PML4SHIFT) & (NPML4EPG - 1);
        pml4e = vm->PML4[pml4eindex];
-       if (((u_long)pml4e & PG_V) == 0) {
+       if (((u_long)pml4e & X86_PG_V) == 0) {
                _kvm_err(kd, kd->program, "_kvm_vatop: pml4e not valid");
                goto invalid;
        }
@@ -280,7 +280,7 @@ _kvm_vatop(kvm_t *kd, u_long va, off_t *pa)
                _kvm_syserr(kd, kd->program, "_kvm_vatop: read pdpe");
                goto invalid;
        }
-       if (((u_long)pdpe & PG_V) == 0) {
+       if (((u_long)pdpe & X86_PG_V) == 0) {
                _kvm_err(kd, kd->program, "_kvm_vatop: pdpe not valid");
                goto invalid;
        }
@@ -301,12 +301,12 @@ _kvm_vatop(kvm_t *kd, u_long va, off_t *pa)
                _kvm_syserr(kd, kd->program, "_kvm_vatop: read pde");
                goto invalid;
        }
-       if (((u_long)pde & PG_V) == 0) {
+       if (((u_long)pde & X86_PG_V) == 0) {
                _kvm_err(kd, kd->program, "_kvm_vatop: pde not valid");
                goto invalid;
        }
 
-       if ((u_long)pde & PG_PS) {
+       if ((u_long)pde & X86_PG_PS) {
              /*
               * No final-level page table; ptd describes one 2MB page.
               */
@@ -339,7 +339,7 @@ _kvm_vatop(kvm_t *kd, u_long va, off_t *pa)
                _kvm_syserr(kd, kd->program, "_kvm_vatop: read");
                goto invalid;
        }
-       if (((u_long)pte & PG_V) == 0) {
+       if (((u_long)pte & X86_PG_V) == 0) {
                _kvm_err(kd, kd->program, "_kvm_vatop: pte not valid");
                goto invalid;
        }
diff --git a/sys/conf/files b/sys/conf/files
index 74c767e..42d4088 100644 (file)
@@ -979,6 +979,7 @@ kern/vfs_vfsops.c   standard
 kern/kern_threads.c    standard
 kern/vfs_aio.c         standard
 kern/subr_cpu_topology.c       standard
+kern/sys_vmm.c         standard
 vfs/deadfs/dead_vnops.c        standard
 vfs/fdesc/fdesc_vfsops.c       optional fdesc
 vfs/fdesc/fdesc_vnops.c        optional fdesc
diff --git a/sys/config/MYKERNEL64 b/sys/config/MYKERNEL64
new file mode 120000 (symlink)
index 0000000..19dc81e
--- /dev/null
@@ -0,0 +1 @@
+/root/kernel/MYKERNEL64
\ No newline at end of file
diff --git a/sys/cpu/i386/include/frame.h b/sys/cpu/i386/include/frame.h
index cabbc04..fb3c7a4 100644 (file)
@@ -70,6 +70,7 @@ struct trapframe {
        int     tf_cs;
        int     tf_eflags;
        /* below only when crossing rings (e.g. user to kernel) */
+#define tf_sp tf_esp
        int     tf_esp;
        int     tf_ss;
 };
diff --git a/sys/cpu/x86_64/include/frame.h b/sys/cpu/x86_64/include/frame.h
index fe24ff4..93f97f4 100644 (file)
@@ -82,6 +82,7 @@ struct trapframe {
        register_t      tf_rip;
        register_t      tf_cs;
        register_t      tf_rflags;
+#define tf_sp tf_rsp
        register_t      tf_rsp;
        register_t      tf_ss;
 };
diff --git a/sys/cpu/x86_64/include/pmap.h b/sys/cpu/x86_64/include/pmap.h
index ab2cc7f..d3d08b6 100644 (file)
  * of the fields not present here and there, depending on a lot of things.
  */
                                /* ---- Intel Nomenclature ---- */
-#define        PG_V            0x001   /* P    Valid                   */
-#define PG_RW          0x002   /* R/W  Read/Write              */
-#define PG_U           0x004   /* U/S  User/Supervisor         */
-#define        PG_NC_PWT       0x008   /* PWT  Write through           */
-#define        PG_NC_PCD       0x010   /* PCD  Cache disable           */
-#define PG_A           0x020   /* A    Accessed                */
-#define        PG_M            0x040   /* D    Dirty                   */
-#define        PG_PS           0x080   /* PS   Page size (0=4k,1=2M)   */
-#define        PG_PTE_PAT      0x080   /* PAT  PAT index               */
-#define        PG_G            0x100   /* G    Global                  */
-#define        PG_AVAIL1       0x200   /*    / Available for system    */
-#define        PG_AVAIL2       0x400   /*   <  programmers use         */
-#define        PG_AVAIL3       0x800   /*    \                         */
-#define        PG_PDE_PAT      0x1000  /* PAT  PAT index               */
-#define        PG_NX           (1ul<<63) /* No-execute */
+#define        X86_PG_V                0x001   /* P    Valid                   */
+#define        X86_PG_RW               0x002   /* R/W  Read/Write              */
+#define        X86_PG_U                0x004   /* U/S  User/Supervisor         */
+#define        X86_PG_NC_PWT           0x008   /* PWT  Write through           */
+#define        X86_PG_NC_PCD           0x010   /* PCD  Cache disable           */
+#define        X86_PG_A                0x020   /* A    Accessed                */
+#define        X86_PG_M                0x040   /* D    Dirty                   */
+#define        X86_PG_PS               0x080   /* PS   Page size (0=4k,1=2M)   */
+#define        X86_PG_PTE_PAT          0x080   /* PAT  PAT index               */
+#define        X86_PG_G                0x100   /* G    Global                  */
+#define        X86_PG_AVAIL1           0x200   /*    / Available for system    */
+#define        X86_PG_AVAIL2           0x400   /*   <  programmers use         */
+#define        X86_PG_AVAIL3           0x800   /*    \                         */
+#define        X86_PG_PDE_PAT          0x1000  /* PAT  PAT index               */
+#define        X86_PG_NX               (1ul<<63) /* No-execute */
 
 
 /* Our various interpretations of the above */
-#define PG_W           PG_AVAIL1       /* "Wired" pseudoflag */
-#define        PG_MANAGED      PG_AVAIL2
-#define        PG_DEVICE       PG_AVAIL3
+//#define PG_W         PG_AVAIL1       /* "Wired" pseudoflag */
+//#define      PG_MANAGED      PG_AVAIL2
+//#define      PG_DEVICE       PG_AVAIL3
 #define        PG_FRAME        (0x000ffffffffff000ul)
 #define        PG_PS_FRAME     (0x000fffffffe00000ul)
-#define        PG_PROT         (PG_RW|PG_U)    /* all protection bits . */
-#define PG_N           (PG_NC_PWT|PG_NC_PCD)   /* Non-cacheable */
+//#define      PG_PROT         (PG_RW|PG_U)    /* all protection bits . */
+//#define PG_N         (PG_NC_PWT|PG_NC_PCD)   /* Non-cacheable */
 
 /*
  * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB
  * (PTE) page mappings have identical settings for the following fields:
  */
+/*
 #define        PG_PTE_PROMOTE  (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_PAT | \
            PG_M | PG_A | PG_NC_PCD | PG_NC_PWT | PG_U | PG_RW | PG_V)
-
+*/
 /*
  * Page Protection Exception bits
  */
diff --git a/sys/cpu/x86_64/include/specialreg.h b/sys/cpu/x86_64/include/specialreg.h
index 887fdbf..ec4df7c 100644 (file)
@@ -66,6 +66,7 @@
 #define        CR4_PCE         0x00000100      /* Performance monitoring counter enable */
 #define        CR4_FXSR        0x00000200      /* Fast FPU save/restore used by OS */
 #define        CR4_XMM         0x00000400      /* Enable SIMD/MMX2 to use except 16 */
+#define        CR4_VMXE        0x00002000      /* Enables VMX - Intel specific */
 #define        CR4_XSAVE       0x00040000      /* Enable XSave (for AVX Instructions)*/
 
 /*
@@ -75,6 +76,7 @@
 #define        EFER_LME        0x000000100     /* Long mode enable (R/W) */
 #define        EFER_LMA        0x000000400     /* Long mode active (R) */
 #define        EFER_NXE        0x000000800     /* PTE No-Execute bit enable (R/W) */
+#define        EFER_SVME       0x000001000     /* SVM Enable (R/W) */
 
 /*
  * CPUID instruction features register
 /* AMD MSRs */
 #define MSR_AMD_DE_CFG 0xc0011029
 
+/* AMD SVM MSRs */
+#define MSR_AMD_VM_CR          0xc0010114
+#define MSR_AMD_VM_HSAVE_PA    0xc0010117
+
+/* AMD MSR_AMD_VM_CR fields */
+#define MSR_AMD_VM_CR_SVMDIS   0x00000010      /* SVM Disabled */
+
 /* VIA ACE crypto featureset: for via_feature_rng */
 #define        VIA_HAS_RNG             1       /* cpu has RNG */
 
diff --git a/sys/cpu/x86_64/misc/db_disasm.c b/sys/cpu/x86_64/misc/db_disasm.c
index fed00ba..583f400 100644 (file)
@@ -170,6 +170,45 @@ static const char * const db_Grp7[] = {
        "invlpg"
 };
 
+static const char * const db_Grp7_11_000[] = {
+       "",
+       "vmcall",
+       "vmlaunch",
+       "vmresume",
+       "vmxoff",
+};
+
+static const char * const db_Grp7_11_001[] = {
+       "monitor",
+       "mwait",
+       "clac",
+       "stac",
+};
+
+static const char * const db_Grp7_11_010[] = {
+       "xgetbv",
+       "xsetbv",
+       "",
+       "",
+       "vmfunc",
+       "xend",
+       "xtest",
+};
+
+static const char * const db_Grp7_11_111[] = {
+       "swapgs",
+       "rdtscp",
+};
+
+static const char * const* db_Grp7_11[] = {
+       db_Grp7_11_000,
+       db_Grp7_11_001,
+       db_Grp7_11_010,
+       NULL,
+       NULL,
+       db_Grp7_11_111,
+};
+
 static const char * const db_Grp8[] = {
        "",
        "",
@@ -1226,10 +1265,19 @@ db_disasm(db_addr_t loc, boolean_t altfmt, db_regs_t *dummy)
        i_mode = ip->i_mode;
 
        if (ip->i_extra == db_Grp1 || ip->i_extra == db_Grp2 ||
-           ip->i_extra == db_Grp6 || ip->i_extra == db_Grp7 ||
-           ip->i_extra == db_Grp8 || ip->i_extra == db_Grp9) {
+           ip->i_extra == db_Grp6 || ip->i_extra == db_Grp8 ||
+           ip->i_extra == db_Grp9) {
            i_name = ((const char * const *)ip->i_extra)[f_reg(rex, regmodrm)];
        }
+       else if (ip->i_extra == db_Grp7) {
+            if((regmodrm & 0xC0) == 0xC0) {
+               i_name = db_Grp7_11[f_reg(rex, regmodrm)][regmodrm &0x7];
+               i_mode = 0;
+            }
+            else {
+               i_name = ((const char * const *)ip->i_extra)[f_reg(rex, regmodrm)];
+            }
+       }
        else if (ip->i_extra == db_Grp3) {
            ip = ip->i_extra;
            ip = &ip[f_reg(rex, regmodrm)];
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index 1b75d12..6de4d6a 100644 (file)
@@ -570,4 +570,6 @@ struct sysent sysent[] = {
        { AS(linkat_args), (sy_call_t *)sys_linkat },   /* 531 = linkat */
        { AS(eaccess_args), (sy_call_t *)sys_eaccess }, /* 532 = eaccess */
        { AS(lpathconf_args), (sy_call_t *)sys_lpathconf },     /* 533 = lpathconf */
+       { AS(vmm_guest_ctl_args), (sy_call_t *)sys_vmm_guest_ctl },     /* 534 = vmm_guest_ctl */
+       { AS(vmm_guest_sync_addr_args), (sy_call_t *)sys_vmm_guest_sync_addr }, /* 535 = vmm_guest_sync_addr */
 };
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index bc89cce..24f8048 100644 (file)
@@ -73,6 +73,7 @@
 #include <sys/refcount.h>
 #include <sys/thread2.h>
 #include <sys/mplock2.h>
+#include <vm/vm_page2.h>
 
 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
 MALLOC_DEFINE(M_EXECARGS, "exec-args", "Exec arguments");
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index cd1b48b..27fd62b 100644 (file)
@@ -76,6 +76,8 @@
 #include <sys/sysref2.h>
 #include <sys/mplock2.h>
 
+#include <machine/vmm.h>
+
 static void reaplwps(void *context, int dummy);
 static void reaplwp(struct lwp *lp);
 static void killlwps(struct lwp *lp);
@@ -595,6 +597,9 @@ lwp_exit(int masterexit)
        if (lp->lwp_vkernel)
                vkernel_lwp_exit(lp);
 
+       if (td->td_vmm)
+               vmm_vmdestroy();
+
        /*
         * Clean up select/poll support
         */
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index 3e8b6ab..65dc040 100644 (file)
@@ -383,7 +383,13 @@ pfind(pid_t pid)
 struct proc *
 pfindn(pid_t pid)
 {
-       struct proc *p;
+       struct proc *p = curproc;
+
+       /*
+        * Shortcut the current process
+        */
+       if (p && p->p_pid == pid)
+               return (p);
 
        lwkt_gettoken(&proc_token);
        LIST_FOREACH(p, PIDHASH(pid), p_hash) {
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
index a2d5ba2..b6a19ab 100644 (file)
@@ -94,7 +94,6 @@ int
 sys_lwp_gettid(struct lwp_gettid_args *uap)
 {
        struct lwp *lp = curthread->td_lwp;
-
        uap->sysmsg_result = lp->lwp_tid;
        return (0);
 }
diff --git a/sys/kern/kern_slaballoc.c b/sys/kern/kern_slaballoc.c
index c577747..95f42d1 100644 (file)
 #include <machine/cpu.h>
 
 #include <sys/thread2.h>
+#include <vm/vm_page2.h>
 
 #define btokup(z)      (&pmap_kvtom((vm_offset_t)(z))->ku_pagecnt)
 
diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c
index 6820d16..7f592e2 100644 (file)
@@ -65,6 +65,8 @@
 
 #include <vm/vm_page2.h>
 
+#include <machine/vmm.h>
+
 static void umtx_sleep_page_action_cow(vm_page_t m, vm_page_action_t action);
 
 /*
@@ -108,6 +110,13 @@ sys_umtx_sleep(struct umtx_sleep_args *uap)
 
     if (uap->timeout < 0)
        return (EINVAL);
+
+    if (curthread->td_vmm) {
+       register_t gpa;
+       vmm_vm_get_gpa(curproc, &gpa, (register_t) uap->ptr);
+       uap->ptr = (const int *)gpa;
+    }
+
     if ((vm_offset_t)uap->ptr & (sizeof(int) - 1))
        return (EFAULT);
 
@@ -193,6 +202,12 @@ sys_umtx_wakeup(struct umtx_wakeup_args *uap)
     int error;
     void *waddr;
 
+    if (curthread->td_vmm) {
+       register_t gpa;
+       vmm_vm_get_gpa(curproc, &gpa, (register_t) uap->ptr);
+       uap->ptr = (const int *)gpa;
+    }
+
     cpu_mfence();
     if ((vm_offset_t)uap->ptr & (sizeof(int) - 1))
        return (EFAULT);
diff --git a/sys/kern/lwkt_ipiq.c b/sys/kern/lwkt_ipiq.c
index 94f9dac..d426b28 100644 (file)
@@ -185,6 +185,9 @@ lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
 {
     lwkt_ipiq_t ip;
     int windex;
+#ifdef _KERNEL_VIRTUAL
+    int repeating = 0;
+#endif
     struct globaldata *gd = mycpu;
 
     logipiq(send_norm, func, arg1, arg2, gd, target);
@@ -230,6 +233,10 @@ lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
            KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
            lwkt_process_ipiq();
            cpu_pause();
+#ifdef _KERNEL_VIRTUAL
+           if (repeating++ > 10)
+                   pthread_yield();
+#endif
        }
        DEBUG_POP_INFO();
 #if defined(__i386__)
@@ -283,6 +290,9 @@ lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func,
 {
     lwkt_ipiq_t ip;
     int windex;
+#ifdef _KERNEL_VIRTUAL
+    int repeating = 0;
+#endif
     struct globaldata *gd = mycpu;
 
     KKASSERT(target != gd);
@@ -320,6 +330,10 @@ lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func,
            KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
            lwkt_process_ipiq();
            cpu_pause();
+#ifdef _KERNEL_VIRTUAL
+           if (repeating++ > 10)
+                   pthread_yield();
+#endif
        }
        DEBUG_POP_INFO();
 #if defined(__i386__)
@@ -463,6 +477,9 @@ lwkt_wait_ipiq(globaldata_t target, int seq)
            int64_t time_tgt = tsc_get_target(1000000000LL);
            int time_loops = 10;
            int benice = 0;
+#ifdef _KERNEL_VIRTUAL
+           int repeating = 0;
+#endif
 
            cpu_enable_intr();
            DEBUG_PUSH_INFO("wait_ipiq");
@@ -470,6 +487,10 @@ lwkt_wait_ipiq(globaldata_t target, int seq)
                crit_enter();
                lwkt_process_ipiq();
                crit_exit();
+#ifdef _KERNEL_VIRTUAL
+               if (repeating++ > 10)
+                       pthread_yield();
+#endif
 
                /*
                 * IPIQs must be handled within 10 seconds and this code
diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c
index 7ae40a1..a6723b1 100644 (file)
@@ -512,6 +512,7 @@ lwkt_free_thread(thread_t td)
        td->td_kstack = NULL;
        td->td_kstack_size = 0;
     }
+
     KTR_LOG(ctxsw_deadtd, td);
 }
 
@@ -1394,6 +1395,9 @@ lwkt_acquire(thread_t td)
                        td, td->td_flags);
                retry = 10000000;
            }
+#ifdef _KERNEL_VIRTUAL
+           pthread_yield();
+#endif
        }
        DEBUG_POP_INFO();
        cpu_mfence();
diff --git a/sys/kern/sys_vmm.c b/sys/kern/sys_vmm.c
new file mode 100644 (file)
index 0000000..dd81d68
--- /dev/null
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Mihai Carabas <mihai.carabas@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/proc.h>
+#include <sys/user.h>
+#include <sys/wait.h>
+#include <sys/vmm.h>
+
+#include <sys/thread2.h>
+#include <sys/spinlock2.h>
+
+#include <machine/cpu.h>
+#include <machine/vmm.h>
+
+/*
+ * vmm guest system call:
+ * - init the calling thread structure
+ * - prepare for running in non-root mode
+ */
+int
+sys_vmm_guest_ctl(struct vmm_guest_ctl_args *uap)
+{
+       int error = 0;
+       struct guest_options options;
+       struct trapframe *tf = uap->sysmsg_frame;
+       unsigned long stack_limit = USRSTACK;
+       unsigned char stack_page[PAGE_SIZE];
+
+       clear_quickret();
+
+       switch (uap->op) {
+               case VMM_GUEST_RUN:
+                       error = copyin(uap->options, &options, sizeof(struct guest_options));
+                       if (error) {
+                               kprintf("sys_vmm_guest: error copyin guest_options\n");
+                               goto out;
+                       }
+
+                       while(stack_limit > tf->tf_sp) {
+                               stack_limit -= PAGE_SIZE;
+                               options.new_stack -= PAGE_SIZE;
+
+                               error = copyin((const void *)stack_limit, (void *)stack_page, PAGE_SIZE);
+                               if (error) {
+                                       kprintf("sys_vmm_guest: error copyin stack\n");
+                                       goto out;
+                               }
+
+                               error = copyout((const void *)stack_page, (void *)options.new_stack, PAGE_SIZE);
+                               if (error) {
+                                       kprintf("sys_vmm_guest: error copyout stack\n");
+                                       goto out;
+                               }
+                       }
+
+                       bcopy(tf, &options.tf, sizeof(struct trapframe));
+
+                       error = vmm_vminit(&options);
+                       if (error) {
+                               if (error == ENODEV) {
+                                       kprintf("sys_vmm_guest: vmm_vminit failed -"
+                                           "no VMM available \n");
+                                       goto out;
+                               } else {
+                                       kprintf("sys_vmm_guest: vmm_vminit failed\n");
+                                       goto out_exit;
+                               }
+                       }
+
+                       generic_lwp_return(curthread->td_lwp, tf);
+
+                       error = vmm_vmrun();
+
+                       break;
+               default:
+                       kprintf("sys_vmm_guest: INVALID op\n");
+                       error = EINVAL;
+                       goto out;
+       }
+out_exit:
+       exit1(W_EXITCODE(error, 0));
+out:
+       return (error);
+}
+
+static
+void
+vmm_exit_vmm(void *dummy __unused)
+{
+}
+
+int
+sys_vmm_guest_sync_addr(struct vmm_guest_sync_addr_args *uap)
+{
+       int error = 0;
+       cpumask_t oactive;
+       cpumask_t nactive;
+       long val;
+       struct proc *p = curproc;
+
+       if (p->p_vmm == NULL)
+               return ENOSYS;
+
+       crit_enter_id("vmm_inval");
+
+       /*
+        * Set CPUMASK_LOCK, spin if anyone else is trying to set CPUMASK_LOCK.
+        */
+       for (;;) {
+               oactive = p->p_vmm_cpumask & ~CPUMASK_LOCK;
+               cpu_ccfence();
+               nactive = oactive | CPUMASK_LOCK;
+               if (atomic_cmpset_cpumask(&p->p_vmm_cpumask, oactive, nactive))
+                       break;
+               lwkt_process_ipiq();
+               cpu_pause();
+       }
+
+       /*
+        * Wait for other cpu's to exit VMM mode (for this vkernel).  No
+        * new cpus will enter VMM mode while we hold the lock.  New waiters
+        * may turn-up though so the wakeup() later on has to be
+        * unconditional.
+        */
+       if (oactive & mycpu->gd_other_cpus) {
+               lwkt_send_ipiq_mask(oactive & mycpu->gd_other_cpus,
+                                   vmm_exit_vmm, NULL);
+               while (p->p_vmm_cpumask & ~CPUMASK_LOCK) {
+                       lwkt_process_ipiq();
+                       cpu_pause();
+               }
+       }
+
+       /*
+        * Make the requested modification, wakeup any waiters.
+        */
+       copyin(uap->srcaddr, &val, sizeof(long));
+       copyout(&val, uap->dstaddr, sizeof(long));
+
+       atomic_clear_cpumask(&p->p_vmm_cpumask, CPUMASK_LOCK);
+       wakeup(&p->p_vmm_cpumask);
+
+       crit_exit_id("vmm_inval");
+
+       return error;
+}
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index 21d8277..021bd26 100644 (file)
@@ -542,4 +542,6 @@ const char *syscallnames[] = {
        "linkat",                       /* 531 = linkat */
        "eaccess",                      /* 532 = eaccess */
        "lpathconf",                    /* 533 = lpathconf */
+       "vmm_guest_ctl",                        /* 534 = vmm_guest_ctl */
+       "vmm_guest_sync_addr",                  /* 535 = vmm_guest_sync_addr */
 };
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index 3f1f740..832a1e1 100644 (file)
                                char *path2, int flags); }
 532    STD     BSD     { int eaccess(char *path, int flags); }
 533    STD     BSD     { int lpathconf(char *path, int name); }
+534    STD     BSD     { int vmm_guest_ctl(int op, struct guest_options *options); }
+535    STD     BSD     { int vmm_guest_sync_addr(long *dstaddr, long *srcaddr); }
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index 7b1a698..502cbb7 100644 (file)
@@ -75,6 +75,7 @@
 #include <sys/msgport2.h>
 #include <sys/socketvar2.h>
 #include <net/netmsg2.h>
+#include <vm/vm_page2.h>
 
 #ifdef SCTP
 #include <netinet/sctp_peeloff.h>
diff --git a/sys/kern/vfs_helper.c b/sys/kern/vfs_helper.c
index b260a44..6104691 100644 (file)
@@ -57,6 +57,7 @@
 #include <sys/sfbuf.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
+#include <vm/vm_page2.h>
 
 #ifdef LWBUF_IS_OPTIMAL
 
diff --git a/sys/kern/vfs_vm.c b/sys/kern/vfs_vm.c
index 99e51d4..afad843 100644 (file)
@@ -87,6 +87,7 @@
 #include <sys/buf2.h>
 #include <sys/thread2.h>
 #include <sys/sysref2.h>
+#include <vm/vm_page2.h>
 
 static int nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
 static int nvtruncbuf_bp_trunc(struct buf *bp, void *data);
diff --git a/sys/platform/pc32/i386/pmap.c b/sys/platform/pc32/i386/pmap.c
index b562aaa..db9f4b0 100644 (file)
@@ -841,6 +841,17 @@ pmap_extract(pmap_t pmap, vm_offset_t va)
        return rtval;
 }
 
+/*
+ * Similar to extract but checks protections, SMP-friendly short-cut for
+ * vm_fault_page[_quick]().
+ */
+vm_page_t
+pmap_fault_page_quick(pmap_t pmap __unused, vm_offset_t vaddr __unused,
+                     vm_prot_t prot __unused)
+{
+       return(NULL);
+}
+
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
diff --git a/sys/platform/pc32/include/pmap.h b/sys/platform/pc32/include/pmap.h
index f460d88..518f0d9 100644 (file)
@@ -322,6 +322,11 @@ void       pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t);
 void   pmap_invalidate_cache_pages(vm_page_t *pages, int count);
 void   pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
 
+static __inline int
+pmap_emulate_ad_bits(pmap_t pmap) {
+       return 0;
+}
+
 #endif /* _KERNEL */
 
 #endif /* !LOCORE */
similarity index 53%
copy from sys/platform/vkernel64/include/pmap_inval.h
copy to sys/platform/pc32/include/vmm.h
index f99fe3f..520099a 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
- * by Matthew Dillon <dillon@backplane.com>
+ * by Mihai Carabas <mihai.carabas@gmail.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- * $DragonFly: src/sys/platform/vkernel/include/pmap_inval.h,v 1.3 2007/07/02 02:22:57 dillon Exp $
  */
 
-#ifndef _MACHINE_PMAP_INVAL_H_
-#define        _MACHINE_PMAP_INVAL_H_
+#ifndef _MACHINE_VMM_H_
+#define _MACHINE_VMM_H_
 
-#ifndef _SYS_THREAD_H_
-#include <sys/thread.h>
-#endif
+#include <sys/vmm.h>
 
-typedef struct pmap_inval_info {
-    int                        pir_flags;
-    struct lwkt_cpusync        pir_cpusync;
-} pmap_inval_info;
+static __inline
+int vmm_vminit(struct guest_options* opts) {
+       return 0;
+}
 
-typedef pmap_inval_info *pmap_inval_info_t;
+static __inline
+int vmm_vmdestroy(void) {
+       return 0;
+}
 
-#define PIRF_INVLTLB   0x0001  /* request invalidation of whole table */
-#define PIRF_INVL1PG   0x0002  /* else request invalidation of one page */
-#define PIRF_CPUSYNC   0x0004  /* cpusync is currently active */
+static __inline
+int vmm_vmrun(void) {
+       return 0;
+}
 
-#ifdef _KERNEL
+static __inline
+int vmm_vm_set_tls_area(void) {
+       return 0;
+}
 
-#ifndef _MACHINE_PMAP_H_
-#include <machine/pmap.h>
-#endif
+static __inline
+void vmm_lwp_return(struct lwp *lp, struct trapframe *frame) {
+}
 
-void pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
+static __inline
+void vmm_vm_set_guest_cr3(register_t guest_cr3) {
+}
 
-#endif
+static __inline
+int vmm_vm_get_gpa(struct proc *p, register_t *gpa, register_t uaddr) {
+       *gpa = 0;
+       return 0;
+}
 
 #endif
diff --git a/sys/platform/pc64/conf/files b/sys/platform/pc64/conf/files
index ef55c45..6a0cde5 100644 (file)
@@ -171,6 +171,7 @@ platform/pc64/x86_64/identcpu.c             standard
 platform/pc64/x86_64/amd64_mem.c       standard
 platform/pc64/x86_64/cpufreq_machdep.c    standard
 platform/pc64/x86_64/msi.c             standard
+platform/pc64/x86_64/uwrapper.c                standard
 
 platform/pc64/apic/lapic.c             standard
 platform/pc64/apic/ioapic.c            standard
@@ -218,3 +219,20 @@ platform/pc64/isa/clock.c          standard
 platform/pc64/isa/isa_intr.c           optional        isa
 platform/pc64/x86_64/spinlock.s                standard
 dev/netif/elink_layer/elink.c          optional        ep
+
+vmx_genassym.o                         standard                                        \
+       dependency      " $S/platform/$P/vmm/vmx_genassym.c ${FORWARD_HEADERS_COOKIE}"  \
+       compile-with    "${CC} ${CFLAGS:N-fno-common} ${WERROR} -c ${.IMPSRC}"          \
+       no-obj no-implicit-rule                                                         \
+       clean           "vmx_genassym.o"
+vmx_assym.h                            standard                                        \
+       dependency      "$S/kern/genassym.sh vmx_genassym.o"                            \
+       compile-with    "sh $S/kern/genassym.sh vmx_genassym.o > ${.TARGET}"            \
+       no-obj no-implicit-rule before-depend                                           \
+       clean           "vmx_assym.h"
+platform/pc64/vmm/vmm.c                        standard
+platform/pc64/vmm/vmm_utils.c          standard
+platform/pc64/vmm/vmx.c                        standard
+platform/pc64/vmm/ept.c                        standard
+platform/pc64/vmm/vmx_trap.s           standard
+platform/pc64/vmm/svm.c                        standard
diff --git a/sys/platform/pc64/include/md_var.h b/sys/platform/pc64/include/md_var.h
index 393b49a..5741538 100644 (file)
@@ -100,6 +100,8 @@ void        cpu_kthread_restore (void);/* cannot be called from C */
 thread_t cpu_exit_switch (struct thread *next);
 
 void   syscall2 (struct trapframe *);
+void   trap_handle_userenter(struct thread *);
+void   trap_handle_userexit(struct trapframe *, int);
 void    minidumpsys(struct dumperinfo *);
 void   dump_add_page(vm_paddr_t);
 void   dump_drop_page(vm_paddr_t);
diff --git a/sys/platform/pc64/include/pmap.h b/sys/platform/pc64/include/pmap.h
index 6a628f3..09b410f 100644 (file)
@@ -250,6 +250,28 @@ struct pv_entry_rb_tree;
 RB_PROTOTYPE2(pv_entry_rb_tree, pv_entry, pv_entry,
              pv_entry_compare, vm_pindex_t);
 
+/* Types of PMAP (regular, EPT Intel, NPT Amd) */
+#define        REGULAR_PMAP            0
+#define        EPT_PMAP                1
+
+/* Bits indexes in pmap_bits */
+#define        TYPE_IDX                0
+#define        PG_V_IDX                1
+#define        PG_RW_IDX               2
+#define        PG_U_IDX                3
+#define        PG_A_IDX                4
+#define        PG_M_IDX                5
+#define        PG_PS_IDX               6
+#define        PG_G_IDX                7
+#define        PG_W_IDX                8
+#define        PG_MANAGED_IDX          9
+#define        PG_DEVICE_IDX           10
+#define        PG_N_IDX                11
+#define        PG_BITS_SIZE            12
+
+#define PROTECTION_CODES_SIZE  8
+#define PAT_INDEX_SIZE  8
+
 struct pmap {
        pml4_entry_t            *pm_pml4;       /* KVA of level 4 page table */
        struct pv_entry         *pm_pmlpv;      /* PV entry for pml4 */
@@ -263,12 +285,26 @@ struct pmap {
        int                     pm_generation;  /* detect pvlist deletions */
        struct spinlock         pm_spin;
        struct lwkt_token       pm_token;
+       long                    pm_invgen;
+       uint64_t                pmap_bits[PG_BITS_SIZE];
+       int                     protection_codes[PROTECTION_CODES_SIZE];
+       pt_entry_t              pmap_cache_bits[PAT_INDEX_SIZE];
+       pt_entry_t              pmap_cache_mask;
+       int (*copyinstr)(const void *, void *, size_t, size_t *);
+       int (*copyin)(const void *, void *, size_t);
+       int (*copyout)(const void *, void *, size_t);
+       int (*fubyte)(const void *);
+       int (*subyte)(void *, int);
+       long (*fuword)(const void *);
+       int (*suword)(void *, long);
+       int (*suword32)(void *, int);
 };
 
 #define CPUMASK_LOCK           CPUMASK(SMP_MAXCPU)
 #define CPUMASK_BIT            SMP_MAXCPU      /* for 1LLU << SMP_MAXCPU */
 
 #define PMAP_FLAG_SIMPLE       0x00000001
+#define PMAP_EMULATE_AD_BITS   0x00000002
 
 #define pmap_resident_count(pmap) (pmap)->pm_stats.resident_count
 
@@ -334,6 +370,11 @@ void       pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t);
 void   pmap_invalidate_cache_pages(vm_page_t *pages, int count);
 void   pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva);
 
+static __inline int
+pmap_emulate_ad_bits(pmap_t pmap) {
+       return pmap->pm_flags & PMAP_EMULATE_AD_BITS;
+}
+
 #endif /* _KERNEL */
 
 #endif /* !LOCORE */
similarity index 53%
copy from sys/platform/vkernel64/include/pmap_inval.h
copy to sys/platform/pc64/include/vmm.h
index f99fe3f..3510921 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
- * by Matthew Dillon <dillon@backplane.com>
+ * by Mihai Carabas <mihai.carabas@gmail.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- * $DragonFly: src/sys/platform/vkernel/include/pmap_inval.h,v 1.3 2007/07/02 02:22:57 dillon Exp $
  */
 
-#ifndef _MACHINE_PMAP_INVAL_H_
-#define        _MACHINE_PMAP_INVAL_H_
-
-#ifndef _SYS_THREAD_H_
-#include <sys/thread.h>
-#endif
-
-typedef struct pmap_inval_info {
-    int                        pir_flags;
-    struct lwkt_cpusync        pir_cpusync;
-} pmap_inval_info;
-
-typedef pmap_inval_info *pmap_inval_info_t;
+#ifndef _MACHINE_VMM_H_
+#define _MACHINE_VMM_H_
 
-#define PIRF_INVLTLB   0x0001  /* request invalidation of whole table */
-#define PIRF_INVL1PG   0x0002  /* else request invalidation of one page */
-#define PIRF_CPUSYNC   0x0004  /* cpusync is currently active */
+#include <sys/vmm.h>
 
-#ifdef _KERNEL
-
-#ifndef _MACHINE_PMAP_H_
-#include <machine/pmap.h>
-#endif
-
-void pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-
-#endif
+int vmm_vminit(struct guest_options*);
+int vmm_vmdestroy(void);
+int vmm_vmrun(void);
+int vmm_vm_set_tls_area(void);
+void vmm_lwp_return(struct lwp *lp, struct trapframe *frame);
+void vmm_vm_set_guest_cr3(register_t guest_cr3);
+int vmm_vm_get_gpa(struct proc *p, register_t *gpa, register_t uaddr);
 
 #endif
diff --git a/sys/platform/pc64/vmm/ept.c b/sys/platform/pc64/vmm/ept.c
new file mode 100644 (file)
index 0000000..0cae3a6
--- /dev/null
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Mihai Carabas <mihai.carabas@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/systm.h>
+#include <sys/sfbuf.h>
+#include <sys/proc.h>
+#include <sys/thread.h>
+
+#include <machine/pmap.h>
+#include <machine/specialreg.h>
+#include <machine/cpufunc.h>
+#include <machine/vmm.h>
+
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+
+#include "vmx.h"
+#include "ept.h"
+#include "vmm_utils.h"
+
+static uint64_t pmap_bits_ept[PG_BITS_SIZE];
+static pt_entry_t pmap_cache_bits_ept[PAT_INDEX_SIZE];
+static int ept_protection_codes[PROTECTION_CODES_SIZE];
+static pt_entry_t pmap_cache_mask_ept;
+
+static int pmap_pm_flags_ept;
+static int eptp_bits;
+
+extern uint64_t vmx_ept_vpid_cap;
+
+int
+vmx_ept_init(void)
+{
+       int prot;
+       /* Chapter 28 VMX SUPPORT FOR ADDRESS TRANSLATION
+        * Intel Manual 3c, page 107
+        */
+       vmx_ept_vpid_cap = rdmsr(IA32_VMX_EPT_VPID_CAP);
+
+       if(!EPT_PWL4(vmx_ept_vpid_cap)||
+           !EPT_MEMORY_TYPE_WB(vmx_ept_vpid_cap)) {
+               return EINVAL;
+       }
+
+       eptp_bits |= EPTP_CACHE(PAT_WRITE_BACK) |
+           EPTP_PWLEN(EPT_PWLEVELS - 1);
+
+       if (EPT_AD_BITS_SUPPORTED(vmx_ept_vpid_cap)) {
+               eptp_bits |= EPTP_AD_ENABLE;
+       } else {
+               pmap_pm_flags_ept = PMAP_EMULATE_AD_BITS;
+       }
+
+       /* Initialize EPT bits
+        * - for PG_V - set READ and EXECUTE to preserve compatibility
+        * - for PG_U and PG_G - set 0 to preserve compatiblity
+        * - for PG_N - set the Uncacheable bit
+        */
+       pmap_bits_ept[TYPE_IDX] = EPT_PMAP;
+       pmap_bits_ept[PG_V_IDX] = EPT_PG_READ | EPT_PG_EXECUTE;
+       pmap_bits_ept[PG_RW_IDX] = EPT_PG_WRITE;
+       pmap_bits_ept[PG_PS_IDX] = EPT_PG_PS;
+       pmap_bits_ept[PG_G_IDX] = 0;
+       pmap_bits_ept[PG_U_IDX] = 0;
+       pmap_bits_ept[PG_A_IDX] = EPT_PG_A;
+       pmap_bits_ept[PG_M_IDX] = EPT_PG_M;
+       pmap_bits_ept[PG_W_IDX] = EPT_PG_AVAIL1;
+       pmap_bits_ept[PG_MANAGED_IDX] = EPT_PG_AVAIL2;
+       pmap_bits_ept[PG_DEVICE_IDX] = EPT_PG_AVAIL3;
+       pmap_bits_ept[PG_N_IDX] = EPT_IGNORE_PAT | EPT_MEM_TYPE_UC;
+
+
+       pmap_cache_mask_ept = EPT_IGNORE_PAT | EPT_MEM_TYPE_MASK;
+
+       pmap_cache_bits_ept[PAT_UNCACHEABLE] = EPT_IGNORE_PAT | EPT_MEM_TYPE_UC;
+       pmap_cache_bits_ept[PAT_WRITE_COMBINING] = EPT_IGNORE_PAT | EPT_MEM_TYPE_WC;
+       pmap_cache_bits_ept[PAT_WRITE_THROUGH] = EPT_IGNORE_PAT | EPT_MEM_TYPE_WT;
+       pmap_cache_bits_ept[PAT_WRITE_PROTECTED] = EPT_IGNORE_PAT | EPT_MEM_TYPE_WP;
+       pmap_cache_bits_ept[PAT_WRITE_BACK] = EPT_IGNORE_PAT | EPT_MEM_TYPE_WB;
+       pmap_cache_bits_ept[PAT_UNCACHED] = EPT_IGNORE_PAT | EPT_MEM_TYPE_UC;
+
+       for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) {
+               switch (prot) {
+               case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
+               case VM_PROT_READ | VM_PROT_NONE | VM_PROT_NONE:
+               case VM_PROT_READ | VM_PROT_NONE | VM_PROT_EXECUTE:
+               case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_EXECUTE:
+                       ept_protection_codes[prot] = 0;
+                       break;
+               case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_NONE:
+               case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
+               case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
+               case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
+                       ept_protection_codes[prot] = pmap_bits_ept[PG_RW_IDX];
+
+                       break;
+               }
+       }
+
+       return 0;
+}
+
+/* Build the VMCS_EPTP pointer
+ * - the ept_address
+ * - the EPTP bits indicating optional features
+ */
+uint64_t vmx_eptp(uint64_t ept_address)
+{
+       return (ept_address | eptp_bits);
+}
+
+/* Copyin from guest VMM */
+static int
+ept_copyin(const void *udaddr, void *kaddr, size_t len)
+{
+       struct lwbuf *lwb;
+       struct lwbuf lwb_cache;
+       vm_page_t m;
+       register_t gpa;
+       size_t n;
+       int err = 0;
+       struct vmspace *vm = curproc->p_vmspace;
+       struct vmx_thread_info *vti = curthread->td_vmm;
+       register_t guest_cr3 = vti->guest_cr3;
+
+       while (len) {
+               /* Get the GPA by manually walking the-GUEST page table*/
+               err = guest_phys_addr(vm, &gpa, guest_cr3, (vm_offset_t)udaddr);
+               if (err) {
+                       kprintf("%s: could not get guest_phys_addr\n", __func__);
+                       break;
+               }
+
+               m = vm_fault_page(&vm->vm_map, trunc_page(gpa),
+                   VM_PROT_READ, VM_FAULT_NORMAL, &err);
+               if (err) {
+                       kprintf("%s: could not fault in vm map, gpa: %llx\n",
+                           __func__, (unsigned long long) gpa);
+                       break;
+               }
+
+               n = PAGE_SIZE - ((vm_offset_t)udaddr & PAGE_MASK);
+               if (n > len)
+                       n = len;
+
+               lwb = lwbuf_alloc(m, &lwb_cache);
+               bcopy((char *)lwbuf_kva(lwb)+((vm_offset_t)udaddr & PAGE_MASK), kaddr, n);
+               len -= n;
+               udaddr = (const char *)udaddr + n;
+               kaddr = (char *)kaddr + n;
+               lwbuf_free(lwb);
+               vm_page_unhold(m);
+       }
+       if (err)
+               err = EFAULT;
+       return (err);
+}
+
+/* Copyout from guest VMM */
+static int
+ept_copyout(const void *kaddr, void *udaddr, size_t len)
+{
+       struct lwbuf *lwb;
+       struct lwbuf lwb_cache;
+       vm_page_t m;
+       register_t gpa;
+       size_t n;
+       int err = 0;
+       struct vmspace *vm = curproc->p_vmspace;
+       struct vmx_thread_info *vti = curthread->td_vmm;
+       register_t guest_cr3 = vti->guest_cr3;
+
+       while (len) {
+               /* Get the GPA by manually walking the-GUEST page table*/
+               err = guest_phys_addr(vm, &gpa, guest_cr3, (vm_offset_t)udaddr);
+               if (err) {
+                       kprintf("%s: could not get guest_phys_addr\n", __func__);
+                       break;
+               }
+
+               m = vm_fault_page(&vm->vm_map, trunc_page(gpa),
+                   VM_PROT_READ | VM_PROT_WRITE,
+                   VM_FAULT_NORMAL, &err);
+               if (err) {
+                       kprintf("%s: could not fault in vm map, gpa: %llx\n",
+                           __func__, (unsigned long long) gpa);
+                       break;
+               }
+
+               n = PAGE_SIZE - ((vm_offset_t)udaddr & PAGE_MASK);
+               if (n > len)
+                       n = len;
+
+               lwb = lwbuf_alloc(m, &lwb_cache);
+               bcopy(kaddr, (char *)lwbuf_kva(lwb) +
+                            ((vm_offset_t)udaddr & PAGE_MASK), n);
+
+               len -= n;
+               udaddr = (char *)udaddr + n;
+               kaddr = (const char *)kaddr + n;
+               vm_page_dirty(m);
+#if 0
+               /* should not be needed */
+               cpu_invlpg((char *)lwbuf_kva(lwb) +
+                            ((vm_offset_t)udaddr & PAGE_MASK));
+#endif
+               lwbuf_free(lwb);
+               vm_page_unhold(m);
+       }
+       if (err)
+               err = EFAULT;
+       return (err);
+}
+
+static int
+ept_copyinstr(const void *udaddr, void *kaddr, size_t len, size_t *res)
+{
+       int error;
+       size_t n;
+       const char *uptr = udaddr;
+       char *kptr = kaddr;
+
+       if (res)
+               *res = 0;
+       while (len) {
+               n = PAGE_SIZE - ((vm_offset_t)uptr & PAGE_MASK);
+               if (n > 32)
+                       n = 32;
+               if (n > len)
+                       n = len;
+               if ((error = ept_copyin(uptr, kptr, n)) != 0)
+                       return(error);
+               while (n) {
+                       if (res)
+                               ++*res;
+                       if (*kptr == 0)
+                               return(0);
+                       ++kptr;
+                       ++uptr;
+                       --n;
+                       --len;
+               }
+
+       }
+       return(ENAMETOOLONG);
+}
+
+
+static int
+ept_fubyte(const void *base)
+{
+       unsigned char c = 0;
+
+       if (ept_copyin(base, &c, 1) == 0)
+               return((int)c);
+       return(-1);
+}
+
+static int
+ept_subyte(void *base, int byte)
+{
+       unsigned char c = byte;
+
+       if (ept_copyout(&c, base, 1) == 0)
+               return(0);
+       return(-1);
+}
+
+static long
+ept_fuword(const void *base)
+{
+       long v;
+
+       if (ept_copyin(base, &v, sizeof(v)) == 0)
+               return(v);
+       return(-1);
+}
+
+static int
+ept_suword(void *base, long word)
+{
+       if (ept_copyout(&word, base, sizeof(word)) == 0)
+               return(0);
+       return(-1);
+}
+
+static int
+ept_suword32(void *base, int word)
+{
+       if (ept_copyout(&word, base, sizeof(word)) == 0)
+               return(0);
+       return(-1);
+}
+
+void
+vmx_ept_pmap_pinit(pmap_t pmap)
+{
+       pmap->pm_flags |= pmap_pm_flags_ept;
+
+       bcopy(pmap_bits_ept, pmap->pmap_bits, sizeof(pmap_bits_ept));
+       bcopy(ept_protection_codes, pmap->protection_codes,
+             sizeof(ept_protection_codes));
+       bcopy(pmap_cache_bits_ept, pmap->pmap_cache_bits,
+             sizeof(pmap_cache_bits_ept));
+       pmap->pmap_cache_mask = pmap_cache_mask_ept;
+       pmap->copyinstr = ept_copyinstr;
+       pmap->copyin = ept_copyin;
+       pmap->copyout = ept_copyout;
+       pmap->fubyte = ept_fubyte;
+       pmap->subyte = ept_subyte;
+       pmap->fuword = ept_fuword;
+       pmap->suword = ept_suword;
+       pmap->suword32 = ept_suword32;
+}
diff --git a/sys/platform/pc64/vmm/ept.h b/sys/platform/pc64/vmm/ept.h
new file mode 100644 (file)
index 0000000..00db30b
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Mihai Carabas <mihai.carabas@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _VMM_EPT_H_
+#define _VMM_EPT_H_
+
+#include <vm/vm.h>
+
+#include <machine/pmap.h>
+
+/* EPT defines */
+#define        EPT_PWL4(cap)                   ((cap) & (1ULL << 6))
+#define        EPT_MEMORY_TYPE_WB(cap)         ((cap) & (1UL << 14))
+#define        EPT_AD_BITS_SUPPORTED(cap)      ((cap) & (1ULL << 21))
+#define        EPT_PG_READ                     (0x1ULL << 0)
+#define        EPT_PG_WRITE                    (0x1ULL << 1)
+#define        EPT_PG_EXECUTE                  (0x1ULL << 2)
+#define        EPT_IGNORE_PAT                  (0x1ULL << 6)
+#define        EPT_PG_PS                       (0x1ULL << 7)
+#define        EPT_PG_A                        (0x1ULL << 8)
+#define        EPT_PG_M                        (0x1ULL << 9)
+#define        EPT_PG_AVAIL1                   (0x1ULL << 10)
+#define        EPT_PG_AVAIL2                   (0x1ULL << 11)
+#define        EPT_PG_AVAIL3                   (0x1ULL << 52)
+#define        EPT_PWLEVELS                    (4)     /* page walk levels */
+
+#define        EPTP_CACHE(x)                   (x)
+#define        EPTP_PWLEN(x)                   ((x) << 3)
+#define        EPTP_AD_ENABLE                  (0x1ULL << 6)
+
+#define        EPT_MEM_TYPE_SHIFT              (0x3)
+#define        EPT_MEM_TYPE_UC                 (0x0ULL << EPT_MEM_TYPE_SHIFT)
+#define        EPT_MEM_TYPE_WC                 (0x1ULL << EPT_MEM_TYPE_SHIFT)
+#define        EPT_MEM_TYPE_WT                 (0x4ULL << EPT_MEM_TYPE_SHIFT)
+#define        EPT_MEM_TYPE_WP                 (0x5ULL << EPT_MEM_TYPE_SHIFT)
+#define        EPT_MEM_TYPE_WB                 (0x6ULL << EPT_MEM_TYPE_SHIFT)
+#define        EPT_MEM_TYPE_MASK               (0x7ULL << EPT_MEM_TYPE_SHIFT)
+
+#define        EPT_VIOLATION_READ              (1ULL << 0)
+#define        EPT_VIOLATION_WRITE             (1ULL << 1)
+#define        EPT_VIOLATION_INST_FETCH        (1ULL << 2)
+#define        EPT_VIOLATION_GPA_READABLE      (1ULL << 3)
+#define        EPT_VIOLATION_GPA_WRITEABLE     (1ULL << 4)
+#define        EPT_VIOLATION_GPA_EXECUTABLE    (1ULL << 5)
+
+#define        INVEPT_TYPE_SINGLE_CONTEXT      1UL
+#define        INVEPT_TYPE_ALL_CONTEXTS        2UL
+
+struct invept_desc {
+       uint64_t        eptp;
+       uint64_t        _res;
+};
+typedef struct invept_desc invept_desc_t;
+
+CTASSERT(sizeof(struct invept_desc) == 16);
+
+int vmx_ept_init(void);
+void vmx_ept_pmap_pinit(pmap_t pmap);
+uint64_t vmx_eptp(uint64_t ept_address);
+
+static __inline int
+vmx_ept_fault_type(uint64_t qualification)
+{
+       if (qualification & EPT_VIOLATION_WRITE)
+               return VM_PROT_WRITE;
+       else if (qualification & EPT_VIOLATION_INST_FETCH)
+               return VM_PROT_EXECUTE;
+       else
+               return VM_PROT_READ;
+}
+
+static __inline int
+vmx_ept_gpa_prot(uint64_t qualification)
+{
+       int prot = 0;
+
+       if (qualification & EPT_VIOLATION_GPA_READABLE)
+               prot |= VM_PROT_READ;
+
+       if (qualification & EPT_VIOLATION_GPA_WRITEABLE)
+               prot |= VM_PROT_WRITE;
+
+       if (qualification & EPT_VIOLATION_GPA_EXECUTABLE)
+               prot |= VM_PROT_EXECUTE;
+
+       return prot;
+}
+
+#endif /* _VMM_EPT_H_ */
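
The two inline helpers above give an EPT-violation handler everything needed to turn the VM-exit qualification into vm_prot_t values. A hedged sketch of how a handler might use them (handle_ept_violation() is illustrative; the real handling would live in the VMX exit path):

    static void
    handle_ept_violation(uint64_t qual, vm_offset_t gpa)
    {
            int fault_type = vmx_ept_fault_type(qual); /* what the guest tried */
            int gpa_prot = vmx_ept_gpa_prot(qual);     /* what the GPA allows */

            if ((fault_type & gpa_prot) == 0) {
                    /* Access not currently permitted: fault the backing
                     * page in with fault_type, e.g. via vm_fault(). */
            }
    }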
diff --git a/sys/platform/pc64/vmm/svm.c b/sys/platform/pc64/vmm/svm.c
new file mode 100644 (file)
index 0000000..e63b676
--- /dev/null
@@ -0,0 +1,156 @@
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/thread.h>
+#include <sys/vmm.h>
+
+#include <vm/pmap.h>
+
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#include "vmm.h"
+#include "svm.h"
+
+static int svm_available = 0;
+static int svm_enabled = 0;
+
+/* static int svm_rvi_available = 0; */
+/* static int svm_vmcb_caching_available = 0; */
+
+static vm_offset_t vm_hsave_va[MAXCPU];
+
+/*
+ * svm_init() - Identify AMD SVM support.
+ *
+ *     Called in early boot. Detects AMD SVM support and extended features.
+ */
+static int
+svm_init(void)
+{
+       uint64_t vm_cr;
+
+       /* SVM is identified by CPUID */
+       if ((amd_feature2 & AMDID2_SVM) == 0)
+               return (ENODEV);
+
+       /* Processor may support SVM, but it may be disabled. */
+       vm_cr = rdmsr(MSR_AMD_VM_CR);
+       if (vm_cr & MSR_AMD_VM_CR_SVMDIS)
+               return (ENODEV);
+
+       svm_available = 1;
+
+       return (0);
+}
+
+/*
+ * svm_enable() - Called to enable SVM extensions on every processor.
+ */
+static int
+svm_enable(void)
+{
+       uint64_t efer;
+       int origcpu;
+       int i;
+       vm_paddr_t vm_hsave_pa;
+
+       if (!svm_available)
+               return (ENODEV);
+
+       KKASSERT(svm_enabled == 0);
+
+       /* Set EFER.SVME and allocate a VM Host Save Area on every cpu */
+       origcpu = mycpuid;
+       for (i = 0; i < ncpus; i++) {
+               lwkt_migratecpu(i);
+
+               efer = rdmsr(MSR_EFER);
+               efer |= EFER_SVME;
+               wrmsr(MSR_EFER, efer);
+
+               vm_hsave_va[i] = (vm_offset_t) contigmalloc(4096, M_TEMP,
+                                                           M_WAITOK | M_ZERO,
+                                                           0, 0xffffffff,
+                                                           4096, 0);
+               vm_hsave_pa = vtophys(vm_hsave_va[i]);
+               wrmsr(MSR_AMD_VM_HSAVE_PA, vm_hsave_pa);
+       }
+       lwkt_migratecpu(origcpu);
+
+       svm_enabled = 1;
+
+       return (0);
+}
+
+/*
+ * svm_disable() - Called to disable SVM extensions on every processor.
+ */
+static int
+svm_disable(void)
+{
+       uint64_t efer;
+       int origcpu;
+       int i;
+
+       /* XXX Wait till no vmms are running? */
+
+       KKASSERT(svm_enabled == 1);
+
+       origcpu = mycpuid;
+       for (i = 0; i < ncpus; i++) {
+               lwkt_migratecpu(i);
+
+               wrmsr(MSR_AMD_VM_HSAVE_PA, 0);
+
+               contigfree((void *) vm_hsave_va[i], 4096, M_TEMP);
+               vm_hsave_va[i] = 0;
+
+               efer = rdmsr(MSR_EFER);
+               efer &= ~EFER_SVME;
+               wrmsr(MSR_EFER, efer);
+       }
+       lwkt_migratecpu(origcpu);
+
+       svm_enabled = 0;
+
+       return (0);
+}
+
+/*
+ * svm_vminit() - Prepare current thread for VMRUN.
+ *
+ *     Will allocate a VMCB for the current thread and flag the thread to
+ *     return to usermode via svm_vmrun().  Not yet implemented; always
+ *     returns ENODEV.
+ */
+static int
+svm_vminit(struct guest_options *options)
+{
+       return (ENODEV);
+}
+
+/*
+ * svm_vmdestroy() - Tear down the current thread's SVM state.
+ *
+ *     Not yet implemented; always returns ENODEV.
+ */
+static int
+svm_vmdestroy(void)
+{
+       return (ENODEV);
+}
+
+/*
+ * svm_vmrun() - Execute VMRUN on a prepared VMCB for a thread.
+ *
+ *     Called while a thread is returning to userspace, after being flagged as
+ *     a VMM thread. svm_vmrun() continues in a loop around VMRUN/#VMEXIT
+ *     handling until we are no longer a VMM thread.
+ */
+static int
+svm_vmrun(void)
+{
+       return (ENODEV);        /* not yet implemented */
+}
+
+static struct vmm_ctl ctl_svm = {
+       .name = "SVM",
+       .init = svm_init,
+       .enable = svm_enable,
+       .disable = svm_disable,
+       .vminit = svm_vminit,
+       .vmdestroy = svm_vmdestroy,
+       .vmrun = svm_vmrun,
+};
+
+struct vmm_ctl *
+get_ctl_amd(void)
+{
+       return &ctl_svm;
+}
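
Note that ctl_svm leaves vm_set_tls_area, vm_lwp_return, vm_set_guest_cr3 and vm_get_gpa unset. That is harmless for now because svm_vminit() fails with ENODEV before any of those vectors can be reached from vmm.c, but they will need to be filled in when SVM support is completed.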
diff --git a/sys/platform/pc64/vmm/svm.h b/sys/platform/pc64/vmm/svm.h
new file mode 100644 (file)
index 0000000..0d61a09
--- /dev/null
@@ -0,0 +1,90 @@
+#ifndef _VMM_SVM_H_
+#define _VMM_SVM_H_
+
+struct vmcb {
+       /* Control Area */
+       uint16_t        vmcb_cr_read;
+       uint16_t        vmcb_cr_write;
+       uint16_t        vmcb_dr_read;
+       uint16_t        vmcb_dr_write;
+       uint32_t        vmcb_exception;
+       uint32_t        vmcb_ctrl1;
+       uint32_t        vmcb_ctrl2;
+       uint8_t         vmcb_unused1[40];
+       uint16_t        vmcb_pause_filter_threshold;
+       uint16_t        vmcb_pause_filter_count;
+       uint64_t        vmcb_iopm_base_pa;
+       uint64_t        vmcb_msrpm_base_pa;
+       uint64_t        vmcb_tsc_offset;
+       uint32_t        vmcb_asid;
+       uint8_t         vmcb_tlb_ctrl;
+       uint8_t         vmcb_unused2[3];
+       uint8_t         vmcb_v_tpr;
+       uint8_t         vmcb_v_irq;
+       /* ... */
+       uint8_t         vmcb_v_intr_vector;
+       uint8_t         vmcb_unused3[3];
+       uint64_t        vmcb_intr_shadow;
+       uint64_t        vmcb_exitcode;
+       uint64_t        vmcb_exitinfo1;
+       uint64_t        vmcb_exitinfo2;
+       uint64_t        vmcb_exitintinfo;
+       uint64_t        vmcb_np_enable;
+       uint8_t         vmcb_unused4[16];
+       uint64_t        vmcb_event_injection;
+       uint64_t        vmcb_n_cr3;
+
+       /* Saved Guest State */
+};
+
+/* SVM Intercept Codes (vmcb_exitcode) */
+#define VMEXIT_CR_READ(_cr)    (0 + (_cr))
+#define VMEXIT_CR_WRITE(_cr)   (16 + (_cr))
+#define VMEXIT_DR_READ(_dr)    (32 + (_dr))
+#define VMEXIT_DR_WRITE(_dr)   (48 + (_dr))
+#define VMEXIT_EXCP(_excp)     (64 + (_excp))
+#define VMEXIT_INTR            96
+#define VMEXIT_NMI             97
+#define VMEXIT_SMI             98
+#define VMEXIT_INIT            99
+#define VMEXIT_VINTR           100
+#define VMEXIT_CR0_SEL_WRITE   101
+#define VMEXIT_IDTR_READ       102
+#define VMEXIT_GDTR_READ       103
+#define VMEXIT_LDTR_READ       104
+#define VMEXIT_TR_READ         105
+#define VMEXIT_IDTR_WRITE      106
+#define VMEXIT_GDTR_WRITE      107
+#define VMEXIT_LDTR_WRITE      108
+#define VMEXIT_TR_WRITE                109
+#define VMEXIT_RDTSC           110
+#define VMEXIT_RDPMC           111
+#define VMEXIT_PUSHF           112
+#define VMEXIT_POPF            113
+#define VMEXIT_CPUID           114
+#define VMEXIT_RSM             115
+#define VMEXIT_IRET            116
+#define VMEXIT_SWINT           117
+#define VMEXIT_INVD            118
+#define VMEXIT_PAUSE           119
+#define VMEXIT_HLT             120
+#define VMEXIT_INVLPG          121
+#define VMEXIT_INVLPGA         122
+#define VMEXIT_IOIO            123
+#define VMEXIT_MSR             124
+#define VMEXIT_TASK_SWITCH     125
+#define VMEXIT_FERR_FREEZE     126
+#define VMEXIT_SHUTDOWN        127
+#define VMEXIT_VMRUN           128
+#define VMEXIT_VMMCALL         129
+#define VMEXIT_VMLOAD          130
+#define VMEXIT_VMSAVE          131
+#define VMEXIT_STGI            132
+#define VMEXIT_CLGI            133
+#define VMEXIT_SKINIT          134
+#define VMEXIT_RDTSCP          135
+#define VMEXIT_ICEBP           136
+#define VMEXIT_NPF             1024
+#define VMEXIT_INVALID         -1
+
+#endif  /* ndef _VMM_SVM_H_ */
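
These exit codes mirror Appendix C of the AMD APM and are what an eventual #VMEXIT loop would dispatch on. Purely illustrative, since svm_vmrun() is still a stub:

    /* sketch of a future #VMEXIT dispatch, not part of this commit */
    switch (vmcb->vmcb_exitcode) {
    case VMEXIT_MSR:
            /* vmcb_exitinfo1 distinguishes RDMSR (0) from WRMSR (1) */
            break;
    case VMEXIT_NPF:
            /* nested page fault; the faulting GPA is in vmcb_exitinfo2 */
            break;
    case VMEXIT_INVALID:
            /* malformed VMCB; bail out */
            break;
    }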
diff --git a/sys/platform/pc64/vmm/vmm.c b/sys/platform/pc64/vmm/vmm.c
new file mode 100644 (file)
index 0000000..b9044a0
--- /dev/null
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Mihai Carabas <mihai.carabas@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/eventhandler.h>
+#include <sys/proc.h>
+#include <sys/vkernel.h>
+
+#include <machine/vmm.h>
+#include <machine/cputypes.h>
+#include <machine/md_var.h>
+
+#include "vmm.h"
+
+static struct vmm_ctl *ctl = NULL;
+
+struct sysctl_ctx_list vmm_sysctl_ctx;
+struct sysctl_oid *vmm_sysctl_tree;
+
+int vmm_enabled;
+
+static int
+sysctl_vmm_enable(SYSCTL_HANDLER_ARGS)
+{
+       int error, new_val;
+
+       new_val = vmm_enabled;
+
+       error = sysctl_handle_int(oidp, &new_val, 0, req);
+       if (error != 0 || req->newptr == NULL)
+               return (error);
+
+       if (new_val != 0 && new_val != 1)
+               return (EINVAL);
+
+       if (vmm_enabled != new_val) {
+               if (new_val == 1) {
+                       if (ctl->enable()) {
+                               kprintf("VMM: vmm enable() failed\n");
+                               return (EINVAL);
+                       }
+               } else if (new_val == 0) {
+                       if (ctl->disable()) {
+                               kprintf("VMM: vmm disable() failed\n");
+                               return (EINVAL);
+                       }
+               }
+       } else {
+               return (EINVAL);
+       }
+
+       vmm_enabled = new_val;
+
+       return (0);
+}
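+
+/*
+ * Once vmm_init() below registers the handler, the knob is driven from
+ * userland, e.g. "sysctl hw.vmm.enable=1" to bring the VMM up and
+ * "sysctl hw.vmm.enable=0" to tear it down; writing the value that is
+ * already set fails with EINVAL.
+ */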
+
+static void
+vmm_shutdown(void)
+{
+       if (vmm_enabled)
+               ctl->disable();
+}
+
+static void
+vmm_init(void)
+{
+       sysctl_ctx_init(&vmm_sysctl_ctx);
+       vmm_sysctl_tree = SYSCTL_ADD_NODE(&vmm_sysctl_ctx,
+           SYSCTL_STATIC_CHILDREN(_hw),
+           OID_AUTO, "vmm",
+           CTLFLAG_RD, 0, "VMM options");
+
+       if (cpu_vendor_id == CPU_VENDOR_INTEL) {
+               ctl = get_ctl_intel();
+       } else if (cpu_vendor_id == CPU_VENDOR_AMD) {
+               ctl = get_ctl_amd();
+       } else {
+               kprintf("VMM: no VMM support for this CPU vendor\n");
+               return;
+       }
+
+       if (ctl->init()) {
+               SYSCTL_ADD_STRING(&vmm_sysctl_ctx,
+                   SYSCTL_CHILDREN(vmm_sysctl_tree),
+                   OID_AUTO, "enable", CTLFLAG_RD,
+                   "NOT SUPPORTED", 0,
+                   "enable not supported");
+       } else {
+               SYSCTL_ADD_STRING(&vmm_sysctl_ctx,
+                   SYSCTL_CHILDREN(vmm_sysctl_tree),
+                   OID_AUTO, "type", CTLFLAG_RD,
+                   ctl->name, 0,
+                   "Type of the VMM");
+               SYSCTL_ADD_PROC(&vmm_sysctl_ctx,
+                   SYSCTL_CHILDREN(vmm_sysctl_tree),
+                   OID_AUTO, "enable", CTLTYPE_INT | CTLFLAG_WR,
+                   NULL, sizeof vmm_enabled, sysctl_vmm_enable, "I",
+                   "Control the state of the VMM");
+
+               if (ctl->enable()) {
+                       kprintf("VMM: vmm enable() failed\n");
+               } else {
+                       vmm_enabled = 1;
+               }
+
+               EVENTHANDLER_REGISTER(shutdown_pre_sync, vmm_shutdown, NULL,
+                   SHUTDOWN_PRI_DEFAULT - 1);
+       }
+}
+SYSINIT(vmm_init, SI_BOOT2_CPU_TOPOLOGY, SI_ORDER_ANY, vmm_init, NULL);
+
+
+int
+vmm_vminit(struct guest_options *options)
+{
+       if (!vmm_enabled) {
+               return ENODEV;
+       }
+
+       return ctl->vminit(options);
+}
+
+int
+vmm_vmdestroy(void)
+{
+       if (!vmm_enabled) {
+               return ENODEV;
+       }
+
+       return ctl->vmdestroy();
+}
+
+int
+vmm_vmrun(void)
+{
+       if (!vmm_enabled) {
+               return ENODEV;
+       }
+       return ctl->vmrun();
+}
+
+int
+vmm_vm_set_tls_area(void)
+{
+       if (!vmm_enabled) {
+               return ENODEV;
+       }
+       return ctl->vm_set_tls_area();
+}
+
+void
+vmm_vm_set_guest_cr3(register_t guest_cr3)
+{
+       ctl->vm_set_guest_cr3(guest_cr3);
+}
+
+void
+vmm_lwp_return(struct lwp *lp, struct trapframe *frame)
+{
+       ctl->vm_lwp_return(lp, frame);
+}
+
+int
+vmm_vm_get_gpa(struct proc *p, register_t *gpa, register_t uaddr)
+{
+       return ctl->vm_get_gpa(p, gpa, uaddr);
+}
similarity index 53%
copy from sys/platform/vkernel64/include/pmap_inval.h
copy to sys/platform/pc64/vmm/vmm.h
index f99fe3f..97092d0 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
- * by Matthew Dillon <dillon@backplane.com>
+ * by Mihai Carabas <mihai.carabas@gmail.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- * $DragonFly: src/sys/platform/vkernel/include/pmap_inval.h,v 1.3 2007/07/02 02:22:57 dillon Exp $
  */
 
-#ifndef _MACHINE_PMAP_INVAL_H_
-#define        _MACHINE_PMAP_INVAL_H_
-
-#ifndef _SYS_THREAD_H_
-#include <sys/thread.h>
-#endif
+#ifndef _VMM_VMM_H_
+#define _VMM_VMM_H_
 
-typedef struct pmap_inval_info {
-    int                        pir_flags;
-    struct lwkt_cpusync        pir_cpusync;
-} pmap_inval_info;
+#define MAX_NAME_LEN 256
 
-typedef pmap_inval_info *pmap_inval_info_t;
+#include <sys/param.h>
+#include <sys/vmm.h>
 
-#define PIRF_INVLTLB   0x0001  /* request invalidation of whole table */
-#define PIRF_INVL1PG   0x0002  /* else request invalidation of one page */
-#define PIRF_CPUSYNC   0x0004  /* cpusync is currently active */
+#define ERROR_IF(func)                                 \
+       do {                                            \
+               if ((err = (func))) {                   \
+                       kprintf("VMM: %s error at line: %d\n", \
+                           __func__, __LINE__);        \
+                       goto error;                     \
+               }                                       \
+       } while (0)
 
-#ifdef _KERNEL
+#define ERROR2_IF(func)                                        \
+       do {                                            \
+               if ((err = (func))) {                   \
+                       kprintf("VMM: %s error at line: %d\n", \
+                           __func__, __LINE__);        \
+                       goto error2;                    \
+               }                                       \
+       } while (0)
 
-#ifndef _MACHINE_PMAP_H_
-#include <machine/pmap.h>
+#ifdef VMM_DEBUG
+#define dkprintf(fmt, args...)         kprintf(fmt, ##args)
+#else
+#define dkprintf(fmt, args...)
 #endif
 
-void pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
+#define INSTRUCTION_MAX_LENGTH         15
 
-#endif
+struct vmm_ctl {
+       char name[MAX_NAME_LEN];
+       int (*init)(void);
+       int (*enable)(void);
+       int (*disable)(void);
+       int (*vminit)(struct guest_options *);
+       int (*vmdestroy)(void);
+       int (*vmrun)(void);
+       int (*vm_set_tls_area)(void);
+       void (*vm_lwp_return)(struct lwp *lp, struct trapframe *frame);
+       void (*vm_set_guest_cr3)(register_t);
+       int (*vm_get_gpa)(struct proc *, register_t *, register_t);
+};
+
+struct vmm_proc {
+       uint64_t        guest_cr3;
+       uint64_t        vmm_cr3;
+};
+
+struct vmm_ctl *get_ctl_intel(void);
+struct vmm_ctl *get_ctl_amd(void);
 
 #endif
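
ERROR_IF()/ERROR2_IF() assume the caller declares a local int err and provides an error:/error2: label; vmx.c uses exactly this shape throughout. A minimal sketch of the calling convention (the function name is illustrative):

    static int
    example_vmcs_setup(void)
    {
            int err;

            ERROR_IF(vmwrite(VMCS_HOST_CR0, rcr0()));
            ERROR_IF(vmwrite(VMCS_HOST_CR4, rcr4()));
            return 0;
    error:
            return err;
    }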
diff --git a/sys/platform/pc64/vmm/vmm_utils.c b/sys/platform/pc64/vmm/vmm_utils.c
new file mode 100644 (file)
index 0000000..9e65f29
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Mihai Carabas <mihai.carabas@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <cpu/lwbuf.h>
+#include <vm/vm_page.h>
+#include <vm/vm_extern.h>
+
+#include "vmm_utils.h"
+
+int
+instr_check(struct instr_decode *instr, void *ip, uint8_t instr_length)
+{
+       uint8_t i;
+       uint8_t *instr_ip;
+       uint8_t *instr_opcode;
+
+       instr_ip = (uint8_t *) ip;
+       instr_opcode = (uint8_t *) &instr->opcode;
+
+       /*  Skip REX prefix if present */
+       if (*instr_ip >= 0x40 && *instr_ip <= 0x4F) {
+               instr_ip++;
+               instr_length--;
+       }
+
+       for (i = 0; i < instr->opcode_bytes; i++) {
+               if (i < instr_length) {
+                       if (instr_ip[i] != instr_opcode[i]) {
+                               return -1;
+                       }
+               } else {
+                       return -1;
+               }
+       }
+       return 0;
+}
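+
+/*
+ * Example: vmx.c declares the two-byte SYSCALL pattern and runs it
+ * against the faulting guest instruction:
+ *
+ *     struct instr_decode syscall_asm = {
+ *             .opcode_bytes = 2,
+ *             .opcode.byte1 = 0x0F,
+ *             .opcode.byte2 = 0x05,
+ *     };
+ *
+ *     if (instr_check(&syscall_asm, ip, instr_length) == 0)
+ *             ... the bytes at ip are a SYSCALL, emulate it ...
+ */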
+
+static int
+get_pt_entry(struct vmspace *vm, pt_entry_t *pte, vm_offset_t addr, int index)
+{
+       struct lwbuf *lwb;
+       struct lwbuf lwb_cache;
+       pt_entry_t *pt;
+       int err = 0;
+       vm_page_t m;
+
+       m = vm_fault_page(&vm->vm_map, trunc_page(addr),
+                         VM_PROT_READ, VM_FAULT_NORMAL, &err);
+       if (err) {
+               kprintf("%s: could not get addr %llx\n",
+                   __func__, (unsigned long long)addr);
+               goto error;
+       }
+       lwb = lwbuf_alloc(m, &lwb_cache);
+       pt = (pt_entry_t *)(lwbuf_kva(lwb) + ((vm_offset_t)addr & PAGE_MASK));
+
+       *pte = pt[index];
+       lwbuf_free(lwb);
+       vm_page_unhold(m);
+error:
+       return err;
+}
+
+int
+guest_phys_addr(struct vmspace *vm, register_t *gpa, register_t guest_cr3, vm_offset_t uaddr)
+{
+       pt_entry_t pml4e;
+       pt_entry_t pdpe;
+       pt_entry_t pde;
+       pt_entry_t pte;
+       int err = 0;
+
+       err = get_pt_entry(vm, &pml4e, guest_cr3, uaddr >> PML4SHIFT);
+       if (err) {
+               kprintf("%s: could not get pml4e\n", __func__);
+               goto error;
+       }
+       if (pml4e & kernel_pmap.pmap_bits[PG_V_IDX]) {
+               err = get_pt_entry(vm, &pdpe, pml4e & PG_FRAME, (uaddr & PML4MASK) >> PDPSHIFT);
+               if (err) {
+                       kprintf("%s: could not get pdpe\n", __func__);
+                       goto error;
+               }
+               if (pdpe & kernel_pmap.pmap_bits[PG_V_IDX]) {
+                       if (pdpe & kernel_pmap.pmap_bits[PG_PS_IDX]) {
+                               *gpa = (pdpe & PG_FRAME) | (uaddr & PDPMASK);
+                               goto out;
+                       } else {
+                               err = get_pt_entry(vm, &pde, pdpe & PG_FRAME,
+                                   (uaddr & PDPMASK) >> PDRSHIFT);
+                               if (err) {
+                                       kprintf("%s: could not get pde\n", __func__);
+                                       goto error;
+                               }
+                               if (pde & kernel_pmap.pmap_bits[PG_V_IDX]) {
+                                       if (pde & kernel_pmap.pmap_bits[PG_PS_IDX]) {
+                                               *gpa = (pde & PG_FRAME) | (uaddr & PDRMASK);
+                                               goto out;
+                                       } else {
+                                               err = get_pt_entry(vm, &pte, pde & PG_FRAME, (uaddr & PDRMASK) >> PAGE_SHIFT);
+                                               if (err) {
+                                                       kprintf("%s: could not get pte\n", __func__);
+                                                       goto error;
+                                               }
+                                               if (pte & kernel_pmap.pmap_bits[PG_V_IDX]) {
+                                                       *gpa = (pte & PG_FRAME) | (uaddr & PAGE_MASK);
+                                               } else {
+                                                       kprintf("%s: pte not valid\n", __func__);
+                                                       err = EFAULT;
+                                                       goto error;
+                                               }
+                                       }
+                               } else {
+                                       kprintf("%s: pde not valid\n", __func__);
+                                       err = EFAULT;
+                                       goto error;
+                               }
+                       }
+               } else {
+                       kprintf("%s: pdpe not valid\n", __func__);
+                       err = EFAULT;
+                       goto error;
+               }
+       } else {
+               kprintf("%s: pml4e not valid\n", __func__);
+               err = EFAULT;
+               goto error;
+       }
+out:
+error:
+       return err;
+}
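
The walk above decomposes the guest virtual address exactly as the hardware would on x86-64; for a canonical user address mapped with 4KB pages the indices and the final translation work out as:

    pml4_index = uaddr >> PML4SHIFT;                /* bits 47:39 */
    pdp_index  = (uaddr & PML4MASK) >> PDPSHIFT;    /* bits 38:30 */
    pd_index   = (uaddr & PDPMASK) >> PDRSHIFT;     /* bits 29:21 */
    pt_index   = (uaddr & PDRMASK) >> PAGE_SHIFT;   /* bits 20:12 */
    gpa        = (pte & PG_FRAME) | (uaddr & PAGE_MASK);

with the 1GB and 2MB PG_PS cases short-circuiting at the pdpe and pde levels as shown.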
similarity index 53%
copy from sys/platform/vkernel64/include/pmap_inval.h
copy to sys/platform/pc64/vmm/vmm_utils.h
index f99fe3f..5bfa320 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
- * by Matthew Dillon <dillon@backplane.com>
+ * by Mihai Carabas <mihai.carabas@gmail.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- * $DragonFly: src/sys/platform/vkernel/include/pmap_inval.h,v 1.3 2007/07/02 02:22:57 dillon Exp $
  */
 
-#ifndef _MACHINE_PMAP_INVAL_H_
-#define        _MACHINE_PMAP_INVAL_H_
-
-#ifndef _SYS_THREAD_H_
-#include <sys/thread.h>
-#endif
-
-typedef struct pmap_inval_info {
-    int                        pir_flags;
-    struct lwkt_cpusync        pir_cpusync;
-} pmap_inval_info;
-
-typedef pmap_inval_info *pmap_inval_info_t;
-
-#define PIRF_INVLTLB   0x0001  /* request invalidation of whole table */
-#define PIRF_INVL1PG   0x0002  /* else request invalidation of one page */
-#define PIRF_CPUSYNC   0x0004  /* cpusync is currently active */
+#ifndef _VMM_VMM_UTILS_H_
+#define _VMM_VMM_UTILS_H_
 
-#ifdef _KERNEL
-
-#ifndef _MACHINE_PMAP_H_
-#include <machine/pmap.h>
-#endif
-
-void pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-void pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-vpte_t pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va);
-
-#endif
+struct instr_decode {
+       uint8_t opcode_bytes;
+       struct {
+               uint8_t byte1;
+               uint8_t byte2;
+               uint8_t byte3;
+       } opcode;
+};
 
+int instr_check(struct instr_decode *instr, void *ip, uint8_t instr_length);
+int guest_phys_addr(struct vmspace *vm, register_t *gpa, register_t guest_cr3, vm_offset_t uaddr);
 #endif
diff --git a/sys/platform/pc64/vmm/vmx.c b/sys/platform/pc64/vmm/vmx.c
new file mode 100644 (file)
index 0000000..1bc9814
--- /dev/null
@@ -0,0 +1,1569 @@
+/*
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Mihai Carabas <mihai.carabas@gmail.com>
+ * by Matthew Dillon <dillon@backplane.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/malloc.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/thread.h>
+#include <sys/thread2.h>
+#include <sys/sysctl.h>
+#include <sys/vmm.h>
+#include <sys/proc.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <sys/vkernel.h>
+#include <sys/mplock2.h>
+#include <ddb/ddb.h>
+
+#include <cpu/cpu.h>
+
+#include <machine/cpufunc.h>
+#include <machine/cputypes.h>
+#include <machine/smp.h>
+#include <machine/globaldata.h>
+#include <machine/trap.h>
+#include <machine/pmap.h>
+#include <machine/md_var.h>
+
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_param.h>
+
+#include "vmm.h"
+#include "vmm_utils.h"
+
+#include "vmx.h"
+#include "vmx_instr.h"
+#include "vmx_vmcs.h"
+
+#include "ept.h"
+
+extern void trap(struct trapframe *frame);
+
+static int vmx_check_cpu_migration(void);
+static int execute_vmptrld(struct vmx_thread_info *vti);
+
+struct instr_decode syscall_asm = {
+       .opcode_bytes = 2,
+       .opcode.byte1 = 0x0F,
+       .opcode.byte2 = 0x05,
+};
+
+struct vmx_ctl_info vmx_pinbased = {
+       .msr_addr = IA32_VMX_PINBASED_CTLS,
+       .msr_true_addr = IA32_VMX_TRUE_PINBASED_CTLS,
+};
+
+struct vmx_ctl_info vmx_procbased = {
+       .msr_addr = IA32_VMX_PROCBASED_CTLS,
+       .msr_true_addr = IA32_VMX_TRUE_PROCBASED_CTLS,
+};
+
+struct vmx_ctl_info vmx_procbased2 = {
+       .msr_addr = IA32_VMX_PROCBASED_CTLS2,
+       .msr_true_addr = IA32_VMX_PROCBASED_CTLS2,
+};
+
+struct vmx_ctl_info vmx_exit = {
+       .msr_addr = IA32_VMX_EXIT_CTLS,
+       .msr_true_addr = IA32_VMX_TRUE_EXIT_CTLS,
+};
+
+struct vmx_ctl_info vmx_entry = {
+       .msr_addr = IA32_VMX_ENTRY_CTLS,
+       .msr_true_addr = IA32_VMX_TRUE_ENTRY_CTLS,
+};
+
+/* Declared in generic vmm.c - SYSCTL parent */
+extern struct sysctl_oid *vmm_sysctl_tree;
+
+/* SYSCTL tree and context */
+static struct sysctl_oid *vmx_sysctl_tree;
+static struct sysctl_ctx_list vmx_sysctl_ctx;
+
+/* Per cpu info */
+struct vmx_pcpu_info *pcpu_info;
+
+/* VMX BASIC INFO */
+uint32_t vmx_revision;
+uint32_t vmx_region_size;
+uint8_t vmx_width_addr;
+
+/* IA32_VMX_EPT_VPID_CAP */
+uint64_t vmx_ept_vpid_cap;
+
+/* VMX fixed bits */
+uint64_t cr0_fixed_to_0;
+uint64_t cr4_fixed_to_0;
+uint64_t cr0_fixed_to_1;
+uint64_t cr4_fixed_to_1;
+
+/* VMX status */
+static uint8_t vmx_enabled = 0;
+static uint8_t vmx_initialized = 0;
+
+/*
+ * VMX set control setting
+ *
+ * Intel System Programming Guide, Part 3, Order Number 326019,
+ * 31.5.1 "Algorithms for Determining VMX Capabilities"; this
+ * implements Algorithm 3.
+ */
+static int
+vmx_set_ctl_setting(struct vmx_ctl_info *vmx_ctl, uint32_t bit_no,
+    setting_t value)
+{
+       uint64_t vmx_basic;
+       uint64_t ctl_val;
+
+       /* Check whether it's branch b. or c. */
+       vmx_basic = rdmsr(IA32_VMX_BASIC);
+       if (IS_TRUE_CTL_AVAIL(vmx_basic))
+               ctl_val = rdmsr(vmx_ctl->msr_true_addr);
+       else
+               ctl_val = rdmsr(vmx_ctl->msr_addr);
+
+       /* Check if the value is known by VMM or set on DEFAULT */
+       switch (value) {
+               case DEFAULT:
+                       /*
+                        * Both settings are allowed
+                        * - step b.iii)
+                        *   or
+                        * - c.iii), c.iv)
+                        */
+                       if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)
+                           && IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) {
+
+                               /* For c.iii) and c.iv) */
+                               if (IS_TRUE_CTL_AVAIL(vmx_basic))
+                                       ctl_val = rdmsr(vmx_ctl->msr_addr);
+
+                               if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no))
+                                       vmx_ctl->ctls &= ~BIT(bit_no);
+                               else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no))
+                                       vmx_ctl->ctls |= BIT(bit_no);
+
+                       } else if (IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no)) {
+                               /* b.i), c.i) */
+                               vmx_ctl->ctls &= ~BIT(bit_no);
+
+                       } else if (IS_ONE_SETTING_ALLOWED(ctl_val, bit_no)) {
+                               /* b.i), c.i) */
+                               vmx_ctl->ctls |= BIT(bit_no);
+
+                       } else {
+                               return (EINVAL);
+                       }
+                       break;
+               case ZERO:
+                       /* For b.ii) or c.ii) */
+                       if (!IS_ZERO_SETTING_ALLOWED(ctl_val, bit_no))
+                               return (EINVAL);
+
+                       vmx_ctl->ctls &= ~BIT(bit_no);
+
+                       break;
+               case ONE:
+                       /* For b.ii) or c.ii) */
+                       if (!IS_ONE_SETTING_ALLOWED(ctl_val, bit_no))
+                               return (EINVAL);
+
+                       vmx_ctl->ctls |= BIT(bit_no);
+
+                       break;
+       }
+       return 0;
+}
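+
+/*
+ * The IS_*_SETTING_ALLOWED() predicates (defined in vmx.h, not shown in
+ * this hunk) encode the Intel SDM A.3 convention for the control MSRs:
+ * the low 32 bits are the allowed-0 settings (a control bit whose MSR
+ * bit is set there must be 1) and the high 32 bits are the allowed-1
+ * settings (a control bit whose MSR bit is clear there must be 0), i.e.
+ * roughly:
+ *
+ *     IS_ZERO_SETTING_ALLOWED(msr, bit): bit clear in the low dword
+ *     IS_ONE_SETTING_ALLOWED(msr, bit):  bit set in the high dword
+ */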
+
+static void
+vmx_set_default_settings(struct vmx_ctl_info *vmx_ctl)
+{
+       int i;
+
+       for (i = 0; i < 32; i++) {
+               vmx_set_ctl_setting(vmx_ctl, i, DEFAULT);
+       }
+}
+
+static void
+alloc_vmxon_regions(void)
+{
+       int cpu;
+
+       pcpu_info = kmalloc(ncpus * sizeof(struct vmx_pcpu_info),
+           M_TEMP, M_WAITOK | M_ZERO);
+
+       for (cpu = 0; cpu < ncpus; cpu++) {
+               /* The address must be aligned to 4K - alloc extra */
+               pcpu_info[cpu].vmxon_region_na = kmalloc(vmx_region_size + VMXON_REGION_ALIGN_SIZE,
+                   M_TEMP,
+                   M_WAITOK | M_ZERO);
+
+               /* Align address */
+               pcpu_info[cpu].vmxon_region = (unsigned char*) VMXON_REGION_ALIGN(pcpu_info[cpu].vmxon_region_na);
+
+               /* Put the VMX revision in the first 31 bits */
+               *((uint32_t *) pcpu_info[cpu].vmxon_region) = vmx_revision;
+       }
+}
+
+static void
+free_vmxon_regions(void)
+{
+       int i;
+
+       for (i = 0; i < ncpus; i++) {
+               pcpu_info[i].vmxon_region = NULL;
+
+               kfree(pcpu_info[i].vmxon_region_na, M_TEMP);
+       }
+
+       kfree(pcpu_info, M_TEMP);
+}
+
+static void
+build_vmx_sysctl(void)
+{
+       sysctl_ctx_init(&vmx_sysctl_ctx);
+       vmx_sysctl_tree = SYSCTL_ADD_NODE(&vmx_sysctl_ctx,
+                   SYSCTL_CHILDREN(vmm_sysctl_tree),
+                   OID_AUTO, "vmx",
+                   CTLFLAG_RD, 0, "VMX options");
+
+       SYSCTL_ADD_INT(&vmx_sysctl_ctx,
+           SYSCTL_CHILDREN(vmx_sysctl_tree),
+           OID_AUTO, "revision", CTLFLAG_RD,
+           &vmx_revision, 0,
+           "VMX revision");
+       SYSCTL_ADD_INT(&vmx_sysctl_ctx,
+           SYSCTL_CHILDREN(vmx_sysctl_tree),
+           OID_AUTO, "region_size", CTLFLAG_RD,
+           &vmx_region_size, 0,
+           "VMX region size");
+       SYSCTL_ADD_INT(&vmx_sysctl_ctx,
+           SYSCTL_CHILDREN(vmx_sysctl_tree),
+           OID_AUTO, "width_addr", CTLFLAG_RD,
+           &vmx_width_addr, 0,
+           "VMX width address");
+       SYSCTL_ADD_INT(&vmx_sysctl_ctx,
+           SYSCTL_CHILDREN(vmx_sysctl_tree),
+           OID_AUTO, "pinbased_ctls", CTLFLAG_RD,
+           &vmx_pinbased.ctls, 0,
+           "VMX pin-based controls");
+       SYSCTL_ADD_INT(&vmx_sysctl_ctx,
+           SYSCTL_CHILDREN(vmx_sysctl_tree),
+           OID_AUTO, "procbased_ctls", CTLFLAG_RD,
+           &vmx_procbased.ctls, 0,
+           "VMX primary processor-based controls");
+       SYSCTL_ADD_INT(&vmx_sysctl_ctx,
+           SYSCTL_CHILDREN(vmx_sysctl_tree),
+           OID_AUTO, "procbased2_ctls", CTLFLAG_RD,
+           &vmx_procbased2.ctls, 0,
+           "VMX secondary processor-based controls");
+       SYSCTL_ADD_INT(&vmx_sysctl_ctx,
+           SYSCTL_CHILDREN(vmx_sysctl_tree),
+           OID_AUTO, "vmexit_ctls", CTLFLAG_RD,
+           &vmx_exit.ctls, 0,
+           "VMX VM-exit controls");
+       SYSCTL_ADD_INT(&vmx_sysctl_ctx,
+           SYSCTL_CHILDREN(vmx_sysctl_tree),
+           OID_AUTO, "vmentry_ctls", CTLFLAG_RD,
+           &vmx_entry.ctls, 0,
+           "VMX VM-entry controls");
+       SYSCTL_ADD_INT(&vmx_sysctl_ctx,
+           SYSCTL_CHILDREN(vmx_sysctl_tree),
+           OID_AUTO, "ept_vpid_cap", CTLFLAG_RD,
+           &vmx_ept_vpid_cap, 0,
+           "VMX EPT VPID CAP");
+}
+
+static int
+vmx_init(void)
+{
+       uint64_t feature_control;
+       uint64_t vmx_basic_value;
+       uint64_t cr0_fixed_bits_to_1;
+       uint64_t cr0_fixed_bits_to_0;
+       uint64_t cr4_fixed_bits_to_0;
+       uint64_t cr4_fixed_bits_to_1;
+
+       int err;
+
+       /*
+        * The ability of a processor to support VMX operation
+        * and related instructions is indicated by:
+        * CPUID.1:ECX.VMX[bit 5] = 1
+        */
+       if (!(cpu_feature2 & CPUID2_VMX)) {
+               kprintf("VMM: VMX is not supported by this Intel CPU\n");
+               return (ENODEV);
+       }
+
+       vmx_set_default_settings(&vmx_pinbased);
+       vmx_set_default_settings(&vmx_procbased);
+       vmx_set_default_settings(&vmx_procbased2);
+       vmx_set_default_settings(&vmx_exit);
+       vmx_set_default_settings(&vmx_entry);
+
+       /* Enable external interrupts exiting */
+       err = vmx_set_ctl_setting(&vmx_pinbased,
+           PINBASED_EXTERNAL_INTERRUPT_EXITING,
+           ONE);
+       if (err) {
+               kprintf("VMM: PINBASED_EXTERNAL_INTERRUPT_EXITING not supported by this CPU\n");
+               return (ENODEV);
+       }
+
+       /* Enable non-maskable interrupts exiting */
+       err = vmx_set_ctl_setting(&vmx_pinbased,
+           PINBASED_NMI_EXITING,
+           ONE);
+       if (err) {
+               kprintf("VMM: PINBASED_NMI_EXITING not supported by this CPU\n");
+               return (ENODEV);
+       }
+
+       /* Enable second level for procbased */
+       err = vmx_set_ctl_setting(&vmx_procbased,
+           PROCBASED_ACTIVATE_SECONDARY_CONTROLS,
+           ONE);
+       if (err) {
+               kprintf("VMM: PROCBASED_ACTIVATE_SECONDARY_CONTROLS not supported by this CPU\n");
+               return (ENODEV);
+       }
+
+       /* Set 64-bit mode for the guest */
+       err = vmx_set_ctl_setting(&vmx_entry,
+           VMENTRY_IA32e_MODE_GUEST,
+           ONE);
+       if (err) {
+               kprintf("VMM: VMENTRY_IA32e_MODE_GUEST not supported by this CPU\n");
+               return (ENODEV);
+       }
+
+       /* Load MSR EFER on entry */
+       err = vmx_set_ctl_setting(&vmx_entry,
+           VMENTRY_LOAD_IA32_EFER,
+           ONE);
+       if (err) {
+               kprintf("VMM: VMENTRY_LOAD_IA32_EFER not supported by this CPU\n");
+               return (ENODEV);
+       }
+
+       /* Set 64-bit mode for the host */
+       err = vmx_set_ctl_setting(&vmx_exit,
+           VMEXIT_HOST_ADDRESS_SPACE_SIZE,
+           ONE);
+       if (err) {
+               kprintf("VMM: VMEXIT_HOST_ADDRESS_SPACE_SIZE not supported by this CPU\n");
+               return (ENODEV);
+       }
+
+       /* Save EFER on exit */
+       err = vmx_set_ctl_setting(&vmx_exit,
+           VMEXIT_SAVE_IA32_EFER,
+           ONE);
+       if (err) {
+               kprintf("VMM: VMEXIT_SAVE_IA32_EFER not supported by this CPU\n");
+               return (ENODEV);
+       }
+
+       /* Load EFER on exit */
+       err = vmx_set_ctl_setting(&vmx_exit,
+           VMEXIT_LOAD_IA32_EFER,
+           ONE);
+       if (err) {
+               kprintf("VMM: VMEXIT_LOAD_IA32_EFER not supported by this CPU\n");
+               return (ENODEV);
+       }
+
+       /* Enable EPT feature */
+       err = vmx_set_ctl_setting(&vmx_procbased2,
+           PROCBASED2_ENABLE_EPT,
+           ONE);
+       if (err) {
+               kprintf("VMM: PROCBASED2_ENABLE_EPT not supported by this CPU\n");
+               return (ENODEV);
+       }
+
+       if (vmx_ept_init()) {
+               kprintf("VMM: vmx_ept_init failed\n");
+               return (ENODEV);
+       }
+#if 0
+       /* XXX - to implement in the future */
+       /* Enable VPID feature */
+       err = vmx_set_ctl_setting(&vmx_procbased2,
+           PROCBASED2_ENABLE_VPID,
+           ONE);
+       if (err) {
+               kprintf("VMM: PROCBASED2_ENABLE_VPID not supported by this CPU\n");
+               return (ENODEV);
+       }
+#endif
+
+       /* Check for the feature control status */
+       feature_control = rdmsr(IA32_FEATURE_CONTROL);
+       if (!(feature_control & BIT(FEATURE_CONTROL_LOCKED))) {
+               kprintf("VMM: IA32_FEATURE_CONTROL is not locked\n");
+               return (EINVAL);
+       }
+       if (!(feature_control & BIT(FEATURE_CONTROL_VMX_BIOS_ENABLED))) {
+               kprintf("VMM: VMX is disable by the BIOS\n");
+               return (EINVAL);
+       }
+
+       vmx_basic_value = rdmsr(IA32_VMX_BASIC);
+       vmx_width_addr = (uint8_t) VMX_WIDTH_ADDR(vmx_basic_value);
+       vmx_region_size = (uint32_t) VMX_REGION_SIZE(vmx_basic_value);
+       vmx_revision = (uint32_t) VMX_REVISION(vmx_basic_value);
+
+       /* A.7 VMX-FIXED BITS IN CR0 */
+       cr0_fixed_bits_to_1 = rdmsr(IA32_VMX_CR0_FIXED0);
+       cr0_fixed_bits_to_0 = rdmsr(IA32_VMX_CR0_FIXED1);
+       cr0_fixed_to_1 = cr0_fixed_bits_to_1 & cr0_fixed_bits_to_0;
+       cr0_fixed_to_0 = ~cr0_fixed_bits_to_1 & ~cr0_fixed_bits_to_0;
+
+       /* A.8 VMX-FIXED BITS IN CR4 */
+       cr4_fixed_bits_to_1 = rdmsr(IA32_VMX_CR4_FIXED0);
+       cr4_fixed_bits_to_0 = rdmsr(IA32_VMX_CR4_FIXED1);
+       cr4_fixed_to_1 = cr4_fixed_bits_to_1 & cr4_fixed_bits_to_0;
+       cr4_fixed_to_0 = ~cr4_fixed_bits_to_1 & ~cr4_fixed_bits_to_0;
+
+       build_vmx_sysctl();
+
+       vmx_initialized = 1;
+       return 0;
+}
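+
+/*
+ * Note on the FIXED0/FIXED1 algebra above (Intel SDM A.7/A.8): a CR bit
+ * set in both MSRs must be 1 while in VMX operation, a bit clear in both
+ * must be 0, and anything else is left to the host's choice.  For
+ * example, a typical IA32_VMX_CR0_FIXED0 value of 0x80000021 forces
+ * CR0.PE, CR0.NE and CR0.PG on; execute_vmxon() below applies these
+ * masks before executing vmxon.
+ */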
+
+static void
+execute_vmxon(void *perr)
+{
+       unsigned char *vmxon_region;
+       int *err = (int*) perr;
+
+       /* A.7 VMX-FIXED BITS IN CR0 */
+       load_cr0((rcr0() | cr0_fixed_to_1) & ~cr0_fixed_to_0);
+
+       /* A.8 VMX-FIXED BITS IN CR4 */
+       load_cr4((rcr4() | cr4_fixed_to_1) & ~cr4_fixed_to_0);
+
+       /* Enable VMX */
+       load_cr4(rcr4() | CR4_VMXE);
+
+       vmxon_region = pcpu_info[mycpuid].vmxon_region;
+       *err = vmxon(vmxon_region);
+       if (*err) {
+               kprintf("VMM: vmxon failed on cpu%d\n", mycpuid);
+       }
+}
+
+static void
+execute_vmxoff(void *dummy)
+{
+       invept_desc_t desc = { 0 };
+
+       if (invept(INVEPT_TYPE_ALL_CONTEXTS, (uint64_t *)&desc))
+               kprintf("VMM: execute_vmxoff: invept failed on cpu%d\n",
+                   mycpu->gd_cpuid);
+
+       vmxoff();
+
+       /* Disable VMX */
+       load_cr4(rcr4() & ~CR4_VMXE);
+}
+
+static void
+execute_vmclear(void *data)
+{
+       struct vmx_thread_info *vti = data;
+       int err;
+       globaldata_t gd = mycpu;
+
+       if (pcpu_info[gd->gd_cpuid].loaded_vmx == vti) {
+               /*
+                * Must set vti->launched to zero after vmclear'ing to
+                * force a vmlaunch the next time.
+                */
+               pcpu_info[gd->gd_cpuid].loaded_vmx = NULL;
+               vti->launched = 0;
+               ERROR_IF(vmclear(vti->vmcs_region));
+       }
+error:
+       return;
+}
+
+static int
+execute_vmptrld(struct vmx_thread_info *vti)
+{
+       globaldata_t gd = mycpu;
+
+       /*
+        * Must vmclear the previously active VMCS if it is different.
+        */
+       if (pcpu_info[gd->gd_cpuid].loaded_vmx &&
+           pcpu_info[gd->gd_cpuid].loaded_vmx != vti)
+               execute_vmclear(pcpu_info[gd->gd_cpuid].loaded_vmx);
+
+       /*
+        * Make this the current VMCS.  Must set loaded_vmx field
+        * before calling vmptrld() to avoid races against cpusync.
+        *
+        * Must set vti->launched to zero after the vmptrld to force
+        * a vmlaunch.
+        */
+       if (pcpu_info[gd->gd_cpuid].loaded_vmx != vti) {
+               vti->launched = 0;
+               pcpu_info[gd->gd_cpuid].loaded_vmx = vti;
+               return (vmptrld(vti->vmcs_region));
+       } else {
+               return (0);
+       }
+}
+
+static int
+vmx_enable(void)
+{
+       int err;
+       int cpu;
+
+       if (!vmx_initialized) {
+               kprintf("VMM: vmx_enable - not allowed; vmx not initialized\n");
+               return (EINVAL);
+       }
+
+       if (vmx_enabled) {
+               kprintf("VMM: vmx_enable - already enabled\n");
+               return (EINVAL);
+       }
+
+       alloc_vmxon_regions();
+       for (cpu = 0; cpu < ncpus; cpu++) {
+               err = 0;
+               lwkt_cpusync_simple(CPUMASK(cpu), execute_vmxon, &err);
+               if (err) {
+                       kprintf("VMM: vmx_enable error %d on cpu%d\n", err, cpu);
+                       return err;
+               }
+       }
+       vmx_enabled = 1;
+       return 0;
+}
+
+static int
+vmx_disable(void)
+{
+       int cpu;
+
+       if (!vmx_enabled) {
+               kprintf("VMM: vmx_disable not allowed; vmx wasn't enabled\n");
+               return (EINVAL);
+       }
+
+       for (cpu = 0; cpu < ncpus; cpu++)
+               lwkt_cpusync_simple(CPUMASK(cpu), execute_vmxoff, NULL);
+
+       free_vmxon_regions();
+
+       vmx_enabled = 0;
+
+       return 0;
+}
+
+static int
+vmx_set_guest_descriptor(descriptor_t type,
+               uint16_t selector,
+               uint32_t rights,
+               uint64_t base,
+               uint32_t limit)
+{
+       int err;
+       int selector_enc;
+       int rights_enc;
+       int base_enc;
+       int limit_enc;
+
+       /*
+        * Intel Manual Vol 3C. - page 60
+        * If any bit in the limit field in the range 11:0 is 0, G must be 0.
+        * If any bit in the limit field in the range 31:20 is 1, G must be 1.
+        */
+       if ((~rights & VMCS_SEG_UNUSABLE) || (type == CS)) {
+               if ((limit & 0xfff) != 0xfff)
+                       rights &= ~VMCS_G;
+               else if ((limit & 0xfff00000) != 0)
+                       rights |= VMCS_G;
+       }
+
+       switch (type) {
+               case ES:
+                       selector_enc = VMCS_GUEST_ES_SELECTOR;
+                       rights_enc = VMCS_GUEST_ES_ACCESS_RIGHTS;
+                       base_enc = VMCS_GUEST_ES_BASE;
+                       limit_enc = VMCS_GUEST_ES_LIMIT;
+                       break;
+               case CS:
+                       selector_enc = VMCS_GUEST_CS_SELECTOR;
+                       rights_enc = VMCS_GUEST_CS_ACCESS_RIGHTS;
+                       base_enc = VMCS_GUEST_CS_BASE;
+                       limit_enc = VMCS_GUEST_CS_LIMIT;
+                       break;
+               case SS:
+                       selector_enc = VMCS_GUEST_SS_SELECTOR;
+                       rights_enc = VMCS_GUEST_SS_ACCESS_RIGHTS;
+                       base_enc = VMCS_GUEST_SS_BASE;
+                       limit_enc = VMCS_GUEST_SS_LIMIT;
+                       break;
+               case DS:
+                       selector_enc = VMCS_GUEST_DS_SELECTOR;
+                       rights_enc = VMCS_GUEST_DS_ACCESS_RIGHTS;
+                       base_enc = VMCS_GUEST_DS_BASE;
+                       limit_enc = VMCS_GUEST_DS_LIMIT;
+                       break;
+               case FS:
+                       selector_enc = VMCS_GUEST_FS_SELECTOR;
+                       rights_enc = VMCS_GUEST_FS_ACCESS_RIGHTS;
+                       base_enc = VMCS_GUEST_FS_BASE;
+                       limit_enc = VMCS_GUEST_FS_LIMIT;
+                       break;
+               case GS:
+                       selector_enc = VMCS_GUEST_GS_SELECTOR;
+                       rights_enc = VMCS_GUEST_GS_ACCESS_RIGHTS;
+                       base_enc = VMCS_GUEST_GS_BASE;
+                       limit_enc = VMCS_GUEST_GS_LIMIT;
+                       break;
+               case LDTR:
+                       selector_enc = VMCS_GUEST_LDTR_SELECTOR;
+                       rights_enc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
+                       base_enc = VMCS_GUEST_LDTR_BASE;
+                       limit_enc = VMCS_GUEST_LDTR_LIMIT;
+                       break;
+               case TR:
+                       selector_enc = VMCS_GUEST_TR_SELECTOR;
+                       rights_enc = VMCS_GUEST_TR_ACCESS_RIGHTS;
+                       base_enc = VMCS_GUEST_TR_BASE;
+                       limit_enc = VMCS_GUEST_TR_LIMIT;
+                       break;
+               default:
+                       kprintf("VMM: vmx_set_guest_descriptor: unknown descriptor\n");
+                       err = -1;
+                       goto error;
+       }
+
+       ERROR_IF(vmwrite(selector_enc, selector));
+       ERROR_IF(vmwrite(rights_enc, rights));
+       ERROR_IF(vmwrite(base_enc, base));
+       ERROR_IF(vmwrite(limit_enc, limit));
+
+       return 0;
+error:
+       kprintf("VMM: vmx_set_guest_descriptor failed\n");
+       return err;
+}
+
+/*
+ * Called by the first thread of the VMM process
+ * - create a new vmspace
+ * - init the vmspace with EPT PG_* bits and
+ *   EPT copyin/copyout functions
+ * - replace the vmspace of the current proc
+ * - remove the old vmspace
+ */
+static int
+vmx_vminit_master(struct guest_options *options)
+{
+       struct vmspace *oldvmspace;
+       struct vmspace *newvmspace;
+       struct proc *p = curthread->td_proc;
+       struct vmm_proc *p_vmm;
+
+       oldvmspace = curthread->td_lwp->lwp_vmspace;
+       newvmspace = vmspace_fork(oldvmspace);
+
+       vmx_ept_pmap_pinit(vmspace_pmap(newvmspace));
+       bzero(vmspace_pmap(newvmspace)->pm_pml4, PAGE_SIZE);
+
+       lwkt_gettoken(&oldvmspace->vm_map.token);
+       lwkt_gettoken(&newvmspace->vm_map.token);
+
+       pmap_pinit2(vmspace_pmap(newvmspace));
+       pmap_replacevm(p, newvmspace, 0);
+
+       lwkt_reltoken(&newvmspace->vm_map.token);
+       lwkt_reltoken(&oldvmspace->vm_map.token);
+
+       vmspace_free(oldvmspace);
+
+       options->vmm_cr3 = vtophys(vmspace_pmap(newvmspace)->pm_pml4);
+
+       p_vmm = kmalloc(sizeof(struct vmm_proc), M_TEMP, M_WAITOK | M_ZERO);
+       p_vmm->guest_cr3 = options->guest_cr3;
+       p_vmm->vmm_cr3 = options->vmm_cr3;
+       p->p_vmm = (void *)p_vmm;
+
+       if (p->p_vkernel) {
+               p->p_vkernel->vkernel_cr3 = options->guest_cr3;
+               dkprintf("PROCESS CR3 %016jx\n", (intmax_t)options->guest_cr3);
+       }
+
+       return 0;
+}
+
+static int
+vmx_vminit(struct guest_options *options)
+{
+       struct vmx_thread_info *vti;
+       int err;
+       struct tls_info guest_fs = curthread->td_tls.info[0];
+       struct tls_info guest_gs = curthread->td_tls.info[1];
+
+       vti = kmalloc(sizeof(struct vmx_thread_info), M_TEMP, M_WAITOK | M_ZERO);
+       curthread->td_vmm = (void*) vti;
+
+       if (options->master) {
+               vmx_vminit_master(options);
+       }
+
+       bcopy(&options->tf, &vti->guest, sizeof(struct trapframe));
+
+       /*
+        * Be sure the guest sees a successful syscall return
+        * (rax = 0, carry clear) when the VMM hook enters it.
+        */
+       vti->guest.tf_rax = 0;
+       vti->guest.tf_rflags &= ~PSL_C;
+
+       vti->vmcs_region_na = kmalloc(vmx_region_size + VMXON_REGION_ALIGN_SIZE,
+                   M_TEMP,
+                   M_WAITOK | M_ZERO);
+
+       /* Align address */
+       vti->vmcs_region = (unsigned char*) VMXON_REGION_ALIGN(vti->vmcs_region_na);
+       vti->last_cpu = -1;
+
+       vti->guest_cr3 = options->guest_cr3;
+       vti->vmm_cr3 = options->vmm_cr3;
+
+       /* Put the VMX revision in the first 31 bits */
+       *((uint32_t *)vti->vmcs_region) = vmx_revision;
+
+       /*
+        * vmclear the vmcs to initialize it.
+        */
+       ERROR_IF(vmclear(vti->vmcs_region));
+
+       crit_enter();
+
+       ERROR_IF(execute_vmptrld(vti));
+
+       /* Load the VMX controls */
+       ERROR_IF(vmwrite(VMCS_PINBASED_CTLS, vmx_pinbased.ctls));
+       ERROR_IF(vmwrite(VMCS_PROCBASED_CTLS, vmx_procbased.ctls));
+       ERROR_IF(vmwrite(VMCS_PROCBASED2_CTLS, vmx_procbased2.ctls));
+       ERROR_IF(vmwrite(VMCS_VMEXIT_CTLS, vmx_exit.ctls));
+       ERROR_IF(vmwrite(VMCS_VMENTRY_CTLS, vmx_entry.ctls));
+
+       /* Load HOST CRs */
+       ERROR_IF(vmwrite(VMCS_HOST_CR0, rcr0()));
+       ERROR_IF(vmwrite(VMCS_HOST_CR4, rcr4()));
+
+       /* Load HOST EFER (loading HOST PAT is disabled for now) */
+#if 0
+       ERROR_IF(vmwrite(VMCS_HOST_IA32_PAT, rdmsr(MSR_PAT)));
+#endif
+       ERROR_IF(vmwrite(VMCS_HOST_IA32_EFER, rdmsr(MSR_EFER)));
+
+       /* Load HOST selectors */
+       ERROR_IF(vmwrite(VMCS_HOST_ES_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
+       ERROR_IF(vmwrite(VMCS_HOST_SS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
+       ERROR_IF(vmwrite(VMCS_HOST_FS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
+       ERROR_IF(vmwrite(VMCS_HOST_GS_SELECTOR, GSEL(GDATA_SEL, SEL_KPL)));
+       ERROR_IF(vmwrite(VMCS_HOST_CS_SELECTOR, GSEL(GCODE_SEL, SEL_KPL)));
+       ERROR_IF(vmwrite(VMCS_HOST_TR_SELECTOR, GSEL(GPROC0_SEL, SEL_KPL)));
+
+       /*
+        * The BASE addresses are written on each VMRUN in case
+        * the CPU changes, because they are per-CPU values.
+        */
+
+       /*
+        * Call vmx_vmexit on VM_EXIT condition
+        * The RSP will point to the vmx_thread_info
+        */
+       ERROR_IF(vmwrite(VMCS_HOST_RIP, (uint64_t) vmx_vmexit));
+       ERROR_IF(vmwrite(VMCS_HOST_RSP, (uint64_t) vti));
+       ERROR_IF(vmwrite(VMCS_HOST_CR3, (uint64_t) KPML4phys));
+
+       /*
+        * GUEST initialization
+        * - set the descriptors according to the conditions in the Intel
+        *   manual, "26.3.1.2 Checks on Guest Segment Registers"
+        * - set the privilege to SEL_UPL (the vkernel will run
+        *   in userspace context)
+        */
+       ERROR_IF(vmx_set_guest_descriptor(ES, GSEL(GUDATA_SEL, SEL_UPL),
+           VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
+           0, 0));
+
+       ERROR_IF(vmx_set_guest_descriptor(SS, GSEL(GUDATA_SEL, SEL_UPL),
+           VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
+           0, 0));
+
+       ERROR_IF(vmx_set_guest_descriptor(DS, GSEL(GUDATA_SEL, SEL_UPL),
+           VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
+           0, 0));
+
+       ERROR_IF(vmx_set_guest_descriptor(FS, GSEL(GUDATA_SEL, SEL_UPL),
+           VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
+           (uint64_t) guest_fs.base, (uint32_t) guest_fs.size));
+
+       ERROR_IF(vmx_set_guest_descriptor(GS, GSEL(GUDATA_SEL, SEL_UPL),
+           VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
+           (uint64_t) guest_gs.base, (uint32_t) guest_gs.size));
+
+       ERROR_IF(vmx_set_guest_descriptor(CS, GSEL(GUCODE_SEL, SEL_UPL),
+           VMCS_SEG_TYPE(11) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P | VMCS_L,
+           0, 0));
+
+       ERROR_IF(vmx_set_guest_descriptor(TR, GSEL(GPROC0_SEL, SEL_UPL),
+                       VMCS_SEG_TYPE(11) | VMCS_P,
+                       0, 0));
+
+       ERROR_IF(vmx_set_guest_descriptor(LDTR, 0, VMCS_SEG_UNUSABLE, 0, 0));
+
+       /* Set the CR0/CR4 registers, removing the unsupported bits */
+       ERROR_IF(vmwrite(VMCS_GUEST_CR0, (CR0_PE | CR0_PG |
+           cr0_fixed_to_1) & ~cr0_fixed_to_0));
+       ERROR_IF(vmwrite(VMCS_GUEST_CR4, (CR4_PAE | CR4_FXSR | CR4_XMM | CR4_XSAVE |
+           cr4_fixed_to_1) & ~ cr4_fixed_to_0));
+
+       /* Leave EFER_SCE clear so "syscall" raises #UD and can be caught */
+       ERROR_IF(vmwrite(VMCS_GUEST_IA32_EFER, (EFER_LME | EFER_LMA)));
+
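+       /*
+        * PSL_I enables interrupts in the guest; 0x02 is the reserved
+        * RFLAGS bit 1, which must always be set.
+        */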
+       vti->guest.tf_rflags = PSL_I | 0x02;
+       ERROR_IF(vmwrite(VMCS_GUEST_RFLAGS, vti->guest.tf_rflags));
+
+       /* The guest CR3, pointing at the guest page table */
+       ERROR_IF(vmwrite(VMCS_GUEST_CR3, (uint64_t) vti->guest_cr3));
+
+       /* VMEXIT on all possible exceptions */
+       ERROR_IF(vmwrite(VMCS_EXCEPTION_BITMAP, (uint64_t) 0xFFFFFFFF));
+
+       /* Guest RIP and RSP */
+       ERROR_IF(vmwrite(VMCS_GUEST_RIP, options->tf.tf_rip));
+       ERROR_IF(vmwrite(VMCS_GUEST_RSP, options->tf.tf_rsp));
+
+       /*
+        * This field is included for future expansion.
+        * Software should set this field to FFFFFFFF_FFFFFFFFH
+        * to avoid VM-entry failures (see Section 26.3.1.5).
+        */
+       ERROR_IF(vmwrite(VMCS_LINK_POINTER, ~0ULL));
+
+       /* The pointer to the EPT pagetable */
+       ERROR_IF(vmwrite(VMCS_EPTP, vmx_eptp(vti->vmm_cr3)));
+
+       vti->invept_desc.eptp = vmx_eptp(vti->vmm_cr3);
+
+       crit_exit();
+
+       return 0;
+error:
+       crit_exit();
+
+       kprintf("VMM: vmx_vminit failed\n");
+       execute_vmclear(vti);
+
+       kfree(vti->vmcs_region_na, M_TEMP);
+       kfree(vti, M_TEMP);
+       return err;
+}
+
+static int
+vmx_vmdestroy(void)
+{
+       struct vmx_thread_info *vti = curthread->td_vmm;
+       struct proc *p = curproc;
+       int error = -1;
+
+       if (vti != NULL) {
+               vmx_check_cpu_migration();
+               if (vti->vmcs_region &&
+                   pcpu_info[mycpu->gd_cpuid].loaded_vmx == vti)
+                       execute_vmclear(vti);
+
+               if (vti->vmcs_region_na != NULL) {
+                       kfree(vti->vmcs_region_na, M_TEMP);
+                       kfree(vti, M_TEMP);
+                       error = 0;
+               }
+               curthread->td_vmm = NULL;
+               lwkt_gettoken(&p->p_token);
+               if (p->p_nthreads == 1) {
+                       kfree(p->p_vmm, M_TEMP);
+                       p->p_vmm = NULL;
+               }
+               lwkt_reltoken(&p->p_token);
+       }
+       return error;
+}
+
+/*
+ * Checks if we migrated to another cpu
+ *
+ * No locks are required
+ */
+static int
+vmx_check_cpu_migration(void)
+{
+       struct vmx_thread_info * vti;
+       struct globaldata *gd;
+       int err;
+
+       gd = mycpu;
+       vti = (struct vmx_thread_info *) curthread->td_vmm;
+       ERROR_IF(vti == NULL);
+
+       if (vti->last_cpu != -1 && vti->last_cpu != gd->gd_cpuid &&
+           pcpu_info[vti->last_cpu].loaded_vmx == vti) {
+               /*
+                * Do not reset last_cpu to -1 here, leave it caching
+                * the cpu whose per-cpu fields the VMCS is synchronized
+                * with.  The pcpu_info[] check prevents unnecessary extra
+                * cpusyncs.
+                */
+               dkprintf("VMM: cpusync from %d to %d\n", gd->gd_cpuid, vti->last_cpu);
+
+               /* Clear the VMCS area if it last ran on another CPU */
+               lwkt_cpusync_simple(CPUMASK(vti->last_cpu),
+                                   execute_vmclear, (void *)vti);
+       }
+       return 0;
+error:
+       kprintf("VMM: vmx_check_cpu_migration failed\n");
+       return err;
+}
+
+/*
+ * Handle CPU migration.
+ *
+ * We have to enter with interrupts disabled/in a critical section
+ * to be sure that another VMCS won't steal our CPU.
+ */
+static inline int
+vmx_handle_cpu_migration(void)
+{
+       struct vmx_thread_info * vti;
+       struct globaldata *gd;
+       int err;
+
+       gd = mycpu;
+       vti = (struct vmx_thread_info *) curthread->td_vmm;
+       ERROR_IF(vti == NULL);
+
+       if (vti->last_cpu != gd->gd_cpuid) {
+               /*
+                * We need to synchronize the per-cpu fields after changing
+                * cpus.
+                */
+               dkprintf("VMM: vmx_handle_cpu_migration init per CPU data\n");
+
+               ERROR_IF(execute_vmptrld(vti));
+
+               /* Host related registers */
+               ERROR_IF(vmwrite(VMCS_HOST_GS_BASE, (uint64_t) gd)); /* mycpu points to %gs:0 */
+               ERROR_IF(vmwrite(VMCS_HOST_TR_BASE, (uint64_t) &gd->gd_prvspace->mdglobaldata.gd_common_tss));
+
+               ERROR_IF(vmwrite(VMCS_HOST_GDTR_BASE, (uint64_t) &gdt[gd->gd_cpuid * NGDT]));
+               ERROR_IF(vmwrite(VMCS_HOST_IDTR_BASE, (uint64_t) r_idt_arr[gd->gd_cpuid].rd_base));
+
+
+               /* Guest related registers */
+               ERROR_IF(vmwrite(VMCS_GUEST_GDTR_BASE, (uint64_t) &gdt[gd->gd_cpuid * NGDT]));
+               ERROR_IF(vmwrite(VMCS_GUEST_GDTR_LIMIT, (uint64_t) (NGDT * sizeof(gdt[0]) - 1)));
+
+               /*
+                * Indicates which cpu the per-cpu fields are synchronized
+                * with.  Does not indicate whether the vmcs is active on
+                * that particular cpu.
+                */
+               vti->last_cpu = gd->gd_cpuid;
+       } else if (pcpu_info[gd->gd_cpuid].loaded_vmx != vti) {
+               /*
+                * We only need to vmptrld
+                */
+               dkprintf("VMM: vmx_handle_cpu_migration: vmcs is not loaded\n");
+
+               ERROR_IF(execute_vmptrld(vti));
+
+       } /* else we don't need to do anything */
+       return 0;
+error:
+       kprintf("VMM: vmx_handle_cpu_migration failed\n");
+       return err;
+}
+
+/*
+ * Load information about the VMEXIT.
+ *
+ * We are still running with interrupts disabled/in a critical section
+ * because we must operate on the VMCS loaded on this CPU.
+ */
+static inline int
+vmx_vmexit_loadinfo(void)
+{
+       struct vmx_thread_info *vti;
+       int err;
+
+       vti = (struct vmx_thread_info *) curthread->td_vmm;
+       ERROR_IF(vti == NULL);
+
+       ERROR_IF(vmread(VMCS_VMEXIT_REASON, &vti->vmexit_reason));
+       ERROR_IF(vmread(VMCS_EXIT_QUALIFICATION, &vti->vmexit_qualification));
+       ERROR_IF(vmread(VMCS_VMEXIT_INTERRUPTION_INFO, &vti->vmexit_interruption_info));
+       ERROR_IF(vmread(VMCS_VMEXIT_INTERRUPTION_ERROR, &vti->vmexit_interruption_error));
+       ERROR_IF(vmread(VMCS_VMEXIT_INSTRUCTION_LENGTH, &vti->vmexit_instruction_length));
+       ERROR_IF(vmread(VMCS_GUEST_PHYSICAL_ADDRESS, &vti->guest_physical_address));
+       ERROR_IF(vmread(VMCS_GUEST_RIP, &vti->guest.tf_rip));
+       ERROR_IF(vmread(VMCS_GUEST_CS_SELECTOR, &vti->guest.tf_cs));
+       ERROR_IF(vmread(VMCS_GUEST_RFLAGS, &vti->guest.tf_rflags));
+       ERROR_IF(vmread(VMCS_GUEST_RSP, &vti->guest.tf_rsp));
+       ERROR_IF(vmread(VMCS_GUEST_SS_SELECTOR, &vti->guest.tf_ss));
+
+       return 0;
+error:
+       kprintf("VMM: vmx_vmexit_loadinfo failed\n");
+       return err;
+}
+
+
+static int
+vmx_set_tls_area(void)
+{
+       struct tls_info *guest_fs = &curthread->td_tls.info[0];
+       struct tls_info *guest_gs = &curthread->td_tls.info[1];
+
+       int err;
+
+       dkprintf("VMM: vmx_set_tls_area hook\n");
+
+       crit_enter();
+
+       ERROR_IF(vmx_check_cpu_migration());
+       ERROR_IF(vmx_handle_cpu_migration());
+
+       /* set %fs */
+       ERROR_IF(vmx_set_guest_descriptor(FS, GSEL(GUDATA_SEL, SEL_UPL),
+           VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
+           (uint64_t) guest_fs->base, (uint32_t) guest_fs->size));
+
+       /* set %gs */
+       ERROR_IF(vmx_set_guest_descriptor(GS, GSEL(GUDATA_SEL, SEL_UPL),
+           VMCS_SEG_TYPE(3) | VMCS_S | VMCS_DPL(SEL_UPL) | VMCS_P,
+           (uint64_t) guest_gs->base, (uint32_t) guest_gs->size));
+
+       crit_exit();
+       return 0;
+
+error:
+       crit_exit();
+       return err;
+}
+
+
+static int
+vmx_handle_vmexit(void)
+{
+       struct vmx_thread_info * vti;
+       int exit_reason;
+       int exception_type;
+       int exception_number;
+       int err;
+       int func, regs[4];
+       int fault_type, rv;
+       int fault_flags = 0;
+       struct lwp *lp = curthread->td_lwp;
+
+       dkprintf("VMM: handle_vmx_vmexit\n");
+       vti = (struct vmx_thread_info *) curthread->td_vmm;
+       ERROR_IF(vti == NULL);
+
+       exit_reason = VMCS_BASIC_EXIT_REASON(vti->vmexit_reason);
+       switch (exit_reason) {
+               case EXIT_REASON_EXCEPTION:
+                       dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EXCEPTION with qualification "
+                           "%llx, interruption info %llx, interruption error %llx, instruction "
+                           "length %llx\n",
+                           (long long) vti->vmexit_qualification,
+                           (long long) vti->vmexit_interruption_info,
+                           (long long) vti->vmexit_interruption_error,
+                           (long long) vti->vmexit_instruction_length);
+
+                       dkprintf("VMM: handle_vmx_vmexit: rax: %llx, rip: %llx, "
+                           "rsp: %llx,  rdi: %llx, rsi: %llx, %d, vti: %p, master: %p\n",
+                           (long long)vti->guest.tf_rax,
+                           (long long)vti->guest.tf_rip,
+                           (long long)vti->guest.tf_rsp,
+                           (long long)vti->guest.tf_rdi,
+                           (long long)vti->guest.tf_rsi, exit_reason, vti, curproc->p_vmm);
+
+                       exception_type = VMCS_EXCEPTION_TYPE(vti->vmexit_interruption_info);
+                       exception_number = VMCS_EXCEPTION_NUMBER(vti->vmexit_interruption_info);
+
+                       if (exception_type == VMCS_EXCEPTION_HARDWARE) {
+                               switch (exception_number) {
+                                       case IDT_UD:
+                                               /*
+                                                * The "syscall" instruction is disabled
+                                                * (EFER_SCE is clear), so executing it
+                                                * raises #UD and we catch and emulate
+                                                * it here.
+                                                */
+                                               dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE IDT_UD\n");
+#ifdef VMM_DEBUG
+                                               /* Check whether it is the syscall asm instruction */
+                                               uint8_t instr[INSTRUCTION_MAX_LENGTH];
+                                               if (copyin((const void *) vti->guest.tf_rip, instr, vti->vmexit_instruction_length) &&
+                                                   instr_check(&syscall_asm,(void *) instr, (uint8_t) vti->vmexit_instruction_length)) {
+                                                       kprintf("VMM: handle_vmx_vmexit: UD different from syscall: ");
+                                                       db_disasm((db_addr_t) instr, FALSE, NULL);
+                                               }
+#endif
+                                               /* Called to force a VMEXIT and invalidate TLB */
+                                               if (vti->guest.tf_rax == -1) {
+                                                       vti->guest.tf_rip += vti->vmexit_instruction_length;
+                                                       break;
+                                               }
+
+                                               vti->guest.tf_err = 2;
+                                               vti->guest.tf_trapno = T_FAST_SYSCALL;
+                                               vti->guest.tf_xflags = 0;
+
+                                               vti->guest.tf_rip += vti->vmexit_instruction_length;
+
+                                               syscall2(&vti->guest);
+
+                                               break;
+                                       case IDT_PF:
+                                               dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE IDT_PF at %llx\n",
+                                                   (long long) vti->guest.tf_rip);
+
+                                               if (vti->guest.tf_rip == 0) {
+                                                       kprintf("VMM: handle_vmx_vmexit: Terminating...\n");
+                                                       err = -1;
+                                                       goto error;
+                                               }
+
+                                               vti->guest.tf_err = vti->vmexit_interruption_error;
+                                               vti->guest.tf_addr = vti->vmexit_qualification;
+                                               vti->guest.tf_xflags = 0;
+                                               vti->guest.tf_trapno = T_PAGEFLT;
+
+                                               /*
+                                                * If we are a user process inside
+                                                * the vkernel, pass the PF to the
+                                                * vkernel, which will trigger
+                                                * user_trap().
+                                                *
+                                                * If we are the vkernel itself, send
+                                                * ourselves a SIGSEGV signal, which
+                                                * will trigger kern_trap().
+                                                */
+
+                                               if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
+                                                       vkernel_trap(lp, &vti->guest);
+                                               } else {
+                                                       trapsignal(lp, SIGSEGV, SEGV_MAPERR);
+                                               }
+
+                                               break;
+                                       default:
+                                               kprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_HARDWARE unknown "
+                                                   "number %d rip: %llx, rsp: %llx\n", exception_number,
+                                                   (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp);
+                                               err = -1;
+                                               goto error;
+                               }
+                       } else if (exception_type == VMCS_EXCEPTION_SOFTWARE) {
+                               switch (exception_number) {
+                                       case 3:
+                                               dkprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_SOFTWARE "
+                                                   "number %d rip: %llx, rsp: %llx\n", exception_number,
+                                                   (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp);
+
+                                               vti->guest.tf_trapno = T_BPTFLT;
+                                               vti->guest.tf_xflags = 0;
+                                               vti->guest.tf_err = 0;
+                                               vti->guest.tf_addr = 0;
+
+                                               vti->guest.tf_rip += vti->vmexit_instruction_length;
+
+                                               trap(&vti->guest);
+
+                                               break;
+                                       default:
+                                               kprintf("VMM: handle_vmx_vmexit: VMCS_EXCEPTION_SOFTWARE unknown "
+                                                   "number %d rip: %llx, rsp: %llx\n", exception_number,
+                                                   (long long)vti->guest.tf_rip, (long long)vti->guest.tf_rsp);
+                                               err = -1;
+                                               goto error;
+                               }
+                       } else {
+                               kprintf("VMM: handle_vmx_vmexit: unknown exception type %d\n", exception_type);
+                               err = -1;
+                               goto error;
+                       }
+                       break;
+               case EXIT_REASON_EXT_INTR:
+                       dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EXT_INTR\n");
+                       break;
+               case EXIT_REASON_CPUID:
+                       dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_CPUID\n");
+
+                       /*
+                        * Execute CPUID instruction and pass
+                        * the result to the vkernel
+                        */
+
+                       func = vti->guest.tf_rax;
+                       do_cpuid(func, regs);
+
+                       vti->guest.tf_rax = regs[0];
+                       vti->guest.tf_rbx = regs[1];
+                       vti->guest.tf_rcx = regs[2];
+                       vti->guest.tf_rdx = regs[3];
+
+                       vti->guest.tf_rip += vti->vmexit_instruction_length;
+
+                       break;
+               case EXIT_REASON_EPT_FAULT:
+                       /*
+                        * EPT faults are resolved like normal PFs.  Nothing special:
+                        * - get the fault type
+                        * - get the fault address (which is a GPA)
+                        * - execute vm_fault on the vm_map
+                        */
+                       fault_type = vmx_ept_fault_type(vti->vmexit_qualification);
+
+                       dkprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EPT_FAULT with qualification %lld,"
+                           "GPA: %llx, fault_type: %d\n",(long long) vti->vmexit_qualification,
+                           (unsigned long long) vti->guest_physical_address, fault_type);
+
+                       if (fault_type & VM_PROT_WRITE)
+                               fault_flags = VM_FAULT_DIRTY;
+                       else
+                               fault_flags = VM_FAULT_NORMAL;
+
+                       rv = vm_fault(&curthread->td_lwp->lwp_vmspace->vm_map,
+                           trunc_page(vti->guest_physical_address), fault_type, fault_flags);
+
+                       if (rv != KERN_SUCCESS) {
+                               kprintf("VMM: handle_vmx_vmexit: EXIT_REASON_EPT_FAULT couldn't resolve %llx\n",
+                                   (unsigned long long) vti->guest_physical_address);
+                               err = -1;
+                               goto error;
+                       }
+                       break;
+               default:
+                       kprintf("VMM: handle_vmx_vmexit: unknown exit reason: %d with qualification %lld\n",
+                           exit_reason, (long long) vti->vmexit_qualification);
+                       err = -1;
+                       goto error;
+       }
+       return 0;
+error:
+       return err;
+}
+
+static int
+vmx_vmrun(void)
+{
+       struct vmx_thread_info * vti;
+       struct globaldata *gd;
+       int err;
+       int ret;
+       int sticks = 0;
+       uint64_t val;
+       cpumask_t oactive;
+       cpumask_t nactive;
+       struct trapframe *save_frame;
+       thread_t td = curthread;
+
+       vti = (struct vmx_thread_info *) td->td_vmm;
+       save_frame = td->td_lwp->lwp_md.md_regs;
+       td->td_lwp->lwp_md.md_regs = &vti->guest;
+restart:
+       crit_enter();
+
+       /*
+        * This can change the cpu we are running on.
+        */
+       trap_handle_userexit(&vti->guest, sticks);
+       gd = mycpu;
+
+       ERROR2_IF(vti == NULL);
+       ERROR2_IF(vmx_check_cpu_migration());
+       ERROR2_IF(vmx_handle_cpu_migration());
+
+       /*
+        * Make the state safe to VMENTER
+        * - disable interrupts and check if there were any pending
+        * - check for ASTFLTs
+        * - loop again until there are no ASTFLTs
+        */
+       cpu_disable_intr();
+       splz();
+       if (gd->gd_reqflags & RQF_AST_MASK) {
+               atomic_clear_int(&gd->gd_reqflags, RQF_AST_SIGNAL);
+               cpu_enable_intr();
+               crit_exit();
+               vti->guest.tf_trapno = T_ASTFLT;
+               trap(&vti->guest);
+               /* CURRENT CPU CAN CHANGE */
+               goto restart;
+       }
+       if (vti->last_cpu != gd->gd_cpuid) {
+               cpu_enable_intr();
+               crit_exit();
+               kprintf("VMM: vmx_vmrun: vti unexpectedly "
+                       "changed cpus %d->%d\n",
+                       gd->gd_cpuid, vti->last_cpu);
+               goto restart;
+       }
+
+       /*
+        * Add us to the list of cpus running vkernel operations, interlock
+        * against anyone trying to do an invalidation.
+        */
+        for (;;) {
+                oactive = td->td_proc->p_vmm_cpumask;
+                cpu_ccfence();
+               if ((oactive & CPUMASK_LOCK) == 0) {
+                       nactive = oactive | gd->gd_cpumask;
+                       if (atomic_cmpset_cpumask(&td->td_proc->p_vmm_cpumask,
+                                                 oactive, nactive)) {
+                               /* fast path */
+                               break;
+                       }
+                       /* cmpset race */
+                       cpu_pause();
+                       continue;
+               }
+
+               /*
+                * More complex: an invalidation holds CPUMASK_LOCK;
+                * sleep until it completes, then retry.
+                */
+               cpu_enable_intr();
+               tsleep_interlock(&td->td_proc->p_vmm_cpumask, 0);
+               if (td->td_proc->p_vmm_cpumask & CPUMASK_LOCK) {
+                       tsleep(&td->td_proc->p_vmm_cpumask, PINTERLOCKED,
+                              "vmminvl", hz);
+               }
+               crit_exit();
+               goto restart;
+       }
+
+       /*
+        * Load specific Guest registers
+        * GP registers will be loaded in vmx_launch/resume
+        */
+       ERROR_IF(vmwrite(VMCS_GUEST_RIP, vti->guest.tf_rip));
+       ERROR_IF(vmwrite(VMCS_GUEST_CS_SELECTOR, vti->guest.tf_cs));
+       ERROR_IF(vmwrite(VMCS_GUEST_RFLAGS, vti->guest.tf_rflags));
+       ERROR_IF(vmwrite(VMCS_GUEST_RSP, vti->guest.tf_rsp));
+       ERROR_IF(vmwrite(VMCS_GUEST_SS_SELECTOR, vti->guest.tf_ss));
+       ERROR_IF(vmwrite(VMCS_GUEST_CR3, (uint64_t) vti->guest_cr3));
+
+       /*
+        * FPU: make sure this thread owns the npx state before entering
+        * the guest, saving any other thread's state first.
+        */
+       if (mdcpu->gd_npxthread != td) {
+               if (mdcpu->gd_npxthread)
+                       npxsave(mdcpu->gd_npxthread->td_savefpu);
+               npxdna();
+       }
+
+       /*
+        * The kernel caches the MSR_FSBASE value in mdcpu->gd_user_fs.
+        * A vmexit loads this unconditionally from the VMCS so make
+        * sure it loads the correct value.
+        */
+       ERROR_IF(vmwrite(VMCS_HOST_FS_BASE, mdcpu->gd_user_fs));
+
+       /*
+        * EPT mappings can't be invalidated with the normal invlpg/invltlb
+        * instructions.  We have to execute a special instruction that
+        * invalidates the entire EPT cache ("invept").
+        *
+        * pm_invgen is a generation number which is incremented in
+        * pmap_inval_interlock() before doing any invalidates.  The
+        * pmap_inval_interlock() will cause all the CPUs that are using
+        * the EPT to VMEXIT and wait for the interlock to complete.
+        * When they VMENTER again they will see that the generation
+        * number has changed from their cached copy and do an invept.
+        */
+       if (vti->eptgen != td->td_proc->p_vmspace->vm_pmap.pm_invgen) {
+               vti->eptgen = td->td_proc->p_vmspace->vm_pmap.pm_invgen;
+
+               ERROR_IF(invept(INVEPT_TYPE_SINGLE_CONTEXT,
+                   (uint64_t*)&vti->invept_desc));
+       }
+
+       if (vti->launched) { /* vmresume called from vmx_trap.s */
+               dkprintf("\n\nVMM: vmx_vmrun: vmx_resume\n");
+               ret = vmx_resume(vti);
+
+       } else { /* vmlaunch called from vmx_trap.s */
+               dkprintf("\n\nVMM: vmx_vmrun: vmx_launch\n");
+               vti->launched = 1;
+               ret = vmx_launch(vti);
+       }
+
+       atomic_clear_cpumask(&td->td_proc->p_vmm_cpumask, gd->gd_cpumask);
+
+       /*
+        * This is our return point from vmlaunch/vmresume.
+        * There are two situations:
+        * - the vmlaunch/vmresume executed successfully and
+        *   returned through "vmx_vmexit", which restored the
+        *   state (registers) and returned here with ret set
+        *   to VM_EXIT (ret is actually %rax)
+        * - the vmlaunch/vmresume failed to execute and returned
+        *   immediately with ret set to the error code
+        */
+       if (ret == VM_EXIT) {
+
+               ERROR_IF(vmx_vmexit_loadinfo());
+
+               cpu_enable_intr();
+               trap_handle_userenter(td);
+               sticks = td->td_sticks;
+               crit_exit();
+
+               /*
+                * Handle the VMEXIT reason
+                * - if successful we VMENTER again
+                * - if not, we exit
+                */
+               if (vmx_handle_vmexit())
+                       goto done;
+
+               /* We handled the VMEXIT reason and continue with VM execution */
+               goto restart;
+
+       } else {
+               vti->launched = 0;
+
+               /*
+                * Two types of error:
+                * - VM_FAIL_VALID - the host state was ok,
+                *   but probably the guest state was not
+                * - VM_FAIL_INVALID - the parameters or the host
+                *   state were not ok
+                */
+               if (ret == VM_FAIL_VALID) {
+                       vmread(VMCS_INSTR_ERR, &val);
+                       err = (int) val;
+                       kprintf("VMM: vmx_vmrun: vmenter failed with VM_FAIL_VALID, error code %d\n", err);
+               } else {
+                       kprintf("VMM: vmx_vmrun: vmenter failed with VM_FAIL_INVALID\n");
+               }
+               goto error;
+       }
+done:
+       kprintf("VMM: vmx_vmrun: returning with success\n");
+       return 0;
+error:
+       cpu_enable_intr();
+error2:
+       trap_handle_userenter(td);
+       td->td_lwp->lwp_md.md_regs = save_frame;
+       atomic_clear_cpumask(&td->td_proc->p_vmm_cpumask, gd->gd_cpumask);
+       crit_exit();
+       kprintf("VMM: vmx_vmrun failed\n");
+       return err;
+}
+
+/*
+ * Called when returning to user-space
+ * after executing lwp_fork.
+ */
+static void
+vmx_lwp_return(struct lwp *lp, struct trapframe *frame)
+{
+       struct guest_options options;
+       int vmrun_err;
+       struct vmm_proc *p_vmm = (struct vmm_proc *)curproc->p_vmm;
+
+       dkprintf("VMM: vmx_lwp_return\n");
+
+       bzero(&options, sizeof(struct guest_options));
+
+       bcopy(frame, &options.tf, sizeof(struct trapframe));
+
+       options.guest_cr3 = p_vmm->guest_cr3;
+       options.vmm_cr3 = p_vmm->vmm_cr3;
+
+       vmx_vminit(&options);
+       generic_lwp_return(lp, frame);
+
+       vmrun_err = vmx_vmrun();
+
+       exit1(W_EXITCODE(vmrun_err, 0));
+}
+
+static void
+vmx_set_guest_cr3(register_t guest_cr3)
+{
+       struct vmx_thread_info *vti = (struct vmx_thread_info *) curthread->td_vmm;
+       vti->guest_cr3 = guest_cr3;
+}
+
+static int
+vmx_vm_get_gpa(struct proc *p, register_t *gpa, register_t uaddr)
+{
+       return guest_phys_addr(p->p_vmspace, gpa, p->p_vkernel->vkernel_cr3, uaddr);
+}
+
+static struct vmm_ctl ctl_vmx = {
+       .name = "VMX from Intel",
+       .init = vmx_init,
+       .enable = vmx_enable,
+       .disable = vmx_disable,
+       .vminit = vmx_vminit,
+       .vmdestroy = vmx_vmdestroy,
+       .vmrun = vmx_vmrun,
+       .vm_set_tls_area = vmx_set_tls_area,
+       .vm_lwp_return = vmx_lwp_return,
+       .vm_set_guest_cr3 = vmx_set_guest_cr3,
+       .vm_get_gpa = vmx_vm_get_gpa,
+};
+
+struct vmm_ctl*
+get_ctl_intel(void)
+{
+       return &ctl_vmx;
+}
diff --git a/sys/platform/pc64/vmm/vmx.h b/sys/platform/pc64/vmm/vmx.h
new file mode 100644 (file)
index 0000000..b3bc94e
--- /dev/null
@@ -0,0 +1,333 @@
+/*
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Mihai Carabas <mihai.carabas@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _VMM_VMX_H_
+#define _VMM_VMX_H_
+
+#include <machine/pmap.h>
+
+#include "ept.h"
+
+#define BIT(x) (1ULL << (x))
+
+
+/* VMX info for a generic control */
+struct vmx_ctl_info {
+       uint32_t msr_addr;
+       uint32_t msr_true_addr;
+       uint32_t ctls;
+};
+
+/* The value of a setting */
+typedef enum {
+       ZERO,
+       ONE,
+       DEFAULT
+} setting_t;
+
+/* VMX per cpu info */
+struct vmx_pcpu_info {
+       unsigned char *vmxon_region_na;
+       unsigned char *vmxon_region;
+       struct vmx_thread_info *loaded_vmx;
+};
+
+struct vmx_thread_info {
+       unsigned char *vmcs_region_na;
+       unsigned char *vmcs_region;
+
+       int launched;   /* 0 = must vmlaunch (after vmclear/vmptrld), 1 = may vmresume */
+       int last_cpu;   /* cpu whose per-cpu data the VMCS is synced with, -1 if none */
+
+       /* Guest unsaved registers in VMCS */
+       struct trapframe guest; /* put them directly in trapframe */
+       register_t      guest_cr2;
+
+       /* Host unsaved registers in VMCS */
+       register_t      host_rbx;
+       register_t      host_rbp;
+       register_t      host_r10;
+       register_t      host_r11;
+       register_t      host_r12;
+       register_t      host_r13;
+       register_t      host_r14;
+       register_t      host_r15;
+       register_t      host_rsp;
+       uint64_t        vmexit_reason;
+       uint64_t        vmexit_qualification;
+       uint64_t        vmexit_interruption_info;
+       uint64_t        vmexit_interruption_error;
+       uint64_t        vmexit_instruction_length;
+       uint64_t        guest_physical_address;
+
+       uint64_t        guest_cr3;
+       uint64_t        vmm_cr3;
+       invept_desc_t   invept_desc;
+       long            eptgen;
+};
+
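+/*
+ * Guest segment-register indices, passed as the first argument to
+ * vmx_set_guest_descriptor() (see vmx.c).
+ */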
+typedef enum {
+       ES,
+       CS,
+       SS,
+       DS,
+       FS,
+       GS,
+       LDTR,
+       TR
+} descriptor_t;
+
+int vmx_launch(struct vmx_thread_info *);
+int vmx_resume(struct vmx_thread_info *);
+void vmx_vmexit(void);
+
+
+
+/*
+ * MSR register addresses
+ */
+#define                IA32_FEATURE_CONTROL                    0x3A
+#define                IA32_VMX_BASIC                          0x480
+#define                IA32_VMX_PINBASED_CTLS                  0x481
+#define                IA32_VMX_PROCBASED_CTLS                 0x482
+#define                IA32_VMX_EXIT_CTLS                      0x483
+#define                IA32_VMX_ENTRY_CTLS                     0x484
+#define                IA32_VMX_CR0_FIXED0                     0x486
+#define                IA32_VMX_CR0_FIXED1                     0x487
+#define                IA32_VMX_CR4_FIXED0                     0x488
+#define                IA32_VMX_CR4_FIXED1                     0x489
+#define                IA32_VMX_EPT_VPID_CAP                   0x48C
+#define                IA32_VMX_PROCBASED_CTLS2                0x48B
+#define                IA32_VMX_TRUE_PINBASED_CTLS             0x48D
+#define                IA32_VMX_TRUE_PROCBASED_CTLS            0x48E
+#define                IA32_VMX_TRUE_EXIT_CTLS                 0x48F
+#define                IA32_VMX_TRUE_ENTRY_CTLS                0x490
+
+
+
+/*
+ * IA32 FEATURE CONTROL bits
+ */
+#define                FEATURE_CONTROL_LOCKED                  0
+#define                FEATURE_CONTROL_VMX_BIOS_ENABLED        2
+
+
+
+/*
+ * IA32_VMX_BASIC
+ * A.1 BASIC VMX INFORMATION
+ */
+#define                IS_TRUE_CTL_AVAIL(VMX_BASIC)    ((VMX_BASIC) & (1ULL << (55)))
+#define                VMX_REVISION(reg_val)           (reg_val & 0x7fffffff) /* 0:30 */
+#define        VMX_REGION_SIZE(reg_val)        ((reg_val >> 32) & 0x01fff) /* 32:44 */
+#define        VMX_WIDTH_ADDR(reg_val)         (reg_val >> 48 & 0x1) /* 48 */
+#define                VMXON_REGION_ALIGN_SIZE         4096ULL
+#define                VMXON_REGION_ALIGN(p)           (((unsigned long long)(p) + VMXON_REGION_ALIGN_SIZE) & ~(VMXON_REGION_ALIGN_SIZE - 1))
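+/*
+ * Note: VMXON_REGION_ALIGN always rounds up to the *next* 4K boundary
+ * (e.g. 0x1234 -> 0x2000, 0x2000 -> 0x3000), which is why callers
+ * over-allocate by VMXON_REGION_ALIGN_SIZE (see vmx_vminit()).
+ */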
+
+
+
+/*
+ * Pin-Based VM-Execution Controls
+ * Table 24-5. Definitions of Pin-Based Controls
+ */
+#define                PINBASED_EXTERNAL_INTERRUPT_EXITING     0
+#define                PINBASED_NMI_EXITING                    3
+#define                PINBASED_VIRTUAL_NMIS                   5
+#define                PINBASED_ACTIVATE_VMX_PREEMPTION_TIEMR  6
+#define                PINBASED_PROCESS_POSTED_INTERRUPTS      7
+
+
+
+/*
+ * Processor-Based VM-Execution Controls
+ * Table 24-6. Definitions of Primary Processor-Based Controls
+ */
+#define                PROCBASED_INTERRUPT_WINDOW_EXITING      2
+#define                PROCBASED_USE_TSC_OFFSETING             3
+#define                PROCBASED_HLT_OFFSETING                 7
+#define                PROCBASED_INVLPG_EXITING                9
+#define                PROCBASED_MWAIT_EXITING                 10
+#define                PROCBASED_RDPMC_EXITING                 11
+#define                PROCBASED_RDTSC_EXITING                 12
+#define                PROCBASED_CR3_LOAD_EXITING              15
+#define                PROCBASED_CR3_STORE_EXITING             16
+#define                PROCBASED_CR8_LOAD_EXITING              19
+#define                PROCBASED_CR8_STORE_EXITING             20
+#define                PROCBASED_USE_TPR_SHADOW                21
+#define                PROCBASED_NMI_WINDOWS_EXITING           22
+#define                PROCBASED_MOV_DR_EXITING                23
+#define                PROCBASED_UNCOND_IO_EXITING             24
+#define                PROCBASED_USE_IO_BITMAPS                25
+#define                PROCBASED_MONITOR_TRAP_FLAG             27
+#define                PROCBASED_USE_MSR_BITMAPS               28
+#define                PROCBASED_MONITOR_EXITING               29
+#define                PROCBASED_PAUSE_EXITING                 30
+#define                PROCBASED_ACTIVATE_SECONDARY_CONTROLS   31
+/* Table 24-7. Definitions of Secondary Processor-Based Controls */
+#define                PROCBASED2_VIRTUALIZE_APIC_ACCESSES     0
+#define                PROCBASED2_ENABLE_EPT                   1
+#define                PROCBASED2_DESCRIPTOR_TABLE_EXITING     2
+#define                PROCBASED2_ENABLE_RDTSCP                3
+#define                PROCBASED2_VIRTUAL_x2APIC_MODE          4
+#define                PROCBASED2_ENABLE_VPID                  5
+#define                PROCBASED2_WBINVD_EXITING               6
+#define                PROCBASED2_UNRESTRICTED_GUEST           7
+#define                PROCBASED2_APIC_REGISTER_VIRTULIZATION  8
+#define                PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY   9
+#define                PROCBASED2_PAUSE_LOOP_EXITING           10
+#define                PROCBASED2_RDRAND_EXITING               11
+#define                PROCBASED2_ENABLE_INVPCID               12
+#define                PROCBASED2_ENABLE_VM_FUNCTIONS          13
+#define                PROCBASED2_VMCS_SHADOWING               14
+#define                PROCBASED2_EPT_VIOLATION_VE             18
+
+
+
+/*
+ * VM-EXIT CONTROL FIELDS
+ * Table 24-10. Definitions of VM-Exit Controls
+ */
+#define                VMEXIT_SAVE_DEBUG_CONTROLS              2
+#define                VMEXIT_HOST_ADDRESS_SPACE_SIZE          9
+#define                VMEXIT_LOAD_IA32_PERF_GLOBAL_CTRL       12
+#define                VMEXIT_ACKNOWLEDGE_INTERRUPT_ON_EXIT    15
+#define                VMEXIT_SAVE_IA32_PAT                    18
+#define                VMEXIT_LOAD_IA32_PAT                    19
+#define                VMEXIT_SAVE_IA32_EFER                   20
+#define                VMEXIT_LOAD_IA32_EFER                   21
+#define                VMEXIT_SAVE_VMX_PREEMPTION_TIMER        22
+
+
+
+/*
+ * VM-ENTRY CONTROL FIELDS
+ * Table 24-12. Definitions of VM-Entry Controls
+ */
+#define                VMENTRY_LOAD_DEBUG_CONTROLS             2
+#define                VMENTRY_IA32e_MODE_GUEST                9
+#define                VMENTRY_ENTRY_TO_SMM                    10
+#define                VMENTRY_DEACTIVATE_DUAL_MONITOR         11
+#define                VMENTRY_LOAD_IA32_PERF_GLOBAL_CTRL      13
+#define                VMENTRY_LOAD_IA32_PAT                   14
+#define                VMENTRY_LOAD_IA32_EFER                  15
+
+
+
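+/*
+ * In the VMX capability MSRs (e.g. IA32_VMX_PINBASED_CTLS) bits 63:32
+ * report the allowed one-settings and bits 31:0 the allowed
+ * zero-settings of each control: a control may be set to 1 only if its
+ * allowed-1 bit is 1, and cleared to 0 only if its allowed-0 bit is 0.
+ */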
+#define IS_ONE_SETTING_ALLOWED(val, bit)       \
+    ((val) & (1ULL << (bit + 32)))
+
+#define IS_ZERO_SETTING_ALLOWED(val, bit)      \
+    (((val) & (1ULL << (bit))) == 0)
+
+
+
+/*
+ * VMX Basic Exit Reasons
+ */
+#define                EXIT_REASON_EXCEPTION           0
+#define                EXIT_REASON_EXT_INTR            1
+#define                EXIT_REASON_TRIPLE_FAULT        2
+#define                EXIT_REASON_INIT                3
+#define                EXIT_REASON_SIPI                4
+#define                EXIT_REASON_IO_SMI              5
+#define                EXIT_REASON_SMI                 6
+#define                EXIT_REASON_INTR_WINDOW         7
+#define                EXIT_REASON_NMI_WINDOW          8
+#define                EXIT_REASON_TASK_SWITCH         9
+#define                EXIT_REASON_CPUID               10
+#define                EXIT_REASON_GETSEC              11
+#define                EXIT_REASON_HLT                 12
+#define                EXIT_REASON_INVD                13
+#define                EXIT_REASON_INVLPG              14
+#define                EXIT_REASON_RDPMC               15
+#define                EXIT_REASON_RDTSC               16
+#define                EXIT_REASON_RSM                 17
+#define                EXIT_REASON_VMCALL              18
+#define                EXIT_REASON_VMCLEAR             19
+#define                EXIT_REASON_VMLAUNCH            20
+#define                EXIT_REASON_VMPTRLD             21
+#define                EXIT_REASON_VMPTRST             22
+#define                EXIT_REASON_VMREAD              23
+#define                EXIT_REASON_VMRESUME            24
+#define                EXIT_REASON_VMWRITE             25
+#define                EXIT_REASON_VMXOFF              26
+#define                EXIT_REASON_VMXON               27
+#define                EXIT_REASON_CR_ACCESS           28
+#define                EXIT_REASON_DR_ACCESS           29
+#define                EXIT_REASON_INOUT               30
+#define                EXIT_REASON_RDMSR               31
+#define                EXIT_REASON_WRMSR               32
+#define                EXIT_REASON_INVAL_VMCS          33
+#define                EXIT_REASON_INVAL_MSR           34
+#define                EXIT_REASON_MWAIT               36
+#define                EXIT_REASON_MTF                 37
+#define                EXIT_REASON_MONITOR             39
+#define                EXIT_REASON_PAUSE               40
+#define                EXIT_REASON_MCE                 41
+#define                EXIT_REASON_TPR                 43
+#define                EXIT_REASON_APIC                44
+#define                EXIT_REASON_GDTR_IDTR           46
+#define                EXIT_REASON_LDTR_TR             47
+#define                EXIT_REASON_EPT_FAULT           48
+#define                EXIT_REASON_EPT_MISCONFIG       49
+#define                EXIT_REASON_INVEPT              50
+#define                EXIT_REASON_RDTSCP              51
+#define                EXIT_REASON_VMX_PREEMPT         52
+#define                EXIT_REASON_INVVPID             53
+#define                EXIT_REASON_WBINVD              54
+#define                EXIT_REASON_XSETBV              55
+#define                EXIT_REASON_APIC_WRITE          56
+#define                EXIT_REASON_RDRAND              57
+#define                EXIT_REASON_INVPCID             58
+#define                EXIT_REASON_VMFUNC              59
+
+/* Table 24-2. Format of Access Rights */
+#define                VMCS_SEG_TYPE(x)        (x)
+#define                VMCS_S                  (1 << 4)        /* Descriptor type - 0 = system; 1 = code or data */
+#define                VMCS_DPL(x)             (x << 5)        /* Descriptor Privilege Level */
+#define                VMCS_P                  (1 << 7)        /* Segment present */
+#define                VMCS_AVL                (1 << 12)       /* Available for use by system software */
+#define                VMCS_L                  (1 << 13)       /* 64-bit mode active (for CS only) */
+#define                VMCS_OP_SIZE            (1 << 14)       /* Default operation size (0 = 16-bit segment; 1 = 32-bit segment) */
+#define                VMCS_G                  (1 << 15)       /* Granularity */
+#define                VMCS_SEG_UNUSABLE       (1 << 16)       /* Segment unusable (0 = usable; 1 = unusable) */
+
+#define        VMCS_EXCEPTION_TYPE(x)                  ((x >> 8) & 0x7)
+#define        VMCS_EXCEPTION_EXTERNAL_INTERRUPT       0
+#define        VMCS_EXCEPTION_NMI                      2
+#define        VMCS_EXCEPTION_HARDWARE                 3
+#define        VMCS_EXCEPTION_SOFTWARE                 6
+
+#define        VMCS_EXCEPTION_NUMBER(x)                (x & 0xFF)
+#endif
diff --git a/sys/platform/pc64/vmm/vmx_genassym.c b/sys/platform/pc64/vmm/vmx_genassym.c
new file mode 100644 (file)
index 0000000..274e19f
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Mihai Carabas <mihai.carabas@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/assym.h>
+
+#include "vmx.h"
+#include "vmx_instr.h"
+
+ASSYM(VTI_GUEST_RAX, offsetof(struct vmx_thread_info, guest.tf_rax));
+ASSYM(VTI_GUEST_RBX, offsetof(struct vmx_thread_info, guest.tf_rbx));
+ASSYM(VTI_GUEST_RCX, offsetof(struct vmx_thread_info, guest.tf_rcx));
+ASSYM(VTI_GUEST_RDX, offsetof(struct vmx_thread_info, guest.tf_rdx));
+ASSYM(VTI_GUEST_RSI, offsetof(struct vmx_thread_info, guest.tf_rsi));
+ASSYM(VTI_GUEST_RDI, offsetof(struct vmx_thread_info, guest.tf_rdi));
+ASSYM(VTI_GUEST_RBP, offsetof(struct vmx_thread_info, guest.tf_rbp));
+ASSYM(VTI_GUEST_R8, offsetof(struct vmx_thread_info, guest.tf_r8));
+ASSYM(VTI_GUEST_R9, offsetof(struct vmx_thread_info, guest.tf_r9));
+ASSYM(VTI_GUEST_R10, offsetof(struct vmx_thread_info, guest.tf_r10));
+ASSYM(VTI_GUEST_R11, offsetof(struct vmx_thread_info, guest.tf_r11));
+ASSYM(VTI_GUEST_R12, offsetof(struct vmx_thread_info, guest.tf_r12));
+ASSYM(VTI_GUEST_R13, offsetof(struct vmx_thread_info, guest.tf_r13));
+ASSYM(VTI_GUEST_R14, offsetof(struct vmx_thread_info, guest.tf_r14));
+ASSYM(VTI_GUEST_R15, offsetof(struct vmx_thread_info, guest.tf_r15));
+ASSYM(VTI_GUEST_CR2, offsetof(struct vmx_thread_info, guest_cr2));
+
+ASSYM(VTI_HOST_RBX, offsetof(struct vmx_thread_info, host_rbx));
+ASSYM(VTI_HOST_RBP, offsetof(struct vmx_thread_info, host_rbp));
+ASSYM(VTI_HOST_R10, offsetof(struct vmx_thread_info, host_r10));
+ASSYM(VTI_HOST_R11, offsetof(struct vmx_thread_info, host_r11));
+ASSYM(VTI_HOST_R12, offsetof(struct vmx_thread_info, host_r12));
+ASSYM(VTI_HOST_R13, offsetof(struct vmx_thread_info, host_r13));
+ASSYM(VTI_HOST_R14, offsetof(struct vmx_thread_info, host_r14));
+ASSYM(VTI_HOST_R15, offsetof(struct vmx_thread_info, host_r15));
+ASSYM(VTI_HOST_RSP, offsetof(struct vmx_thread_info, host_rsp));
+
+ASSYM(VM_SUCCEED, VM_SUCCEED);
+ASSYM(VM_FAIL_INVALID, VM_FAIL_INVALID);
+ASSYM(VM_FAIL_VALID, VM_FAIL_VALID);
+ASSYM(VM_EXIT, VM_EXIT);
diff --git a/sys/platform/pc64/vmm/vmx_instr.h b/sys/platform/pc64/vmm/vmx_instr.h
new file mode 100644 (file)
index 0000000..04d63ff
--- /dev/null
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Mihai Carabas <mihai.carabas@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _VMM_VMX_INSTR_H_
+#define _VMM_VMX_INSTR_H_
+
+#include <vm/pmap.h>
+
+/*
+ * Chapter 30 VMX Instruction Reference
+ * Section 30.3 "Conventions"
+ * from Intel Architecture Manual 3C.
+ */
+#define        VM_SUCCEED              0
+#define        VM_FAIL_INVALID         1
+#define        VM_FAIL_VALID           2
+#define        VM_EXIT                 3
+
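+/*
+ * Decode the RFLAGS convention used by the VMX instructions
+ * (SDM "Conventions", 30.3): CF=1 means VMfailInvalid, ZF=1 means
+ * VMfailValid (an error code is in VMCS_INSTR_ERR), both clear
+ * means VMsucceed.
+ */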
+#define        GET_ERROR_CODE                          \
+       "               jnc 1f;"                \
+       "               mov $1, %[err];"        \
+       "               jmp 4f;"                \
+       "1:             jnz 3f;"                \
+       "               mov $2, %[err];"        \
+       "               jmp 4f;"                \
+       "3:             mov $0, %[err];"        \
+       "4:"
+
+static inline int
+vmxon(char *vmx_region)
+{
+       int err;
+       uint64_t paddr;
+
+       paddr = vtophys(vmx_region);
+       __asm __volatile("vmxon %[paddr];"
+                        GET_ERROR_CODE
+                        : [err] "=r" (err)
+                        : [paddr] "m" (paddr)
+                        : "memory");
+
+       return err;
+}
+
+static inline void
+vmxoff(void)
+{
+
+       __asm __volatile("vmxoff");
+}
+
+static inline int
+vmclear(char *vmcs_region)
+{
+       int err;
+       uint64_t paddr;
+
+       paddr = vtophys(vmcs_region);
+       __asm __volatile("vmclear %[paddr];"
+                        GET_ERROR_CODE
+                        : [err] "=r" (err)
+                        : [paddr] "m" (paddr)
+                        : "memory");
+       return err;
+}
+
+static inline void
+vmptrst(uint64_t *addr)
+{
+
+       __asm __volatile("vmptrst %[addr]"
+                       :
+                       : [addr] "m" (*addr)
+                       : "memory");
+}
+
+static inline int
+vmptrld(char *vmcs)
+{
+       int err;
+       uint64_t paddr;
+
+       paddr = vtophys(vmcs);
+       __asm __volatile("vmptrld %[paddr];"
+                        GET_ERROR_CODE
+                        : [err] "=r" (err)
+                        : [paddr] "m" (paddr)
+                        : "memory");
+       return err;
+}
+
+static inline int
+vmwrite(uint64_t reg, uint64_t val)
+{
+       int err;
+
+       __asm __volatile("vmwrite %[val], %[reg];"
+                        GET_ERROR_CODE
+                        : [err] "=r" (err)
+                        : [val] "r" (val), [reg] "r" (reg)
+                        : "memory");
+
+       return err;
+}
+
+static inline int
+vmread(uint64_t reg, uint64_t *addr)
+{
+       int err;
+
+       __asm __volatile("vmread %[reg], %[addr];"
+                        GET_ERROR_CODE
+                        : [err] "=r" (err)
+                        : [reg] "r" (reg), [addr] "m" (*addr)
+                        : "memory");
+
+       return err;
+}
+
+static inline int
+invept(uint64_t type, uint64_t *desc_addr)
+{
+       int err;
+
+       __asm __volatile("invept %[desc_addr], %[type];"
+                        GET_ERROR_CODE
+                        : [err] "=r" (err)
+                        : [desc_addr] "m" (*desc_addr), [type] "r" (type)
+                        : "memory");
+       return err;
+}
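+/*
+ * Minimal usage sketch of the wrappers above (illustrative only;
+ * vmx.c performs the real sequence with full error handling and a
+ * revision-stamped VMCS region):
+ *
+ *     if (vmxon(vmxon_region) == VM_SUCCEED) {
+ *             vmclear(vmcs_region);
+ *             vmptrld(vmcs_region);
+ *             vmwrite(VMCS_GUEST_RIP, rip);
+ *     }
+ */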
+
+#endif
diff --git a/sys/platform/pc64/vmm/vmx_trap.s b/sys/platform/pc64/vmm/vmx_trap.s
new file mode 100644 (file)
index 0000000..164a104
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Mihai Carabas <mihai.carabas@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <machine/asmacros.h>
+#include "vmx_assym.h"
+
+#define        VMX_RESTORE_GUEST(reg)                  \
+       movq VTI_GUEST_CR2(reg),%rsi;           \
+       movq %rsi,%cr2;                         \
+       movq VTI_GUEST_RAX(reg),%rax;           \
+       movq VTI_GUEST_RBX(reg),%rbx;           \
+       movq VTI_GUEST_RCX(reg),%rcx;           \
+       movq VTI_GUEST_RDX(reg),%rdx;           \
+       movq VTI_GUEST_RSI(reg),%rsi;           \
+       movq VTI_GUEST_RDI(reg),%rdi;           \
+       movq VTI_GUEST_RBP(reg),%rbp;           \
+       movq VTI_GUEST_R8(reg),%r8;             \
+       movq VTI_GUEST_R9(reg),%r9;             \
+       movq VTI_GUEST_R10(reg),%r10;           \
+       movq VTI_GUEST_R11(reg),%r11;           \
+       movq VTI_GUEST_R12(reg),%r12;           \
+       movq VTI_GUEST_R13(reg),%r13;           \
+       movq VTI_GUEST_R14(reg),%r14;           \
+       movq VTI_GUEST_R15(reg),%r15;           \
+
+#define        VMX_SAVE_GUEST(reg)                     \
+       movq %rax,VTI_GUEST_RAX(reg);           \
+       movq %rbx,VTI_GUEST_RBX(reg);           \
+       movq %rcx,VTI_GUEST_RCX(reg);           \
+       movq %rdx,VTI_GUEST_RDX(reg);           \
+       movq %rsi,VTI_GUEST_RSI(reg);           \
+       movq %rdi,VTI_GUEST_RDI(reg);           \
+       movq %rbp,VTI_GUEST_RBP(reg);           \
+       movq %r8,VTI_GUEST_R8(reg);             \
+       movq %r9,VTI_GUEST_R9(reg);             \
+       movq %r10,VTI_GUEST_R10(reg);           \
+       movq %r11,VTI_GUEST_R11(reg);           \
+       movq %r12,VTI_GUEST_R12(reg);           \
+       movq %r13,VTI_GUEST_R13(reg);           \
+       movq %r14,VTI_GUEST_R14(reg);           \
+       movq %r15,VTI_GUEST_R15(reg);           \
+       movq %cr2, %rsi;                        \
+       movq %rsi, VTI_GUEST_CR2(reg);
+
+#define        VMX_RUN_ERROR(dst_reg)                  \
+       jnc     1f;                             \
+       movq    $VM_FAIL_INVALID,dst_reg;       \
+       jmp     3f;                             \
+1:     jnz     2f;                             \
+       movq    $VM_FAIL_VALID,dst_reg;         \
+       jmp     3f;                             \
+2:     movq    $VM_SUCCEED,dst_reg;            \
+3:
+
+.text
+
+/*
+ * Called by the hardware VMM when a VMEXIT occurs.
+ * - save the guest context and restore the host context
+ * - return to handle_vmx_vmexit() in vmx.c with
+ *   ret = VM_EXIT
+ *
+ * void vmx_vmexit(void)
+ * %rsp = vmx_thread_info
+ */
+ENTRY(vmx_vmexit)
+
+       VMX_SAVE_GUEST(%rsp)
+
+       movq    %rsp,%rdi
+
+       movq VTI_HOST_RBX(%rdi),%rbx
+       movq VTI_HOST_RBP(%rdi),%rbp
+       movq VTI_HOST_R12(%rdi),%r12
+       movq VTI_HOST_R13(%rdi),%r13
+       movq VTI_HOST_R14(%rdi),%r14
+       movq VTI_HOST_R15(%rdi),%r15
+       movq VTI_HOST_RSP(%rdi),%rsp
+
+       movq $VM_EXIT, %rax
+
+       ret
+END(vmx_vmexit)
+
+/*
+ * Called the first time the VMM is entered.
+ * - on success, "vmlaunch" does not return here;
+ *   execution starts at the guest RIP loaded
+ *   from VMCS_GUEST_RIP
+ * - on failure, it returns immediately with the
+ *   appropriate error code
+ *
+ * int vmx_launch(struct vmx_thread_info* vti)
+ * %rdi = vti
+ */
+ENTRY(vmx_launch)
+       movq %rbx,VTI_HOST_RBX(%rdi)
+       movq %rbp,VTI_HOST_RBP(%rdi)
+       movq %r12,VTI_HOST_R12(%rdi)
+       movq %r13,VTI_HOST_R13(%rdi)
+       movq %r14,VTI_HOST_R14(%rdi)
+       movq %r15,VTI_HOST_R15(%rdi)
+       movq %rsp,VTI_HOST_RSP(%rdi)
+
+       movq %rdi,%rsp
+
+       VMX_RESTORE_GUEST(%rsp)
+
+       vmlaunch
+
+       VMX_RUN_ERROR(%rax)
+
+       movq    %rsp,%rdi
+
+       movq VTI_HOST_RBX(%rdi),%rbx
+       movq VTI_HOST_RBP(%rdi),%rbp
+       movq VTI_HOST_R12(%rdi),%r12
+       movq VTI_HOST_R13(%rdi),%r13
+       movq VTI_HOST_R14(%rdi),%r14
+       movq VTI_HOST_R15(%rdi),%r15
+       movq VTI_HOST_RSP(%rdi),%rsp
+
+       ret
+END(vmx_launch)
+
+/*
+ * Called on every VMM entry after vmlaunch has
+ * already been executed.
+ * - on success, "vmresume" does not return here;
+ *   execution starts at the guest RIP loaded
+ *   from VMCS_GUEST_RIP
+ * - on failure, it returns immediately with the
+ *   appropriate error code
+ *
+ * int vmx_resume(struct vmx_thread_info* vti)
+ * %rdi = vti
+ */
+ENTRY(vmx_resume)
+       movq %rbx,VTI_HOST_RBX(%rdi)
+       movq %rbp,VTI_HOST_RBP(%rdi)
+       movq %r12,VTI_HOST_R12(%rdi)
+       movq %r13,VTI_HOST_R13(%rdi)
+       movq %r14,VTI_HOST_R14(%rdi)
+       movq %r15,VTI_HOST_R15(%rdi)
+       movq %rsp,VTI_HOST_RSP(%rdi)
+
+       movq %rdi,%rsp
+
+       VMX_RESTORE_GUEST(%rsp)
+
+       vmresume
+
+       VMX_RUN_ERROR(%rax)
+
+       movq    %rsp,%rdi
+
+       movq VTI_HOST_RBX(%rdi),%rbx
+       movq VTI_HOST_RBP(%rdi),%rbp
+       movq VTI_HOST_R12(%rdi),%r12
+       movq VTI_HOST_R13(%rdi),%r13
+       movq VTI_HOST_R14(%rdi),%r14
+       movq VTI_HOST_R15(%rdi),%r15
+       movq VTI_HOST_RSP(%rdi),%rsp
+
+       ret
+END(vmx_resume)
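+
+/*
+ * Illustrative sketch of a host-side caller in C: vmx_launch() is
+ * used for the first entry and vmx_resume() afterwards.  Both return
+ * VM_EXIT (via vmx_vmexit()) when the guest exits, or a VM_FAIL_*
+ * code when the entry instruction itself fails.  The wrapper name
+ * below is a placeholder; the real dispatch lives in vmx.c.
+ *
+ *	static int
+ *	vmx_run(struct vmx_thread_info *vti, int launched)
+ *	{
+ *		int ret;
+ *
+ *		ret = launched ? vmx_resume(vti) : vmx_launch(vti);
+ *		if (ret == VM_EXIT)
+ *			return 0;
+ *		return ret;
+ *	}
+ */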
diff --git a/sys/platform/pc64/vmm/vmx_vmcs.h b/sys/platform/pc64/vmm/vmx_vmcs.h
new file mode 100644 (file)
index 0000000..80478b4
--- /dev/null
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Mihai Carabas <mihai.carabas@gmail.com>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _VMM_VMX_VMCS_H_
+#define _VMM_VMX_VMCS_H_
+
+/*
+ * Appendix B: Field Encoding in VMCS
+ * Intel Architecture Manual Vol3C - 326019
+ */
+
+/* 16-Bit Control Fields */
+#define                VMCS_VPID                       0x00000000
+#define                VMCS_PINV                       0x00000002
+#define                VMCS_EPTP_INDEX                 0x00000004
+
+/* 16-Bit Guest-State Fields */
+#define                VMCS_GUEST_ES_SELECTOR          0x00000800
+#define                VMCS_GUEST_CS_SELECTOR          0x00000802
+#define                VMCS_GUEST_SS_SELECTOR          0x00000804
+#define                VMCS_GUEST_DS_SELECTOR          0x00000806
+#define                VMCS_GUEST_FS_SELECTOR          0x00000808
+#define                VMCS_GUEST_GS_SELECTOR          0x0000080A
+#define                VMCS_GUEST_LDTR_SELECTOR        0x0000080C
+#define                VMCS_GUEST_TR_SELECTOR          0x0000080E
+#define                VMCS_GUEST_INTERRUPT_STATUS     0x00000810
+
+/* 16-Bit Host-State Fields */
+#define                VMCS_HOST_ES_SELECTOR           0x00000C00
+#define                VMCS_HOST_CS_SELECTOR           0x00000C02
+#define                VMCS_HOST_SS_SELECTOR           0x00000C04
+#define                VMCS_HOST_DS_SELECTOR           0x00000C06
+#define                VMCS_HOST_FS_SELECTOR           0x00000C08
+#define                VMCS_HOST_GS_SELECTOR           0x00000C0A
+#define                VMCS_HOST_TR_SELECTOR           0x00000C0C
+
+/* 64-Bit Control Fields */
+#define                VMCS_IO_BITMAP_A                0x00002000
+#define                VMCS_IO_BITMAP_B                0x00002002
+#define                VMCS_MSR_BITMAP                 0x00002004
+#define                VMCS_VMEXIT_MSR_STORE           0x00002006
+#define                VMCS_VMEXIT_MSR_LOAD            0x00002008
+#define                VMCS_VMENTRY_MSR_LOAD           0x0000200A
+#define                VMCS_EXECUTIVE_VMCS             0x0000200C
+#define                VMCS_TSC_OFFSET                 0x00002010
+#define                VMCS_VIRTUAL_APIC               0x00002012
+#define                VMCS_APIC_ACCESS                0x00002014
+#define                VMCS_POSTED_INTERRUPT_DESCR     0x00002016
+#define                VMCS_VMFUNCTION_CONTROLS        0x00002018
+#define                VMCS_EPTP                       0x0000201A
+
+/* 64-Bit Read-Only Data Field */
+#define                VMCS_GUEST_PHYSICAL_ADDRESS     0x00002400
+
+/* 64-Bit Guest-State Fields */
+#define                VMCS_LINK_POINTER               0x00002800
+#define                VMCS_GUEST_IA32_DEBUGCTL        0x00002802
+#define                VMCS_GUEST_IA32_PAT             0x00002804
+#define                VMCS_GUEST_IA32_EFER            0x00002806
+#define                VMCS_GUEST_IA32_PERF_GLOBAL     0x00002808
+#define                VMCS_GUEST_PDPTE0               0x0000280A
+#define                VMCS_GUEST_PDPTE1               0x0000280C
+#define                VMCS_GUEST_PDPTE2               0x0000280E
+#define                VMCS_GUEST_PDPTE3               0x00002810
+
+/* 64-Bit Host-State Fields */
+#define                VMCS_HOST_IA32_PAT              0x00002C00
+#define                VMCS_HOST_IA32_EFER             0x00002C02
+#define                VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
+
+/* 32-Bit Control Fields */
+#define                VMCS_PINBASED_CTLS              0x00004000
+#define                VMCS_PROCBASED_CTLS             0x00004002
+#define                VMCS_EXCEPTION_BITMAP           0x00004004
+#define                VMCS_PAGE_FAULT_ERR_MASK        0x00004006
+#define                VMCS_PAGE_FAULT_ERR_MATCH       0x00004008
+#define                VMCS_CR3_TARGET_COUNT           0x0000400A
+#define                VMCS_VMEXIT_CTLS                0x0000400C
+#define                VMCS_VMEXIT_MSR_STORE_COUNT     0x0000400E
+#define                VMCS_VMEXIT_MSR_LOAD_COUNT      0x00004010
+#define                VMCS_VMENTRY_CTLS               0x00004012
+#define                VMCS_VMENTRY_MSR_LOAD_COUNT     0x00004014
+#define                VMCS_VMENTRY_INTR_INFO          0x00004016
+#define                VMCS_VMENTRY_EXCEPTION_ERR      0x00004018
+#define                VMCS_VMENTRY_INSTR_LENGTH       0x0000401A
+#define                VMCS_TPR_THRESHOLD              0x0000401C
+#define                VMCS_PROCBASED2_CTLS            0x0000401E
+#define                VMCS_PLE_GAP                    0x00004020
+#define                VMCS_PLE_WINDOW                 0x00004022
+
+/* 32-Bit Read-Only Data Fields */
+#define                VMCS_INSTR_ERR                  0x00004400
+#define                VMCS_VMEXIT_REASON              0x00004402
+#define                VMCS_BASIC_EXIT_REASON(val)     ((val) & 0xffff)
+#define                VMCS_VMEXIT_INTERRUPTION_INFO   0x00004404
+#define                VMCS_VMEXIT_INTERRUPTION_ERROR  0x00004406
+#define                VMCS_IDT_VECTORING_INFO         0x00004408
+#define                VMCS_IDT_VECTORING_ERR          0x0000440A
+#define                VMCS_VMEXIT_INSTRUCTION_LENGTH  0x0000440C
+#define                VMCS_VMEXIT_INSTRUCTION_INFO    0x0000440E
+
+/* 32-Bit Guest-State Fields */
+#define                VMCS_GUEST_ES_LIMIT             0x00004800
+#define                VMCS_GUEST_CS_LIMIT             0x00004802
+#define                VMCS_GUEST_SS_LIMIT             0x00004804
+#define                VMCS_GUEST_DS_LIMIT             0x00004806
+#define                VMCS_GUEST_FS_LIMIT             0x00004808
+#define                VMCS_GUEST_GS_LIMIT             0x0000480A
+#define                VMCS_GUEST_LDTR_LIMIT           0x0000480C
+#define                VMCS_GUEST_TR_LIMIT             0x0000480E
+#define                VMCS_GUEST_GDTR_LIMIT           0x00004810
+#define                VMCS_GUEST_IDTR_LIMIT           0x00004812
+#define                VMCS_GUEST_ES_ACCESS_RIGHTS     0x00004814
+#define                VMCS_GUEST_CS_ACCESS_RIGHTS     0x00004816
+#define                VMCS_GUEST_SS_ACCESS_RIGHTS     0x00004818
+#define                VMCS_GUEST_DS_ACCESS_RIGHTS     0x0000481A
+#define                VMCS_GUEST_FS_ACCESS_RIGHTS     0x0000481C
+#define                VMCS_GUEST_GS_ACCESS_RIGHTS     0x0000481E
+#define                VMCS_GUEST_LDTR_ACCESS_RIGHTS   0x00004820
+#define                VMCS_GUEST_TR_ACCESS_RIGHTS     0x00004822
+#define                VMCS_GUEST_INTERRUPTIBILITY     0x00004824
+#define                VMCS_GUEST_ACTIVITY             0x00004826
+#define                VMCS_GUEST_SMBASE               0x00004828
+#define                VMCS_GUEST_IA32_SYSENTER_CS     0x0000482A
+#define                VMCS_PREEMPTION_TIMER_VALUE     0x0000482E
+
+/* 32-Bit Host-State Field */
+#define                VMCS_HOST_IA32_SYSENTER_CS      0x00004C00
+
+/* Natural-Width Control Fields */
+#define                VMCS_CR0_MASK                   0x00006000
+#define                VMCS_CR4_MASK                   0x00006002
+#define                VMCS_CR0_SHADOW                 0x00006004
+#define                VMCS_CR4_SHADOW                 0x00006006
+#define                VMCS_CR3_TARGET_VALUE0          0x00006008
+#define                VMCS_CR3_TARGET_VALUE1          0x0000600A
+#define                VMCS_CR3_TARGET_VALUE2          0x0000600C
+#define                VMCS_CR3_TARGET_VALUE3          0x0000600E
+
+/* Natural-Width Read-Only Data Fields */
+#define                VMCS_EXIT_QUALIFICATION         0x00006400
+#define                VMCS_IO_RCX                     0x00006402
+#define                VMCS_IO_RSI                     0x00006404
+#define                VMCS_IO_RDI                     0x00006406
+#define                VMCS_IO_RIP                     0x00006408
+#define                VMCS_GUEST_LINEAR_ADDRESS       0x0000640A
+
+/* Natural-Width Guest-State Fields */
+#define                VMCS_GUEST_CR0                  0x00006800
+#define                VMCS_GUEST_CR3                  0x00006802
+#define                VMCS_GUEST_CR4                  0x00006804
+#define                VMCS_GUEST_ES_BASE              0x00006806
+#define                VMCS_GUEST_CS_BASE              0x00006808
+#define                VMCS_GUEST_SS_BASE              0x0000680A
+#define                VMCS_GUEST_DS_BASE              0x0000680C
+#define                VMCS_GUEST_FS_BASE              0x0000680E
+#define                VMCS_GUEST_GS_BASE              0x00006810
+#define                VMCS_GUEST_LDTR_BASE            0x00006812
+#define                VMCS_GUEST_TR_BASE              0x00006814
+#define                VMCS_GUEST_GDTR_BASE            0x00006816
+#define                VMCS_GUEST_IDTR_BASE            0x00006818
+#define                VMCS_GUEST_DR7                  0x0000681A
+#define                VMCS_GUEST_RSP                  0x0000681C
+#define                VMCS_GUEST_RIP                  0x0000681E
+#define                VMCS_GUEST_RFLAGS               0x00006820
+#define                VMCS_GUEST_PENDING_DBG_EXCEPT   0x00006822
+#define                VMCS_GUEST_IA32_SYSENTER_ESP    0x00006824
+#define                VMCS_GUEST_IA32_SYSENTER_EIP    0x00006826
+
+/* Natural-Width Host-State Fields */
+#define                VMCS_HOST_CR0                   0x00006C00
+#define                VMCS_HOST_CR3                   0x00006C02
+#define                VMCS_HOST_CR4                   0x00006C04
+#define                VMCS_HOST_FS_BASE               0x00006C06
+#define                VMCS_HOST_GS_BASE               0x00006C08
+#define                VMCS_HOST_TR_BASE               0x00006C0A
+#define                VMCS_HOST_GDTR_BASE             0x00006C0C
+#define                VMCS_HOST_IDTR_BASE             0x00006C0E
+#define                VMCS_HOST_IA32_SYSENTER_ESP     0x00006C10
+#define                VMCS_HOST_IA32_SYSENTER_EIP     0x00006C12
+#define                VMCS_HOST_RSP                   0x00006C14
+#define                VMCS_HOST_RIP                   0x00006C16
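+
+/*
+ * Illustrative sketch: a VMEXIT handler consumes these encodings
+ * through the vmread() helper, for example (error handling elided,
+ * the dispatch body is a placeholder):
+ *
+ *	uint64_t reason, qual;
+ *
+ *	vmread(VMCS_VMEXIT_REASON, &reason);
+ *	vmread(VMCS_EXIT_QUALIFICATION, &qual);
+ *	switch (VMCS_BASIC_EXIT_REASON(reason)) {
+ *	...
+ *	}
+ */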
+
+#endif
index 9b396ce..50750f6 100644 (file)
@@ -265,7 +265,7 @@ db_write_bytes(vm_offset_t addr, size_t size, char *data)
 
            ptep0 = pmap_kpte(addr);
            oldmap0 = *ptep0;
-           *ptep0 |= VPTE_W;
+           *ptep0 |= VPTE_RW;
 
            /* Map another page if the data crosses a page boundary. */
            if ((*ptep0 & PG_PS) == 0) {
@@ -273,14 +273,14 @@ db_write_bytes(vm_offset_t addr, size_t size, char *data)
                if (trunc_page(addr) != addr1) {
                    ptep1 = pmap_kpte(addr1);
                    oldmap1 = *ptep1;
-                   *ptep1 |= VPTE_W;
+                   *ptep1 |= VPTE_RW;
                }
            } else {
                addr1 = trunc_4mpage(addr + size - 1);
                if (trunc_4mpage(addr) != addr1) {
                    ptep1 = pmap_kpte(addr1);
                    oldmap1 = *ptep1;
-                   *ptep1 |= VPTE_W;
+                   *ptep1 |= VPTE_RW;
                }
            }
 
index 12f860e..c7a1892 100644 (file)
@@ -185,6 +185,8 @@ ASSYM(TD_MACH, offsetof(struct thread, td_mach));
 ASSYM(TD_WCHAN, offsetof(struct thread, td_wchan));
 ASSYM(TD_NEST_COUNT, offsetof(struct thread, td_nest_count));
 ASSYM(TD_FLAGS, offsetof(struct thread, td_flags));
+ASSYM(TD_TYPE, offsetof(struct thread, td_type));
+
 ASSYM(TD_SAVEFPU, offsetof(struct thread, td_savefpu));
 ASSYM(TDF_RUNNING, TDF_RUNNING);
 ASSYM(TDF_USINGFP, TDF_USINGFP);
index e5536fe..c47dee1 100644 (file)
@@ -1475,7 +1475,10 @@ getmemsize(caddr_t kmdp, u_int64_t first)
                        /*
                         * map page into kernel: valid, read/write,non-cacheable
                         */
-                       *pte = pa | PG_V | PG_RW | PG_N;
+                       *pte = pa |
+                           kernel_pmap.pmap_bits[PG_V_IDX] |
+                           kernel_pmap.pmap_bits[PG_RW_IDX] |
+                           kernel_pmap.pmap_bits[PG_N_IDX];
                        cpu_invltlb();
 
                        tmp = *ptr;
index 12950c6..744d3a7 100644 (file)
@@ -223,11 +223,12 @@ minidumpsys(struct dumperinfo *di)
                 */
                i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
                ptesize += PAGE_SIZE;
-               if ((pdp[i] & PG_V) == 0)
+               if ((pdp[i] & kernel_pmap.pmap_bits[PG_V_IDX]) == 0)
                        continue;
                pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
                j = ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
-               if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
+               if ((pd[j] & (kernel_pmap.pmap_bits[PG_PS_IDX] | kernel_pmap.pmap_bits[PG_V_IDX])) ==
+                   (kernel_pmap.pmap_bits[PG_PS_IDX] | kernel_pmap.pmap_bits[PG_V_IDX]))  {
                        /* This is an entire 2M page. */
                        pa = pd[j] & PG_PS_FRAME;
                        for (k = 0; k < NPTEPG; k++) {
@@ -237,11 +238,11 @@ minidumpsys(struct dumperinfo *di)
                        }
                        continue;
                }
-               if ((pd[j] & PG_V) == PG_V) {
+               if ((pd[j] & kernel_pmap.pmap_bits[PG_V_IDX]) == kernel_pmap.pmap_bits[PG_V_IDX]) {
                        /* set bit for each valid page in this 2MB block */
                        pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
                        for (k = 0; k < NPTEPG; k++) {
-                               if ((pt[k] & PG_V) == PG_V) {
+                               if ((pt[k] & kernel_pmap.pmap_bits[PG_V_IDX]) == kernel_pmap.pmap_bits[PG_V_IDX]) {
                                        pa = pt[k] & PG_FRAME;
                                        if (is_dumpable(pa))
                                                dump_add_page(pa);
@@ -334,7 +335,7 @@ minidumpsys(struct dumperinfo *di)
                 * We always write a page, even if it is zero
                 */
                i = (va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1);
-               if ((pdp[i] & PG_V) == 0) {
+               if ((pdp[i] & kernel_pmap.pmap_bits[PG_V_IDX]) == 0) {
                        bzero(fakept, sizeof(fakept));
                        error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
                        if (error)
@@ -347,11 +348,16 @@ minidumpsys(struct dumperinfo *di)
                }
                pd = (uint64_t *)PHYS_TO_DMAP(pdp[i] & PG_FRAME);
                j = ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1));
-               if ((pd[j] & (PG_PS | PG_V)) == (PG_PS | PG_V))  {
+               if ((pd[j] & (kernel_pmap.pmap_bits[PG_PS_IDX] | kernel_pmap.pmap_bits[PG_V_IDX])) ==
+                   (kernel_pmap.pmap_bits[PG_PS_IDX] | kernel_pmap.pmap_bits[PG_V_IDX]))  {
                        /* This is a single 2M block. Generate a fake PTP */
                        pa = pd[j] & PG_PS_FRAME;
                        for (k = 0; k < NPTEPG; k++) {
-                               fakept[k] = (pa + (k * PAGE_SIZE)) | PG_V | PG_RW | PG_A | PG_M;
+                               fakept[k] = (pa + (k * PAGE_SIZE)) |
+                                   kernel_pmap.pmap_bits[PG_V_IDX] |
+                                   kernel_pmap.pmap_bits[PG_RW_IDX] |
+                                   kernel_pmap.pmap_bits[PG_A_IDX] |
+                                   kernel_pmap.pmap_bits[PG_M_IDX];
                        }
                        error = blk_write(di, (char *)&fakept, 0, PAGE_SIZE);
                        if (error)
@@ -362,7 +368,7 @@ minidumpsys(struct dumperinfo *di)
                                goto fail;
                        continue;
                }
-               if ((pd[j] & PG_V) == PG_V) {
+               if ((pd[j] & kernel_pmap.pmap_bits[PG_V_IDX]) == kernel_pmap.pmap_bits[PG_V_IDX]) {
                        pt = (uint64_t *)PHYS_TO_DMAP(pd[j] & PG_FRAME);
                        error = blk_write(di, (char *)pt, 0, PAGE_SIZE);
                        if (error)
index 14efb40..02f0b3a 100644 (file)
@@ -367,15 +367,22 @@ start_all_aps(u_int boot_addr)
        for (i = 0; i < 512; i++) {
                /* Each slot of the level 4 pages points to the same level 3 page */
                pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + PAGE_SIZE);
-               pt4[i] |= PG_V | PG_RW | PG_U;
+               pt4[i] |= kernel_pmap.pmap_bits[PG_V_IDX] |
+                   kernel_pmap.pmap_bits[PG_RW_IDX] |
+                   kernel_pmap.pmap_bits[PG_U_IDX];
 
                /* Each slot of the level 3 pages points to the same level 2 page */
                pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables + (2 * PAGE_SIZE));
-               pt3[i] |= PG_V | PG_RW | PG_U;
+               pt3[i] |= kernel_pmap.pmap_bits[PG_V_IDX] |
+                   kernel_pmap.pmap_bits[PG_RW_IDX] |
+                   kernel_pmap.pmap_bits[PG_U_IDX];
 
                /* The level 2 page slots are mapped with 2MB pages for 1GB. */
                pt2[i] = i * (2 * 1024 * 1024);
-               pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+               pt2[i] |= kernel_pmap.pmap_bits[PG_V_IDX] |
+                   kernel_pmap.pmap_bits[PG_RW_IDX] |
+                   kernel_pmap.pmap_bits[PG_PS_IDX] |
+                   kernel_pmap.pmap_bits[PG_U_IDX];
        }
 
        /* save the current value of the warm-start vector */
index b94a1ca..5e57ef1 100644 (file)
 #include "opt_msgbuf.h"
 
 #include <sys/param.h>
-#include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
 #include <sys/msgbuf.h>
 #include <sys/vmmeter.h>
 #include <sys/mman.h>
+#include <sys/systm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
  */
 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
 
-#define pmap_pde_v(pte)                ((*(pd_entry_t *)pte & PG_V) != 0)
-#define pmap_pte_w(pte)                ((*(pt_entry_t *)pte & PG_W) != 0)
-#define pmap_pte_m(pte)                ((*(pt_entry_t *)pte & PG_M) != 0)
-#define pmap_pte_u(pte)                ((*(pt_entry_t *)pte & PG_A) != 0)
-#define pmap_pte_v(pte)                ((*(pt_entry_t *)pte & PG_V) != 0)
+#define pmap_pde_v(pmap, pte)          ((*(pd_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
+#define pmap_pte_w(pmap, pte)          ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_W_IDX]) != 0)
+#define pmap_pte_m(pmap, pte)          ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_M_IDX]) != 0)
+#define pmap_pte_u(pmap, pte)          ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_U_IDX]) != 0)
+#define pmap_pte_v(pmap, pte)          ((*(pt_entry_t *)pte & pmap->pmap_bits[PG_V_IDX]) != 0)
 
 /*
  * Given a map and a machine independent protection code,
  * convert to a vax protection code.
  */
 #define pte_prot(m, p)         \
-       (protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
-static int protection_codes[8];
+       (m->protection_codes[p & (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)])
+static int protection_codes[PROTECTION_CODES_SIZE];
 
 struct pmap kernel_pmap;
 static TAILQ_HEAD(,pmap)       pmap_list = TAILQ_HEAD_INITIALIZER(pmap_list);
@@ -166,8 +166,8 @@ vm_offset_t KvaStart;               /* VA start of KVA space */
 vm_offset_t KvaEnd;            /* VA end of KVA space (non-inclusive) */
 vm_offset_t KvaSize;           /* max size of kernel virtual address space */
 static boolean_t pmap_initialized = FALSE;     /* Has pmap_init completed? */
-static int pgeflag;            /* PG_G or-in */
-static int pseflag;            /* PG_PS or-in */
+//static int pgeflag;          /* PG_G or-in */
+//static int pseflag;          /* PG_PS or-in */
 uint64_t PatMsr;
 
 static int ndmpdp;
@@ -175,7 +175,6 @@ static vm_paddr_t dmaplimit;
 static int nkpt;
 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
 
-#define PAT_INDEX_SIZE  8
 static pt_entry_t pat_pte_index[PAT_INDEX_SIZE];       /* PAT -> PG_ bits */
 /*static pt_entry_t pat_pde_index[PAT_INDEX_SIZE];*/   /* PAT -> PG_ bits */
 
@@ -207,6 +206,24 @@ caddr_t CADDR1 = NULL, ptvmmap = NULL;
 static pt_entry_t *msgbufmap;
 struct msgbuf *msgbufp=NULL;
 
+/*
+ * PMAP default PG_* bits. Needed to be able to add
+ * EPT/NPT pagetable pmap_bits for the VMM module
+ */
+uint64_t pmap_bits_default[] = {
+               REGULAR_PMAP,                                   /* TYPE_IDX             0 */
+               X86_PG_V,                                       /* PG_V_IDX             1 */
+               X86_PG_RW,                                      /* PG_RW_IDX            2 */
+               X86_PG_U,                                       /* PG_U_IDX             3 */
+               X86_PG_A,                                       /* PG_A_IDX             4 */
+               X86_PG_M,                                       /* PG_M_IDX             5 */
+               X86_PG_PS,                                      /* PG_PS_IDX            6 */
+               X86_PG_G,                                       /* PG_G_IDX             7 */
+               X86_PG_AVAIL1,                                  /* PG_AVAIL1_IDX        8 */
+               X86_PG_AVAIL2,                                  /* PG_AVAIL2_IDX        9 */
+               X86_PG_AVAIL3,                                  /* PG_AVAIL3_IDX        10 */
+               X86_PG_NC_PWT | X86_PG_NC_PCD,                  /* PG_N_IDX     11 */
+};
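+
+/*
+ * Illustrative sketch: a VMM module can install EPT-style bits under
+ * the same indices so the shared pmap code keeps working unchanged
+ * (the EPT_PMAP and EPT_PG_* names here are placeholders):
+ *
+ *	pmap->pmap_bits[TYPE_IDX]  = EPT_PMAP;
+ *	pmap->pmap_bits[PG_V_IDX]  = EPT_PG_READ;
+ *	pmap->pmap_bits[PG_RW_IDX] = EPT_PG_WRITE;
+ *	pmap->pmap_bits[PG_A_IDX]  = EPT_PG_A;
+ *	pmap->pmap_bits[PG_M_IDX]  = EPT_PG_M;
+ */
+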
 /*
  * Crashdump maps.
  */
@@ -222,6 +239,17 @@ SYSCTL_INT(_machdep, OID_AUTO, pmap_mmu_optimize, CTLFLAG_RW,
 
 #define DISABLE_PSE
 
+/* Standard user access functions */
+extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
+    size_t *lencopied);
+extern int std_copyin (const void *udaddr, void *kaddr, size_t len);
+extern int std_copyout (const void *kaddr, void *udaddr, size_t len);
+extern int std_fubyte (const void *base);
+extern int std_subyte (void *base, int byte);
+extern long std_fuword (const void *base);
+extern int std_suword (void *base, long word);
+extern int std_suword32 (void *base, int word);
+
 static void pv_hold(pv_entry_t pv);
 static int _pv_hold_try(pv_entry_t pv
                                PMAP_DEBUG_DECL);
@@ -263,6 +291,8 @@ static boolean_t pmap_testbit (vm_page_t m, int bit);
 static pt_entry_t * pmap_pte_quick (pmap_t pmap, vm_offset_t va);
 static vm_offset_t pmap_kmem_choose(vm_offset_t addr);
 
+static void pmap_pinit_defaults(struct pmap *pmap);
+
 static unsigned pdir4mb;
 
 static int
@@ -456,7 +486,7 @@ pmap_pd(pmap_t pmap, vm_offset_t va)
        pml4_entry_t *pdp;
 
        pdp = pmap_pdp(pmap, va);
-       if ((*pdp & PG_V) == 0)
+       if ((*pdp & pmap->pmap_bits[PG_V_IDX]) == 0)
                return NULL;
        return (pmap_pdp_to_pd(*pdp, va));
 }
@@ -499,7 +529,7 @@ pmap_pt(pmap_t pmap, vm_offset_t va)
                return (pmap_pd_to_pt(VM_PAGE_TO_PHYS(pv->pv_m), va));
        } else {
                pd = pmap_pd(pmap, va);
-               if (pd == NULL || (*pd & PG_V) == 0)
+               if (pd == NULL || (*pd & pmap->pmap_bits[PG_V_IDX]) == 0)
                         return NULL;
                return (pmap_pd_to_pt(*pd, va));
        }
@@ -528,9 +558,9 @@ pmap_pte(pmap_t pmap, vm_offset_t va)
        pd_entry_t *pt;
 
        pt = pmap_pt(pmap, va);
-       if (pt == NULL || (*pt & PG_V) == 0)
+       if (pt == NULL || (*pt & pmap->pmap_bits[PG_V_IDX]) == 0)
                 return NULL;
-       if ((*pt & PG_PS) != 0)
+       if ((*pt & pmap->pmap_bits[PG_PS_IDX]) != 0)
                return ((pt_entry_t *)pt);
        return (pmap_pt_to_pte(*pt, va));
 }
@@ -538,6 +568,9 @@ pmap_pte(pmap_t pmap, vm_offset_t va)
 /*
  * Of all the layers (PTE, PT, PD, PDP, PML4) the best one to cache is
  * the PT layer.  This will speed up core pmap operations considerably.
+ *
+ * NOTE: Can be called with the pmap spin lock held shared.  pm_pvhint
+ *      race is ok.
  */
 static __inline
 void
@@ -655,7 +688,10 @@ create_pagetables(vm_paddr_t *firstaddr)
         */
        for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
                ((pt_entry_t *)KPTbase)[i] = i << PAGE_SHIFT;
-               ((pt_entry_t *)KPTbase)[i] |= PG_RW | PG_V | PG_G;
+               ((pt_entry_t *)KPTbase)[i] |=
+                   pmap_bits_default[PG_RW_IDX] |
+                   pmap_bits_default[PG_V_IDX] |
+                   pmap_bits_default[PG_G_IDX];
        }
 
        /*
@@ -666,11 +702,15 @@ create_pagetables(vm_paddr_t *firstaddr)
         */
        for (i = 0; i < nkpt_base; i++) {
                ((pd_entry_t *)KPDbase)[i] = KPTbase + (i << PAGE_SHIFT);
-               ((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V;
+               ((pd_entry_t *)KPDbase)[i] |=
+                   pmap_bits_default[PG_RW_IDX] |
+                   pmap_bits_default[PG_V_IDX];
        }
        for (i = 0; i < nkpt_phys; i++) {
                ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
-               ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
+               ((pd_entry_t *)KPDphys)[i] |=
+                   pmap_bits_default[PG_RW_IDX] |
+                   pmap_bits_default[PG_V_IDX];
        }
 
        /*
@@ -680,7 +720,11 @@ create_pagetables(vm_paddr_t *firstaddr)
         */
        for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
                ((pd_entry_t *)KPDbase)[i] = i << PDRSHIFT;
-               ((pd_entry_t *)KPDbase)[i] |= PG_RW | PG_V | PG_PS | PG_G;
+               ((pd_entry_t *)KPDbase)[i] |=
+                   pmap_bits_default[PG_RW_IDX] |
+                   pmap_bits_default[PG_V_IDX] |
+                   pmap_bits_default[PG_PS_IDX] |
+                   pmap_bits_default[PG_G_IDX];
        }
 
        /*
@@ -691,7 +735,9 @@ create_pagetables(vm_paddr_t *firstaddr)
                ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] =
                                KPDphys + (i << PAGE_SHIFT);
                ((pdp_entry_t *)KPDPphys)[NPDPEPG - NKPDPE + i] |=
-                               PG_RW | PG_V | PG_U;
+                   pmap_bits_default[PG_RW_IDX] |
+                   pmap_bits_default[PG_V_IDX] |
+                   pmap_bits_default[PG_U_IDX];
        }
 
        /*
@@ -704,8 +750,13 @@ create_pagetables(vm_paddr_t *firstaddr)
        if ((amd_feature & AMDID_PAGE1GB) == 0) {
                for (i = 0; i < NPDEPG * ndmpdp; i++) {
                        ((pd_entry_t *)DMPDphys)[i] = i << PDRSHIFT;
-                       ((pd_entry_t *)DMPDphys)[i] |= PG_RW | PG_V | PG_PS |
-                                                      PG_G | PG_M | PG_A;
+                       ((pd_entry_t *)DMPDphys)[i] |=
+                           pmap_bits_default[PG_RW_IDX] |
+                           pmap_bits_default[PG_V_IDX] |
+                           pmap_bits_default[PG_PS_IDX] |
+                           pmap_bits_default[PG_G_IDX] |
+                           pmap_bits_default[PG_M_IDX] |
+                           pmap_bits_default[PG_A_IDX];
                }
 
                /*
@@ -714,35 +765,51 @@ create_pagetables(vm_paddr_t *firstaddr)
                for (i = 0; i < ndmpdp; i++) {
                        ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys +
                                                        (i << PAGE_SHIFT);
-                       ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
+                       ((pdp_entry_t *)DMPDPphys)[i] |=
+                           pmap_bits_default[PG_RW_IDX] |
+                           pmap_bits_default[PG_V_IDX] |
+                           pmap_bits_default[PG_U_IDX];
                }
        } else {
                for (i = 0; i < ndmpdp; i++) {
                        ((pdp_entry_t *)DMPDPphys)[i] =
                                                (vm_paddr_t)i << PDPSHIFT;
-                       ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS |
-                                                        PG_G | PG_M | PG_A;
+                       ((pdp_entry_t *)DMPDPphys)[i] |=
+                           pmap_bits_default[PG_RW_IDX] |
+                           pmap_bits_default[PG_V_IDX] |
+                           pmap_bits_default[PG_PS_IDX] |
+                           pmap_bits_default[PG_G_IDX] |
+                           pmap_bits_default[PG_M_IDX] |
+                           pmap_bits_default[PG_A_IDX];
                }
        }
 
        /* And recursively map PML4 to itself in order to get PTmap */
        ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
-       ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
+       ((pdp_entry_t *)KPML4phys)[PML4PML4I] |=
+           pmap_bits_default[PG_RW_IDX] |
+           pmap_bits_default[PG_V_IDX] |
+           pmap_bits_default[PG_U_IDX];
 
        /*
         * Connect the Direct Map slots up to the PML4
         */
        for (j = 0; j < NDMPML4E; ++j) {
                ((pdp_entry_t *)KPML4phys)[DMPML4I + j] =
-                       (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
-                       PG_RW | PG_V | PG_U;
+                   (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
+                   pmap_bits_default[PG_RW_IDX] |
+                   pmap_bits_default[PG_V_IDX] |
+                   pmap_bits_default[PG_U_IDX];
        }
 
        /*
         * Connect the KVA slot up to the PML4
         */
        ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
-       ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
+       ((pdp_entry_t *)KPML4phys)[KPML4I] |=
+           pmap_bits_default[PG_RW_IDX] |
+           pmap_bits_default[PG_V_IDX] |
+           pmap_bits_default[PG_U_IDX];
 }
 
 /*
@@ -843,12 +910,12 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
         * cases rather then invl1pg.  Actually, I don't even know why it
         * works under UP because self-referential page table mappings
         */
-       pgeflag = 0;
+//     pgeflag = 0;
 
 /*
  * Initialize the 4MB page size flag
  */
-       pseflag = 0;
+//     pseflag = 0;
 /*
  * The 4MB page version of the initial
  * kernel page mapping.
@@ -861,10 +928,14 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
                /*
                 * Note that we have enabled PSE mode
                 */
-               pseflag = PG_PS;
+//             pseflag = kernel_pmap.pmap_bits[PG_PS_IDX];
                ptditmp = *(PTmap + x86_64_btop(KERNBASE));
                ptditmp &= ~(NBPDR - 1);
-               ptditmp |= PG_V | PG_RW | PG_PS | PG_U | pgeflag;
+               ptditmp |= pmap_bits_default[PG_V_IDX] |
+                   pmap_bits_default[PG_RW_IDX] |
+                   pmap_bits_default[PG_PS_IDX] |
+                   pmap_bits_default[PG_U_IDX];
+//                 pgeflag;
                pdir4mb = ptditmp;
        }
 #endif
@@ -872,6 +943,8 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 
        /* Initialize the PAT MSR */
        pmap_init_pat();
+
+       pmap_pinit_defaults(&kernel_pmap);
 }
 
 /*
@@ -898,9 +971,9 @@ pmap_init_pat(void)
                  PAT_VALUE(6, PAT_UNCACHED) |          /* 110 */
                  PAT_VALUE(7, PAT_UNCACHEABLE);        /* 111 */
        pat_pte_index[PAT_WRITE_BACK]   = 0;
-       pat_pte_index[PAT_WRITE_THROUGH]= 0         | PG_NC_PWT;
-       pat_pte_index[PAT_UNCACHED]     = PG_NC_PCD;
-       pat_pte_index[PAT_UNCACHEABLE]  = PG_NC_PCD | PG_NC_PWT;
+       pat_pte_index[PAT_WRITE_THROUGH]= 0         | X86_PG_NC_PWT;
+       pat_pte_index[PAT_UNCACHED]     = X86_PG_NC_PCD;
+       pat_pte_index[PAT_UNCACHEABLE]  = X86_PG_NC_PCD | X86_PG_NC_PWT;
        pat_pte_index[PAT_WRITE_PROTECTED] = pat_pte_index[PAT_UNCACHEABLE];
        pat_pte_index[PAT_WRITE_COMBINING] = pat_pte_index[PAT_UNCACHEABLE];
 
@@ -914,8 +987,8 @@ pmap_init_pat(void)
                          PAT_VALUE(4, PAT_WRITE_PROTECTED);
                pat_msr = (pat_msr & ~PAT_MASK(5)) |
                          PAT_VALUE(5, PAT_WRITE_COMBINING);
-               pat_pte_index[PAT_WRITE_PROTECTED] = PG_PTE_PAT | 0;
-               pat_pte_index[PAT_WRITE_COMBINING] = PG_PTE_PAT | PG_NC_PWT;
+               pat_pte_index[PAT_WRITE_PROTECTED] = X86_PG_PTE_PAT | 0;
+               pat_pte_index[PAT_WRITE_COMBINING] = X86_PG_PTE_PAT | X86_PG_NC_PWT;
 
                /*
                 * Then enable the PAT
@@ -953,7 +1026,7 @@ pmap_init_pat(void)
 void
 pmap_set_opt(void)
 {
-       if (pseflag && (cpu_feature & CPUID_PSE)) {
+       if (cpu_feature & CPUID_PSE) {
                load_cr4(rcr4() | CR4_PSE);
                if (pdir4mb && mycpu->gd_cpuid == 0) {  /* only on BSP */
                        cpu_invltlb();
@@ -1082,13 +1155,13 @@ pmap_extract(pmap_t pmap, vm_offset_t va)
                pd_entry_t *pt;
 
                pt = pmap_pt(pmap, va);
-               if (pt && (*pt & PG_V)) {
-                       if (*pt & PG_PS) {
+               if (pt && (*pt & pmap->pmap_bits[PG_V_IDX])) {
+                       if (*pt & pmap->pmap_bits[PG_PS_IDX]) {
                                rtval = *pt & PG_PS_FRAME;
                                rtval |= va & PDRMASK;
                        } else {
                                ptep = pmap_pt_to_pte(*pt, va);
-                               if (*pt & PG_V) {
+                               if (*pt & pmap->pmap_bits[PG_V_IDX]) {
                                        rtval = *ptep & PG_FRAME;
                                        rtval |= va & PAGE_MASK;
                                }
@@ -1103,7 +1176,7 @@ pmap_extract(pmap_t pmap, vm_offset_t va)
                pt_pv = pv_find(pmap, pmap_pt_pindex(va));
                if (pt_pv) {
                        ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
-                       if (*ptep & PG_V) {
+                       if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
                                rtval = *ptep & PG_FRAME;
                                rtval |= va & PAGE_MASK;
                        }
@@ -1113,6 +1186,57 @@ pmap_extract(pmap_t pmap, vm_offset_t va)
        return rtval;
 }
 
+/*
+ * Similar to extract but checks protections, SMP-friendly short-cut for
+ * vm_fault_page[_quick]().  Can return NULL to cause the caller to
+ * fall-through to the real fault code.
+ *
+ * The returned page, if not NULL, is held (and not busied).
+ */
+vm_page_t
+pmap_fault_page_quick(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
+{
+       if (pmap && va < VM_MAX_USER_ADDRESS) {
+               pv_entry_t pt_pv;
+               pv_entry_t pte_pv;
+               pt_entry_t *ptep;
+               pt_entry_t req;
+               vm_page_t m;
+               int error;
+
+               req = pmap->pmap_bits[PG_V_IDX] |
+                     pmap->pmap_bits[PG_U_IDX];
+               if (prot & VM_PROT_WRITE)
+                       req |= pmap->pmap_bits[PG_RW_IDX];
+
+               pt_pv = pv_find(pmap, pmap_pt_pindex(va));
+               if (pt_pv == NULL)
+                       return (NULL);
+               ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
+               if ((*ptep & req) != req) {
+                       pv_drop(pt_pv);
+                       return (NULL);
+               }
+               pte_pv = pv_get_try(pmap, pmap_pte_pindex(va), &error);
+               if (pte_pv && error == 0) {
+                       m = pte_pv->pv_m;
+                       vm_page_hold(m);
+                       if (prot & VM_PROT_WRITE)
+                               vm_page_dirty(m);
+                       pv_put(pte_pv);
+               } else if (pte_pv) {
+                       pv_drop(pte_pv);
+                       m = NULL;
+               } else {
+                       m = NULL;
+               }
+               pv_drop(pt_pv);
+               return(m);
+       } else {
+               return(NULL);
+       }
+}
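+
+/*
+ * Illustrative usage: callers try the quick path first and fall
+ * through to the full fault path when it returns NULL (the
+ * vm_fault_page() argument list is elided here):
+ *
+ *	m = pmap_fault_page_quick(pmap, va, prot);
+ *	if (m == NULL)
+ *		m = vm_fault_page(...);
+ */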
+
 /*
  * Extract the physical page address associated kernel virtual address.
  */
@@ -1126,7 +1250,7 @@ pmap_kextract(vm_offset_t va)
                pa = DMAP_TO_PHYS(va);
        } else {
                pt = *vtopt(va);
-               if (pt & PG_PS) {
+               if (pt & kernel_pmap.pmap_bits[PG_PS_IDX]) {
                        pa = (pt & PG_PS_FRAME) | (va & PDRMASK);
                } else {
                        /*
@@ -1163,7 +1287,10 @@ pmap_kenter(vm_offset_t va, vm_paddr_t pa)
        pmap_inval_info info;
 
        pmap_inval_init(&info);                         /* XXX remove */
-       npte = pa | PG_RW | PG_V | pgeflag;
+       npte = pa |
+           kernel_pmap.pmap_bits[PG_RW_IDX] |
+           kernel_pmap.pmap_bits[PG_V_IDX];
+//         pgeflag;
        pte = vtopte(va);
        pmap_inval_interlock(&info, &kernel_pmap, va);  /* XXX remove */
        *pte = npte;
@@ -1183,7 +1310,10 @@ pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa)
        pt_entry_t *pte;
        pt_entry_t npte;
 
-       npte = pa | PG_RW | PG_V | pgeflag;
+       npte = pa |
+           kernel_pmap.pmap_bits[PG_RW_IDX] |
+           kernel_pmap.pmap_bits[PG_V_IDX];
+//         pgeflag;
        pte = vtopte(va);
        *pte = npte;
        cpu_invlpg((void *)va);
@@ -1238,16 +1368,18 @@ pmap_kremove_quick(vm_offset_t va)
 void
 pmap_kmodify_rw(vm_offset_t va)
 {
-       atomic_set_long(vtopte(va), PG_RW);
+       atomic_set_long(vtopte(va), kernel_pmap.pmap_bits[PG_RW_IDX]);
        cpu_invlpg((void *)va);
 }
 
+/* NOT USED
 void
 pmap_kmodify_nc(vm_offset_t va)
 {
        atomic_set_long(vtopte(va), PG_N);
        cpu_invlpg((void *)va);
 }
+*/
 
 /*
  * Used to map a range of physical addresses into kernel virtual
@@ -1351,8 +1483,11 @@ pmap_qenter(vm_offset_t va, vm_page_t *m, int count)
                pt_entry_t *pte;
 
                pte = vtopte(va);
-               *pte = VM_PAGE_TO_PHYS(*m) | PG_RW | PG_V |
-                       pat_pte_index[(*m)->pat_mode] | pgeflag;
+               *pte = VM_PAGE_TO_PHYS(*m) |
+                   kernel_pmap.pmap_bits[PG_RW_IDX] |
+                   kernel_pmap.pmap_bits[PG_V_IDX] |
+                   kernel_pmap.pmap_cache_bits[(*m)->pat_mode];
+//             pgeflag;
                cpu_invlpg((void *)va);
                va += PAGE_SIZE;
                m++;
@@ -1406,6 +1541,21 @@ pmap_init_proc(struct proc *p)
 {
 }
 
+static void
+pmap_pinit_defaults(struct pmap *pmap)
+{
+       bcopy(pmap_bits_default, pmap->pmap_bits, sizeof(pmap_bits_default));
+       bcopy(protection_codes, pmap->protection_codes, sizeof(protection_codes));
+       bcopy(pat_pte_index, pmap->pmap_cache_bits, sizeof(pat_pte_index));
+       pmap->pmap_cache_mask = X86_PG_NC_PWT | X86_PG_NC_PCD | X86_PG_PTE_PAT;
+       pmap->copyinstr = std_copyinstr;
+       pmap->copyin = std_copyin;
+       pmap->copyout = std_copyout;
+       pmap->fubyte = std_fubyte;
+       pmap->subyte = std_subyte;
+       pmap->fuword = std_fuword;
+       pmap->suword = std_suword;
+       pmap->suword32 = std_suword32;
+}
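+
+/*
+ * Illustrative sketch: a VMM pmap can later override the standard
+ * vector installed above with EPT-aware routines that walk the
+ * guest tables manually (the ept_* names are placeholders):
+ *
+ *	pmap->copyin  = ept_copyin;
+ *	pmap->copyout = ept_copyout;
+ */
+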
 /*
  * Initialize pmap0/vmspace0.  This pmap is not added to pmap_list because
  * it, and IdlePTD, represents the template used to update all other pmaps.
@@ -1425,6 +1575,7 @@ pmap_pinit0(struct pmap *pmap)
        spin_init(&pmap->pm_spin);
        lwkt_token_init(&pmap->pm_token, "pmap_tok");
        bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
+       pmap_pinit_defaults(pmap);
 }
 
 /*
@@ -1442,6 +1593,8 @@ pmap_pinit_simple(struct pmap *pmap)
        pmap->pm_pvhint = NULL;
        pmap->pm_flags = PMAP_FLAG_SIMPLE;
 
+       pmap_pinit_defaults(pmap);
+
        /*
         * Don't blow up locks/tokens on re-use (XXX fix/use drop code
         * for this).
@@ -1460,6 +1613,12 @@ pmap_pinit(struct pmap *pmap)
        pv_entry_t pv;
        int j;
 
+       if (pmap->pm_pmlpv) {
+               if (pmap->pmap_bits[TYPE_IDX] != REGULAR_PMAP) {
+                       pmap_puninit(pmap);
+               }
+       }
+
        pmap_pinit_simple(pmap);
        pmap->pm_flags &= ~PMAP_FLAG_SIMPLE;
 
@@ -1490,16 +1649,24 @@ pmap_pinit(struct pmap *pmap)
                 */
                for (j = 0; j < NDMPML4E; ++j) {
                        pmap->pm_pml4[DMPML4I + j] =
-                               (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
-                               PG_RW | PG_V | PG_U;
+                           (DMPDPphys + ((vm_paddr_t)j << PML4SHIFT)) |
+                           pmap->pmap_bits[PG_RW_IDX] |
+                           pmap->pmap_bits[PG_V_IDX] |
+                           pmap->pmap_bits[PG_U_IDX];
                }
-               pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
+               pmap->pm_pml4[KPML4I] = KPDPphys |
+                   pmap->pmap_bits[PG_RW_IDX] |
+                   pmap->pmap_bits[PG_V_IDX] |
+                   pmap->pmap_bits[PG_U_IDX];
 
                /*
                 * install self-referential address mapping entry
                 */
                pmap->pm_pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pv->pv_m) |
-                                          PG_V | PG_RW | PG_A | PG_M;
+                   pmap->pmap_bits[PG_V_IDX] |
+                   pmap->pmap_bits[PG_RW_IDX] |
+                   pmap->pmap_bits[PG_A_IDX] |
+                   pmap->pmap_bits[PG_M_IDX];
        } else {
                KKASSERT(pv->pv_m->flags & PG_MAPPED);
                KKASSERT(pv->pv_m->flags & PG_WRITEABLE);
@@ -1744,7 +1911,7 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
         */
        if (pvp) {
                ptep = pv_pte_lookup(pvp, ptepindex);
-               if (*ptep & PG_V) {
+               if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
                        pt_entry_t pte;
                        pmap_inval_info info;
 
@@ -1766,8 +1933,12 @@ pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, pv_entry_t *pvpp)
                } else {
                        vm_page_wire_quick(pvp->pv_m);
                }
-               *ptep = VM_PAGE_TO_PHYS(m) | (PG_U | PG_RW | PG_V |
-                                             PG_A | PG_M);
+               *ptep = VM_PAGE_TO_PHYS(m) |
+                   (pmap->pmap_bits[PG_U_IDX] |
+                   pmap->pmap_bits[PG_RW_IDX] |
+                   pmap->pmap_bits[PG_V_IDX] |
+                   pmap->pmap_bits[PG_A_IDX] |
+                   pmap->pmap_bits[PG_M_IDX]);
        }
        vm_page_wakeup(m);
 notnew:
@@ -1923,7 +2094,11 @@ retry:
         */
        pt = pv_pte_lookup(proc_pd_pv, pmap_pt_index(b));
        npte = VM_PAGE_TO_PHYS(xpv->pv_m) |
-              (PG_U | PG_RW | PG_V | PG_A | PG_M);
+           (pmap->pmap_bits[PG_U_IDX] |
+           pmap->pmap_bits[PG_RW_IDX] |
+           pmap->pmap_bits[PG_V_IDX] |
+           pmap->pmap_bits[PG_A_IDX] |
+           pmap->pmap_bits[PG_M_IDX]);
 
        /*
         * Dispose of previous page table page if it was local to the
@@ -1979,7 +2154,7 @@ retry:
                 * NOTE: replacing valid pte, wire_count on proc_pd_pv
                 * stays the same.
                 */
-               KKASSERT(opte & PG_V);
+               KKASSERT(opte & pmap->pmap_bits[PG_V_IDX]);
                m = PHYS_TO_VM_PAGE(opte & PG_FRAME);
                if (vm_page_unwire_quick(m)) {
                        panic("pmap_allocpte_seg: "
@@ -2205,7 +2380,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
                        gotpvp = 1;
                }
                pdp = &pmap->pm_pml4[pdp_index & ((1ul << NPML4EPGSHIFT) - 1)];
-               KKASSERT((*pdp & PG_V) != 0);
+               KKASSERT((*pdp & pmap->pmap_bits[PG_V_IDX]) != 0);
                p = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
                *pdp = 0;
                KKASSERT(info == NULL);
@@ -2233,7 +2408,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
                if (pvp) {
                        pd = pv_pte_lookup(pvp, pd_index &
                                                ((1ul << NPDPEPGSHIFT) - 1));
-                       KKASSERT((*pd & PG_V) != 0);
+                       KKASSERT((*pd & pmap->pmap_bits[PG_V_IDX]) != 0);
                        p = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
                        *pd = 0;
                } else {
@@ -2259,7 +2434,7 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
                        gotpvp = 1;
                }
                pt = pv_pte_lookup(pvp, pt_index & ((1ul << NPDPEPGSHIFT) - 1));
-               KKASSERT((*pt & PG_V) != 0);
+               KKASSERT((*pt & pmap->pmap_bits[PG_V_IDX]) != 0);
                p = PHYS_TO_VM_PAGE(*pt & PG_FRAME);
                *pt = 0;
                KKASSERT(info == NULL);
@@ -2308,29 +2483,30 @@ pmap_remove_pv_pte(pv_entry_t pv, pv_entry_t pvp, struct pmap_inval_info *info)
                /*
                 * Now update the vm_page_t
                 */
-               if ((pte & (PG_MANAGED|PG_V)) != (PG_MANAGED|PG_V)) {
+               if ((pte & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) !=
+                   (pmap->pmap_bits[PG_MANAGED_IDX]|pmap->pmap_bits[PG_V_IDX])) {
                        kprintf("remove_pte badpte %016lx %016lx %d\n",
                                pte, pv->pv_pindex,
                                pv->pv_pindex < pmap_pt_pindex(0));
                }
                /* PHYS_TO_VM_PAGE() will not work for FICTITIOUS pages */
                /*KKASSERT((pte & (PG_MANAGED|PG_V)) == (PG_MANAGED|PG_V));*/
-               if (pte & PG_DEVICE)
+               if (pte & pmap->pmap_bits[PG_DEVICE_IDX])
                        p = pv->pv_m;
                else
                        p = PHYS_TO_VM_PAGE(pte & PG_FRAME);
                /* p = pv->pv_m; */
 
-               if (pte & PG_M) {
+               if (pte & pmap->pmap_bits[PG_M_IDX]) {
                        if (pmap_track_modified(ptepindex))
                                vm_page_dirty(p);
                }
-               if (pte & PG_A) {
+               if (pte & pmap->pmap_bits[PG_A_IDX]) {
                        vm_page_flag_set(p, PG_REFERENCED);
                }
-               if (pte & PG_W)
+               if (pte & pmap->pmap_bits[PG_W_IDX])
                        atomic_add_long(&pmap->pm_stats.wired_count, -1);
-               if (pte & PG_G)
+               if (pte & pmap->pmap_bits[PG_G_IDX])
                        cpu_invlpg((void *)va);
        }
 
@@ -2392,7 +2568,7 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
        if (kernel_vm_end == 0) {
                kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
                nkpt = 0;
-               while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & PG_V) != 0) {
+               while ((*pmap_pt(&kernel_pmap, kernel_vm_end) & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) {
                        kernel_vm_end = (kernel_vm_end + PAGE_SIZE * NPTEPG) &
                                        ~(PAGE_SIZE * NPTEPG - 1);
                        nkpt++;
@@ -2440,12 +2616,16 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
                                pmap_zero_page(paddr);
                        vm_page_flag_clear(nkpg, PG_ZERO);
                        newpd = (pdp_entry_t)
-                               (paddr | PG_V | PG_RW | PG_A | PG_M);
+                           (paddr |
+                           kernel_pmap.pmap_bits[PG_V_IDX] |
+                           kernel_pmap.pmap_bits[PG_RW_IDX] |
+                           kernel_pmap.pmap_bits[PG_A_IDX] |
+                           kernel_pmap.pmap_bits[PG_M_IDX]);
                        *pmap_pd(&kernel_pmap, kstart) = newpd;
                        nkpt++;
                        continue; /* try again */
                }
-               if ((*pt & PG_V) != 0) {
+               if ((*pt & kernel_pmap.pmap_bits[PG_V_IDX]) != 0) {
                        kstart = (kstart + PAGE_SIZE * NPTEPG) &
                                 ~(PAGE_SIZE * NPTEPG - 1);
                        if (kstart - 1 >= kernel_map.max_offset) {
@@ -2469,7 +2649,11 @@ pmap_growkernel(vm_offset_t kstart, vm_offset_t kend)
                ptppaddr = VM_PAGE_TO_PHYS(nkpg);
                pmap_zero_page(ptppaddr);
                vm_page_flag_clear(nkpg, PG_ZERO);
-               newpt = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
+               newpt = (pd_entry_t) (ptppaddr |
+                   kernel_pmap.pmap_bits[PG_V_IDX] |
+                   kernel_pmap.pmap_bits[PG_RW_IDX] |
+                   kernel_pmap.pmap_bits[PG_A_IDX] |
+                   kernel_pmap.pmap_bits[PG_M_IDX]);
                *pmap_pt(&kernel_pmap, kstart) = newpt;
                nkpt++;
 
@@ -2714,21 +2898,21 @@ pv_get_try(pmap_t pmap, vm_pindex_t pindex, int *errorp)
 {
        pv_entry_t pv;
 
-       spin_lock(&pmap->pm_spin);
+       spin_lock_shared(&pmap->pm_spin);
        if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex)
                pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
        if (pv == NULL) {
-               spin_unlock(&pmap->pm_spin);
+               spin_unlock_shared(&pmap->pm_spin);
                *errorp = 0;
                return NULL;
        }
        if (pv_hold_try(pv)) {
                pv_cache(pv, pindex);
-               spin_unlock(&pmap->pm_spin);
+               spin_unlock_shared(&pmap->pm_spin);
                *errorp = 0;
                return(pv);     /* lock succeeded */
        }
-       spin_unlock(&pmap->pm_spin);
+       spin_unlock_shared(&pmap->pm_spin);
        *errorp = 1;
        return (pv);            /* lock failed */
 }
@@ -2742,7 +2926,7 @@ pv_find(pmap_t pmap, vm_pindex_t pindex)
 {
        pv_entry_t pv;
 
-       spin_lock(&pmap->pm_spin);
+       spin_lock_shared(&pmap->pm_spin);
 
        if ((pv = pmap->pm_pvhint) == NULL || pv->pv_pindex != pindex)
                pv = pv_entry_rb_tree_RB_LOOKUP(&pmap->pm_pvroot, pindex);
@@ -2752,7 +2936,7 @@ pv_find(pmap_t pmap, vm_pindex_t pindex)
        }
        pv_hold(pv);
        pv_cache(pv, pindex);
-       spin_unlock(&pmap->pm_spin);
+       spin_unlock_shared(&pmap->pm_spin);
        return(pv);
 }
 
@@ -3013,18 +3197,19 @@ pmap_scan(struct pmap_scan_info *info)
                         */
                        KKASSERT(pte_pv == NULL);
                } else if (pte_pv) {
-                       KASSERT((*ptep & (PG_MANAGED|PG_V)) == (PG_MANAGED|
-                                                               PG_V),
-                               ("bad *ptep %016lx sva %016lx pte_pv %p",
-                               *ptep, info->sva, pte_pv));
+                       KASSERT((*ptep & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) ==
+                           (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX]),
+                           ("bad *ptep %016lx sva %016lx pte_pv %p",
+                           *ptep, info->sva, pte_pv));
                        info->func(pmap, info, pte_pv, pt_pv, 0,
                                   info->sva, ptep, info->arg);
                } else {
-                       KASSERT((*ptep & (PG_MANAGED|PG_V)) == PG_V,
-                               ("bad *ptep %016lx sva %016lx pte_pv NULL",
-                               *ptep, info->sva));
+                       KASSERT((*ptep & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) ==
+                           pmap->pmap_bits[PG_V_IDX],
+                           ("bad *ptep %016lx sva %016lx pte_pv NULL",
+                           *ptep, info->sva));
                        info->func(pmap, info, NULL, pt_pv, 0,
-                                  info->sva, ptep, info->arg);
+                           info->sva, ptep, info->arg);
                }
                if (pt_pv)
                        pv_put(pt_pv);
@@ -3200,7 +3385,7 @@ pmap_scan_callback(pv_entry_t pv, void *data)
                                pd_pv = pv_get(pmap, pmap_pd_pindex(sva));
                        KKASSERT(pd_pv != NULL);
                        ptep = pv_pte_lookup(pd_pv, pmap_pt_index(sva));
-                       if (*ptep & PG_V) {
+                       if (*ptep & pmap->pmap_bits[PG_V_IDX]) {
                                info->func(pmap, info, NULL, pd_pv, 1,
                                           sva, ptep, info->arg);
                        }
@@ -3309,21 +3494,21 @@ kernel_skip:
                         * isn't.
                         */
                        if (pte_pv) {
-                               KASSERT((*ptep & (PG_MANAGED|PG_V)) ==
-                                        (PG_MANAGED|PG_V),
-                                       ("bad *ptep %016lx sva %016lx "
-                                        "pte_pv %p",
-                                        *ptep, sva, pte_pv));
+                               KASSERT((*ptep & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) ==
+                                   (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX]),
+                                   ("bad *ptep %016lx sva %016lx "
+                                   "pte_pv %p",
+                                   *ptep, sva, pte_pv));
                                info->func(pmap, info, pte_pv, pt_pv, 0,
-                                          sva, ptep, info->arg);
+                                   sva, ptep, info->arg);
                        } else {
-                               KASSERT((*ptep & (PG_MANAGED|PG_V)) ==
-                                        PG_V,
-                                       ("bad *ptep %016lx sva %016lx "
-                                        "pte_pv NULL",
-                                        *ptep, sva));
+                               KASSERT((*ptep & (pmap->pmap_bits[PG_MANAGED_IDX] | pmap->pmap_bits[PG_V_IDX])) ==
+                                   pmap->pmap_bits[PG_V_IDX],
+                                   ("bad *ptep %016lx sva %016lx "
+                                   "pte_pv NULL",
+                                    *ptep, sva));
                                info->func(pmap, info, NULL, pt_pv, 0,
-                                          sva, ptep, info->arg);
+                                   sva, ptep, info->arg);
                        }
                        pte_pv = NULL;
                        sva += PAGE_SIZE;
@@ -3406,7 +3591,7 @@ pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
                pte = pte_load_clear(ptep);
                if (info->doinval)
                        pmap_inval_deinterlock(&info->inval, pmap);
-               if (pte & PG_W)
+               if (pte & pmap->pmap_bits[PG_W_IDX])
                        atomic_add_long(&pmap->pm_stats.wired_count, -1);
                atomic_add_long(&pmap->pm_stats.resident_count, -1);
                if (vm_page_unwire_quick(pt_pv->pv_m))
@@ -3425,7 +3610,7 @@ pmap_remove_callback(pmap_t pmap, struct pmap_scan_info *info,
                if (info->doinval)
                        pmap_inval_deinterlock(&info->inval, pmap);
                atomic_add_long(&pmap->pm_stats.resident_count, -1);
-               KKASSERT((pte & PG_DEVICE) == 0);
+               KKASSERT((pte & pmap->pmap_bits[PG_DEVICE_IDX]) == 0);
                if (vm_page_unwire_quick(PHYS_TO_VM_PAGE(pte & PG_FRAME)))
                        panic("pmap_remove: shared pgtable1 bad wirecount");
                if (vm_page_unwire_quick(pt_pv->pv_m))
@@ -3530,24 +3715,24 @@ again:
        cbits = pbits;
        if (pte_pv) {
                m = NULL;
-               if (pbits & PG_A) {
-                       if ((pbits & PG_DEVICE) == 0) {
+               if (pbits & pmap->pmap_bits[PG_A_IDX]) {
+                       if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) {
                                m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
                                KKASSERT(m == pte_pv->pv_m);
                                vm_page_flag_set(m, PG_REFERENCED);
                        }
-                       cbits &= ~PG_A;
+                       cbits &= ~pmap->pmap_bits[PG_A_IDX];
                }
-               if (pbits & PG_M) {
+               if (pbits & pmap->pmap_bits[PG_M_IDX]) {
                        if (pmap_track_modified(pte_pv->pv_pindex)) {
-                               if ((pbits & PG_DEVICE) == 0) {
+                               if ((pbits & pmap->pmap_bits[PG_DEVICE_IDX]) == 0) {
                                        if (m == NULL) {
                                                m = PHYS_TO_VM_PAGE(pbits &
                                                                    PG_FRAME);
                                        }
                                        vm_page_dirty(m);
                                }
-                               cbits &= ~PG_M;
+                               cbits &= ~pmap->pmap_bits[PG_M_IDX];
                        }
                }
        } else if (sharept) {
@@ -3574,7 +3759,7 @@ again:
        /* else unmanaged page, adjust bits, no wire changes */
 
        if (ptep) {
-               cbits &= ~PG_RW;
+               cbits &= ~pmap->pmap_bits[PG_RW_IDX];
                if (pbits != cbits && !atomic_cmpset_long(ptep, pbits, cbits)) {
                        goto again;
                }
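
The goto again above is the usual compare-and-set retry shape: snapshot the PTE, compute the downgraded bits, and retry if another cpu raced in between. A C11 sketch of the same loop (bit positions illustrative):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PG_RW   (1ULL << 1)             /* illustrative positions */
#define PG_M    (1ULL << 6)

static void
pte_clear_rw(_Atomic uint64_t *ptep)
{
        uint64_t pbits, cbits;

        do {
                pbits = atomic_load(ptep);      /* fresh snapshot */
                cbits = pbits & ~PG_RW;         /* drop write permission */
        } while (pbits != cbits &&
            !atomic_compare_exchange_weak(ptep, &pbits, cbits));
}

int
main(void)
{
        _Atomic uint64_t pte = 0x200000 | PG_RW | PG_M;

        pte_clear_rw(&pte);
        printf("pte %#llx\n", (unsigned long long)atomic_load(&pte));
        return 0;
}
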
@@ -3665,7 +3850,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
                                                  NULL, entry, va);
                        ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
                }
-               KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED) == 0);
+               KKASSERT(*ptep == 0 || (*ptep & pmap->pmap_bits[PG_MANAGED_IDX]) == 0);
        } else {
                if (va >= VM_MAX_USER_ADDRESS) {
                        /*
@@ -3682,31 +3867,33 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
                                                   &pt_pv, entry, va);
                        ptep = pv_pte_lookup(pt_pv, pmap_pte_index(va));
                }
-               KKASSERT(*ptep == 0 || (*ptep & PG_MANAGED));
+               KKASSERT(*ptep == 0 || (*ptep & pmap->pmap_bits[PG_MANAGED_IDX]));
        }
 
        pa = VM_PAGE_TO_PHYS(m);
        origpte = *ptep;
        opa = origpte & PG_FRAME;
 
-       newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) | PG_V | PG_A);
+       newpte = (pt_entry_t)(pa | pte_prot(pmap, prot) |
+           pmap->pmap_bits[PG_V_IDX] | pmap->pmap_bits[PG_A_IDX]);
        if (wired)
-               newpte |= PG_W;
+               newpte |= pmap->pmap_bits[PG_W_IDX];
        if (va < VM_MAX_USER_ADDRESS)
-               newpte |= PG_U;
+               newpte |= pmap->pmap_bits[PG_U_IDX];
        if (pte_pv)
-               newpte |= PG_MANAGED;
-       if (pmap == &kernel_pmap)
-               newpte |= pgeflag;
-       newpte |= pat_pte_index[m->pat_mode];
+               newpte |= pmap->pmap_bits[PG_MANAGED_IDX];
+//     if (pmap == &kernel_pmap)
+//             newpte |= pgeflag;
+       newpte |= pmap->pmap_cache_bits[m->pat_mode];
        if (m->flags & PG_FICTITIOUS)
-               newpte |= PG_DEVICE;
+               newpte |= pmap->pmap_bits[PG_DEVICE_IDX];
 
        /*
         * It is possible for multiple faults to occur in threaded
         * environments, the existing pte might be correct.
         */
-       if (((origpte ^ newpte) & ~(pt_entry_t)(PG_M|PG_A)) == 0)
+       if (((origpte ^ newpte) & ~(pt_entry_t)(pmap->pmap_bits[PG_M_IDX] |
+           pmap->pmap_bits[PG_A_IDX])) == 0)
                goto done;
 
        if ((prot & VM_PROT_NOSYNC) == 0)
@@ -3817,7 +4004,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
                        atomic_add_long(&pmap->pm_stats.wired_count, 1);
                }
        }
-       if (newpte & PG_RW)
+       if (newpte & pmap->pmap_bits[PG_RW_IDX])
                vm_page_flag_set(m, PG_WRITEABLE);
 
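Worth noting from the pmap_enter() hunks above: the redundant-fault test treats two PTEs as equivalent when they differ only in the hardware-managed accessed/modified bits, so a racing thread that already installed the mapping costs nothing. Sketched below with illustrative x86 bit positions:

#include <stdint.h>
#include <stdio.h>

#define PG_RW   (1ULL << 1)             /* illustrative positions */
#define PG_A    (1ULL << 5)
#define PG_M    (1ULL << 6)

static int
pte_same_mapping(uint64_t origpte, uint64_t newpte)
{
        return ((origpte ^ newpte) & ~(uint64_t)(PG_A | PG_M)) == 0;
}

int
main(void)
{
        uint64_t pte = 0x200000 | 0x1 | PG_RW;  /* V|RW mapping */

        /* hardware set A/M since the last fault: still the same */
        printf("%d\n", pte_same_mapping(pte, pte | PG_A | PG_M));
        /* permission changed: must be reinstalled */
        printf("%d\n", pte_same_mapping(pte, pte & ~PG_RW));
        return 0;
}
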
        /*
@@ -3832,7 +4019,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
        if ((prot & VM_PROT_NOSYNC) == 0 || pte_pv == NULL)
                pmap_inval_done(&info);
 done:
-       KKASSERT((newpte & PG_MANAGED) == 0 || (m->flags & PG_MAPPED));
+       KKASSERT((newpte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0 || (m->flags & PG_MAPPED));
 
        /*
         * Cleanup the pv entry, allowing other accessors.
@@ -3880,7 +4067,7 @@ static int pmap_object_init_pt_callback(vm_page_t p, void *data);
 
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
-                   vm_object_t object, vm_pindex_t pindex, 
+                   vm_object_t object, vm_pindex_t pindex,
                    vm_size_t size, int limit)
 {
        struct rb_vm_page_scan_info info;
@@ -4006,7 +4193,7 @@ pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
 
        /*spin_lock(&pmap->pm_spin);*/
        if ((pte = pmap_pte(pmap, addr)) != NULL) {
-               if (*pte & PG_V) {
+               if (*pte & pmap->pmap_bits[PG_V_IDX]) {
                        /*spin_unlock(&pmap->pm_spin);*/
                        return FALSE;
                }
@@ -4032,9 +4219,9 @@ pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired,
        pv = pmap_allocpte_seg(pmap, pmap_pt_pindex(va), NULL, entry, va);
        ptep = pv_pte_lookup(pv, pmap_pte_index(va));
 
-       if (wired && !pmap_pte_w(ptep))
+       if (wired && !pmap_pte_w(pmap, ptep))
                atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, 1);
-       else if (!wired && pmap_pte_w(ptep))
+       else if (!wired && pmap_pte_w(pmap, ptep))
                atomic_add_long(&pv->pv_pmap->pm_stats.wired_count, -1);
 
        /*
@@ -4045,9 +4232,9 @@ pmap_change_wiring(pmap_t pmap, vm_offset_t va, boolean_t wired,
         * wiring changes.
         */
        if (wired)
-               atomic_set_long(ptep, PG_W);
+               atomic_set_long(ptep, pmap->pmap_bits[PG_W_IDX]);
        else
-               atomic_clear_long(ptep, PG_W);
+               atomic_clear_long(ptep, pmap->pmap_bits[PG_W_IDX]);
        pv_put(pv);
        lwkt_reltoken(&pmap->pm_token);
 }
@@ -4205,6 +4392,7 @@ pmap_testbit(vm_page_t m, int bit)
 {
        pv_entry_t pv;
        pt_entry_t *pte;
+       pmap_t pmap;
 
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
                return FALSE;
@@ -4218,25 +4406,29 @@ pmap_testbit(vm_page_t m, int bit)
        }
 
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
+#if defined(PMAP_DIAGNOSTIC)
+               if (pv->pv_pmap == NULL) {
+                       kprintf("Null pmap (tb) at pindex: %"PRIu64"\n",
+                           pv->pv_pindex);
+                       continue;
+               }
+#endif
+               pmap = pv->pv_pmap;
+
                /*
                 * if the bit being tested is the modified bit, then
                 * mark clean_map and ptes as never
                 * modified.
                 */
-               if (bit & (PG_A|PG_M)) {
+               if (bit == PG_A_IDX || bit == PG_M_IDX) {
                        if (!pmap_track_modified(pv->pv_pindex))
                                continue;
                }
 
-#if defined(PMAP_DIAGNOSTIC)
-               if (pv->pv_pmap == NULL) {
-                       kprintf("Null pmap (tb) at pindex: %"PRIu64"\n",
-                           pv->pv_pindex);
-                       continue;
-               }
-#endif
                pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
-               if (*pte & bit) {
+               if (*pte & pmap->pmap_bits[bit]) {
                        vm_page_spin_unlock(m);
                        return TRUE;
                }
@@ -4253,15 +4445,16 @@ pmap_testbit(vm_page_t m, int bit)
  */
 static __inline
 void
-pmap_clearbit(vm_page_t m, int bit)
+pmap_clearbit(vm_page_t m, int bit_index)
 {
        struct pmap_inval_info info;
        pv_entry_t pv;
        pt_entry_t *pte;
        pt_entry_t pbits;
        pmap_t save_pmap;
+       pmap_t pmap;
 
-       if (bit == PG_RW)
+       if (bit_index == PG_RW_IDX)
                vm_page_flag_clear(m, PG_WRITEABLE);
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
                return;
@@ -4282,7 +4475,7 @@ pmap_clearbit(vm_page_t m, int bit)
         *
         * NOTE: Does not re-dirty the page when clearing only PG_M.
         */
-       if ((bit & PG_RW) == 0) {
+       if (bit_index != PG_RW_IDX) {
                vm_page_spin_lock(m);
                TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
        #if defined(PMAP_DIAGNOSTIC)
@@ -4292,11 +4485,12 @@ pmap_clearbit(vm_page_t m, int bit)
                                continue;
                        }
        #endif
+                       pmap = pv->pv_pmap;
                        pte = pmap_pte_quick(pv->pv_pmap,
                                             pv->pv_pindex << PAGE_SHIFT);
                        pbits = *pte;
-                       if (pbits & bit)
-                               atomic_clear_long(pte, bit);
+                       if (pbits & pmap->pmap_bits[bit_index])
+                               atomic_clear_long(pte, pmap->pmap_bits[bit_index]);
                }
                vm_page_spin_unlock(m);
                return;
@@ -4324,11 +4518,12 @@ restart:
                        continue;
                }
 #endif
+               pmap = pv->pv_pmap;
                /*
                 * Skip pages which do not have PG_RW set.
                 */
                pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
-               if ((*pte & PG_RW) == 0)
+               if ((*pte & pmap->pmap_bits[PG_RW_IDX]) == 0)
                        continue;
 
                /*
@@ -4349,8 +4544,9 @@ restart:
                for (;;) {
                        pbits = *pte;
                        cpu_ccfence();
-                       if (atomic_cmpset_long(pte, pbits,
-                                              pbits & ~(PG_RW|PG_M))) {
+                       if (atomic_cmpset_long(pte, pbits, pbits &
+                           ~(save_pmap->pmap_bits[PG_RW_IDX] |
+                           save_pmap->pmap_bits[PG_M_IDX]))) {
                                break;
                        }
                }
@@ -4362,7 +4558,7 @@ restart:
                 * we also clear PG_M (done above) and mark the page dirty.
                 * Callers expect this behavior.
                 */
-               if (pbits & PG_M)
+               if (pbits & save_pmap->pmap_bits[PG_M_IDX])
                        vm_page_dirty(m);
                pv_put(pv);
        }
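
The PG_RW branch above encodes a contract callers depend on: write permission and the modified bit are cleared together, and a page whose M bit was set must be dirtied so the data is not lost. A stripped-down, single-threaded sketch (no cmpset loop; bit values illustrative):

#include <stdint.h>
#include <stdio.h>

#define PG_RW   (1ULL << 1)             /* illustrative positions */
#define PG_M    (1ULL << 6)

struct page { int dirty; };

static void
pte_write_protect(uint64_t *ptep, struct page *m)
{
        uint64_t pbits = *ptep;

        *ptep = pbits & ~(PG_RW | PG_M);        /* drop RW and M together */
        if (pbits & PG_M)
                m->dirty = 1;           /* don't lose the modified state */
}

int
main(void)
{
        uint64_t pte = 0x200000 | PG_RW | PG_M;
        struct page m = { 0 };

        pte_write_protect(&pte, &m);
        printf("pte %#llx dirty %d\n", (unsigned long long)pte, m.dirty);
        return 0;
}
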
@@ -4385,7 +4581,7 @@ pmap_page_protect(vm_page_t m, vm_prot_t prot)
                         * NOTE: pmap_clearbit(.. PG_RW) also clears
                         *       the PG_WRITEABLE flag in (m).
                         */
-                       pmap_clearbit(m, PG_RW);
+                       pmap_clearbit(m, PG_RW_IDX);
                } else {
                        pmap_remove_all(m);
                }
@@ -4415,6 +4611,7 @@ pmap_ts_referenced(vm_page_t m)
 {
        pv_entry_t pv;
        pt_entry_t *pte;
+       pmap_t pmap;
        int rtval = 0;
 
        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
@@ -4424,9 +4621,10 @@ pmap_ts_referenced(vm_page_t m)
        TAILQ_FOREACH(pv, &m->md.pv_list, pv_list) {
                if (!pmap_track_modified(pv->pv_pindex))
                        continue;
+               pmap = pv->pv_pmap;
                pte = pmap_pte_quick(pv->pv_pmap, pv->pv_pindex << PAGE_SHIFT);
-               if (pte && (*pte & PG_A)) {
-                       atomic_clear_long(pte, PG_A);
+               if (pte && (*pte & pmap->pmap_bits[PG_A_IDX])) {
+                       atomic_clear_long(pte, pmap->pmap_bits[PG_A_IDX]);
                        rtval++;
                        if (rtval > 4)
                                break;
@@ -4447,7 +4645,7 @@ pmap_is_modified(vm_page_t m)
 {
        boolean_t res;
 
-       res = pmap_testbit(m, PG_M);
+       res = pmap_testbit(m, PG_M_IDX);
        return (res);
 }
 
@@ -4457,7 +4655,7 @@ pmap_is_modified(vm_page_t m)
 void
 pmap_clear_modify(vm_page_t m)
 {
-       pmap_clearbit(m, PG_M);
+       pmap_clearbit(m, PG_M_IDX);
 }
 
 /*
@@ -4468,7 +4666,7 @@ pmap_clear_modify(vm_page_t m)
 void
 pmap_clear_reference(vm_page_t m)
 {
-       pmap_clearbit(m, PG_A);
+       pmap_clearbit(m, PG_A_IDX);
 }
 
 /*
@@ -4483,7 +4681,7 @@ i386_protection_init(void)
 
        /* JG NX support may go here; No VM_PROT_EXECUTE ==> set NX bit  */
        kp = protection_codes;
-       for (prot = 0; prot < 8; prot++) {
+       for (prot = 0; prot < PROTECTION_CODES_SIZE; prot++) {
                switch (prot) {
                case VM_PROT_NONE | VM_PROT_NONE | VM_PROT_NONE:
                        /*
@@ -4499,7 +4697,7 @@ i386_protection_init(void)
                case VM_PROT_NONE | VM_PROT_WRITE | VM_PROT_EXECUTE:
                case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_NONE:
                case VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE:
-                       *kp++ = PG_RW;
+                       *kp++ = pmap_bits_default[PG_RW_IDX];
                        break;
                }
        }
@@ -4560,8 +4758,10 @@ pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
        pa = pa & ~PAGE_MASK;
        for (tmpva = va, tmpsize = size; tmpsize > 0;) {
                pte = vtopte(tmpva);
-               *pte = pa | PG_RW | PG_V | /* pgeflag | */
-                      pat_pte_index[mode];
+               *pte = pa |
+                   kernel_pmap.pmap_bits[PG_RW_IDX] |
+                   kernel_pmap.pmap_bits[PG_V_IDX] | /* pgeflag | */
+                   kernel_pmap.pmap_cache_bits[mode];
                tmpsize -= PAGE_SIZE;
                tmpva += PAGE_SIZE;
                pa += PAGE_SIZE;
@@ -4621,9 +4821,8 @@ pmap_change_attr(vm_offset_t va, vm_size_t count, int mode)
 
        while (count) {
                pte = vtopte(va);
-               *pte = (*pte & ~(pt_entry_t)(PG_PTE_PAT | PG_NC_PCD |
-                                            PG_NC_PWT)) |
-                      pat_pte_index[mode];
+               *pte = (*pte & ~(pt_entry_t)(kernel_pmap.pmap_cache_mask)) |
+                      kernel_pmap.pmap_cache_bits[mode];
                --count;
                va += PAGE_SIZE;
        }
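
pmap_change_attr() now masks with the pmap's published pmap_cache_mask rather than naming PG_PTE_PAT | PG_NC_PCD | PG_NC_PWT directly, since EPT encodes memory types in different bits. A sketch of the mask-and-set step; the x86 bit positions are architectural but the mode table below is illustrative, not the kernel's:

#include <stdint.h>
#include <stdio.h>

#define X86_PWT (1ULL << 3)             /* 4K-PTE cache-control bits */
#define X86_PCD (1ULL << 4)
#define X86_PAT (1ULL << 7)

static const uint64_t pmap_cache_mask = X86_PAT | X86_PCD | X86_PWT;
static const uint64_t pmap_cache_bits[] = {     /* illustrative modes */
        0,                      /* write-back */
        X86_PWT,                /* write-through */
        X86_PCD | X86_PWT,      /* uncacheable */
};

static uint64_t
pte_set_cache_mode(uint64_t pte, int mode)
{
        return (pte & ~pmap_cache_mask) | pmap_cache_bits[mode];
}

int
main(void)
{
        uint64_t pte = 0x200000 | 0x3 | X86_PWT;

        printf("%#llx\n", (unsigned long long)pte_set_cache_mode(pte, 2));
        return 0;
}
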
@@ -4657,12 +4856,12 @@ pmap_mincore(pmap_t pmap, vm_offset_t addr)
                vm_offset_t pa;
 
                val = MINCORE_INCORE;
-               if ((pte & PG_MANAGED) == 0)
+               if ((pte & pmap->pmap_bits[PG_MANAGED_IDX]) == 0)
                        goto done;
 
                pa = pte & PG_FRAME;
 
-               if (pte & PG_DEVICE)
+               if (pte & pmap->pmap_bits[PG_DEVICE_IDX])
                        m = NULL;
                else
                        m = PHYS_TO_VM_PAGE(pa);
@@ -4670,7 +4869,7 @@ pmap_mincore(pmap_t pmap, vm_offset_t addr)
                /*
                 * Modified by us
                 */
-               if (pte & PG_M)
+               if (pte & pmap->pmap_bits[PG_M_IDX])
                        val |= MINCORE_MODIFIED|MINCORE_MODIFIED_OTHER;
                /*
                 * Modified by someone
@@ -4680,7 +4879,7 @@ pmap_mincore(pmap_t pmap, vm_offset_t addr)
                /*
                 * Referenced by us
                 */
-               if (pte & PG_A)
+               if (pte & pmap->pmap_bits[PG_A_IDX])
                        val |= MINCORE_REFERENCED|MINCORE_REFERENCED_OTHER;
 
                /*
@@ -4754,7 +4953,13 @@ pmap_setlwpvm(struct lwp *lp, struct vmspace *newvm)
 #if defined(SWTCH_OPTIM_STATS)
                        tlb_flush_count++;
 #endif
-                       curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
+                       if (pmap->pmap_bits[TYPE_IDX] == REGULAR_PMAP) {
+                               curthread->td_pcb->pcb_cr3 = vtophys(pmap->pm_pml4);
+                       } else if (pmap->pmap_bits[TYPE_IDX] == EPT_PMAP) {
+                               curthread->td_pcb->pcb_cr3 = KPML4phys;
+                       } else {
+                               panic("pmap_setlwpvm: unknown pmap type");
+                       }
                        load_cr3(curthread->td_pcb->pcb_cr3);
                        pmap = vmspace_pmap(oldvm);
                        atomic_clear_cpumask(&pmap->pm_active, mycpu->gd_cpumask);
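
The TYPE_IDX dispatch above reflects that an EPT pmap's root is not a valid %cr3: the host thread keeps running on the kernel PML4 (KPML4phys) while the guest's root is installed through the VMCS. A sketch of the selection; REGULAR_PMAP/EPT_PMAP follow the diff, but the flattened pm_type field and the constants are assumptions for illustration:

#include <stdint.h>
#include <stdio.h>

#define REGULAR_PMAP    0               /* names follow the diff */
#define EPT_PMAP        1

struct pmap {
        int      pm_type;
        uint64_t pm_pml4_phys;          /* root of the page tables */
};

static const uint64_t kpml4_phys = 0x100000;    /* illustrative */

static uint64_t
pick_cr3(const struct pmap *pm)
{
        switch (pm->pm_type) {
        case REGULAR_PMAP:
                return pm->pm_pml4_phys;        /* per-process root */
        case EPT_PMAP:
                return kpml4_phys;      /* guest root lives in the VMCS */
        default:
                return 0;
        }
}

int
main(void)
{
        struct pmap p = { EPT_PMAP, 0x300000 };

        printf("cr3 %#llx\n", (unsigned long long)pick_cr3(&p));
        return 0;
}
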
@@ -4813,7 +5018,7 @@ pmap_kvtom(vm_offset_t va)
 {
        pt_entry_t *ptep = vtopte(va);
 
-       KKASSERT((*ptep & PG_DEVICE) == 0);
+       KKASSERT((*ptep & kernel_pmap.pmap_bits[PG_DEVICE_IDX]) == 0);
        return(PHYS_TO_VM_PAGE(*ptep & PG_FRAME));
 }
 
index a7f7431..9b4803a 100644
@@ -106,10 +106,12 @@ pmap_inval_interlock(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
     }
     DEBUG_POP_INFO();
     KKASSERT((info->pir_flags & PIRF_CPUSYNC) == 0);
+
     info->pir_va = va;
     info->pir_flags = PIRF_CPUSYNC;
     lwkt_cpusync_init(&info->pir_cpusync, oactive, pmap_inval_callback, info);
     lwkt_cpusync_interlock(&info->pir_cpusync);
+    atomic_add_acq_long(&pmap->pm_invgen, 1);
 }
 
 void
index b3fa072..dec0c98 100644
@@ -211,10 +211,10 @@ ENTRY(fillw)
  */
 
 /*
- * copyout(from_kernel, to_user, len)  - MP SAFE
+ * std_copyout(from_kernel, to_user, len)  - MP SAFE
  *         %rdi,        %rsi,    %rdx
  */
-ENTRY(copyout)
+ENTRY(std_copyout)
        movq    PCPU(curthread),%rax
        movq    TD_PCB(%rax), %rax
        movq    $copyout_fault,PCB_ONFAULT(%rax)
@@ -274,10 +274,10 @@ copyout_fault:
        ret
 
 /*
- * copyin(from_user, to_kernel, len) - MP SAFE
+ * std_copyin(from_user, to_kernel, len) - MP SAFE
  *        %rdi,      %rsi,      %rdx
  */
-ENTRY(copyin)
+ENTRY(std_copyin)
        movq    PCPU(curthread),%rax
        movq    TD_PCB(%rax), %rax
        movq    $copyin_fault,PCB_ONFAULT(%rax)
@@ -387,7 +387,7 @@ ENTRY(casuword)
  */
 
 ALTENTRY(fuword64)
-ENTRY(fuword)
+ENTRY(std_fuword)
        movq    PCPU(curthread),%rcx
        movq    TD_PCB(%rcx), %rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)
@@ -441,7 +441,7 @@ ENTRY(fuword16)
        movq    $0,PCB_ONFAULT(%rcx)
        ret
 
-ENTRY(fubyte)
+ENTRY(std_fubyte)
        movq    PCPU(curthread),%rcx
        movq    TD_PCB(%rcx), %rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)
@@ -473,7 +473,7 @@ fusufault:
  * Write a long
  */
 ALTENTRY(suword64)
-ENTRY(suword)
+ENTRY(std_suword)
        movq    PCPU(curthread),%rcx
        movq    TD_PCB(%rcx), %rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)
@@ -493,7 +493,7 @@ ENTRY(suword)
 /*
  * Write an int
  */
-ENTRY(suword32)
+ENTRY(std_suword32)
        movq    PCPU(curthread),%rcx
        movq    TD_PCB(%rcx), %rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)
@@ -527,7 +527,7 @@ ENTRY(suword16)
        movq    %rax,PCB_ONFAULT(%rcx)
        ret
 
-ENTRY(subyte)
+ENTRY(std_subyte)
        movq    PCPU(curthread),%rcx
        movq    TD_PCB(%rcx), %rcx
        movq    $fusufault,PCB_ONFAULT(%rcx)
@@ -546,7 +546,7 @@ ENTRY(subyte)
        ret
 
 /*
- * copyinstr(from, to, maxlen, int *lencopied) - MP SAFE
+ * std_copyinstr(from, to, maxlen, int *lencopied) - MP SAFE
  *           %rdi, %rsi, %rdx, %rcx
  *
  *     copy a string from from to to, stop when a 0 character is reached.
@@ -554,7 +554,7 @@ ENTRY(subyte)
  *     EFAULT on protection violations. If lencopied is non-zero,
  *     return the actual length in *lencopied.
  */
-ENTRY(copyinstr)
+ENTRY(std_copyinstr)
        movq    %rdx,%r8                        /* %r8 = maxlen */
        movq    %rcx,%r9                        /* %r9 = *len */
        xchgq   %rdi,%rsi                       /* %rdi = from, %rsi = to */
index 19912c6..36dec4c 100644
@@ -54,6 +54,8 @@
 #include <machine/globaldata.h>                /* CPU_prvspace */
 #include <machine/smp.h>
 #include <machine/pcb.h>
+#include <machine/thread.h>
+#include <machine/vmm.h>
 
 /*
  * set a TLS descriptor.  For x86_64 descriptor 0 identifies %fs and
@@ -169,6 +171,11 @@ set_user_TLS(void)
                gd->gd_user_gs = td->td_pcb->pcb_gsbase;
                wrmsr(MSR_KGSBASE, gd->gd_user_gs);
        }
+
+       if (td->td_vmm) {
+               vmm_vm_set_tls_area();
+       }
+
        clear_quickret();
        crit_exit_quick(td);
 }
index 881c631..c106269 100644
@@ -800,6 +800,23 @@ out2:      ;
 #endif
 }
 
+void
+trap_handle_userenter(struct thread *td)
+{
+       userenter(td, td->td_proc);
+}
+
+void
+trap_handle_userexit(struct trapframe *frame, int sticks)
+{
+       struct lwp *lp = curthread->td_lwp;
+
+       if (lp) {
+               userret(lp, frame, sticks);
+               userexit(lp);
+       }
+}
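
trap_handle_userenter()/trap_handle_userexit() exist so code outside trap.c, such as a VMM run loop, can reuse the userenter()/userexit() bookkeeping. The sketch below only shows the intended call shape; vmx_vmrun_once() and all the stub bodies are hypothetical stand-ins, not the real VMX entry path:

#include <stdio.h>

struct thread { int td_id; };
struct trapframe { int tf_dummy; };

/* Stubs standing in for the trap.c wrappers and the VMX entry. */
static void
trap_handle_userenter(struct thread *td)
{
        (void)td;
        puts("userenter accounting");
}

static void
trap_handle_userexit(struct trapframe *frame, int sticks)
{
        (void)frame; (void)sticks;
        puts("userexit accounting");
}

static int
vmx_vmrun_once(struct thread *td)       /* hypothetical guest entry */
{
        (void)td;
        return 1;                       /* pretend: one vmexit taken */
}

static int
vmm_run_guest(struct thread *td, struct trapframe *frame, int sticks)
{
        trap_handle_userenter(td);      /* same entry bookkeeping */
        int vmexit = vmx_vmrun_once(td);
        trap_handle_userexit(frame, sticks);
        return vmexit;
}

int
main(void)
{
        struct thread td = { 1 };
        struct trapframe tf = { 0 };

        printf("vmexit %d\n", vmm_run_guest(&td, &tf, 0));
        return 0;
}
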
+
 static int
 trap_pfault(struct trapframe *frame, int usermode)
 {
diff --git a/sys/platform/pc64/x86_64/uwrapper.c b/sys/platform/pc64/x86_64/uwrapper.c
new file
index 0000000..f7196b0
--- /dev/null
@@ -0,0 +1,55 @@
+#include <sys/systm.h>
+#include <sys/thread.h>
+#include <sys/types.h>
+#include <sys/proc.h>
+
+#include <vm/vm_map.h>
+
+int
+copyinstr(const void *udaddr, void *kaddr, size_t len, size_t *res)
+{
+       return curthread->td_proc->p_vmspace->vm_pmap.copyinstr(udaddr, kaddr, len, res);
+}
+
+int
+copyin(const void *udaddr, void *kaddr, size_t len)
+{
+       return curthread->td_proc->p_vmspace->vm_pmap.copyin(udaddr, kaddr, len);
+}
+
+int
+copyout(const void *kaddr, void *udaddr, size_t len)
+{
+       return curthread->td_proc->p_vmspace->vm_pmap.copyout(kaddr, udaddr, len);
+}
+
+int
+fubyte(const void *base)
+{
+       return curthread->td_proc->p_vmspace->vm_pmap.fubyte(base);
+}
+
+int
+subyte(void *base, int byte)
+{
+       return curthread->td_proc->p_vmspace->vm_pmap.subyte(base, byte);
+}
+
+long
+fuword(const void *base)
+{
+       return curthread->td_proc->p_vmspace->vm_pmap.fuword(base);
+}
+
+int
+suword(void *base, long word)
+{
+       return curthread->td_proc->p_vmspace->vm_pmap.suword(base, word);
+}
+
+int
+suword32(void *base, int word)
+{
+       return curthread->td_proc->p_vmspace->vm_pmap.suword32(base, word);
+}
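
uwrapper.c is the indirection layer for copyin/copyout and friends: every vmspace's pmap carries function pointers, so a normal process dispatches to the std_* assembly routines while a vkernel under EPT gets variants that walk the guest tables manually. A user-space model of the dispatch (memcpy stands in for both implementations and the EPT translation step is elided):

#include <stdio.h>
#include <string.h>
#include <stddef.h>

static int
std_copyin(const void *udaddr, void *kaddr, size_t len)
{
        memcpy(kaddr, udaddr, len);     /* the fast assembly path */
        return 0;
}

static int
ept_copyin(const void *udaddr, void *kaddr, size_t len)
{
        /* a real version would first translate udaddr via the EPT */
        memcpy(kaddr, udaddr, len);
        return 0;
}

struct vm_pmap_ops {
        int (*copyin)(const void *, void *, size_t);
};

int
main(void)
{
        struct vm_pmap_ops pm = { std_copyin };
        char buf[8];

        pm.copyin("guest", buf, 6);     /* dispatch as in uwrapper.c */
        printf("%s\n", buf);

        pm.copyin = ept_copyin;         /* an EPT vmspace swaps the vector */
        pm.copyin("ept!!", buf, 6);
        printf("%s\n", buf);
        return 0;
}
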
index c68b5aa..fd3ff74 100644
@@ -62,6 +62,7 @@
 #include <machine/pcb_ext.h>
 #include <machine/segments.h>
 #include <machine/globaldata.h>        /* npxthread */
+#include <machine/vmm.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -213,8 +214,14 @@ cpu_prepare_lwp(struct lwp *lp, struct lwp_params *params)
        if (error)
                return (error);
 
-       cpu_set_fork_handler(lp,
-           (void (*)(void *, struct trapframe *))generic_lwp_return, lp);
+       if (lp->lwp_proc->p_vmm) {
+               lp->lwp_thread->td_pcb->pcb_cr3 = KPML4phys;
+               cpu_set_fork_handler(lp,
+                   (void (*)(void *, struct trapframe *))vmm_lwp_return, lp);
+       } else {
+               cpu_set_fork_handler(lp,
+                   (void (*)(void *, struct trapframe *))generic_lwp_return, lp);
+       }
        return (0);
 }
 
index 11c2d69..7c3118f 100644
@@ -236,7 +236,7 @@ db_write_bytes(vm_offset_t addr, size_t size, char *data)
 
            ptep0 = pmap_kpte(addr);
            oldmap0 = *ptep0;
-           *ptep0 |= VPTE_W;
+           *ptep0 |= VPTE_RW;
 
            /* Map another page if the data crosses a page boundary. */
            if ((*ptep0 & PG_PS) == 0) {
@@ -244,14 +244,14 @@ db_write_bytes(vm_offset_t addr, size_t size, char *data)
                if (trunc_page(addr) != addr1) {
                    ptep1 = pmap_kpte(addr1);
                    oldmap1 = *ptep1;
-                   *ptep1 |= VPTE_W;
+                   *ptep1 |= VPTE_RW;
                }
            } else {
                addr1 = trunc_4mpage(addr + size - 1);
                if (trunc_4mpage(addr) != addr1) {
                    ptep1 = pmap_kpte(addr1);
                    oldmap1 = *ptep1;
-                   *ptep1 |= VPTE_W;
+                   *ptep1 |= VPTE_RW;
                }
            }
 
index 07ebf44..184b3aa 100644
@@ -187,6 +187,11 @@ void       pmap_release(struct pmap *pmap);
 struct vm_page *pmap_use_pt (pmap_t, vm_offset_t);
 void   pmap_set_opt (void);
 
+static __inline int
+pmap_emulate_ad_bits(pmap_t pmap)
+{
+       return 0;
+}
+
 #endif /* _KERNEL */
 
 #endif /* !LOCORE */
similarity index 53%
copy from sys/platform/vkernel64/include/pmap_inval.h
copy to sys/platform/vkernel/include/vmm.h
index f99fe3f..520099a 100644
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2003-2013 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to The DragonFly Project
- * by Matthew Dillon <dillon@backplane.com>
+ * by Mihai Carabas <mihai.carabas@gmail.com>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- * $DragonFly: src/sys/platform/vkernel/include/pmap_inval.h,v 1.3 2007/07/02 02:22:57 dillon Exp $
  */
 
-#ifndef _MACHINE_PMAP_INVAL_H_
-#define        _MACHINE_PMAP_INVAL_H_
+#ifndef _MACHINE_VMM_H_
+#define _MACHINE_VMM_H_
 
-#ifndef _SYS_THREAD_H_
-#include <sys/thread.h>
-#endif
+#include <sys/vmm.h>
 
-typedef struct pmap_inval_info {
-    int                        pir_flags;
-    struct lwkt_cpusync        pir_cpusync;
-} pmap_inval_info;
+static __inline
+int vmm_vminit(struct guest_options *opts) {