/*
 * Copyright (c) 2018-2021 Maxime Villard, m00nbsd.net
 * All rights reserved.
 *
 * This code is part of the NVMM hypervisor.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/kernel.h>
#include <sys/mman.h>

#include "nvmm.h"
#include "nvmm_internal.h"
#include "nvmm_ioctl.h"

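/*
 * Global machine table. Every slot is preallocated; a slot is claimed by
 * setting mach->present under the slot's write lock, and the nmachines
 * counter tracks how many slots are currently live.
 */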
static struct nvmm_machine machines[NVMM_MAX_MACHINES];
volatile unsigned int nmachines __cacheline_aligned;

static const struct nvmm_impl *nvmm_impl_list[] = {
#if defined(__x86_64__)
        &nvmm_x86_svm,  /* x86 AMD SVM */
        &nvmm_x86_vmx   /* x86 Intel VMX */
#endif
};

const struct nvmm_impl *nvmm_impl __read_mostly = NULL;

struct nvmm_owner nvmm_root_owner;

/* -------------------------------------------------------------------------- */

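/*
 * Machine slot helpers. nvmm_machine_alloc() and nvmm_machine_get() both
 * return with the slot lock held (write-locked for alloc; read- or
 * write-locked for get, depending on 'writer'). The caller releases it
 * with nvmm_machine_put().
 */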
static int
nvmm_machine_alloc(struct nvmm_machine **ret)
{
        struct nvmm_machine *mach;
        size_t i;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                mach = &machines[i];

                os_rwl_wlock(&mach->lock);
                if (mach->present) {
                        os_rwl_unlock(&mach->lock);
                        continue;
                }

                mach->present = true;
                mach->time = time_second;
                *ret = mach;
                os_atomic_inc_uint(&nmachines);
                return 0;
        }

        return ENOBUFS;
}

static void
nvmm_machine_free(struct nvmm_machine *mach)
{
        OS_ASSERT(os_rwl_wheld(&mach->lock));
        OS_ASSERT(mach->present);
        mach->present = false;
        os_atomic_dec_uint(&nmachines);
}

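/*
 * Look up a live machine by id, lock it, and check ownership. Access is
 * denied (EPERM) unless the caller owns the machine or is the privileged
 * nvmm_root_owner, which can reach any machine.
 */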
static int
nvmm_machine_get(struct nvmm_owner *owner, nvmm_machid_t machid,
    struct nvmm_machine **ret, bool writer)
{
        struct nvmm_machine *mach;

        if (__predict_false(machid >= NVMM_MAX_MACHINES)) {
                return EINVAL;
        }
        mach = &machines[machid];

        if (__predict_false(writer)) {
                os_rwl_wlock(&mach->lock);
        } else {
                os_rwl_rlock(&mach->lock);
        }
        if (__predict_false(!mach->present)) {
                os_rwl_unlock(&mach->lock);
                return ENOENT;
        }
        if (__predict_false(mach->owner != owner &&
            owner != &nvmm_root_owner)) {
                os_rwl_unlock(&mach->lock);
                return EPERM;
        }
        *ret = mach;

        return 0;
}

static void
nvmm_machine_put(struct nvmm_machine *mach)
{
        os_rwl_unlock(&mach->lock);
}

/* -------------------------------------------------------------------------- */

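/*
 * VCPU slot helpers, mirroring the machine helpers above but with a
 * per-slot mutex: nvmm_vcpu_alloc() and nvmm_vcpu_get() return with
 * vcpu->lock held, and nvmm_vcpu_put() drops it.
 */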
static int
nvmm_vcpu_alloc(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
        struct nvmm_cpu *vcpu;

        if (cpuid >= NVMM_MAX_VCPUS) {
                return EINVAL;
        }
        vcpu = &mach->cpus[cpuid];

        os_mtx_lock(&vcpu->lock);
        if (vcpu->present) {
                os_mtx_unlock(&vcpu->lock);
                return EBUSY;
        }

        vcpu->present = true;
        vcpu->comm = NULL;
        vcpu->hcpu_last = -1;
        *ret = vcpu;
        return 0;
}

static void
nvmm_vcpu_free(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
{
        OS_ASSERT(os_mtx_owned(&vcpu->lock));
        vcpu->present = false;
        if (vcpu->comm != NULL) {
                os_vmobj_unmap(os_kernel_map, (vaddr_t)vcpu->comm,
                    (vaddr_t)vcpu->comm + NVMM_COMM_PAGE_SIZE, true);
                /*
                 * Require userland to unmap the comm page from its address
                 * space, because os_curproc_map at this point (fd close)
                 * is not guaranteed to be the correct address space.
                 */
        }
}

static int
nvmm_vcpu_get(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
        struct nvmm_cpu *vcpu;

        if (__predict_false(cpuid >= NVMM_MAX_VCPUS)) {
                return EINVAL;
        }
        vcpu = &mach->cpus[cpuid];

        os_mtx_lock(&vcpu->lock);
        if (__predict_false(!vcpu->present)) {
                os_mtx_unlock(&vcpu->lock);
                return ENOENT;
        }
        *ret = vcpu;

        return 0;
}

static void
nvmm_vcpu_put(struct nvmm_cpu *vcpu)
{
        os_mtx_unlock(&vcpu->lock);
}

/* -------------------------------------------------------------------------- */

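/*
 * Destroy all machines belonging to 'owner': tear down each VCPU,
 * destroy the guest vmspace, and drop the host-mapping vmobj references.
 * Called when the owner goes away, e.g. on fd close.
 */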
void
nvmm_kill_machines(struct nvmm_owner *owner)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        size_t i, j;
        int error;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                mach = &machines[i];

                os_rwl_wlock(&mach->lock);
                if (!mach->present || mach->owner != owner) {
                        os_rwl_unlock(&mach->lock);
                        continue;
                }

                /* Kill it. */
                for (j = 0; j < NVMM_MAX_VCPUS; j++) {
                        error = nvmm_vcpu_get(mach, j, &vcpu);
                        if (error)
                                continue;
                        (*nvmm_impl->vcpu_destroy)(mach, vcpu);
                        nvmm_vcpu_free(mach, vcpu);
                        nvmm_vcpu_put(vcpu);
                        os_atomic_dec_uint(&mach->ncpus);
                }
                (*nvmm_impl->machine_destroy)(mach);
                os_vmspace_destroy(mach->vm);

                /* Drop the kernel vmobj refs. */
                for (j = 0; j < NVMM_MAX_HMAPPINGS; j++) {
                        if (!mach->hmap[j].present)
                                continue;
                        os_vmobj_rel(mach->hmap[j].vmobj);
                }

                nvmm_machine_free(mach);

                os_rwl_unlock(&mach->lock);
        }
}

/* -------------------------------------------------------------------------- */

static int
nvmm_capability(struct nvmm_owner *owner, struct nvmm_ioc_capability *args)
{
        args->cap.version = NVMM_KERN_VERSION;
        args->cap.state_size = nvmm_impl->state_size;
        args->cap.comm_size = NVMM_COMM_PAGE_SIZE;
        args->cap.max_machines = NVMM_MAX_MACHINES;
        args->cap.max_vcpus = NVMM_MAX_VCPUS;
        args->cap.max_ram = NVMM_MAX_RAM;

        (*nvmm_impl->capability)(&args->cap);

        return 0;
}

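/*
 * Create a machine: claim a slot, give it a guest vmspace covering
 * [0, NVMM_MAX_RAM), create the vmobj backing the per-VCPU comm pages,
 * and let the backend (SVM or VMX) do its machine-level setup.
 */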
static int
nvmm_machine_create(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_create *args)
{
        struct nvmm_machine *mach;
        int error;

        error = nvmm_machine_alloc(&mach);
        if (error)
                return error;

        /* Curproc owns the machine. */
        mach->owner = owner;

        /* Zero out the host mappings. */
        memset(&mach->hmap, 0, sizeof(mach->hmap));

        /* Create the machine vmspace. */
        mach->gpa_begin = 0;
        mach->gpa_end = NVMM_MAX_RAM;
        mach->vm = os_vmspace_create(mach->gpa_begin, mach->gpa_end);

#ifdef __DragonFly__
        /*
         * Set PMAP_MULTI on the machine's backing pmap. Only changes made
         * to this backing pmap are reflected in the guest; changes to the
         * host process's pmap do not affect it.
         */
        pmap_maybethreaded(&mach->vm->vm_pmap);
#endif

        /* Create the comm vmobj. */
        mach->commvmobj = os_vmobj_create(
            NVMM_MAX_VCPUS * NVMM_COMM_PAGE_SIZE);

        (*nvmm_impl->machine_create)(mach);

        args->machid = mach->machid;
        nvmm_machine_put(mach);

        return 0;
}

static int
nvmm_machine_destroy(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_destroy *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;
        size_t i;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        for (i = 0; i < NVMM_MAX_VCPUS; i++) {
                error = nvmm_vcpu_get(mach, i, &vcpu);
                if (error)
                        continue;

                (*nvmm_impl->vcpu_destroy)(mach, vcpu);
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                os_atomic_dec_uint(&mach->ncpus);
        }

        (*nvmm_impl->machine_destroy)(mach);

        /* Free the machine vmspace. */
        os_vmspace_destroy(mach->vm);

        /* Drop the kernel vmobj refs. */
        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                if (!mach->hmap[i].present)
                        continue;
                os_vmobj_rel(mach->hmap[i].vmobj);
        }

        nvmm_machine_free(mach);
        nvmm_machine_put(mach);

        return 0;
}

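/*
 * Machine and VCPU configuration follow the same pattern: the op indexes
 * the backend's conf-size table, the payload is copied in from userland
 * into a temporary buffer of that size, and the buffer is handed to the
 * backend.
 */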
static int
nvmm_machine_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_configure *args)
{
        struct nvmm_machine *mach;
        size_t allocsz;
        uint64_t op;
        void *data;
        int error;

        op = NVMM_MACH_CONF_MD(args->op);
        if (__predict_false(op >= nvmm_impl->mach_conf_max)) {
                return EINVAL;
        }

        allocsz = nvmm_impl->mach_conf_sizes[op];
        data = os_mem_alloc(allocsz);

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error) {
                os_mem_free(data, allocsz);
                return error;
        }

        error = copyin(args->conf, data, allocsz);
        if (error) {
                goto out;
        }

        error = (*nvmm_impl->machine_configure)(mach, op, data);

out:
        nvmm_machine_put(mach);
        os_mem_free(data, allocsz);
        return error;
}

static int
nvmm_vcpu_create(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_create *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_alloc(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        /* Map the comm page on the kernel side, as wired. */
        error = os_vmobj_map(os_kernel_map, (vaddr_t *)&vcpu->comm,
            NVMM_COMM_PAGE_SIZE, mach->commvmobj,
            args->cpuid * NVMM_COMM_PAGE_SIZE, true /* wired */,
            false /* !fixed */, true /* shared */, PROT_READ | PROT_WRITE,
            PROT_READ | PROT_WRITE);
        if (error) {
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }

        memset(vcpu->comm, 0, NVMM_COMM_PAGE_SIZE);

        /* Map the comm page on the user side, as pageable. */
        error = os_vmobj_map(os_curproc_map, (vaddr_t *)&args->comm,
            NVMM_COMM_PAGE_SIZE, mach->commvmobj,
            args->cpuid * NVMM_COMM_PAGE_SIZE, false /* !wired */,
            false /* !fixed */, true /* shared */, PROT_READ | PROT_WRITE,
            PROT_READ | PROT_WRITE);
        if (error) {
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }

        error = (*nvmm_impl->vcpu_create)(mach, vcpu);
        if (error) {
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }

        nvmm_vcpu_put(vcpu);
        os_atomic_inc_uint(&mach->ncpus);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_destroy(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_destroy *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_destroy)(mach, vcpu);
        nvmm_vcpu_free(mach, vcpu);
        nvmm_vcpu_put(vcpu);
        os_atomic_dec_uint(&mach->ncpus);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_configure *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        size_t allocsz;
        uint64_t op;
        void *data;
        int error;

        op = NVMM_VCPU_CONF_MD(args->op);
        if (__predict_false(op >= nvmm_impl->vcpu_conf_max))
                return EINVAL;

        allocsz = nvmm_impl->vcpu_conf_sizes[op];
        data = os_mem_alloc(allocsz);

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error) {
                os_mem_free(data, allocsz);
                return error;
        }

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error) {
                nvmm_machine_put(mach);
                os_mem_free(data, allocsz);
                return error;
        }

        error = copyin(args->conf, data, allocsz);
        if (error) {
                goto out;
        }

        error = (*nvmm_impl->vcpu_configure)(vcpu, op, data);

out:
        nvmm_vcpu_put(vcpu);
        nvmm_machine_put(mach);
        os_mem_free(data, allocsz);
        return error;
}

static int
nvmm_vcpu_setstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_setstate *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_setstate)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_getstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_getstate *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_getstate)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_inject(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_inject *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        error = (*nvmm_impl->vcpu_inject)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

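/*
 * Inner run loop. Nested page faults on guest RAM are serviced here in
 * the kernel via os_vmspace_fault() and the VCPU is resumed directly;
 * any other exit reason, a fault beyond the guest address range, or a
 * pending signal/reschedule returns control to userland.
 */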
static int
nvmm_do_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
    struct nvmm_vcpu_exit *exit)
{
        struct vmspace *vm = mach->vm;
        int ret;

        while (1) {
                /* Got a signal? Or pending resched? Leave. */
                if (__predict_false(os_return_needed())) {
                        exit->reason = NVMM_VCPU_EXIT_NONE;
                        return 0;
                }

                /* Run the VCPU. */
                ret = (*nvmm_impl->vcpu_run)(mach, vcpu, exit);
                if (__predict_false(ret != 0)) {
                        return ret;
                }

                /* Process nested page faults. */
                if (__predict_true(exit->reason != NVMM_VCPU_EXIT_MEMORY)) {
                        break;
                }
                if (exit->u.mem.gpa >= mach->gpa_end) {
                        break;
                }
                if (os_vmspace_fault(vm, exit->u.mem.gpa, exit->u.mem.prot)) {
                        break;
                }
        }

        return 0;
}

static int
nvmm_vcpu_run(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_run *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        error = nvmm_do_vcpu_run(mach, vcpu, &args->exit);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

static os_vmobj_t *
nvmm_hmapping_getvmobj(struct nvmm_machine *mach, uintptr_t hva, size_t size,
    size_t *off)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present) {
                        continue;
                }
                if (hva >= hmapping->hva &&
                    hva + size <= hmapping->hva + hmapping->size) {
                        *off = hva - hmapping->hva;
                        return hmapping->vmobj;
                }
        }

        return NULL;
}

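/*
 * Validate a host mapping request: page alignment, non-NULL hva, no
 * arithmetic overflow, and no partial overlap with an existing mapping.
 * A range falling entirely inside an existing mapping passes validation.
 */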
static int
nvmm_hmapping_validate(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
        struct nvmm_hmapping *hmapping;
        size_t i;
        uintptr_t hva_end;
        uintptr_t hmap_end;

        if ((hva % PAGE_SIZE) != 0 || (size % PAGE_SIZE) != 0) {
                return EINVAL;
        }
        if (hva == 0) {
                return EINVAL;
        }

        /*
         * Overflow tests MUST be done very carefully to prevent compiler
         * optimizations from effectively deleting the test.
         */
        hva_end = hva + size;
        if (hva_end <= hva)
                return EINVAL;

        /*
         * Overlap tests
         */
        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];

                if (!hmapping->present) {
                        continue;
                }
                hmap_end = hmapping->hva + hmapping->size;

                if (hva >= hmapping->hva && hva_end <= hmap_end)
                        break;
                if (hva >= hmapping->hva && hva < hmap_end)
                        return EEXIST;
                if (hva_end > hmapping->hva && hva_end <= hmap_end)
                        return EEXIST;
                if (hva <= hmapping->hva && hva_end >= hmap_end)
                        return EEXIST;
        }

        return 0;
}

static struct nvmm_hmapping *
nvmm_hmapping_alloc(struct nvmm_machine *mach)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present) {
                        hmapping->present = true;
                        return hmapping;
                }
        }

        return NULL;
}

static int
nvmm_hmapping_free(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present || hmapping->hva != hva ||
                    hmapping->size != size) {
                        continue;
                }

                os_vmobj_unmap(os_curproc_map, hmapping->hva,
                    hmapping->hva + hmapping->size, false);
                os_vmobj_rel(hmapping->vmobj);

                hmapping->vmobj = NULL;
                hmapping->present = false;

                return 0;
        }

        return ENOENT;
}

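/*
 * Register a host mapping: reserve an hmapping slot, back it with a
 * fresh vmobj of the requested size, and map that vmobj at the given
 * fixed address in the calling process. nvmm_gpa_map() can later map
 * the same range into the guest.
 */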
static int
nvmm_hva_map(struct nvmm_owner *owner, struct nvmm_ioc_hva_map *args)
{
        struct nvmm_machine *mach;
        struct nvmm_hmapping *hmapping;
        vaddr_t uva;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        error = nvmm_hmapping_validate(mach, args->hva, args->size);
        if (error)
                goto out;

        hmapping = nvmm_hmapping_alloc(mach);
        if (hmapping == NULL) {
                error = ENOBUFS;
                goto out;
        }

        hmapping->hva = args->hva;
        hmapping->size = args->size;
        hmapping->vmobj = os_vmobj_create(hmapping->size);
        uva = hmapping->hva;

        /* Map the vmobj into the user address space, as pageable. */
        error = os_vmobj_map(os_curproc_map, &uva, hmapping->size,
            hmapping->vmobj, 0, false /* !wired */, true /* fixed */,
            true /* shared */, PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_hva_unmap(struct nvmm_owner *owner, struct nvmm_ioc_hva_unmap *args)
{
        struct nvmm_machine *mach;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        error = nvmm_hmapping_free(mach, args->hva, args->size);

        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

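/*
 * Map guest physical memory: the hva range must fall inside a mapping
 * previously registered with nvmm_hva_map(); its backing vmobj (plus
 * offset) is then mapped at 'gpa' in the machine's vmspace, as pageable.
 */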
static int
nvmm_gpa_map(struct nvmm_owner *owner, struct nvmm_ioc_gpa_map *args)
{
        struct nvmm_machine *mach;
        os_vmobj_t *vmobj;
        gpaddr_t gpa;
        gpaddr_t gpa_end;
        size_t off;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        if ((args->prot & ~(PROT_READ|PROT_WRITE|PROT_EXEC)) != 0) {
                error = EINVAL;
                goto out;
        }

        /*
         * Overflow tests MUST be done very carefully to prevent compiler
         * optimizations from effectively deleting the test.
         */
        gpa = args->gpa;
        gpa_end = gpa + args->size;
        if (gpa_end <= gpa) {
                error = EINVAL;
                goto out;
        }

        if ((gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0 ||
            (args->hva % PAGE_SIZE) != 0) {
                error = EINVAL;
                goto out;
        }
        if (args->hva == 0) {
                error = EINVAL;
                goto out;
        }

        if (gpa < mach->gpa_begin || gpa >= mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        if (gpa_end > mach->gpa_end) {
                error = EINVAL;
                goto out;
        }

        vmobj = nvmm_hmapping_getvmobj(mach, args->hva, args->size, &off);
        if (vmobj == NULL) {
                error = EINVAL;
                goto out;
        }

        /* Map the vmobj into the machine address space, as pageable. */
        error = os_vmobj_map(&mach->vm->vm_map, &gpa, args->size, vmobj, off,
            false /* !wired */, true /* fixed */, false /* !shared */,
            args->prot, PROT_READ | PROT_WRITE | PROT_EXEC);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_gpa_unmap(struct nvmm_owner *owner, struct nvmm_ioc_gpa_unmap *args)
{
        struct nvmm_machine *mach;
        gpaddr_t gpa;
        gpaddr_t gpa_end;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        /*
         * Overflow tests MUST be done very carefully to prevent compiler
         * optimizations from effectively deleting the test.
         */
        gpa = args->gpa;
        gpa_end = gpa + args->size;
        if (gpa_end <= gpa) {
                error = EINVAL;
                goto out;
        }

        if ((gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0) {
                error = EINVAL;
                goto out;
        }
        if (gpa < mach->gpa_begin || gpa >= mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        if (gpa_end > mach->gpa_end) {
                error = EINVAL;
                goto out;
        }

        /* Unmap the memory from the machine. */
        os_vmobj_unmap(&mach->vm->vm_map, gpa, gpa + args->size, false);

out:
        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

static int
nvmm_ctl_mach_info(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
        struct nvmm_ctl_mach_info ctl;
        struct nvmm_machine *mach;
        int error;
        size_t i;

        if (args->size != sizeof(ctl))
                return EINVAL;
        error = copyin(args->data, &ctl, sizeof(ctl));
        if (error)
                return error;

        error = nvmm_machine_get(owner, ctl.machid, &mach, true);
        if (error)
                return error;

        ctl.nvcpus = mach->ncpus;

        ctl.nram = 0;
        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                if (!mach->hmap[i].present)
                        continue;
                ctl.nram += mach->hmap[i].size;
        }

        ctl.pid = mach->owner->pid;
        ctl.time = mach->time;

        nvmm_machine_put(mach);

        error = copyout(&ctl, args->data, sizeof(ctl));
        if (error)
                return error;

        return 0;
}

static int
nvmm_ctl(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
        switch (args->op) {
        case NVMM_CTL_MACH_INFO:
                return nvmm_ctl_mach_info(owner, args);
        default:
                return EINVAL;
        }
}

/* -------------------------------------------------------------------------- */

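/*
 * Backend probing: the first implementation in nvmm_impl_list whose
 * ident() callback accepts the host CPU is selected at init time.
 */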
const struct nvmm_impl *
nvmm_ident(void)
{
        size_t i;

        for (i = 0; i < __arraycount(nvmm_impl_list); i++) {
                if ((*nvmm_impl_list[i]->ident)())
                        return nvmm_impl_list[i];
        }

        return NULL;
}

int
nvmm_init(void)
{
        size_t i, n;

        nvmm_impl = nvmm_ident();
        if (nvmm_impl == NULL)
                return ENOTSUP;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                machines[i].machid = i;
                os_rwl_init(&machines[i].lock);
                for (n = 0; n < NVMM_MAX_VCPUS; n++) {
                        machines[i].cpus[n].present = false;
                        machines[i].cpus[n].cpuid = n;
                        os_mtx_init(&machines[i].cpus[n].lock);
                }
        }

        (*nvmm_impl->init)();

        return 0;
}

void
nvmm_fini(void)
{
        size_t i, n;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                os_rwl_destroy(&machines[i].lock);
                for (n = 0; n < NVMM_MAX_VCPUS; n++) {
                        os_mtx_destroy(&machines[i].cpus[n].lock);
                }
        }

        (*nvmm_impl->fini)();
        nvmm_impl = NULL;
}

/* -------------------------------------------------------------------------- */

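/*
 * Ioctl dispatcher, reached through the host-specific device front-end.
 * A typical call sequence from userland looks as follows (a sketch only;
 * consumers normally go through libnvmm, and the exact ioc structures
 * are defined in nvmm_ioctl.h):
 *
 *      NVMM_IOC_MACHINE_CREATE         create a machine, get a machid
 *      NVMM_IOC_VCPU_CREATE            create a VCPU, map its comm page
 *      NVMM_IOC_HVA_MAP                register host backing memory
 *      NVMM_IOC_GPA_MAP                expose it as guest physical memory
 *      NVMM_IOC_VCPU_RUN               run until an exit needs handling
 *      NVMM_IOC_MACHINE_DESTROY        tear everything down
 */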
int
nvmm_ioctl(struct nvmm_owner *owner, unsigned long cmd, void *data)
{
        switch (cmd) {
        case NVMM_IOC_CAPABILITY:
                return nvmm_capability(owner, data);
        case NVMM_IOC_MACHINE_CREATE:
                return nvmm_machine_create(owner, data);
        case NVMM_IOC_MACHINE_DESTROY:
                return nvmm_machine_destroy(owner, data);
        case NVMM_IOC_MACHINE_CONFIGURE:
                return nvmm_machine_configure(owner, data);
        case NVMM_IOC_VCPU_CREATE:
                return nvmm_vcpu_create(owner, data);
        case NVMM_IOC_VCPU_DESTROY:
                return nvmm_vcpu_destroy(owner, data);
        case NVMM_IOC_VCPU_CONFIGURE:
                return nvmm_vcpu_configure(owner, data);
        case NVMM_IOC_VCPU_SETSTATE:
                return nvmm_vcpu_setstate(owner, data);
        case NVMM_IOC_VCPU_GETSTATE:
                return nvmm_vcpu_getstate(owner, data);
        case NVMM_IOC_VCPU_INJECT:
                return nvmm_vcpu_inject(owner, data);
        case NVMM_IOC_VCPU_RUN:
                return nvmm_vcpu_run(owner, data);
        case NVMM_IOC_GPA_MAP:
                return nvmm_gpa_map(owner, data);
        case NVMM_IOC_GPA_UNMAP:
                return nvmm_gpa_unmap(owner, data);
        case NVMM_IOC_HVA_MAP:
                return nvmm_hva_map(owner, data);
        case NVMM_IOC_HVA_UNMAP:
                return nvmm_hva_unmap(owner, data);
        case NVMM_IOC_CTL:
                return nvmm_ctl(owner, data);
        default:
                return EINVAL;
        }
}