nvmm: Clarify the RESET state
sys/dev/virtual/nvmm/nvmm.c (dragonfly.git)
/*      $NetBSD: nvmm.c,v 1.43 2021/04/12 09:22:58 mrg Exp $    */

/*
 * Copyright (c) 2018-2020 Maxime Villard, m00nbsd.net
 * All rights reserved.
 *
 * This code is part of the NVMM hypervisor.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/conf.h>
#include <sys/devfs.h>
#include <sys/device.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/thread.h>

#include <dev/virtual/nvmm/nvmm_compat.h>
#include <dev/virtual/nvmm/nvmm.h>
#include <dev/virtual/nvmm/nvmm_internal.h>
#include <dev/virtual/nvmm/nvmm_ioctl.h>

MALLOC_DEFINE(M_NVMM, "nvmm", "NVMM data");

static struct nvmm_machine machines[NVMM_MAX_MACHINES];
static volatile unsigned int nmachines __cacheline_aligned;

static const struct nvmm_impl *nvmm_impl_list[] = {
#if defined(__x86_64__)
        &nvmm_x86_svm,  /* x86 AMD SVM */
        &nvmm_x86_vmx   /* x86 Intel VMX */
#endif
};

static const struct nvmm_impl *nvmm_impl __read_mostly = NULL;

static struct nvmm_owner root_owner;

/* -------------------------------------------------------------------------- */

/*
 * Allocate a free machine slot. On success, the machine is returned
 * write-locked; the caller releases it with nvmm_machine_put().
 */
static int
nvmm_machine_alloc(struct nvmm_machine **ret)
{
        struct nvmm_machine *mach;
        size_t i;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                mach = &machines[i];

                rw_enter(&mach->lock, RW_WRITER);
                if (mach->present) {
                        rw_exit(&mach->lock);
                        continue;
                }

                mach->present = true;
                mach->time = time_second;
                *ret = mach;
                atomic_inc_uint(&nmachines);
                return 0;
        }

        return ENOBUFS;
}

static void
nvmm_machine_free(struct nvmm_machine *mach)
{
        KASSERT(rw_write_held(&mach->lock));
        KASSERT(mach->present);
        mach->present = false;
        atomic_dec_uint(&nmachines);
}

/*
 * Look up a machine by ID and lock it, shared or exclusive. Only the
 * owner may access the machine; root_owner bypasses the ownership check.
 */
static int
nvmm_machine_get(struct nvmm_owner *owner, nvmm_machid_t machid,
    struct nvmm_machine **ret, bool writer)
{
        struct nvmm_machine *mach;
        krw_t op = writer ? RW_WRITER : RW_READER;

        if (__predict_false(machid >= NVMM_MAX_MACHINES)) {
                return EINVAL;
        }
        mach = &machines[machid];

        rw_enter(&mach->lock, op);
        if (__predict_false(!mach->present)) {
                rw_exit(&mach->lock);
                return ENOENT;
        }
        if (__predict_false(mach->owner != owner && owner != &root_owner)) {
                rw_exit(&mach->lock);
                return EPERM;
        }
        *ret = mach;

        return 0;
}

static void
nvmm_machine_put(struct nvmm_machine *mach)
{
        rw_exit(&mach->lock);
}

/* -------------------------------------------------------------------------- */

/*
 * Allocate a VCPU slot on the given machine. On success, the VCPU is
 * returned locked; the caller releases it with nvmm_vcpu_put().
 */
static int
nvmm_vcpu_alloc(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
        struct nvmm_cpu *vcpu;

        if (cpuid >= NVMM_MAX_VCPUS) {
                return EINVAL;
        }
        vcpu = &mach->cpus[cpuid];

        mutex_enter(&vcpu->lock);
        if (vcpu->present) {
                mutex_exit(&vcpu->lock);
                return EBUSY;
        }

        vcpu->present = true;
        vcpu->comm = NULL;
        vcpu->hcpu_last = -1;
        *ret = vcpu;
        return 0;
}

static void
nvmm_vcpu_free(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
{
        KASSERT(mutex_owned(&vcpu->lock));
        vcpu->present = false;
        if (vcpu->comm != NULL) {
                uvm_deallocate(kernel_map, (vaddr_t)vcpu->comm, PAGE_SIZE);
        }
}

/* Look up a VCPU and lock it; same get/put protocol as nvmm_machine_get(). */
static int
nvmm_vcpu_get(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
        struct nvmm_cpu *vcpu;

        if (__predict_false(cpuid >= NVMM_MAX_VCPUS)) {
                return EINVAL;
        }
        vcpu = &mach->cpus[cpuid];

        mutex_enter(&vcpu->lock);
        if (__predict_false(!vcpu->present)) {
                mutex_exit(&vcpu->lock);
                return ENOENT;
        }
        *ret = vcpu;

        return 0;
}

static void
nvmm_vcpu_put(struct nvmm_cpu *vcpu)
{
        mutex_exit(&vcpu->lock);
}

/* -------------------------------------------------------------------------- */

static void
nvmm_kill_machines(struct nvmm_owner *owner)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        size_t i, j;
        int error;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                mach = &machines[i];

                rw_enter(&mach->lock, RW_WRITER);
                if (!mach->present || mach->owner != owner) {
                        rw_exit(&mach->lock);
                        continue;
                }

                /* Kill it. */
                for (j = 0; j < NVMM_MAX_VCPUS; j++) {
                        error = nvmm_vcpu_get(mach, j, &vcpu);
                        if (error)
                                continue;
                        (*nvmm_impl->vcpu_destroy)(mach, vcpu);
                        nvmm_vcpu_free(mach, vcpu);
                        nvmm_vcpu_put(vcpu);
                        atomic_dec_uint(&mach->ncpus);
                }
                (*nvmm_impl->machine_destroy)(mach);
                uvmspace_free(mach->vm);

                /* Drop the kernel UOBJ refs. */
                for (j = 0; j < NVMM_MAX_HMAPPINGS; j++) {
                        if (!mach->hmap[j].present)
                                continue;
                        uao_detach(mach->hmap[j].uobj);
                }

                nvmm_machine_free(mach);

                rw_exit(&mach->lock);
        }
}

/* -------------------------------------------------------------------------- */

static int
nvmm_capability(struct nvmm_owner *owner, struct nvmm_ioc_capability *args)
{
        args->cap.version = NVMM_KERN_VERSION;
        args->cap.state_size = nvmm_impl->state_size;
        args->cap.max_machines = NVMM_MAX_MACHINES;
        args->cap.max_vcpus = NVMM_MAX_VCPUS;
        args->cap.max_ram = NVMM_MAX_RAM;

        (*nvmm_impl->capability)(&args->cap);

        return 0;
}

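/*
 * Editor's note: userland normally reaches this through libnvmm, but the
 * capability query reduces to a plain ioctl(2) on /dev/nvmm. A minimal
 * sketch, assuming the NVMM_IOC_CAPABILITY request and the ioctl argument
 * structures exported by the NVMM headers (header paths may differ per
 * platform); note that nvmm_open() below rejects opens without O_CLOEXEC:
 *
 *      #include <fcntl.h>
 *      #include <stdio.h>
 *      #include <sys/ioctl.h>
 *
 *      int fd = open("/dev/nvmm", O_RDWR | O_CLOEXEC);
 *      struct nvmm_ioc_capability args;
 *      if (fd != -1 && ioctl(fd, NVMM_IOC_CAPABILITY, &args) == 0)
 *              printf("version %u, max machines %u, max vcpus %u\n",
 *                  (unsigned)args.cap.version,
 *                  (unsigned)args.cap.max_machines,
 *                  (unsigned)args.cap.max_vcpus);
 */
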
static int
nvmm_machine_create(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_create *args)
{
        struct nvmm_machine *mach;
        int error;

        error = nvmm_machine_alloc(&mach);
        if (error)
                return error;

        /* Curproc owns the machine. */
        mach->owner = owner;

        /* Zero out the host mappings. */
        memset(&mach->hmap, 0, sizeof(mach->hmap));

        /* Create the machine vmspace. */
        mach->gpa_begin = 0;
        mach->gpa_end = NVMM_MAX_RAM;
        mach->vm = uvmspace_alloc(0, mach->gpa_end - mach->gpa_begin, false);

#ifdef __DragonFly__
        /*
         * Set PMAP_MULTI on the backing pmap for the machine.  Only
         * pmap changes to the backing pmap for the machine affect the
         * guest.  Changes to the host's pmap do not affect the guest's
         * backing pmap.
         */
        pmap_maybethreaded(&mach->vm->vm_pmap);
#endif

        /* Create the comm uobj. */
        mach->commuobj = uao_create(NVMM_MAX_VCPUS * PAGE_SIZE, 0);

        (*nvmm_impl->machine_create)(mach);

        args->machid = mach->machid;
        nvmm_machine_put(mach);

        return 0;
}

static int
nvmm_machine_destroy(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_destroy *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;
        size_t i;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        for (i = 0; i < NVMM_MAX_VCPUS; i++) {
                error = nvmm_vcpu_get(mach, i, &vcpu);
                if (error)
                        continue;

                (*nvmm_impl->vcpu_destroy)(mach, vcpu);
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                atomic_dec_uint(&mach->ncpus);
        }

        (*nvmm_impl->machine_destroy)(mach);

        /* Free the machine vmspace. */
        uvmspace_free(mach->vm);

        /* Drop the kernel UOBJ refs. */
        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                if (!mach->hmap[i].present)
                        continue;
                uao_detach(mach->hmap[i].uobj);
        }

        nvmm_machine_free(mach);
        nvmm_machine_put(mach);

        return 0;
}

static int
nvmm_machine_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_configure *args)
{
        struct nvmm_machine *mach;
        size_t allocsz;
        uint64_t op;
        void *data;
        int error;

        op = NVMM_MACH_CONF_MD(args->op);
        if (__predict_false(op >= nvmm_impl->mach_conf_max)) {
                return EINVAL;
        }

        allocsz = nvmm_impl->mach_conf_sizes[op];
        data = kmem_alloc(allocsz, KM_SLEEP);

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error) {
                kmem_free(data, allocsz);
                return error;
        }

        error = copyin(args->conf, data, allocsz);
        if (error) {
                goto out;
        }

        error = (*nvmm_impl->machine_configure)(mach, op, data);

out:
        nvmm_machine_put(mach);
        kmem_free(data, allocsz);
        return error;
}

static int
nvmm_vcpu_create(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_create *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_alloc(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        /* Allocate the comm page. */
        uao_reference(mach->commuobj);
        error = uvm_map(kernel_map, (vaddr_t *)&vcpu->comm, PAGE_SIZE,
            mach->commuobj, args->cpuid * PAGE_SIZE, 0, UVM_MAPFLAG(UVM_PROT_RW,
            UVM_PROT_RW, UVM_INH_SHARE, UVM_ADV_RANDOM, 0));
        if (error) {
                uao_detach(mach->commuobj);
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }
        error = uvm_map_pageable(kernel_map, (vaddr_t)vcpu->comm,
            (vaddr_t)vcpu->comm + PAGE_SIZE, false, 0);
        if (error) {
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }
        memset(vcpu->comm, 0, PAGE_SIZE);

        error = (*nvmm_impl->vcpu_create)(mach, vcpu);
        if (error) {
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }

        nvmm_vcpu_put(vcpu);
        atomic_inc_uint(&mach->ncpus);

out:
        nvmm_machine_put(mach);
        return error;
}

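/*
 * Editor's note: the create paths above are what libnvmm's machine/VCPU
 * setup boils down to. A minimal userland sketch, assuming the libnvmm
 * API as documented in libnvmm(3) (error handling trimmed):
 *
 *      #include <err.h>
 *      #include <nvmm.h>
 *
 *      struct nvmm_machine mach;
 *      struct nvmm_vcpu vcpu;
 *
 *      if (nvmm_init() == -1)
 *              err(1, "nvmm_init");
 *      if (nvmm_machine_create(&mach) == -1)
 *              err(1, "nvmm_machine_create");
 *      if (nvmm_vcpu_create(&mach, 0, &vcpu) == -1)
 *              err(1, "nvmm_vcpu_create");
 */
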
static int
nvmm_vcpu_destroy(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_destroy *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_destroy)(mach, vcpu);
        nvmm_vcpu_free(mach, vcpu);
        nvmm_vcpu_put(vcpu);
        atomic_dec_uint(&mach->ncpus);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_configure *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        size_t allocsz;
        uint64_t op;
        void *data;
        int error;

        op = NVMM_VCPU_CONF_MD(args->op);
        if (__predict_false(op >= nvmm_impl->vcpu_conf_max))
                return EINVAL;

        allocsz = nvmm_impl->vcpu_conf_sizes[op];
        data = kmem_alloc(allocsz, KM_SLEEP);

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error) {
                kmem_free(data, allocsz);
                return error;
        }

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error) {
                nvmm_machine_put(mach);
                kmem_free(data, allocsz);
                return error;
        }

        error = copyin(args->conf, data, allocsz);
        if (error) {
                goto out;
        }

        error = (*nvmm_impl->vcpu_configure)(vcpu, op, data);

out:
        nvmm_vcpu_put(vcpu);
        nvmm_machine_put(mach);
        kmem_free(data, allocsz);
        return error;
}

static int
nvmm_vcpu_setstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_setstate *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_setstate)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_getstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_getstate *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_getstate)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_inject(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_inject *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        error = (*nvmm_impl->vcpu_inject)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_do_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
    struct nvmm_vcpu_exit *exit)
{
        struct vmspace *vm = mach->vm;
        int ret;

        while (1) {
                /* Got a signal? Or pending resched? Leave. */
                if (__predict_false(nvmm_return_needed())) {
                        exit->reason = NVMM_VCPU_EXIT_NONE;
                        return 0;
                }

                /* Run the VCPU. */
                ret = (*nvmm_impl->vcpu_run)(mach, vcpu, exit);
                if (__predict_false(ret != 0)) {
                        return ret;
                }

                /* Process nested page faults. */
                if (__predict_true(exit->reason != NVMM_VCPU_EXIT_MEMORY)) {
                        break;
                }
                if (exit->u.mem.gpa >= mach->gpa_end) {
                        break;
                }
                if (uvm_fault(&vm->vm_map, exit->u.mem.gpa, exit->u.mem.prot)) {
                        break;
                }
        }

        return 0;
}

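/*
 * Editor's note: when the loop above breaks out, the exit reason is
 * forwarded to userland, whose own run loop is expected to handle it and
 * re-enter. A minimal sketch of that consumer side, assuming the libnvmm
 * run API and exit-reason constants (NVMM_VCPU_EXIT_NONE simply means
 * "re-enter": a signal or resched was pending, see nvmm_return_needed()):
 *
 *      while (1) {
 *              if (nvmm_vcpu_run(&mach, &vcpu) == -1)
 *                      err(1, "nvmm_vcpu_run");
 *              switch (vcpu.exit->reason) {
 *              case NVMM_VCPU_EXIT_NONE:
 *                      break;                          (kernel bounced us; retry)
 *              case NVMM_VCPU_EXIT_IO:
 *                      nvmm_assist_io(&mach, &vcpu);   (port I/O emulation)
 *                      break;
 *              case NVMM_VCPU_EXIT_MEMORY:
 *                      nvmm_assist_mem(&mach, &vcpu);  (MMIO emulation)
 *                      break;
 *              default:
 *                      errx(1, "unhandled exit %llu",
 *                          (unsigned long long)vcpu.exit->reason);
 *              }
 *      }
 */
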
static int
nvmm_vcpu_run(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_run *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        error = nvmm_do_vcpu_run(mach, vcpu, &args->exit);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

static struct uvm_object *
nvmm_hmapping_getuobj(struct nvmm_machine *mach, uintptr_t hva, size_t size,
    size_t *off)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present) {
                        continue;
                }
                if (hva >= hmapping->hva &&
                    hva + size <= hmapping->hva + hmapping->size) {
                        *off = hva - hmapping->hva;
                        return hmapping->uobj;
                }
        }

        return NULL;
}

static int
nvmm_hmapping_validate(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        if ((hva % PAGE_SIZE) != 0 || (size % PAGE_SIZE) != 0) {
                return EINVAL;
        }
        if (hva == 0) {
                return EINVAL;
        }

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present) {
                        continue;
                }

                /* Fully contained in an existing mapping: accept. */
                if (hva >= hmapping->hva &&
                    hva + size <= hmapping->hva + hmapping->size) {
                        break;
                }

                /* Starts inside an existing mapping. */
                if (hva >= hmapping->hva &&
                    hva < hmapping->hva + hmapping->size) {
                        return EEXIST;
                }
                /* Ends inside an existing mapping. */
                if (hva + size > hmapping->hva &&
                    hva + size <= hmapping->hva + hmapping->size) {
                        return EEXIST;
                }
                /* Fully contains an existing mapping. */
                if (hva <= hmapping->hva &&
                    hva + size >= hmapping->hva + hmapping->size) {
                        return EEXIST;
                }
        }

        return 0;
}

static struct nvmm_hmapping *
nvmm_hmapping_alloc(struct nvmm_machine *mach)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present) {
                        hmapping->present = true;
                        return hmapping;
                }
        }

        return NULL;
}

static int
nvmm_hmapping_free(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
        struct vmspace *vmspace = curproc->p_vmspace;
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present || hmapping->hva != hva ||
                    hmapping->size != size) {
                        continue;
                }

                uvm_unmap(&vmspace->vm_map, hmapping->hva,
                    hmapping->hva + hmapping->size);
                uao_detach(hmapping->uobj);

                hmapping->uobj = NULL;
                hmapping->present = false;

                return 0;
        }

        return ENOENT;
}

static int
nvmm_hva_map(struct nvmm_owner *owner, struct nvmm_ioc_hva_map *args)
{
        struct vmspace *vmspace = curproc->p_vmspace;
        struct nvmm_machine *mach;
        struct nvmm_hmapping *hmapping;
        vaddr_t uva;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        error = nvmm_hmapping_validate(mach, args->hva, args->size);
        if (error)
                goto out;

        hmapping = nvmm_hmapping_alloc(mach);
        if (hmapping == NULL) {
                error = ENOBUFS;
                goto out;
        }

        hmapping->hva = args->hva;
        hmapping->size = args->size;
        hmapping->uobj = uao_create(hmapping->size, 0);
        uva = hmapping->hva;

        /* Take a reference for the user. */
        uao_reference(hmapping->uobj);

        /* Map the uobj into the user address space, as pageable. */
        error = uvm_map(&vmspace->vm_map, &uva, hmapping->size, hmapping->uobj,
            0, 0, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_SHARE,
            UVM_ADV_RANDOM, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
        if (error) {
                uao_detach(hmapping->uobj);
        }

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_hva_unmap(struct nvmm_owner *owner, struct nvmm_ioc_hva_unmap *args)
{
        struct nvmm_machine *mach;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        error = nvmm_hmapping_free(mach, args->hva, args->size);

        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

static int
nvmm_gpa_map(struct nvmm_owner *owner, struct nvmm_ioc_gpa_map *args)
{
        struct nvmm_machine *mach;
        struct uvm_object *uobj;
        gpaddr_t gpa;
        size_t off;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        if ((args->prot & ~(PROT_READ|PROT_WRITE|PROT_EXEC)) != 0) {
                error = EINVAL;
                goto out;
        }

        if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0 ||
            (args->hva % PAGE_SIZE) != 0) {
                error = EINVAL;
                goto out;
        }
        if (args->hva == 0) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa + args->size <= args->gpa) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa + args->size > mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        gpa = args->gpa;

        uobj = nvmm_hmapping_getuobj(mach, args->hva, args->size, &off);
        if (uobj == NULL) {
                error = EINVAL;
                goto out;
        }

        /* Take a reference for the machine. */
        uao_reference(uobj);

        /* Map the uobj into the machine address space, as pageable. */
        error = uvm_map(&mach->vm->vm_map, &gpa, args->size, uobj, off, 0,
            UVM_MAPFLAG(args->prot, UVM_PROT_RWX, UVM_INH_NONE,
            UVM_ADV_RANDOM, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
        if (error) {
                uao_detach(uobj);
                goto out;
        }
        if (gpa != args->gpa) {
                uao_detach(uobj);
                printf("[!] uvm_map problem\n");
                error = EINVAL;
                goto out;
        }

out:
        nvmm_machine_put(mach);
        return error;
}

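/*
 * Editor's note: hva_map and gpa_map work as a pair: the backing uobj is
 * created at hva_map time, and gpa_map wires a window of it into the guest
 * physical space. A minimal userland sketch of the usual sequence, assuming
 * the libnvmm wrappers (the host range is typically reserved with an
 * anonymous mmap first, since the kernel maps over it with
 * UVM_FLAG_FIXED|UVM_FLAG_UNMAP):
 *
 *      #include <sys/mman.h>
 *      #include <nvmm.h>
 *
 *      size_t size = 128 * 1024 * 1024;        (128 MiB of guest RAM)
 *      uintptr_t hva = (uintptr_t)mmap(NULL, size, PROT_READ | PROT_WRITE,
 *          MAP_ANON | MAP_PRIVATE, -1, 0);
 *
 *      nvmm_hva_map(&mach, hva, size);         (create + map the uobj)
 *      nvmm_gpa_map(&mach, hva, 0, size, PROT_READ | PROT_WRITE | PROT_EXEC);
 *
 * After this, stores through hva are visible to the guest at GPA 0.
 */
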
static int
nvmm_gpa_unmap(struct nvmm_owner *owner, struct nvmm_ioc_gpa_unmap *args)
{
        struct nvmm_machine *mach;
        gpaddr_t gpa;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa + args->size <= args->gpa) {
                error = EINVAL;
                goto out;
        }
        /*
         * Reject ranges past gpa_end. This mirrors the bound check in
         * nvmm_gpa_map(); with '>=' here, a mapping ending exactly at
         * gpa_end could be created but never unmapped.
         */
        if (args->gpa + args->size > mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        gpa = args->gpa;

        /* Unmap the memory from the machine. */
        uvm_unmap(&mach->vm->vm_map, gpa, gpa + args->size);

out:
        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

static int
nvmm_ctl_mach_info(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
        struct nvmm_ctl_mach_info ctl;
        struct nvmm_machine *mach;
        int error;
        size_t i;

        if (args->size != sizeof(ctl))
                return EINVAL;
        error = copyin(args->data, &ctl, sizeof(ctl));
        if (error)
                return error;

        error = nvmm_machine_get(owner, ctl.machid, &mach, true);
        if (error)
                return error;

        ctl.nvcpus = mach->ncpus;

        ctl.nram = 0;
        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                if (!mach->hmap[i].present)
                        continue;
                ctl.nram += mach->hmap[i].size;
        }

        ctl.pid = mach->owner->pid;
        ctl.time = mach->time;

        nvmm_machine_put(mach);

        error = copyout(&ctl, args->data, sizeof(ctl));
        if (error)
                return error;

        return 0;
}

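/*
 * Editor's note: a minimal sketch of the matching userland query, grounded
 * in the copyin/copyout protocol above (the nvmmctl utility performs the
 * same dance; structure layouts come from the NVMM headers):
 *
 *      struct nvmm_ctl_mach_info info = { .machid = machid };
 *      struct nvmm_ioc_ctl args = {
 *              .op = NVMM_CTL_MACH_INFO,
 *              .data = &info,
 *              .size = sizeof(info),
 *      };
 *      if (ioctl(fd, NVMM_IOC_CTL, &args) == 0)
 *              printf("%u vcpus, %zu bytes of RAM\n",
 *                  (unsigned)info.nvcpus, (size_t)info.nram);
 */
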
static int
nvmm_ctl(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
        switch (args->op) {
        case NVMM_CTL_MACH_INFO:
                return nvmm_ctl_mach_info(owner, args);
        default:
                return EINVAL;
        }
}

/* -------------------------------------------------------------------------- */

static const struct nvmm_impl *
nvmm_ident(void)
{
        size_t i;

        for (i = 0; i < __arraycount(nvmm_impl_list); i++) {
                if ((*nvmm_impl_list[i]->ident)())
                        return nvmm_impl_list[i];
        }

        return NULL;
}

static int
nvmm_init(void)
{
        size_t i, n;

        nvmm_impl = nvmm_ident();
        if (nvmm_impl == NULL)
                return ENOTSUP;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                machines[i].machid = i;
                rw_init(&machines[i].lock);
                for (n = 0; n < NVMM_MAX_VCPUS; n++) {
                        machines[i].cpus[n].present = false;
                        machines[i].cpus[n].cpuid = n;
                        mutex_init(&machines[i].cpus[n].lock, MUTEX_DEFAULT,
                            IPL_NONE);
                }
        }

        (*nvmm_impl->init)();

        return 0;
}

static void
nvmm_fini(void)
{
        size_t i, n;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                rw_destroy(&machines[i].lock);
                for (n = 0; n < NVMM_MAX_VCPUS; n++) {
                        mutex_destroy(&machines[i].cpus[n].lock);
                }
        }

        (*nvmm_impl->fini)();
        nvmm_impl = NULL;
}

/* -------------------------------------------------------------------------- */

static d_open_t nvmm_open;
static d_ioctl_t nvmm_ioctl;
static d_mmap_single_t nvmm_mmap_single;
static d_priv_dtor_t nvmm_dtor;

static struct dev_ops nvmm_ops = {
        { "nvmm", 0, D_MPSAFE },
        .d_open         = nvmm_open,
        .d_ioctl        = nvmm_ioctl,
        .d_mmap_single  = nvmm_mmap_single,
};

static int
nvmm_open(struct dev_open_args *ap)
{
        int flags = ap->a_oflags;
        struct nvmm_owner *owner;
        struct file *fp;
        int error;

        if (__predict_false(nvmm_impl == NULL))
                return ENXIO;
        if (!(flags & O_CLOEXEC))
                return EINVAL;

        if (priv_check_cred(ap->a_cred, PRIV_ROOT, 0) == 0) {
                owner = &root_owner;
        } else {
                owner = kmem_alloc(sizeof(*owner), KM_SLEEP);
                owner->pid = curthread->td_proc->p_pid;
        }

        fp = ap->a_fpp ? *ap->a_fpp : NULL;
        error = devfs_set_cdevpriv(fp, owner, nvmm_dtor);
        if (error) {
                nvmm_dtor(owner);
                return error;
        }

        return 0;
}

static void
nvmm_dtor(void *arg)
{
        struct nvmm_owner *owner = arg;

        KASSERT(owner != NULL);
        nvmm_kill_machines(owner);
        if (owner != &root_owner) {
                kmem_free(owner, sizeof(*owner));
        }
}

static int
nvmm_mmap_single(struct dev_mmap_single_args *ap)
{
        vm_ooffset_t *offp = ap->a_offset;
        size_t size = ap->a_size;
        int prot = ap->a_nprot;
        struct vm_object **uobjp = ap->a_object;
        struct file *fp = ap->a_fp;
        struct nvmm_owner *owner = NULL;
        struct nvmm_machine *mach;
        nvmm_machid_t machid;
        nvmm_cpuid_t cpuid;
        int error;

        devfs_get_cdevpriv(fp, (void **)&owner);
        KASSERT(owner != NULL);

        if (prot & PROT_EXEC)
                return EACCES;
        if (size != PAGE_SIZE)
                return EINVAL;

        cpuid = NVMM_COMM_CPUID(*offp);
        if (__predict_false(cpuid >= NVMM_MAX_VCPUS))
                return EINVAL;

        machid = NVMM_COMM_MACHID(*offp);
        error = nvmm_machine_get(owner, machid, &mach, false);
        if (error)
                return error;

        uao_reference(mach->commuobj);
        *uobjp = mach->commuobj;
        *offp = cpuid * PAGE_SIZE;

        nvmm_machine_put(mach);
        return 0;
}

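/*
 * Editor's note: userland obtains a VCPU's comm page by mmap'ing /dev/nvmm
 * with the machine and VCPU IDs encoded in the file offset, which the
 * NVMM_COMM_MACHID()/NVMM_COMM_CPUID() macros above decode. A minimal
 * sketch, assuming a matching encoder macro in the NVMM headers (the name
 * NVMM_COMM_OFF is illustrative; check nvmm.h for the exact definition):
 *
 *      struct nvmm_comm_page *comm = mmap(NULL, PAGE_SIZE,
 *          PROT_READ | PROT_WRITE, MAP_SHARED, fd,
 *          NVMM_COMM_OFF(machid, cpuid));
 *
 * libnvmm does this internally at VCPU creation time and exposes the page
 * through the nvmm_vcpu state pointers.
 */
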
static int
nvmm_ioctl(struct dev_ioctl_args *ap)
{
        unsigned long cmd = ap->a_cmd;
        void *data = ap->a_data;
        struct file *fp = ap->a_fp;
        struct nvmm_owner *owner = NULL;

        devfs_get_cdevpriv(fp, (void **)&owner);
        KASSERT(owner != NULL);

        switch (cmd) {
        case NVMM_IOC_CAPABILITY:
                return nvmm_capability(owner, data);
        case NVMM_IOC_MACHINE_CREATE:
                return nvmm_machine_create(owner, data);
        case NVMM_IOC_MACHINE_DESTROY:
                return nvmm_machine_destroy(owner, data);
        case NVMM_IOC_MACHINE_CONFIGURE:
                return nvmm_machine_configure(owner, data);
        case NVMM_IOC_VCPU_CREATE:
                return nvmm_vcpu_create(owner, data);
        case NVMM_IOC_VCPU_DESTROY:
                return nvmm_vcpu_destroy(owner, data);
        case NVMM_IOC_VCPU_CONFIGURE:
                return nvmm_vcpu_configure(owner, data);
        case NVMM_IOC_VCPU_SETSTATE:
                return nvmm_vcpu_setstate(owner, data);
        case NVMM_IOC_VCPU_GETSTATE:
                return nvmm_vcpu_getstate(owner, data);
        case NVMM_IOC_VCPU_INJECT:
                return nvmm_vcpu_inject(owner, data);
        case NVMM_IOC_VCPU_RUN:
                return nvmm_vcpu_run(owner, data);
        case NVMM_IOC_GPA_MAP:
                return nvmm_gpa_map(owner, data);
        case NVMM_IOC_GPA_UNMAP:
                return nvmm_gpa_unmap(owner, data);
        case NVMM_IOC_HVA_MAP:
                return nvmm_hva_map(owner, data);
        case NVMM_IOC_HVA_UNMAP:
                return nvmm_hva_unmap(owner, data);
        case NVMM_IOC_CTL:
                return nvmm_ctl(owner, data);
        default:
                return EINVAL;
        }
}

/* -------------------------------------------------------------------------- */

static int
nvmm_attach(void)
{
        int error;

        error = nvmm_init();
        if (error)
                panic("%s: impossible", __func__);
        printf("nvmm: attached, using backend %s\n", nvmm_impl->name);

        return 0;
}

static int
nvmm_detach(void)
{
        if (atomic_load_acq_int(&nmachines) > 0)
                return EBUSY;

        nvmm_fini();
        return 0;
}

static int
nvmm_modevent(module_t mod __unused, int type, void *data __unused)
{
        static cdev_t dev = NULL;
        int error;

        switch (type) {
        case MOD_LOAD:
                if (nvmm_ident() == NULL) {
                        printf("nvmm: cpu not supported\n");
                        return ENOTSUP;
                }
                error = nvmm_attach();
                if (error)
                        return error;

                dev = make_dev(&nvmm_ops, 0, UID_ROOT, GID_NVMM, 0660, "nvmm");
                if (dev == NULL) {
                        printf("nvmm: unable to create device\n");
                        error = ENOMEM;
                }
                break;

        case MOD_UNLOAD:
                if (dev == NULL)
                        return 0;
                error = nvmm_detach();
                if (error == 0)
                        destroy_dev(dev);
                break;

        case MOD_SHUTDOWN:
                error = 0;
                break;

        default:
                error = EOPNOTSUPP;
                break;
        }

        return error;
}

static moduledata_t nvmm_moddata = {
        .name = "nvmm",
        .evhand = nvmm_modevent,
        .priv = NULL,
};

DECLARE_MODULE(nvmm, nvmm_moddata, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(nvmm, NVMM_KERN_VERSION);