nvmm: Clarify the RESET state
sys/dev/virtual/nvmm/nvmm.c (dragonfly.git)
/*      $NetBSD: nvmm.c,v 1.43 2021/04/12 09:22:58 mrg Exp $    */

/*
 * Copyright (c) 2018-2020 Maxime Villard, m00nbsd.net
 * All rights reserved.
 *
 * This code is part of the NVMM hypervisor.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/conf.h>
#include <sys/devfs.h>
#include <sys/device.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/thread.h>

#include <dev/virtual/nvmm/nvmm_compat.h>
#include <dev/virtual/nvmm/nvmm.h>
#include <dev/virtual/nvmm/nvmm_internal.h>
#include <dev/virtual/nvmm/nvmm_ioctl.h>

MALLOC_DEFINE(M_NVMM, "nvmm", "NVMM data");

static struct nvmm_machine machines[NVMM_MAX_MACHINES];
static volatile unsigned int nmachines __cacheline_aligned;

static const struct nvmm_impl *nvmm_impl_list[] = {
#if defined(__x86_64__)
        &nvmm_x86_svm,  /* x86 AMD SVM */
        &nvmm_x86_vmx   /* x86 Intel VMX */
#endif
};

static const struct nvmm_impl *nvmm_impl __read_mostly = NULL;

static struct nvmm_owner root_owner;

/* -------------------------------------------------------------------------- */

/*
 * Allocate a free machine slot. On success, the machine is returned
 * write-locked; the caller releases it with nvmm_machine_put().
 */
static int
nvmm_machine_alloc(struct nvmm_machine **ret)
{
        struct nvmm_machine *mach;
        size_t i;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                mach = &machines[i];

                rw_enter(&mach->lock, RW_WRITER);
                if (mach->present) {
                        rw_exit(&mach->lock);
                        continue;
                }

                mach->present = true;
                mach->time = time_second;
                *ret = mach;
                atomic_inc_uint(&nmachines);
                return 0;
        }

        return ENOBUFS;
}

static void
nvmm_machine_free(struct nvmm_machine *mach)
{
        KASSERT(rw_write_held(&mach->lock));
        KASSERT(mach->present);
        mach->present = false;
        atomic_dec_uint(&nmachines);
}

/*
 * Look up a machine by ID and lock it, shared or exclusive. Only the
 * owner may access the machine; root_owner bypasses the ownership check.
 */
static int
nvmm_machine_get(struct nvmm_owner *owner, nvmm_machid_t machid,
    struct nvmm_machine **ret, bool writer)
{
        struct nvmm_machine *mach;
        krw_t op = writer ? RW_WRITER : RW_READER;

        if (__predict_false(machid >= NVMM_MAX_MACHINES)) {
                return EINVAL;
        }
        mach = &machines[machid];

        rw_enter(&mach->lock, op);
        if (__predict_false(!mach->present)) {
                rw_exit(&mach->lock);
                return ENOENT;
        }
        if (__predict_false(mach->owner != owner && owner != &root_owner)) {
                rw_exit(&mach->lock);
                return EPERM;
        }
        *ret = mach;

        return 0;
}

static void
nvmm_machine_put(struct nvmm_machine *mach)
{
        rw_exit(&mach->lock);
}

/* -------------------------------------------------------------------------- */

/*
 * Allocate a VCPU slot on the given machine. On success, the VCPU is
 * returned locked; the caller releases it with nvmm_vcpu_put().
 */
static int
nvmm_vcpu_alloc(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
        struct nvmm_cpu *vcpu;

        if (cpuid >= NVMM_MAX_VCPUS) {
                return EINVAL;
        }
        vcpu = &mach->cpus[cpuid];

        mutex_enter(&vcpu->lock);
        if (vcpu->present) {
                mutex_exit(&vcpu->lock);
                return EBUSY;
        }

        vcpu->present = true;
        vcpu->comm = NULL;
        vcpu->hcpu_last = -1;
        *ret = vcpu;
        return 0;
}

static void
nvmm_vcpu_free(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
{
        KASSERT(mutex_owned(&vcpu->lock));
        vcpu->present = false;
        if (vcpu->comm != NULL) {
                uvm_deallocate(kernel_map, (vaddr_t)vcpu->comm, PAGE_SIZE);
        }
}

/* Look up a VCPU and lock it; same get/put protocol as nvmm_machine_get(). */
static int
nvmm_vcpu_get(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
        struct nvmm_cpu *vcpu;

        if (__predict_false(cpuid >= NVMM_MAX_VCPUS)) {
                return EINVAL;
        }
        vcpu = &mach->cpus[cpuid];

        mutex_enter(&vcpu->lock);
        if (__predict_false(!vcpu->present)) {
                mutex_exit(&vcpu->lock);
                return ENOENT;
        }
        *ret = vcpu;

        return 0;
}

static void
nvmm_vcpu_put(struct nvmm_cpu *vcpu)
{
        mutex_exit(&vcpu->lock);
}

/* -------------------------------------------------------------------------- */

static void
nvmm_kill_machines(struct nvmm_owner *owner)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        size_t i, j;
        int error;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                mach = &machines[i];

                rw_enter(&mach->lock, RW_WRITER);
                if (!mach->present || mach->owner != owner) {
                        rw_exit(&mach->lock);
                        continue;
                }

                /* Kill it. */
                for (j = 0; j < NVMM_MAX_VCPUS; j++) {
                        error = nvmm_vcpu_get(mach, j, &vcpu);
                        if (error)
                                continue;
                        (*nvmm_impl->vcpu_destroy)(mach, vcpu);
                        nvmm_vcpu_free(mach, vcpu);
                        nvmm_vcpu_put(vcpu);
                        atomic_dec_uint(&mach->ncpus);
                }
                (*nvmm_impl->machine_destroy)(mach);
                uvmspace_free(mach->vm);

                /* Drop the kernel UOBJ refs. */
                for (j = 0; j < NVMM_MAX_HMAPPINGS; j++) {
                        if (!mach->hmap[j].present)
                                continue;
                        uao_detach(mach->hmap[j].uobj);
                }

                nvmm_machine_free(mach);

                rw_exit(&mach->lock);
        }
}

/* -------------------------------------------------------------------------- */

static int
nvmm_capability(struct nvmm_owner *owner, struct nvmm_ioc_capability *args)
{
        args->cap.version = NVMM_KERN_VERSION;
        args->cap.state_size = nvmm_impl->state_size;
        args->cap.max_machines = NVMM_MAX_MACHINES;
        args->cap.max_vcpus = NVMM_MAX_VCPUS;
        args->cap.max_ram = NVMM_MAX_RAM;

        (*nvmm_impl->capability)(&args->cap);

        return 0;
}

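/*
 * Editor's note: userland normally reaches this through libnvmm, but the
 * capability query reduces to a plain ioctl(2) on /dev/nvmm. A minimal
 * sketch, assuming the NVMM_IOC_CAPABILITY request and the ioctl argument
 * structures exported by the NVMM headers (header paths may differ per
 * platform); note that nvmm_open() below rejects opens without O_CLOEXEC:
 *
 *      #include <fcntl.h>
 *      #include <stdio.h>
 *      #include <sys/ioctl.h>
 *
 *      int fd = open("/dev/nvmm", O_RDWR | O_CLOEXEC);
 *      struct nvmm_ioc_capability args;
 *      if (fd != -1 && ioctl(fd, NVMM_IOC_CAPABILITY, &args) == 0)
 *              printf("version %u, max machines %u, max vcpus %u\n",
 *                  (unsigned)args.cap.version,
 *                  (unsigned)args.cap.max_machines,
 *                  (unsigned)args.cap.max_vcpus);
 */
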
static int
nvmm_machine_create(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_create *args)
{
        struct nvmm_machine *mach;
        int error;

        error = nvmm_machine_alloc(&mach);
        if (error)
                return error;

        /* Curproc owns the machine. */
        mach->owner = owner;

        /* Zero out the host mappings. */
        memset(&mach->hmap, 0, sizeof(mach->hmap));

        /* Create the machine vmspace. */
        mach->gpa_begin = 0;
        mach->gpa_end = NVMM_MAX_RAM;
        mach->vm = uvmspace_alloc(0, mach->gpa_end - mach->gpa_begin, false);

#ifdef __DragonFly__
        /*
         * Set PMAP_MULTI on the backing pmap for the machine.  Only
         * pmap changes to the backing pmap for the machine affect the
         * guest.  Changes to the host's pmap do not affect the guest's
         * backing pmap.
         */
        pmap_maybethreaded(&mach->vm->vm_pmap);
#endif

        /* Create the comm uobj. */
        mach->commuobj = uao_create(NVMM_MAX_VCPUS * PAGE_SIZE, 0);

        (*nvmm_impl->machine_create)(mach);

        args->machid = mach->machid;
        nvmm_machine_put(mach);

        return 0;
}

static int
nvmm_machine_destroy(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_destroy *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;
        size_t i;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        for (i = 0; i < NVMM_MAX_VCPUS; i++) {
                error = nvmm_vcpu_get(mach, i, &vcpu);
                if (error)
                        continue;

                (*nvmm_impl->vcpu_destroy)(mach, vcpu);
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                atomic_dec_uint(&mach->ncpus);
        }

        (*nvmm_impl->machine_destroy)(mach);

        /* Free the machine vmspace. */
        uvmspace_free(mach->vm);

        /* Drop the kernel UOBJ refs. */
        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                if (!mach->hmap[i].present)
                        continue;
                uao_detach(mach->hmap[i].uobj);
        }

        nvmm_machine_free(mach);
        nvmm_machine_put(mach);

        return 0;
}

static int
nvmm_machine_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_configure *args)
{
        struct nvmm_machine *mach;
        size_t allocsz;
        uint64_t op;
        void *data;
        int error;

        op = NVMM_MACH_CONF_MD(args->op);
        if (__predict_false(op >= nvmm_impl->mach_conf_max)) {
                return EINVAL;
        }

        allocsz = nvmm_impl->mach_conf_sizes[op];
        data = kmem_alloc(allocsz, KM_SLEEP);

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error) {
                kmem_free(data, allocsz);
                return error;
        }

        error = copyin(args->conf, data, allocsz);
        if (error) {
                goto out;
        }

        error = (*nvmm_impl->machine_configure)(mach, op, data);

out:
        nvmm_machine_put(mach);
        kmem_free(data, allocsz);
        return error;
}

static int
nvmm_vcpu_create(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_create *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_alloc(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        /* Allocate the comm page. */
        uao_reference(mach->commuobj);
        error = uvm_map(kernel_map, (vaddr_t *)&vcpu->comm, PAGE_SIZE,
            mach->commuobj, args->cpuid * PAGE_SIZE, 0, UVM_MAPFLAG(UVM_PROT_RW,
            UVM_PROT_RW, UVM_INH_SHARE, UVM_ADV_RANDOM, 0));
        if (error) {
                uao_detach(mach->commuobj);
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }
        error = uvm_map_pageable(kernel_map, (vaddr_t)vcpu->comm,
            (vaddr_t)vcpu->comm + PAGE_SIZE, false, 0);
        if (error) {
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }
        memset(vcpu->comm, 0, PAGE_SIZE);

        error = (*nvmm_impl->vcpu_create)(mach, vcpu);
        if (error) {
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }

        nvmm_vcpu_put(vcpu);
        atomic_inc_uint(&mach->ncpus);

out:
        nvmm_machine_put(mach);
        return error;
}

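/*
 * Editor's note: the create paths above are what libnvmm's machine/VCPU
 * setup boils down to. A minimal userland sketch, assuming the libnvmm
 * API as documented in libnvmm(3) (error handling trimmed):
 *
 *      #include <err.h>
 *      #include <nvmm.h>
 *
 *      struct nvmm_machine mach;
 *      struct nvmm_vcpu vcpu;
 *
 *      if (nvmm_init() == -1)
 *              err(1, "nvmm_init");
 *      if (nvmm_machine_create(&mach) == -1)
 *              err(1, "nvmm_machine_create");
 *      if (nvmm_vcpu_create(&mach, 0, &vcpu) == -1)
 *              err(1, "nvmm_vcpu_create");
 */
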
static int
nvmm_vcpu_destroy(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_destroy *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_destroy)(mach, vcpu);
        nvmm_vcpu_free(mach, vcpu);
        nvmm_vcpu_put(vcpu);
        atomic_dec_uint(&mach->ncpus);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_configure *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        size_t allocsz;
        uint64_t op;
        void *data;
        int error;

        op = NVMM_VCPU_CONF_MD(args->op);
        if (__predict_false(op >= nvmm_impl->vcpu_conf_max))
                return EINVAL;

        allocsz = nvmm_impl->vcpu_conf_sizes[op];
        data = kmem_alloc(allocsz, KM_SLEEP);

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error) {
                kmem_free(data, allocsz);
                return error;
        }

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error) {
                nvmm_machine_put(mach);
                kmem_free(data, allocsz);
                return error;
        }

        error = copyin(args->conf, data, allocsz);
        if (error) {
                goto out;
        }

        error = (*nvmm_impl->vcpu_configure)(vcpu, op, data);

out:
        nvmm_vcpu_put(vcpu);
        nvmm_machine_put(mach);
        kmem_free(data, allocsz);
        return error;
}

static int
nvmm_vcpu_setstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_setstate *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_setstate)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_getstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_getstate *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_getstate)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_inject(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_inject *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        error = (*nvmm_impl->vcpu_inject)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_do_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
    struct nvmm_vcpu_exit *exit)
{
        struct vmspace *vm = mach->vm;
        int ret;

        while (1) {
                /* Got a signal? Or pending resched? Leave. */
                if (__predict_false(nvmm_return_needed())) {
                        exit->reason = NVMM_VCPU_EXIT_NONE;
                        return 0;
                }

                /* Run the VCPU. */
                ret = (*nvmm_impl->vcpu_run)(mach, vcpu, exit);
                if (__predict_false(ret != 0)) {
                        return ret;
                }

                /* Process nested page faults. */
                if (__predict_true(exit->reason != NVMM_VCPU_EXIT_MEMORY)) {
                        break;
                }
                if (exit->u.mem.gpa >= mach->gpa_end) {
                        break;
                }
                if (uvm_fault(&vm->vm_map, exit->u.mem.gpa, exit->u.mem.prot)) {
                        break;
                }
        }

        return 0;
}

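/*
 * Editor's note: when the loop above breaks out, the exit reason is
 * forwarded to userland, whose own run loop is expected to handle it and
 * re-enter. A minimal sketch of that consumer side, assuming the libnvmm
 * run API and exit-reason constants (NVMM_VCPU_EXIT_NONE simply means
 * "re-enter": a signal or resched was pending, see nvmm_return_needed()):
 *
 *      while (1) {
 *              if (nvmm_vcpu_run(&mach, &vcpu) == -1)
 *                      err(1, "nvmm_vcpu_run");
 *              switch (vcpu.exit->reason) {
 *              case NVMM_VCPU_EXIT_NONE:
 *                      break;                          (kernel bounced us; retry)
 *              case NVMM_VCPU_EXIT_IO:
 *                      nvmm_assist_io(&mach, &vcpu);   (port I/O emulation)
 *                      break;
 *              case NVMM_VCPU_EXIT_MEMORY:
 *                      nvmm_assist_mem(&mach, &vcpu);  (MMIO emulation)
 *                      break;
 *              default:
 *                      errx(1, "unhandled exit %llu",
 *                          (unsigned long long)vcpu.exit->reason);
 *              }
 *      }
 */
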
static int
nvmm_vcpu_run(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_run *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        error = nvmm_do_vcpu_run(mach, vcpu, &args->exit);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

static struct uvm_object *
nvmm_hmapping_getuobj(struct nvmm_machine *mach, uintptr_t hva, size_t size,
    size_t *off)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present) {
                        continue;
                }
                if (hva >= hmapping->hva &&
                    hva + size <= hmapping->hva + hmapping->size) {
                        *off = hva - hmapping->hva;
                        return hmapping->uobj;
                }
        }

        return NULL;
}

static int
nvmm_hmapping_validate(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        if ((hva % PAGE_SIZE) != 0 || (size % PAGE_SIZE) != 0) {
                return EINVAL;
        }
        if (hva == 0) {
                return EINVAL;
        }

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present) {
                        continue;
                }

                /* Fully contained in an existing mapping: accept. */
                if (hva >= hmapping->hva &&
                    hva + size <= hmapping->hva + hmapping->size) {
                        break;
                }

                /* Starts inside an existing mapping. */
                if (hva >= hmapping->hva &&
                    hva < hmapping->hva + hmapping->size) {
                        return EEXIST;
                }
                /* Ends inside an existing mapping. */
                if (hva + size > hmapping->hva &&
                    hva + size <= hmapping->hva + hmapping->size) {
                        return EEXIST;
                }
                /* Fully contains an existing mapping. */
                if (hva <= hmapping->hva &&
                    hva + size >= hmapping->hva + hmapping->size) {
                        return EEXIST;
                }
        }

        return 0;
}

static struct nvmm_hmapping *
nvmm_hmapping_alloc(struct nvmm_machine *mach)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present) {
                        hmapping->present = true;
                        return hmapping;
                }
        }

        return NULL;
}

static int
nvmm_hmapping_free(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
        struct vmspace *vmspace = curproc->p_vmspace;
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present || hmapping->hva != hva ||
                    hmapping->size != size) {
                        continue;
                }

                uvm_unmap(&vmspace->vm_map, hmapping->hva,
                    hmapping->hva + hmapping->size);
                uao_detach(hmapping->uobj);

                hmapping->uobj = NULL;
                hmapping->present = false;

                return 0;
        }

        return ENOENT;
}

static int
nvmm_hva_map(struct nvmm_owner *owner, struct nvmm_ioc_hva_map *args)
{
        struct vmspace *vmspace = curproc->p_vmspace;
        struct nvmm_machine *mach;
        struct nvmm_hmapping *hmapping;
        vaddr_t uva;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        error = nvmm_hmapping_validate(mach, args->hva, args->size);
        if (error)
                goto out;

        hmapping = nvmm_hmapping_alloc(mach);
        if (hmapping == NULL) {
                error = ENOBUFS;
                goto out;
        }

        hmapping->hva = args->hva;
        hmapping->size = args->size;
        hmapping->uobj = uao_create(hmapping->size, 0);
        uva = hmapping->hva;

        /* Take a reference for the user. */
        uao_reference(hmapping->uobj);

        /* Map the uobj into the user address space, as pageable. */
        error = uvm_map(&vmspace->vm_map, &uva, hmapping->size, hmapping->uobj,
            0, 0, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_SHARE,
            UVM_ADV_RANDOM, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
        if (error) {
                uao_detach(hmapping->uobj);
        }

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_hva_unmap(struct nvmm_owner *owner, struct nvmm_ioc_hva_unmap *args)
{
        struct nvmm_machine *mach;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        error = nvmm_hmapping_free(mach, args->hva, args->size);

        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

static int
nvmm_gpa_map(struct nvmm_owner *owner, struct nvmm_ioc_gpa_map *args)
{
        struct nvmm_machine *mach;
        struct uvm_object *uobj;
        gpaddr_t gpa;
        size_t off;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        if ((args->prot & ~(PROT_READ|PROT_WRITE|PROT_EXEC)) != 0) {
                error = EINVAL;
                goto out;
        }

        if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0 ||
            (args->hva % PAGE_SIZE) != 0) {
                error = EINVAL;
                goto out;
        }
        if (args->hva == 0) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa + args->size <= args->gpa) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa + args->size > mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        gpa = args->gpa;

        uobj = nvmm_hmapping_getuobj(mach, args->hva, args->size, &off);
        if (uobj == NULL) {
                error = EINVAL;
                goto out;
        }

        /* Take a reference for the machine. */
        uao_reference(uobj);

        /* Map the uobj into the machine address space, as pageable. */
        error = uvm_map(&mach->vm->vm_map, &gpa, args->size, uobj, off, 0,
            UVM_MAPFLAG(args->prot, UVM_PROT_RWX, UVM_INH_NONE,
            UVM_ADV_RANDOM, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
        if (error) {
                uao_detach(uobj);
                goto out;
        }
        if (gpa != args->gpa) {
                uao_detach(uobj);
                printf("[!] uvm_map problem\n");
                error = EINVAL;
                goto out;
        }

out:
        nvmm_machine_put(mach);
        return error;
}

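/*
 * Editor's note: hva_map and gpa_map work as a pair: the backing uobj is
 * created at hva_map time, and gpa_map wires a window of it into the guest
 * physical space. A minimal userland sketch of the usual sequence, assuming
 * the libnvmm wrappers (the host range is typically reserved with an
 * anonymous mmap first, since the kernel maps over it with
 * UVM_FLAG_FIXED|UVM_FLAG_UNMAP):
 *
 *      #include <sys/mman.h>
 *      #include <nvmm.h>
 *
 *      size_t size = 128 * 1024 * 1024;        (128 MiB of guest RAM)
 *      uintptr_t hva = (uintptr_t)mmap(NULL, size, PROT_READ | PROT_WRITE,
 *          MAP_ANON | MAP_PRIVATE, -1, 0);
 *
 *      nvmm_hva_map(&mach, hva, size);         (create + map the uobj)
 *      nvmm_gpa_map(&mach, hva, 0, size, PROT_READ | PROT_WRITE | PROT_EXEC);
 *
 * After this, stores through hva are visible to the guest at GPA 0.
 */
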
static int
nvmm_gpa_unmap(struct nvmm_owner *owner, struct nvmm_ioc_gpa_unmap *args)
{
        struct nvmm_machine *mach;
        gpaddr_t gpa;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa + args->size <= args->gpa) {
                error = EINVAL;
                goto out;
        }
        /*
         * Reject ranges past gpa_end. This mirrors the bound check in
         * nvmm_gpa_map(); with '>=' here, a mapping ending exactly at
         * gpa_end could be created but never unmapped.
         */
        if (args->gpa + args->size > mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        gpa = args->gpa;

        /* Unmap the memory from the machine. */
        uvm_unmap(&mach->vm->vm_map, gpa, gpa + args->size);

out:
        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

static int
nvmm_ctl_mach_info(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
        struct nvmm_ctl_mach_info ctl;
        struct nvmm_machine *mach;
        int error;
        size_t i;

        if (args->size != sizeof(ctl))
                return EINVAL;
        error = copyin(args->data, &ctl, sizeof(ctl));
        if (error)
                return error;

        error = nvmm_machine_get(owner, ctl.machid, &mach, true);
        if (error)
                return error;

        ctl.nvcpus = mach->ncpus;

        ctl.nram = 0;
        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                if (!mach->hmap[i].present)
                        continue;
                ctl.nram += mach->hmap[i].size;
        }

        ctl.pid = mach->owner->pid;
        ctl.time = mach->time;

        nvmm_machine_put(mach);

        error = copyout(&ctl, args->data, sizeof(ctl));
        if (error)
                return error;

        return 0;
}

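/*
 * Editor's note: a minimal sketch of the matching userland query, grounded
 * in the copyin/copyout protocol above (the nvmmctl utility performs the
 * same dance; structure layouts come from the NVMM headers):
 *
 *      struct nvmm_ctl_mach_info info = { .machid = machid };
 *      struct nvmm_ioc_ctl args = {
 *              .op = NVMM_CTL_MACH_INFO,
 *              .data = &info,
 *              .size = sizeof(info),
 *      };
 *      if (ioctl(fd, NVMM_IOC_CTL, &args) == 0)
 *              printf("%u vcpus, %zu bytes of RAM\n",
 *                  (unsigned)info.nvcpus, (size_t)info.nram);
 */
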
static int
nvmm_ctl(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
        switch (args->op) {
        case NVMM_CTL_MACH_INFO:
                return nvmm_ctl_mach_info(owner, args);
        default:
                return EINVAL;
        }
}

/* -------------------------------------------------------------------------- */

static const struct nvmm_impl *
nvmm_ident(void)
{
        size_t i;

        for (i = 0; i < __arraycount(nvmm_impl_list); i++) {
                if ((*nvmm_impl_list[i]->ident)())
                        return nvmm_impl_list[i];
        }

        return NULL;
}

static int
nvmm_init(void)
{
        size_t i, n;

        nvmm_impl = nvmm_ident();
        if (nvmm_impl == NULL)
                return ENOTSUP;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                machines[i].machid = i;
                rw_init(&machines[i].lock);
                for (n = 0; n < NVMM_MAX_VCPUS; n++) {
                        machines[i].cpus[n].present = false;
                        machines[i].cpus[n].cpuid = n;
                        mutex_init(&machines[i].cpus[n].lock, MUTEX_DEFAULT,
                            IPL_NONE);
                }
        }

        (*nvmm_impl->init)();

        return 0;
}

static void
nvmm_fini(void)
{
        size_t i, n;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                rw_destroy(&machines[i].lock);
                for (n = 0; n < NVMM_MAX_VCPUS; n++) {
                        mutex_destroy(&machines[i].cpus[n].lock);
                }
        }

        (*nvmm_impl->fini)();
        nvmm_impl = NULL;
}

/* -------------------------------------------------------------------------- */

static d_open_t nvmm_open;
static d_ioctl_t nvmm_ioctl;
static d_mmap_single_t nvmm_mmap_single;
static d_priv_dtor_t nvmm_dtor;

static struct dev_ops nvmm_ops = {
        { "nvmm", 0, D_MPSAFE },
        .d_open         = nvmm_open,
        .d_ioctl        = nvmm_ioctl,
        .d_mmap_single  = nvmm_mmap_single,
};

static int
nvmm_open(struct dev_open_args *ap)
{
        int flags = ap->a_oflags;
        struct nvmm_owner *owner;
        struct file *fp;
        int error;

        if (__predict_false(nvmm_impl == NULL))
                return ENXIO;
        if (!(flags & O_CLOEXEC))
                return EINVAL;

        if (priv_check_cred(ap->a_cred, PRIV_ROOT, 0) == 0) {
                owner = &root_owner;
        } else {
                owner = kmem_alloc(sizeof(*owner), KM_SLEEP);
                owner->pid = curthread->td_proc->p_pid;
        }

        fp = ap->a_fpp ? *ap->a_fpp : NULL;
        error = devfs_set_cdevpriv(fp, owner, nvmm_dtor);
        if (error) {
                nvmm_dtor(owner);
                return error;
        }

        return 0;
}

static void
nvmm_dtor(void *arg)
{
        struct nvmm_owner *owner = arg;

        KASSERT(owner != NULL);
        nvmm_kill_machines(owner);
        if (owner != &root_owner) {
                kmem_free(owner, sizeof(*owner));
        }
}

static int
nvmm_mmap_single(struct dev_mmap_single_args *ap)
{
        vm_ooffset_t *offp = ap->a_offset;
        size_t size = ap->a_size;
        int prot = ap->a_nprot;
        struct vm_object **uobjp = ap->a_object;
        struct file *fp = ap->a_fp;
        struct nvmm_owner *owner = NULL;
        struct nvmm_machine *mach;
        nvmm_machid_t machid;
        nvmm_cpuid_t cpuid;
        int error;

        devfs_get_cdevpriv(fp, (void **)&owner);
        KASSERT(owner != NULL);

        if (prot & PROT_EXEC)
                return EACCES;
        if (size != PAGE_SIZE)
                return EINVAL;

        cpuid = NVMM_COMM_CPUID(*offp);
        if (__predict_false(cpuid >= NVMM_MAX_VCPUS))
                return EINVAL;

        machid = NVMM_COMM_MACHID(*offp);
        error = nvmm_machine_get(owner, machid, &mach, false);
        if (error)
                return error;

        uao_reference(mach->commuobj);
        *uobjp = mach->commuobj;
        *offp = cpuid * PAGE_SIZE;

        nvmm_machine_put(mach);
        return 0;
}

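/*
 * Editor's note: userland obtains a VCPU's comm page by mmap'ing /dev/nvmm
 * with the machine and VCPU IDs encoded in the file offset, which the
 * NVMM_COMM_MACHID()/NVMM_COMM_CPUID() macros above decode. A minimal
 * sketch, assuming a matching encoder macro in the NVMM headers (the name
 * NVMM_COMM_OFF is illustrative; check nvmm.h for the exact definition):
 *
 *      struct nvmm_comm_page *comm = mmap(NULL, PAGE_SIZE,
 *          PROT_READ | PROT_WRITE, MAP_SHARED, fd,
 *          NVMM_COMM_OFF(machid, cpuid));
 *
 * libnvmm does this internally at VCPU creation time and exposes the page
 * through the nvmm_vcpu state pointers.
 */
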
static int
nvmm_ioctl(struct dev_ioctl_args *ap)
{
        unsigned long cmd = ap->a_cmd;
        void *data = ap->a_data;
        struct file *fp = ap->a_fp;
        struct nvmm_owner *owner = NULL;

        devfs_get_cdevpriv(fp, (void **)&owner);
        KASSERT(owner != NULL);

        switch (cmd) {
        case NVMM_IOC_CAPABILITY:
                return nvmm_capability(owner, data);
        case NVMM_IOC_MACHINE_CREATE:
                return nvmm_machine_create(owner, data);
        case NVMM_IOC_MACHINE_DESTROY:
                return nvmm_machine_destroy(owner, data);
        case NVMM_IOC_MACHINE_CONFIGURE:
                return nvmm_machine_configure(owner, data);
        case NVMM_IOC_VCPU_CREATE:
                return nvmm_vcpu_create(owner, data);
        case NVMM_IOC_VCPU_DESTROY:
                return nvmm_vcpu_destroy(owner, data);
        case NVMM_IOC_VCPU_CONFIGURE:
                return nvmm_vcpu_configure(owner, data);
        case NVMM_IOC_VCPU_SETSTATE:
                return nvmm_vcpu_setstate(owner, data);
        case NVMM_IOC_VCPU_GETSTATE:
                return nvmm_vcpu_getstate(owner, data);
        case NVMM_IOC_VCPU_INJECT:
                return nvmm_vcpu_inject(owner, data);
        case NVMM_IOC_VCPU_RUN:
                return nvmm_vcpu_run(owner, data);
        case NVMM_IOC_GPA_MAP:
                return nvmm_gpa_map(owner, data);
        case NVMM_IOC_GPA_UNMAP:
                return nvmm_gpa_unmap(owner, data);
        case NVMM_IOC_HVA_MAP:
                return nvmm_hva_map(owner, data);
        case NVMM_IOC_HVA_UNMAP:
                return nvmm_hva_unmap(owner, data);
        case NVMM_IOC_CTL:
                return nvmm_ctl(owner, data);
        default:
                return EINVAL;
        }
}

/* -------------------------------------------------------------------------- */

static int
nvmm_attach(void)
{
        int error;

        error = nvmm_init();
        if (error)
                panic("%s: impossible", __func__);
        printf("nvmm: attached, using backend %s\n", nvmm_impl->name);

        return 0;
}

static int
nvmm_detach(void)
{
        if (atomic_load_acq_int(&nmachines) > 0)
                return EBUSY;

        nvmm_fini();
        return 0;
}

static int
nvmm_modevent(module_t mod __unused, int type, void *data __unused)
{
        static cdev_t dev = NULL;
        int error;

        switch (type) {
        case MOD_LOAD:
                if (nvmm_ident() == NULL) {
                        printf("nvmm: cpu not supported\n");
                        return ENOTSUP;
                }
                error = nvmm_attach();
                if (error)
                        return error;

                dev = make_dev(&nvmm_ops, 0, UID_ROOT, GID_NVMM, 0660, "nvmm");
                if (dev == NULL) {
                        printf("nvmm: unable to create device\n");
                        error = ENOMEM;
                }
                break;

        case MOD_UNLOAD:
                if (dev == NULL)
                        return 0;
                error = nvmm_detach();
                if (error == 0)
                        destroy_dev(dev);
                break;

        case MOD_SHUTDOWN:
                error = 0;
                break;

        default:
                error = EOPNOTSUPP;
                break;
        }

        return error;
}

static moduledata_t nvmm_moddata = {
        .name = "nvmm",
        .evhand = nvmm_modevent,
        .priv = NULL,
};

DECLARE_MODULE(nvmm, nvmm_moddata, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(nvmm, NVMM_KERN_VERSION);