libnvmm: Fix mmap() failure with 'permission denied'
sys/dev/virtual/nvmm/nvmm.c (dragonfly.git)
/*	$NetBSD: nvmm.c,v 1.22.2.7 2020/08/29 17:00:28 martin Exp $	*/

/*
 * Copyright (c) 2018-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/conf.h>
#include <sys/devfs.h>
#include <sys/device.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/thread.h>

#include <dev/virtual/nvmm/nvmm_compat.h>
#include <dev/virtual/nvmm/nvmm.h>
#include <dev/virtual/nvmm/nvmm_internal.h>
#include <dev/virtual/nvmm/nvmm_ioctl.h>

MALLOC_DEFINE(M_NVMM, "nvmm", "NVMM data");

static struct nvmm_machine machines[NVMM_MAX_MACHINES];
static volatile unsigned int nmachines __cacheline_aligned;

static const struct nvmm_impl *nvmm_impl_list[] = {
#if defined(__x86_64__)
        &nvmm_x86_svm,  /* x86 AMD SVM */
        &nvmm_x86_vmx   /* x86 Intel VMX */
#endif
};

static const struct nvmm_impl *nvmm_impl = NULL;

static struct nvmm_owner root_owner;

/* -------------------------------------------------------------------------- */

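/*
 * Machine slots live in the static machines[] table above.  Allocation is
 * a linear scan: each slot is write-locked, skipped if already present,
 * and otherwise claimed and stamped with its creation time.  On success
 * the slot is returned with its rwlock still write-held; the caller
 * releases it with nvmm_machine_put().  The global nmachines counter
 * only gates module unload (see nvmm_detach()).
 */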
static int
nvmm_machine_alloc(struct nvmm_machine **ret)
{
        struct nvmm_machine *mach;
        size_t i;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                mach = &machines[i];

                rw_enter(&mach->lock, RW_WRITER);
                if (mach->present) {
                        rw_exit(&mach->lock);
                        continue;
                }

                mach->present = true;
                mach->time = time_second;
                *ret = mach;
                atomic_inc_uint(&nmachines);
                return 0;
        }

        return ENOBUFS;
}

static void
nvmm_machine_free(struct nvmm_machine *mach)
{
        KASSERT(rw_write_held(&mach->lock));
        KASSERT(mach->present);
        mach->present = false;
        atomic_dec_uint(&nmachines);
}

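/*
 * Look up a machine by id and lock it, shared or exclusive depending on
 * the caller's needs.  The ownership check makes machines private to the
 * descriptor that created them: only the creating owner, or the shared
 * root_owner handed to privileged opens, may touch a machine.  Callers
 * pair this with nvmm_machine_put() to drop the lock.
 */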
static int
nvmm_machine_get(struct nvmm_owner *owner, nvmm_machid_t machid,
    struct nvmm_machine **ret, bool writer)
{
        struct nvmm_machine *mach;
        krw_t op = writer ? RW_WRITER : RW_READER;

        if (__predict_false(machid >= NVMM_MAX_MACHINES)) {
                return EINVAL;
        }
        mach = &machines[machid];

        rw_enter(&mach->lock, op);
        if (__predict_false(!mach->present)) {
                rw_exit(&mach->lock);
                return ENOENT;
        }
        if (__predict_false(mach->owner != owner && owner != &root_owner)) {
                rw_exit(&mach->lock);
                return EPERM;
        }
        *ret = mach;

        return 0;
}

static void
nvmm_machine_put(struct nvmm_machine *mach)
{
        rw_exit(&mach->lock);
}

/* -------------------------------------------------------------------------- */

static int
nvmm_vcpu_alloc(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
        struct nvmm_cpu *vcpu;

        if (cpuid >= NVMM_MAX_VCPUS) {
                return EINVAL;
        }
        vcpu = &mach->cpus[cpuid];

        mutex_enter(&vcpu->lock);
        if (vcpu->present) {
                mutex_exit(&vcpu->lock);
                return EBUSY;
        }

        vcpu->present = true;
        vcpu->comm = NULL;
        vcpu->hcpu_last = -1;
        *ret = vcpu;
        return 0;
}

static void
nvmm_vcpu_free(struct nvmm_machine *mach, struct nvmm_cpu *vcpu)
{
        KASSERT(mutex_owned(&vcpu->lock));
        vcpu->present = false;
        if (vcpu->comm != NULL) {
                uvm_deallocate(kernel_map, (vaddr_t)vcpu->comm, PAGE_SIZE);
        }
}

static int
nvmm_vcpu_get(struct nvmm_machine *mach, nvmm_cpuid_t cpuid,
    struct nvmm_cpu **ret)
{
        struct nvmm_cpu *vcpu;

        if (__predict_false(cpuid >= NVMM_MAX_VCPUS)) {
                return EINVAL;
        }
        vcpu = &mach->cpus[cpuid];

        mutex_enter(&vcpu->lock);
        if (__predict_false(!vcpu->present)) {
                mutex_exit(&vcpu->lock);
                return ENOENT;
        }
        *ret = vcpu;

        return 0;
}

static void
nvmm_vcpu_put(struct nvmm_cpu *vcpu)
{
        mutex_exit(&vcpu->lock);
}

/* -------------------------------------------------------------------------- */

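/*
 * Tear down every machine belonging to an owner.  This is the backstop
 * cleanup path, invoked from nvmm_dtor() when the owner's descriptor is
 * closed (including on process exit), so guest resources cannot outlive
 * the process that created them.
 */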
static void
nvmm_kill_machines(struct nvmm_owner *owner)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        size_t i, j;
        int error;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                mach = &machines[i];

                rw_enter(&mach->lock, RW_WRITER);
                if (!mach->present || mach->owner != owner) {
                        rw_exit(&mach->lock);
                        continue;
                }

                /* Kill it. */
                for (j = 0; j < NVMM_MAX_VCPUS; j++) {
                        error = nvmm_vcpu_get(mach, j, &vcpu);
                        if (error)
                                continue;
                        (*nvmm_impl->vcpu_destroy)(mach, vcpu);
                        nvmm_vcpu_free(mach, vcpu);
                        nvmm_vcpu_put(vcpu);
                        atomic_dec_uint(&mach->ncpus);
                }
                (*nvmm_impl->machine_destroy)(mach);
                uvmspace_free(mach->vm);

                /* Drop the kernel UOBJ refs. */
                for (j = 0; j < NVMM_MAX_HMAPPINGS; j++) {
                        if (!mach->hmap[j].present)
                                continue;
                        uao_detach(mach->hmap[j].uobj);
                }

                nvmm_machine_free(mach);

                rw_exit(&mach->lock);
        }
}

/* -------------------------------------------------------------------------- */

static int
nvmm_capability(struct nvmm_owner *owner, struct nvmm_ioc_capability *args)
{
        args->cap.version = NVMM_KERN_VERSION;
        args->cap.state_size = nvmm_impl->state_size;
        args->cap.max_machines = NVMM_MAX_MACHINES;
        args->cap.max_vcpus = NVMM_MAX_VCPUS;
        args->cap.max_ram = NVMM_MAX_RAM;

        (*nvmm_impl->capability)(&args->cap);

        return 0;
}

static int
nvmm_machine_create(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_create *args)
{
        struct nvmm_machine *mach;
        int error;

        error = nvmm_machine_alloc(&mach);
        if (error)
                return error;

        /* Curproc owns the machine. */
        mach->owner = owner;

        /* Zero out the host mappings. */
        memset(&mach->hmap, 0, sizeof(mach->hmap));

        /* Create the machine vmspace. */
        mach->gpa_begin = 0;
        mach->gpa_end = NVMM_MAX_RAM;
        mach->vm = uvmspace_alloc(0, mach->gpa_end - mach->gpa_begin, false);

        /* Create the comm uobj. */
        mach->commuobj = uao_create(NVMM_MAX_VCPUS * PAGE_SIZE, 0);

        (*nvmm_impl->machine_create)(mach);

        args->machid = mach->machid;
        nvmm_machine_put(mach);

        return 0;
}

static int
nvmm_machine_destroy(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_destroy *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;
        size_t i;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        for (i = 0; i < NVMM_MAX_VCPUS; i++) {
                error = nvmm_vcpu_get(mach, i, &vcpu);
                if (error)
                        continue;

                (*nvmm_impl->vcpu_destroy)(mach, vcpu);
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                atomic_dec_uint(&mach->ncpus);
        }

        (*nvmm_impl->machine_destroy)(mach);

        /* Free the machine vmspace. */
        uvmspace_free(mach->vm);

        /* Drop the kernel UOBJ refs. */
        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                if (!mach->hmap[i].present)
                        continue;
                uao_detach(mach->hmap[i].uobj);
        }

        nvmm_machine_free(mach);
        nvmm_machine_put(mach);

        return 0;
}

static int
nvmm_machine_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_machine_configure *args)
{
        struct nvmm_machine *mach;
        size_t allocsz;
        uint64_t op;
        void *data;
        int error;

        op = NVMM_MACH_CONF_MD(args->op);
        if (__predict_false(op >= nvmm_impl->mach_conf_max)) {
                return EINVAL;
        }

        allocsz = nvmm_impl->mach_conf_sizes[op];
        data = kmem_alloc(allocsz, KM_SLEEP);

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error) {
                kmem_free(data, allocsz);
                return error;
        }

        error = copyin(args->conf, data, allocsz);
        if (error) {
                goto out;
        }

        error = (*nvmm_impl->machine_configure)(mach, op, data);

out:
        nvmm_machine_put(mach);
        kmem_free(data, allocsz);
        return error;
}

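/*
 * Create a VCPU.  Its communication page is one page of the machine's
 * commuobj, at offset cpuid * PAGE_SIZE.  The page is double-mapped: it
 * is entered wired into kernel_map here, and userland later maps the
 * same uobj page through mmap() on /dev/nvmm (see nvmm_mmap_single()),
 * so VCPU state can be exchanged without copyin/copyout.
 */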
static int
nvmm_vcpu_create(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_create *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_alloc(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        /* Allocate the comm page. */
        uao_reference(mach->commuobj);
        error = uvm_map(kernel_map, (vaddr_t *)&vcpu->comm, PAGE_SIZE,
            mach->commuobj, args->cpuid * PAGE_SIZE, 0, UVM_MAPFLAG(UVM_PROT_RW,
            UVM_PROT_RW, UVM_INH_SHARE, UVM_ADV_RANDOM, 0));
        if (error) {
                uao_detach(mach->commuobj);
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }
        error = uvm_map_pageable(kernel_map, (vaddr_t)vcpu->comm,
            (vaddr_t)vcpu->comm + PAGE_SIZE, false, 0);
        if (error) {
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }
        memset(vcpu->comm, 0, PAGE_SIZE);

        error = (*nvmm_impl->vcpu_create)(mach, vcpu);
        if (error) {
                nvmm_vcpu_free(mach, vcpu);
                nvmm_vcpu_put(vcpu);
                goto out;
        }

        nvmm_vcpu_put(vcpu);
        atomic_inc_uint(&mach->ncpus);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_destroy(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_destroy *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_destroy)(mach, vcpu);
        nvmm_vcpu_free(mach, vcpu);
        nvmm_vcpu_put(vcpu);
        atomic_dec_uint(&mach->ncpus);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_configure(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_configure *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        size_t allocsz;
        uint64_t op;
        void *data;
        int error;

        op = NVMM_VCPU_CONF_MD(args->op);
        if (__predict_false(op >= nvmm_impl->vcpu_conf_max))
                return EINVAL;

        allocsz = nvmm_impl->vcpu_conf_sizes[op];
        data = kmem_alloc(allocsz, KM_SLEEP);

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error) {
                kmem_free(data, allocsz);
                return error;
        }

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error) {
                nvmm_machine_put(mach);
                kmem_free(data, allocsz);
                return error;
        }

        error = copyin(args->conf, data, allocsz);
        if (error) {
                goto out;
        }

        error = (*nvmm_impl->vcpu_configure)(vcpu, op, data);

out:
        nvmm_vcpu_put(vcpu);
        nvmm_machine_put(mach);
        kmem_free(data, allocsz);
        return error;
}

static int
nvmm_vcpu_setstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_setstate *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_setstate)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_getstate(struct nvmm_owner *owner,
    struct nvmm_ioc_vcpu_getstate *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        (*nvmm_impl->vcpu_getstate)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_vcpu_inject(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_inject *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        error = (*nvmm_impl->vcpu_inject)(vcpu);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

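/*
 * Inner run loop.  A VCPU keeps executing guest code until something
 * userland must see happens.  Two exit classes are absorbed here rather
 * than reported: a pending signal or reschedule returns to userland with
 * NVMM_VCPU_EXIT_NONE, and an in-range nested page fault is resolved by
 * faulting the page into the machine's vmspace and resuming the guest.
 * Out-of-range or unresolvable memory exits fall through to userland,
 * which typically emulates the access (e.g. MMIO).
 */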
static int
nvmm_do_vcpu_run(struct nvmm_machine *mach, struct nvmm_cpu *vcpu,
    struct nvmm_vcpu_exit *exit)
{
        struct vmspace *vm = mach->vm;
        int ret;

        while (1) {
                /* Got a signal? Or pending resched? Leave. */
                if (__predict_false(nvmm_return_needed())) {
                        exit->reason = NVMM_VCPU_EXIT_NONE;
                        return 0;
                }

                /* Run the VCPU. */
                ret = (*nvmm_impl->vcpu_run)(mach, vcpu, exit);
                if (__predict_false(ret != 0)) {
                        return ret;
                }

                /* Process nested page faults. */
                if (__predict_true(exit->reason != NVMM_VCPU_EXIT_MEMORY)) {
                        break;
                }
                if (exit->u.mem.gpa >= mach->gpa_end) {
                        break;
                }
                if (uvm_fault(&vm->vm_map, exit->u.mem.gpa, exit->u.mem.prot)) {
                        break;
                }
        }

        return 0;
}

static int
nvmm_vcpu_run(struct nvmm_owner *owner, struct nvmm_ioc_vcpu_run *args)
{
        struct nvmm_machine *mach;
        struct nvmm_cpu *vcpu;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        error = nvmm_vcpu_get(mach, args->cpuid, &vcpu);
        if (error)
                goto out;

        error = nvmm_do_vcpu_run(mach, vcpu, &args->exit);
        nvmm_vcpu_put(vcpu);

out:
        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

static struct uvm_object *
nvmm_hmapping_getuobj(struct nvmm_machine *mach, uintptr_t hva, size_t size,
    size_t *off)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present) {
                        continue;
                }
                if (hva >= hmapping->hva &&
                    hva + size <= hmapping->hva + hmapping->size) {
                        *off = hva - hmapping->hva;
                        return hmapping->uobj;
                }
        }

        return NULL;
}

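/*
 * Validate a candidate host mapping against the existing ones.  The range
 * must be page-aligned and non-NULL.  A range that lies entirely within an
 * already-present mapping passes validation; any partial overlap, on
 * either end or spanning a mapping whole, is rejected with EEXIST.
 */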
static int
nvmm_hmapping_validate(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        if ((hva % PAGE_SIZE) != 0 || (size % PAGE_SIZE) != 0) {
                return EINVAL;
        }
        if (hva == 0) {
                return EINVAL;
        }

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present) {
                        continue;
                }

                if (hva >= hmapping->hva &&
                    hva + size <= hmapping->hva + hmapping->size) {
                        break;
                }

                if (hva >= hmapping->hva &&
                    hva < hmapping->hva + hmapping->size) {
                        return EEXIST;
                }
                if (hva + size > hmapping->hva &&
                    hva + size <= hmapping->hva + hmapping->size) {
                        return EEXIST;
                }
                if (hva <= hmapping->hva &&
                    hva + size >= hmapping->hva + hmapping->size) {
                        return EEXIST;
                }
        }

        return 0;
}

static struct nvmm_hmapping *
nvmm_hmapping_alloc(struct nvmm_machine *mach)
{
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present) {
                        hmapping->present = true;
                        return hmapping;
                }
        }

        return NULL;
}

static int
nvmm_hmapping_free(struct nvmm_machine *mach, uintptr_t hva, size_t size)
{
        struct vmspace *vmspace = curproc->p_vmspace;
        struct nvmm_hmapping *hmapping;
        size_t i;

        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                hmapping = &mach->hmap[i];
                if (!hmapping->present || hmapping->hva != hva ||
                    hmapping->size != size) {
                        continue;
                }

                uvm_unmap(&vmspace->vm_map, hmapping->hva,
                    hmapping->hva + hmapping->size);
                uao_detach(hmapping->uobj);

                hmapping->uobj = NULL;
                hmapping->present = false;

                return 0;
        }

        return ENOENT;
}

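/*
 * Register a host mapping.  The user-supplied range [hva, hva + size) is
 * replaced in the calling process's address space (UVM_FLAG_FIXED |
 * UVM_FLAG_UNMAP) by a fresh anonymous uobj, so the process and the guest
 * can share the same pages: nvmm_gpa_map() later maps this uobj into the
 * machine's vmspace.  One uobj reference is kept by the machine, one by
 * the user mapping.
 */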
static int
nvmm_hva_map(struct nvmm_owner *owner, struct nvmm_ioc_hva_map *args)
{
        struct vmspace *vmspace = curproc->p_vmspace;
        struct nvmm_machine *mach;
        struct nvmm_hmapping *hmapping;
        vaddr_t uva;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        error = nvmm_hmapping_validate(mach, args->hva, args->size);
        if (error)
                goto out;

        hmapping = nvmm_hmapping_alloc(mach);
        if (hmapping == NULL) {
                error = ENOBUFS;
                goto out;
        }

        hmapping->hva = args->hva;
        hmapping->size = args->size;
        hmapping->uobj = uao_create(hmapping->size, 0);
        uva = hmapping->hva;

        /* Take a reference for the user. */
        uao_reference(hmapping->uobj);

        /* Map the uobj into the user address space, as pageable. */
        error = uvm_map(&vmspace->vm_map, &uva, hmapping->size, hmapping->uobj,
            0, 0, UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_SHARE,
            UVM_ADV_RANDOM, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
        if (error) {
                uao_detach(hmapping->uobj);
        }

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_hva_unmap(struct nvmm_owner *owner, struct nvmm_ioc_hva_unmap *args)
{
        struct nvmm_machine *mach;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, true);
        if (error)
                return error;

        error = nvmm_hmapping_free(mach, args->hva, args->size);

        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

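/*
 * Map guest physical memory.  The hva range must lie inside a mapping
 * previously registered with NVMM_IOC_HVA_MAP; that mapping's uobj, at
 * the matching offset, backs the guest range.  Illustrative userland
 * sequence (a sketch only; the arg structs and fd setup are elided):
 *
 *	ioctl(nvmm_fd, NVMM_IOC_HVA_MAP, &hva_args);
 *	ioctl(nvmm_fd, NVMM_IOC_GPA_MAP, &gpa_args);
 */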
static int
nvmm_gpa_map(struct nvmm_owner *owner, struct nvmm_ioc_gpa_map *args)
{
        struct nvmm_machine *mach;
        struct uvm_object *uobj;
        gpaddr_t gpa;
        size_t off;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        if ((args->prot & ~(PROT_READ|PROT_WRITE|PROT_EXEC)) != 0) {
                error = EINVAL;
                goto out;
        }

        if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0 ||
            (args->hva % PAGE_SIZE) != 0) {
                error = EINVAL;
                goto out;
        }
        if (args->hva == 0) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa + args->size <= args->gpa) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa + args->size > mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        gpa = args->gpa;

        uobj = nvmm_hmapping_getuobj(mach, args->hva, args->size, &off);
        if (uobj == NULL) {
                error = EINVAL;
                goto out;
        }

        /* Take a reference for the machine. */
        uao_reference(uobj);

        /* Map the uobj into the machine address space, as pageable. */
        error = uvm_map(&mach->vm->vm_map, &gpa, args->size, uobj, off, 0,
            UVM_MAPFLAG(args->prot, UVM_PROT_RWX, UVM_INH_NONE,
            UVM_ADV_RANDOM, UVM_FLAG_FIXED|UVM_FLAG_UNMAP));
        if (error) {
                uao_detach(uobj);
                goto out;
        }
        if (gpa != args->gpa) {
                uao_detach(uobj);
                printf("[!] uvm_map problem\n");
                error = EINVAL;
                goto out;
        }

out:
        nvmm_machine_put(mach);
        return error;
}

static int
nvmm_gpa_unmap(struct nvmm_owner *owner, struct nvmm_ioc_gpa_unmap *args)
{
        struct nvmm_machine *mach;
        gpaddr_t gpa;
        int error;

        error = nvmm_machine_get(owner, args->machid, &mach, false);
        if (error)
                return error;

        if ((args->gpa % PAGE_SIZE) != 0 || (args->size % PAGE_SIZE) != 0) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa < mach->gpa_begin || args->gpa >= mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa + args->size <= args->gpa) {
                error = EINVAL;
                goto out;
        }
        if (args->gpa + args->size > mach->gpa_end) {
                error = EINVAL;
                goto out;
        }
        gpa = args->gpa;

        /* Unmap the memory from the machine. */
        uvm_unmap(&mach->vm->vm_map, gpa, gpa + args->size);

out:
        nvmm_machine_put(mach);
        return error;
}

/* -------------------------------------------------------------------------- */

static int
nvmm_ctl_mach_info(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
        struct nvmm_ctl_mach_info ctl;
        struct nvmm_machine *mach;
        int error;
        size_t i;

        if (args->size != sizeof(ctl))
                return EINVAL;
        error = copyin(args->data, &ctl, sizeof(ctl));
        if (error)
                return error;

        error = nvmm_machine_get(owner, ctl.machid, &mach, true);
        if (error)
                return error;

        ctl.nvcpus = mach->ncpus;

        ctl.nram = 0;
        for (i = 0; i < NVMM_MAX_HMAPPINGS; i++) {
                if (!mach->hmap[i].present)
                        continue;
                ctl.nram += mach->hmap[i].size;
        }

        ctl.pid = mach->owner->pid;
        ctl.time = mach->time;

        nvmm_machine_put(mach);

        error = copyout(&ctl, args->data, sizeof(ctl));
        if (error)
                return error;

        return 0;
}

static int
nvmm_ctl(struct nvmm_owner *owner, struct nvmm_ioc_ctl *args)
{
        switch (args->op) {
        case NVMM_CTL_MACH_INFO:
                return nvmm_ctl_mach_info(owner, args);
        default:
                return EINVAL;
        }
}

/* -------------------------------------------------------------------------- */

static const struct nvmm_impl *
nvmm_ident(void)
{
        size_t i;

        for (i = 0; i < __arraycount(nvmm_impl_list); i++) {
                if ((*nvmm_impl_list[i]->ident)())
                        return nvmm_impl_list[i];
        }

        return NULL;
}

static int
nvmm_init(void)
{
        size_t i, n;

        nvmm_impl = nvmm_ident();
        if (nvmm_impl == NULL)
                return ENOTSUP;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                machines[i].machid = i;
                rw_init(&machines[i].lock);
                for (n = 0; n < NVMM_MAX_VCPUS; n++) {
                        machines[i].cpus[n].present = false;
                        machines[i].cpus[n].cpuid = n;
                        mutex_init(&machines[i].cpus[n].lock, MUTEX_DEFAULT,
                            IPL_NONE);
                }
        }

        (*nvmm_impl->init)();

        return 0;
}

static void
nvmm_fini(void)
{
        size_t i, n;

        for (i = 0; i < NVMM_MAX_MACHINES; i++) {
                rw_destroy(&machines[i].lock);
                for (n = 0; n < NVMM_MAX_VCPUS; n++) {
                        mutex_destroy(&machines[i].cpus[n].lock);
                }
        }

        (*nvmm_impl->fini)();
        nvmm_impl = NULL;
}

/* -------------------------------------------------------------------------- */

static d_open_t nvmm_open;
static d_ioctl_t nvmm_ioctl;
static d_mmap_single_t nvmm_mmap_single;
static d_priv_dtor_t nvmm_dtor;

static struct dev_ops nvmm_ops = {
        { "nvmm", 0, D_MPSAFE },
        .d_open         = nvmm_open,
        .d_ioctl        = nvmm_ioctl,
        .d_mmap_single  = nvmm_mmap_single,
};

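/*
 * Open /dev/nvmm.  The caller must pass O_CLOEXEC, so descriptors are
 * never inherited across exec.  A privileged open gets the shared
 * root_owner, which may inspect and destroy any machine; every other
 * open gets its own heap-allocated owner keyed by pid.  The owner is
 * attached as cdevpriv and reclaimed by nvmm_dtor() on close.
 */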
static int
nvmm_open(struct dev_open_args *ap)
{
        int flags = ap->a_oflags;
        struct nvmm_owner *owner;
        struct file *fp;
        int error;

        if (__predict_false(nvmm_impl == NULL))
                return ENXIO;
        if (!(flags & O_CLOEXEC))
                return EINVAL;

        if (priv_check_cred(ap->a_cred, PRIV_ROOT, 0) == 0) {
                owner = &root_owner;
        } else {
                owner = kmem_alloc(sizeof(*owner), KM_SLEEP);
                owner->pid = curthread->td_proc->p_pid;
        }

        fp = ap->a_fpp ? *ap->a_fpp : NULL;
        error = devfs_set_cdevpriv(fp, owner, nvmm_dtor);
        if (error) {
                nvmm_dtor(owner);
                return error;
        }

        return 0;
}

static void
nvmm_dtor(void *arg)
{
        struct nvmm_owner *owner = arg;

        KASSERT(owner != NULL);
        nvmm_kill_machines(owner);
        if (owner != &root_owner) {
                kmem_free(owner, sizeof(*owner));
        }
}

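/*
 * mmap() on /dev/nvmm exposes a VCPU's comm page to userland.  The file
 * offset is not a byte offset: it encodes the machine and VCPU ids, which
 * NVMM_COMM_MACHID() and NVMM_COMM_CPUID() extract.  Exactly one page may
 * be mapped per call, PROT_EXEC is refused, and the returned object is
 * the machine's commuobj with the offset rewritten to cpuid * PAGE_SIZE.
 */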
static int
nvmm_mmap_single(struct dev_mmap_single_args *ap)
{
        vm_ooffset_t *offp = ap->a_offset;
        size_t size = ap->a_size;
        int prot = ap->a_nprot;
        struct vm_object **uobjp = ap->a_object;
        struct file *fp = ap->a_fp;
        struct nvmm_owner *owner = NULL;
        struct nvmm_machine *mach;
        nvmm_machid_t machid;
        nvmm_cpuid_t cpuid;
        int error;

        devfs_get_cdevpriv(fp, (void **)&owner);
        KASSERT(owner != NULL);

        if (prot & PROT_EXEC)
                return EACCES;
        if (size != PAGE_SIZE)
                return EINVAL;

        cpuid = NVMM_COMM_CPUID(*offp);
        if (__predict_false(cpuid >= NVMM_MAX_VCPUS))
                return EINVAL;

        machid = NVMM_COMM_MACHID(*offp);
        error = nvmm_machine_get(owner, machid, &mach, false);
        if (error)
                return error;

        uao_reference(mach->commuobj);
        *uobjp = mach->commuobj;
        *offp = cpuid * PAGE_SIZE;

        nvmm_machine_put(mach);
        return 0;
}

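/*
 * Ioctl dispatch.  Every handler receives the per-open owner resolved
 * from cdevpriv, so ownership is enforced uniformly by nvmm_machine_get()
 * no matter which operation is invoked.
 */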
static int
nvmm_ioctl(struct dev_ioctl_args *ap)
{
        unsigned long cmd = ap->a_cmd;
        void *data = ap->a_data;
        struct file *fp = ap->a_fp;
        struct nvmm_owner *owner = NULL;

        devfs_get_cdevpriv(fp, (void **)&owner);
        KASSERT(owner != NULL);

        switch (cmd) {
        case NVMM_IOC_CAPABILITY:
                return nvmm_capability(owner, data);
        case NVMM_IOC_MACHINE_CREATE:
                return nvmm_machine_create(owner, data);
        case NVMM_IOC_MACHINE_DESTROY:
                return nvmm_machine_destroy(owner, data);
        case NVMM_IOC_MACHINE_CONFIGURE:
                return nvmm_machine_configure(owner, data);
        case NVMM_IOC_VCPU_CREATE:
                return nvmm_vcpu_create(owner, data);
        case NVMM_IOC_VCPU_DESTROY:
                return nvmm_vcpu_destroy(owner, data);
        case NVMM_IOC_VCPU_CONFIGURE:
                return nvmm_vcpu_configure(owner, data);
        case NVMM_IOC_VCPU_SETSTATE:
                return nvmm_vcpu_setstate(owner, data);
        case NVMM_IOC_VCPU_GETSTATE:
                return nvmm_vcpu_getstate(owner, data);
        case NVMM_IOC_VCPU_INJECT:
                return nvmm_vcpu_inject(owner, data);
        case NVMM_IOC_VCPU_RUN:
                return nvmm_vcpu_run(owner, data);
        case NVMM_IOC_GPA_MAP:
                return nvmm_gpa_map(owner, data);
        case NVMM_IOC_GPA_UNMAP:
                return nvmm_gpa_unmap(owner, data);
        case NVMM_IOC_HVA_MAP:
                return nvmm_hva_map(owner, data);
        case NVMM_IOC_HVA_UNMAP:
                return nvmm_hva_unmap(owner, data);
        case NVMM_IOC_CTL:
                return nvmm_ctl(owner, data);
        default:
                return EINVAL;
        }
}

/* -------------------------------------------------------------------------- */

static int
nvmm_attach(void)
{
        int error;

        error = nvmm_init();
        if (error)
                panic("%s: impossible", __func__);
        printf("nvmm: attached, using backend %s\n", nvmm_impl->name);

        return 0;
}

static int
nvmm_detach(void)
{
        if (atomic_load_acq_int(&nmachines) > 0)
                return EBUSY;

        nvmm_fini();
        return 0;
}

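/*
 * Module lifecycle.  MOD_LOAD probes for a usable backend (SVM or VMX)
 * before attaching, then creates /dev/nvmm as 0660 root:nvmm so group
 * membership grants access.  MOD_UNLOAD is refused with EBUSY while any
 * machine exists; nvmm_attach() panics on failure because nvmm_ident()
 * already succeeded at load time.
 */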
static int
nvmm_modevent(module_t mod __unused, int type, void *data __unused)
{
        static cdev_t dev = NULL;
        int error;

        switch (type) {
        case MOD_LOAD:
                if (nvmm_ident() == NULL) {
                        printf("nvmm: cpu not supported\n");
                        return ENOTSUP;
                }
                error = nvmm_attach();
                if (error)
                        return error;

                dev = make_dev(&nvmm_ops, 0, UID_ROOT, GID_NVMM, 0660, "nvmm");
                if (dev == NULL) {
                        printf("nvmm: unable to create device\n");
                        error = ENOMEM;
                }
                break;

        case MOD_UNLOAD:
                if (dev == NULL)
                        return 0;
                error = nvmm_detach();
                if (error == 0)
                        destroy_dev(dev);
                break;

        case MOD_SHUTDOWN:
                error = 0;
                break;

        default:
                error = EOPNOTSUPP;
                break;
        }

        return error;
}

static moduledata_t nvmm_moddata = {
        .name = "nvmm",
        .evhand = nvmm_modevent,
        .priv = NULL,
};

DECLARE_MODULE(nvmm, nvmm_moddata, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(nvmm, NVMM_KERN_VERSION);