1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * VFIO core
4  *
5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
6  *     Author: Alex Williamson <alex.williamson@redhat.com>
7  *
8  * Derived from original vfio:
9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
10  * Author: Tom Lyon, pugs@cisco.com
11  */
12
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #ifdef CONFIG_HAVE_KVM
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mutex.h>
26 #include <linux/pci.h>
27 #include <linux/rwsem.h>
28 #include <linux/sched.h>
29 #include <linux/slab.h>
30 #include <linux/stat.h>
31 #include <linux/string.h>
32 #include <linux/uaccess.h>
33 #include <linux/vfio.h>
34 #include <linux/wait.h>
35 #include <linux/sched/signal.h>
36 #include <linux/pm_runtime.h>
37 #include <linux/interval_tree.h>
38 #include <linux/iova_bitmap.h>
39 #include <linux/iommufd.h>
40 #include "vfio.h"
41
42 #define DRIVER_VERSION  "0.3"
43 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC     "VFIO - User Level meta-driver"
45
46 static struct vfio {
47         struct class                    *device_class;
48         struct ida                      device_ida;
49 } vfio;
50
51 #ifdef CONFIG_VFIO_NOIOMMU
52 bool vfio_noiommu __read_mostly;
53 module_param_named(enable_unsafe_noiommu_mode,
54                    vfio_noiommu, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
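
/*
 * For reference (illustrative command line only, not part of this file):
 * the unsafe no-IOMMU mode is normally enabled at module load time, e.g.
 *
 *     modprobe vfio enable_unsafe_noiommu_mode=1
 *
 * or toggled at runtime through
 * /sys/module/vfio/parameters/enable_unsafe_noiommu_mode.
 */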
56 #endif
57
58 static DEFINE_XARRAY(vfio_device_set_xa);
59
60 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
61 {
62         unsigned long idx = (unsigned long)set_id;
63         struct vfio_device_set *new_dev_set;
64         struct vfio_device_set *dev_set;
65
66         if (WARN_ON(!set_id))
67                 return -EINVAL;
68
69         /*
70          * Atomically acquire a singleton object in the xarray for this set_id
71          */
72         xa_lock(&vfio_device_set_xa);
73         dev_set = xa_load(&vfio_device_set_xa, idx);
74         if (dev_set)
75                 goto found_get_ref;
76         xa_unlock(&vfio_device_set_xa);
77
78         new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
79         if (!new_dev_set)
80                 return -ENOMEM;
81         mutex_init(&new_dev_set->lock);
82         INIT_LIST_HEAD(&new_dev_set->device_list);
83         new_dev_set->set_id = set_id;
84
85         xa_lock(&vfio_device_set_xa);
86         dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
87                                GFP_KERNEL);
88         if (!dev_set) {
89                 dev_set = new_dev_set;
90                 goto found_get_ref;
91         }
92
93         kfree(new_dev_set);
94         if (xa_is_err(dev_set)) {
95                 xa_unlock(&vfio_device_set_xa);
96                 return xa_err(dev_set);
97         }
98
99 found_get_ref:
100         dev_set->device_count++;
101         xa_unlock(&vfio_device_set_xa);
102         mutex_lock(&dev_set->lock);
103         device->dev_set = dev_set;
104         list_add_tail(&device->dev_set_list, &dev_set->device_list);
105         mutex_unlock(&dev_set->lock);
106         return 0;
107 }
108 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
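
/*
 * Illustrative sketch (hypothetical names, not part of this file's logic):
 * a driver typically calls vfio_assign_device_set() from its probe path so
 * that devices which can only be reset together share one vfio_device_set.
 * vfio-pci style drivers use the slot or bus as the set_id, for example:
 *
 *	static int my_probe(struct pci_dev *pdev)
 *	{
 *		struct my_vfio_dev *my = ...;	// embeds struct vfio_device vdev
 *		int ret;
 *
 *		// Devices on the same resettable slot/bus share a set
 *		if (pdev->slot && !pci_probe_reset_slot(pdev->slot))
 *			ret = vfio_assign_device_set(&my->vdev, pdev->slot);
 *		else
 *			ret = vfio_assign_device_set(&my->vdev, pdev->bus);
 *		return ret;
 *	}
 */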
109
110 static void vfio_release_device_set(struct vfio_device *device)
111 {
112         struct vfio_device_set *dev_set = device->dev_set;
113
114         if (!dev_set)
115                 return;
116
117         mutex_lock(&dev_set->lock);
118         list_del(&device->dev_set_list);
119         mutex_unlock(&dev_set->lock);
120
121         xa_lock(&vfio_device_set_xa);
122         if (!--dev_set->device_count) {
123                 __xa_erase(&vfio_device_set_xa,
124                            (unsigned long)dev_set->set_id);
125                 mutex_destroy(&dev_set->lock);
126                 kfree(dev_set);
127         }
128         xa_unlock(&vfio_device_set_xa);
129 }
130
131 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
132 {
133         struct vfio_device *cur;
134         unsigned int open_count = 0;
135
136         lockdep_assert_held(&dev_set->lock);
137
138         list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
139                 open_count += cur->open_count;
140         return open_count;
141 }
142 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
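
/*
 * Illustrative sketch (hypothetical): drivers use
 * vfio_device_set_open_count() under dev_set->lock to decide whether a
 * set-wide action, such as a bus/slot reset touching every device in the
 * set, is currently safe:
 *
 *	lockdep_assert_held(&dev_set->lock);
 *	if (vfio_device_set_open_count(dev_set) > 1)
 *		return -EBUSY;	// another device in the set is still open
 *	// safe to act on all devices in the set here
 */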
143
144 /*
145  * Device objects - create, release, get, put, search
146  */
147 /* Device reference always implies a group reference */
148 void vfio_device_put_registration(struct vfio_device *device)
149 {
150         if (refcount_dec_and_test(&device->refcount))
151                 complete(&device->comp);
152 }
153
154 bool vfio_device_try_get_registration(struct vfio_device *device)
155 {
156         return refcount_inc_not_zero(&device->refcount);
157 }
158
159 /*
160  * VFIO driver API
161  */
162 /* Release helper called by vfio_put_device() */
163 static void vfio_device_release(struct device *dev)
164 {
165         struct vfio_device *device =
166                         container_of(dev, struct vfio_device, device);
167
168         vfio_release_device_set(device);
169         ida_free(&vfio.device_ida, device->index);
170
171         if (device->ops->release)
172                 device->ops->release(device);
173
174         kvfree(device);
175 }
176
177 static int vfio_init_device(struct vfio_device *device, struct device *dev,
178                             const struct vfio_device_ops *ops);
179
180 /*
181  * Allocate and initialize vfio_device so it can be registered to vfio
182  * core.
183  *
184  * Drivers should use the wrapper vfio_alloc_device() for allocation.
185  * @size is the size of the structure to be allocated, including any
186  * private data used by the driver.
187  *
188  * The driver may provide an @init callback to set up device private data.
189  *
190  * Use vfio_put_device() to release the structure after a successful return.
191  */
192 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
193                                        const struct vfio_device_ops *ops)
194 {
195         struct vfio_device *device;
196         int ret;
197
198         if (WARN_ON(size < sizeof(struct vfio_device)))
199                 return ERR_PTR(-EINVAL);
200
201         device = kvzalloc(size, GFP_KERNEL);
202         if (!device)
203                 return ERR_PTR(-ENOMEM);
204
205         ret = vfio_init_device(device, dev, ops);
206         if (ret)
207                 goto out_free;
208         return device;
209
210 out_free:
211         kvfree(device);
212         return ERR_PTR(ret);
213 }
214 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
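
/*
 * Illustrative sketch of the expected allocation pattern (hypothetical
 * driver names): drivers normally embed struct vfio_device as the first
 * member of their own state and use the vfio_alloc_device() wrapper, which
 * calls _vfio_alloc_device() with the full structure size.
 *
 *	struct my_vfio_dev {
 *		struct vfio_device vdev;	// must be the first member
 *		void __iomem *regs;		// driver private data
 *	};
 *
 *	struct my_vfio_dev *my;
 *
 *	my = vfio_alloc_device(my_vfio_dev, vdev, dev, &my_vfio_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	...
 *	vfio_put_device(&my->vdev);	// drops the reference from allocation
 */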
215
216 /*
217  * Initialize a vfio_device so it can be registered to vfio core.
218  */
219 static int vfio_init_device(struct vfio_device *device, struct device *dev,
220                             const struct vfio_device_ops *ops)
221 {
222         int ret;
223
224         ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
225         if (ret < 0) {
226                 dev_dbg(dev, "Failed to allocate index\n");
227                 return ret;
228         }
229
230         device->index = ret;
231         init_completion(&device->comp);
232         device->dev = dev;
233         device->ops = ops;
234
235         if (ops->init) {
236                 ret = ops->init(device);
237                 if (ret)
238                         goto out_uninit;
239         }
240
241         device_initialize(&device->device);
242         device->device.release = vfio_device_release;
243         device->device.class = vfio.device_class;
244         device->device.parent = device->dev;
245         return 0;
246
247 out_uninit:
248         vfio_release_device_set(device);
249         ida_free(&vfio.device_ida, device->index);
250         return ret;
251 }
252
253 static int __vfio_register_dev(struct vfio_device *device,
254                                enum vfio_group_type type)
255 {
256         int ret;
257
258         if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
259                     (!device->ops->bind_iommufd ||
260                      !device->ops->unbind_iommufd ||
261                      !device->ops->attach_ioas)))
262                 return -EINVAL;
263
264         /*
265          * If the driver doesn't specify a set then the device is added to a
266          * singleton set just for itself.
267          */
268         if (!device->dev_set)
269                 vfio_assign_device_set(device, device);
270
271         ret = dev_set_name(&device->device, "vfio%d", device->index);
272         if (ret)
273                 return ret;
274
275         ret = vfio_device_set_group(device, type);
276         if (ret)
277                 return ret;
278
279         ret = device_add(&device->device);
280         if (ret)
281                 goto err_out;
282
283         /* Refcounting can't start until the driver calls register */
284         refcount_set(&device->refcount, 1);
285
286         vfio_device_group_register(device);
287
288         return 0;
289 err_out:
290         vfio_device_remove_group(device);
291         return ret;
292 }
293
294 int vfio_register_group_dev(struct vfio_device *device)
295 {
296         return __vfio_register_dev(device, VFIO_IOMMU);
297 }
298 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
299
300 /*
301  * Register a virtual device without IOMMU backing.  The user of this
302  * device must not be able to directly trigger unmediated DMA.
303  */
304 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
305 {
306         return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
307 }
308 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
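
/*
 * Illustrative registration sketch (hypothetical driver): after allocation
 * and driver-specific setup, the device is made visible to userspace with
 * one of the register calls above, and torn down symmetrically on remove.
 *
 *	ret = vfio_register_emulated_iommu_dev(&my->vdev);
 *	if (ret) {
 *		vfio_put_device(&my->vdev);
 *		return ret;
 *	}
 *	dev_set_drvdata(dev, my);
 *
 *	// ... later, in the remove path:
 *	vfio_unregister_group_dev(&my->vdev);
 *	vfio_put_device(&my->vdev);
 */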
309
310 /*
311  * Decrement the device reference count and wait for the device to be
312  * removed.  Open file descriptors for the device block removal until released. */
313 void vfio_unregister_group_dev(struct vfio_device *device)
314 {
315         unsigned int i = 0;
316         bool interrupted = false;
317         long rc;
318
319         vfio_device_put_registration(device);
320         rc = try_wait_for_completion(&device->comp);
321         while (rc <= 0) {
322                 if (device->ops->request)
323                         device->ops->request(device, i++);
324
325                 if (interrupted) {
326                         rc = wait_for_completion_timeout(&device->comp,
327                                                          HZ * 10);
328                 } else {
329                         rc = wait_for_completion_interruptible_timeout(
330                                 &device->comp, HZ * 10);
331                         if (rc < 0) {
332                                 interrupted = true;
333                                 dev_warn(device->dev,
334                                          "Device is currently in use, task"
335                                          " \"%s\" (%d) "
336                                          "blocked until device is released",
337                                          current->comm, task_pid_nr(current));
338                         }
339                 }
340         }
341
342         vfio_device_group_unregister(device);
343
344         /* Balances device_add in register path */
345         device_del(&device->device);
346
347         /* Balances vfio_device_set_group in register path */
348         vfio_device_remove_group(device);
349 }
350 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
351
352 #ifdef CONFIG_HAVE_KVM
353 void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
354 {
355         void (*pfn)(struct kvm *kvm);
356         bool (*fn)(struct kvm *kvm);
357         bool ret;
358
359         lockdep_assert_held(&device->dev_set->lock);
360
361         pfn = symbol_get(kvm_put_kvm);
362         if (WARN_ON(!pfn))
363                 return;
364
365         fn = symbol_get(kvm_get_kvm_safe);
366         if (WARN_ON(!fn)) {
367                 symbol_put(kvm_put_kvm);
368                 return;
369         }
370
371         ret = fn(kvm);
372         symbol_put(kvm_get_kvm_safe);
373         if (!ret) {
374                 symbol_put(kvm_put_kvm);
375                 return;
376         }
377
378         device->put_kvm = pfn;
379         device->kvm = kvm;
380 }
381
382 void vfio_device_put_kvm(struct vfio_device *device)
383 {
384         lockdep_assert_held(&device->dev_set->lock);
385
386         if (!device->kvm)
387                 return;
388
389         if (WARN_ON(!device->put_kvm))
390                 goto clear;
391
392         device->put_kvm(device->kvm);
393         device->put_kvm = NULL;
394         symbol_put(kvm_put_kvm);
395
396 clear:
397         device->kvm = NULL;
398 }
399 #endif
400
401 /* true if the vfio_device has open_device() called but not close_device() */
402 static bool vfio_assert_device_open(struct vfio_device *device)
403 {
404         return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
405 }
406
407 static int vfio_device_first_open(struct vfio_device *device,
408                                   struct iommufd_ctx *iommufd)
409 {
410         int ret;
411
412         lockdep_assert_held(&device->dev_set->lock);
413
414         if (!try_module_get(device->dev->driver->owner))
415                 return -ENODEV;
416
417         if (iommufd)
418                 ret = vfio_iommufd_bind(device, iommufd);
419         else
420                 ret = vfio_device_group_use_iommu(device);
421         if (ret)
422                 goto err_module_put;
423
424         if (device->ops->open_device) {
425                 ret = device->ops->open_device(device);
426                 if (ret)
427                         goto err_unuse_iommu;
428         }
429         return 0;
430
431 err_unuse_iommu:
432         if (iommufd)
433                 vfio_iommufd_unbind(device);
434         else
435                 vfio_device_group_unuse_iommu(device);
436 err_module_put:
437         module_put(device->dev->driver->owner);
438         return ret;
439 }
440
441 static void vfio_device_last_close(struct vfio_device *device,
442                                    struct iommufd_ctx *iommufd)
443 {
444         lockdep_assert_held(&device->dev_set->lock);
445
446         if (device->ops->close_device)
447                 device->ops->close_device(device);
448         if (iommufd)
449                 vfio_iommufd_unbind(device);
450         else
451                 vfio_device_group_unuse_iommu(device);
452         module_put(device->dev->driver->owner);
453 }
454
455 int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd)
456 {
457         int ret = 0;
458
459         lockdep_assert_held(&device->dev_set->lock);
460
461         device->open_count++;
462         if (device->open_count == 1) {
463                 ret = vfio_device_first_open(device, iommufd);
464                 if (ret)
465                         device->open_count--;
466         }
467
468         return ret;
469 }
470
471 void vfio_device_close(struct vfio_device *device,
472                        struct iommufd_ctx *iommufd)
473 {
474         lockdep_assert_held(&device->dev_set->lock);
475
476         vfio_assert_device_open(device);
477         if (device->open_count == 1)
478                 vfio_device_last_close(device, iommufd);
479         device->open_count--;
480 }
481
482 /*
483  * Wrapper around pm_runtime_resume_and_get().
484  * Return error code on failure or 0 on success.
485  */
486 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
487 {
488         struct device *dev = device->dev;
489
490         if (dev->driver && dev->driver->pm) {
491                 int ret;
492
493                 ret = pm_runtime_resume_and_get(dev);
494                 if (ret) {
495                         dev_info_ratelimited(dev,
496                                 "vfio: runtime resume failed %d\n", ret);
497                         return -EIO;
498                 }
499         }
500
501         return 0;
502 }
503
504 /*
505  * Wrapper around pm_runtime_put().
506  */
507 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
508 {
509         struct device *dev = device->dev;
510
511         if (dev->driver && dev->driver->pm)
512                 pm_runtime_put(dev);
513 }
514
515 /*
516  * VFIO Device fd
517  */
518 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
519 {
520         struct vfio_device *device = filep->private_data;
521
522         vfio_device_group_close(device);
523
524         vfio_device_put_registration(device);
525
526         return 0;
527 }
528
529 /*
530  * vfio_mig_get_next_state - Compute the next step in the FSM
531  * @cur_fsm - The current state the device is in
532  * @new_fsm - The target state to reach
533  * @next_fsm - Pointer to the next step to get to new_fsm
534  *
535  * Return 0 upon success, otherwise -errno
536  * Upon success the next step in the state progression between cur_fsm and
537  * new_fsm will be set in next_fsm.
538  *
539  * This breaks down requests for combination transitions into smaller steps and
540  * returns the next step to get to new_fsm. The function may need to be called
541  * multiple times before reaching new_fsm.
542  *
543  */
544 int vfio_mig_get_next_state(struct vfio_device *device,
545                             enum vfio_device_mig_state cur_fsm,
546                             enum vfio_device_mig_state new_fsm,
547                             enum vfio_device_mig_state *next_fsm)
548 {
549         enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
550         /*
551          * The coding in this table requires the driver to implement the
552          * following FSM arcs:
553          *         RESUMING -> STOP
554          *         STOP -> RESUMING
555          *         STOP -> STOP_COPY
556          *         STOP_COPY -> STOP
557          *
558          * If P2P is supported then the driver must also implement these FSM
559          * arcs:
560          *         RUNNING -> RUNNING_P2P
561          *         RUNNING_P2P -> RUNNING
562          *         RUNNING_P2P -> STOP
563          *         STOP -> RUNNING_P2P
564          *
565          * If precopy is supported then the driver must support these additional
566          * FSM arcs:
567          *         RUNNING -> PRE_COPY
568          *         PRE_COPY -> RUNNING
569          *         PRE_COPY -> STOP_COPY
570          * However, if precopy and P2P are supported together then the driver
571          * must support these additional arcs beyond the P2P arcs above:
572          *         PRE_COPY -> RUNNING
573          *         PRE_COPY -> PRE_COPY_P2P
574          *         PRE_COPY_P2P -> PRE_COPY
575          *         PRE_COPY_P2P -> RUNNING_P2P
576          *         PRE_COPY_P2P -> STOP_COPY
577          *         RUNNING -> PRE_COPY
578          *         RUNNING_P2P -> PRE_COPY_P2P
579          *
580          * Without P2P and precopy the driver must implement:
581          *         RUNNING -> STOP
582          *         STOP -> RUNNING
583          *
584          * The coding will step through multiple states for some combination
585          * transitions; if all optional features are supported, this means the
586          * following ones:
587          *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
588          *         PRE_COPY -> RUNNING -> RUNNING_P2P
589          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
590          *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
591          *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
592          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
593          *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
594          *         RESUMING -> STOP -> RUNNING_P2P
595          *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
596          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
597          *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
598          *         RESUMING -> STOP -> STOP_COPY
599          *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
600          *         RUNNING -> RUNNING_P2P -> STOP
601          *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
602          *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
603          *         RUNNING_P2P -> RUNNING -> PRE_COPY
604          *         RUNNING_P2P -> STOP -> RESUMING
605          *         RUNNING_P2P -> STOP -> STOP_COPY
606          *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
607          *         STOP -> RUNNING_P2P -> RUNNING
608          *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
609          *         STOP_COPY -> STOP -> RESUMING
610          *         STOP_COPY -> STOP -> RUNNING_P2P
611          *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
612          *
613          *  The following transitions are blocked:
614          *         STOP_COPY -> PRE_COPY
615          *         STOP_COPY -> PRE_COPY_P2P
616          */
617         static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
618                 [VFIO_DEVICE_STATE_STOP] = {
619                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
620                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
621                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
622                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
623                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
624                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
625                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
626                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
627                 },
628                 [VFIO_DEVICE_STATE_RUNNING] = {
629                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
630                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
631                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
632                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
633                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
634                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
635                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
636                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
637                 },
638                 [VFIO_DEVICE_STATE_PRE_COPY] = {
639                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
640                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
641                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
642                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
643                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
644                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
645                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
646                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
647                 },
648                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
649                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
650                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
651                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
652                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
653                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
654                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
655                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
656                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
657                 },
658                 [VFIO_DEVICE_STATE_STOP_COPY] = {
659                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
660                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
661                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
662                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
663                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
664                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
665                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
666                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
667                 },
668                 [VFIO_DEVICE_STATE_RESUMING] = {
669                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
670                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
671                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
672                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
673                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
674                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
675                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
676                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
677                 },
678                 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
679                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
680                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
681                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
682                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
683                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
684                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
685                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
686                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
687                 },
688                 [VFIO_DEVICE_STATE_ERROR] = {
689                         [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
690                         [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
691                         [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
692                         [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
693                         [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
694                         [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
695                         [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
696                         [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
697                 },
698         };
699
700         static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
701                 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
702                 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
703                 [VFIO_DEVICE_STATE_PRE_COPY] =
704                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
705                 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
706                                                    VFIO_MIGRATION_P2P |
707                                                    VFIO_MIGRATION_PRE_COPY,
708                 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
709                 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
710                 [VFIO_DEVICE_STATE_RUNNING_P2P] =
711                         VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
712                 [VFIO_DEVICE_STATE_ERROR] = ~0U,
713         };
714
715         if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
716                     (state_flags_table[cur_fsm] & device->migration_flags) !=
717                         state_flags_table[cur_fsm]))
718                 return -EINVAL;
719
720         if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
721            (state_flags_table[new_fsm] & device->migration_flags) !=
722                         state_flags_table[new_fsm])
723                 return -EINVAL;
724
725         /*
726          * Arcs touching optional and unsupported states are skipped over. The
727          * driver will instead see an arc from the original state to the next
728          * logical state, as per the above comment.
729          */
730         *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
731         while ((state_flags_table[*next_fsm] & device->migration_flags) !=
732                         state_flags_table[*next_fsm])
733                 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
734
735         return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
736 }
737 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
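
/*
 * Illustrative sketch of how a migration driver consumes this helper
 * (hypothetical names): the driver's migration_set_state() op walks the FSM
 * one supported arc at a time until the requested state is reached.
 *
 *	enum vfio_device_mig_state next;
 *	struct file *res = NULL;
 *	int ret;
 *
 *	while (my->mig_state != new_state) {
 *		ret = vfio_mig_get_next_state(&my->vdev, my->mig_state,
 *					      new_state, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		res = my_step_device_state(my, next);	// perform one arc
 *		if (IS_ERR(res))
 *			return res;
 *		my->mig_state = next;
 *	}
 *	return res;
 */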
738
739 /*
740  * Convert the driver's struct file into a FD number and return it to userspace
741  */
742 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
743                                    struct vfio_device_feature_mig_state *mig)
744 {
745         int ret;
746         int fd;
747
748         fd = get_unused_fd_flags(O_CLOEXEC);
749         if (fd < 0) {
750                 ret = fd;
751                 goto out_fput;
752         }
753
754         mig->data_fd = fd;
755         if (copy_to_user(arg, mig, sizeof(*mig))) {
756                 ret = -EFAULT;
757                 goto out_put_unused;
758         }
759         fd_install(fd, filp);
760         return 0;
761
762 out_put_unused:
763         put_unused_fd(fd);
764 out_fput:
765         fput(filp);
766         return ret;
767 }
768
769 static int
770 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
771                                            u32 flags, void __user *arg,
772                                            size_t argsz)
773 {
774         size_t minsz =
775                 offsetofend(struct vfio_device_feature_mig_state, data_fd);
776         struct vfio_device_feature_mig_state mig;
777         struct file *filp = NULL;
778         int ret;
779
780         if (!device->mig_ops)
781                 return -ENOTTY;
782
783         ret = vfio_check_feature(flags, argsz,
784                                  VFIO_DEVICE_FEATURE_SET |
785                                  VFIO_DEVICE_FEATURE_GET,
786                                  sizeof(mig));
787         if (ret != 1)
788                 return ret;
789
790         if (copy_from_user(&mig, arg, minsz))
791                 return -EFAULT;
792
793         if (flags & VFIO_DEVICE_FEATURE_GET) {
794                 enum vfio_device_mig_state curr_state;
795
796                 ret = device->mig_ops->migration_get_state(device,
797                                                            &curr_state);
798                 if (ret)
799                         return ret;
800                 mig.device_state = curr_state;
801                 goto out_copy;
802         }
803
804         /* Handle the VFIO_DEVICE_FEATURE_SET */
805         filp = device->mig_ops->migration_set_state(device, mig.device_state);
806         if (IS_ERR(filp) || !filp)
807                 goto out_copy;
808
809         return vfio_ioct_mig_return_fd(filp, arg, &mig);
810 out_copy:
811         mig.data_fd = -1;
812         if (copy_to_user(arg, &mig, sizeof(mig)))
813                 return -EFAULT;
814         if (IS_ERR(filp))
815                 return PTR_ERR(filp);
816         return 0;
817 }
818
819 static int
820 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
821                                               u32 flags, void __user *arg,
822                                               size_t argsz)
823 {
824         struct vfio_device_feature_mig_data_size data_size = {};
825         unsigned long stop_copy_length;
826         int ret;
827
828         if (!device->mig_ops)
829                 return -ENOTTY;
830
831         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
832                                  sizeof(data_size));
833         if (ret != 1)
834                 return ret;
835
836         ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
837         if (ret)
838                 return ret;
839
840         data_size.stop_copy_length = stop_copy_length;
841         if (copy_to_user(arg, &data_size, sizeof(data_size)))
842                 return -EFAULT;
843
844         return 0;
845 }
846
847 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
848                                                u32 flags, void __user *arg,
849                                                size_t argsz)
850 {
851         struct vfio_device_feature_migration mig = {
852                 .flags = device->migration_flags,
853         };
854         int ret;
855
856         if (!device->mig_ops)
857                 return -ENOTTY;
858
859         ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
860                                  sizeof(mig));
861         if (ret != 1)
862                 return ret;
863         if (copy_to_user(arg, &mig, sizeof(mig)))
864                 return -EFAULT;
865         return 0;
866 }
867
868 /* Ranges should fit into a single kernel page */
869 #define LOG_MAX_RANGES \
870         (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
871
872 static int
873 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
874                                         u32 flags, void __user *arg,
875                                         size_t argsz)
876 {
877         size_t minsz =
878                 offsetofend(struct vfio_device_feature_dma_logging_control,
879                             ranges);
880         struct vfio_device_feature_dma_logging_range __user *ranges;
881         struct vfio_device_feature_dma_logging_control control;
882         struct vfio_device_feature_dma_logging_range range;
883         struct rb_root_cached root = RB_ROOT_CACHED;
884         struct interval_tree_node *nodes;
885         u64 iova_end;
886         u32 nnodes;
887         int i, ret;
888
889         if (!device->log_ops)
890                 return -ENOTTY;
891
892         ret = vfio_check_feature(flags, argsz,
893                                  VFIO_DEVICE_FEATURE_SET,
894                                  sizeof(control));
895         if (ret != 1)
896                 return ret;
897
898         if (copy_from_user(&control, arg, minsz))
899                 return -EFAULT;
900
901         nnodes = control.num_ranges;
902         if (!nnodes)
903                 return -EINVAL;
904
905         if (nnodes > LOG_MAX_RANGES)
906                 return -E2BIG;
907
908         ranges = u64_to_user_ptr(control.ranges);
909         nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
910                               GFP_KERNEL);
911         if (!nodes)
912                 return -ENOMEM;
913
914         for (i = 0; i < nnodes; i++) {
915                 if (copy_from_user(&range, &ranges[i], sizeof(range))) {
916                         ret = -EFAULT;
917                         goto end;
918                 }
919                 if (!IS_ALIGNED(range.iova, control.page_size) ||
920                     !IS_ALIGNED(range.length, control.page_size)) {
921                         ret = -EINVAL;
922                         goto end;
923                 }
924
925                 if (check_add_overflow(range.iova, range.length, &iova_end) ||
926                     iova_end > ULONG_MAX) {
927                         ret = -EOVERFLOW;
928                         goto end;
929                 }
930
931                 nodes[i].start = range.iova;
932                 nodes[i].last = range.iova + range.length - 1;
933                 if (interval_tree_iter_first(&root, nodes[i].start,
934                                              nodes[i].last)) {
935                         /* Range overlapping */
936                         ret = -EINVAL;
937                         goto end;
938                 }
939                 interval_tree_insert(nodes + i, &root);
940         }
941
942         ret = device->log_ops->log_start(device, &root, nnodes,
943                                          &control.page_size);
944         if (ret)
945                 goto end;
946
947         if (copy_to_user(arg, &control, sizeof(control))) {
948                 ret = -EFAULT;
949                 device->log_ops->log_stop(device);
950         }
951
952 end:
953         kfree(nodes);
954         return ret;
955 }
956
957 static int
958 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
959                                        u32 flags, void __user *arg,
960                                        size_t argsz)
961 {
962         int ret;
963
964         if (!device->log_ops)
965                 return -ENOTTY;
966
967         ret = vfio_check_feature(flags, argsz,
968                                  VFIO_DEVICE_FEATURE_SET, 0);
969         if (ret != 1)
970                 return ret;
971
972         return device->log_ops->log_stop(device);
973 }
974
975 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
976                                           unsigned long iova, size_t length,
977                                           void *opaque)
978 {
979         struct vfio_device *device = opaque;
980
981         return device->log_ops->log_read_and_clear(device, iova, length, iter);
982 }
983
984 static int
985 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
986                                          u32 flags, void __user *arg,
987                                          size_t argsz)
988 {
989         size_t minsz =
990                 offsetofend(struct vfio_device_feature_dma_logging_report,
991                             bitmap);
992         struct vfio_device_feature_dma_logging_report report;
993         struct iova_bitmap *iter;
994         u64 iova_end;
995         int ret;
996
997         if (!device->log_ops)
998                 return -ENOTTY;
999
1000         ret = vfio_check_feature(flags, argsz,
1001                                  VFIO_DEVICE_FEATURE_GET,
1002                                  sizeof(report));
1003         if (ret != 1)
1004                 return ret;
1005
1006         if (copy_from_user(&report, arg, minsz))
1007                 return -EFAULT;
1008
1009         if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1010                 return -EINVAL;
1011
1012         if (check_add_overflow(report.iova, report.length, &iova_end) ||
1013             iova_end > ULONG_MAX)
1014                 return -EOVERFLOW;
1015
1016         iter = iova_bitmap_alloc(report.iova, report.length,
1017                                  report.page_size,
1018                                  u64_to_user_ptr(report.bitmap));
1019         if (IS_ERR(iter))
1020                 return PTR_ERR(iter);
1021
1022         ret = iova_bitmap_for_each(iter, device,
1023                                    vfio_device_log_read_and_clear);
1024
1025         iova_bitmap_free(iter);
1026         return ret;
1027 }
1028
1029 static int vfio_ioctl_device_feature(struct vfio_device *device,
1030                                      struct vfio_device_feature __user *arg)
1031 {
1032         size_t minsz = offsetofend(struct vfio_device_feature, flags);
1033         struct vfio_device_feature feature;
1034
1035         if (copy_from_user(&feature, arg, minsz))
1036                 return -EFAULT;
1037
1038         if (feature.argsz < minsz)
1039                 return -EINVAL;
1040
1041         /* Check unknown flags */
1042         if (feature.flags &
1043             ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1044               VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1045                 return -EINVAL;
1046
1047         /* GET & SET are mutually exclusive except with PROBE */
1048         if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1049             (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1050             (feature.flags & VFIO_DEVICE_FEATURE_GET))
1051                 return -EINVAL;
1052
1053         switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1054         case VFIO_DEVICE_FEATURE_MIGRATION:
1055                 return vfio_ioctl_device_feature_migration(
1056                         device, feature.flags, arg->data,
1057                         feature.argsz - minsz);
1058         case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1059                 return vfio_ioctl_device_feature_mig_device_state(
1060                         device, feature.flags, arg->data,
1061                         feature.argsz - minsz);
1062         case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1063                 return vfio_ioctl_device_feature_logging_start(
1064                         device, feature.flags, arg->data,
1065                         feature.argsz - minsz);
1066         case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1067                 return vfio_ioctl_device_feature_logging_stop(
1068                         device, feature.flags, arg->data,
1069                         feature.argsz - minsz);
1070         case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1071                 return vfio_ioctl_device_feature_logging_report(
1072                         device, feature.flags, arg->data,
1073                         feature.argsz - minsz);
1074         case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1075                 return vfio_ioctl_device_feature_migration_data_size(
1076                         device, feature.flags, arg->data,
1077                         feature.argsz - minsz);
1078         default:
1079                 if (unlikely(!device->ops->device_feature))
1080                         return -EINVAL;
1081                 return device->ops->device_feature(device, feature.flags,
1082                                                    arg->data,
1083                                                    feature.argsz - minsz);
1084         }
1085 }
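
/*
 * Illustrative sketch of the userspace side of VFIO_DEVICE_FEATURE
 * (hypothetical values): the fixed header is followed by feature-specific
 * data, and argsz covers both.  For example, to query the migration state:
 *
 *	struct {
 *		struct vfio_device_feature hdr;
 *		struct vfio_device_feature_mig_state state;
 *	} get = {
 *		.hdr = {
 *			.argsz = sizeof(get),
 *			.flags = VFIO_DEVICE_FEATURE_GET |
 *				 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE,
 *		},
 *	};
 *
 *	if (!ioctl(device_fd, VFIO_DEVICE_FEATURE, &get))
 *		printf("state %u\n", get.state.device_state);
 */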
1086
1087 static long vfio_device_fops_unl_ioctl(struct file *filep,
1088                                        unsigned int cmd, unsigned long arg)
1089 {
1090         struct vfio_device *device = filep->private_data;
1091         int ret;
1092
1093         ret = vfio_device_pm_runtime_get(device);
1094         if (ret)
1095                 return ret;
1096
1097         switch (cmd) {
1098         case VFIO_DEVICE_FEATURE:
1099                 ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1100                 break;
1101
1102         default:
1103                 if (unlikely(!device->ops->ioctl))
1104                         ret = -EINVAL;
1105                 else
1106                         ret = device->ops->ioctl(device, cmd, arg);
1107                 break;
1108         }
1109
1110         vfio_device_pm_runtime_put(device);
1111         return ret;
1112 }
1113
1114 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1115                                      size_t count, loff_t *ppos)
1116 {
1117         struct vfio_device *device = filep->private_data;
1118
1119         if (unlikely(!device->ops->read))
1120                 return -EINVAL;
1121
1122         return device->ops->read(device, buf, count, ppos);
1123 }
1124
1125 static ssize_t vfio_device_fops_write(struct file *filep,
1126                                       const char __user *buf,
1127                                       size_t count, loff_t *ppos)
1128 {
1129         struct vfio_device *device = filep->private_data;
1130
1131         if (unlikely(!device->ops->write))
1132                 return -EINVAL;
1133
1134         return device->ops->write(device, buf, count, ppos);
1135 }
1136
1137 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1138 {
1139         struct vfio_device *device = filep->private_data;
1140
1141         if (unlikely(!device->ops->mmap))
1142                 return -EINVAL;
1143
1144         return device->ops->mmap(device, vma);
1145 }
1146
1147 const struct file_operations vfio_device_fops = {
1148         .owner          = THIS_MODULE,
1149         .release        = vfio_device_fops_release,
1150         .read           = vfio_device_fops_read,
1151         .write          = vfio_device_fops_write,
1152         .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1153         .compat_ioctl   = compat_ptr_ioctl,
1154         .mmap           = vfio_device_fops_mmap,
1155 };
1156
1157 /*
1158  * Sub-module support
1159  */
1160 /*
1161  * Helper for managing a buffer of info chain capabilities, allocate or
1162  * reallocate a buffer with additional @size, filling in @id and @version
1163  * of the capability.  A pointer to the new capability is returned.
1164  *
1165  * NB. The chain is based at the head of the buffer, so new entries are
1166  * added to the tail, vfio_info_cap_shift() should be called to fixup the
1167  * next offsets prior to copying to the user buffer.
1168  */
1169 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1170                                                size_t size, u16 id, u16 version)
1171 {
1172         void *buf;
1173         struct vfio_info_cap_header *header, *tmp;
1174
1175         buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1176         if (!buf) {
1177                 kfree(caps->buf);
1178                 caps->buf = NULL;
1179                 caps->size = 0;
1180                 return ERR_PTR(-ENOMEM);
1181         }
1182
1183         caps->buf = buf;
1184         header = buf + caps->size;
1185
1186         /* Eventually copied to user buffer, zero */
1187         memset(header, 0, size);
1188
1189         header->id = id;
1190         header->version = version;
1191
1192         /* Add to the end of the capability chain */
1193         for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1194                 ; /* nothing */
1195
1196         tmp->next = caps->size;
1197         caps->size += size;
1198
1199         return header;
1200 }
1201 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1202
1203 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1204 {
1205         struct vfio_info_cap_header *tmp;
1206         void *buf = (void *)caps->buf;
1207
1208         for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1209                 tmp->next += offset;
1210 }
1211 EXPORT_SYMBOL(vfio_info_cap_shift);
1212
1213 int vfio_info_add_capability(struct vfio_info_cap *caps,
1214                              struct vfio_info_cap_header *cap, size_t size)
1215 {
1216         struct vfio_info_cap_header *header;
1217
1218         header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1219         if (IS_ERR(header))
1220                 return PTR_ERR(header);
1221
1222         memcpy(header + 1, cap + 1, size - sizeof(*header));
1223
1224         return 0;
1225 }
1226 EXPORT_SYMBOL(vfio_info_add_capability);
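
/*
 * Illustrative sketch of the capability chain pattern (hypothetical ioctl
 * handler): capabilities are appended to a local buffer, then shifted so
 * their next offsets are relative to the start of the user buffer before
 * being copied out.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &my_cap.header, sizeof(my_cap));
 *	if (ret)
 *		return ret;
 *
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		if (info.argsz >= sizeof(info) + caps.size) {
 *			info.cap_offset = sizeof(info);
 *			vfio_info_cap_shift(&caps, sizeof(info));
 *			if (copy_to_user(arg + sizeof(info), caps.buf, caps.size))
 *				ret = -EFAULT;
 *		}
 *		kfree(caps.buf);
 *	}
 */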
1227
1228 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1229                                        int max_irq_type, size_t *data_size)
1230 {
1231         unsigned long minsz;
1232         size_t size;
1233
1234         minsz = offsetofend(struct vfio_irq_set, count);
1235
1236         if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1237             (hdr->count >= (U32_MAX - hdr->start)) ||
1238             (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1239                                 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1240                 return -EINVAL;
1241
1242         if (data_size)
1243                 *data_size = 0;
1244
1245         if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1246                 return -EINVAL;
1247
1248         switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1249         case VFIO_IRQ_SET_DATA_NONE:
1250                 size = 0;
1251                 break;
1252         case VFIO_IRQ_SET_DATA_BOOL:
1253                 size = sizeof(uint8_t);
1254                 break;
1255         case VFIO_IRQ_SET_DATA_EVENTFD:
1256                 size = sizeof(int32_t);
1257                 break;
1258         default:
1259                 return -EINVAL;
1260         }
1261
1262         if (size) {
1263                 if (hdr->argsz - minsz < hdr->count * size)
1264                         return -EINVAL;
1265
1266                 if (!data_size)
1267                         return -EINVAL;
1268
1269                 *data_size = hdr->count * size;
1270         }
1271
1272         return 0;
1273 }
1274 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
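
/*
 * Illustrative sketch (hypothetical driver ioctl handler): the helper
 * validates the header and reports how much trailing data must be copied
 * from userspace for the selected DATA type.
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *
 *	if (copy_from_user(&hdr, arg, offsetofend(struct vfio_irq_set, count)))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, my_num_irqs,
 *						 MY_NUM_IRQ_TYPES, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user(arg + offsetofend(struct vfio_irq_set, count),
 *				   data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */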
1275
1276 /*
1277  * Pin contiguous user pages and return their associated host pages for local
1278  * domain only.
1279  * @device [in]  : device
1280  * @iova [in]    : starting IOVA of user pages to be pinned.
1281  * @npage [in]   : count of pages to be pinned.  This count should not
1282  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1283  * @prot [in]    : protection flags
1284  * @pages[out]   : array of host pages
1285  * Return error or number of pages pinned.
1286  *
1287  * A driver may only call this function if the vfio_device was created
1288  * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1289  */
1290 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1291                    int npage, int prot, struct page **pages)
1292 {
1293         /* group->container cannot change while a vfio device is open */
1294         if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1295                 return -EINVAL;
1296         if (vfio_device_has_container(device))
1297                 return vfio_device_container_pin_pages(device, iova,
1298                                                        npage, prot, pages);
1299         if (device->iommufd_access) {
1300                 int ret;
1301
1302                 if (iova > ULONG_MAX)
1303                         return -EINVAL;
1304                 /*
1305                  * VFIO ignores the sub page offset, npages is from the start of
1306                  * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1307                  * the sub page offset by doing:
1308                  *     pages[0] + (iova % PAGE_SIZE)
1309                  */
1310                 ret = iommufd_access_pin_pages(
1311                         device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1312                         npage * PAGE_SIZE, pages,
1313                         (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1314                 if (ret)
1315                         return ret;
1316                 return npage;
1317         }
1318         return -EINVAL;
1319 }
1320 EXPORT_SYMBOL(vfio_pin_pages);
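
/*
 * Illustrative sketch (hypothetical mdev-style driver): pin one page backing
 * an IOVA and recover the sub-page offset as described above.
 *
 *	struct page *page;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(&my->vdev, iova, 1, IOMMU_READ | IOMMU_WRITE,
 *			     &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	va = kmap_local_page(page) + offset_in_page(iova);
 *	// ... access the guest page through va ...
 *	kunmap_local(va);
 *	vfio_unpin_pages(&my->vdev, iova, 1);
 */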
1321
1322 /*
1323  * Unpin contiguous host pages for local domain only.
1324  * @device [in]  : device
1325  * @iova [in]    : starting address of user pages to be unpinned.
1326  * @npage [in]   : count of pages to be unpinned.  This count should not
1327  *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1328  */
1329 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1330 {
1331         if (WARN_ON(!vfio_assert_device_open(device)))
1332                 return;
1333
1334         if (vfio_device_has_container(device)) {
1335                 vfio_device_container_unpin_pages(device, iova, npage);
1336                 return;
1337         }
1338         if (device->iommufd_access) {
1339                 if (WARN_ON(iova > ULONG_MAX))
1340                         return;
1341                 iommufd_access_unpin_pages(device->iommufd_access,
1342                                            ALIGN_DOWN(iova, PAGE_SIZE),
1343                                            npage * PAGE_SIZE);
1344                 return;
1345         }
1346 }
1347 EXPORT_SYMBOL(vfio_unpin_pages);
1348
1349 /*
1350  * This interface allows the CPUs to perform some sort of virtual DMA on
1351  * behalf of the device.
1352  *
1353  * CPUs read/write from/into a range of IOVAs pointing to user space memory
1354  * into/from a kernel buffer.
1355  *
1356  * As the read/write of user space memory is conducted via the CPUs and is
1357  * not a real device DMA, it is not necessary to pin the user space memory.
1358  *
1359  * @device [in]         : VFIO device
1360  * @iova [in]           : base IOVA of a user space buffer
1361  * @data [in]           : pointer to kernel buffer
1362  * @len [in]            : kernel buffer length
1363  * @write               : indicate read or write
1364  * Return error code on failure or 0 on success.
1365  */
1366 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1367                 size_t len, bool write)
1368 {
1369         if (!data || len <= 0 || !vfio_assert_device_open(device))
1370                 return -EINVAL;
1371
1372         if (vfio_device_has_container(device))
1373                 return vfio_device_container_dma_rw(device, iova,
1374                                                     data, len, write);
1375
1376         if (device->iommufd_access) {
1377                 unsigned int flags = 0;
1378
1379                 if (iova > ULONG_MAX)
1380                         return -EINVAL;
1381
1382                 /* VFIO historically tries to auto-detect a kthread */
1383                 if (!current->mm)
1384                         flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1385                 if (write)
1386                         flags |= IOMMUFD_ACCESS_RW_WRITE;
1387                 return iommufd_access_rw(device->iommufd_access, iova, data,
1388                                          len, flags);
1389         }
1390         return -EINVAL;
1391 }
1392 EXPORT_SYMBOL(vfio_dma_rw);
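
/*
 * Illustrative sketch (hypothetical mediated driver): read a descriptor that
 * the guest placed at desc_iova without pinning the backing memory.
 *
 *	struct my_ring_desc desc;
 *	int ret;
 *
 *	ret = vfio_dma_rw(&my->vdev, desc_iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 *	// ... a later vfio_dma_rw(..., true) call would write a completion back
 */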
1393
1394 /*
1395  * Module/class support
1396  */
1397 static int __init vfio_init(void)
1398 {
1399         int ret;
1400
1401         ida_init(&vfio.device_ida);
1402
1403         ret = vfio_group_init();
1404         if (ret)
1405                 return ret;
1406
1407         ret = vfio_virqfd_init();
1408         if (ret)
1409                 goto err_virqfd;
1410
1411         /* /sys/class/vfio-dev/vfioX */
1412         vfio.device_class = class_create("vfio-dev");
1413         if (IS_ERR(vfio.device_class)) {
1414                 ret = PTR_ERR(vfio.device_class);
1415                 goto err_dev_class;
1416         }
1417
1418         pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1419         return 0;
1420
1421 err_dev_class:
1422         vfio_virqfd_exit();
1423 err_virqfd:
1424         vfio_group_cleanup();
1425         return ret;
1426 }
1427
1428 static void __exit vfio_cleanup(void)
1429 {
1430         ida_destroy(&vfio.device_ida);
1431         class_destroy(vfio.device_class);
1432         vfio.device_class = NULL;
1433         vfio_virqfd_exit();
1434         vfio_group_cleanup();
1435         xa_destroy(&vfio_device_set_xa);
1436 }
1437
1438 module_init(vfio_init);
1439 module_exit(vfio_cleanup);
1440
1441 MODULE_VERSION(DRIVER_VERSION);
1442 MODULE_LICENSE("GPL v2");
1443 MODULE_AUTHOR(DRIVER_AUTHOR);
1444 MODULE_DESCRIPTION(DRIVER_DESC);
1445 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");