bhyve nvme: Add Select support to Get Features
usr.sbin/bhyve/pci_nvme.c
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2017 Shunsuke Mie
5  * Copyright (c) 2018 Leon Dang
6  * Copyright (c) 2020 Chuck Tuffli
7  *
8  * Function crc16 Copyright (c) 2017, Fedor Uporov
9  *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32
33 /*
34  * bhyve PCIe-NVMe device emulation.
35  *
36  * options:
37  *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
38  *
39  *  accepted devpath:
40  *    /dev/blockdev
41  *    /path/to/image
42  *    ram=size_in_MiB
43  *
44  *  maxq    = max number of queues
45  *  qsz     = max elements in each queue
46  *  ioslots = max number of concurrent io requests
47  *  sectsz  = sector size (defaults to blockif sector size)
48  *  ser     = serial number (20-chars max)
49  *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 50  *  dsm     = Dataset Management support. Option is one of auto, enable, or disable
51  *
52  */
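/*
 * For illustration only, a hypothetical invocation attaching a 4 GiB
 * RAM-backed namespace (the slot number, sizes, and serial are arbitrary):
 *
 *  bhyve ... -s 4,nvme,ram=4096,maxq=4,qsz=512,ioslots=16,ser=BHYVE0001 vmname
 */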
53
54 /* TODO:
 55     - create async event for SMART and log
56     - intr coalesce
57  */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <semaphore.h>
70 #include <stdbool.h>
71 #include <stddef.h>
72 #include <stdint.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <string.h>
76
77 #include <machine/atomic.h>
78 #include <machine/vmm.h>
79 #include <vmmapi.h>
80
81 #include <dev/nvme/nvme.h>
82
83 #include "bhyverun.h"
84 #include "block_if.h"
85 #include "config.h"
86 #include "debug.h"
87 #include "pci_emul.h"
88
89
90 static int nvme_debug = 0;
 91 #define DPRINTF(fmt, args...) do { if (nvme_debug) PRINTLN(fmt, ##args); } while (0)
92 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
93
94 /* defaults; can be overridden */
95 #define NVME_MSIX_BAR           4
96
97 #define NVME_IOSLOTS            8
98
99 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
100 #define NVME_MMIO_SPACE_MIN     (1 << 14)
101
102 #define NVME_QUEUES             16
103 #define NVME_MAX_QENTRIES       2048
104 /* Memory Page size Minimum reported in CAP register */
105 #define NVME_MPSMIN             0
106 /* MPSMIN converted to bytes */
107 #define NVME_MPSMIN_BYTES       (1 << (12 + NVME_MPSMIN))
108
109 #define NVME_PRP2_ITEMS         (PAGE_SIZE/sizeof(uint64_t))
110 #define NVME_MDTS               9
111 /* Note the + 1 allows for the initial descriptor to not be page aligned */
112 #define NVME_MAX_IOVEC          ((1 << NVME_MDTS) + 1)
113 #define NVME_MAX_DATA_SIZE      ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
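/* e.g. with MDTS=9 and MPSMIN=0 (4 KiB pages): 512 pages, or 2 MiB max */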
114
115 /* This is a synthetic status code to indicate there is no status */
116 #define NVME_NO_STATUS          0xffff
117 #define NVME_COMPLETION_VALID(c)        ((c).status != NVME_NO_STATUS)
118
119 /* Reported temperature in Kelvin (i.e. room temperature) */
120 #define NVME_TEMPERATURE 296
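/* 296 K is 22.85 degrees Celsius (296 - 273.15) */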
121
122 /* helpers */
123
124 /* Convert a zero-based value into a one-based value */
125 #define ONE_BASED(zero)         ((zero) + 1)
126 /* Convert a one-based value into a zero-based value */
127 #define ZERO_BASED(one)         ((one)  - 1)
128
129 /* Encode number of SQ's and CQ's for Set/Get Features */
 130 #define NVME_FEATURE_NUM_QUEUES(sc) \
 131         ((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
 132          (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16)
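/* e.g. 16 SQs and 16 CQs encode as 0x000F000F (each field is zero-based) */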
133
134 #define NVME_DOORBELL_OFFSET    offsetof(struct nvme_registers, doorbell)
135
136 enum nvme_controller_register_offsets {
137         NVME_CR_CAP_LOW = 0x00,
138         NVME_CR_CAP_HI  = 0x04,
139         NVME_CR_VS      = 0x08,
140         NVME_CR_INTMS   = 0x0c,
141         NVME_CR_INTMC   = 0x10,
142         NVME_CR_CC      = 0x14,
143         NVME_CR_CSTS    = 0x1c,
144         NVME_CR_NSSR    = 0x20,
145         NVME_CR_AQA     = 0x24,
146         NVME_CR_ASQ_LOW = 0x28,
147         NVME_CR_ASQ_HI  = 0x2c,
148         NVME_CR_ACQ_LOW = 0x30,
149         NVME_CR_ACQ_HI  = 0x34,
150 };
151
152 enum nvme_cmd_cdw11 {
153         NVME_CMD_CDW11_PC  = 0x0001,
154         NVME_CMD_CDW11_IEN = 0x0002,
155         NVME_CMD_CDW11_IV  = 0xFFFF0000,
156 };
157
158 enum nvme_copy_dir {
159         NVME_COPY_TO_PRP,
160         NVME_COPY_FROM_PRP,
161 };
162
163 #define NVME_CQ_INTEN   0x01
164 #define NVME_CQ_INTCOAL 0x02
165
166 struct nvme_completion_queue {
167         struct nvme_completion *qbase;
168         pthread_mutex_t mtx;
169         uint32_t        size;
170         uint16_t        tail; /* nvme progress */
171         uint16_t        head; /* guest progress */
172         uint16_t        intr_vec;
173         uint32_t        intr_en;
174 };
175
176 struct nvme_submission_queue {
177         struct nvme_command *qbase;
178         pthread_mutex_t mtx;
179         uint32_t        size;
180         uint16_t        head; /* nvme progress */
181         uint16_t        tail; /* guest progress */
182         uint16_t        cqid; /* completion queue id */
183         int             qpriority;
184 };
185
186 enum nvme_storage_type {
187         NVME_STOR_BLOCKIF = 0,
188         NVME_STOR_RAM = 1,
189 };
190
191 struct pci_nvme_blockstore {
192         enum nvme_storage_type type;
193         void            *ctx;
194         uint64_t        size;
195         uint32_t        sectsz;
196         uint32_t        sectsz_bits;
197         uint64_t        eui64;
198         uint32_t        deallocate:1;
199 };
200
201 /*
202  * Calculate the number of additional page descriptors for guest IO requests
 203  * based on the advertised Maximum Data Transfer Size (MDTS) and the number of
204  * default iovec's in a struct blockif_req.
205  */
206 #define MDTS_PAD_SIZE \
207         ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
208           NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
209           0 )
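/*
 * Worked example, assuming BLOCKIF_IOV_MAX is 128: NVME_MAX_IOVEC is
 * (1 << 9) + 1 = 513, so MDTS_PAD_SIZE is 513 - 128 = 385 entries.
 */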
210
211 struct pci_nvme_ioreq {
212         struct pci_nvme_softc *sc;
213         STAILQ_ENTRY(pci_nvme_ioreq) link;
214         struct nvme_submission_queue *nvme_sq;
215         uint16_t        sqid;
216
217         /* command information */
218         uint16_t        opc;
219         uint16_t        cid;
220         uint32_t        nsid;
221
222         uint64_t        prev_gpaddr;
223         size_t          prev_size;
224         size_t          bytes;
225
226         struct blockif_req io_req;
227
228         struct iovec    iovpadding[MDTS_PAD_SIZE];
229 };
230
231 enum nvme_dsm_type {
232         /* Dataset Management bit in ONCS reflects backing storage capability */
233         NVME_DATASET_MANAGEMENT_AUTO,
234         /* Unconditionally set Dataset Management bit in ONCS */
235         NVME_DATASET_MANAGEMENT_ENABLE,
236         /* Unconditionally clear Dataset Management bit in ONCS */
237         NVME_DATASET_MANAGEMENT_DISABLE,
238 };
239
240 struct pci_nvme_softc;
241 struct nvme_feature_obj;
242
243 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
244     struct nvme_feature_obj *,
245     struct nvme_command *,
246     struct nvme_completion *);
247
248 struct nvme_feature_obj {
249         uint32_t        cdw11;
250         nvme_feature_cb set;
251         nvme_feature_cb get;
252         bool namespace_specific;
253 };
254
255 #define NVME_FID_MAX            (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
256
257 typedef enum {
258         PCI_NVME_AE_TYPE_ERROR = 0,
259         PCI_NVME_AE_TYPE_SMART,
260         PCI_NVME_AE_TYPE_NOTICE,
261         PCI_NVME_AE_TYPE_IO_CMD = 6,
262         PCI_NVME_AE_TYPE_VENDOR = 7,
263         PCI_NVME_AE_TYPE_MAX            /* Must be last */
264 } pci_nvme_async_type;
265
266 /* Asynchronous Event Requests */
267 struct pci_nvme_aer {
268         STAILQ_ENTRY(pci_nvme_aer) link;
269         uint16_t        cid;    /* Command ID of the submitted AER */
270 };
271
272 typedef enum {
273         PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0,
274         PCI_NVME_AE_INFO_FW_ACTIVATION,
275         PCI_NVME_AE_INFO_TELEMETRY_CHANGE,
276         PCI_NVME_AE_INFO_ANA_CHANGE,
277         PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE,
278         PCI_NVME_AE_INFO_LBA_STATUS_ALERT,
279         PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE,
280         PCI_NVME_AE_INFO_MAX,
281 } pci_nvme_async_info;
282
283 /* Asynchronous Event Notifications */
284 struct pci_nvme_aen {
285         pci_nvme_async_type atype;
286         uint32_t        event_data;
287         bool            posted;
288 };
289
290 typedef enum {
291         NVME_CNTRLTYPE_IO = 1,
292         NVME_CNTRLTYPE_DISCOVERY = 2,
293         NVME_CNTRLTYPE_ADMIN = 3,
294 } pci_nvme_cntrl_type;
295
296 struct pci_nvme_softc {
297         struct pci_devinst *nsc_pi;
298
299         pthread_mutex_t mtx;
300
301         struct nvme_registers regs;
302
303         struct nvme_namespace_data  nsdata;
304         struct nvme_controller_data ctrldata;
305         struct nvme_error_information_entry err_log;
306         struct nvme_health_information_page health_log;
307         struct nvme_firmware_page fw_log;
308         struct nvme_ns_list ns_log;
309
310         struct pci_nvme_blockstore nvstore;
311
312         uint16_t        max_qentries;   /* max entries per queue */
313         uint32_t        max_queues;     /* max number of IO SQ's or CQ's */
314         uint32_t        num_cqueues;
315         uint32_t        num_squeues;
316         bool            num_q_is_set; /* Has host set Number of Queues */
317
318         struct pci_nvme_ioreq *ioreqs;
319         STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
320         uint32_t        pending_ios;
321         uint32_t        ioslots;
322         sem_t           iosemlock;
323
324         /*
325          * Memory mapped Submission and Completion queues
326          * Each array includes both Admin and IO queues
327          */
328         struct nvme_completion_queue *compl_queues;
329         struct nvme_submission_queue *submit_queues;
330
331         struct nvme_feature_obj feat[NVME_FID_MAX];
332
333         enum nvme_dsm_type dataset_management;
334
335         /* Accounting for SMART data */
336         __uint128_t     read_data_units;
337         __uint128_t     write_data_units;
338         __uint128_t     read_commands;
339         __uint128_t     write_commands;
340         uint32_t        read_dunits_remainder;
341         uint32_t        write_dunits_remainder;
342
343         STAILQ_HEAD(, pci_nvme_aer) aer_list;
344         pthread_mutex_t aer_mtx;
345         uint32_t        aer_count;
346         struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
347         pthread_t       aen_tid;
348         pthread_mutex_t aen_mtx;
349         pthread_cond_t  aen_cond;
350 };
351
352
353 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
354     struct nvme_completion_queue *cq,
355     uint32_t cdw0,
356     uint16_t cid,
357     uint16_t sqid,
358     uint16_t status);
359 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
360 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
361 static void pci_nvme_io_done(struct blockif_req *, int);
362
363 /* Controller Configuration utils */
364 #define NVME_CC_GET_EN(cc) \
365         ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
366 #define NVME_CC_GET_CSS(cc) \
367         ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
368 #define NVME_CC_GET_SHN(cc) \
369         ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
370 #define NVME_CC_GET_IOSQES(cc) \
371         ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
372 #define NVME_CC_GET_IOCQES(cc) \
373         ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
374
375 #define NVME_CC_WRITE_MASK \
376         ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
377          (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
378          (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
379
380 #define NVME_CC_NEN_WRITE_MASK \
381         ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
382          (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
383          (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
384
385 /* Controller Status utils */
386 #define NVME_CSTS_GET_RDY(sts) \
387         ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
388
389 #define NVME_CSTS_RDY   (1 << NVME_CSTS_REG_RDY_SHIFT)
390
391 /* Completion Queue status word utils */
392 #define NVME_STATUS_P   (1 << NVME_STATUS_P_SHIFT)
393 #define NVME_STATUS_MASK \
394         ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
395          (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
396
397 #define NVME_ONCS_DSM   (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
398         NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
399
400 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
401     struct nvme_feature_obj *,
402     struct nvme_command *,
403     struct nvme_completion *);
404 static void nvme_feature_temperature(struct pci_nvme_softc *,
405     struct nvme_feature_obj *,
406     struct nvme_command *,
407     struct nvme_completion *);
408 static void nvme_feature_num_queues(struct pci_nvme_softc *,
409     struct nvme_feature_obj *,
410     struct nvme_command *,
411     struct nvme_completion *);
412 static void nvme_feature_iv_config(struct pci_nvme_softc *,
413     struct nvme_feature_obj *,
414     struct nvme_command *,
415     struct nvme_completion *);
416
417 static void *aen_thr(void *arg);
418
419 static __inline void
420 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
421 {
422         size_t len;
423
424         len = strnlen(src, dst_size);
425         memset(dst, pad, dst_size);
426         memcpy(dst, src, len);
427 }
428
429 static __inline void
430 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
431 {
432
433         *status &= ~NVME_STATUS_MASK;
434         *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
435                 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
436 }
437
438 static __inline void
439 pci_nvme_status_genc(uint16_t *status, uint16_t code)
440 {
441
442         pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
443 }
444
445 /*
 446  * Initialize the requested number of IO Submission and Completion Queues.
447  * Admin queues are allocated implicitly.
448  */
449 static void
450 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
451 {
452         uint32_t i;
453
454         /*
455          * Allocate and initialize the Submission Queues
456          */
457         if (nsq > NVME_QUEUES) {
458                 WPRINTF("%s: clamping number of SQ from %u to %u",
459                                         __func__, nsq, NVME_QUEUES);
460                 nsq = NVME_QUEUES;
461         }
462
463         sc->num_squeues = nsq;
464
465         sc->submit_queues = calloc(sc->num_squeues + 1,
466                                 sizeof(struct nvme_submission_queue));
467         if (sc->submit_queues == NULL) {
468                 WPRINTF("%s: SQ allocation failed", __func__);
469                 sc->num_squeues = 0;
470         } else {
471                 struct nvme_submission_queue *sq = sc->submit_queues;
472
473                 for (i = 0; i < sc->num_squeues; i++)
474                         pthread_mutex_init(&sq[i].mtx, NULL);
475         }
476
477         /*
478          * Allocate and initialize the Completion Queues
479          */
480         if (ncq > NVME_QUEUES) {
481                 WPRINTF("%s: clamping number of CQ from %u to %u",
482                                         __func__, ncq, NVME_QUEUES);
483                 ncq = NVME_QUEUES;
484         }
485
486         sc->num_cqueues = ncq;
487
488         sc->compl_queues = calloc(sc->num_cqueues + 1,
489                                 sizeof(struct nvme_completion_queue));
490         if (sc->compl_queues == NULL) {
491                 WPRINTF("%s: CQ allocation failed", __func__);
492                 sc->num_cqueues = 0;
493         } else {
494                 struct nvme_completion_queue *cq = sc->compl_queues;
495
496                 for (i = 0; i < sc->num_cqueues; i++)
497                         pthread_mutex_init(&cq[i].mtx, NULL);
498         }
499 }
500
501 static void
502 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
503 {
504         struct nvme_controller_data *cd = &sc->ctrldata;
505
506         cd->vid = 0xFB5D;
507         cd->ssvid = 0x0000;
508
509         cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
510         cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
511
512         /* Num of submission commands that we can handle at a time (2^rab) */
513         cd->rab   = 4;
514
515         /* FreeBSD OUI */
516         cd->ieee[0] = 0x58;
517         cd->ieee[1] = 0x9c;
518         cd->ieee[2] = 0xfc;
519
520         cd->mic = 0;
521
522         cd->mdts = NVME_MDTS;   /* max data transfer size (2^mdts * CAP.MPSMIN) */
523
524         cd->ver = NVME_REV(1,4);
525
526         cd->cntrltype = NVME_CNTRLTYPE_IO;
527         cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
528         cd->acl = 2;
529         cd->aerl = 4;
530
531         /* Advertise 1, Read-only firmware slot */
532         cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
533             (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
534         cd->lpa = 0;    /* TODO: support some simple things like SMART */
535         cd->elpe = 0;   /* max error log page entries */
 536         cd->npss = 1;   /* number of power states supported */
537
538         /* Warning Composite Temperature Threshold */
539         cd->wctemp = 0x0157;
540         cd->cctemp = 0x0157;
541
542         cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
543             (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
544         cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
545             (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
546         cd->nn = 1;     /* number of namespaces */
547
548         cd->oncs = 0;
549         switch (sc->dataset_management) {
550         case NVME_DATASET_MANAGEMENT_AUTO:
551                 if (sc->nvstore.deallocate)
552                         cd->oncs |= NVME_ONCS_DSM;
553                 break;
554         case NVME_DATASET_MANAGEMENT_ENABLE:
555                 cd->oncs |= NVME_ONCS_DSM;
556                 break;
557         default:
558                 break;
559         }
560
561         cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
562             NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
563
564         cd->power_state[0].mp = 10;
565 }
566
567 /*
568  * Calculate the CRC-16 of the given buffer
569  * See copyright attribution at top of file
570  */
571 static uint16_t
572 crc16(uint16_t crc, const void *buffer, unsigned int len)
573 {
574         const unsigned char *cp = buffer;
575         /* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
576         static uint16_t const crc16_table[256] = {
577                 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
578                 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
579                 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
580                 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
581                 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
582                 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
583                 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
584                 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
585                 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
586                 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
587                 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
588                 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
589                 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
590                 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
591                 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
592                 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
593                 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
594                 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
595                 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
596                 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
597                 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
598                 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
599                 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
600                 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
601                 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
602                 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
603                 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
604                 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
605                 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
606                 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
607                 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
608                 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
609         };
610
611         while (len--)
612                 crc = (((crc >> 8) & 0xffU) ^
613                     crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
614         return crc;
615 }
616
617 static void
618 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
619     struct nvme_namespace_data *nd)
620 {
621
622         /* Get capacity and block size information from backing store */
623         nd->nsze = nvstore->size / nvstore->sectsz;
624         nd->ncap = nd->nsze;
625         nd->nuse = nd->nsze;
626 }
627
628 static void
629 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
630     struct nvme_namespace_data *nd, uint32_t nsid,
631     struct pci_nvme_blockstore *nvstore)
632 {
633
634         pci_nvme_init_nsdata_size(nvstore, nd);
635
636         if (nvstore->type == NVME_STOR_BLOCKIF)
637                 nvstore->deallocate = blockif_candelete(nvstore->ctx);
638
639         nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
640         nd->flbas = 0;
641
642         /* Create an EUI-64 if user did not provide one */
643         if (nvstore->eui64 == 0) {
644                 char *data = NULL;
645                 uint64_t eui64 = nvstore->eui64;
646
647                 asprintf(&data, "%s%u%u%u", get_config_value("name"),
648                     sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
649                     sc->nsc_pi->pi_func);
650
651                 if (data != NULL) {
652                         eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
653                         free(data);
654                 }
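                /* NSID fills the low 16 bits; the OUI/CRC value sits above it */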
655                 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
656         }
657         be64enc(nd->eui64, nvstore->eui64);
658
659         /* LBA data-sz = 2^lbads */
660         nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
661 }
662
663 static void
664 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
665 {
666
667         memset(&sc->err_log, 0, sizeof(sc->err_log));
668         memset(&sc->health_log, 0, sizeof(sc->health_log));
669         memset(&sc->fw_log, 0, sizeof(sc->fw_log));
670         memset(&sc->ns_log, 0, sizeof(sc->ns_log));
671
672         /* Set read/write remainder to round up according to spec */
673         sc->read_dunits_remainder = 999;
674         sc->write_dunits_remainder = 999;
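        /* A data unit is 1000 512-byte blocks; seeding 999 rounds partial units up */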
675
676         /* Set nominal Health values checked by implementations */
677         sc->health_log.temperature = NVME_TEMPERATURE;
678         sc->health_log.available_spare = 100;
679         sc->health_log.available_spare_threshold = 10;
680 }
681
682 static void
683 pci_nvme_init_features(struct pci_nvme_softc *sc)
684 {
685         enum nvme_feature       fid;
686
687         for (fid = 0; fid < NVME_FID_MAX; fid++) {
688                 switch (fid) {
689                 case NVME_FEAT_ARBITRATION:
690                 case NVME_FEAT_POWER_MANAGEMENT:
691                 case NVME_FEAT_INTERRUPT_COALESCING: //XXX
692                 case NVME_FEAT_WRITE_ATOMICITY:
693                         /* Mandatory but no special handling required */
694                 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
695                 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
696                 //                this returns a data buffer
697                         break;
698                 case NVME_FEAT_TEMPERATURE_THRESHOLD:
699                         sc->feat[fid].set = nvme_feature_temperature;
700                         break;
701                 case NVME_FEAT_ERROR_RECOVERY:
702                         sc->feat[fid].namespace_specific = true;
703                         break;
704                 case NVME_FEAT_NUMBER_OF_QUEUES:
705                         sc->feat[fid].set = nvme_feature_num_queues;
706                         break;
707                 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
708                         sc->feat[fid].set = nvme_feature_iv_config;
709                         break;
710                 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
711                         /* Enable all AENs by default */
712                         sc->feat[fid].cdw11 = 0x31f;
713                         break;
714                 default:
715                         sc->feat[fid].set = nvme_feature_invalid_cb;
716                         sc->feat[fid].get = nvme_feature_invalid_cb;
717                 }
718         }
719 }
720
721 static void
722 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
723 {
724
725         STAILQ_INIT(&sc->aer_list);
726         sc->aer_count = 0;
727 }
728
729 static void
730 pci_nvme_aer_init(struct pci_nvme_softc *sc)
731 {
732
733         pthread_mutex_init(&sc->aer_mtx, NULL);
734         pci_nvme_aer_reset(sc);
735 }
736
737 static void
738 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
739 {
740         struct pci_nvme_aer *aer = NULL;
741
742         pthread_mutex_lock(&sc->aer_mtx);
743         while (!STAILQ_EMPTY(&sc->aer_list)) {
744                 aer = STAILQ_FIRST(&sc->aer_list);
745                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
746                 free(aer);
747         }
748         pthread_mutex_unlock(&sc->aer_mtx);
749
750         pci_nvme_aer_reset(sc);
751 }
752
753 static bool
754 pci_nvme_aer_available(struct pci_nvme_softc *sc)
755 {
756
757         return (sc->aer_count != 0);
758 }
759
760 static bool
761 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
762 {
763         struct nvme_controller_data *cd = &sc->ctrldata;
764
 765         /* AERL is a zero-based value while aer_count is one-based */
766         return (sc->aer_count == (cd->aerl + 1));
767 }
768
769 /*
770  * Add an Async Event Request
771  *
772  * Stores an AER to be returned later if the Controller needs to notify the
773  * host of an event.
774  * Note that while the NVMe spec doesn't require Controllers to return AER's
775  * in order, this implementation does preserve the order.
776  */
777 static int
778 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
779 {
780         struct pci_nvme_aer *aer = NULL;
781
782         aer = calloc(1, sizeof(struct pci_nvme_aer));
783         if (aer == NULL)
784                 return (-1);
785
786         /* Save the Command ID for use in the completion message */
787         aer->cid = cid;
788
789         pthread_mutex_lock(&sc->aer_mtx);
790         sc->aer_count++;
791         STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
792         pthread_mutex_unlock(&sc->aer_mtx);
793
794         return (0);
795 }
796
797 /*
798  * Get an Async Event Request structure
799  *
800  * Returns a pointer to an AER previously submitted by the host or NULL if
801  * no AER's exist. Caller is responsible for freeing the returned struct.
802  */
803 static struct pci_nvme_aer *
804 pci_nvme_aer_get(struct pci_nvme_softc *sc)
805 {
806         struct pci_nvme_aer *aer = NULL;
807
808         pthread_mutex_lock(&sc->aer_mtx);
809         aer = STAILQ_FIRST(&sc->aer_list);
810         if (aer != NULL) {
811                 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
812                 sc->aer_count--;
813         }
814         pthread_mutex_unlock(&sc->aer_mtx);
815
816         return (aer);
817 }
818
819 static void
820 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
821 {
822         uint32_t        atype;
823
824         memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
825
826         for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
827                 sc->aen[atype].atype = atype;
828         }
829 }
830
831 static void
832 pci_nvme_aen_init(struct pci_nvme_softc *sc)
833 {
834         char nstr[80];
835
836         pci_nvme_aen_reset(sc);
837
838         pthread_mutex_init(&sc->aen_mtx, NULL);
839         pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
840         snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
841             sc->nsc_pi->pi_func);
842         pthread_set_name_np(sc->aen_tid, nstr);
843 }
844
845 static void
846 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
847 {
848
849         pci_nvme_aen_reset(sc);
850 }
851
852 /* Notify the AEN thread of pending work */
853 static void
854 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
855 {
856
857         pthread_cond_signal(&sc->aen_cond);
858 }
859
860 /*
861  * Post an Asynchronous Event Notification
862  */
863 static int32_t
864 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
865                 uint32_t event_data)
866 {
867         struct pci_nvme_aen *aen;
868
869         if (atype >= PCI_NVME_AE_TYPE_MAX) {
870                 return(EINVAL);
871         }
872
873         pthread_mutex_lock(&sc->aen_mtx);
874         aen = &sc->aen[atype];
875
876         /* Has the controller already posted an event of this type? */
877         if (aen->posted) {
878                 pthread_mutex_unlock(&sc->aen_mtx);
879                 return(EALREADY);
880         }
881
882         aen->event_data = event_data;
883         aen->posted = true;
884         pthread_mutex_unlock(&sc->aen_mtx);
885
886         pci_nvme_aen_notify(sc);
887
888         return(0);
889 }
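/*
 * Hypothetical usage, posting a SMART event (the critical-warning bit
 * value below is illustrative only):
 *
 *	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART, 1 << 1);
 */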
890
891 static void
892 pci_nvme_aen_process(struct pci_nvme_softc *sc)
893 {
894         struct pci_nvme_aer *aer;
895         struct pci_nvme_aen *aen;
896         pci_nvme_async_type atype;
897         uint32_t mask;
898         uint16_t status;
899         uint8_t lid;
900
901         assert(pthread_mutex_isowned_np(&sc->aen_mtx));
902         for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
903                 aen = &sc->aen[atype];
904                 /* Previous iterations may have depleted the available AER's */
905                 if (!pci_nvme_aer_available(sc)) {
906                         DPRINTF("%s: no AER", __func__);
907                         break;
908                 }
909
910                 if (!aen->posted) {
911                         DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
912                         continue;
913                 }
914
915                 status = NVME_SC_SUCCESS;
916
917                 /* Is the event masked? */
918                 mask =
919                     sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
920
921                 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
922                 switch (atype) {
923                 case PCI_NVME_AE_TYPE_ERROR:
924                         lid = NVME_LOG_ERROR;
925                         break;
926                 case PCI_NVME_AE_TYPE_SMART:
927                         mask &= 0xff;
928                         if ((mask & aen->event_data) == 0)
929                                 continue;
930                         lid = NVME_LOG_HEALTH_INFORMATION;
931                         break;
932                 case PCI_NVME_AE_TYPE_NOTICE:
933                         if (aen->event_data >= PCI_NVME_AE_INFO_MAX) {
934                                 EPRINTLN("%s unknown AEN notice type %u",
935                                     __func__, aen->event_data);
936                                 status = NVME_SC_INTERNAL_DEVICE_ERROR;
937                                 break;
938                         }
939                         mask >>= 8;
940                         if (((1 << aen->event_data) & mask) == 0)
941                                 continue;
942                         switch (aen->event_data) {
943                         case PCI_NVME_AE_INFO_NS_ATTR_CHANGED:
944                                 lid = NVME_LOG_CHANGED_NAMESPACE;
945                                 break;
946                         case PCI_NVME_AE_INFO_FW_ACTIVATION:
947                                 lid = NVME_LOG_FIRMWARE_SLOT;
948                                 break;
949                         case PCI_NVME_AE_INFO_TELEMETRY_CHANGE:
950                                 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
951                                 break;
952                         case PCI_NVME_AE_INFO_ANA_CHANGE:
953                                 lid = NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS; //TODO spelling
954                                 break;
955                         case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE:
956                                 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
957                                 break;
958                         case PCI_NVME_AE_INFO_LBA_STATUS_ALERT:
959                                 lid = NVME_LOG_LBA_STATUS_INFORMATION;
960                                 break;
961                         case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE:
962                                 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
963                                 break;
964                         default:
965                                 lid = 0;
966                         }
967                         break;
968                 default:
969                         /* bad type?!? */
970                         EPRINTLN("%s unknown AEN type %u", __func__, atype);
971                         status = NVME_SC_INTERNAL_DEVICE_ERROR;
972                         break;
973                 }
974
975                 aer = pci_nvme_aer_get(sc);
976                 assert(aer != NULL);
977
978                 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
979                 pci_nvme_cq_update(sc, &sc->compl_queues[0],
980                     (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
981                     aer->cid,
982                     0,          /* SQID */
983                     status);
984
985                 aen->event_data = 0;
986                 aen->posted = false;
987
988                 pci_generate_msix(sc->nsc_pi, 0);
989         }
990 }
991
992 static void *
993 aen_thr(void *arg)
994 {
995         struct pci_nvme_softc *sc;
996
997         sc = arg;
998
999         pthread_mutex_lock(&sc->aen_mtx);
1000         for (;;) {
1001                 pci_nvme_aen_process(sc);
1002                 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1003         }
1004         pthread_mutex_unlock(&sc->aen_mtx);
1005
1006         pthread_exit(NULL);
1007         return (NULL);
1008 }
1009
1010 static void
1011 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1012 {
1013         uint32_t i;
1014
1015         DPRINTF("%s", __func__);
1016
1017         sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1018             (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1019             (60 << NVME_CAP_LO_REG_TO_SHIFT);
1020
1021         sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1022
1023         sc->regs.vs = NVME_REV(1,4);    /* NVMe v1.4 */
1024
1025         sc->regs.cc = 0;
1026
1027         assert(sc->submit_queues != NULL);
1028
1029         for (i = 0; i < sc->num_squeues + 1; i++) {
1030                 sc->submit_queues[i].qbase = NULL;
1031                 sc->submit_queues[i].size = 0;
1032                 sc->submit_queues[i].cqid = 0;
1033                 sc->submit_queues[i].tail = 0;
1034                 sc->submit_queues[i].head = 0;
1035         }
1036
1037         assert(sc->compl_queues != NULL);
1038
1039         for (i = 0; i < sc->num_cqueues + 1; i++) {
1040                 sc->compl_queues[i].qbase = NULL;
1041                 sc->compl_queues[i].size = 0;
1042                 sc->compl_queues[i].tail = 0;
1043                 sc->compl_queues[i].head = 0;
1044         }
1045
1046         sc->num_q_is_set = false;
1047
1048         pci_nvme_aer_destroy(sc);
1049         pci_nvme_aen_destroy(sc);
1050
1051         /*
1052          * Clear CSTS.RDY last to prevent the host from enabling Controller
1053          * before cleanup completes
1054          */
1055         sc->regs.csts = 0;
1056 }
1057
1058 static void
1059 pci_nvme_reset(struct pci_nvme_softc *sc)
1060 {
1061         pthread_mutex_lock(&sc->mtx);
1062         pci_nvme_reset_locked(sc);
1063         pthread_mutex_unlock(&sc->mtx);
1064 }
1065
1066 static void
1067 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1068 {
1069         uint16_t acqs, asqs;
1070
1071         DPRINTF("%s", __func__);
1072
1073         asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
1074         sc->submit_queues[0].size = asqs;
1075         sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1076                     sizeof(struct nvme_command) * asqs);
1077
1078         DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1079                 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1080
1081         acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1082             NVME_AQA_REG_ACQS_MASK) + 1;
1083         sc->compl_queues[0].size = acqs;
1084         sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1085                  sizeof(struct nvme_completion) * acqs);
1086         sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1087
1088         DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1089                 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1090 }
1091
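/*
 * Copy between a guest buffer described by PRP1/PRP2 and a host buffer.
 * Only transfers spanning at most two pages are supported (PRP2 is used
 * as a second page pointer, never as a PRP list pointer).
 */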
1092 static int
1093 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1094         size_t len, enum nvme_copy_dir dir)
1095 {
1096         uint8_t *p;
1097         size_t bytes;
1098
1099         if (len > (8 * 1024)) {
1100                 return (-1);
1101         }
1102
1103         /* Copy from the start of prp1 to the end of the physical page */
1104         bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1105         bytes = MIN(bytes, len);
1106
1107         p = vm_map_gpa(ctx, prp1, bytes);
1108         if (p == NULL) {
1109                 return (-1);
1110         }
1111
1112         if (dir == NVME_COPY_TO_PRP)
1113                 memcpy(p, b, bytes);
1114         else
1115                 memcpy(b, p, bytes);
1116
1117         b += bytes;
1118
1119         len -= bytes;
1120         if (len == 0) {
1121                 return (0);
1122         }
1123
1124         len = MIN(len, PAGE_SIZE);
1125
1126         p = vm_map_gpa(ctx, prp2, len);
1127         if (p == NULL) {
1128                 return (-1);
1129         }
1130
1131         if (dir == NVME_COPY_TO_PRP)
1132                 memcpy(p, b, len);
1133         else
1134                 memcpy(b, p, len);
1135
1136         return (0);
1137 }
1138
1139 /*
1140  * Write a Completion Queue Entry update
1141  *
 1142  * Write the completion entry and advance the CQ tail pointer
1143  */
1144 static void
1145 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1146                 struct nvme_completion_queue *cq,
1147                 uint32_t cdw0,
1148                 uint16_t cid,
1149                 uint16_t sqid,
1150                 uint16_t status)
1151 {
1152         struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1153         struct nvme_completion *cqe;
1154
1155         assert(cq->qbase != NULL);
1156
1157         pthread_mutex_lock(&cq->mtx);
1158
1159         cqe = &cq->qbase[cq->tail];
1160
1161         /* Flip the phase bit */
1162         status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1163
1164         cqe->cdw0 = cdw0;
1165         cqe->sqhd = sq->head;
1166         cqe->sqid = sqid;
1167         cqe->cid = cid;
1168         cqe->status = status;
1169
1170         cq->tail++;
1171         if (cq->tail >= cq->size) {
1172                 cq->tail = 0;
1173         }
1174
1175         pthread_mutex_unlock(&cq->mtx);
1176 }
1177
1178 static int
1179 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1180         struct nvme_completion* compl)
1181 {
1182         uint16_t qid = command->cdw10 & 0xffff;
1183
1184         DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1185         if (qid == 0 || qid > sc->num_squeues ||
1186             (sc->submit_queues[qid].qbase == NULL)) {
1187                 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1188                         __func__, qid, sc->num_squeues);
1189                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1190                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1191                 return (1);
1192         }
1193
1194         sc->submit_queues[qid].qbase = NULL;
1195         sc->submit_queues[qid].cqid = 0;
1196         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1197         return (1);
1198 }
1199
1200 static int
1201 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1202         struct nvme_completion* compl)
1203 {
1204         if (command->cdw11 & NVME_CMD_CDW11_PC) {
1205                 uint16_t qid = command->cdw10 & 0xffff;
1206                 struct nvme_submission_queue *nsq;
1207
1208                 if ((qid == 0) || (qid > sc->num_squeues) ||
1209                     (sc->submit_queues[qid].qbase != NULL)) {
1210                         WPRINTF("%s queue index %u > num_squeues %u",
1211                                 __func__, qid, sc->num_squeues);
1212                         pci_nvme_status_tc(&compl->status,
1213                             NVME_SCT_COMMAND_SPECIFIC,
1214                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
1215                         return (1);
1216                 }
1217
1218                 nsq = &sc->submit_queues[qid];
1219                 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1220                 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1221                 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1222                         /*
1223                          * Queues must specify at least two entries
1224                          * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1225                          * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1226                          */
1227                         pci_nvme_status_tc(&compl->status,
1228                             NVME_SCT_COMMAND_SPECIFIC,
1229                             NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1230                         return (1);
1231                 }
1232                 nsq->head = nsq->tail = 0;
1233
1234                 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1235                 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1236                         pci_nvme_status_tc(&compl->status,
1237                             NVME_SCT_COMMAND_SPECIFIC,
1238                             NVME_SC_INVALID_QUEUE_IDENTIFIER);
1239                         return (1);
1240                 }
1241
1242                 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1243                         pci_nvme_status_tc(&compl->status,
1244                             NVME_SCT_COMMAND_SPECIFIC,
1245                             NVME_SC_COMPLETION_QUEUE_INVALID);
1246                         return (1);
1247                 }
1248
1249                 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1250
1251                 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1252                               sizeof(struct nvme_command) * (size_t)nsq->size);
1253
1254                 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1255                         qid, nsq->size, nsq->qbase, nsq->cqid);
1256
1257                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1258
1259                 DPRINTF("%s completed creating IOSQ qid %u",
1260                          __func__, qid);
1261         } else {
1262                 /*
1263                  * Guest sent non-cont submission queue request.
1264                  * This setting is unsupported by this emulation.
1265                  */
1266                 WPRINTF("%s unsupported non-contig (list-based) "
1267                          "create i/o submission queue", __func__);
1268
1269                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1270         }
1271         return (1);
1272 }
1273
1274 static int
1275 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1276         struct nvme_completion* compl)
1277 {
1278         uint16_t qid = command->cdw10 & 0xffff;
1279         uint16_t sqid;
1280
1281         DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1282         if (qid == 0 || qid > sc->num_cqueues ||
1283             (sc->compl_queues[qid].qbase == NULL)) {
1284                 WPRINTF("%s queue index %u / num_cqueues %u",
1285                         __func__, qid, sc->num_cqueues);
1286                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1287                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1288                 return (1);
1289         }
1290
1291         /* Deleting an Active CQ is an error */
1292         for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1293                 if (sc->submit_queues[sqid].cqid == qid) {
1294                         pci_nvme_status_tc(&compl->status,
1295                             NVME_SCT_COMMAND_SPECIFIC,
1296                             NVME_SC_INVALID_QUEUE_DELETION);
1297                         return (1);
1298                 }
1299
1300         sc->compl_queues[qid].qbase = NULL;
1301         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1302         return (1);
1303 }
1304
1305 static int
1306 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1307         struct nvme_completion* compl)
1308 {
1309         struct nvme_completion_queue *ncq;
1310         uint16_t qid = command->cdw10 & 0xffff;
1311
1312         /* Only support Physically Contiguous queues */
1313         if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1314                 WPRINTF("%s unsupported non-contig (list-based) "
1315                          "create i/o completion queue",
1316                          __func__);
1317
1318                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1319                 return (1);
1320         }
1321
1322         if ((qid == 0) || (qid > sc->num_cqueues) ||
1323             (sc->compl_queues[qid].qbase != NULL)) {
1324                 WPRINTF("%s queue index %u > num_cqueues %u",
1325                         __func__, qid, sc->num_cqueues);
1326                 pci_nvme_status_tc(&compl->status,
1327                     NVME_SCT_COMMAND_SPECIFIC,
1328                     NVME_SC_INVALID_QUEUE_IDENTIFIER);
1329                 return (1);
1330         }
1331
1332         ncq = &sc->compl_queues[qid];
1333         ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1334         ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1335         if (ncq->intr_vec > (sc->max_queues + 1)) {
1336                 pci_nvme_status_tc(&compl->status,
1337                     NVME_SCT_COMMAND_SPECIFIC,
1338                     NVME_SC_INVALID_INTERRUPT_VECTOR);
1339                 return (1);
1340         }
1341
1342         ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1343         if ((ncq->size < 2) || (ncq->size > sc->max_qentries))  {
1344                 /*
1345                  * Queues must specify at least two entries
1346                  * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1347                  * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1348                  */
1349                 pci_nvme_status_tc(&compl->status,
1350                     NVME_SCT_COMMAND_SPECIFIC,
1351                     NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1352                 return (1);
1353         }
1354         ncq->head = ncq->tail = 0;
 1355         ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
 1356                      command->prp1,
 1357                      sizeof(struct nvme_completion) * (size_t)ncq->size);
1358
1359         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1360
1361
1362         return (1);
1363 }
1364
1365 static int
1366 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1367         struct nvme_completion* compl)
1368 {
1369         uint64_t logoff;
1370         uint32_t logsize;
1371         uint8_t logpage = command->cdw10 & 0xFF;
1372
1373         DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1374
1375         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1376
1377         /*
1378          * Command specifies the number of dwords to return in fields NUMDU
1379          * and NUMDL. This is a zero-based value.
1380          */
1381         logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1382         logsize *= sizeof(uint32_t);
1383         logoff  = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
1384
1385         switch (logpage) {
1386         case NVME_LOG_ERROR:
1387                 if (logoff >= sizeof(sc->err_log)) {
1388                         pci_nvme_status_genc(&compl->status,
1389                             NVME_SC_INVALID_FIELD);
1390                         break;
1391                 }
1392
1393                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1394                     command->prp2, (uint8_t *)&sc->err_log + logoff,
 1395             MIN(logsize, sizeof(sc->err_log) - logoff),
1396                     NVME_COPY_TO_PRP);
1397                 break;
1398         case NVME_LOG_HEALTH_INFORMATION:
1399                 if (logoff >= sizeof(sc->health_log)) {
1400                         pci_nvme_status_genc(&compl->status,
1401                             NVME_SC_INVALID_FIELD);
1402                         break;
1403                 }
1404
1405                 pthread_mutex_lock(&sc->mtx);
1406                 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1407                     sizeof(sc->health_log.data_units_read));
1408                 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1409                     sizeof(sc->health_log.data_units_written));
1410                 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1411                     sizeof(sc->health_log.host_read_commands));
1412                 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1413                     sizeof(sc->health_log.host_write_commands));
1414                 pthread_mutex_unlock(&sc->mtx);
1415
1416                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1417                     command->prp2, (uint8_t *)&sc->health_log + logoff,
 1418             MIN(logsize, sizeof(sc->health_log) - logoff),
1419                     NVME_COPY_TO_PRP);
1420                 break;
1421         case NVME_LOG_FIRMWARE_SLOT:
1422                 if (logoff >= sizeof(sc->fw_log)) {
1423                         pci_nvme_status_genc(&compl->status,
1424                             NVME_SC_INVALID_FIELD);
1425                         break;
1426                 }
1427
1428                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1429                     command->prp2, (uint8_t *)&sc->fw_log + logoff,
 1430             MIN(logsize, sizeof(sc->fw_log) - logoff),
1431                     NVME_COPY_TO_PRP);
1432                 break;
1433         case NVME_LOG_CHANGED_NAMESPACE:
1434                 if (logoff >= sizeof(sc->ns_log)) {
1435                         pci_nvme_status_genc(&compl->status,
1436                             NVME_SC_INVALID_FIELD);
1437                         break;
1438                 }
1439
1440                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1441                     command->prp2, (uint8_t *)&sc->ns_log + logoff,
 1442             MIN(logsize, sizeof(sc->ns_log) - logoff),
1443                     NVME_COPY_TO_PRP);
1444                 memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1445                 break;
1446         default:
1447                 DPRINTF("%s get log page %x command not supported",
1448                         __func__, logpage);
1449
1450                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1451                     NVME_SC_INVALID_LOG_PAGE);
1452         }
1453
1454         return (1);
1455 }
1456
1457 static int
1458 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1459         struct nvme_completion* compl)
1460 {
1461         void *dest;
1462         uint16_t status;
1463
1464         DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1465                 command->cdw10 & 0xFF, command->nsid);
1466
1467         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1468
1469         switch (command->cdw10 & 0xFF) {
1470         case 0x00: /* return Identify Namespace data structure */
1471                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1472                     command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1473                     NVME_COPY_TO_PRP);
1474                 break;
1475         case 0x01: /* return Identify Controller data structure */
1476                 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1477                     command->prp2, (uint8_t *)&sc->ctrldata,
1478                     sizeof(sc->ctrldata),
1479                     NVME_COPY_TO_PRP);
1480                 break;
1481         case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1482                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1483                                   sizeof(uint32_t) * 1024);
1484                 /* All unused entries shall be zero */
1485                 bzero(dest, sizeof(uint32_t) * 1024);
1486                 ((uint32_t *)dest)[0] = 1;
1487                 break;
1488         case 0x03: /* Namespace Identification Descriptors for CDW1.NSID, 4096 bytes */
1489                 if (command->nsid != 1) {
1490                         pci_nvme_status_genc(&status,
1491                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1492                         break;
1493                 }
1494                 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1495                                   sizeof(uint32_t) * 1024);
1496                 /* All bytes after the descriptor shall be zero */
1497                 bzero(dest, sizeof(uint32_t) * 1024);
1498
1499                 /* Return NIDT=1 (i.e. EUI64) descriptor */
1500                 ((uint8_t *)dest)[0] = 1;
1501                 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1502                 bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
1503                 break;
1504         default:
1505                 DPRINTF("%s unsupported identify command requested 0x%x",
1506                          __func__, command->cdw10 & 0xFF);
1507                 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1508                 break;
1509         }
1510
1511         compl->status = status;
1512         return (1);
1513 }
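
/*
 * Illustrative sketch: the CNS values handled above come from CDW10
 * bits 7:0 of the Identify command (field name per the NVMe spec):
 *
 *	uint8_t cns = command->cdw10 & 0xFF;
 *
 *	cns 0x00 - Identify Namespace data for CDW1.NSID
 *	cns 0x01 - Identify Controller data
 *	cns 0x02 - Active Namespace ID list starting after CDW1.NSID
 *	cns 0x03 - Namespace Identification Descriptor list for CDW1.NSID
 */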
1514
1515 static const char *
1516 nvme_fid_to_name(uint8_t fid)
1517 {
1518         const char *name;
1519
1520         switch (fid) {
1521         case NVME_FEAT_ARBITRATION:
1522                 name = "Arbitration";
1523                 break;
1524         case NVME_FEAT_POWER_MANAGEMENT:
1525                 name = "Power Management";
1526                 break;
1527         case NVME_FEAT_LBA_RANGE_TYPE:
1528                 name = "LBA Range Type";
1529                 break;
1530         case NVME_FEAT_TEMPERATURE_THRESHOLD:
1531                 name = "Temperature Threshold";
1532                 break;
1533         case NVME_FEAT_ERROR_RECOVERY:
1534                 name = "Error Recovery";
1535                 break;
1536         case NVME_FEAT_VOLATILE_WRITE_CACHE:
1537                 name = "Volatile Write Cache";
1538                 break;
1539         case NVME_FEAT_NUMBER_OF_QUEUES:
1540                 name = "Number of Queues";
1541                 break;
1542         case NVME_FEAT_INTERRUPT_COALESCING:
1543                 name = "Interrupt Coalescing";
1544                 break;
1545         case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1546                 name = "Interrupt Vector Configuration";
1547                 break;
1548         case NVME_FEAT_WRITE_ATOMICITY:
1549                 name = "Write Atomicity Normal";
1550                 break;
1551         case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1552                 name = "Asynchronous Event Configuration";
1553                 break;
1554         case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1555                 name = "Autonomous Power State Transition";
1556                 break;
1557         case NVME_FEAT_HOST_MEMORY_BUFFER:
1558                 name = "Host Memory Buffer";
1559                 break;
1560         case NVME_FEAT_TIMESTAMP:
1561                 name = "Timestamp";
1562                 break;
1563         case NVME_FEAT_KEEP_ALIVE_TIMER:
1564                 name = "Keep Alive Timer";
1565                 break;
1566         case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1567                 name = "Host Controlled Thermal Management";
1568                 break;
1569         case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1570                 name = "Non-Operational Power State Config";
1571                 break;
1572         case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1573                 name = "Read Recovery Level Config";
1574                 break;
1575         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1576                 name = "Predictable Latency Mode Config";
1577                 break;
1578         case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1579                 name = "Predictable Latency Mode Window";
1580                 break;
1581         case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1582                 name = "LBA Status Information Report Interval";
1583                 break;
1584         case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1585                 name = "Host Behavior Support";
1586                 break;
1587         case NVME_FEAT_SANITIZE_CONFIG:
1588                 name = "Sanitize Config";
1589                 break;
1590         case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1591                 name = "Endurance Group Event Configuration";
1592                 break;
1593         case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1594                 name = "Software Progress Marker";
1595                 break;
1596         case NVME_FEAT_HOST_IDENTIFIER:
1597                 name = "Host Identifier";
1598                 break;
1599         case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1600                 name = "Reservation Notification Mask";
1601                 break;
1602         case NVME_FEAT_RESERVATION_PERSISTENCE:
1603                 name = "Reservation Persistence";
1604                 break;
1605         case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1606                 name = "Namespace Write Protection Config";
1607                 break;
1608         default:
1609                 name = "Unknown";
1610                 break;
1611         }
1612
1613         return (name);
1614 }
1615
1616 static void
1617 nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
1618     struct nvme_feature_obj *feat,
1619     struct nvme_command *command,
1620     struct nvme_completion *compl)
1621 {
1622
1623         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1624 }
1625
1626 static void
1627 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1628     struct nvme_feature_obj *feat,
1629     struct nvme_command *command,
1630     struct nvme_completion *compl)
1631 {
1632         uint32_t i;
1633         uint32_t cdw11 = command->cdw11;
1634         uint16_t iv;
1635         bool cd;
1636
1637         pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1638
1639         iv = cdw11 & 0xffff;
1640         cd = cdw11 & (1 << 16);
1641
1642         if (iv > (sc->max_queues + 1)) {
1643                 return;
1644         }
1645
1646         /* Admin Q doesn't support Interrupt Coalescing, so CD must be set for IV 0 */
1647         if ((iv == 0) && !cd)
1648                 return;
1649
1650         /* Requested Interrupt Vector must be used by a CQ */
1651         for (i = 0; i < sc->num_cqueues + 1; i++) {
1652                 if (sc->compl_queues[i].intr_vec == iv) {
1653                         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1654                 }
1655         }
1656 }
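
/*
 * Illustrative sketch of the CDW11 layout consumed above, per the NVMe
 * Interrupt Vector Configuration feature (names mirror the locals):
 *
 *	iv = cdw11 & 0xffff;		   Interrupt Vector, bits 15:0
 *	cd = (cdw11 >> 16) & 0x1;	   Coalescing Disable, bit 16
 *
 * The status starts as Invalid Field and only flips to Success once a
 * completion queue using the requested vector is found.
 */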
1657
1658 #define NVME_TEMP_THRESH_OVER   0
1659 #define NVME_TEMP_THRESH_UNDER  1
1660 static void
1661 nvme_feature_temperature(struct pci_nvme_softc *sc,
1662     struct nvme_feature_obj *feat,
1663     struct nvme_command *command,
1664     struct nvme_completion *compl)
1665 {
1666         uint16_t        tmpth;  /* Temperature Threshold */
1667         uint8_t         tmpsel; /* Threshold Temperature Select */
1668         uint8_t         thsel;  /* Threshold Type Select */
1669         bool            set_crit = false;
1670
1671         tmpth  = command->cdw11 & 0xffff;
1672         tmpsel = (command->cdw11 >> 16) & 0xf;
1673         thsel  = (command->cdw11 >> 20) & 0x3;
1674
1675         DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1676
1677         /* Check for unsupported values */
1678         if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1679             (thsel > NVME_TEMP_THRESH_UNDER)) {
1680                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1681                 return;
1682         }
1683
1684         if (((thsel == NVME_TEMP_THRESH_OVER)  && (NVME_TEMPERATURE >= tmpth)) ||
1685             ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1686                 set_crit = true;
1687
1688         pthread_mutex_lock(&sc->mtx);
1689         if (set_crit)
1690                 sc->health_log.critical_warning |=
1691                     NVME_CRIT_WARN_ST_TEMPERATURE;
1692         else
1693                 sc->health_log.critical_warning &=
1694                     ~NVME_CRIT_WARN_ST_TEMPERATURE;
1695         pthread_mutex_unlock(&sc->mtx);
1696
1697         if (set_crit)
1698                 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1699                     sc->health_log.critical_warning);
1700
1701
1702         DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1703 }
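
/*
 * Worked example: the emulated sensor always reads NVME_TEMPERATURE,
 * so a Set Features with thsel == NVME_TEMP_THRESH_OVER and
 * tmpth <= NVME_TEMPERATURE immediately sets
 * NVME_CRIT_WARN_ST_TEMPERATURE and posts a SMART AEN; raising tmpth
 * above NVME_TEMPERATURE clears the warning again.
 */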
1704
1705 static void
1706 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1707     struct nvme_feature_obj *feat,
1708     struct nvme_command *command,
1709     struct nvme_completion *compl)
1710 {
1711         uint16_t nqr;   /* Number of Queues Requested */
1712
1713         if (sc->num_q_is_set) {
1714                 WPRINTF("%s: Number of Queues already set", __func__);
1715                 pci_nvme_status_genc(&compl->status,
1716                     NVME_SC_COMMAND_SEQUENCE_ERROR);
1717                 return;
1718         }
1719
1720         nqr = command->cdw11 & 0xFFFF;
1721         if (nqr == 0xffff) {
1722                 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1723                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1724                 return;
1725         }
1726
1727         sc->num_squeues = ONE_BASED(nqr);
1728         if (sc->num_squeues > sc->max_queues) {
1729                 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1730                                         sc->max_queues);
1731                 sc->num_squeues = sc->max_queues;
1732         }
1733
1734         nqr = (command->cdw11 >> 16) & 0xFFFF;
1735         if (nqr == 0xffff) {
1736                 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1737                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1738                 return;
1739         }
1740
1741         sc->num_cqueues = ONE_BASED(nqr);
1742         if (sc->num_cqueues > sc->max_queues) {
1743                 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1744                                         sc->max_queues);
1745                 sc->num_cqueues = sc->max_queues;
1746         }
1747
1748         /* Patch the command value which will be saved on the callback's return */
1749         command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1750         compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1751
1752         sc->num_q_is_set = true;
1753 }
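
/*
 * Illustrative sketch: Number of Queues packs two 0's based counts
 * into CDW11, converted above with ONE_BASED():
 *
 *	nsqr = cdw11 & 0xFFFF;		   submission queues requested
 *	ncqr = (cdw11 >> 16) & 0xFFFF;	   completion queues requested
 *
 * The allocated counts are returned, also 0's based, in CDW0 via
 * NVME_FEATURE_NUM_QUEUES().
 */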
1754
1755 static int
1756 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1757         struct nvme_completion *compl)
1758 {
1759         struct nvme_feature_obj *feat;
1760         uint32_t nsid = command->nsid;
1761         uint8_t fid = command->cdw10 & 0xFF;
1762
1763         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1764
1765         if (fid >= NVME_FID_MAX) {
1766                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1767                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1768                 return (1);
1769         }
1770         feat = &sc->feat[fid];
1771
1772         if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1773                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1774                 return (1);
1775         }
1776
1777         if (!feat->namespace_specific &&
1778             !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1779                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1780                     NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1781                 return (1);
1782         }
1783
1784         compl->cdw0 = 0;
1785         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1786
1787         if (feat->set)
1788                 feat->set(sc, feat, command, compl);
1789
1790         DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1791         if (compl->status == NVME_SC_SUCCESS) {
1792                 feat->cdw11 = command->cdw11;
1793                 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1794                     (command->cdw11 != 0))
1795                         pci_nvme_aen_notify(sc);
1796         }
1797
1798         return (0);
1799 }
1800
1801 #define NVME_FEATURES_SEL_SUPPORTED     0x3
1802 #define NVME_FEATURES_NS_SPECIFIC       (1 << 1)
1803
1804 static int
1805 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1806         struct nvme_completion* compl)
1807 {
1808         struct nvme_feature_obj *feat;
1809         uint8_t fid = command->cdw10 & 0xFF;
1810         uint8_t sel = (command->cdw10 >> 8) & 0x7;
1811
1812         DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1813
1814         if (fid >= NVME_FID_MAX) {
1815                 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1816                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1817                 return (1);
1818         }
1819
1820         compl->cdw0 = 0;
1821         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1822
1823         feat = &sc->feat[fid];
1824         if (feat->get) {
1825                 feat->get(sc, feat, command, compl);
1826         }
1827
1828         if (compl->status == NVME_SC_SUCCESS) {
1829                 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1830                         compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1831                 else
1832                         compl->cdw0 = feat->cdw11;
1833         }
1834
1835         return (0);
1836 }
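
/*
 * Illustrative sketch: Get Features carries a Select field in CDW10
 * bits 10:8. For SEL 0x3 ("supported capabilities"), CDW0 reports
 * attribute flags per the NVMe spec rather than the current value:
 *
 *	bit 0 - feature is saveable   (never set by this emulation)
 *	bit 1 - feature is namespace specific
 *	bit 2 - feature is changeable (never set by this emulation)
 *
 * Other SEL values fall back to the stored feat->cdw11.
 */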
1837
1838 static int
1839 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1840         struct nvme_completion* compl)
1841 {
1842         uint8_t ses, lbaf, pi;
1843
1844         /* Only supports Secure Erase Settings of None and User Data Erase */
1845         ses = (command->cdw10 >> 9) & 0x7;
1846         if (ses > 0x1) {
1847                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1848                 return (1);
1849         }
1850
1851         /* Only supports a single LBA Format */
1852         lbaf = command->cdw10 & 0xf;
1853         if (lbaf != 0) {
1854                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1855                     NVME_SC_INVALID_FORMAT);
1856                 return (1);
1857         }
1858
1859         /* Doesn't support Protection Information */
1860         pi = (command->cdw10 >> 5) & 0x7;
1861         if (pi != 0) {
1862                 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1863                 return (1);
1864         }
1865
1866         if (sc->nvstore.type == NVME_STOR_RAM) {
1867                 if (sc->nvstore.ctx)
1868                         free(sc->nvstore.ctx);
1869                 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1870                 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1871         } else {
1872                 struct pci_nvme_ioreq *req;
1873                 int err;
1874
1875                 req = pci_nvme_get_ioreq(sc);
1876                 if (req == NULL) {
1877                         pci_nvme_status_genc(&compl->status,
1878                             NVME_SC_INTERNAL_DEVICE_ERROR);
1879                         WPRINTF("%s: unable to allocate IO req", __func__);
1880                         return (1);
1881                 }
1882                 req->nvme_sq = &sc->submit_queues[0];
1883                 req->sqid = 0;
1884                 req->opc = command->opc;
1885                 req->cid = command->cid;
1886                 req->nsid = command->nsid;
1887
1888                 req->io_req.br_offset = 0;
1889                 req->io_req.br_resid = sc->nvstore.size;
1890                 req->io_req.br_callback = pci_nvme_io_done;
1891
1892                 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
1893                 if (err) {
1894                         pci_nvme_status_genc(&compl->status,
1895                             NVME_SC_INTERNAL_DEVICE_ERROR);
1896                         pci_nvme_release_ioreq(sc, req);
1897                 } else
1898                         compl->status = NVME_NO_STATUS;
1899         }
1900
1901         return (1);
1902 }
1903
1904 static int
1905 nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
1906         struct nvme_completion* compl)
1907 {
1908         DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
1909                 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
1910
1911         /* TODO: search for the command ID and abort it */
1912
1913         compl->cdw0 = 1;
1914         pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1915         return (1);
1916 }
1917
1918 static int
1919 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
1920         struct nvme_command* command, struct nvme_completion* compl)
1921 {
1922         DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
1923             sc->aer_count, sc->ctrldata.aerl, command->cid);
1924
1925         /* Don't exceed the Async Event Request Limit (AERL). */
1926         if (pci_nvme_aer_limit_reached(sc)) {
1927                 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1928                                 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
1929                 return (1);
1930         }
1931
1932         if (pci_nvme_aer_add(sc, command->cid)) {
1933                 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
1934                                 NVME_SC_INTERNAL_DEVICE_ERROR);
1935                 return (1);
1936         }
1937
1938         /*
1939                  * Events are raised as they occur, based on the Set Features cmd.
1940                  * Because they arrive asynchronously, the completion is only
1941                  * posted once an event matching an outstanding request occurs.
1942          */
1943         compl->status = NVME_NO_STATUS;
1944         pci_nvme_aen_notify(sc);
1945
1946         return (0);
1947 }
1948
1949 static void
1950 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
1951 {
1952         struct nvme_completion compl;
1953         struct nvme_command *cmd;
1954         struct nvme_submission_queue *sq;
1955         struct nvme_completion_queue *cq;
1956         uint16_t sqhead;
1957
1958         DPRINTF("%s index %u", __func__, (uint32_t)value);
1959
1960         sq = &sc->submit_queues[0];
1961         cq = &sc->compl_queues[0];
1962
1963         pthread_mutex_lock(&sq->mtx);
1964
1965         sqhead = sq->head;
1966         DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
1967
1968         while (sqhead != atomic_load_acq_short(&sq->tail)) {
1969                 cmd = &(sq->qbase)[sqhead];
1970                 compl.cdw0 = 0;
1971                 compl.status = 0;
1972
1973                 switch (cmd->opc) {
1974                 case NVME_OPC_DELETE_IO_SQ:
1975                         DPRINTF("%s command DELETE_IO_SQ", __func__);
1976                         nvme_opc_delete_io_sq(sc, cmd, &compl);
1977                         break;
1978                 case NVME_OPC_CREATE_IO_SQ:
1979                         DPRINTF("%s command CREATE_IO_SQ", __func__);
1980                         nvme_opc_create_io_sq(sc, cmd, &compl);
1981                         break;
1982                 case NVME_OPC_DELETE_IO_CQ:
1983                         DPRINTF("%s command DELETE_IO_CQ", __func__);
1984                         nvme_opc_delete_io_cq(sc, cmd, &compl);
1985                         break;
1986                 case NVME_OPC_CREATE_IO_CQ:
1987                         DPRINTF("%s command CREATE_IO_CQ", __func__);
1988                         nvme_opc_create_io_cq(sc, cmd, &compl);
1989                         break;
1990                 case NVME_OPC_GET_LOG_PAGE:
1991                         DPRINTF("%s command GET_LOG_PAGE", __func__);
1992                         nvme_opc_get_log_page(sc, cmd, &compl);
1993                         break;
1994                 case NVME_OPC_IDENTIFY:
1995                         DPRINTF("%s command IDENTIFY", __func__);
1996                         nvme_opc_identify(sc, cmd, &compl);
1997                         break;
1998                 case NVME_OPC_ABORT:
1999                         DPRINTF("%s command ABORT", __func__);
2000                         nvme_opc_abort(sc, cmd, &compl);
2001                         break;
2002                 case NVME_OPC_SET_FEATURES:
2003                         DPRINTF("%s command SET_FEATURES", __func__);
2004                         nvme_opc_set_features(sc, cmd, &compl);
2005                         break;
2006                 case NVME_OPC_GET_FEATURES:
2007                         DPRINTF("%s command GET_FEATURES", __func__);
2008                         nvme_opc_get_features(sc, cmd, &compl);
2009                         break;
2010                 case NVME_OPC_FIRMWARE_ACTIVATE:
2011                         DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2012                         pci_nvme_status_tc(&compl.status,
2013                             NVME_SCT_COMMAND_SPECIFIC,
2014                             NVME_SC_INVALID_FIRMWARE_SLOT);
2015                         break;
2016                 case NVME_OPC_ASYNC_EVENT_REQUEST:
2017                         DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2018                         nvme_opc_async_event_req(sc, cmd, &compl);
2019                         break;
2020                 case NVME_OPC_FORMAT_NVM:
2021                         DPRINTF("%s command FORMAT_NVM", __func__);
2022                         if ((sc->ctrldata.oacs &
2023                             (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2024                                 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2025                                 break;
2026                         }
2027                         nvme_opc_format_nvm(sc, cmd, &compl);
2028                         break;
2029                 case NVME_OPC_SECURITY_SEND:
2030                 case NVME_OPC_SECURITY_RECEIVE:
2031                 case NVME_OPC_SANITIZE:
2032                 case NVME_OPC_GET_LBA_STATUS:
2033                         DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2034                             cmd->opc);
2035                         /* Valid but unsupported opcodes */
2036                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2037                         break;
2038                 default:
2039                         DPRINTF("%s command OPC=%#X (not implemented)",
2040                             __func__,
2041                             cmd->opc);
2042                         pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2043                 }
2044                 sqhead = (sqhead + 1) % sq->size;
2045
2046                 if (NVME_COMPLETION_VALID(compl)) {
2047                         pci_nvme_cq_update(sc, &sc->compl_queues[0],
2048                             compl.cdw0,
2049                             cmd->cid,
2050                             0,          /* SQID */
2051                             compl.status);
2052                 }
2053         }
2054
2055         DPRINTF("setting sqhead %u", sqhead);
2056         sq->head = sqhead;
2057
2058         if (cq->head != cq->tail)
2059                 pci_generate_msix(sc->nsc_pi, 0);
2060
2061         pthread_mutex_unlock(&sq->mtx);
2062 }
2063
2064 /*
2065  * Update the Write and Read statistics reported in SMART data
2066  *
2067  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2068  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
2069  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
2070  */
2071 static void
2072 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2073     size_t bytes, uint16_t status)
2074 {
2075
2076         pthread_mutex_lock(&sc->mtx);
2077         switch (opc) {
2078         case NVME_OPC_WRITE:
2079                 sc->write_commands++;
2080                 if (status != NVME_SC_SUCCESS)
2081                         break;
2082                 sc->write_dunits_remainder += (bytes / 512);
2083                 while (sc->write_dunits_remainder >= 1000) {
2084                         sc->write_data_units++;
2085                         sc->write_dunits_remainder -= 1000;
2086                 }
2087                 break;
2088         case NVME_OPC_READ:
2089                 sc->read_commands++;
2090                 if (status != NVME_SC_SUCCESS)
2091                         break;
2092                 sc->read_dunits_remainder += (bytes / 512);
2093                 while (sc->read_dunits_remainder >= 1000) {
2094                         sc->read_data_units++;
2095                         sc->read_dunits_remainder -= 1000;
2096                 }
2097                 break;
2098         default:
2099                 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2100                 break;
2101         }
2102         pthread_mutex_unlock(&sc->mtx);
2103 }
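
/*
 * Worked example: a successful 4 KiB write adds 4096 / 512 = 8 to
 * write_dunits_remainder. Because the remainder starts at 999 (see the
 * comment above), the very first 512 byte write pushes it to 1000 and
 * counts a full data unit, which is the required round-up behavior.
 */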
2104
2105 /*
2106  * Check if the combination of Starting LBA (slba) and Number of Logical
2107  * Blocks (nlb) exceeds the range of the underlying storage.
2108  *
2109  * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2110  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2111  * overflow.
2112  */
2113 static bool
2114 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2115     uint32_t nlb)
2116 {
2117         size_t  offset, bytes;
2118
2119         /* Overflow check of multiplying Starting LBA by the sector size */
2120         if (slba >> (64 - nvstore->sectsz_bits))
2121                 return (true);
2122
2123         offset = slba << nvstore->sectsz_bits;
2124         bytes = (size_t)nlb << nvstore->sectsz_bits;
2125
2126         /* Overflow check of Number of Logical Blocks */
2127         if ((nvstore->size - offset) < bytes)
2128                 return (true);
2129
2130         return (false);
2131 }
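
/*
 * Worked example: with 512 byte sectors (sectsz_bits == 9), any
 * slba >= (1ULL << 55) would overflow "slba << 9", and is caught by
 * the "slba >> (64 - 9)" test above. The size check is written as
 * "(size - offset) < bytes" instead of "offset + bytes > size" so the
 * addition itself cannot wrap.
 */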
2132
2133 static int
2134 pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
2135         uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
2136 {
2137         int iovidx;
2138
2139         if (req == NULL)
2140                 return (-1);
2141
2142         if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2143                 return (-1);
2144         }
2145
2146         /* concatenate contiguous block-iovs to minimize number of iovs */
2147         if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
2148                 iovidx = req->io_req.br_iovcnt - 1;
2149
2150                 req->io_req.br_iov[iovidx].iov_base =
2151                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2152                                      req->prev_gpaddr, size);
2153
2154                 req->prev_size += size;
2155                 req->io_req.br_resid += size;
2156
2157                 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2158         } else {
2159                 iovidx = req->io_req.br_iovcnt;
2160                 if (iovidx == 0) {
2161                         req->io_req.br_offset = lba;
2162                         req->io_req.br_resid = 0;
2163                         req->io_req.br_param = req;
2164                 }
2165
2166                 req->io_req.br_iov[iovidx].iov_base =
2167                     paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2168                                      gpaddr, size);
2169
2170                 req->io_req.br_iov[iovidx].iov_len = size;
2171
2172                 req->prev_gpaddr = gpaddr;
2173                 req->prev_size = size;
2174                 req->io_req.br_resid += size;
2175
2176                 req->io_req.br_iovcnt++;
2177         }
2178
2179         return (0);
2180 }
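
/*
 * Illustrative sketch: two PRP entries covering adjacent guest pages
 * collapse into one iovec. Assuming 4 KiB pages at guest-physical
 * addresses 0x10000 and 0x11000 on an empty request:
 *
 *	pci_nvme_append_iov_req(sc, req, 0x10000, 4096, write, lba);
 *	pci_nvme_append_iov_req(sc, req, 0x11000, 4096, write, lba);
 *
 * leaves br_iovcnt == 1 with br_iov[0].iov_len == 8192; prev_gpaddr
 * and prev_size track the end of the previous mapping so the
 * adjacency test is a simple addition.
 */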
2181
2182 static void
2183 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2184         struct nvme_submission_queue *sq, int sqid, uint16_t cid,
2185         uint32_t cdw0, uint16_t status)
2186 {
2187         struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2188
2189         DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2190                  __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2191                  NVME_STATUS_GET_SC(status));
2192
2193         pci_nvme_cq_update(sc, cq,
2194             cdw0,       /* CDW0 */
2195             cid,
2196             sqid,
2197             status);
2198
2199         if (cq->head != cq->tail) {
2200                 if (cq->intr_en & NVME_CQ_INTEN) {
2201                         pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2202                 } else {
2203                         DPRINTF("%s: CQ%u interrupt disabled",
2204                                                 __func__, sq->cqid);
2205                 }
2206         }
2207 }
2208
2209 static void
2210 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2211 {
2212         req->sc = NULL;
2213         req->nvme_sq = NULL;
2214         req->sqid = 0;
2215
2216         pthread_mutex_lock(&sc->mtx);
2217
2218         STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2219         sc->pending_ios--;
2220
2221         /* when no more IO is pending, set Ready if the device was reset/enabled */
2222         if (sc->pending_ios == 0 &&
2223             NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2224                 sc->regs.csts |= NVME_CSTS_RDY;
2225
2226         pthread_mutex_unlock(&sc->mtx);
2227
2228         sem_post(&sc->iosemlock);
2229 }
2230
2231 static struct pci_nvme_ioreq *
2232 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2233 {
2234         struct pci_nvme_ioreq *req = NULL;
2235
2236         sem_wait(&sc->iosemlock);
2237         pthread_mutex_lock(&sc->mtx);
2238
2239         req = STAILQ_FIRST(&sc->ioreqs_free);
2240         assert(req != NULL);
2241         STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2242
2243         req->sc = sc;
2244
2245         sc->pending_ios++;
2246
2247         pthread_mutex_unlock(&sc->mtx);
2248
2249         req->io_req.br_iovcnt = 0;
2250         req->io_req.br_offset = 0;
2251         req->io_req.br_resid = 0;
2252         req->io_req.br_param = req;
2253         req->prev_gpaddr = 0;
2254         req->prev_size = 0;
2255
2256         return req;
2257 }
2258
2259 static void
2260 pci_nvme_io_done(struct blockif_req *br, int err)
2261 {
2262         struct pci_nvme_ioreq *req = br->br_param;
2263         struct nvme_submission_queue *sq = req->nvme_sq;
2264         uint16_t code, status;
2265
2266         DPRINTF("%s error %d %s", __func__, err, strerror(err));
2267
2268         /* TODO return correct error */
2269         code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2270         pci_nvme_status_genc(&status, code);
2271
2272         pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
2273         pci_nvme_stats_write_read_update(req->sc, req->opc,
2274             req->bytes, status);
2275         pci_nvme_release_ioreq(req->sc, req);
2276 }
2277
2278 /*
2279  * Implements the Flush command. The specification states:
2280  *    If a volatile write cache is not present, Flush commands complete
2281  *    successfully and have no effect
2282  * in the description of the Volatile Write Cache (VWC) field of the Identify
2283  * Controller data. Therefore, set status to Success if the command is
2284  * not supported (i.e. RAM or as indicated by the blockif).
2285  */
2286 static bool
2287 nvme_opc_flush(struct pci_nvme_softc *sc,
2288     struct nvme_command *cmd,
2289     struct pci_nvme_blockstore *nvstore,
2290     struct pci_nvme_ioreq *req,
2291     uint16_t *status)
2292 {
2293         bool pending = false;
2294
2295         if (nvstore->type == NVME_STOR_RAM) {
2296                 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2297         } else {
2298                 int err;
2299
2300                 req->io_req.br_callback = pci_nvme_io_done;
2301
2302                 err = blockif_flush(nvstore->ctx, &req->io_req);
2303                 switch (err) {
2304                 case 0:
2305                         pending = true;
2306                         break;
2307                 case EOPNOTSUPP:
2308                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2309                         break;
2310                 default:
2311                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2312                 }
2313         }
2314
2315         return (pending);
2316 }
2317
2318 static uint16_t
2319 nvme_write_read_ram(struct pci_nvme_softc *sc,
2320     struct pci_nvme_blockstore *nvstore,
2321     uint64_t prp1, uint64_t prp2,
2322     size_t offset, uint64_t bytes,
2323     bool is_write)
2324 {
2325         uint8_t *buf = nvstore->ctx;
2326         enum nvme_copy_dir dir;
2327         uint16_t status;
2328
2329         if (is_write)
2330                 dir = NVME_COPY_TO_PRP;
2331         else
2332                 dir = NVME_COPY_FROM_PRP;
2333
2334         if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2335             buf + offset, bytes, dir))
2336                 pci_nvme_status_genc(&status,
2337                     NVME_SC_DATA_TRANSFER_ERROR);
2338         else
2339                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2340
2341         return (status);
2342 }
2343
2344 static uint16_t
2345 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2346     struct pci_nvme_blockstore *nvstore,
2347     struct pci_nvme_ioreq *req,
2348     uint64_t prp1, uint64_t prp2,
2349     size_t offset, uint64_t bytes,
2350     bool is_write)
2351 {
2352         uint64_t size;
2353         int err;
2354         uint16_t status = NVME_NO_STATUS;
2355
2356         size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2357         if (pci_nvme_append_iov_req(sc, req, prp1,
2358             size, is_write, offset)) {
2359                 pci_nvme_status_genc(&status,
2360                     NVME_SC_DATA_TRANSFER_ERROR);
2361                 goto out;
2362         }
2363
2364         offset += size;
2365         bytes  -= size;
2366
2367         if (bytes == 0) {
2368                 ;
2369         } else if (bytes <= PAGE_SIZE) {
2370                 size = bytes;
2371                 if (pci_nvme_append_iov_req(sc, req, prp2,
2372                     size, is_write, offset)) {
2373                         pci_nvme_status_genc(&status,
2374                             NVME_SC_DATA_TRANSFER_ERROR);
2375                         goto out;
2376                 }
2377         } else {
2378                 void *vmctx = sc->nsc_pi->pi_vmctx;
2379                 uint64_t *prp_list = &prp2;
2380                 uint64_t *last = prp_list;
2381
2382                 /* PRP2 is pointer to a physical region page list */
2383                 while (bytes) {
2384                         /* Last entry in list points to the next list */
2385                         if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2386                                 uint64_t prp = *prp_list;
2387
2388                                 prp_list = paddr_guest2host(vmctx, prp,
2389                                     PAGE_SIZE - (prp % PAGE_SIZE));
2390                                 last = prp_list + (NVME_PRP2_ITEMS - 1);
2391                         }
2392
2393                         size = MIN(bytes, PAGE_SIZE);
2394
2395                         if (pci_nvme_append_iov_req(sc, req, *prp_list,
2396                             size, is_write, offset)) {
2397                                 pci_nvme_status_genc(&status,
2398                                     NVME_SC_DATA_TRANSFER_ERROR);
2399                                 goto out;
2400                         }
2401
2402                         offset += size;
2403                         bytes  -= size;
2404
2405                         prp_list++;
2406                 }
2407         }
2408         req->io_req.br_callback = pci_nvme_io_done;
2409         if (is_write)
2410                 err = blockif_write(nvstore->ctx, &req->io_req);
2411         else
2412                 err = blockif_read(nvstore->ctx, &req->io_req);
2413
2414         if (err)
2415                 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2416 out:
2417         return (status);
2418 }
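
/*
 * Illustrative sketch of the three PRP transfer shapes handled above,
 * per the NVMe spec:
 *
 *	1. up to one page:    PRP1 only
 *	2. up to two pages:   PRP1 plus PRP2 as a second data pointer
 *	3. larger transfers:  PRP2 points to a PRP list; the last entry
 *	                      of each full list chains to the next list
 *
 * PRP1 may begin mid-page, hence the initial fragment of
 * MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes).
 */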
2419
2420 static bool
2421 nvme_opc_write_read(struct pci_nvme_softc *sc,
2422     struct nvme_command *cmd,
2423     struct pci_nvme_blockstore *nvstore,
2424     struct pci_nvme_ioreq *req,
2425     uint16_t *status)
2426 {
2427         uint64_t lba, nblocks, bytes;
2428         size_t offset;
2429         bool is_write = cmd->opc == NVME_OPC_WRITE;
2430         bool pending = false;
2431
2432         lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2433         nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2434
2435         if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2436                 WPRINTF("%s command would exceed LBA range", __func__);
2437                 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2438                 goto out;
2439         }
2440
2441         bytes  = nblocks << nvstore->sectsz_bits;
2442         if (bytes > NVME_MAX_DATA_SIZE) {
2443                 WPRINTF("%s command would exceed MDTS", __func__);
2444                 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2445                 goto out;
2446         }
2447
2448         offset = lba << nvstore->sectsz_bits;
2449
2450         req->bytes = bytes;
2451         req->io_req.br_offset = lba;
2452
2453         /* PRP bits 1:0 must be zero */
2454         cmd->prp1 &= ~0x3UL;
2455         cmd->prp2 &= ~0x3UL;
2456
2457         if (nvstore->type == NVME_STOR_RAM) {
2458                 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2459                     cmd->prp2, offset, bytes, is_write);
2460         } else {
2461                 *status = nvme_write_read_blockif(sc, nvstore, req,
2462                     cmd->prp1, cmd->prp2, offset, bytes, is_write);
2463
2464                 if (*status == NVME_NO_STATUS)
2465                         pending = true;
2466         }
2467 out:
2468         if (!pending)
2469                 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2470
2471         return (pending);
2472 }
2473
2474 static void
2475 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2476 {
2477         struct pci_nvme_ioreq *req = br->br_param;
2478         struct pci_nvme_softc *sc = req->sc;
2479         bool done = true;
2480         uint16_t status;
2481
2482         if (err) {
2483                 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2484         } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2485                 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2486         } else {
2487                 struct iovec *iov = req->io_req.br_iov;
2488
2489                 req->prev_gpaddr++;
2490                 iov += req->prev_gpaddr;
2491
2492                 /* The iov_* values are already in bytes (sector size applied) */
2493                 req->io_req.br_offset = (off_t)iov->iov_base;
2494                 req->io_req.br_resid = iov->iov_len;
2495                 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2496                         pci_nvme_status_genc(&status,
2497                             NVME_SC_INTERNAL_DEVICE_ERROR);
2498                 } else
2499                         done = false;
2500         }
2501
2502         if (done) {
2503                 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
2504                     req->cid, 0, status);
2505                 pci_nvme_release_ioreq(sc, req);
2506         }
2507 }
2508
2509 static bool
2510 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2511     struct nvme_command *cmd,
2512     struct pci_nvme_blockstore *nvstore,
2513     struct pci_nvme_ioreq *req,
2514     uint16_t *status)
2515 {
2516         struct nvme_dsm_range *range;
2517         uint32_t nr, r, non_zero, dr;
2518         int err;
2519         bool pending = false;
2520
2521         if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2522                 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2523                 goto out;
2524         }
2525
2526         nr = cmd->cdw10 & 0xff;
2527
2528         /* copy locally because a range entry could straddle PRPs */
2529         range = calloc(1, NVME_MAX_DSM_TRIM);
2530         if (range == NULL) {
2531                 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2532                 goto out;
2533         }
2534         nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2535             (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2536
2537         /* Check for invalid ranges and the number of non-zero lengths */
2538         non_zero = 0;
2539         for (r = 0; r <= nr; r++) {
2540                 if (pci_nvme_out_of_range(nvstore,
2541                     range[r].starting_lba, range[r].length)) {
2542                         pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2543                         goto out;
2544                 }
2545                 if (range[r].length != 0)
2546                         non_zero++;
2547         }
2548
2549         if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2550                 size_t offset, bytes;
2551                 int sectsz_bits = sc->nvstore.sectsz_bits;
2552
2553                 /*
2554                  * DSM calls are advisory only, and compliant controllers
2555                  * may choose to take no action (i.e. return Success).
2556                  */
2557                 if (!nvstore->deallocate) {
2558                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2559                         goto out;
2560                 }
2561
2562                 /* If all ranges have a zero length, return Success */
2563                 if (non_zero == 0) {
2564                         pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2565                         goto out;
2566                 }
2567
2568                 if (req == NULL) {
2569                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2570                         goto out;
2571                 }
2572
2573                 offset = range[0].starting_lba << sectsz_bits;
2574                 bytes = (size_t)range[0].length << sectsz_bits;
2575
2576                 /*
2577                  * If the request is for more than a single range, store
2578                  * the ranges in the br_iov. Optimize for the common case
2579                  * of a single range.
2580                  *
2581                  * Note that the NVMe Number of Ranges is a zero-based value
2582                  */
2583                 req->io_req.br_iovcnt = 0;
2584                 req->io_req.br_offset = offset;
2585                 req->io_req.br_resid = bytes;
2586
2587                 if (nr == 0) {
2588                         req->io_req.br_callback = pci_nvme_io_done;
2589                 } else {
2590                         struct iovec *iov = req->io_req.br_iov;
2591
2592                         for (r = 0, dr = 0; r <= nr; r++) {
2593                                 offset = range[r].starting_lba << sectsz_bits;
2594                                 bytes = (size_t)range[r].length << sectsz_bits;
2595                                 if (bytes == 0)
2596                                         continue;
2597
2598                                 if ((nvstore->size - offset) < bytes) {
2599                                         pci_nvme_status_genc(status,
2600                                             NVME_SC_LBA_OUT_OF_RANGE);
2601                                         goto out;
2602                                 }
2603                                 iov[dr].iov_base = (void *)offset;
2604                                 iov[dr].iov_len = bytes;
2605                                 dr++;
2606                         }
2607                         req->io_req.br_callback = pci_nvme_dealloc_sm;
2608
2609                         /*
2610                          * Use prev_gpaddr to track the current entry and
2611                          * prev_size to track the number of entries
2612                          */
2613                         req->prev_gpaddr = 0;
2614                         req->prev_size = dr;
2615                 }
2616
2617                 err = blockif_delete(nvstore->ctx, &req->io_req);
2618                 if (err)
2619                         pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2620                 else
2621                         pending = true;
2622         }
2623 out:
2624         free(range);
2625         return (pending);
2626 }
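
/*
 * Illustrative sketch: each entry is a struct nvme_dsm_range (from
 * <dev/nvme/nvme.h>) describing one LBA extent, e.g. a guest trimming
 * 256 blocks starting at LBA 0x1000:
 *
 *	range[0].starting_lba = 0x1000;
 *	range[0].length = 256;		   length is in logical blocks
 *
 * CDW10 bits 7:0 hold the 0's based Number of Ranges, so nr == 0 is
 * the optimized single-range path above.
 */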
2627
2628 static void
2629 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2630 {
2631         struct nvme_submission_queue *sq;
2632         uint16_t status;
2633         uint16_t sqhead;
2634
2635         /* handle all submissions up to sq->tail index */
2636         sq = &sc->submit_queues[idx];
2637
2638         pthread_mutex_lock(&sq->mtx);
2639
2640         sqhead = sq->head;
2641         DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2642                  idx, sqhead, sq->tail, sq->qbase);
2643
2644         while (sqhead != atomic_load_acq_short(&sq->tail)) {
2645                 struct nvme_command *cmd;
2646                 struct pci_nvme_ioreq *req;
2647                 uint32_t nsid;
2648                 bool pending;
2649
2650                 pending = false;
2651                 req = NULL;
2652                 status = 0;
2653
2654                 cmd = &sq->qbase[sqhead];
2655                 sqhead = (sqhead + 1) % sq->size;
2656
2657                 nsid = le32toh(cmd->nsid);
2658                 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2659                         pci_nvme_status_genc(&status,
2660                             NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2661                         status |=
2662                             NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2663                         goto complete;
2664                 }
2665
2666                 req = pci_nvme_get_ioreq(sc);
2667                 if (req == NULL) {
2668                         pci_nvme_status_genc(&status,
2669                             NVME_SC_INTERNAL_DEVICE_ERROR);
2670                         WPRINTF("%s: unable to allocate IO req", __func__);
2671                         goto complete;
2672                 }
2673                 req->nvme_sq = sq;
2674                 req->sqid = idx;
2675                 req->opc = cmd->opc;
2676                 req->cid = cmd->cid;
2677                 req->nsid = cmd->nsid;
2678
2679                 switch (cmd->opc) {
2680                 case NVME_OPC_FLUSH:
2681                         pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2682                             req, &status);
2683                         break;
2684                 case NVME_OPC_WRITE:
2685                 case NVME_OPC_READ:
2686                         pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2687                             req, &status);
2688                         break;
2689                 case NVME_OPC_WRITE_ZEROES:
2690                         /* TODO: write zeroes
2691                         WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2692                                 __func__, lba, cmd->cdw12 & 0xFFFF); */
2693                         pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2694                         break;
2695                 case NVME_OPC_DATASET_MANAGEMENT:
2696                         pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2697                             req, &status);
2698                         break;
2699                 default:
2700                         WPRINTF("%s unhandled io command 0x%x",
2701                             __func__, cmd->opc);
2702                         pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2703                 }
2704 complete:
2705                 if (!pending) {
2706                         pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
2707                             status);
2708                         if (req != NULL)
2709                                 pci_nvme_release_ioreq(sc, req);
2710                 }
2711         }
2712
2713         sq->head = sqhead;
2714
2715         pthread_mutex_unlock(&sq->mtx);
2716 }
2717
2718 static void
2719 pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
2720         uint64_t idx, int is_sq, uint64_t value)
2721 {
2722         DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2723                 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2724
2725         if (is_sq) {
2726                 if (idx > sc->num_squeues) {
2727                         WPRINTF("%s queue index %lu overflow from "
2728                                  "guest (max %u)",
2729                                  __func__, idx, sc->num_squeues);
2730                         return;
2731                 }
2732
2733                 atomic_store_short(&sc->submit_queues[idx].tail,
2734                                    (uint16_t)value);
2735
2736                 if (idx == 0) {
2737                         pci_nvme_handle_admin_cmd(sc, value);
2738                 } else {
2739                         /* handle new SQ entries (index range checked above) */
2746                         pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2747                 }
2748         } else {
2749                 if (idx > sc->num_cqueues) {
2750                         WPRINTF("%s queue index %lu overflow from "
2751                                  "guest (max %u)",
2752                                  __func__, idx, sc->num_cqueues);
2753                         return;
2754                 }
2755
2756                 atomic_store_short(&sc->compl_queues[idx].head,
2757                                 (uint16_t)value);
2758         }
2759 }
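
/*
 * Illustrative sketch: doorbell registers begin at NVME_DOORBELL_OFFSET
 * (0x1000 in the NVMe register map) and, with a doorbell stride of 0
 * (CAP.DSTRD == 0), are laid out per queue pair:
 *
 *	0x1000 + (8 * qid)     - SQ qid Tail Doorbell
 *	0x1000 + (8 * qid) + 4 - CQ qid Head Doorbell
 *
 * which is why the BAR0 write handler below derives
 * idx = belloffset / 8 and is_sq = (belloffset % 8) < 4.
 */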
2760
2761 static void
2762 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2763 {
2764         const char *s = iswrite ? "WRITE" : "READ";
2765
2766         switch (offset) {
2767         case NVME_CR_CAP_LOW:
2768                 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2769                 break;
2770         case NVME_CR_CAP_HI:
2771                 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2772                 break;
2773         case NVME_CR_VS:
2774                 DPRINTF("%s %s NVME_CR_VS", func, s);
2775                 break;
2776         case NVME_CR_INTMS:
2777                 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2778                 break;
2779         case NVME_CR_INTMC:
2780                 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2781                 break;
2782         case NVME_CR_CC:
2783                 DPRINTF("%s %s NVME_CR_CC", func, s);
2784                 break;
2785         case NVME_CR_CSTS:
2786                 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2787                 break;
2788         case NVME_CR_NSSR:
2789                 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2790                 break;
2791         case NVME_CR_AQA:
2792                 DPRINTF("%s %s NVME_CR_AQA", func, s);
2793                 break;
2794         case NVME_CR_ASQ_LOW:
2795                 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2796                 break;
2797         case NVME_CR_ASQ_HI:
2798                 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2799                 break;
2800         case NVME_CR_ACQ_LOW:
2801                 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2802                 break;
2803         case NVME_CR_ACQ_HI:
2804                 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2805                 break;
2806         default:
2807                 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
2808         }
2809
2810 }
2811
2812 static void
2813 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2814         uint64_t offset, int size, uint64_t value)
2815 {
2816         uint32_t ccreg;
2817
2818         if (offset >= NVME_DOORBELL_OFFSET) {
2819                 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2820                 uint64_t idx = belloffset / 8; /* doorbell pair = 2 x 4 bytes */
2821                 int is_sq = (belloffset % 8) < 4;
2822
2823                 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2824                         WPRINTF("guest attempted an overflow write offset "
2825                                  "0x%lx, val 0x%lx in %s",
2826                                  offset, value, __func__);
2827                         return;
2828                 }
2829
2830                 pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
2831                 return;
2832         }
2833
2834         DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2835                 offset, size, value);
2836
2837         if (size != 4) {
2838                 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2839                          "val 0x%lx) to bar0 in %s",
2840                          size, offset, value, __func__);
2841                 /* TODO: shutdown device */
2842                 return;
2843         }
2844
2845         pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2846
2847         pthread_mutex_lock(&sc->mtx);
2848
2849         switch (offset) {
2850         case NVME_CR_CAP_LOW:
2851         case NVME_CR_CAP_HI:
2852                 /* readonly */
2853                 break;
2854         case NVME_CR_VS:
2855                 /* readonly */
2856                 break;
2857         case NVME_CR_INTMS:
2858                 /* MSI-X, so ignore */
2859                 break;
2860         case NVME_CR_INTMC:
2861                 /* MSI-X, so ignore */
2862                 break;
2863         case NVME_CR_CC:
2864                 ccreg = (uint32_t)value;
2865
2866                 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
2867                          "iocqes %u",
2868                         __func__,
2869                          NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
2870                          NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
2871                          NVME_CC_GET_IOCQES(ccreg));
2872
2873                 if (NVME_CC_GET_SHN(ccreg)) {
2874                         /* perform shutdown - flush out data to backend */
2875                         sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
2876                             NVME_CSTS_REG_SHST_SHIFT);
2877                         sc->regs.csts |= NVME_SHST_COMPLETE <<
2878                             NVME_CSTS_REG_SHST_SHIFT;
2879                 }
2880                 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
2881                         if (NVME_CC_GET_EN(ccreg) == 0)
2882                                 /* transition 1->0 causes controller reset */
2883                                 pci_nvme_reset_locked(sc);
2884                         else
2885                                 pci_nvme_init_controller(ctx, sc);
2886                 }
2887
2888                 /* Insert the iocqes, iosqes and en bits from the write */
2889                 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
2890                 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
2891                 if (NVME_CC_GET_EN(ccreg) == 0) {
2892                         /* Insert the ams, mps and css bit fields */
2893                         sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
2894                         sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
2895                         sc->regs.csts &= ~NVME_CSTS_RDY;
2896                 } else if (sc->pending_ios == 0) {
2897                         sc->regs.csts |= NVME_CSTS_RDY;
2898                 }
2899                 break;
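
	/*
	 * Example enable sequence (illustrative): after programming
	 * AQA/ASQ/ACQ, a guest typically writes CC with EN=1, IOSQES=6
	 * (64-byte submission entries) and IOCQES=4 (16-byte completion
	 * entries); the code above then calls pci_nvme_init_controller()
	 * and, with no I/O pending, reports CSTS.RDY. Writing EN=0 instead
	 * resets the controller and clears CSTS.RDY.
	 */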
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
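
	/*
	 * Example (illustrative): a guest builds the 64-bit admin submission
	 * queue base with two 32-bit writes, say 0x00003000 to ASQ_LOW and
	 * 0x00000001 to ASQ_HI, leaving sc->regs.asq == 0x0000000100003000.
	 * The 0xFFFFF000 mask forces the low 12 bits to zero, as the queue
	 * base must be 4 KiB aligned.
	 */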
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}

static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc *sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    "value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
static uint64_t
pci_nvme_read_bar_0(struct pci_nvme_softc *sc, uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset 0x%lx", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
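
/*
 * Example (illustrative): a 2-byte guest read at offset 0x08 copies only
 * the low two bytes of the VS (Version) register into 'value'; the size
 * switch above masks off the remaining, uninitialized bytes so the guest
 * sees just the width it asked for.
 */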

static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc *sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return (pci_emul_msix_tread(pi, offset, size));
	}

	switch (baridx) {
	case 0:
		return (pci_nvme_read_bar_0(sc, offset, size));

	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}

static int
pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
{
	char bident[sizeof("XX:X:X")];
	const char *value;
	uint32_t sectsz;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

	value = get_config_value_node(nvl, "maxq");
	if (value != NULL)
		sc->max_queues = atoi(value);
	value = get_config_value_node(nvl, "qsz");
	if (value != NULL) {
		sc->max_qentries = atoi(value);
		if (sc->max_qentries <= 0) {
			EPRINTLN("nvme: Invalid qsz option %d",
			    sc->max_qentries);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "ioslots");
	if (value != NULL) {
		sc->ioslots = atoi(value);
		if (sc->ioslots <= 0) {
			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "sectsz");
	if (value != NULL)
		sectsz = atoi(value);
	value = get_config_value_node(nvl, "ser");
	if (value != NULL) {
		/*
		 * This field indicates the Product Serial Number in
		 * 7-bit ASCII; unused bytes should be space characters.
		 * Ref: NVMe v1.3c.
		 */
		cpywithpad((char *)sc->ctrldata.sn,
		    sizeof(sc->ctrldata.sn), value, ' ');
	}
	value = get_config_value_node(nvl, "eui64");
	if (value != NULL)
		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
	value = get_config_value_node(nvl, "dsm");
	if (value != NULL) {
		if (strcmp(value, "auto") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
		else if (strcmp(value, "enable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
		else if (strcmp(value, "disable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
	}

	value = get_config_value_node(nvl, "ram");
	if (value != NULL) {
		uint64_t sz = strtoull(value, NULL, 10);

		sc->nvstore.type = NVME_STOR_RAM;
		sc->nvstore.size = sz * 1024 * 1024;
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		sc->nvstore.sectsz = 4096;
		sc->nvstore.sectsz_bits = 12;
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Unable to allocate RAM");
			return (-1);
		}
	} else {
		snprintf(bident, sizeof(bident), "%d:%d",
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
		sc->nvstore.ctx = blockif_open(nvl, bident);
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Could not open backing file: %s",
			    strerror(errno));
			return (-1);
		}
		sc->nvstore.type = NVME_STOR_BLOCKIF;
		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
	}

	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
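	/* Derive log2(sectsz): e.g. 512 -> 9, 4096 -> 12, 8192 -> 13 */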
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	return (0);
}

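/*
 * Example pci_nvme_parse_config() input (illustrative):
 * "-s 4,nvme,ram=1024,maxq=8,ser=NVME0001" yields a 1 GiB RAM-backed
 * namespace with 4096-byte sectors, at most 8 queue pairs, and a
 * space-padded serial number of "NVME0001"; a "maxq" outside
 * 1..NVME_QUEUES silently falls back to NVME_QUEUES.
 */
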
static void
pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
{
	struct pci_nvme_softc *sc;
	struct pci_nvme_blockstore *nvstore;
	struct nvme_namespace_data *nd;

	sc = arg;
	nvstore = &sc->nvstore;
	nd = &sc->nsdata;

	nvstore->size = new_size;
	pci_nvme_init_nsdata_size(nvstore, nd);

	/* Add NSID 1 to the Changed Namespace List; a zero entry ends it */
	sc->ns_log.ns[0] = 1;
	sc->ns_log.ns[1] = 0;

	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
	    PCI_NVME_AE_INFO_NS_ATTR_CHANGED);
}

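/*
 * Illustrative flow: growing the backing store (e.g. resizing a zvol that
 * backs the device) invokes the blockif resize callback above, which
 * refreshes the namespace size fields and posts a Namespace Attribute
 * Changed asynchronous event so the guest rescans the namespace.
 */
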
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int	error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	if (sc == NULL)
		return (ENOMEM);
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_config(sc, nvl);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	if (sc->ioreqs == NULL) {
		error = ENOMEM;
		goto done;
	}
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * window.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
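
	/*
	 * For example (illustrative): with the default 16 queue pairs the
	 * computed size is sizeof(struct nvme_registers) plus 2 * 4 * 17
	 * doorbell bytes, well under 16 KiB, so NVME_MMIO_SPACE_MIN wins
	 * and the BAR is 16 KiB.
	 */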

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);
	/* Only blockif-backed stores have a context that can be resized */
	if (sc->nvstore.type == NVME_STOR_BLOCKIF)
		blockif_register_resize_callback(sc->nvstore.ctx,
		    pci_nvme_resized, sc);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_aer_init(sc);
	pci_nvme_aen_init(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}

static int
pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *ram;

	if (opts == NULL)
		return (0);

	if (strncmp(opts, "ram=", 4) == 0) {
		cp = strchr(opts, ',');
		if (cp == NULL) {
			set_config_value_node(nvl, "ram", opts + 4);
			return (0);
		}
		ram = strndup(opts + 4, cp - opts - 4);
		set_config_value_node(nvl, "ram", ram);
		free(ram);
		return (pci_parse_legacy_config(nvl, cp + 1));
	} else
		return (blockif_legacy_config(nvl, opts));
}
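
/*
 * Example (illustrative): legacy opts "ram=1024,ser=NVME0001" store "1024"
 * in the "ram" node and hand "ser=NVME0001" on to pci_parse_legacy_config();
 * opts not starting with "ram=" are passed whole to blockif_legacy_config()
 * as a device path plus block-device options.
 */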

struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_legacy_config = pci_nvme_legacy_config,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);