/*
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2017 Shunsuke Mie
 * Copyright (c) 2018 Leon Dang
 * Copyright (c) 2020 Chuck Tuffli
 *
 * Function crc16 Copyright (c) 2017, Fedor Uporov
 *     Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * bhyve PCIe-NVMe device emulation.
 *
 * options:
 *  -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
 *
 *  maxq    = max number of queues
 *  qsz     = max elements in each queue
 *  ioslots = max number of concurrent io requests
 *  sectsz  = sector size (defaults to blockif sector size)
 *  ser     = serial number (20-chars max)
 *  eui64   = IEEE Extended Unique Identifier (8 byte value)
 *  dsm     = DataSet Management support. Option is one of auto, enable, disable
 */
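/*
 * Example invocation (hypothetical image path and values):
 *  -s 4,nvme,/var/tmp/nvme-disk.img,maxq=4,qsz=512,ioslots=16,ser=BHYVE123,dsm=auto
 */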
/* TODO:
    - create async event for smart and log
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/errno.h>
#include <sys/types.h>
#include <net/ieee_oui.h>

#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <semaphore.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <machine/atomic.h>
#include <machine/vmm.h>
#include <vmmapi.h>

#include <dev/nvme/nvme.h>

#include "bhyverun.h"
#include "block_if.h"
#include "config.h"
#include "debug.h"
#include "pci_emul.h"
static int nvme_debug = 0;
#define	DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
#define	WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
/* defaults; can be overridden */
#define	NVME_MSIX_BAR		4

#define	NVME_IOSLOTS		8

/* The NVMe spec defines bits 13:4 in BAR0 as reserved */
#define	NVME_MMIO_SPACE_MIN	(1 << 14)

#define	NVME_QUEUES		16
#define	NVME_MAX_QENTRIES	2048
/* Memory Page size Minimum reported in CAP register */
#define	NVME_MPSMIN		0
/* MPSMIN converted to bytes */
#define	NVME_MPSMIN_BYTES	(1 << (12 + NVME_MPSMIN))

#define	NVME_PRP2_ITEMS		(PAGE_SIZE/sizeof(uint64_t))
#define	NVME_MDTS		9
/* Note the + 1 allows for the initial descriptor to not be page aligned */
#define	NVME_MAX_IOVEC		((1 << NVME_MDTS) + 1)
#define	NVME_MAX_DATA_SIZE	((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
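/*
 * With NVME_MDTS = 9 (defined above) and NVME_MPSMIN = 0 (4 KiB pages),
 * NVME_MAX_DATA_SIZE is 512 * 4096 bytes = 2 MiB, described by at most
 * NVME_MAX_IOVEC = 513 descriptors.
 */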
/* This is a synthetic status code to indicate there is no status */
#define	NVME_NO_STATUS		0xffff
#define	NVME_COMPLETION_VALID(c)	((c).status != NVME_NO_STATUS)

/* Reported temperature in Kelvin (i.e. room temperature) */
#define	NVME_TEMPERATURE	296
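/* 296 K is roughly 23 degrees Celsius */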
/* Convert a zero-based value into a one-based value */
#define	ONE_BASED(zero)		((zero) + 1)
/* Convert a one-based value into a zero-based value */
#define	ZERO_BASED(one)		((one) - 1)

/* Encode number of SQ's and CQ's for Set/Get Features */
#define	NVME_FEATURE_NUM_QUEUES(sc) \
	((ZERO_BASED((sc)->num_squeues) & 0xffff) | \
	 ((ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16))
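/*
 * Example: num_squeues = 4 and num_cqueues = 2 encode as 0x00010003
 * (both counts are zero-based: NSQA in the low word, NCQA in the high word).
 */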
#define	NVME_DOORBELL_OFFSET	offsetof(struct nvme_registers, doorbell)

enum nvme_controller_register_offsets {
	NVME_CR_CAP_LOW = 0x00,
	NVME_CR_CAP_HI  = 0x04,
	NVME_CR_VS      = 0x08,
	NVME_CR_INTMS   = 0x0c,
	NVME_CR_INTMC   = 0x10,
	NVME_CR_CC      = 0x14,
	NVME_CR_CSTS    = 0x1c,
	NVME_CR_NSSR    = 0x20,
	NVME_CR_AQA     = 0x24,
	NVME_CR_ASQ_LOW = 0x28,
	NVME_CR_ASQ_HI  = 0x2c,
	NVME_CR_ACQ_LOW = 0x30,
	NVME_CR_ACQ_HI  = 0x34,
};
enum nvme_cmd_cdw11 {
	NVME_CMD_CDW11_PC  = 0x0001,
	NVME_CMD_CDW11_IEN = 0x0002,
	NVME_CMD_CDW11_IV  = 0xFFFF0000,
};

#define	NVME_CQ_INTEN	0x01
#define	NVME_CQ_INTCOAL	0x02
struct nvme_completion_queue {
	struct nvme_completion *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	tail;	/* nvme progress */
	uint16_t	head;	/* guest progress */
	uint16_t	intr_vec;
	uint32_t	intr_en;
};

struct nvme_submission_queue {
	struct nvme_command *qbase;
	pthread_mutex_t	mtx;
	uint32_t	size;
	uint16_t	head;	/* nvme progress */
	uint16_t	tail;	/* guest progress */
	uint16_t	cqid;	/* completion queue id */
	int		qpriority;
};
enum nvme_storage_type {
	NVME_STOR_BLOCKIF = 0,
	NVME_STOR_RAM = 1,
};

struct pci_nvme_blockstore {
	enum nvme_storage_type type;
	void		*ctx;
	uint64_t	size;
	uint64_t	eui64;
	uint32_t	sectsz;
	uint32_t	sectsz_bits;
	uint32_t	deallocate:1;
};
/*
 * Calculate the number of additional page descriptors for guest IO requests
 * based on the advertised Max Data Transfer (MDTS) and given the number of
 * default iovec's in a struct blockif_req.
 */
#define	MDTS_PAD_SIZE \
	( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
	  NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
	  0 )
struct pci_nvme_ioreq {
	struct pci_nvme_softc *sc;
	STAILQ_ENTRY(pci_nvme_ioreq) link;
	struct nvme_submission_queue *nvme_sq;
	uint16_t	sqid;

	/* command information */
	uint16_t	opc;
	uint16_t	cid;
	uint32_t	nsid;

	uint64_t	prev_gpaddr;
	size_t		prev_size;
	size_t		bytes;

	struct blockif_req io_req;

	struct iovec	iovpadding[MDTS_PAD_SIZE];
};
enum nvme_dsm_type {
	/* Dataset Management bit in ONCS reflects backing storage capability */
	NVME_DATASET_MANAGEMENT_AUTO,
	/* Unconditionally set Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_ENABLE,
	/* Unconditionally clear Dataset Management bit in ONCS */
	NVME_DATASET_MANAGEMENT_DISABLE,
};

enum nvme_copy_dir {
	NVME_COPY_TO_PRP,
	NVME_COPY_FROM_PRP,
};
struct pci_nvme_softc;
struct nvme_feature_obj;

typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

struct nvme_feature_obj {
	uint32_t	cdw11;
	nvme_feature_cb	set;
	nvme_feature_cb	get;
	bool		namespace_specific;
};

#define	NVME_FID_MAX	(NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
typedef enum {
	PCI_NVME_AE_TYPE_ERROR = 0,
	PCI_NVME_AE_TYPE_SMART,
	PCI_NVME_AE_TYPE_NOTICE,
	PCI_NVME_AE_TYPE_IO_CMD = 6,
	PCI_NVME_AE_TYPE_VENDOR = 7,
	PCI_NVME_AE_TYPE_MAX		/* Must be last */
} pci_nvme_async_type;
/* Asynchronous Event Requests */
struct pci_nvme_aer {
	STAILQ_ENTRY(pci_nvme_aer) link;
	uint16_t	cid;	/* Command ID of the submitted AER */
};
typedef enum {
	PCI_NVME_AE_INFO_NS_ATTR_CHANGED = 0,
	PCI_NVME_AE_INFO_FW_ACTIVATION,
	PCI_NVME_AE_INFO_TELEMETRY_CHANGE,
	PCI_NVME_AE_INFO_ANA_CHANGE,
	PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE,
	PCI_NVME_AE_INFO_LBA_STATUS_ALERT,
	PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE,
	PCI_NVME_AE_INFO_MAX,
} pci_nvme_async_info;
/* Asynchronous Event Notifications */
struct pci_nvme_aen {
	pci_nvme_async_type atype;
	uint32_t	event_data;	/* Lower 32 bits of the event state */
	bool		posted;		/* true if an AEN has been posted */
};

typedef enum {
	NVME_CNTRLTYPE_IO = 1,
	NVME_CNTRLTYPE_DISCOVERY = 2,
	NVME_CNTRLTYPE_ADMIN = 3,
} pci_nvme_cntrl_type;
struct pci_nvme_softc {
	struct pci_devinst *nsc_pi;

	pthread_mutex_t	mtx;

	struct nvme_registers regs;

	struct nvme_namespace_data  nsdata;
	struct nvme_controller_data ctrldata;
	struct nvme_error_information_entry err_log;
	struct nvme_health_information_page health_log;
	struct nvme_firmware_page fw_log;
	struct nvme_ns_list ns_log;

	struct pci_nvme_blockstore nvstore;

	uint16_t	max_qentries;	/* max entries per queue */
	uint32_t	max_queues;	/* max number of IO SQ's or CQ's */
	uint32_t	num_cqueues;
	uint32_t	num_squeues;
	bool		num_q_is_set;	/* Has host set Number of Queues */

	struct pci_nvme_ioreq *ioreqs;
	STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
	uint32_t	pending_ios;
	sem_t		iosemlock;

	/*
	 * Memory mapped Submission and Completion queues
	 * Each array includes both Admin and IO queues
	 */
	struct nvme_completion_queue *compl_queues;
	struct nvme_submission_queue *submit_queues;

	struct nvme_feature_obj feat[NVME_FID_MAX];

	enum nvme_dsm_type dataset_management;

	/* Accounting for SMART data */
	__uint128_t	read_data_units;
	__uint128_t	write_data_units;
	__uint128_t	read_commands;
	__uint128_t	write_commands;
	uint32_t	read_dunits_remainder;
	uint32_t	write_dunits_remainder;

	STAILQ_HEAD(, pci_nvme_aer) aer_list;
	pthread_mutex_t	aer_mtx;
	uint32_t	aer_count;
	struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
	pthread_t	aen_tid;
	pthread_mutex_t	aen_mtx;
	pthread_cond_t	aen_cond;
};
static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status);
static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
static void pci_nvme_io_done(struct blockif_req *, int);
/* Controller Configuration utils */
#define	NVME_CC_GET_EN(cc) \
	((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
#define	NVME_CC_GET_CSS(cc) \
	((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
#define	NVME_CC_GET_SHN(cc) \
	((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
#define	NVME_CC_GET_IOSQES(cc) \
	((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
#define	NVME_CC_GET_IOCQES(cc) \
	((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)

#define	NVME_CC_WRITE_MASK \
	((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
	 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
	 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))

#define	NVME_CC_NEN_WRITE_MASK \
	((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
	 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
	 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))

/* Controller Status utils */
#define	NVME_CSTS_GET_RDY(sts) \
	((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)

#define	NVME_CSTS_RDY	(1 << NVME_CSTS_REG_RDY_SHIFT)

/* Completion Queue status word utils */
#define	NVME_STATUS_P	(1 << NVME_STATUS_P_SHIFT)
#define	NVME_STATUS_MASK \
	((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) | \
	 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))

#define	NVME_ONCS_DSM	(NVME_CTRLR_DATA_ONCS_DSM_MASK << \
	NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_temperature(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_num_queues(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);
static void nvme_feature_iv_config(struct pci_nvme_softc *,
    struct nvme_feature_obj *,
    struct nvme_command *,
    struct nvme_completion *);

static void *aen_thr(void *arg);
static __inline void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}

static __inline void
pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
{

	*status &= ~NVME_STATUS_MASK;
	*status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
	    (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
}

static __inline void
pci_nvme_status_genc(uint16_t *status, uint16_t code)
{

	pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
}
/*
 * Initialize the requested number of IO Submission and Completion Queues.
 * Admin queues are allocated implicitly.
 */
static void
pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
{
	uint32_t i;

	/*
	 * Allocate and initialize the Submission Queues
	 */
	if (nsq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of SQ from %u to %u",
		    __func__, nsq, NVME_QUEUES);
		nsq = NVME_QUEUES;
	}

	sc->num_squeues = nsq;

	sc->submit_queues = calloc(sc->num_squeues + 1,
	    sizeof(struct nvme_submission_queue));
	if (sc->submit_queues == NULL) {
		WPRINTF("%s: SQ allocation failed", __func__);
		sc->num_squeues = 0;
	} else {
		struct nvme_submission_queue *sq = sc->submit_queues;

		for (i = 0; i < sc->num_squeues; i++)
			pthread_mutex_init(&sq[i].mtx, NULL);
	}

	/*
	 * Allocate and initialize the Completion Queues
	 */
	if (ncq > NVME_QUEUES) {
		WPRINTF("%s: clamping number of CQ from %u to %u",
		    __func__, ncq, NVME_QUEUES);
		ncq = NVME_QUEUES;
	}

	sc->num_cqueues = ncq;

	sc->compl_queues = calloc(sc->num_cqueues + 1,
	    sizeof(struct nvme_completion_queue));
	if (sc->compl_queues == NULL) {
		WPRINTF("%s: CQ allocation failed", __func__);
		sc->num_cqueues = 0;
	} else {
		struct nvme_completion_queue *cq = sc->compl_queues;

		for (i = 0; i < sc->num_cqueues; i++)
			pthread_mutex_init(&cq[i].mtx, NULL);
	}
}
static void
pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
	cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');

	/* Num of submission commands that we can handle at a time (2^rab) */
	cd->rab = 4;

	cd->mdts = NVME_MDTS;	/* max data transfer size (2^mdts * CAP.MPSMIN) */

	cd->ver = NVME_REV(1,4);

	cd->cntrltype = NVME_CNTRLTYPE_IO;
	cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;

	/* Advertise 1, Read-only firmware slot */
	cd->frmw = NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK |
	    (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
	cd->lpa = 0;	/* TODO: support some simple things like SMART */
	cd->elpe = 0;	/* max error log page entries */
	cd->npss = 1;	/* number of power states supported */

	/* Warning Composite Temperature Threshold */
	cd->wctemp = 0x0157;

	cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
	    (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
	cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
	    (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
	cd->nn = 1;	/* number of namespaces */

	cd->oncs = 0;
	switch (sc->dataset_management) {
	case NVME_DATASET_MANAGEMENT_AUTO:
		if (sc->nvstore.deallocate)
			cd->oncs |= NVME_ONCS_DSM;
		break;
	case NVME_DATASET_MANAGEMENT_ENABLE:
		cd->oncs |= NVME_ONCS_DSM;
		break;
	default:
		break;
	}

	cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
	    NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;

	cd->power_state[0].mp = 10;
}
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return crc;
}
static void
pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
    struct nvme_namespace_data *nd)
{

	/* Get capacity and block size information from backing store */
	nd->nsze = nvstore->size / nvstore->sectsz;
	nd->ncap = nd->nsze;
	nd->nuse = nd->nsze;
}

static void
pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
    struct nvme_namespace_data *nd, uint32_t nsid,
    struct pci_nvme_blockstore *nvstore)
{

	pci_nvme_init_nsdata_size(nvstore, nd);

	if (nvstore->type == NVME_STOR_BLOCKIF)
		nvstore->deallocate = blockif_candelete(nvstore->ctx);

	nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
	nd->flbas = 0;

	/* Create an EUI-64 if user did not provide one */
	if (nvstore->eui64 == 0) {
		char *data = NULL;
		uint64_t eui64 = nvstore->eui64;

		asprintf(&data, "%s%u%u%u", get_config_value("name"),
		    sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
		    sc->nsc_pi->pi_func);

		if (data != NULL) {
			eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
			free(data);
		}
		nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
	}
	be64enc(nd->eui64, nvstore->eui64);

	/* LBA data-sz = 2^lbads */
	nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
}
static void
pci_nvme_init_logpages(struct pci_nvme_softc *sc)
{

	memset(&sc->err_log, 0, sizeof(sc->err_log));
	memset(&sc->health_log, 0, sizeof(sc->health_log));
	memset(&sc->fw_log, 0, sizeof(sc->fw_log));
	memset(&sc->ns_log, 0, sizeof(sc->ns_log));

	/* Set read/write remainder to round up according to spec */
	sc->read_dunits_remainder = 999;
	sc->write_dunits_remainder = 999;

	/* Set nominal Health values checked by implementations */
	sc->health_log.temperature = NVME_TEMPERATURE;
	sc->health_log.available_spare = 100;
	sc->health_log.available_spare_threshold = 10;
}
static void
pci_nvme_init_features(struct pci_nvme_softc *sc)
{
	enum nvme_feature fid;

	for (fid = 0; fid < NVME_FID_MAX; fid++) {
		switch (fid) {
		case NVME_FEAT_ARBITRATION:
		case NVME_FEAT_POWER_MANAGEMENT:
		case NVME_FEAT_INTERRUPT_COALESCING: //XXX
		case NVME_FEAT_WRITE_ATOMICITY:
			/* Mandatory but no special handling required */
		//XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		//XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		//		  this returns a data buffer
			break;
		case NVME_FEAT_TEMPERATURE_THRESHOLD:
			sc->feat[fid].set = nvme_feature_temperature;
			break;
		case NVME_FEAT_ERROR_RECOVERY:
			sc->feat[fid].namespace_specific = true;
			break;
		case NVME_FEAT_NUMBER_OF_QUEUES:
			sc->feat[fid].set = nvme_feature_num_queues;
			break;
		case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
			sc->feat[fid].set = nvme_feature_iv_config;
			break;
		case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
			/* Enable all AENs by default */
			sc->feat[fid].cdw11 = 0x31f;
			break;
		default:
			sc->feat[fid].set = nvme_feature_invalid_cb;
			sc->feat[fid].get = nvme_feature_invalid_cb;
		}
	}
}
static void
pci_nvme_aer_reset(struct pci_nvme_softc *sc)
{

	STAILQ_INIT(&sc->aer_list);
	sc->aer_count = 0;
}

static void
pci_nvme_aer_init(struct pci_nvme_softc *sc)
{

	pthread_mutex_init(&sc->aer_mtx, NULL);
	pci_nvme_aer_reset(sc);
}

static void
pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	while (!STAILQ_EMPTY(&sc->aer_list)) {
		aer = STAILQ_FIRST(&sc->aer_list);
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
		free(aer);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	pci_nvme_aer_reset(sc);
}
static bool
pci_nvme_aer_available(struct pci_nvme_softc *sc)
{

	return (sc->aer_count != 0);
}

static bool
pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
{
	struct nvme_controller_data *cd = &sc->ctrldata;

	/* AERL is a zero-based value while aer_count is one-based */
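	/* e.g. aerl = 3 (zero-based) allows up to 4 outstanding AERs */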
	return (sc->aer_count == (cd->aerl + 1));
}
/*
 * Add an Async Event Request
 *
 * Stores an AER to be returned later if the Controller needs to notify the
 * host of an event.
 * Note that while the NVMe spec doesn't require Controllers to return AER's
 * in order, this implementation does preserve the order.
 */
static int
pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
{
	struct pci_nvme_aer *aer = NULL;

	aer = calloc(1, sizeof(struct pci_nvme_aer));
	if (aer == NULL)
		return (-1);

	/* Save the Command ID for use in the completion message */
	aer->cid = cid;

	pthread_mutex_lock(&sc->aer_mtx);
	sc->aer_count++;
	STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
	pthread_mutex_unlock(&sc->aer_mtx);

	return (0);
}
/*
 * Get an Async Event Request structure
 *
 * Returns a pointer to an AER previously submitted by the host or NULL if
 * no AER's exist. Caller is responsible for freeing the returned struct.
 */
static struct pci_nvme_aer *
pci_nvme_aer_get(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer = NULL;

	pthread_mutex_lock(&sc->aer_mtx);
	aer = STAILQ_FIRST(&sc->aer_list);
	if (aer != NULL) {
		sc->aer_count--;
		STAILQ_REMOVE_HEAD(&sc->aer_list, link);
	}
	pthread_mutex_unlock(&sc->aer_mtx);

	return (aer);
}
static void
pci_nvme_aen_reset(struct pci_nvme_softc *sc)
{
	uint32_t atype;

	memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));

	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		sc->aen[atype].atype = atype;
	}
}

static void
pci_nvme_aen_init(struct pci_nvme_softc *sc)
{
	char nstr[80];

	pci_nvme_aen_reset(sc);

	pthread_mutex_init(&sc->aen_mtx, NULL);
	pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
	snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
	    sc->nsc_pi->pi_func);
	pthread_set_name_np(sc->aen_tid, nstr);
}
static void
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{

	pci_nvme_aen_reset(sc);
}

/* Notify the AEN thread of pending work */
static void
pci_nvme_aen_notify(struct pci_nvme_softc *sc)
{

	pthread_cond_signal(&sc->aen_cond);
}
/*
 * Post an Asynchronous Event Notification
 */
static int
pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
    uint32_t event_data)
{
	struct pci_nvme_aen *aen;

	if (atype >= PCI_NVME_AE_TYPE_MAX) {
		return (EINVAL);
	}

	pthread_mutex_lock(&sc->aen_mtx);
	aen = &sc->aen[atype];

	/* Has the controller already posted an event of this type? */
	if (aen->posted) {
		pthread_mutex_unlock(&sc->aen_mtx);
		return (EALREADY);
	}

	aen->event_data = event_data;
	aen->posted = true;
	pthread_mutex_unlock(&sc->aen_mtx);

	pci_nvme_aen_notify(sc);

	return (0);
}
static void
pci_nvme_aen_process(struct pci_nvme_softc *sc)
{
	struct pci_nvme_aer *aer;
	struct pci_nvme_aen *aen;
	pci_nvme_async_type atype;
	uint32_t mask;
	uint16_t status;
	uint8_t lid;

	assert(pthread_mutex_isowned_np(&sc->aen_mtx));
	for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
		aen = &sc->aen[atype];
		/* Previous iterations may have depleted the available AER's */
		if (!pci_nvme_aer_available(sc)) {
			DPRINTF("%s: no AER", __func__);
			break;
		}

		if (!aen->posted) {
			DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
			continue;
		}

		status = NVME_SC_SUCCESS;

		/* Is the event masked? */
		mask =
		    sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;

		DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__,
		    atype, mask, aen->event_data);
		switch (atype) {
		case PCI_NVME_AE_TYPE_ERROR:
			lid = NVME_LOG_ERROR;
			break;
		case PCI_NVME_AE_TYPE_SMART:
			if ((mask & aen->event_data) == 0)
				continue;
			lid = NVME_LOG_HEALTH_INFORMATION;
			break;
		case PCI_NVME_AE_TYPE_NOTICE:
			if (aen->event_data >= PCI_NVME_AE_INFO_MAX) {
				EPRINTLN("%s unknown AEN notice type %u",
				    __func__, aen->event_data);
				status = NVME_SC_INTERNAL_DEVICE_ERROR;
				break;
			}
			if (((1 << aen->event_data) & mask) == 0)
				continue;
			switch (aen->event_data) {
			case PCI_NVME_AE_INFO_NS_ATTR_CHANGED:
				lid = NVME_LOG_CHANGED_NAMESPACE;
				break;
			case PCI_NVME_AE_INFO_FW_ACTIVATION:
				lid = NVME_LOG_FIRMWARE_SLOT;
				break;
			case PCI_NVME_AE_INFO_TELEMETRY_CHANGE:
				lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
				break;
			case PCI_NVME_AE_INFO_ANA_CHANGE:
				lid = NVME_LOG_ASYMMETRIC_NAMESPAVE_ACCESS; //TODO spelling
				break;
			case PCI_NVME_AE_INFO_PREDICT_LATENCY_CHANGE:
				lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
				break;
			case PCI_NVME_AE_INFO_LBA_STATUS_ALERT:
				lid = NVME_LOG_LBA_STATUS_INFORMATION;
				break;
			case PCI_NVME_AE_INFO_ENDURANCE_GROUP_CHANGE:
				lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
				break;
			}
			break;
		default:
			EPRINTLN("%s unknown AEN type %u", __func__, atype);
			status = NVME_SC_INTERNAL_DEVICE_ERROR;
			break;
		}

		aer = pci_nvme_aer_get(sc);
		assert(aer != NULL);

		DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid,
		    (lid << 16) | (aen->event_data << 8) | atype);
		pci_nvme_cq_update(sc, &sc->compl_queues[0],
		    (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
		    aer->cid,
		    0,		/* SQID */
		    status);

		aen->event_data = 0;
		aen->posted = false;
		free(aer);

		pci_generate_msix(sc->nsc_pi, 0);
	}
}
static void *
aen_thr(void *arg)
{
	struct pci_nvme_softc *sc;

	sc = arg;

	pthread_mutex_lock(&sc->aen_mtx);
	for (;;) {
		pci_nvme_aen_process(sc);
		pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
	}
	pthread_mutex_unlock(&sc->aen_mtx);

	pthread_exit(NULL);
	return (NULL);
}
static void
pci_nvme_reset_locked(struct pci_nvme_softc *sc)
{
	uint32_t i;

	DPRINTF("%s", __func__);

	sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
	    (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
	    (60 << NVME_CAP_LO_REG_TO_SHIFT);

	sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;

	sc->regs.vs = NVME_REV(1,4);	/* NVMe v1.4 */

	sc->regs.cc = 0;

	assert(sc->submit_queues != NULL);

	for (i = 0; i < sc->num_squeues + 1; i++) {
		sc->submit_queues[i].qbase = NULL;
		sc->submit_queues[i].size = 0;
		sc->submit_queues[i].cqid = 0;
		sc->submit_queues[i].tail = 0;
		sc->submit_queues[i].head = 0;
	}

	assert(sc->compl_queues != NULL);

	for (i = 0; i < sc->num_cqueues + 1; i++) {
		sc->compl_queues[i].qbase = NULL;
		sc->compl_queues[i].size = 0;
		sc->compl_queues[i].tail = 0;
		sc->compl_queues[i].head = 0;
	}

	sc->num_q_is_set = false;

	pci_nvme_aer_destroy(sc);
	pci_nvme_aen_destroy(sc);

	/*
	 * Clear CSTS.RDY last to prevent the host from enabling Controller
	 * before cleanup completes
	 */
	sc->regs.csts = 0;
}
static void
pci_nvme_reset(struct pci_nvme_softc *sc)
{

	pthread_mutex_lock(&sc->mtx);
	pci_nvme_reset_locked(sc);
	pthread_mutex_unlock(&sc->mtx);
}
static void
pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
{
	uint16_t acqs, asqs;

	DPRINTF("%s", __func__);

	asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1;
	sc->submit_queues[0].size = asqs;
	sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
	    sizeof(struct nvme_command) * asqs);

	DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
	    __func__, sc->regs.asq, sc->submit_queues[0].qbase);

	acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
	    NVME_AQA_REG_ACQS_MASK) + 1;
	sc->compl_queues[0].size = acqs;
	sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
	    sizeof(struct nvme_completion) * acqs);
	sc->compl_queues[0].intr_en = NVME_CQ_INTEN;

	DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
	    __func__, sc->regs.acq, sc->compl_queues[0].qbase);
}
static int
nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
    size_t len, enum nvme_copy_dir dir)
{
	uint8_t *p;
	size_t bytes;

	if (len > (8 * 1024)) {
		return (-1);
	}
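	/*
	 * Without a PRP list, PRP1 and PRP2 can describe at most two
	 * physical pages (8 KiB with 4 KiB pages), which is why this
	 * helper refuses larger copies.
	 */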
	/* Copy from the start of prp1 to the end of the physical page */
	bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
	bytes = MIN(bytes, len);

	p = vm_map_gpa(ctx, prp1, bytes);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, bytes);
	else
		memcpy(b, p, bytes);

	b += bytes;
	len -= bytes;
	if (len == 0) {
		return (0);
	}

	len = MIN(len, PAGE_SIZE);

	p = vm_map_gpa(ctx, prp2, len);
	if (p == NULL) {
		return (-1);
	}

	if (dir == NVME_COPY_TO_PRP)
		memcpy(p, b, len);
	else
		memcpy(b, p, len);

	return (0);
}
/*
 * Write a Completion Queue Entry update
 *
 * Write the completion and update the doorbell value
 */
static void
pci_nvme_cq_update(struct pci_nvme_softc *sc,
    struct nvme_completion_queue *cq,
    uint32_t cdw0,
    uint16_t cid,
    uint16_t sqid,
    uint16_t status)
{
	struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
	struct nvme_completion *cqe;

	assert(cq->qbase != NULL);

	pthread_mutex_lock(&cq->mtx);

	cqe = &cq->qbase[cq->tail];

	/* Flip the phase bit */
	status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
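	/*
	 * The host tracks an expected Phase Tag value for each pass through
	 * the CQ; XORing the prior entry's phase bit yields the toggled
	 * value that marks this entry as newly posted.
	 */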
	cqe->cdw0 = cdw0;
	cqe->sqhd = sq->head;
	cqe->sqid = sqid;
	cqe->cid = cid;
	cqe->status = status;

	cq->tail++;
	if (cq->tail >= cq->size) {
		cq->tail = 0;
	}

	pthread_mutex_unlock(&cq->mtx);
}
static int
nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;

	DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_squeues ||
	    (sc->submit_queues[qid].qbase == NULL)) {
		WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
		    __func__, qid, sc->num_squeues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	sc->submit_queues[qid].qbase = NULL;
	sc->submit_queues[qid].cqid = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	if (command->cdw11 & NVME_CMD_CDW11_PC) {
		uint16_t qid = command->cdw10 & 0xffff;
		struct nvme_submission_queue *nsq;

		if ((qid == 0) || (qid > sc->num_squeues) ||
		    (sc->submit_queues[qid].qbase != NULL)) {
			WPRINTF("%s queue index %u > num_squeues %u",
			    __func__, qid, sc->num_squeues);
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		nsq = &sc->submit_queues[qid];
		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
		DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
		if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
			/*
			 * Queues must specify at least two entries
			 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
			 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
			 */
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
			return (1);
		}
		nsq->head = nsq->tail = 0;

		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
		if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
			return (1);
		}

		if (sc->compl_queues[nsq->cqid].qbase == NULL) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_COMPLETION_QUEUE_INVALID);
			return (1);
		}

		nsq->qpriority = (command->cdw11 >> 1) & 0x03;

		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(struct nvme_command) * (size_t)nsq->size);

		DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
		    qid, nsq->size, nsq->qbase, nsq->cqid);

		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

		DPRINTF("%s completed creating IOSQ qid %u",
		    __func__, qid);
	} else {
		/*
		 * Guest sent non-cont submission queue request.
		 * This setting is unsupported by this emulation.
		 */
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o submission queue", __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
	}
	return (1);
}
static int
nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint16_t qid = command->cdw10 & 0xffff;
	uint16_t sqid;

	DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
	if (qid == 0 || qid > sc->num_cqueues ||
	    (sc->compl_queues[qid].qbase == NULL)) {
		WPRINTF("%s queue index %u / num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	/* Deleting an Active CQ is an error */
	for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
		if (sc->submit_queues[sqid].cqid == qid) {
			pci_nvme_status_tc(&compl->status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_QUEUE_DELETION);
			return (1);
		}

	sc->compl_queues[qid].qbase = NULL;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_completion_queue *ncq;
	uint16_t qid = command->cdw10 & 0xffff;

	/* Only support Physically Contiguous queues */
	if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
		WPRINTF("%s unsupported non-contig (list-based) "
		    "create i/o completion queue",
		    __func__);

		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if ((qid == 0) || (qid > sc->num_cqueues) ||
	    (sc->compl_queues[qid].qbase != NULL)) {
		WPRINTF("%s queue index %u > num_cqueues %u",
		    __func__, qid, sc->num_cqueues);
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
		return (1);
	}

	ncq = &sc->compl_queues[qid];
	ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
	ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
	if (ncq->intr_vec > (sc->max_queues + 1)) {
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_INTERRUPT_VECTOR);
		return (1);
	}

	ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
	if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
		/*
		 * Queues must specify at least two entries
		 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
		 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
		 */
		pci_nvme_status_tc(&compl->status,
		    NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
		return (1);
	}
	ncq->head = ncq->tail = 0;
	ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
	    command->prp1,
	    sizeof(struct nvme_command) * (size_t)ncq->size);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	return (1);
}
static int
nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint64_t logoff;
	uint32_t logsize;
	uint8_t logpage = command->cdw10 & 0xFF;

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	/*
	 * Command specifies the number of dwords to return in fields NUMDU
	 * and NUMDL. This is a zero-based value.
	 */
	logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
	logsize *= sizeof(uint32_t);
	logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12;

	DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
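	/*
	 * Example: NUMD = 1023 (zero-based) requests 1024 dwords, so
	 * logsize = 1024 * 4 = 4096 bytes.
	 */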
	switch (logpage) {
	case NVME_LOG_ERROR:
		if (logoff >= sizeof(sc->err_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->err_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->err_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_HEALTH_INFORMATION:
		if (logoff >= sizeof(sc->health_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		pthread_mutex_lock(&sc->mtx);
		memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
		    sizeof(sc->health_log.data_units_read));
		memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
		    sizeof(sc->health_log.data_units_written));
		memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
		    sizeof(sc->health_log.host_read_commands));
		memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
		    sizeof(sc->health_log.host_write_commands));
		pthread_mutex_unlock(&sc->mtx);

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->health_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->health_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_FIRMWARE_SLOT:
		if (logoff >= sizeof(sc->fw_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->fw_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->fw_log)),
		    NVME_COPY_TO_PRP);
		break;
	case NVME_LOG_CHANGED_NAMESPACE:
		if (logoff >= sizeof(sc->ns_log)) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INVALID_FIELD);
			break;
		}

		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ns_log + logoff,
		    MIN(logsize - logoff, sizeof(sc->ns_log)),
		    NVME_COPY_TO_PRP);
		memset(&sc->ns_log, 0, sizeof(sc->ns_log));
		break;
	default:
		DPRINTF("%s get log page %x command not supported",
		    __func__, logpage);

		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_LOG_PAGE);
	}

	return (1);
}
static int
nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	void *dest;
	uint16_t status = 0;

	DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
	    command->cdw10 & 0xFF, command->nsid);

	pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	switch (command->cdw10 & 0xFF) {
	case 0x00: /* return Identify Namespace data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x01: /* return Identify Controller data structure */
		nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
		    command->prp2, (uint8_t *)&sc->ctrldata,
		    sizeof(sc->ctrldata),
		    NVME_COPY_TO_PRP);
		break;
	case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All unused entries shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);
		((uint32_t *)dest)[0] = 1;
		break;
	case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
		if (command->nsid != 1) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			break;
		}
		dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
		    sizeof(uint32_t) * 1024);
		/* All bytes after the descriptor shall be zero */
		bzero(dest, sizeof(uint32_t) * 1024);

		/* Return NIDT=1 (i.e. EUI64) descriptor */
		((uint8_t *)dest)[0] = 1;
		((uint8_t *)dest)[1] = sizeof(uint64_t);
		bcopy(sc->nsdata.eui64, ((uint8_t *)dest) + 4, sizeof(uint64_t));
		break;
	default:
		DPRINTF("%s unsupported identify command requested 0x%x",
		    __func__, command->cdw10 & 0xFF);
		pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
		break;
	}

	compl->status = status;
	return (1);
}
static const char *
nvme_fid_to_name(uint8_t fid)
{
	const char *name;

	switch (fid) {
	case NVME_FEAT_ARBITRATION:
		name = "Arbitration";
		break;
	case NVME_FEAT_POWER_MANAGEMENT:
		name = "Power Management";
		break;
	case NVME_FEAT_LBA_RANGE_TYPE:
		name = "LBA Range Type";
		break;
	case NVME_FEAT_TEMPERATURE_THRESHOLD:
		name = "Temperature Threshold";
		break;
	case NVME_FEAT_ERROR_RECOVERY:
		name = "Error Recovery";
		break;
	case NVME_FEAT_VOLATILE_WRITE_CACHE:
		name = "Volatile Write Cache";
		break;
	case NVME_FEAT_NUMBER_OF_QUEUES:
		name = "Number of Queues";
		break;
	case NVME_FEAT_INTERRUPT_COALESCING:
		name = "Interrupt Coalescing";
		break;
	case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
		name = "Interrupt Vector Configuration";
		break;
	case NVME_FEAT_WRITE_ATOMICITY:
		name = "Write Atomicity Normal";
		break;
	case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
		name = "Asynchronous Event Configuration";
		break;
	case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
		name = "Autonomous Power State Transition";
		break;
	case NVME_FEAT_HOST_MEMORY_BUFFER:
		name = "Host Memory Buffer";
		break;
	case NVME_FEAT_TIMESTAMP:
		name = "Timestamp";
		break;
	case NVME_FEAT_KEEP_ALIVE_TIMER:
		name = "Keep Alive Timer";
		break;
	case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
		name = "Host Controlled Thermal Management";
		break;
	case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
		name = "Non-Operation Power State Config";
		break;
	case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
		name = "Read Recovery Level Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
		name = "Predictable Latency Mode Config";
		break;
	case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
		name = "Predictable Latency Mode Window";
		break;
	case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
		name = "LBA Status Information Report Interval";
		break;
	case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
		name = "Host Behavior Support";
		break;
	case NVME_FEAT_SANITIZE_CONFIG:
		name = "Sanitize Config";
		break;
	case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
		name = "Endurance Group Event Configuration";
		break;
	case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
		name = "Software Progress Marker";
		break;
	case NVME_FEAT_HOST_IDENTIFIER:
		name = "Host Identifier";
		break;
	case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
		name = "Reservation Notification Mask";
		break;
	case NVME_FEAT_RESERVATION_PERSISTENCE:
		name = "Reservation Persistence";
		break;
	case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
		name = "Namespace Write Protection Config";
		break;
	default:
		name = "Unknown";
		break;
	}

	return (name);
}
static void
nvme_feature_invalid_cb(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
}
static void
nvme_feature_iv_config(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint32_t i;
	uint32_t cdw11 = command->cdw11;
	uint16_t iv;
	bool cd;

	pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);

	iv = cdw11 & 0xffff;
	cd = cdw11 & (1 << 16);

	if (iv > (sc->max_queues + 1)) {
		return;
	}

	/* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
	if ((iv == 0) && !cd)
		return;

	/* Requested Interrupt Vector must be used by a CQ */
	for (i = 0; i < sc->num_cqueues + 1; i++) {
		if (sc->compl_queues[i].intr_vec == iv) {
			pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
		}
	}
}
#define	NVME_TEMP_THRESH_OVER	0
#define	NVME_TEMP_THRESH_UNDER	1
static void
nvme_feature_temperature(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t	tmpth;	/* Temperature Threshold */
	uint8_t		tmpsel;	/* Threshold Temperature Select */
	uint8_t		thsel;	/* Threshold Type Select */
	bool		set_crit = false;

	tmpth  = command->cdw11 & 0xffff;
	tmpsel = (command->cdw11 >> 16) & 0xf;
	thsel  = (command->cdw11 >> 20) & 0x3;

	DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel,
	    thsel);

	/* Check for unsupported values */
	if (((tmpsel != 0) && (tmpsel != 0xf)) ||
	    (thsel > NVME_TEMP_THRESH_UNDER)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) ||
	    ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
		set_crit = true;

	pthread_mutex_lock(&sc->mtx);
	if (set_crit)
		sc->health_log.critical_warning |=
		    NVME_CRIT_WARN_ST_TEMPERATURE;
	else
		sc->health_log.critical_warning &=
		    ~NVME_CRIT_WARN_ST_TEMPERATURE;
	pthread_mutex_unlock(&sc->mtx);

	if (set_crit)
		pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
		    sc->health_log.critical_warning);

	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__,
	    set_crit ? 'T' : 'F', sc->health_log.critical_warning,
	    compl->status);
}
static void
nvme_feature_num_queues(struct pci_nvme_softc *sc,
    struct nvme_feature_obj *feat,
    struct nvme_command *command,
    struct nvme_completion *compl)
{
	uint16_t nqr;	/* Number of Queues Requested */

	if (sc->num_q_is_set) {
		WPRINTF("%s: Number of Queues already set", __func__);
		pci_nvme_status_genc(&compl->status,
		    NVME_SC_COMMAND_SEQUENCE_ERROR);
		return;
	}

	nqr = command->cdw11 & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_squeues = ONE_BASED(nqr);
	if (sc->num_squeues > sc->max_queues) {
		DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
		    sc->max_queues);
		sc->num_squeues = sc->max_queues;
	}

	nqr = (command->cdw11 >> 16) & 0xFFFF;
	if (nqr == 0xffff) {
		WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return;
	}

	sc->num_cqueues = ONE_BASED(nqr);
	if (sc->num_cqueues > sc->max_queues) {
		DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
		    sc->max_queues);
		sc->num_cqueues = sc->max_queues;
	}

	/* Patch the command value which will be saved on callback's return */
	command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
	compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);

	sc->num_q_is_set = true;
}
static int
nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
    struct nvme_completion *compl)
{
	struct nvme_feature_obj *feat;
	uint32_t nsid = command->nsid;
	uint8_t fid = command->cdw10 & 0xFF;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	feat = &sc->feat[fid];

	if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (!feat->namespace_specific &&
	    !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_FEATURE_NOT_NS_SPECIFIC);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	if (feat->set)
		feat->set(sc, feat, command, compl);

	DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status,
	    command->cdw11);
	if (compl->status == NVME_SC_SUCCESS) {
		feat->cdw11 = command->cdw11;
		if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
		    (command->cdw11 != 0))
			pci_nvme_aen_notify(sc);
	}

	return (0);
}
#define	NVME_FEATURES_SEL_SUPPORTED	0x3
#define	NVME_FEATURES_NS_SPECIFIC	(1 << 1)

static int
nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	struct nvme_feature_obj *feat;
	uint8_t fid = command->cdw10 & 0xFF;
	uint8_t sel = (command->cdw10 >> 8) & 0x7;

	DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));

	if (fid >= NVME_FID_MAX) {
		DPRINTF("%s invalid feature 0x%x", __func__, fid);
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	compl->cdw0 = 0;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);

	feat = &sc->feat[fid];
	if (feat->get)
		feat->get(sc, feat, command, compl);

	if (compl->status == NVME_SC_SUCCESS) {
		if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
			compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
		else
			compl->cdw0 = feat->cdw11;
	}

	return (0);
}
static int
nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{
	uint8_t	ses, lbaf, pi;

	/* Only supports Secure Erase Setting - User Data Erase */
	ses = (command->cdw10 >> 9) & 0x7;
	if (ses > 0x1) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	/* Only supports a single LBA Format */
	lbaf = command->cdw10 & 0xf;
	if (lbaf != 0) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_INVALID_FORMAT);
		return (1);
	}

	/* Doesn't support Protection Information */
	pi = (command->cdw10 >> 5) & 0x7;
	if (pi != 0) {
		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
		return (1);
	}

	if (sc->nvstore.type == NVME_STOR_RAM) {
		if (sc->nvstore.ctx)
			free(sc->nvstore.ctx);
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	} else {
		struct pci_nvme_ioreq *req;
		int err;

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			return (1);
		}
		req->nvme_sq = &sc->submit_queues[0];
		req->sqid = 0;
		req->opc = command->opc;
		req->cid = command->cid;
		req->nsid = command->nsid;

		req->io_req.br_offset = 0;
		req->io_req.br_resid = sc->nvstore.size;
		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_delete(sc->nvstore.ctx, &req->io_req);
		if (err) {
			pci_nvme_status_genc(&compl->status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			pci_nvme_release_ioreq(sc, req);
		} else
			compl->status = NVME_NO_STATUS;
	}

	return (1);
}
static int
nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command,
    struct nvme_completion* compl)
{

	DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
	    command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);

	/* TODO: search for the command ID and abort it */

	compl->cdw0 = 1;
	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
	return (1);
}
static int
nvme_opc_async_event_req(struct pci_nvme_softc* sc,
    struct nvme_command* command, struct nvme_completion* compl)
{

	DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
	    sc->aer_count, sc->ctrldata.aerl, command->cid);

	/* Don't exceed the Async Event Request Limit (AERL). */
	if (pci_nvme_aer_limit_reached(sc)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
		    NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
		return (1);
	}

	if (pci_nvme_aer_add(sc, command->cid)) {
		pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
		    NVME_SC_INTERNAL_DEVICE_ERROR);
		return (1);
	}

	/*
	 * Raise events when they happen based on the Set Features cmd.
	 * These events happen async, so only set completion successful if
	 * there is an event reflective of the request to get event.
	 */
	compl->status = NVME_NO_STATUS;
	pci_nvme_aen_notify(sc);

	return (0);
}
static void
pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
{
	struct nvme_completion compl;
	struct nvme_command *cmd;
	struct nvme_submission_queue *sq;
	struct nvme_completion_queue *cq;
	uint16_t sqhead;

	DPRINTF("%s index %u", __func__, (uint32_t)value);

	sq = &sc->submit_queues[0];
	cq = &sc->compl_queues[0];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		cmd = &(sq->qbase)[sqhead];
		compl.cdw0 = 0;
		compl.status = 0;

		switch (cmd->opc) {
		case NVME_OPC_DELETE_IO_SQ:
			DPRINTF("%s command DELETE_IO_SQ", __func__);
			nvme_opc_delete_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_SQ:
			DPRINTF("%s command CREATE_IO_SQ", __func__);
			nvme_opc_create_io_sq(sc, cmd, &compl);
			break;
		case NVME_OPC_DELETE_IO_CQ:
			DPRINTF("%s command DELETE_IO_CQ", __func__);
			nvme_opc_delete_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_CREATE_IO_CQ:
			DPRINTF("%s command CREATE_IO_CQ", __func__);
			nvme_opc_create_io_cq(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_LOG_PAGE:
			DPRINTF("%s command GET_LOG_PAGE", __func__);
			nvme_opc_get_log_page(sc, cmd, &compl);
			break;
		case NVME_OPC_IDENTIFY:
			DPRINTF("%s command IDENTIFY", __func__);
			nvme_opc_identify(sc, cmd, &compl);
			break;
		case NVME_OPC_ABORT:
			DPRINTF("%s command ABORT", __func__);
			nvme_opc_abort(sc, cmd, &compl);
			break;
		case NVME_OPC_SET_FEATURES:
			DPRINTF("%s command SET_FEATURES", __func__);
			nvme_opc_set_features(sc, cmd, &compl);
			break;
		case NVME_OPC_GET_FEATURES:
			DPRINTF("%s command GET_FEATURES", __func__);
			nvme_opc_get_features(sc, cmd, &compl);
			break;
		case NVME_OPC_FIRMWARE_ACTIVATE:
			DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
			pci_nvme_status_tc(&compl.status,
			    NVME_SCT_COMMAND_SPECIFIC,
			    NVME_SC_INVALID_FIRMWARE_SLOT);
			break;
		case NVME_OPC_ASYNC_EVENT_REQUEST:
			DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
			nvme_opc_async_event_req(sc, cmd, &compl);
			break;
		case NVME_OPC_FORMAT_NVM:
			DPRINTF("%s command FORMAT_NVM", __func__);
			if ((sc->ctrldata.oacs &
			    (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
				pci_nvme_status_genc(&compl.status,
				    NVME_SC_INVALID_OPCODE);
				break;
			}
			nvme_opc_format_nvm(sc, cmd, &compl);
			break;
		case NVME_OPC_SECURITY_SEND:
		case NVME_OPC_SECURITY_RECEIVE:
		case NVME_OPC_SANITIZE:
		case NVME_OPC_GET_LBA_STATUS:
			DPRINTF("%s command OPC=%#x (unsupported)", __func__,
			    cmd->opc);
			/* Valid but unsupported opcodes */
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
			break;
		default:
			DPRINTF("%s command OPC=%#X (not implemented)",
			    __func__,
			    cmd->opc);
			pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
		}
		sqhead = (sqhead + 1) % sq->size;

		if (NVME_COMPLETION_VALID(compl)) {
			pci_nvme_cq_update(sc, &sc->compl_queues[0],
			    compl.cdw0,
			    cmd->cid,
			    0,		/* SQID */
			    compl.status);
		}
	}

	DPRINTF("setting sqhead %u", sqhead);
	sq->head = sqhead;

	if (cq->head != cq->tail)
		pci_generate_msix(sc->nsc_pi, 0);

	pthread_mutex_unlock(&sq->mtx);
}
/*
 * Update the Write and Read statistics reported in SMART data
 *
 * NVMe defines "data unit" as thousands of 512 byte blocks and is rounded up.
 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
 * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
 */
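/*
 * Example: a single 4 KiB write adds 8 blocks to the remainder;
 * 999 + 8 = 1007, so one data unit is counted and 7 remains.
 */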
static void
pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
    size_t bytes, uint16_t status)
{

	pthread_mutex_lock(&sc->mtx);
	switch (opc) {
	case NVME_OPC_WRITE:
		sc->write_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->write_dunits_remainder += (bytes / 512);
		while (sc->write_dunits_remainder >= 1000) {
			sc->write_data_units++;
			sc->write_dunits_remainder -= 1000;
		}
		break;
	case NVME_OPC_READ:
		sc->read_commands++;
		if (status != NVME_SC_SUCCESS)
			break;
		sc->read_dunits_remainder += (bytes / 512);
		while (sc->read_dunits_remainder >= 1000) {
			sc->read_data_units++;
			sc->read_dunits_remainder -= 1000;
		}
		break;
	default:
		DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
		break;
	}
	pthread_mutex_unlock(&sc->mtx);
}
/*
 * Check if the combination of Starting LBA (slba) and Number of Logical
 * Blocks (nlb) exceeds the range of the underlying storage.
 *
 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
 * overflows.
 */
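/*
 * Example: with 512 byte sectors (sectsz_bits = 9), slba = 2^56 would
 * overflow slba << 9; the shift check below rejects it before shifting.
 */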
static bool
pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
    uint32_t nlb)
{
	size_t	offset, bytes;

	/* Overflow check of multiplying Starting LBA by the sector size */
	if (slba >> (64 - nvstore->sectsz_bits))
		return (true);

	offset = slba << nvstore->sectsz_bits;
	bytes = nlb << nvstore->sectsz_bits;

	/* Overflow check of Number of Logical Blocks */
	if ((nvstore->size - offset) < bytes)
		return (true);

	return (false);
}
static int
pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req,
    uint64_t gpaddr, size_t size, int do_write, uint64_t lba)
{
	int iovidx;

	if (req == NULL)
		return (-1);

	if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
		return (-1);
	}

	/* concatenate contig block-iovs to minimize number of iovs */
	if ((req->prev_gpaddr + req->prev_size) == gpaddr) {
		iovidx = req->io_req.br_iovcnt - 1;

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    req->prev_gpaddr, size);

		req->prev_size += size;
		req->io_req.br_resid += size;

		req->io_req.br_iov[iovidx].iov_len = req->prev_size;
	} else {
		iovidx = req->io_req.br_iovcnt;
		if (iovidx == 0) {
			req->io_req.br_offset = lba;
			req->io_req.br_resid = 0;
			req->io_req.br_param = req;
		}

		req->io_req.br_iov[iovidx].iov_base =
		    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
		    gpaddr, size);

		req->io_req.br_iov[iovidx].iov_len = size;

		req->prev_gpaddr = gpaddr;
		req->prev_size = size;
		req->io_req.br_resid += size;

		req->io_req.br_iovcnt++;
	}

	return (0);
}
static void
pci_nvme_set_completion(struct pci_nvme_softc *sc,
    struct nvme_submission_queue *sq, int sqid, uint16_t cid,
    uint32_t cdw0, uint16_t status)
{
	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];

	DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
	    NVME_STATUS_GET_SC(status));

	pci_nvme_cq_update(sc, cq, cdw0, cid, sqid, status);

	if (cq->head != cq->tail) {
		if (cq->intr_en & NVME_CQ_INTEN) {
			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
		} else {
			DPRINTF("%s: CQ%u interrupt disabled",
			    __func__, sq->cqid);
		}
	}
}
static void
pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
{
	req->sc = NULL;
	req->nvme_sq = NULL;
	req->sqid = 0;

	pthread_mutex_lock(&sc->mtx);

	STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
	sc->pending_ios--;

	/* when no more IO pending, can set to ready if device reset/enabled */
	if (sc->pending_ios == 0 &&
	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
		sc->regs.csts |= NVME_CSTS_RDY;

	pthread_mutex_unlock(&sc->mtx);

	sem_post(&sc->iosemlock);
}
static struct pci_nvme_ioreq *
pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
{
	struct pci_nvme_ioreq *req = NULL;

	sem_wait(&sc->iosemlock);
	pthread_mutex_lock(&sc->mtx);

	req = STAILQ_FIRST(&sc->ioreqs_free);
	assert(req != NULL);
	STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);

	req->sc = sc;

	sc->pending_ios++;

	pthread_mutex_unlock(&sc->mtx);

	req->io_req.br_iovcnt = 0;
	req->io_req.br_offset = 0;
	req->io_req.br_resid = 0;
	req->io_req.br_param = req;
	req->prev_gpaddr = 0;
	req->prev_size = 0;

	return (req);
}
static void
pci_nvme_io_done(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct nvme_submission_queue *sq = req->nvme_sq;
	uint16_t code, status = 0;

	DPRINTF("%s error %d %s", __func__, err, strerror(err));

	/* TODO return correct error */
	code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
	pci_nvme_status_genc(&status, code);

	pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status);
	pci_nvme_stats_write_read_update(req->sc, req->opc,
	    req->bytes, status);
	pci_nvme_release_ioreq(req->sc, req);
}
/*
 * Implements the Flush command. The specification states:
 *    If a volatile write cache is not present, Flush commands complete
 *    successfully and have no effect
 * in the description of the Volatile Write Cache (VWC) field of the Identify
 * Controller data. Therefore, set status to Success if the command is
 * not supported (i.e. RAM or as indicated by the blockif).
 */
static bool
nvme_opc_flush(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	bool pending = false;

	if (nvstore->type == NVME_STOR_RAM) {
		pci_nvme_status_genc(status, NVME_SC_SUCCESS);
	} else {
		int err;

		req->io_req.br_callback = pci_nvme_io_done;

		err = blockif_flush(nvstore->ctx, &req->io_req);
		switch (err) {
		case 0:
			pending = true;
			break;
		case EOPNOTSUPP:
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			break;
		default:
			pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		}
	}

	return (pending);
}
static uint16_t
nvme_write_read_ram(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint8_t *buf = nvstore->ctx;
	enum nvme_copy_dir dir;
	uint16_t status;

	/* a guest Read fills the PRPs; a guest Write drains them */
	if (!is_write)
		dir = NVME_COPY_TO_PRP;
	else
		dir = NVME_COPY_FROM_PRP;

	if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
	    buf + offset, bytes, dir))
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
	else
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);

	return (status);
}
static uint16_t
nvme_write_read_blockif(struct pci_nvme_softc *sc,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint64_t prp1, uint64_t prp2,
    size_t offset, uint64_t bytes,
    bool is_write)
{
	uint64_t size;
	int err;
	uint16_t status = NVME_NO_STATUS;

	size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
	if (pci_nvme_append_iov_req(sc, req, prp1,
	    size, is_write, offset)) {
		pci_nvme_status_genc(&status,
		    NVME_SC_DATA_TRANSFER_ERROR);
		goto out;
	}

	offset += size;
	bytes  -= size;

	if (bytes == 0) {
		;
	} else if (bytes <= PAGE_SIZE) {
		size = bytes;
		if (pci_nvme_append_iov_req(sc, req, prp2,
		    size, is_write, offset)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_DATA_TRANSFER_ERROR);
			goto out;
		}
	} else {
		void *vmctx = sc->nsc_pi->pi_vmctx;
		uint64_t *prp_list = &prp2;
		uint64_t *last = prp_list;

		/* PRP2 is pointer to a physical region page list */
		while (bytes) {
			/* Last entry in list points to the next list */
			if ((prp_list == last) && (bytes > PAGE_SIZE)) {
				uint64_t prp = *prp_list;

				prp_list = paddr_guest2host(vmctx, prp,
				    PAGE_SIZE - (prp % PAGE_SIZE));
				last = prp_list + (NVME_PRP2_ITEMS - 1);
			}

			size = MIN(bytes, PAGE_SIZE);

			if (pci_nvme_append_iov_req(sc, req, *prp_list,
			    size, is_write, offset)) {
				pci_nvme_status_genc(&status,
				    NVME_SC_DATA_TRANSFER_ERROR);
				goto out;
			}

			offset += size;
			bytes  -= size;

			prp_list++;
		}
	}
	req->io_req.br_callback = pci_nvme_io_done;
	if (is_write)
		err = blockif_write(nvstore->ctx, &req->io_req);
	else
		err = blockif_read(nvstore->ctx, &req->io_req);

	if (err)
		pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
out:
	return (status);
}
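/*
 * Illustrative sketch (not part of the emulation): how the loop above
 * splits a transfer across PRP entries. Assuming 4 KiB pages, a 12 KiB
 * (0x3000 byte) transfer with a hypothetical PRP1 of 0x10200 becomes:
 *
 *	chunk 0: gpaddr 0x10200, len 0x0e00	(PRP1, up to end of page)
 *	chunk 1: list[0],        len 0x1000	(PRP2 names a PRP list)
 *	chunk 2: list[1],        len 0x1000
 *	chunk 3: list[2],        len 0x0200
 *
 * Only the first chunk may be misaligned, which is what
 * MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes) encodes; every entry in a
 * PRP list is page aligned.
 */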
static bool
nvme_opc_write_read(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	uint64_t lba, nblocks, bytes;
	size_t offset;
	bool is_write = cmd->opc == NVME_OPC_WRITE;
	bool pending = false;

	lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
	nblocks = (cmd->cdw12 & 0xFFFF) + 1;

	if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
		WPRINTF("%s command would exceed LBA range", __func__);
		pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
		goto out;
	}

	bytes = nblocks << nvstore->sectsz_bits;
	if (bytes > NVME_MAX_DATA_SIZE) {
		WPRINTF("%s command would exceed MDTS", __func__);
		pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
		goto out;
	}

	offset = lba << nvstore->sectsz_bits;

	req->bytes = bytes;
	req->io_req.br_offset = lba;

	/* PRP bits 1:0 must be zero */
	cmd->prp1 &= ~0x3UL;
	cmd->prp2 &= ~0x3UL;

	if (nvstore->type == NVME_STOR_RAM) {
		*status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
		    cmd->prp2, offset, bytes, is_write);
	} else {
		*status = nvme_write_read_blockif(sc, nvstore, req,
		    cmd->prp1, cmd->prp2, offset, bytes, is_write);

		if (*status == NVME_NO_STATUS)
			pending = true;
	}
out:
	if (!pending)
		pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);

	return (pending);
}
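/*
 * Illustrative sketch (not part of the emulation): decoding the LBA fields
 * above from hypothetical command dwords cdw10 = 0x00001000,
 * cdw11 = 0x00000001, cdw12 = 0x00000007:
 *
 *	lba     = ((uint64_t)0x00000001 << 32) | 0x00001000 = 0x100001000
 *	nblocks = (0x00000007 & 0xFFFF) + 1                 = 8
 *
 * NLB is a zero-based value, so cdw12 = 0 still transfers one block, and
 * with 512-byte sectors (sectsz_bits = 9) the example is a 4 KiB transfer
 * starting at byte offset 0x20000200000.
 */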
static void
pci_nvme_dealloc_sm(struct blockif_req *br, int err)
{
	struct pci_nvme_ioreq *req = br->br_param;
	struct pci_nvme_softc *sc = req->sc;
	bool done = true;
	uint16_t status;

	if (err) {
		pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
	} else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
		pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
	} else {
		struct iovec *iov = req->io_req.br_iov;

		req->prev_gpaddr++;
		iov += req->prev_gpaddr;

		/* The iov_* values already include the sector size */
		req->io_req.br_offset = (off_t)iov->iov_base;
		req->io_req.br_resid = iov->iov_len;
		if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		} else
			done = false;
	}

	if (done) {
		pci_nvme_set_completion(sc, req->nvme_sq, req->sqid,
		    req->cid, 0, status);
		pci_nvme_release_ioreq(sc, req);
	}
}
static bool
nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
    struct nvme_command *cmd,
    struct pci_nvme_blockstore *nvstore,
    struct pci_nvme_ioreq *req,
    uint16_t *status)
{
	struct nvme_dsm_range *range = NULL;
	uint32_t nr, r, non_zero, dr;
	int err;
	bool pending = false;

	if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
		pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
		goto out;
	}

	nr = cmd->cdw10 & 0xff;

	/* copy locally because a range entry could straddle PRPs */
	range = calloc(1, NVME_MAX_DSM_TRIM);
	if (range == NULL) {
		pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
		goto out;
	}
	nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
	    (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);

	/* Check for invalid ranges and the number of non-zero lengths */
	non_zero = 0;
	for (r = 0; r <= nr; r++) {
		if (pci_nvme_out_of_range(nvstore,
		    range[r].starting_lba, range[r].length)) {
			pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
			goto out;
		}
		if (range[r].length != 0)
			non_zero++;
	}

	if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
		size_t offset, bytes;
		int sectsz_bits = sc->nvstore.sectsz_bits;

		/*
		 * DSM calls are advisory only, and compliant controllers
		 * may choose to take no actions (i.e. return Success).
		 */
		if (!nvstore->deallocate) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		/* If all ranges have a zero length, return Success */
		if (non_zero == 0) {
			pci_nvme_status_genc(status, NVME_SC_SUCCESS);
			goto out;
		}

		if (req == NULL) {
			pci_nvme_status_genc(status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			goto out;
		}

		offset = range[0].starting_lba << sectsz_bits;
		bytes = range[0].length << sectsz_bits;

		/*
		 * If the request is for more than a single range, store
		 * the ranges in the br_iov. Optimize for the common case
		 * of a single range.
		 *
		 * Note that NVMe Number of Ranges is a zero based value
		 */
		req->io_req.br_iovcnt = 0;
		req->io_req.br_offset = offset;
		req->io_req.br_resid = bytes;

		if (nr == 0) {
			req->io_req.br_callback = pci_nvme_io_done;
		} else {
			struct iovec *iov = req->io_req.br_iov;

			for (r = 0, dr = 0; r <= nr; r++) {
				offset = range[r].starting_lba << sectsz_bits;
				bytes = range[r].length << sectsz_bits;
				if (bytes == 0)
					continue;

				if ((nvstore->size - offset) < bytes) {
					pci_nvme_status_genc(status,
					    NVME_SC_LBA_OUT_OF_RANGE);
					goto out;
				}
				iov[dr].iov_base = (void *)offset;
				iov[dr].iov_len = bytes;
				dr++;
			}
			req->io_req.br_callback = pci_nvme_dealloc_sm;

			/*
			 * Use prev_gpaddr to track the current entry and
			 * prev_size to track the number of entries
			 */
			req->prev_gpaddr = 0;
			req->prev_size = dr;
		}

		err = blockif_delete(nvstore->ctx, &req->io_req);
		if (err)
			pci_nvme_status_genc(status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
		else
			pending = true;
	}
out:
	free(range);
	return (pending);
}
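/*
 * Illustrative sketch (not part of the emulation): the guest's DSM payload
 * is an array of 16-byte range descriptors (struct nvme_dsm_range from
 * <dev/nvme/nvme.h>):
 *
 *	struct nvme_dsm_range {
 *		uint32_t attributes;
 *		uint32_t length;	// in logical blocks, may be zero
 *		uint64_t starting_lba;
 *	};
 *
 * With cdw10 = 1 (NR is zero-based, so two ranges), a guest might pass
 * { .length = 8, .starting_lba = 0 } and { .length = 16, .starting_lba = 64 };
 * the code above turns those into two br_iov entries and steps through them
 * one blockif_delete() at a time via pci_nvme_dealloc_sm().
 */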
static void
pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
{
	struct nvme_submission_queue *sq;
	uint16_t status;
	uint16_t sqhead;

	/* handle all submissions up to sq->tail index */
	sq = &sc->submit_queues[idx];

	pthread_mutex_lock(&sq->mtx);

	sqhead = sq->head;
	DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
	    idx, sqhead, sq->tail, sq->qbase);

	while (sqhead != atomic_load_acq_short(&sq->tail)) {
		struct nvme_command *cmd;
		struct pci_nvme_ioreq *req;
		uint32_t nsid;
		bool pending;

		pending = false;
		req = NULL;
		status = 0;

		cmd = &sq->qbase[sqhead];
		sqhead = (sqhead + 1) % sq->size;

		nsid = le32toh(cmd->nsid);
		if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
			status |=
			    NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
			goto complete;
		}

		req = pci_nvme_get_ioreq(sc);
		if (req == NULL) {
			pci_nvme_status_genc(&status,
			    NVME_SC_INTERNAL_DEVICE_ERROR);
			WPRINTF("%s: unable to allocate IO req", __func__);
			goto complete;
		}
		req->nvme_sq = sq;
		req->sqid = idx;
		req->opc = cmd->opc;
		req->cid = cmd->cid;
		req->nsid = cmd->nsid;

		switch (cmd->opc) {
		case NVME_OPC_FLUSH:
			pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE:
		case NVME_OPC_READ:
			pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		case NVME_OPC_WRITE_ZEROES:
			/* TODO: write zeroes
			WPRINTF("%s write zeroes lba 0x%lx blocks %u",
			        __func__, lba, cmd->cdw12 & 0xFFFF); */
			pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
			break;
		case NVME_OPC_DATASET_MANAGEMENT:
			pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
			    req, &status);
			break;
		default:
			WPRINTF("%s unhandled io command 0x%x",
			    __func__, cmd->opc);
			pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
		}
complete:
		if (!pending) {
			pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0,
			    status);
			if (req != NULL)
				pci_nvme_release_ioreq(sc, req);
		}
	}

	sq->head = sqhead;

	pthread_mutex_unlock(&sq->mtx);
}
static void
pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t idx, int is_sq, uint64_t value)
{
	DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
	    idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);

	if (is_sq) {
		if (idx > sc->num_squeues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_squeues);
			return;
		}

		atomic_store_short(&sc->submit_queues[idx].tail,
		    (uint16_t)value);

		if (idx == 0) {
			pci_nvme_handle_admin_cmd(sc, value);
		} else {
			/* submission queue; handle new entries in SQ */
			if (idx > sc->num_squeues) {
				WPRINTF("%s SQ index %lu overflow from "
				    "guest (max %u)",
				    __func__, idx, sc->num_squeues);
				return;
			}
			pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
		}
	} else {
		if (idx > sc->num_cqueues) {
			WPRINTF("%s queue index %lu overflow from "
			    "guest (max %u)",
			    __func__, idx, sc->num_cqueues);
			return;
		}

		atomic_store_short(&sc->compl_queues[idx].head,
		    (uint16_t)value);
	}
}
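/*
 * Illustrative sketch (not part of the emulation): doorbells live at BAR0
 * offset NVME_DOORBELL_OFFSET (0x1000 here) as pairs of 32-bit registers,
 * SQ tail first, CQ head second. The caller, pci_nvme_write_bar_0() below,
 * decodes a hypothetical 4-byte guest write to BAR0 offset 0x1014 as:
 *
 *	belloffset = 0x1014 - 0x1000 = 0x14
 *	idx        = 0x14 / 8        = 2	// queue pair 2
 *	is_sq      = (0x14 % 8) < 4  = false	// CQ 2 head doorbell
 *
 * Queue pair 0 is the admin queue, which is why idx == 0 above is routed
 * to pci_nvme_handle_admin_cmd().
 */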
static void
pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
{
	const char *s = iswrite ? "WRITE" : "READ";

	switch (offset) {
	case NVME_CR_CAP_LOW:
		DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
		break;
	case NVME_CR_CAP_HI:
		DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
		break;
	case NVME_CR_VS:
		DPRINTF("%s %s NVME_CR_VS", func, s);
		break;
	case NVME_CR_INTMS:
		DPRINTF("%s %s NVME_CR_INTMS", func, s);
		break;
	case NVME_CR_INTMC:
		DPRINTF("%s %s NVME_CR_INTMC", func, s);
		break;
	case NVME_CR_CC:
		DPRINTF("%s %s NVME_CR_CC", func, s);
		break;
	case NVME_CR_CSTS:
		DPRINTF("%s %s NVME_CR_CSTS", func, s);
		break;
	case NVME_CR_NSSR:
		DPRINTF("%s %s NVME_CR_NSSR", func, s);
		break;
	case NVME_CR_AQA:
		DPRINTF("%s %s NVME_CR_AQA", func, s);
		break;
	case NVME_CR_ASQ_LOW:
		DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
		break;
	case NVME_CR_ASQ_HI:
		DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
		break;
	case NVME_CR_ACQ_LOW:
		DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
		break;
	case NVME_CR_ACQ_HI:
		DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
		break;
	default:
		DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
	}
}
static void
pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
	uint64_t offset, int size, uint64_t value)
{
	uint32_t ccreg;

	if (offset >= NVME_DOORBELL_OFFSET) {
		uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
		uint64_t idx = belloffset / 8; /* door bell size = 2*int */
		int is_sq = (belloffset % 8) < 4;

		if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
			WPRINTF("guest attempted an overflow write offset "
			    "0x%lx, val 0x%lx in %s",
			    offset, value, __func__);
			return;
		}

		pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value);
		return;
	}

	DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
	    offset, size, value);

	if (size != 4) {
		WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
		    "val 0x%lx) to bar0 in %s",
		    size, offset, value, __func__);
		/* TODO: shutdown device */
		return;
	}

	pci_nvme_bar0_reg_dumps(__func__, offset, 1);

	pthread_mutex_lock(&sc->mtx);

	switch (offset) {
	case NVME_CR_CAP_LOW:
	case NVME_CR_CAP_HI:
	case NVME_CR_VS:
		/* ignore writes to read only registers */
		break;
	case NVME_CR_INTMS:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_INTMC:
		/* MSI-X, so ignore */
		break;
	case NVME_CR_CC:
		ccreg = (uint32_t)value;

		DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
		    "iocqes %u",
		    __func__,
		    NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
		    NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
		    NVME_CC_GET_IOCQES(ccreg));

		if (NVME_CC_GET_SHN(ccreg)) {
			/* perform shutdown - flush out data to backend */
			sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
			    NVME_CSTS_REG_SHST_SHIFT);
			sc->regs.csts |= NVME_SHST_COMPLETE <<
			    NVME_CSTS_REG_SHST_SHIFT;
		}
		if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
			if (NVME_CC_GET_EN(ccreg) == 0)
				/* transition 1->0 causes controller reset */
				pci_nvme_reset_locked(sc);
			else
				pci_nvme_init_controller(ctx, sc);
		}

		/* Insert the iocqes, iosqes and en bits from the write */
		sc->regs.cc &= ~NVME_CC_WRITE_MASK;
		sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
		if (NVME_CC_GET_EN(ccreg) == 0) {
			/* Insert the ams, mps and css bit fields */
			sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
			sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
			sc->regs.csts &= ~NVME_CSTS_RDY;
		} else if (sc->pending_ios == 0) {
			sc->regs.csts |= NVME_CSTS_RDY;
		}
		break;
	case NVME_CR_CSTS:
		break;
	case NVME_CR_NSSR:
		/* ignore writes; don't support subsystem reset */
		break;
	case NVME_CR_AQA:
		sc->regs.aqa = (uint32_t)value;
		break;
	case NVME_CR_ASQ_LOW:
		sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ASQ_HI:
		sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	case NVME_CR_ACQ_LOW:
		sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
		    (0xFFFFF000 & value);
		break;
	case NVME_CR_ACQ_HI:
		sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
		    (value << 32);
		break;
	default:
		DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
		    __func__, offset, value, size);
	}
	pthread_mutex_unlock(&sc->mtx);
}
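/*
 * Illustrative sketch (not part of the emulation): the register sequence a
 * guest driver typically performs against the handler above to bring the
 * controller up. write32()/write64()/read32() are hypothetical MMIO
 * helpers, and asq/acq are hypothetical guest-physical queue addresses:
 *
 *	write32(NVME_CR_AQA, (acqs - 1) << 16 | (asqs - 1)); // 0-based sizes
 *	write64(NVME_CR_ASQ_LOW, asq);	// admin SQ base
 *	write64(NVME_CR_ACQ_LOW, acq);	// admin CQ base
 *	write32(NVME_CR_CC, cc | 1);	// EN 0->1 => pci_nvme_init_controller()
 *	while (!(read32(NVME_CR_CSTS) & NVME_CSTS_RDY))
 *		;			// wait for RDY
 *
 * Clearing EN (1->0) instead triggers pci_nvme_reset_locked(), and setting
 * SHN latches Shutdown Complete into CSTS, matching the cases above.
 */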
static void
pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
    int baridx, uint64_t offset, int size, uint64_t value)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
		    " value 0x%lx", baridx, offset, size, value);

		pci_emul_msix_twrite(pi, offset, size, value);
		return;
	}

	switch (baridx) {
	case 0:
		pci_nvme_write_bar_0(ctx, sc, offset, size, value);
		break;

	default:
		DPRINTF("%s unknown baridx %d, val 0x%lx",
		    __func__, baridx, value);
	}
}
static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
	uint64_t offset, int size)
{
	uint64_t value;

	pci_nvme_bar0_reg_dumps(__func__, offset, 0);

	if (offset < NVME_DOORBELL_OFFSET) {
		void *p = &(sc->regs);
		pthread_mutex_lock(&sc->mtx);
		memcpy(&value, (void *)((uintptr_t)p + offset), size);
		pthread_mutex_unlock(&sc->mtx);
	} else {
		value = 0;
		WPRINTF("pci_nvme: read invalid offset %ld", offset);
	}

	switch (size) {
	case 1:
		value &= 0xFF;
		break;
	case 2:
		value &= 0xFFFF;
		break;
	case 4:
		value &= 0xFFFFFFFF;
		break;
	}

	DPRINTF("   nvme-read offset 0x%lx, size %d -> value 0x%x",
	    offset, size, (uint32_t)value);

	return (value);
}
static uint64_t
pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
    uint64_t offset, int size)
{
	struct pci_nvme_softc* sc = pi->pi_arg;

	if (baridx == pci_msix_table_bar(pi) ||
	    baridx == pci_msix_pba_bar(pi)) {
		DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
		    baridx, offset, size);

		return pci_emul_msix_tread(pi, offset, size);
	}

	switch (baridx) {
	case 0:
		return pci_nvme_read_bar_0(sc, offset, size);

	default:
		DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
	}

	return (0);
}
static int
pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
{
	char bident[sizeof("XX:X:X")];
	const char *value;
	uint32_t sectsz;

	sc->max_queues = NVME_QUEUES;
	sc->max_qentries = NVME_MAX_QENTRIES;
	sc->ioslots = NVME_IOSLOTS;
	sc->num_squeues = sc->max_queues;
	sc->num_cqueues = sc->max_queues;
	sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
	sectsz = 0;
	snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
	    "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);

	value = get_config_value_node(nvl, "maxq");
	if (value != NULL)
		sc->max_queues = atoi(value);
	value = get_config_value_node(nvl, "qsz");
	if (value != NULL) {
		sc->max_qentries = atoi(value);
		if (sc->max_qentries <= 0) {
			EPRINTLN("nvme: Invalid qsz option %d",
			    sc->max_qentries);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "ioslots");
	if (value != NULL) {
		sc->ioslots = atoi(value);
		if (sc->ioslots <= 0) {
			EPRINTLN("Invalid ioslots option %d", sc->ioslots);
			return (-1);
		}
	}
	value = get_config_value_node(nvl, "sectsz");
	if (value != NULL)
		sectsz = atoi(value);
	value = get_config_value_node(nvl, "ser");
	if (value != NULL) {
		/*
		 * This field indicates the Product Serial Number in
		 * 7-bit ASCII, unused bytes should be space characters.
		 */
		cpywithpad((char *)sc->ctrldata.sn,
		    sizeof(sc->ctrldata.sn), value, ' ');
	}
	value = get_config_value_node(nvl, "eui64");
	if (value != NULL)
		sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
	value = get_config_value_node(nvl, "dsm");
	if (value != NULL) {
		if (strcmp(value, "auto") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
		else if (strcmp(value, "enable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
		else if (strcmp(value, "disable") == 0)
			sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
	}

	value = get_config_value_node(nvl, "ram");
	if (value != NULL) {
		uint64_t sz = strtoull(value, NULL, 10);

		sc->nvstore.type = NVME_STOR_RAM;
		sc->nvstore.size = sz * 1024 * 1024;
		sc->nvstore.ctx = calloc(1, sc->nvstore.size);
		sc->nvstore.sectsz = 4096;
		sc->nvstore.sectsz_bits = 12;
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Unable to allocate RAM");
			return (-1);
		}
	} else {
		snprintf(bident, sizeof(bident), "%d:%d",
		    sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
		sc->nvstore.ctx = blockif_open(nvl, bident);
		if (sc->nvstore.ctx == NULL) {
			EPRINTLN("nvme: Could not open backing file: %s",
			    strerror(errno));
			return (-1);
		}
		sc->nvstore.type = NVME_STOR_BLOCKIF;
		sc->nvstore.size = blockif_size(sc->nvstore.ctx);
	}

	if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
		sc->nvstore.sectsz = sectsz;
	else if (sc->nvstore.type != NVME_STOR_RAM)
		sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
	for (sc->nvstore.sectsz_bits = 9;
	     (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
	     sc->nvstore.sectsz_bits++);

	if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
		sc->max_queues = NVME_QUEUES;

	return (0);
}
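/*
 * Illustrative sketch (not part of the emulation): two ways the options
 * parsed above typically appear on a bhyve command line, assuming a
 * hypothetical backing volume /dev/zvol/tank/vm0:
 *
 *	bhyve ... -s 4,nvme,/dev/zvol/tank/vm0,ser=BHYVE0001,dsm=enable ...
 *	bhyve ... -s 4,nvme,ram=1024 ...	// 1 GiB RAM-backed namespace
 *
 * A ram= namespace always reports 4096-byte sectors; with a blockif
 * backing store, sectsz= may override the backing store's sector size,
 * but only the values 512, 4096, and 8192 are accepted above.
 */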
static void
pci_nvme_resized(struct blockif_ctxt *bctxt, void *arg, size_t new_size)
{
	struct pci_nvme_softc *sc;
	struct pci_nvme_blockstore *nvstore;
	struct nvme_namespace_data *nd;

	sc = arg;
	nvstore = &sc->nvstore;
	nd = &sc->nsdata;

	nvstore->size = new_size;
	pci_nvme_init_nsdata_size(nvstore, nd);

	/* Add changed NSID to list */
	sc->ns_log.ns[0] = 1;
	sc->ns_log.ns[1] = 0;

	pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
	    PCI_NVME_AE_INFO_NS_ATTR_CHANGED);
}
static int
pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, nvlist_t *nvl)
{
	struct pci_nvme_softc *sc;
	uint32_t pci_membar_sz;
	int error;

	error = 0;

	sc = calloc(1, sizeof(struct pci_nvme_softc));
	pi->pi_arg = sc;
	sc->nsc_pi = pi;

	error = pci_nvme_parse_config(sc, nvl);
	if (error < 0)
		goto done;
	else
		error = 0;

	STAILQ_INIT(&sc->ioreqs_free);
	sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
	for (int i = 0; i < sc->ioslots; i++) {
		STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
	}

	pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
	pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
	pci_set_cfgdata8(pi, PCIR_PROGIF,
	    PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);

	/*
	 * Allocate size of NVMe registers + doorbell space for all queues.
	 *
	 * The specification requires a minimum memory I/O window size of 16K.
	 * The Windows driver will refuse to start a device with a smaller
	 * size.
	 */
	pci_membar_sz = sizeof(struct nvme_registers) +
	    2 * sizeof(uint32_t) * (sc->max_queues + 1);
	pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);

	DPRINTF("nvme membar size: %u", pci_membar_sz);

	error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
	if (error) {
		WPRINTF("%s pci alloc mem bar failed", __func__);
		goto done;
	}

	error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
	if (error) {
		WPRINTF("%s pci add msixcap failed", __func__);
		goto done;
	}

	error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
	if (error) {
		WPRINTF("%s pci add Express capability failed", __func__);
		goto done;
	}

	pthread_mutex_init(&sc->mtx, NULL);
	sem_init(&sc->iosemlock, 0, sc->ioslots);
	blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);

	pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
	/*
	 * Controller data depends on Namespace data so initialize Namespace
	 * data first.
	 */
	pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
	pci_nvme_init_ctrldata(sc);
	pci_nvme_init_logpages(sc);
	pci_nvme_init_features(sc);

	pci_nvme_aer_init(sc);
	pci_nvme_aen_init(sc);

	pci_nvme_reset(sc);

	pci_lintr_request(pi);

done:
	return (error);
}
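/*
 * Illustrative sketch (not part of the emulation): the BAR0 sizing
 * arithmetic above, for the default max_queues of 16, is roughly the
 * register file (about 4 KiB, including the admin doorbells) plus
 * 2 * sizeof(uint32_t) * (16 + 1) = 136 bytes of I/O doorbells. That is
 * well under 16 KiB, so NVME_MMIO_SPACE_MIN wins and the BAR is sized at
 * 16 KiB, satisfying both the spec minimum and the Windows driver noted
 * above.
 */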
static int
pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *ram;

	if (opts == NULL)
		return (blockif_legacy_config(nvl, opts));

	if (strncmp(opts, "ram=", 4) == 0) {
		cp = strchr(opts, ',');
		if (cp == NULL) {
			set_config_value_node(nvl, "ram", opts + 4);
			return (0);
		}
		ram = strndup(opts + 4, cp - opts - 4);
		set_config_value_node(nvl, "ram", ram);
		free(ram);
		return (pci_parse_legacy_config(nvl, cp + 1));
	} else
		return (blockif_legacy_config(nvl, opts));
}
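/*
 * Illustrative sketch (not part of the emulation): with the hypothetical
 * legacy option string "ram=1024,maxq=8,qsz=256", the code above stores
 * "1024" under the "ram" key and hands the remainder, "maxq=8,qsz=256",
 * to pci_parse_legacy_config() to be split into individual nvlist values;
 * any string not starting with "ram=" is treated as a blockif
 * specification.
 */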
struct pci_devemu pci_de_nvme = {
	.pe_emu =	"nvme",
	.pe_init =	pci_nvme_init,
	.pe_legacy_config = pci_nvme_legacy_config,
	.pe_barwrite =	pci_nvme_write,
	.pe_barread =	pci_nvme_read
};
PCI_EMUL_SET(pci_de_nvme);