/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "nvme.h"
static void nvme_disk_callback(nvme_request_t *req, struct lock *lk);
static int nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay);
static d_open_t nvme_open;
static d_close_t nvme_close;
static d_ioctl_t nvme_ioctl;
static d_strategy_t nvme_strategy;
static d_dump_t nvme_dump;
static struct dev_ops nvme_ops = {
	{ "nvme", 0, D_DISK | D_MPSAFE | D_CANFREE | D_TRACKCLOSE | D_KVABIO },
	.d_open = nvme_open,
	.d_close = nvme_close,
	.d_ioctl = nvme_ioctl,
	.d_strategy = nvme_strategy,
	.d_dump = nvme_dump,
};
static int nvme_sync_delay = 0;
SYSCTL_INT(_debug, OID_AUTO, nvme_sync_delay, CTLFLAG_RW, &nvme_sync_delay, 0,
	   "Enable synchronous delay/completion-check, uS");

/*
 * Attach a namespace as a disk, making the disk available to the system.
 */
void
nvme_disk_attach(nvme_softns_t *nsc)
{
	nvme_softc_t *sc = nsc->sc;
	struct disk_info info;
	char serial[20+16];
	size_t len;
	uint64_t cap_gb;
	devstat_add_entry(&nsc->stats, "nvme", nsc->unit, nsc->blksize,
			  DEVSTAT_NO_ORDERED_TAGS,
			  DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
			  DEVSTAT_PRIORITY_OTHER);
	nsc->cdev = disk_create(nsc->unit, &nsc->disk, &nvme_ops);
	nsc->cdev->si_drv1 = nsc;
	nsc->cdev->si_iosize_max = MAXPHYS;	/* XXX */
	disk_setdisktype(&nsc->disk, "ssd");

	bzero(&info, sizeof(info));
	info.d_media_blksize = nsc->blksize;
	info.d_media_blocks = nsc->idns.size;
	info.d_secpertrack = 1024;
	info.d_nheads = 1;
	info.d_secpercyl = info.d_secpertrack * info.d_nheads;
	info.d_ncylinders = (u_int)(info.d_media_blocks / info.d_secpercyl);
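	/*
	 * NOTE: The geometry above is synthetic; NVMe namespaces have no
	 *	 tracks, heads, or cylinders.  The values only give
	 *	 disklabel code something reasonable to work with.
	 */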

	KKASSERT(sizeof(sc->idctlr.serialno) == 20);
	bzero(serial, sizeof(serial));
	bcopy(sc->idctlr.serialno, serial, sizeof(sc->idctlr.serialno));
	len = string_cleanup(serial, 1);

	ksnprintf(serial + len, sizeof(serial) - len, "-%u", nsc->nsid);
	info.d_serialno = serial;

	cap_gb = nsc->idns.size / (1024 * 1024 * 1024 / nsc->blksize);
	device_printf(sc->dev,
		      "Disk nvme%d ns=%u "
		      "blksize=%u lbacnt=%ju cap=%juGB serno=%s\n",
		      nsc->unit, nsc->nsid,
		      nsc->blksize, nsc->idns.size, cap_gb, serial);

	disk_setdiskinfo(&nsc->disk, &info);
	/* serial is copied and does not have to be persistent */
}

void
nvme_disk_detach(nvme_softns_t *nsc)
{
	disk_destroy(&nsc->disk);
	devstat_remove_entry(&nsc->stats);
}

static int
nvme_open(struct dev_open_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	nvme_softns_t *nsc = dev->si_drv1;
	nvme_softc_t *sc = nsc->sc;

	if (sc->flags & NVME_SC_UNLOADING)
		return ENXIO;

	atomic_add_long(&sc->opencnt, 1);

	return 0;
}
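
/*
 * NOTE: sc->opencnt is controller-wide; together with the
 *	 NVME_SC_UNLOADING check in nvme_open() it presumably
 *	 interlocks open devices against driver unload.
 */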

static int
nvme_close(struct dev_close_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	nvme_softns_t *nsc = dev->si_drv1;
	nvme_softc_t *sc = nsc->sc;

	atomic_add_long(&sc->opencnt, -1);

	return 0;
}

static int
nvme_ioctl(struct dev_ioctl_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	nvme_softns_t *nsc = dev->si_drv1;
	nvme_softc_t *sc = nsc->sc;
	int error;

	switch(ap->a_cmd) {
	case NVMEIOCGETLOG:
		error = nvme_getlog_ioctl(sc, (void *)ap->a_data);
		break;
	default:
		error = ENOIOCTL;
		break;
	}
	return error;
}

static int
nvme_strategy(struct dev_strategy_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	nvme_softns_t *nsc = dev->si_drv1;

	nvme_strategy_core(nsc, ap->a_bio, nvme_sync_delay);

	return 0;
}

/*
 * Called from admin thread to requeue BIOs.  We must call
 * nvme_strategy_core() with delay = 0 to disable synchronous
 * optimizations and avoid deadlocking the admin thread.
 */
void
nvme_disk_requeues(nvme_softc_t *sc)
{
	nvme_softns_t *nsc;
	struct bio *bio;
	int i;

	for (i = 0; i < sc->nscmax; ++i) {
		nsc = sc->nscary[i];
		if (nsc == NULL || nsc->sc == NULL)
			continue;
		if (bioq_first(&nsc->bioq)) {
			lockmgr(&nsc->lk, LK_EXCLUSIVE);
			while ((bio = bioq_first(&nsc->bioq)) != NULL) {
				bioq_remove(&nsc->bioq, bio);
				lockmgr(&nsc->lk, LK_RELEASE);
				if (nvme_strategy_core(nsc, bio, 0))
					return;
				lockmgr(&nsc->lk, LK_EXCLUSIVE);
			}
			lockmgr(&nsc->lk, LK_RELEASE);
		}
	}
}

/*
 * Returns non-zero if no requests are available.
 *
 * WARNING! We are using the KVABIO API and must not access memory
 *	    through bp->b_data without first calling bkvasync(bp).
 */
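/*
 * NOTE: D_KVABIO (set in nvme_ops) means the buffer's kernel virtual
 *	 mapping may not be synchronized to the current CPU.  bkvasync(bp)
 *	 performs that synchronization whenever the CPU must touch
 *	 bp->b_data directly; here b_data appears only to be handed to
 *	 nvme_get_request() for DMA setup.
 */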
static int
nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay)
{
	nvme_softc_t *sc = nsc->sc;
	struct buf *bp = bio->bio_buf;
	uint64_t nlba;
	uint64_t secno;
	nvme_subqueue_t *subq;
	nvme_request_t *req;
	int nobytes;

	/*
	 * Calculate sector/extent
	 */
	secno = bio->bio_offset / nsc->blksize;
	nlba = bp->b_bcount / nsc->blksize;

	devstat_start_transaction(&nsc->stats);

	subq = NULL;
	req = NULL;
	nobytes = 0;

	/*
	 * Convert bio to low-level request
	 */
	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		if (nlba == 0) {
			nobytes = 1;
			break;
		}
		subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_RD]];
		/* get_request does not need the subq lock */
		req = nvme_get_request(subq, NVME_IOCMD_READ,
				       bp->b_data, nlba * nsc->blksize);
		if (req == NULL)
			goto requeue;

		req->cmd.read.head.nsid = nsc->nsid;
		req->cmd.read.start_lba = secno;
		req->cmd.read.count_lba = nlba - 1;	/* 0's based */
		req->cmd.read.ioflags = 0;	/* NVME_IOFLG_LR, NVME_IOFLG_FUA */
		req->cmd.read.dsm = 0;		/* NVME_DSM_INCOMPRESSIBLE */
						/* NVME_DSM_SEQREQ */
		break;
	case BUF_CMD_WRITE:
		if (nlba == 0) {
			nobytes = 1;
			break;
		}
		subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
		/* get_request does not need the subq lock */
		req = nvme_get_request(subq, NVME_IOCMD_WRITE,
				       bp->b_data, nlba * nsc->blksize);
		if (req == NULL)
			goto requeue;

		req->cmd.write.head.nsid = nsc->nsid;
		req->cmd.write.start_lba = secno;
		req->cmd.write.count_lba = nlba - 1;	/* 0's based */
		break;
	case BUF_CMD_FREEBLKS:
		if (nlba == 0) {
			nobytes = 1;
			break;
		}
		if (nlba > 65536) {
			/* will cause INVAL error */
			break;
		}
		subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
		/* get_request does not need the subq lock */
		req = nvme_get_request(subq, NVME_IOCMD_WRITEZ, NULL, 0);
		if (req == NULL)
			goto requeue;

		req->cmd.writez.head.nsid = nsc->nsid;
		req->cmd.writez.start_lba = secno;
		req->cmd.writez.count_lba = nlba - 1;	/* 0's based */
		req->cmd.read.ioflags = 0;	/* NVME_IOFLG_LR, NVME_IOFLG_FUA */
		req->cmd.read.dsm = 0;		/* NVME_DSM_INCOMPRESSIBLE */
						/* NVME_DSM_SEQREQ */
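		/*
		 * NOTE: FREEBLKS is issued as a WRITE ZEROES rather than
		 *	 a DSM deallocate.  The ioflags/dsm assignments
		 *	 above go through the read member of the command
		 *	 union; the dwords overlap.
		 */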
		break;
	case BUF_CMD_FLUSH:
		subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
		/* get_request does not need the subq lock */
		req = nvme_get_request(subq, NVME_IOCMD_FLUSH, NULL, 0);
		if (req == NULL)
			goto requeue;

		req->cmd.flush.head.nsid = nsc->nsid;
		break;
	default:
		break;
	}

	if (req != NULL) {
		nvme_comqueue_t *comq;

		/* HACK OPTIMIZATIONS - TODO NEEDS WORK */

		/*
		 * Prevent callback from occurring if the synchronous
		 * delay optimization is enabled.
		 *
		 * NOTE: subq lock does not protect the I/O (completion
		 *	 only needs the comq lock).
		 */
		if (delay == 0)
			req->callback = nvme_disk_callback;
		req->nsc = nsc;
		req->bio = bio;
		BUF_KERNPROC(bp);		/* do before submit */
		lockmgr(&subq->lk, LK_EXCLUSIVE);
		nvme_submit_request(req);	/* needs subq lock */
		lockmgr(&subq->lk, LK_RELEASE);
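		/*
		 * Synchronous delay fast-path: spin for the requested
		 * delay, then poll the completion queue directly.  If
		 * the request is still in flight, fall back to normal
		 * asynchronous completion via the callback.
		 */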
		if (delay) {
			comq = req->comq;
			DELAY(delay);		/* XXX */
			lockmgr(&comq->lk, LK_EXCLUSIVE);
			nvme_poll_completions(comq, &comq->lk);
			if (req->state == NVME_REQ_SUBMITTED) {
				/*
				 * Didn't finish, do it the slow way
				 * (restore async completion).
				 */
				req->callback = nvme_disk_callback;
				lockmgr(&comq->lk, LK_RELEASE);
			} else {
				/*
				 * Jeeze, that was fast.
				 */
				nvme_disk_callback(req, &comq->lk);
				lockmgr(&comq->lk, LK_RELEASE);
			}
		} /* else async completion */
	} else if (nobytes) {
		devstat_end_transaction_buf(&nsc->stats, bp);
		biodone(bio);
	} else {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		devstat_end_transaction_buf(&nsc->stats, bp);
		biodone(bio);
	}
	return 0;

	/*
	 * No requests were available, requeue the bio.
	 *
	 * The nvme_get_request() call armed the requeue signal but
	 * it is possible that it was picked up too quickly.  If it
	 * was, signal the admin thread ourselves.  This case will occur
	 * relatively rarely and only under heavy I/O conditions so we
	 * don't have to be entirely efficient about dealing with it.
	 */
requeue:
	BUF_KERNPROC(bp);
	lockmgr(&nsc->lk, LK_EXCLUSIVE);
	bioqdisksort(&nsc->bioq, bio);
	lockmgr(&nsc->lk, LK_RELEASE);
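
	/*
	 * If the prior value was 0 the admin thread already consumed the
	 * signal armed by nvme_get_request(), possibly before our bio was
	 * queued, so consume our own re-arm and wake the admin thread
	 * explicitly to guarantee the bio is picked up.
	 */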
	if (atomic_swap_int(&subq->signal_requeue, 1) == 0) {
		atomic_swap_int(&subq->signal_requeue, 0);
		atomic_set_int(&subq->sc->admin_signal, ADMIN_SIG_REQUEUE);
		wakeup(&subq->sc->admin_signal);
	}

	return 1;
}

static void
nvme_disk_callback(nvme_request_t *req, struct lock *lk)
{
	nvme_softns_t *nsc = req->nsc;
	struct bio *bio = req->bio;
	struct buf *bp = bio->bio_buf;
	int status;

	status = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);

	if (lk)					/* comq lock */
		lockmgr(lk, LK_RELEASE);
	nvme_put_request(req);			/* does not need subq lock */
	devstat_end_transaction_buf(&nsc->stats, bp);

	if (status) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bio);
	} else {
		bp->b_resid = 0;
		biodone(bio);
	}

	if (lk)					/* comq lock */
		lockmgr(lk, LK_EXCLUSIVE);
}
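
/*
 * Allocate the next disk unit number.  Units are assigned monotonically
 * from a global counter and are never reused.
 */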
static int
nvme_alloc_disk_unit(void)
{
	static int unit_counter = 0;
	int unit;

	unit = atomic_fetchadd_int(&unit_counter, 1);

	return unit;
}

static int
nvme_dump(struct dev_dump_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	nvme_softns_t *nsc = dev->si_drv1;
	nvme_softc_t *sc = nsc->sc;
	uint64_t nlba;
	uint64_t secno;
	nvme_subqueue_t *subq;
	nvme_comqueue_t *comq;
	nvme_request_t *req;
	int counter;

	/*
	 * Calculate sector/extent
	 */
	secno = ap->a_offset / nsc->blksize;
	nlba = ap->a_length / nsc->blksize;

	subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];

	if (nlba) {
		/*
		 * get_request does not need the subq lock.
		 */
		req = nvme_get_dump_request(subq, NVME_IOCMD_WRITE,
					    ap->a_virtual, nlba * nsc->blksize);
		req->cmd.write.head.nsid = nsc->nsid;
		req->cmd.write.start_lba = secno;
		req->cmd.write.count_lba = nlba - 1;	/* 0's based */
	} else {
		/*
		 * get_request does not need the subq lock.
		 */
		req = nvme_get_dump_request(subq, NVME_IOCMD_FLUSH, NULL, 0);
		req->cmd.flush.head.nsid = nsc->nsid;
	}

	/*
	 * Prevent callback from occurring if the synchronous
	 * delay optimization is enabled.
	 */
	req->callback = NULL;
	req->nsc = nsc;
	comq = req->comq;

	/*
	 * 500 x 1uS poll wait on lock.  We might be the idle thread, so
	 * we can't safely block during a dump.
	 */
	counter = 500;
	while (lockmgr(&subq->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
		if (--counter == 0)
			return EINTR;	/* give up, can't get the lock */
		tsc_delay(1000);	/* 1uS */
	}
	nvme_submit_request(req);	/* needs subq lock */
	lockmgr(&subq->lk, LK_RELEASE);
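
	/*
	 * Poll for completion; interrupts may not be operational at
	 * dump time, so we cannot sleep waiting for the callback.
	 */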
	nvme_poll_request(req);
	nvme_put_dump_request(req);	/* does not need subq lock */

	/*
	 * Shut the nvme controller down nicely when we finish the dump.
	 * We should do this whether we are in a panic or not because
	 * frankly the dump is overwriting swap space, thus the system is
	 * probably not stable.
	 */
	if (ap->a_length == 0)
		nvme_issue_shutdown(sc, 1);

	return 0;
}