nvme - Improve likelihood of dump success
[dragonfly.git] sys/dev/disk/nvme/nvme_disk.c
/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "nvme.h"

static void nvme_disk_callback(nvme_request_t *req, struct lock *lk);
static int nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay);

static d_open_t nvme_open;
static d_close_t nvme_close;
static d_ioctl_t nvme_ioctl;
static d_strategy_t nvme_strategy;
static d_dump_t nvme_dump;

static struct dev_ops nvme_ops = {
        { "nvme", 0, D_DISK | D_MPSAFE | D_CANFREE | D_TRACKCLOSE | D_KVABIO },
        .d_open =       nvme_open,
        .d_close =      nvme_close,
        .d_read =       physread,
        .d_dump =       nvme_dump,
        .d_write =      physwrite,
        .d_ioctl =      nvme_ioctl,
        .d_strategy =   nvme_strategy,
};

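/*
 * Debug/tuning knob: when non-zero, the strategy path spins for this many
 * microseconds after submitting a request and polls the completion queue,
 * completing fast I/Os synchronously instead of waiting for an interrupt.
 * Zero (the default) always uses asynchronous completion.
 */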
static int nvme_sync_delay = 0;
SYSCTL_INT(_debug, OID_AUTO, nvme_sync_delay, CTLFLAG_RW, &nvme_sync_delay, 0,
           "Enable synchronous delay/completion-check, uS");

/*
 * Attach a namespace as a disk, making the disk available to the system.
 */
void
nvme_disk_attach(nvme_softns_t *nsc)
{
        nvme_softc_t *sc;
        struct disk_info info;
        char serial[20+16];
        size_t len;
        uint64_t cap_gb;

        sc = nsc->sc;
        devstat_add_entry(&nsc->stats, "nvme", nsc->unit, nsc->blksize,
                          DEVSTAT_NO_ORDERED_TAGS,
                          DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
                          DEVSTAT_PRIORITY_OTHER);
        nsc->cdev = disk_create(nsc->unit, &nsc->disk, &nvme_ops);
        nsc->cdev->si_drv1 = nsc;
        nsc->cdev->si_iosize_max = MAXPHYS;     /* XXX */
        disk_setdisktype(&nsc->disk, "ssd");

        bzero(&info, sizeof(info));
        info.d_media_blksize = nsc->blksize;
        info.d_media_blocks = nsc->idns.size;
        info.d_secpertrack = 1024;
        info.d_nheads = 1;
        info.d_secpercyl = info.d_secpertrack * info.d_nheads;
        info.d_ncylinders = (u_int)(info.d_media_blocks / info.d_secpercyl);

        KKASSERT(sizeof(sc->idctlr.serialno) == 20);
        bzero(serial, sizeof(serial));
        bcopy(sc->idctlr.serialno, serial, sizeof(sc->idctlr.serialno));
        len = string_cleanup(serial, 1);

        ksnprintf(serial + len, sizeof(serial) - len, "-%u", nsc->nsid);

        info.d_serialno = serial;

        cap_gb = nsc->idns.size / (1024 * 1024 * 1024 / nsc->blksize);
        device_printf(sc->dev,
                "Disk nvme%d ns=%u "
                "blksize=%u lbacnt=%ju cap=%juGB serno=%s\n",
                nsc->unit, nsc->nsid,
                nsc->blksize, nsc->idns.size, cap_gb, serial);

        disk_setdiskinfo(&nsc->disk, &info);
        /* serial is copied and does not have to be persistent */
}

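/*
 * Detach the namespace's disk from the system.
 */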
void
nvme_disk_detach(nvme_softns_t *nsc)
{
        if (nsc->cdev) {
                disk_destroy(&nsc->disk);
                devstat_remove_entry(&nsc->stats);
        }
}

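/*
 * Open/close entry points.  New opens are refused once the controller is
 * being unloaded; otherwise a per-controller open count is maintained.
 */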
static
int
nvme_open(struct dev_open_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;

        if (sc->flags & NVME_SC_UNLOADING)
                return ENXIO;

        atomic_add_long(&sc->opencnt, 1);

        return 0;
}

static
int
nvme_close(struct dev_close_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;

        atomic_add_long(&sc->opencnt, -1);

        return 0;
}

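/*
 * Ioctl entry point.  Only the get-log-page ioctl (NVMEIOCGETLOG) is
 * handled here; anything else returns ENOIOCTL.
 */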
static int
nvme_ioctl(struct dev_ioctl_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;
        int error;

        switch(ap->a_cmd) {
        case NVMEIOCGETLOG:
                error = nvme_getlog_ioctl(sc, (void *)ap->a_data);
                break;
        default:
                error = ENOIOCTL;
                break;
        }
        return error;
}

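/*
 * Strategy entry point for normal disk I/O.  This is the only path that
 * honors the debug.nvme_sync_delay synchronous completion optimization.
 */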
static int
nvme_strategy(struct dev_strategy_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;

        nvme_strategy_core(nsc, ap->a_bio, nvme_sync_delay);

        return 0;
}

/*
 * Called from admin thread to requeue BIOs.  We must call
 * nvme_strategy_core() with delay = 0 to disable synchronous
 * optimizations to avoid deadlocking the admin thread.
 */
void
nvme_disk_requeues(nvme_softc_t *sc)
{
        nvme_softns_t *nsc;
        struct bio *bio;
        int i;

        for (i = 0; i < sc->nscmax; ++i) {
                nsc = sc->nscary[i];
                if (nsc == NULL || nsc->sc == NULL)
                        continue;
                if (bioq_first(&nsc->bioq)) {
                        lockmgr(&nsc->lk, LK_EXCLUSIVE);
                        while ((bio = bioq_first(&nsc->bioq)) != NULL) {
                                bioq_remove(&nsc->bioq, bio);
                                lockmgr(&nsc->lk, LK_RELEASE);
                                if (nvme_strategy_core(nsc, bio, 0))
                                        goto next;
                                lockmgr(&nsc->lk, LK_EXCLUSIVE);
                        }
                        lockmgr(&nsc->lk, LK_RELEASE);
                }
next:
                ;
        }
}


/*
 * Returns non-zero if no requests are available.
 *
 * WARNING! We are using the KVABIO API and must not access memory
 *          through bp->b_data without first calling bkvasync(bp).
 */
static int
nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay)
{
        nvme_softc_t *sc = nsc->sc;
        struct buf *bp = bio->bio_buf;
        uint64_t nlba;
        uint64_t secno;
        nvme_subqueue_t *subq;
        nvme_request_t *req;
        int nobytes;

        /*
         * Calculate sector/extent
         */
        secno = bio->bio_offset / nsc->blksize;
        nlba = bp->b_bcount / nsc->blksize;

        devstat_start_transaction(&nsc->stats);

        subq = NULL;
        req = NULL;
        nobytes = 0;

        /*
         * Convert bio to low-level request
         */
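        /*
         * Reads use the per-cpu NVME_QMAP_RD submission queue mapping and
         * writes/trims/flushes use the NVME_QMAP_WR mapping, presumably to
         * spread load and reduce submission queue lock contention between
         * cpus.
         */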
        switch (bp->b_cmd) {
        case BUF_CMD_READ:
                if (nlba == 0) {
                        nobytes = 1;
                        break;
                }
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_RD]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_READ,
                                       bp->b_data, nlba * nsc->blksize);
                if (req == NULL)
                        goto requeue;

                req->cmd.read.head.nsid = nsc->nsid;
                req->cmd.read.start_lba = secno;
                req->cmd.read.count_lba = nlba - 1;     /* 0's based */
                req->cmd.read.ioflags = 0; /* NVME_IOFLG_LR, NVME_IOFLG_FUA */
                req->cmd.read.dsm = 0;     /* NVME_DSM_INCOMPRESSIBLE */
                                           /* NVME_DSM_SEQREQ */
                break;
        case BUF_CMD_WRITE:
                if (nlba == 0) {
                        nobytes = 1;
                        break;
                }
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_WRITE,
                                       bp->b_data, nlba * nsc->blksize);
                if (req == NULL)
                        goto requeue;
                req->cmd.write.head.nsid = nsc->nsid;
                req->cmd.write.start_lba = secno;
                req->cmd.write.count_lba = nlba - 1;    /* 0's based */
                break;
        case BUF_CMD_FREEBLKS:
                if (nlba == 0) {
                        nobytes = 1;
                        break;
                }
                if (nlba > 65536) {
                        /*
                         * Cannot be encoded in the 16-bit 0's-based LBA
                         * count field; no request is built and the bio
                         * completes with EINVAL below.
                         */
                        break;
                }
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_WRITEZ, NULL, 0);
                if (req == NULL)
                        goto requeue;
                req->cmd.writez.head.nsid = nsc->nsid;
                req->cmd.writez.start_lba = secno;
                req->cmd.writez.count_lba = nlba - 1;   /* 0's based */
                req->cmd.read.ioflags = 0; /* NVME_IOFLG_LR, NVME_IOFLG_FUA */
                req->cmd.read.dsm = 0;     /* NVME_DSM_INCOMPRESSIBLE */
                                           /* NVME_DSM_SEQREQ */
                break;
        case BUF_CMD_FLUSH:
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_FLUSH, NULL, 0);
                if (req == NULL)
                        goto requeue;
                req->cmd.flush.head.nsid = nsc->nsid;
                break;
        default:
                break;
        }

        /*
         * Submit the request
         */
        if (req) {
                nvme_comqueue_t *comq;

                /* HACK OPTIMIZATIONS - TODO NEEDS WORK */

                /*
                 * Prevent callback from occurring if the synchronous
                 * delay optimization is enabled.
                 *
                 * NOTE: subq lock does not protect the I/O (completion
                 *       only needs the comq lock).
                 */
                if (delay == 0)
                        req->callback = nvme_disk_callback;
                req->nsc = nsc;
                req->bio = bio;
                BUF_KERNPROC(bp);               /* do before submit */
                lockmgr(&subq->lk, LK_EXCLUSIVE);
                nvme_submit_request(req);       /* needs subq lock */
                lockmgr(&subq->lk, LK_RELEASE);
                if (delay) {
                        comq = req->comq;
                        DELAY(delay);           /* XXX */
                        lockmgr(&comq->lk, LK_EXCLUSIVE);
                        nvme_poll_completions(comq, &comq->lk);
                        if (req->state == NVME_REQ_SUBMITTED) {
                                /*
                                 * Didn't finish, do it the slow way
                                 * (restore async completion).
                                 */
                                req->callback = nvme_disk_callback;
                                lockmgr(&comq->lk, LK_RELEASE);
                        } else {
                                /*
                                 * Jeeze, that was fast.
                                 */
                                nvme_disk_callback(req, &comq->lk);
                                lockmgr(&comq->lk, LK_RELEASE);
                        }
                } /* else async completion */
        } else if (nobytes) {
                devstat_end_transaction_buf(&nsc->stats, bp);
                biodone(bio);
        } else {
                bp->b_error = EINVAL;
                bp->b_flags |= B_ERROR;
                devstat_end_transaction_buf(&nsc->stats, bp);
                biodone(bio);
        }
        return 0;

        /*
         * No requests were available, requeue the bio.
         *
         * The nvme_get_request() call armed the requeue signal but
         * it is possible that it was picked up too quickly.  If it
         * was, signal the admin thread ourselves.  This case will occur
         * relatively rarely and only under heavy I/O conditions so we
         * don't have to be entirely efficient about dealing with it.
         */
requeue:
        BUF_KERNPROC(bp);
        lockmgr(&nsc->lk, LK_EXCLUSIVE);
        bioqdisksort(&nsc->bioq, bio);
        lockmgr(&nsc->lk, LK_RELEASE);
        if (atomic_swap_int(&subq->signal_requeue, 1) == 0) {
                atomic_swap_int(&subq->signal_requeue, 0);
                atomic_set_int(&subq->sc->admin_signal, ADMIN_SIG_REQUEUE);
                wakeup(&subq->sc->admin_signal);
        }
        return 1;
}

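/*
 * Asynchronous I/O completion callback.  Called with the completion queue
 * lock held (when lk is non-NULL); the lock is dropped across request
 * teardown and biodone() and reacquired before returning to the poller.
 */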
static
void
nvme_disk_callback(nvme_request_t *req, struct lock *lk)
{
        nvme_softns_t *nsc = req->nsc;
        struct bio *bio;
        struct buf *bp;
        int status;

        status = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);
        bio = req->bio;
        bp = bio->bio_buf;

        if (lk)                                 /* comq lock */
                lockmgr(lk, LK_RELEASE);
        nvme_put_request(req);                  /* does not need subq lock */
        devstat_end_transaction_buf(&nsc->stats, bp);
        if (status) {
                bp->b_error = EIO;
                bp->b_flags |= B_ERROR;
                biodone(bio);
        } else {
                bp->b_resid = 0;
                biodone(bio);
        }
        if (lk)                                 /* comq lock */
                lockmgr(lk, LK_EXCLUSIVE);
}

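/*
 * Allocate the next disk unit number for a newly attached namespace.
 */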
int
nvme_alloc_disk_unit(void)
{
        static int unit_counter = 0;
        int unit;

        unit = atomic_fetchadd_int(&unit_counter, 1);

        return unit;
}

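/*
 * Kernel crash-dump entry point.  This can run from panic context, so it
 * must not block: the submission queue lock is only acquired with
 * LK_NOWAIT in a bounded spin, requests come from nvme_get_dump_request(),
 * and completion is detected by polling rather than by interrupt.
 */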
static int
nvme_dump(struct dev_dump_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;
        uint64_t nlba;
        uint64_t secno;
        nvme_subqueue_t *subq;
        nvme_comqueue_t *comq;
        nvme_request_t *req;
        int didlock;

        /*
         * Calculate sector/extent
         */
        secno = ap->a_offset / nsc->blksize;
        nlba = ap->a_length / nsc->blksize;

        subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];

        if (nlba) {
                /*
                 * Issue a WRITE
                 *
                 * get_request does not need the subq lock.
                 */
                req = nvme_get_dump_request(subq, NVME_IOCMD_WRITE,
                                       ap->a_virtual, nlba * nsc->blksize);
                req->cmd.write.head.nsid = nsc->nsid;
                req->cmd.write.start_lba = secno;
                req->cmd.write.count_lba = nlba - 1;    /* 0's based */
        } else {
                /*
                 * Issue a FLUSH
                 *
                 * get_request does not need the subq lock.
                 */
                req = nvme_get_dump_request(subq, NVME_IOCMD_FLUSH, NULL, 0);
                req->cmd.flush.head.nsid = nsc->nsid;
        }

        /*
         * No completion callback is installed; we poll for completion
         * below.
         */
        req->callback = NULL;
        req->nsc = nsc;

        /*
         * 500 x 1uS poll wait on lock.  We might be the idle thread, so
         * we can't safely block during a dump.
         */
        didlock = 500;
        while (lockmgr(&subq->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
                if (--didlock == 0)
                        break;
                tsc_delay(1000);        /* 1uS */
                lwkt_switch();
        }
        nvme_submit_request(req);       /* needs subq lock */
        if (didlock)
                lockmgr(&subq->lk, LK_RELEASE);

        comq = req->comq;
        nvme_poll_request(req);
        nvme_put_dump_request(req);             /* does not need subq lock */

        /*
         * Shut the nvme controller down nicely when we finish the dump.
         * We should do this whether we are in a panic or not because,
         * frankly, the dump is overwriting swap space, thus the system is
         * probably not stable.
         */
        if (nlba == 0)
                nvme_issue_shutdown(sc, 1);
        return 0;
}