nvme - Improve likelihood of dump success
[dragonfly.git] sys/dev/disk/nvme/nvme_disk.c
/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "nvme.h"

static void nvme_disk_callback(nvme_request_t *req, struct lock *lk);
static int nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay);

static d_open_t nvme_open;
static d_close_t nvme_close;
static d_ioctl_t nvme_ioctl;
static d_strategy_t nvme_strategy;
static d_dump_t nvme_dump;

static struct dev_ops nvme_ops = {
        { "nvme", 0, D_DISK | D_MPSAFE | D_CANFREE | D_TRACKCLOSE | D_KVABIO },
        .d_open =       nvme_open,
        .d_close =      nvme_close,
        .d_read =       physread,
        .d_dump =       nvme_dump,
        .d_write =      physwrite,
        .d_ioctl =      nvme_ioctl,
        .d_strategy =   nvme_strategy,
};

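/*
 * Debug/tuning knob: when non-zero, the strategy path spins for this many
 * microseconds after submitting a request and polls the completion queue,
 * completing fast I/Os synchronously instead of waiting for an interrupt.
 * Zero (the default) always uses asynchronous completion.
 */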
static int nvme_sync_delay = 0;
SYSCTL_INT(_debug, OID_AUTO, nvme_sync_delay, CTLFLAG_RW, &nvme_sync_delay, 0,
           "Enable synchronous delay/completion-check, uS");

/*
 * Attach a namespace as a disk, making the disk available to the system.
 */
void
nvme_disk_attach(nvme_softns_t *nsc)
{
        nvme_softc_t *sc;
        struct disk_info info;
        char serial[20+16];
        size_t len;
        uint64_t cap_gb;

        sc = nsc->sc;
        devstat_add_entry(&nsc->stats, "nvme", nsc->unit, nsc->blksize,
                          DEVSTAT_NO_ORDERED_TAGS,
                          DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
                          DEVSTAT_PRIORITY_OTHER);
        nsc->cdev = disk_create(nsc->unit, &nsc->disk, &nvme_ops);
        nsc->cdev->si_drv1 = nsc;
        nsc->cdev->si_iosize_max = MAXPHYS;     /* XXX */
        disk_setdisktype(&nsc->disk, "ssd");

        bzero(&info, sizeof(info));
        info.d_media_blksize = nsc->blksize;
        info.d_media_blocks = nsc->idns.size;
        info.d_secpertrack = 1024;
        info.d_nheads = 1;
        info.d_secpercyl = info.d_secpertrack * info.d_nheads;
        info.d_ncylinders = (u_int)(info.d_media_blocks / info.d_secpercyl);

        KKASSERT(sizeof(sc->idctlr.serialno) == 20);
        bzero(serial, sizeof(serial));
        bcopy(sc->idctlr.serialno, serial, sizeof(sc->idctlr.serialno));
        len = string_cleanup(serial, 1);

        ksnprintf(serial + len, sizeof(serial) - len, "-%u", nsc->nsid);

        info.d_serialno = serial;

        cap_gb = nsc->idns.size / (1024 * 1024 * 1024 / nsc->blksize);
        device_printf(sc->dev,
                "Disk nvme%d ns=%u "
                "blksize=%u lbacnt=%ju cap=%juGB serno=%s\n",
                nsc->unit, nsc->nsid,
                nsc->blksize, nsc->idns.size, cap_gb, serial);

        disk_setdiskinfo(&nsc->disk, &info);
        /* serial is copied and does not have to be persistent */
}

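/*
 * Detach the namespace's disk from the system.
 */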
void
nvme_disk_detach(nvme_softns_t *nsc)
{
        if (nsc->cdev) {
                disk_destroy(&nsc->disk);
                devstat_remove_entry(&nsc->stats);
        }
}

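/*
 * Open/close entry points.  New opens are refused once the controller is
 * being unloaded; otherwise a per-controller open count is maintained.
 */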
static
int
nvme_open(struct dev_open_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;

        if (sc->flags & NVME_SC_UNLOADING)
                return ENXIO;

        atomic_add_long(&sc->opencnt, 1);

        return 0;
}

static
int
nvme_close(struct dev_close_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;

        atomic_add_long(&sc->opencnt, -1);

        return 0;
}

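/*
 * Ioctl entry point.  Only the get-log-page ioctl (NVMEIOCGETLOG) is
 * handled here; anything else returns ENOIOCTL.
 */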
static int
nvme_ioctl(struct dev_ioctl_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;
        int error;

        switch(ap->a_cmd) {
        case NVMEIOCGETLOG:
                error = nvme_getlog_ioctl(sc, (void *)ap->a_data);
                break;
        default:
                error = ENOIOCTL;
                break;
        }
        return error;
}

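/*
 * Strategy entry point for normal disk I/O.  This is the only path that
 * honors the debug.nvme_sync_delay synchronous completion optimization.
 */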
static int
nvme_strategy(struct dev_strategy_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;

        nvme_strategy_core(nsc, ap->a_bio, nvme_sync_delay);

        return 0;
}

/*
 * Called from admin thread to requeue BIOs.  We must call
 * nvme_strategy_core() with delay = 0 to disable synchronous
 * optimizations to avoid deadlocking the admin thread.
 */
void
nvme_disk_requeues(nvme_softc_t *sc)
{
        nvme_softns_t *nsc;
        struct bio *bio;
        int i;

        for (i = 0; i < sc->nscmax; ++i) {
                nsc = sc->nscary[i];
                if (nsc == NULL || nsc->sc == NULL)
                        continue;
                if (bioq_first(&nsc->bioq)) {
                        lockmgr(&nsc->lk, LK_EXCLUSIVE);
                        while ((bio = bioq_first(&nsc->bioq)) != NULL) {
                                bioq_remove(&nsc->bioq, bio);
                                lockmgr(&nsc->lk, LK_RELEASE);
                                if (nvme_strategy_core(nsc, bio, 0))
                                        goto next;
                                lockmgr(&nsc->lk, LK_EXCLUSIVE);
                        }
                        lockmgr(&nsc->lk, LK_RELEASE);
                }
next:
                ;
        }
}


/*
 * Returns non-zero if no requests are available.
 *
 * WARNING! We are using the KVABIO API and must not access memory
 *          through bp->b_data without first calling bkvasync(bp).
 */
static int
nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay)
{
        nvme_softc_t *sc = nsc->sc;
        struct buf *bp = bio->bio_buf;
        uint64_t nlba;
        uint64_t secno;
        nvme_subqueue_t *subq;
        nvme_request_t *req;
        int nobytes;

        /*
         * Calculate sector/extent
         */
        secno = bio->bio_offset / nsc->blksize;
        nlba = bp->b_bcount / nsc->blksize;

        devstat_start_transaction(&nsc->stats);

        subq = NULL;
        req = NULL;
        nobytes = 0;

        /*
         * Convert bio to low-level request
         */
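        /*
         * Reads use the per-cpu NVME_QMAP_RD submission queue mapping and
         * writes/trims/flushes use the NVME_QMAP_WR mapping, presumably to
         * spread load and reduce submission queue lock contention between
         * cpus.
         */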
        switch (bp->b_cmd) {
        case BUF_CMD_READ:
                if (nlba == 0) {
                        nobytes = 1;
                        break;
                }
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_RD]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_READ,
                                       bp->b_data, nlba * nsc->blksize);
                if (req == NULL)
                        goto requeue;

                req->cmd.read.head.nsid = nsc->nsid;
                req->cmd.read.start_lba = secno;
                req->cmd.read.count_lba = nlba - 1;     /* 0's based */
                req->cmd.read.ioflags = 0; /* NVME_IOFLG_LR, NVME_IOFLG_FUA */
                req->cmd.read.dsm = 0;     /* NVME_DSM_INCOMPRESSIBLE */
                                           /* NVME_DSM_SEQREQ */
                break;
        case BUF_CMD_WRITE:
                if (nlba == 0) {
                        nobytes = 1;
                        break;
                }
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_WRITE,
                                       bp->b_data, nlba * nsc->blksize);
                if (req == NULL)
                        goto requeue;
                req->cmd.write.head.nsid = nsc->nsid;
                req->cmd.write.start_lba = secno;
                req->cmd.write.count_lba = nlba - 1;    /* 0's based */
                break;
        case BUF_CMD_FREEBLKS:
                if (nlba == 0) {
                        nobytes = 1;
                        break;
                }
                if (nlba > 65536) {
                        /*
                         * Cannot be encoded in the 16-bit 0's-based LBA
                         * count field; no request is built and the bio
                         * completes with EINVAL below.
                         */
                        break;
                }
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_WRITEZ, NULL, 0);
                if (req == NULL)
                        goto requeue;
                req->cmd.writez.head.nsid = nsc->nsid;
                req->cmd.writez.start_lba = secno;
                req->cmd.writez.count_lba = nlba - 1;   /* 0's based */
                req->cmd.read.ioflags = 0; /* NVME_IOFLG_LR, NVME_IOFLG_FUA */
                req->cmd.read.dsm = 0;     /* NVME_DSM_INCOMPRESSIBLE */
                                           /* NVME_DSM_SEQREQ */
                break;
        case BUF_CMD_FLUSH:
                subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];
                /* get_request does not need the subq lock */
                req = nvme_get_request(subq, NVME_IOCMD_FLUSH, NULL, 0);
                if (req == NULL)
                        goto requeue;
                req->cmd.flush.head.nsid = nsc->nsid;
                break;
        default:
                break;
        }

        /*
         * Submit the request
         */
        if (req) {
                nvme_comqueue_t *comq;

                /* HACK OPTIMIZATIONS - TODO NEEDS WORK */

                /*
                 * Prevent callback from occurring if the synchronous
                 * delay optimization is enabled.
                 *
                 * NOTE: subq lock does not protect the I/O (completion
                 *       only needs the comq lock).
                 */
                if (delay == 0)
                        req->callback = nvme_disk_callback;
                req->nsc = nsc;
                req->bio = bio;
                BUF_KERNPROC(bp);               /* do before submit */
                lockmgr(&subq->lk, LK_EXCLUSIVE);
                nvme_submit_request(req);       /* needs subq lock */
                lockmgr(&subq->lk, LK_RELEASE);
                if (delay) {
                        comq = req->comq;
                        DELAY(delay);           /* XXX */
                        lockmgr(&comq->lk, LK_EXCLUSIVE);
                        nvme_poll_completions(comq, &comq->lk);
                        if (req->state == NVME_REQ_SUBMITTED) {
                                /*
                                 * Didn't finish, do it the slow way
                                 * (restore async completion).
                                 */
                                req->callback = nvme_disk_callback;
                                lockmgr(&comq->lk, LK_RELEASE);
                        } else {
                                /*
                                 * Jeeze, that was fast.
                                 */
                                nvme_disk_callback(req, &comq->lk);
                                lockmgr(&comq->lk, LK_RELEASE);
                        }
                } /* else async completion */
        } else if (nobytes) {
                devstat_end_transaction_buf(&nsc->stats, bp);
                biodone(bio);
        } else {
                bp->b_error = EINVAL;
                bp->b_flags |= B_ERROR;
                devstat_end_transaction_buf(&nsc->stats, bp);
                biodone(bio);
        }
        return 0;

        /*
         * No requests were available, requeue the bio.
         *
         * The nvme_get_request() call armed the requeue signal but
         * it is possible that it was picked up too quickly.  If it
         * was, signal the admin thread ourselves.  This case will occur
         * relatively rarely and only under heavy I/O conditions so we
         * don't have to be entirely efficient about dealing with it.
         */
requeue:
        BUF_KERNPROC(bp);
        lockmgr(&nsc->lk, LK_EXCLUSIVE);
        bioqdisksort(&nsc->bioq, bio);
        lockmgr(&nsc->lk, LK_RELEASE);
        if (atomic_swap_int(&subq->signal_requeue, 1) == 0) {
                atomic_swap_int(&subq->signal_requeue, 0);
                atomic_set_int(&subq->sc->admin_signal, ADMIN_SIG_REQUEUE);
                wakeup(&subq->sc->admin_signal);
        }
        return 1;
}

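/*
 * Asynchronous I/O completion callback.  Called with the completion queue
 * lock held (when lk is non-NULL); the lock is dropped across request
 * teardown and biodone() and reacquired before returning to the poller.
 */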
static
void
nvme_disk_callback(nvme_request_t *req, struct lock *lk)
{
        nvme_softns_t *nsc = req->nsc;
        struct bio *bio;
        struct buf *bp;
        int status;

        status = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);
        bio = req->bio;
        bp = bio->bio_buf;

        if (lk)                                 /* comq lock */
                lockmgr(lk, LK_RELEASE);
        nvme_put_request(req);                  /* does not need subq lock */
        devstat_end_transaction_buf(&nsc->stats, bp);
        if (status) {
                bp->b_error = EIO;
                bp->b_flags |= B_ERROR;
                biodone(bio);
        } else {
                bp->b_resid = 0;
                biodone(bio);
        }
        if (lk)                                 /* comq lock */
                lockmgr(lk, LK_EXCLUSIVE);
}

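/*
 * Allocate the next disk unit number for a newly attached namespace.
 */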
int
nvme_alloc_disk_unit(void)
{
        static int unit_counter = 0;
        int unit;

        unit = atomic_fetchadd_int(&unit_counter, 1);

        return unit;
}

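/*
 * Kernel crash-dump entry point.  This can run from panic context, so it
 * must not block: the submission queue lock is only acquired with
 * LK_NOWAIT in a bounded spin, requests come from nvme_get_dump_request(),
 * and completion is detected by polling rather than by interrupt.
 */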
static int
nvme_dump(struct dev_dump_args *ap)
{
        cdev_t dev = ap->a_head.a_dev;
        nvme_softns_t *nsc = dev->si_drv1;
        nvme_softc_t *sc = nsc->sc;
        uint64_t nlba;
        uint64_t secno;
        nvme_subqueue_t *subq;
        nvme_comqueue_t *comq;
        nvme_request_t *req;
        int didlock;

        /*
         * Calculate sector/extent
         */
        secno = ap->a_offset / nsc->blksize;
        nlba = ap->a_length / nsc->blksize;

        subq = &sc->subqueues[sc->qmap[mycpuid][NVME_QMAP_WR]];

        if (nlba) {
                /*
                 * Issue a WRITE
                 *
                 * get_request does not need the subq lock.
                 */
                req = nvme_get_dump_request(subq, NVME_IOCMD_WRITE,
                                       ap->a_virtual, nlba * nsc->blksize);
                req->cmd.write.head.nsid = nsc->nsid;
                req->cmd.write.start_lba = secno;
                req->cmd.write.count_lba = nlba - 1;    /* 0's based */
        } else {
                /*
                 * Issue a FLUSH
                 *
                 * get_request does not need the subq lock.
                 */
                req = nvme_get_dump_request(subq, NVME_IOCMD_FLUSH, NULL, 0);
                req->cmd.flush.head.nsid = nsc->nsid;
        }

        /*
         * No completion callback is installed; we poll for completion
         * below.
         */
        req->callback = NULL;
        req->nsc = nsc;

        /*
         * 500 x 1uS poll wait on lock.  We might be the idle thread, so
         * we can't safely block during a dump.
         */
        didlock = 500;
        while (lockmgr(&subq->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
                if (--didlock == 0)
                        break;
                tsc_delay(1000);        /* 1uS */
                lwkt_switch();
        }
        nvme_submit_request(req);       /* needs subq lock */
        if (didlock)
                lockmgr(&subq->lk, LK_RELEASE);

        comq = req->comq;
        nvme_poll_request(req);
        nvme_put_dump_request(req);             /* does not need subq lock */

        /*
         * Shut the nvme controller down nicely when we finish the dump.
         * We should do this whether we are in a panic or not because,
         * frankly, the dump is overwriting swap space, thus the system is
         * probably not stable.
         */
        if (nlba == 0)
                nvme_issue_shutdown(sc, 1);
        return 0;
}