/* sys/dev/disk/nvme/nvme_attach.c */
/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "nvme.h"

static int      nvme_pci_attach(device_t);
static int      nvme_pci_detach(device_t);

static const nvme_device_t nvme_devices[] = {
        /* Vendor-specific table goes here (see ahci for example) */
        { 0, 0, nvme_pci_attach, nvme_pci_detach, "NVME-PCIe" }
};
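
/*
 * The all-zero terminating entry above doubles as the generic fallback:
 * nvme_lookup_device() returns it for any device whose PCI class codes
 * identify it as NVMe, even without an explicit vendor/product match.
 */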

static int      nvme_msix_enable = 1;
TUNABLE_INT("hw.nvme.msix.enable", &nvme_msix_enable);
static int      nvme_msi_enable = 0;
TUNABLE_INT("hw.nvme.msi.enable", &nvme_msi_enable);

TAILQ_HEAD(, nvme_softc) nvme_sc_list = TAILQ_HEAD_INITIALIZER(nvme_sc_list);
struct lock nvme_master_lock = LOCK_INITIALIZER("nvmstr", 0, 0);

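/*
 * last_global_cpu rotates the starting target cpu for MSI-X vector
 * allocation so that multiple controllers spread their interrupts
 * across cpus instead of stacking them on the same ones.
 */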
static int last_global_cpu;

/*
 * Match during probe and attach.  The device does not yet have a softc.
 */
const nvme_device_t *
nvme_lookup_device(device_t dev)
{
        const nvme_device_t *ad;
        uint16_t vendor = pci_get_vendor(dev);
        uint16_t product = pci_get_device(dev);
        uint8_t class = pci_get_class(dev);
        uint8_t subclass = pci_get_subclass(dev);
        uint8_t progif = pci_read_config(dev, PCIR_PROGIF, 1);
        int is_nvme;

        /*
         * Generally speaking, if the PCI device does not identify as
         * NVMe we skip it.
         */
        if (class == PCIC_STORAGE && subclass == PCIS_STORAGE_NVM &&
            progif == PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0) {
                is_nvme = 1;
        } else {
                is_nvme = 0;
        }

        for (ad = &nvme_devices[0]; ad->vendor; ++ad) {
                if (ad->vendor == vendor && ad->product == product)
                        return (ad);
        }

        /*
         * The terminating ad entry is the default match if the PCI
         * device identifies as NVMe.
         */
        if (is_nvme == 0)
                ad = NULL;
        return (ad);
}

/*
 * Attach functions.  They all eventually fall through to nvme_pci_attach().
 */
static int
nvme_pci_attach(device_t dev)
{
        nvme_softc_t *sc = device_get_softc(dev);
        uint32_t reg;
        int error;
        int msi_enable;
        int msix_enable;

#if 0
        if (pci_read_config(dev, PCIR_COMMAND, 2) & 0x0400) {
                device_printf(dev, "BIOS disabled PCI interrupt, "
                                   "re-enabling\n");
                pci_write_config(dev, PCIR_COMMAND,
                        pci_read_config(dev, PCIR_COMMAND, 2) & ~0x0400, 2);
        }
#endif

        sc->dev = dev;

        /*
         * Map the register window
         */
        sc->rid_regs = PCIR_BAR(0);
        sc->regs = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
                                          &sc->rid_regs, RF_ACTIVE);
        if (sc->regs == NULL) {
                device_printf(dev, "unable to map registers\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        sc->iot = rman_get_bustag(sc->regs);
        sc->ioh = rman_get_bushandle(sc->regs);

        /*
         * NVMe allows the MSI-X table to be mapped to BAR 4/5.
         * Always try to map BAR4, but it's ok if it fails.  Must
         * be done prior to allocating our interrupts.
         */
        sc->rid_bar4 = PCIR_BAR(4);
        sc->bar4 = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
                                          &sc->rid_bar4, RF_ACTIVE);

        /*
         * Map the interrupt or initial interrupt which will be used for
         * the admin queue.  NVMe chipsets can potentially support a huge
         * number of MSI-X vectors but we really only need enough for
         * available cpus, plus 1.
         */
        msi_enable = device_getenv_int(dev, "msi.enable", nvme_msi_enable);
        msix_enable = device_getenv_int(dev, "msix.enable", nvme_msix_enable);

        error = 0;
        if (msix_enable) {
                int i;
                int cpu;

                sc->nirqs = pci_msix_count(dev);
                sc->irq_type = PCI_INTR_TYPE_MSIX;
                if (sc->nirqs > ncpus + 1)              /* max we need */
                        sc->nirqs = ncpus + 1;

                error = pci_setup_msix(dev);
                cpu = (last_global_cpu + 0) % ncpus;    /* GCC warn */
                for (i = 0; error == 0 && i < sc->nirqs; ++i) {
                        cpu = (last_global_cpu + i) % ncpus;
                        error = pci_alloc_msix_vector(dev, i,
                                                      &sc->rid_irq[i], cpu);
                        if (error)
                                break;
                        sc->irq[i] = bus_alloc_resource_any(dev, SYS_RES_IRQ,
                                                            &sc->rid_irq[i],
                                                            RF_ACTIVE);
                        /*
                         * We want this to overwrite queue 0's cpu vector
                         * when the cpus rotate through later on.
                         */
                        if (sc->cputovect[cpu] == 0)
                                sc->cputovect[cpu] = i;
                }

                /*
                 * If we did not iterate enough cpus (that is, there weren't
                 * enough irqs for all available cpus) we still need to
                 * finish out the sc->cputovect[] mapping.
                 */
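                /*
                 * Note that the wrap below deliberately skips vector 0
                 * (the vector used for the admin queue), so leftover
                 * cpus are mapped onto I/O vectors 1..nirqs-1 only.
                 */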
                while (error == 0) {
                        cpu = (cpu + 1) % ncpus;
                        i = (i + 1) % sc->nirqs;
                        if (i == 0)
                                i = 1;
                        if (sc->cputovect[cpu] != 0)
                                break;
                        sc->cputovect[cpu] = i;
                }

                if (error) {
                        while (--i >= 0) {
                                bus_release_resource(dev, SYS_RES_IRQ,
                                                     sc->rid_irq[i],
                                                     sc->irq[i]);
                                pci_release_msix_vector(dev, sc->rid_irq[i]);
                                sc->irq[i] = NULL;
                        }
                        /* leave error intact to fall through to normal */
                } else {
                        last_global_cpu = (last_global_cpu + sc->nirqs) % ncpus;
                        pci_enable_msix(dev);
                }
        }

        /*
         * If we have to use a normal interrupt we fake the cputovect[] in
         * order to try to map at least (ncpus) submission queues.  The admin
         * code will limit the number of completion queues to something
         * reasonable when nirqs is 1 since the single interrupt polls all
         * completion queues.
         *
         * NOTE: We do NOT want to map a single completion queue (#0), because
         *       then an I/O submission and/or completion queue will overlap
         *       the admin submission or completion queue, and that can cause
         *       havoc when admin commands are submitted that don't return
         *       for long periods of time.
         *
         * NOTE: Chipsets supporting MSI-X *MIGHT* *NOT* properly support
         *       a normal pin-based level interrupt.  For example, the BPX
         *       NVMe SSD just leaves the level interrupt stuck on.  Do not
         *       disable MSI-X unless you have no choice.
         */
        if (msix_enable == 0 || error) {
                uint32_t irq_flags;
                int i;

                error = 0;
                sc->nirqs = 1;
                sc->irq_type = pci_alloc_1intr(dev, msi_enable,
                                               &sc->rid_irq[0], &irq_flags);
                sc->irq[0] = bus_alloc_resource_any(dev, SYS_RES_IRQ,
                                                 &sc->rid_irq[0], irq_flags);

                for (i = 0; i < ncpus; ++i)
                        sc->cputovect[i] = i + 1;
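
                /*
                 * With a single interrupt these are not real MSI-X
                 * vector indices; they only exist to fake up the
                 * per-cpu submission queue mapping described above.
                 */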
        }
        if (sc->irq[0] == NULL) {
                device_printf(dev, "unable to map interrupt\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        } else {
                const char *type;
                switch(sc->irq_type) {
                case PCI_INTR_TYPE_MSI:
                        type = "MSI";
                        break;
                case PCI_INTR_TYPE_MSIX:
                        type = "MSIX";
                        break;
                default:
                        type = "normal-int";
                        break;
                }
                device_printf(dev, "mapped %d %s IRQs\n", sc->nirqs, type);
        }

        /*
         * Make sure the chip is disabled, which will reset all controller
         * registers except for the admin queue registers.  The device
         * should already be disabled so this is usually instantaneous.
         * Use a fixed 5-second timeout in case it is not.  The remaining
         * register reads are deferred until after the device has been
         * disabled.
         */
        sc->entimo = hz * 5;
        error = nvme_enable(sc, 0);
        if (error) {
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Get capabilities and version, and report them.
         */
        sc->vers = nvme_read(sc, NVME_REG_VERS);
        sc->cap = nvme_read8(sc, NVME_REG_CAP);
        sc->maxqe = NVME_CAP_MQES_GET(sc->cap);
        sc->dstrd4 = NVME_CAP_DSTRD_GET(sc->cap);

        device_printf(dev, "NVME Version %u.%u maxqe=%u caps=%016jx\n",
                      NVME_VERS_MAJOR_GET(sc->vers),
                      NVME_VERS_MINOR_GET(sc->vers),
                      sc->maxqe, sc->cap);

        /*
         * Enable timeout: CAP.TO counts in 500ms increments (TO/2
         * seconds), so with hz ticks per second this converts to
         * TO * hz / 2 ticks.
         */
        sc->entimo = NVME_CAP_TIMEOUT_GET(sc->cap) * hz / 2; /* in ticks */
        ++sc->entimo;           /* fudge */

        /*
         * Validate maxqe.  To cap the amount of memory we reserve for
         * PRPs we limit maxqe to 256.  Also make sure it is a power of
         * two.
         */
        if (sc->maxqe < 2) {
                device_printf(dev,
                              "Attach failed, max queue entries (%d) "
                              "below minimum (2)\n", sc->maxqe);
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        if (sc->maxqe > 256)
                sc->maxqe = 256;
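        /*
         * Round maxqe down to a power of two: the loop leaves reg at
         * the first power of two greater than maxqe, so reg >> 1 is
         * the largest power of two <= maxqe (e.g. 100 -> 64).
         */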
        for (reg = 2; reg <= sc->maxqe; reg <<= 1)
                ;
        sc->maxqe = reg >> 1;

        /*
         * DMA tags
         *
         * PRP  - Worst case PRPs needed per queue is MAXPHYS / PAGE_SIZE
         *        (typically 64), multiplied by maxqe (typ 256).  Roughly
         *        ~128KB per queue.  Align for cache performance.  We actually
         *        need one more PRP per queue entry worst-case to handle
         *        buffer overlap, but we have an extra one in the command
         *        structure so we don't have to calculate that out.
         *
         *        Remember that we intend to allocate potentially many queues,
         *        so we don't want to bloat this too much.  A queue depth of
         *        256 is plenty.
         *
         * CMD - Storage for the submit queue.  maxqe * 64      (~16KB)
         *
         * RES - Storage for the completion queue.  maxqe * 16  (~4KB)
         *
         * ADM - Storage for admin command DMA data.  Maximum admin command
         *       DMA data is 4KB so reserve maxqe * 4KB (~1MB).  There is only
         *       one admin queue.
         *
         * NOTE: There are no boundary requirements for NVMe, but I specify a
         *       4MB boundary anyway because this reduces mass-bit flipping
         *       of address bits inside the controller when incrementing
         *       DMA addresses.  Why not?  Can't hurt.
         */
        sc->prp_bytes = sizeof(uint64_t) * (MAXPHYS / PAGE_SIZE) * sc->maxqe;
        sc->cmd_bytes = sizeof(nvme_subq_item_t) * sc->maxqe;
        sc->res_bytes = sizeof(nvme_comq_item_t) * sc->maxqe;
        sc->adm_bytes = NVME_MAX_ADMIN_BUFFER * sc->maxqe;
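
        /*
         * Per the sizes above, with maxqe at its 256 cap this works out
         * to roughly 128KB of PRPs per queue (typically 64 8-byte PRP
         * entries per queue entry), 16KB of 64-byte submission entries,
         * 4KB of 16-byte completion entries, and ~1MB of admin buffers.
         */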

        error = 0;

        error += bus_dma_tag_create(
                        NULL,                           /* parent tag */
                        PAGE_SIZE,                      /* alignment */
                        4 * 1024 * 1024,                /* boundary */
                        BUS_SPACE_MAXADDR,              /* loaddr? */
                        BUS_SPACE_MAXADDR,              /* hiaddr */
                        NULL,                           /* filter */
                        NULL,                           /* filterarg */
                        sc->prp_bytes,                  /* [max]size */
                        1,                              /* maxsegs */
                        sc->prp_bytes,                  /* maxsegsz */
                        0,                              /* flags */
                        &sc->prps_tag);                 /* return tag */

        error += bus_dma_tag_create(
                        NULL,                           /* parent tag */
                        PAGE_SIZE,                      /* alignment */
                        4 * 1024 * 1024,                /* boundary */
                        BUS_SPACE_MAXADDR,              /* loaddr? */
                        BUS_SPACE_MAXADDR,              /* hiaddr */
                        NULL,                           /* filter */
                        NULL,                           /* filterarg */
                        sc->cmd_bytes,                  /* [max]size */
                        1,                              /* maxsegs */
                        sc->cmd_bytes,                  /* maxsegsz */
                        0,                              /* flags */
                        &sc->sque_tag);                 /* return tag */

        error += bus_dma_tag_create(
                        NULL,                           /* parent tag */
                        PAGE_SIZE,                      /* alignment */
                        4 * 1024 * 1024,                /* boundary */
                        BUS_SPACE_MAXADDR,              /* loaddr? */
                        BUS_SPACE_MAXADDR,              /* hiaddr */
                        NULL,                           /* filter */
                        NULL,                           /* filterarg */
                        sc->res_bytes,                  /* [max]size */
                        1,                              /* maxsegs */
                        sc->res_bytes,                  /* maxsegsz */
                        0,                              /* flags */
                        &sc->cque_tag);                 /* return tag */

        error += bus_dma_tag_create(
                        NULL,                           /* parent tag */
                        PAGE_SIZE,                      /* alignment */
                        4 * 1024 * 1024,                /* boundary */
                        BUS_SPACE_MAXADDR,              /* loaddr? */
                        BUS_SPACE_MAXADDR,              /* hiaddr */
                        NULL,                           /* filter */
                        NULL,                           /* filterarg */
                        sc->adm_bytes,                  /* [max]size */
                        1,                              /* maxsegs */
                        sc->adm_bytes,                  /* maxsegsz */
                        0,                              /* flags */
                        &sc->adm_tag);                  /* return tag */

        if (error) {
                device_printf(dev, "unable to create dma tags\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Setup the admin queues (qid 0).
         */
        error = nvme_alloc_subqueue(sc, 0);
        if (error) {
                device_printf(dev, "unable to allocate admin subqueue\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        error = nvme_alloc_comqueue(sc, 0);
        if (error) {
                device_printf(dev, "unable to allocate admin comqueue\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Initialize the admin queue registers
         */
        reg = NVME_ATTR_COM_SET(sc->maxqe) | NVME_ATTR_SUB_SET(sc->maxqe);
        nvme_write(sc, NVME_REG_ADM_ATTR, reg);
        nvme_write8(sc, NVME_REG_ADM_SUBADR, (uint64_t)sc->subqueues[0].psubq);
        nvme_write8(sc, NVME_REG_ADM_COMADR, (uint64_t)sc->comqueues[0].pcomq);
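        /*
         * (These correspond to the AQA, ASQ, and ACQ registers in the
         * NVMe specification: the admin queue sizes and the physical
         * bases of the admin submission and completion rings.)
         */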

        /*
         * qemu appears to require bus mastering to be enabled here;
         * real hardware does not appear to require it.
         */
        pci_enable_busmaster(dev);

        /*
         * Other configuration registers
         */
        reg = NVME_CONFIG_IOSUB_ES_SET(6) |             /* 64 byte sub entry */
              NVME_CONFIG_IOCOM_ES_SET(4) |             /* 16 byte com entry */
              NVME_CONFIG_MEMPG_SET(PAGE_SHIFT) |       /* 4K pages */
              NVME_CONFIG_CSS_NVM;                      /* NVME command set */
        nvme_write(sc, NVME_REG_CONFIG, reg);
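        /*
         * The queue entry-size fields are log2-encoded: 6 for 64-byte
         * submission entries, 4 for 16-byte completion entries.  MEMPG
         * similarly takes the page shift (12 for 4K pages).
         */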

        reg = nvme_read(sc, NVME_REG_MEMSIZE);

        /*
         * Enable the chip for operation
         */
        error = nvme_enable(sc, 1);
        if (error) {
                nvme_enable(sc, 0);
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Start the admin thread.  This will also setup the admin queue
         * interrupt.
         */
        error = nvme_start_admin_thread(sc);
        if (error) {
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        lockmgr(&nvme_master_lock, LK_EXCLUSIVE);
        sc->flags |= NVME_SC_ATTACHED;
        TAILQ_INSERT_TAIL(&nvme_sc_list, sc, entry);
        lockmgr(&nvme_master_lock, LK_RELEASE);

        return(0);
}

/*
 * Device unload / detachment
 */
static int
nvme_pci_detach(device_t dev)
{
        nvme_softc_t *sc = device_get_softc(dev);
        int i;

        /*
         * Stop the admin thread
         */
        nvme_stop_admin_thread(sc);

        /*
         * Issue a normal shutdown and wait for completion
         */
        nvme_issue_shutdown(sc, 0);

        /*
         * Disable the chip
         */
        nvme_enable(sc, 0);

        /*
         * Free admin memory
         */
        nvme_free_subqueue(sc, 0);
        nvme_free_comqueue(sc, 0);

        /*
         * Release related resources.
         */
        for (i = 0; i < sc->nirqs; ++i) {
                if (sc->irq[i]) {
                        bus_release_resource(dev, SYS_RES_IRQ,
                                             sc->rid_irq[i], sc->irq[i]);
                        sc->irq[i] = NULL;
                        if (sc->irq_type == PCI_INTR_TYPE_MSIX)
                                pci_release_msix_vector(dev, sc->rid_irq[i]);
                }
        }
        switch(sc->irq_type) {
        case PCI_INTR_TYPE_MSI:
                pci_release_msi(dev);
                break;
        case PCI_INTR_TYPE_MSIX:
                pci_teardown_msix(dev);
                break;
        default:
                break;
        }

        /*
         * Release remaining chipset resources
         */
        if (sc->regs) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                                     sc->rid_regs, sc->regs);
                sc->regs = NULL;
        }
        if (sc->bar4) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                                     sc->rid_bar4, sc->bar4);
                sc->bar4 = NULL;
        }

        /*
         * Cleanup the DMA tags
         */
        if (sc->prps_tag) {
                bus_dma_tag_destroy(sc->prps_tag);
                sc->prps_tag = NULL;
        }
        if (sc->sque_tag) {
                bus_dma_tag_destroy(sc->sque_tag);
                sc->sque_tag = NULL;
        }
        if (sc->cque_tag) {
                bus_dma_tag_destroy(sc->cque_tag);
                sc->cque_tag = NULL;
        }
        if (sc->adm_tag) {
                bus_dma_tag_destroy(sc->adm_tag);
                sc->adm_tag = NULL;
        }

        if (sc->flags & NVME_SC_ATTACHED) {
                lockmgr(&nvme_master_lock, LK_EXCLUSIVE);
                sc->flags &= ~NVME_SC_ATTACHED;
                TAILQ_REMOVE(&nvme_sc_list, sc, entry);
                lockmgr(&nvme_master_lock, LK_RELEASE);
        }

        return (0);
}