/* sys/dev/disk/nvme/nvme_attach.c */
/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "nvme.h"

static int      nvme_pci_attach(device_t);
static int      nvme_pci_detach(device_t);

static const nvme_device_t nvme_devices[] = {
        /* Vendor-specific table goes here (see ahci for example) */
        { 0, 0, nvme_pci_attach, nvme_pci_detach, "NVME-PCIe" }
};
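
/*
 * The all-zero terminating entry above doubles as the generic fallback:
 * nvme_lookup_device() returns it for any device whose PCI class codes
 * identify it as NVMe, even without an explicit vendor/product match.
 */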

static int      nvme_msix_enable = 1;
TUNABLE_INT("hw.nvme.msix.enable", &nvme_msix_enable);
static int      nvme_msi_enable = 0;
TUNABLE_INT("hw.nvme.msi.enable", &nvme_msi_enable);

TAILQ_HEAD(, nvme_softc) nvme_sc_list = TAILQ_HEAD_INITIALIZER(nvme_sc_list);
struct lock nvme_master_lock = LOCK_INITIALIZER("nvmstr", 0, 0);

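/*
 * last_global_cpu rotates the starting target cpu for MSI-X vector
 * allocation so that multiple controllers spread their interrupts
 * across cpus instead of stacking them on the same ones.
 */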
static int last_global_cpu;

/*
 * Match during probe and attach.  The device does not yet have a softc.
 */
const nvme_device_t *
nvme_lookup_device(device_t dev)
{
        const nvme_device_t *ad;
        uint16_t vendor = pci_get_vendor(dev);
        uint16_t product = pci_get_device(dev);
        uint8_t class = pci_get_class(dev);
        uint8_t subclass = pci_get_subclass(dev);
        uint8_t progif = pci_read_config(dev, PCIR_PROGIF, 1);
        int is_nvme;

        /*
         * Generally speaking, if the PCI device does not identify as
         * NVMe we skip it.
         */
        if (class == PCIC_STORAGE && subclass == PCIS_STORAGE_NVM &&
            progif == PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0) {
                is_nvme = 1;
        } else {
                is_nvme = 0;
        }

        for (ad = &nvme_devices[0]; ad->vendor; ++ad) {
                if (ad->vendor == vendor && ad->product == product)
                        return (ad);
        }

        /*
         * The terminating ad entry is the default match if the PCI
         * device identifies as NVMe.
         */
        if (is_nvme == 0)
                ad = NULL;
        return (ad);
}

/*
 * Attach functions.  They all eventually fall through to nvme_pci_attach().
 */
static int
nvme_pci_attach(device_t dev)
{
        nvme_softc_t *sc = device_get_softc(dev);
        uint32_t reg;
        int error;
        int msi_enable;
        int msix_enable;

#if 0
        if (pci_read_config(dev, PCIR_COMMAND, 2) & 0x0400) {
                device_printf(dev, "BIOS disabled PCI interrupt, "
                                   "re-enabling\n");
                pci_write_config(dev, PCIR_COMMAND,
                        pci_read_config(dev, PCIR_COMMAND, 2) & ~0x0400, 2);
        }
#endif

        sc->dev = dev;

        /*
         * Map the register window
         */
        sc->rid_regs = PCIR_BAR(0);
        sc->regs = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
                                          &sc->rid_regs, RF_ACTIVE);
        if (sc->regs == NULL) {
                device_printf(dev, "unable to map registers\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        sc->iot = rman_get_bustag(sc->regs);
        sc->ioh = rman_get_bushandle(sc->regs);

        /*
         * NVMe allows the MSI-X table to be mapped to BAR 4/5.
         * Always try to map BAR4, but it's ok if it fails.  Must
         * be done prior to allocating our interrupts.
         */
        sc->rid_bar4 = PCIR_BAR(4);
        sc->bar4 = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
                                          &sc->rid_bar4, RF_ACTIVE);

        /*
         * Map the interrupt or initial interrupt which will be used for
         * the admin queue.  NVMe chipsets can potentially support a huge
         * number of MSI-X vectors but we really only need enough for
         * available cpus, plus 1.
         */
        msi_enable = device_getenv_int(dev, "msi.enable", nvme_msi_enable);
        msix_enable = device_getenv_int(dev, "msix.enable", nvme_msix_enable);

        error = 0;
        if (msix_enable) {
                int i;
                int cpu;

                sc->nirqs = pci_msix_count(dev);
                sc->irq_type = PCI_INTR_TYPE_MSIX;
                if (sc->nirqs > ncpus + 1)              /* max we need */
                        sc->nirqs = ncpus + 1;

                error = pci_setup_msix(dev);
                cpu = (last_global_cpu + 0) % ncpus;    /* GCC warn */
                for (i = 0; error == 0 && i < sc->nirqs; ++i) {
                        cpu = (last_global_cpu + i) % ncpus;
                        error = pci_alloc_msix_vector(dev, i,
                                                      &sc->rid_irq[i], cpu);
                        if (error)
                                break;
                        sc->irq[i] = bus_alloc_resource_any(dev, SYS_RES_IRQ,
                                                            &sc->rid_irq[i],
                                                            RF_ACTIVE);
                        /*
                         * We want this to overwrite queue 0's cpu vector
                         * when the cpus rotate through later on.
                         */
                        if (sc->cputovect[cpu] == 0)
                                sc->cputovect[cpu] = i;
                }

                /*
                 * If we did not iterate enough cpus (that is, there weren't
                 * enough irqs for all available cpus) we still need to
                 * finish out the sc->cputovect[] mapping.
                 */
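                /*
                 * Note that the wrap below deliberately skips vector 0
                 * (the vector used for the admin queue), so leftover
                 * cpus are mapped onto I/O vectors 1..nirqs-1 only.
                 */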
                while (error == 0) {
                        cpu = (cpu + 1) % ncpus;
                        i = (i + 1) % sc->nirqs;
                        if (i == 0)
                                i = 1;
                        if (sc->cputovect[cpu] != 0)
                                break;
                        sc->cputovect[cpu] = i;
                }

                if (error) {
                        while (--i >= 0) {
                                bus_release_resource(dev, SYS_RES_IRQ,
                                                     sc->rid_irq[i],
                                                     sc->irq[i]);
                                pci_release_msix_vector(dev, sc->rid_irq[i]);
                                sc->irq[i] = NULL;
                        }
                        /* leave error intact to fall through to normal */
                } else {
                        last_global_cpu = (last_global_cpu + sc->nirqs) % ncpus;
                        pci_enable_msix(dev);
                }
        }

        /*
         * If we have to use a normal interrupt we fake the cputovect[] in
         * order to try to map at least (ncpus) submission queues.  The admin
         * code will limit the number of completion queues to something
         * reasonable when nirqs is 1 since the single interrupt polls all
         * completion queues.
         *
         * NOTE: We do NOT want to map a single completion queue (#0), because
         *       then an I/O submission and/or completion queue will overlap
         *       the admin submission or completion queue, and that can cause
         *       havoc when admin commands are submitted that don't return
         *       for long periods of time.
         *
         * NOTE: Chipsets supporting MSI-X *MIGHT* *NOT* properly support
         *       a normal pin-based level interrupt.  For example, the BPX
         *       NVMe SSD just leaves the level interrupt stuck on.  Do not
         *       disable MSI-X unless you have no choice.
         */
        if (msix_enable == 0 || error) {
                uint32_t irq_flags;
                int i;

                error = 0;
                sc->nirqs = 1;
                sc->irq_type = pci_alloc_1intr(dev, msi_enable,
                                               &sc->rid_irq[0], &irq_flags);
                sc->irq[0] = bus_alloc_resource_any(dev, SYS_RES_IRQ,
                                                 &sc->rid_irq[0], irq_flags);

                for (i = 0; i < ncpus; ++i)
                        sc->cputovect[i] = i + 1;
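
                /*
                 * With a single interrupt these are not real MSI-X
                 * vector indices; they only exist to fake up the
                 * per-cpu submission queue mapping described above.
                 */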
        }
        if (sc->irq[0] == NULL) {
                device_printf(dev, "unable to map interrupt\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        } else {
                const char *type;
                switch(sc->irq_type) {
                case PCI_INTR_TYPE_MSI:
                        type = "MSI";
                        break;
                case PCI_INTR_TYPE_MSIX:
                        type = "MSIX";
                        break;
                default:
                        type = "normal-int";
                        break;
                }
                device_printf(dev, "mapped %d %s IRQs\n", sc->nirqs, type);
        }

        /*
         * Make sure the chip is disabled, which will reset all controller
         * registers except for the admin queue registers.  The device
         * should already be disabled so this is usually instantaneous.
         * Use a fixed 5-second timeout in case it is not.  The remaining
         * register reads are deferred until after the device has been
         * disabled.
         */
        sc->entimo = hz * 5;
        error = nvme_enable(sc, 0);
        if (error) {
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Get capabilities and version, and report them.
         */
        sc->vers = nvme_read(sc, NVME_REG_VERS);
        sc->cap = nvme_read8(sc, NVME_REG_CAP);
        sc->maxqe = NVME_CAP_MQES_GET(sc->cap);
        sc->dstrd4 = NVME_CAP_DSTRD_GET(sc->cap);

        device_printf(dev, "NVME Version %u.%u maxqe=%u caps=%016jx\n",
                      NVME_VERS_MAJOR_GET(sc->vers),
                      NVME_VERS_MINOR_GET(sc->vers),
                      sc->maxqe, sc->cap);

        /*
         * Enable timeout: CAP.TO counts in 500ms increments (TO/2
         * seconds), so with hz ticks per second this converts to
         * TO * hz / 2 ticks.
         */
        sc->entimo = NVME_CAP_TIMEOUT_GET(sc->cap) * hz / 2; /* in ticks */
        ++sc->entimo;           /* fudge */

        /*
         * Validate maxqe.  To cap the amount of memory we reserve for
         * PRPs we limit maxqe to 256.  Also make sure it is a power of
         * two.
         */
        if (sc->maxqe < 2) {
                device_printf(dev,
                              "Attach failed, max queue entries (%d) "
                              "below minimum (2)\n", sc->maxqe);
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        if (sc->maxqe > 256)
                sc->maxqe = 256;
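        /*
         * Round maxqe down to a power of two: the loop leaves reg at
         * the first power of two greater than maxqe, so reg >> 1 is
         * the largest power of two <= maxqe (e.g. 100 -> 64).
         */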
        for (reg = 2; reg <= sc->maxqe; reg <<= 1)
                ;
        sc->maxqe = reg >> 1;

        /*
         * DMA tags
         *
         * PRP  - Worst case PRPs needed per queue is MAXPHYS / PAGE_SIZE
         *        (typically 64), multiplied by maxqe (typ 256).  Roughly
         *        ~128KB per queue.  Align for cache performance.  We actually
         *        need one more PRP per queue entry worst-case to handle
         *        buffer overlap, but we have an extra one in the command
         *        structure so we don't have to calculate that out.
         *
         *        Remember that we intend to allocate potentially many queues,
         *        so we don't want to bloat this too much.  A queue depth of
         *        256 is plenty.
         *
         * CMD - Storage for the submit queue.  maxqe * 64      (~16KB)
         *
         * RES - Storage for the completion queue.  maxqe * 16  (~4KB)
         *
         * ADM - Storage for admin command DMA data.  Maximum admin command
         *       DMA data is 4KB so reserve maxqe * 4KB (~1MB).  There is only
         *       one admin queue.
         *
         * NOTE: There are no boundary requirements for NVMe, but I specify a
         *       4MB boundary anyway because this reduces mass-bit flipping
         *       of address bits inside the controller when incrementing
         *       DMA addresses.  Why not?  Can't hurt.
         */
        sc->prp_bytes = sizeof(uint64_t) * (MAXPHYS / PAGE_SIZE) * sc->maxqe;
        sc->cmd_bytes = sizeof(nvme_subq_item_t) * sc->maxqe;
        sc->res_bytes = sizeof(nvme_comq_item_t) * sc->maxqe;
        sc->adm_bytes = NVME_MAX_ADMIN_BUFFER * sc->maxqe;
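
        /*
         * Per the sizes above, with maxqe at its 256 cap this works out
         * to roughly 128KB of PRPs per queue (typically 64 8-byte PRP
         * entries per queue entry), 16KB of 64-byte submission entries,
         * 4KB of 16-byte completion entries, and ~1MB of admin buffers.
         */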

        error = 0;

        error += bus_dma_tag_create(
                        NULL,                           /* parent tag */
                        PAGE_SIZE,                      /* alignment */
                        4 * 1024 * 1024,                /* boundary */
                        BUS_SPACE_MAXADDR,              /* loaddr? */
                        BUS_SPACE_MAXADDR,              /* hiaddr */
                        NULL,                           /* filter */
                        NULL,                           /* filterarg */
                        sc->prp_bytes,                  /* [max]size */
                        1,                              /* maxsegs */
                        sc->prp_bytes,                  /* maxsegsz */
                        0,                              /* flags */
                        &sc->prps_tag);                 /* return tag */

        error += bus_dma_tag_create(
                        NULL,                           /* parent tag */
                        PAGE_SIZE,                      /* alignment */
                        4 * 1024 * 1024,                /* boundary */
                        BUS_SPACE_MAXADDR,              /* loaddr? */
                        BUS_SPACE_MAXADDR,              /* hiaddr */
                        NULL,                           /* filter */
                        NULL,                           /* filterarg */
                        sc->cmd_bytes,                  /* [max]size */
                        1,                              /* maxsegs */
                        sc->cmd_bytes,                  /* maxsegsz */
                        0,                              /* flags */
                        &sc->sque_tag);                 /* return tag */

        error += bus_dma_tag_create(
                        NULL,                           /* parent tag */
                        PAGE_SIZE,                      /* alignment */
                        4 * 1024 * 1024,                /* boundary */
                        BUS_SPACE_MAXADDR,              /* loaddr? */
                        BUS_SPACE_MAXADDR,              /* hiaddr */
                        NULL,                           /* filter */
                        NULL,                           /* filterarg */
                        sc->res_bytes,                  /* [max]size */
                        1,                              /* maxsegs */
                        sc->res_bytes,                  /* maxsegsz */
                        0,                              /* flags */
                        &sc->cque_tag);                 /* return tag */

        error += bus_dma_tag_create(
                        NULL,                           /* parent tag */
                        PAGE_SIZE,                      /* alignment */
                        4 * 1024 * 1024,                /* boundary */
                        BUS_SPACE_MAXADDR,              /* loaddr? */
                        BUS_SPACE_MAXADDR,              /* hiaddr */
                        NULL,                           /* filter */
                        NULL,                           /* filterarg */
                        sc->adm_bytes,                  /* [max]size */
                        1,                              /* maxsegs */
                        sc->adm_bytes,                  /* maxsegsz */
                        0,                              /* flags */
                        &sc->adm_tag);                  /* return tag */

        if (error) {
                device_printf(dev, "unable to create dma tags\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Setup the admin queues (qid 0).
         */
        error = nvme_alloc_subqueue(sc, 0);
        if (error) {
                device_printf(dev, "unable to allocate admin subqueue\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        error = nvme_alloc_comqueue(sc, 0);
        if (error) {
                device_printf(dev, "unable to allocate admin comqueue\n");
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Initialize the admin queue registers
         */
        reg = NVME_ATTR_COM_SET(sc->maxqe) | NVME_ATTR_SUB_SET(sc->maxqe);
        nvme_write(sc, NVME_REG_ADM_ATTR, reg);
        nvme_write8(sc, NVME_REG_ADM_SUBADR, (uint64_t)sc->subqueues[0].psubq);
        nvme_write8(sc, NVME_REG_ADM_COMADR, (uint64_t)sc->comqueues[0].pcomq);
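        /*
         * (These correspond to the AQA, ASQ, and ACQ registers in the
         * NVMe specification: the admin queue sizes and the physical
         * bases of the admin submission and completion rings.)
         */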

        /*
         * qemu appears to require bus mastering to be enabled here;
         * real hardware does not appear to require it.
         */
        pci_enable_busmaster(dev);

        /*
         * Other configuration registers
         */
        reg = NVME_CONFIG_IOSUB_ES_SET(6) |             /* 64 byte sub entry */
              NVME_CONFIG_IOCOM_ES_SET(4) |             /* 16 byte com entry */
              NVME_CONFIG_MEMPG_SET(PAGE_SHIFT) |       /* 4K pages */
              NVME_CONFIG_CSS_NVM;                      /* NVME command set */
        nvme_write(sc, NVME_REG_CONFIG, reg);
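        /*
         * The queue entry-size fields are log2-encoded: 6 for 64-byte
         * submission entries, 4 for 16-byte completion entries.  MEMPG
         * similarly takes the page shift (12 for 4K pages).
         */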

        reg = nvme_read(sc, NVME_REG_MEMSIZE);

        /*
         * Enable the chip for operation
         */
        error = nvme_enable(sc, 1);
        if (error) {
                nvme_enable(sc, 0);
                nvme_pci_detach(dev);
                return (ENXIO);
        }

        /*
         * Start the admin thread.  This will also setup the admin queue
         * interrupt.
         */
        error = nvme_start_admin_thread(sc);
        if (error) {
                nvme_pci_detach(dev);
                return (ENXIO);
        }
        lockmgr(&nvme_master_lock, LK_EXCLUSIVE);
        sc->flags |= NVME_SC_ATTACHED;
        TAILQ_INSERT_TAIL(&nvme_sc_list, sc, entry);
        lockmgr(&nvme_master_lock, LK_RELEASE);

        return(0);
}

/*
 * Device unload / detachment
 */
static int
nvme_pci_detach(device_t dev)
{
        nvme_softc_t *sc = device_get_softc(dev);
        int i;

        /*
         * Stop the admin thread
         */
        nvme_stop_admin_thread(sc);

        /*
         * Issue a normal shutdown and wait for completion
         */
        nvme_issue_shutdown(sc, 0);

        /*
         * Disable the chip
         */
        nvme_enable(sc, 0);

        /*
         * Free admin memory
         */
        nvme_free_subqueue(sc, 0);
        nvme_free_comqueue(sc, 0);

        /*
         * Release related resources.
         */
        for (i = 0; i < sc->nirqs; ++i) {
                if (sc->irq[i]) {
                        bus_release_resource(dev, SYS_RES_IRQ,
                                             sc->rid_irq[i], sc->irq[i]);
                        sc->irq[i] = NULL;
                        if (sc->irq_type == PCI_INTR_TYPE_MSIX)
                                pci_release_msix_vector(dev, sc->rid_irq[i]);
                }
        }
        switch(sc->irq_type) {
        case PCI_INTR_TYPE_MSI:
                pci_release_msi(dev);
                break;
        case PCI_INTR_TYPE_MSIX:
                pci_teardown_msix(dev);
                break;
        default:
                break;
        }

        /*
         * Release remaining chipset resources
         */
        if (sc->regs) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                                     sc->rid_regs, sc->regs);
                sc->regs = NULL;
        }
        if (sc->bar4) {
                bus_release_resource(dev, SYS_RES_MEMORY,
                                     sc->rid_bar4, sc->bar4);
                sc->bar4 = NULL;
        }

        /*
         * Cleanup the DMA tags
         */
        if (sc->prps_tag) {
                bus_dma_tag_destroy(sc->prps_tag);
                sc->prps_tag = NULL;
        }
        if (sc->sque_tag) {
                bus_dma_tag_destroy(sc->sque_tag);
                sc->sque_tag = NULL;
        }
        if (sc->cque_tag) {
                bus_dma_tag_destroy(sc->cque_tag);
                sc->cque_tag = NULL;
        }
        if (sc->adm_tag) {
                bus_dma_tag_destroy(sc->adm_tag);
                sc->adm_tag = NULL;
        }

        if (sc->flags & NVME_SC_ATTACHED) {
                lockmgr(&nvme_master_lock, LK_EXCLUSIVE);
                sc->flags &= ~NVME_SC_ATTACHED;
                TAILQ_REMOVE(&nvme_sc_list, sc, entry);
                lockmgr(&nvme_master_lock, LK_RELEASE);
        }

        return (0);
}