nvme - Improve likelihood of dump success
[dragonfly.git] / sys / dev / disk / nvme / nvme.c
1 /*
2  * Copyright (c) 2016-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * Most low-level chip related functions (other than attachment) reside in
36  * this module.  Most functions assume that the caller is already holding
37  * appropriate locks to prevent SMP collisions.
38  */
39
40 #include "nvme.h"
41
42 MALLOC_DEFINE(M_NVME, "NVMe Driver", "NVME");
43
44 /*
45  * DMA mapping callbacks.
46  */
47 static
48 void
49 nvme_dmamem_saveseg(void *info, bus_dma_segment_t *segs, int nsegs, int error)
50 {
51         KKASSERT(error == 0);
52         KKASSERT(nsegs == 1);
53         *(bus_addr_t *)info = segs->ds_addr;
54 }
55
56 /*
57  * Low-level chip enable/disable.
58  */
59 int
60 nvme_enable(nvme_softc_t *sc, int enable)
61 {
62         uint32_t reg;
63         int error = 0;
64         int base_ticks;
65
66         reg = nvme_read(sc, NVME_REG_CONFIG);
67         if (enable == 0 && (reg & NVME_CONFIG_EN)) {
68                 /*
69                  * Disable the chip so we can program it.
70                  */
71                 reg &= ~NVME_CONFIG_EN;
72                 nvme_write(sc, NVME_REG_CONFIG, reg);
73         } else if (enable && (reg & NVME_CONFIG_EN) == 0) {
74                 /*
75                  * Enable the chip once programmed.
76                  */
77                 reg |= NVME_CONFIG_EN;
78                 nvme_write(sc, NVME_REG_CONFIG, reg);
79         }
80         error = ENXIO;
81         base_ticks = ticks;
82         while ((int)(ticks - base_ticks) < sc->entimo) {
83                 reg = nvme_read(sc, NVME_REG_STATUS);
84                 if (enable == 0 && (reg & NVME_STATUS_RDY) == 0) {
85                         error = 0;
86                         break;
87                 }
88                 if (enable && (reg & NVME_STATUS_RDY)) {
89                         error = 0;
90                         break;
91                 }
92                 nvme_os_sleep(50);      /* 50ms poll */
93         }
94
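	/*
	 * The polling loop above is the CC.EN / CSTS.RDY handshake from the
	 * NVMe spec: flip the enable bit in the configuration register,
	 * then wait for the status register's RDY bit to agree with the
	 * requested state, giving up after sc->entimo ticks (a timeout
	 * presumably derived from the controller's advertised CAP.TO
	 * during attach).
	 */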
95         /*
96          * Interrupt masking (only applicable when MSI-X is not used; NVMe 3.1.3
97          * and 3.1.4 state that these registers should not be accessed with MSI-X)
98          */
99         if (error == 0 && sc->nirqs == 1) {
100                 if (enable) {
101                         nvme_write(sc, NVME_REG_INTSET, ~1);
102                         nvme_write(sc, NVME_REG_INTCLR, 1);
103                 } else {
104                         nvme_write(sc, NVME_REG_INTSET, ~1);
105                 }
106         }
107
108         if (error) {
109                 device_printf(sc->dev, "Cannot %s device\n",
110                               (enable ? "enable" : "disable"));
111         } else {
112 #if 0
113                 kprintf("gratuitous 15 second sleep\n");
114                 nvme_os_sleep(15000);
115                 kprintf("gratuitous 15 second sleep done\n");
116 #endif
117         }
118         return error;
119 }
120
121 /*
122  * Allocate submission and completion queues.  If qid is 0 we are allocating
123  * the ADMIN queues, otherwise we are allocating I/O queues.
124  */
125 int
126 nvme_alloc_subqueue(nvme_softc_t *sc, uint16_t qid)
127 {
128         nvme_subqueue_t *queue = &sc->subqueues[qid];
129         int error = 0;
130
131         /*
132  * For now, use the maximum queue size that was negotiated during
133  * the attach.
134          */
135         lockinit(&queue->lk, "nvqlk", 0, 0);
136         queue->sc = sc;
137         queue->nqe = sc->maxqe;
138         queue->qid = qid;
139         queue->subq_doorbell_reg = NVME_REG_SUBQ_BELL(qid, sc->dstrd4);
140
141         /*
142          * dma memory for the submission queue
143          */
144         if (error == 0) {
145                 error = bus_dmamem_alloc(sc->sque_tag, (void **)&queue->ksubq,
146                                          BUS_DMA_ZERO, &queue->sque_map);
147         }
148         if (error == 0) {
149                 error = bus_dmamap_load(sc->sque_tag, queue->sque_map,
150                                         queue->ksubq,
151                                         bus_dma_tag_getmaxsize(sc->sque_tag),
152                                         nvme_dmamem_saveseg, &queue->psubq,
153                                         0);
154         }
155
156         /*
157          * dma memory for enough PRPs to map MAXPHYS bytes of memory per
158          * request.  A MAXPHYS buffer which begins partially straddling
159  * a page boundary can still be accommodated because we have an
160          * additional PRP entry in cmd.head.
161          */
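	/*
	 * Worked example (illustrative numbers): with a MAXPHYS of 128KB
	 * and 4KB pages this reserves 32 PRP entries per request.  A
	 * transfer whose first byte is not page aligned consumes prp1 in
	 * cmd.head for the partial leading page, so the per-request table
	 * never needs more than MAXPHYS / PAGE_SIZE entries.
	 */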
162         if (error == 0) {
163                 error = bus_dmamem_alloc(sc->prps_tag, (void **)&queue->kprps,
164                                          BUS_DMA_ZERO, &queue->prps_map);
165         }
166         if (error == 0) {
167                 error = bus_dmamap_load(sc->prps_tag, queue->prps_map,
168                                         queue->kprps,
169                                         bus_dma_tag_getmaxsize(sc->prps_tag),
170                                         nvme_dmamem_saveseg, &queue->pprps,
171                                         0);
172         }
173
174         /*
175          * dma memory for admin data
176          */
177         if (qid == 0 && error == 0) {
178                 error = bus_dmamem_alloc(sc->adm_tag,
179                                          (void **)&queue->kdatapgs,
180                                          BUS_DMA_ZERO, &queue->adm_map);
181         }
182         if (qid == 0 && error == 0) {
183                 error = bus_dmamap_load(sc->adm_tag, queue->adm_map,
184                                         queue->kdatapgs,
185                                         bus_dma_tag_getmaxsize(sc->adm_tag),
186                                         nvme_dmamem_saveseg, &queue->pdatapgs,
187                                         0);
188         }
189
190         /*
191          * Driver request structures
192          */
193         if (error == 0) {
194                 nvme_request_t *req;
195                 uint32_t i;
196
197                 queue->reqary = kmalloc(sizeof(nvme_request_t) * queue->nqe,
198                                         M_NVME, M_WAITOK | M_ZERO);
199                 for (i = 0; i < queue->nqe; ++i) {
200                         req = &queue->reqary[i];
201                         if (i == 0) {
202                                 /*
203                                  * Set aside one request for dump operation
204                                  */
205                                 queue->dump_req = req;
206                         } else {
207                                 /*
208                                  * The rest go through the normal list
209                                  */
210                                 req->next_avail = queue->first_avail;
211                                 queue->first_avail = req;
212                         }
213                         req->subq = queue;
214                         req->comq = &sc->comqueues[queue->comqid];
215                         req->cmd_id = i;
216                         if (qid == 0) {
217                                 req->info = &queue->kdatapgs[i];
218                                 req->pinfo = queue->pdatapgs +
219                                              i * sizeof(nvme_admin_data_t);
220                         }
221                 }
222         }
223
224         /*
225          * Error handling
226          */
227         if (error)
228                 nvme_free_subqueue(sc, qid);
229         return error;
230 }
231
232 int
233 nvme_alloc_comqueue(nvme_softc_t *sc, uint16_t qid)
234 {
235         nvme_comqueue_t *queue = &sc->comqueues[qid];
236         int error = 0;
237
238         /*
239  * For now, use the maximum queue size that was negotiated during
240  * the attach.
241          */
242         lockinit(&queue->lk, "nvqlk", 0, 0);
243         queue->sc = sc;
244         queue->qid = qid;
245         queue->phase = NVME_COMQ_STATUS_PHASE;
246         queue->comq_doorbell_reg = NVME_REG_COMQ_BELL(qid, sc->dstrd4);
247
248         if (error == 0) {
249                 error = bus_dmamem_alloc(sc->cque_tag, (void **)&queue->kcomq,
250                                          BUS_DMA_ZERO, &queue->cque_map);
251         }
252         if (error == 0) {
253                 error = bus_dmamap_load(sc->cque_tag, queue->cque_map,
254                                         queue->kcomq,
255                                         bus_dma_tag_getmaxsize(sc->cque_tag),
256                                         nvme_dmamem_saveseg, &queue->pcomq,
257                                         0);
258         }
259
260         /*
261          * Set nqe last.  The comq polling loop tests this field and we
262          * do not want it to spuriously assume that the comq is initialized
263          * until it actually is.
264          */
265         if (error == 0)
266                 queue->nqe = sc->maxqe;
267
268         if (error)
269                 nvme_free_comqueue(sc, qid);
270         return error;
271 }
272
273 void
274 nvme_free_subqueue(nvme_softc_t *sc, uint16_t qid)
275 {
276         nvme_subqueue_t *queue = &sc->subqueues[qid];
277
278         queue->first_avail = NULL;
279         if (queue->reqary) {
280                 kfree(queue->reqary, M_NVME);
281                 queue->reqary = NULL;
282         }
283         if (queue->ksubq) {
284                 bus_dmamem_free(sc->sque_tag, queue->ksubq, queue->sque_map);
285                 bus_dmamap_unload(sc->sque_tag, queue->sque_map);
286                 bus_dmamap_destroy(sc->sque_tag, queue->sque_map);
287         }
288         if (queue->kprps) {
289                 bus_dmamem_free(sc->prps_tag, queue->kprps, queue->prps_map);
290                 bus_dmamap_unload(sc->prps_tag, queue->prps_map);
291                 bus_dmamap_destroy(sc->prps_tag, queue->prps_map);
292         }
293         if (queue->kdatapgs) {
294                 bus_dmamem_free(sc->adm_tag, queue->kdatapgs, queue->adm_map);
295                 bus_dmamap_unload(sc->adm_tag, queue->adm_map);
296                 bus_dmamap_destroy(sc->adm_tag, queue->adm_map);
297         }
298         bzero(queue, sizeof(*queue));
299 }
300
301 void
302 nvme_free_comqueue(nvme_softc_t *sc, uint16_t qid)
303 {
304         nvme_comqueue_t *queue = &sc->comqueues[qid];
305
306         /*
307          * Clear this field first so poll loops ignore the comq.
308          */
309         queue->nqe = 0;
310
311         if (queue->kcomq) {
312                 bus_dmamem_free(sc->cque_tag, queue->kcomq, queue->cque_map);
313                 bus_dmamap_unload(sc->cque_tag, queue->cque_map);
314                 bus_dmamap_destroy(sc->cque_tag, queue->cque_map);
315         }
316         bzero(queue, sizeof(*queue));
317 }
318
319 /*
320  * ADMIN AND I/O REQUEST HANDLING
321  */
322
323 /*
324  * Obtain a request and handle DMA mapping the supplied kernel buffer.
325  * Fields in cmd.head will be initialized and remaining fields will be zero'd.
326  * Caller is responsible for filling in remaining fields as appropriate.
327  *
328  * Caller must hold the queue lock.
329  */
330 nvme_request_t *
331 nvme_get_admin_request(nvme_softc_t *sc, uint8_t opcode)
332 {
333         nvme_request_t *req;
334
335         req = nvme_get_request(&sc->subqueues[0], opcode, NULL, 0);
336         req->cmd.head.prp1 = req->pinfo;
337         req->callback = NULL;
338
339         return req;
340 }
341
342 /*
343  * Common request initialization shared by the normal and dump request paths.
344  */
345
346 static __inline
347 void
348 _nvme_fill_request(nvme_subqueue_t *queue, uint8_t opcode,
349                    char *kva, size_t bytes,
350                    nvme_request_t *req)
351 {
352         /*
353          * Fill in the basic fields and do the DMA mapping.
354          */
355         req->next_avail = NULL;
356         KKASSERT(req->state == NVME_REQ_AVAIL);
357         req->state = NVME_REQ_ALLOCATED;
358         req->callback = NULL;
359         req->waiting = 0;
360
361         req->cmd.head.opcode = opcode;
362         req->cmd.head.flags = NVME_SUBQFLG_PRP | NVME_SUBQFLG_NORM;
363         req->cmd.head.cid = req->cmd_id;
364         req->cmd.head.nsid = 0;
365         req->cmd.head.mptr = 0;
366         req->cmd.head.prp1 = 0;
367         req->cmd.head.prp2 = 0;
368         req->cmd.dw10 = 0;
369         req->cmd.dw11 = 0;
370         req->cmd.dw12 = 0;
371         req->cmd.dw13 = 0;
372         req->cmd.dw14 = 0;
373         req->cmd.dw15 = 0;
374
375         if (kva) {
376                 size_t count = 0;
377                 size_t idx = 0;
378                 vm_paddr_t paddr;
379                 vm_paddr_t pprptab;
380                 uint64_t *kprptab;
381                 KKASSERT(bytes >= 0 && bytes <= MAXPHYS);
382
383                 kprptab = queue->kprps +
384                           (MAXPHYS / PAGE_SIZE) * req->cmd_id;
385                 pprptab = queue->pprps +
386                           (MAXPHYS / PAGE_SIZE) * req->cmd_id *
387                           sizeof(uint64_t);
388
389                 while (count < bytes) {
390                         paddr = vtophys(kva + count);
391                         if (idx == 0) {
392                                 KKASSERT((paddr & 3) == 0);
393                                 req->cmd.head.prp1 = paddr;
394                                 count += (((intptr_t)kva + PAGE_SIZE) &
395                                           ~(intptr_t)PAGE_MASK) -
396                                          (intptr_t)kva;
397                         } else if (idx == 1 && count + PAGE_SIZE >= bytes) {
398                                 KKASSERT((paddr & PAGE_MASK) == 0);
399                                 req->cmd.head.prp2 = paddr;
400                                 count += PAGE_SIZE;
401                         } else {
402                                 KKASSERT((paddr & PAGE_MASK) == 0);
403                                 /* if (idx == 1) -- not needed, just repeat */
404                                 req->cmd.head.prp2 = pprptab; /* repeat */
405                                 kprptab[idx - 1] = paddr;
406                                 count += PAGE_SIZE;
407                         }
408                         ++idx;
409                 }
410         }
411 }
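/*
 * PRP construction example (illustrative, assuming 4KB pages): a 16KB
 * transfer whose buffer starts 0x200 bytes into a page touches five
 * physical pages.  The loop above puts the first, partial page in prp1,
 * points prp2 at this request's slice of the PRP table, and fills the
 * table with the remaining four page addresses.  Only when everything
 * after the prp1 chunk fits in a single page is prp2 used as a direct
 * data pointer instead of a list pointer.
 */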
412
413
414 /*
415  * Obtain a request and handle DMA mapping the supplied kernel buffer.
416  * Fields in cmd.head will be initialized and remaining fields will be zero'd.
417  * Caller is responsible for filling in remaining fields as appropriate.
418  *
419  * May return NULL if no requests are available or if there is no room in
420  * the submission queue to handle it (should only be possible on an I/O queue,
421  * admin queue operations are managed).
422  *
423  * Caller should NOT hold the queue lock.
424  */
425 nvme_request_t *
426 nvme_get_request(nvme_subqueue_t *queue, uint8_t opcode,
427                  char *kva, size_t bytes)
428 {
429         nvme_request_t *req;
430         nvme_request_t *next;
431
432         /*
433          * No easy lockless way to pull a new request off.  We have to check
434          * for a number of conditions and there may be multiple threads
435          * making this call simultaneously, which complicates matters even
436          * more.
437          */
438         lockmgr(&queue->lk, LK_EXCLUSIVE);
439
440         /*
441          * Make sure the submission queue has room to accommodate the
442          * request.  Requests can be completed out of order so the
443          * submission ring could still be full even though we have
444          * requests available.
445          */
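	/*
	 * Worked example (illustrative numbers): with nqe = 16,
	 * subq_head = 3, subq_tail = 3 and unsubmitted = 15, the test is
	 * (3 + 15 + 1) % 16 == 3 == subq_head, so the ring is treated as
	 * full and the caller's BIO is deferred for requeue.
	 */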
446         if ((queue->subq_tail + queue->unsubmitted + 1) % queue->nqe ==
447             queue->subq_head) {
448                 lockmgr(&queue->lk, LK_RELEASE);
449                 KKASSERT(queue->qid != 0);
450                 atomic_swap_int(&queue->signal_requeue, 1);
451
452                 return NULL;
453         }
454
455         /*
456          * Pop the next available request off of the first_avail linked
457          * list.  An atomic op must be used here because nvme_put_request()
458          * returns requests to the list without holding queue->lk.
459          */
460         for (;;) {
461                 req = queue->first_avail;
462                 cpu_ccfence();
463                 if (req == NULL) {
464                         lockmgr(&queue->lk, LK_RELEASE);
465                         KKASSERT(queue->qid != 0);
466                         atomic_swap_int(&queue->signal_requeue, 1);
467
468                         return NULL;
469                 }
470                 next = req->next_avail;
471                 if (atomic_cmpset_ptr(&queue->first_avail, req, next))
472                         break;
473         }
474
475         /*
476          * We have to keep track of unsubmitted requests in order to be
477          * able to properly check whether the ring is full or not (check
478          * is done at the top of this procedure, above).
479          */
480         ++queue->unsubmitted;
481         lockmgr(&queue->lk, LK_RELEASE);
482
483         _nvme_fill_request(queue, opcode, kva, bytes, req);
484
485         return req;
486 }
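#if 0
/*
 * Reference-only sketch (not compiled) of the normal request life cycle
 * implemented by the routines above.  The real consumers live in the
 * disk front-end; the opcode is passed in here because the I/O opcode
 * constants are not named in this file.
 */
static int
nvme_example_io(nvme_subqueue_t *subq, uint8_t opcode, char *kva, size_t bytes)
{
	nvme_request_t *req;
	int status;

	req = nvme_get_request(subq, opcode, kva, bytes);
	if (req == NULL)		/* ring full, BIO should be requeued */
		return (EWOULDBLOCK);
	/* fill in req->cmd.head.nsid and req->cmd.dw10-dw15 as needed */
	lockmgr(&subq->lk, LK_EXCLUSIVE);
	nvme_submit_request(req);	/* rings the submission doorbell */
	lockmgr(&subq->lk, LK_RELEASE);
	status = nvme_wait_request(req);
	nvme_put_request(req);

	return (status);
}
#endif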
487
488 /*
489  * dump path only, cannot block.  Allow the lock to fail and bump
490  * queue->unsubmitted anyway.
491  */
492 nvme_request_t *
493 nvme_get_dump_request(nvme_subqueue_t *queue, uint8_t opcode,
494                  char *kva, size_t bytes)
495 {
496         nvme_request_t *req;
497         int error;
498
499         error = lockmgr(&queue->lk, LK_EXCLUSIVE | LK_NOWAIT);
500         req = queue->dump_req;
501         ++queue->unsubmitted;
502         if (error == 0)
503                 lockmgr(&queue->lk, LK_RELEASE);
504         _nvme_fill_request(queue, opcode, kva, bytes, req);
505
506         return req;
507 }
508
509 /*
510  * Submit request for execution.  This will doorbell the subq.
511  *
512  * Caller must hold the queue lock.
513  */
514 void
515 nvme_submit_request(nvme_request_t *req)
516 {
517         nvme_subqueue_t *queue = req->subq;
518         nvme_allcmd_t *cmd;
519
520         cmd = &queue->ksubq[queue->subq_tail];
521         --queue->unsubmitted;
522         if (++queue->subq_tail == queue->nqe)
523                 queue->subq_tail = 0;
524         KKASSERT(queue->subq_tail != queue->subq_head);
525         *cmd = req->cmd;
526         cpu_sfence();   /* needed? */
527         req->state = NVME_REQ_SUBMITTED;
528         nvme_write(queue->sc, queue->subq_doorbell_reg, queue->subq_tail);
529 }
530
531 /*
532  * Wait for a request to complete.
533  *
534  * Caller does not need to hold the queue lock.  This routine acquires
535  * the completion queue lock itself and may sleep on it while polling
536  * for the request to complete.
537  */
538 int
539 nvme_wait_request(nvme_request_t *req)
540 {
541         struct lock *lk;
542         int code;
543
544         req->waiting = 1;
545         if (req->state != NVME_REQ_COMPLETED) {
546                 lk = &req->comq->lk;
547                 cpu_lfence();
548                 lockmgr(lk, LK_EXCLUSIVE);
549                 while (req->state == NVME_REQ_SUBMITTED) {
550                         nvme_poll_completions(req->comq, lk);
551                         if (req->state != NVME_REQ_SUBMITTED)
552                                 break;
553                         lksleep(req, lk, 0, "nvwait", hz);
554                 }
555                 lockmgr(lk, LK_RELEASE);
556                 KKASSERT(req->state == NVME_REQ_COMPLETED);
557         }
558         cpu_lfence();
559         code = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);
560
561         return code;
562 }
563
564 /*
565  * dump path only, we cannot block, and the lock is allowed
566  * to fail.  But still try to play nice with interrupt threads.
567  */
568 int
569 nvme_poll_request(nvme_request_t *req)
570 {
571         struct lock *lk;
572         int code;
573         int didlock = 500;      /* 500uS max */
574
575         req->waiting = 1;
576         if (req->state != NVME_REQ_COMPLETED) {
577                 lk = &req->comq->lk;
578                 cpu_lfence();
579                 while (lockmgr(lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
580                         if (--didlock == 0)
581                                 break;
582                         tsc_delay(1000);        /* 1uS */
583                 }
584                 while (req->state == NVME_REQ_SUBMITTED) {
585                         nvme_poll_completions(req->comq, lk);
586                         if (req->state != NVME_REQ_SUBMITTED)
587                                 break;
588                         lwkt_switch();
589                 }
590                 if (didlock)
591                         lockmgr(lk, LK_RELEASE);
592                 KKASSERT(req->state == NVME_REQ_COMPLETED);
593         }
594         cpu_lfence();
595         code = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);
596
597         return code;
598 }
599
600 /*
601  * Put the request away, making it available for reuse.  If this is an admin
602  * request its auxiliary data page is also released for reuse.
603  *
604  * Caller does NOT have to hold the queue lock.
605  */
606 void
607 nvme_put_request(nvme_request_t *req)
608 {
609         nvme_subqueue_t *queue = req->subq;
610         nvme_request_t *next;
611
612         /*
613          * Insert on head for best cache reuse.
614          */
615         KKASSERT(req->state == NVME_REQ_COMPLETED);
616         req->state = NVME_REQ_AVAIL;
617         for (;;) {
618                 next = queue->first_avail;
619                 cpu_ccfence();
620                 req->next_avail = next;
621                 if (atomic_cmpset_ptr(&queue->first_avail, next, req))
622                         break;
623         }
624
625         /*
626          * If BIOs were deferred due to lack of request space, signal the
627          * admin thread to requeue them.  This is a bit messy and normally
628          * should not happen due to the large number of queue entries nvme
629          * usually has.  Let it race for now (admin has a 1hz tick).
630          */
631         if (atomic_swap_int(&queue->signal_requeue, 0)) {
632                 atomic_set_int(&queue->sc->admin_signal, ADMIN_SIG_REQUEUE);
633                 wakeup(&queue->sc->admin_signal);
634         }
635 }
636
637 /*
638  * dump path only.
639  */
640 void
641 nvme_put_dump_request(nvme_request_t *req)
642 {
643         KKASSERT(req->state == NVME_REQ_COMPLETED);
644         req->state = NVME_REQ_AVAIL;
645 }
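/*
 * Taken together, the dump-only variants give the crash dump path a
 * request cycle that mirrors the normal one without ever blocking:
 * nvme_get_dump_request() hands out the reserved dump_req,
 * nvme_submit_request() rings the doorbell as usual, nvme_poll_request()
 * spins/yields instead of sleeping, and nvme_put_dump_request() marks
 * dump_req available again.  The authoritative caller ordering lives in
 * the disk front-end's dump entry point; this is only a summary.
 */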
646
647 /*
648  * Poll for completions on queue, copy the 16-byte hw result entry
649  * into the request and poke the doorbell to update the controller's
650  * understanding of comq_head.
651  *
652  * If lk is non-NULL it will be passed to the callback which typically
653  * releases it temporarily when calling biodone() or doing other complex
654  * work on the result.
655  *
656  * Caller must usually hold comq->lk.
657  */
658 void
659 nvme_poll_completions(nvme_comqueue_t *comq, struct lock *lk)
660 {
661         nvme_softc_t *sc = comq->sc;
662         nvme_request_t *req;
663         nvme_subqueue_t *subq;
664         nvme_allres_t *res;
665 #if 0
666         int didwork = 0;
667 #endif
668
669         KKASSERT(comq->comq_tail < comq->nqe);
670         cpu_lfence();           /* needed prior to first phase test */
671         for (;;) {
672                 /*
673                  * WARNING! LOCK MAY HAVE BEEN TEMPORARILY LOST DURING LOOP.
674                  */
675                 res = &comq->kcomq[comq->comq_tail];
676                 if ((res->tail.status ^ comq->phase) & NVME_COMQ_STATUS_PHASE)
677                         break;
678
679                 /*
680                  * Process result on completion queue.
681                  *
682                  * Bump comq_tail, flip the phase detect when we roll-over.
683                  * doorbell every 1/4 queue and at the end of the loop.
684                  */
685                 if (++comq->comq_tail == comq->nqe) {
686                         comq->comq_tail = 0;
687                         comq->phase ^= NVME_COMQ_STATUS_PHASE;
688                 }
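		/*
		 * Phase note: the controller writes entries with the phase
		 * bit set on its first pass through the ring and inverts it
		 * on each subsequent pass, so flipping our expected phase at
		 * roll-over (above) keeps the validity test working without
		 * ever having to zero the completion ring.
		 */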
689
690                 /*
691                  * WARNING! I imploded the chip by reusing a command id
692                  *          before it was discarded in the completion queue
693                  *          via the doorbell, so for now we always write
694                  *          the doorbell before marking the request as
695                  *          COMPLETED (it can be reused instantly upon
696                  *          being marked).
697                  */
698 #if 0
699                 if (++didwork == (comq->nqe >> 2)) {
700                         didwork = 0;
701                         nvme_write(comq->sc, comq->comq_doorbell_reg,
702                                    comq->comq_tail);
703                 }
704 #endif
705                 cpu_lfence();   /* needed prior to content check */
706
707                 /*
708                  * Locate the request and related submission queue.  The
709                  * request could be on a different queue.  A submission
710                  * queue can have only one completion queue, so we can
711                  * update subq_head without locking the submission queue.
712                  */
713                 subq = &sc->subqueues[res->tail.subq_id];
714                 subq->subq_head = res->tail.subq_head_ptr;
715                 req = &subq->reqary[res->tail.cmd_id];
716
717                 /*
718                  * Copy the fields and wakeup anyone waiting on req.
719                  * The response field in the completion queue can be reused
720                  * once we doorbell which is why we make a copy.
721                  */
722                 KKASSERT(req->state == NVME_REQ_SUBMITTED &&
723                          req->comq == comq);
724                 req->res = *res;
725                 nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail);
726                 cpu_sfence();
727                 req->state = NVME_REQ_COMPLETED;
728                 if (req->callback) {
729                         req->callback(req, lk);
730                 } else if (req->waiting) {
731                         wakeup(req);
732                 }
733         }
734 #if 0
735         if (didwork)
736                 nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail);
737 #endif
738 }
739
740 /*
741  * Core interrupt handler (called from dedicated interrupt thread, possibly
742  * preempts other threads).
743  *
744  * NOTE: For pin-based level interrupts, the chipset interrupt is cleared
745  *       automatically once all the head doorbells are updated.  However,
746  *       most chipsets assume MSI-X will be used and MAY NOT IMPLEMENT
747  *       pin-based interrupts properly.  I found the BPX card, for example,
748  *       is unable to clear a pin-based interrupt.
749  */
750 void
751 nvme_intr(void *arg)
752 {
753         nvme_comqueue_t *comq = arg;
754         nvme_softc_t *sc;
755         int i;
756         int skip;
757
758         /*
759          * Process all completion queues associated with this vector.  The
760          * interrupt is masked in the APIC.  Do NOT mess with the NVMe
761          * masking registers because (1) We don't need to and it wastes time,
762          * and (2) We aren't supposed to touch them if using MSI-X anyway.
763          */
764         sc = comq->sc;
765         if (sc->nirqs == 1)
766                 skip = 1;
767         else
768                 skip = sc->nirqs - 1;
769
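	/*
	 * Example (illustrative): with nirqs == 4, skip == 3, so the
	 * handler whose comq has qid 2 walks completion queues 2, 5, 8, ...
	 * stepping by three until it runs past niocomqs.  Each I/O vector
	 * thus services every (nirqs - 1)'th completion queue.
	 */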
770         for (i = comq->qid; i <= sc->niocomqs; i += skip) {
771                 if (comq->nqe) {
772                         lockmgr(&comq->lk, LK_EXCLUSIVE);
773                         nvme_poll_completions(comq, &comq->lk);
774                         lockmgr(&comq->lk, LK_RELEASE);
775                 }
776                 comq += skip;
777         }
778 }
779
780 /*
781  * ADMIN HELPER COMMAND ROLLUP FUNCTIONS
782  */
783 /*
784  * Issue command to create a submission queue.
785  */
786 int
787 nvme_create_subqueue(nvme_softc_t *sc, uint16_t qid)
788 {
789         nvme_request_t *req;
790         nvme_subqueue_t *subq = &sc->subqueues[qid];
791         int status;
792
793         req = nvme_get_admin_request(sc, NVME_OP_CREATE_SUBQ);
794         req->cmd.head.prp1 = subq->psubq;
795         req->cmd.crsub.subq_id = qid;
796         req->cmd.crsub.subq_size = subq->nqe - 1;       /* 0's based value */
797         req->cmd.crsub.flags = NVME_CREATESUB_PC | NVME_CREATESUB_PRI_URG;
798         req->cmd.crsub.comq_id = subq->comqid;
799
800         nvme_submit_request(req);
801         status = nvme_wait_request(req);
802         nvme_put_request(req);
803
804         return status;
805 }
806
807 /*
808  * Issue command to create a completion queue.
809  */
810 int
811 nvme_create_comqueue(nvme_softc_t *sc, uint16_t qid)
812 {
813         nvme_request_t *req;
814         nvme_comqueue_t *comq = &sc->comqueues[qid];
815         int status;
816         int error;
817         uint16_t ivect;
818
819         error = 0;
820         if (sc->nirqs > 1) {
821                 ivect = 1 + (qid - 1) % (sc->nirqs - 1);
822                 if (qid && ivect == qid) {
823                         error = bus_setup_intr(sc->dev, sc->irq[ivect],
824                                                 INTR_MPSAFE | INTR_HIFREQ,
825                                                 nvme_intr,
826                                                 &sc->comqueues[ivect],
827                                                 &sc->irq_handle[ivect],
828                                                 NULL);
829                 }
830         } else {
831                 ivect = 0;
832         }
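	/*
	 * Example (illustrative): with nirqs == 4, I/O queues 1-3 map to
	 * vectors 1-3 and queue 4 wraps back to vector 1.  The handler is
	 * only installed the first time a qid lands on its own vector
	 * (ivect == qid); later queues sharing the vector reuse it via the
	 * skip logic in nvme_intr().
	 */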
833         if (error)
834                 return error;
835
836         req = nvme_get_admin_request(sc, NVME_OP_CREATE_COMQ);
837         req->cmd.head.prp1 = comq->pcomq;
838         req->cmd.crcom.comq_id = qid;
839         req->cmd.crcom.comq_size = comq->nqe - 1;       /* 0's based value */
840         req->cmd.crcom.ivect = ivect;
841         req->cmd.crcom.flags = NVME_CREATECOM_PC | NVME_CREATECOM_IEN;
842
843         nvme_submit_request(req);
844         status = nvme_wait_request(req);
845         nvme_put_request(req);
846
847         return status;
848 }
849
850 /*
851  * Issue command to delete a submission queue.
852  */
853 int
854 nvme_delete_subqueue(nvme_softc_t *sc, uint16_t qid)
855 {
856         nvme_request_t *req;
857         /*nvme_subqueue_t *subq = &sc->subqueues[qid];*/
858         int status;
859
860         req = nvme_get_admin_request(sc, NVME_OP_DELETE_SUBQ);
861         req->cmd.head.prp1 = 0;
862         req->cmd.delete.qid = qid;
863
864         nvme_submit_request(req);
865         status = nvme_wait_request(req);
866         nvme_put_request(req);
867
868         return status;
869 }
870
871 /*
872  * Issue command to delete a completion queue.
873  */
874 int
875 nvme_delete_comqueue(nvme_softc_t *sc, uint16_t qid)
876 {
877         nvme_request_t *req;
878         /*nvme_comqueue_t *comq = &sc->comqueues[qid];*/
879         int status;
880         uint16_t ivect;
881
882         req = nvme_get_admin_request(sc, NVME_OP_DELETE_COMQ);
883         req->cmd.head.prp1 = 0;
884         req->cmd.delete.qid = qid;
885
886         nvme_submit_request(req);
887         status = nvme_wait_request(req);
888         nvme_put_request(req);
889
890         if (qid && sc->nirqs > 1) {
891                 ivect = 1 + (qid - 1) % (sc->nirqs - 1);
892                 if (ivect == qid) {
893                         bus_teardown_intr(sc->dev,
894                                           sc->irq[ivect],
895                                           sc->irq_handle[ivect]);
896                 }
897         }
898
899         return status;
900 }
901
902 /*
903  * Issue friendly shutdown to controller.
904  */
905 int
906 nvme_issue_shutdown(nvme_softc_t *sc, int dopoll)
907 {
908         uint32_t reg;
909         int base_ticks;
910         int error;
911
912         /*
913          * Put us in shutdown
914          */
915         reg = nvme_read(sc, NVME_REG_CONFIG);
916         reg &= ~NVME_CONFIG_SHUT_MASK;
917         reg |= NVME_CONFIG_SHUT_NORM;
918         nvme_write(sc, NVME_REG_CONFIG, reg);
919
920         /*
921          * Wait up to 10 seconds for acknowledgement
922          */
923         error = ENXIO;
924         base_ticks = ticks;
925         while ((int)(ticks - base_ticks) < 10 * 20) {
926                 reg = nvme_read(sc, NVME_REG_STATUS);
927                 if ((reg & NVME_STATUS_SHUT_MASK) & NVME_STATUS_SHUT_DONE) {
928                         error = 0;
929                         break;
930                 }
931                 if (dopoll == 0)
932                         nvme_os_sleep(50);      /* 50ms poll */
933         }
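	/*
	 * The loop above is the CC.SHN / CSTS.SHST shutdown handshake from
	 * the NVMe spec.  When dopoll is non-zero (dump/panic context) we
	 * avoid sleeping and simply busy-poll the status register instead.
	 */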
934         if (error)
935                 device_printf(sc->dev, "Unable to shutdown chip nicely\n");
936         else
937                 device_printf(sc->dev, "Normal chip shutdown succeeded\n");
938
939         return error;
940 }
941
942 /*
943  * Make space-padded serial and model number strings more readable.
944  */
945 size_t
946 string_cleanup(char *str, int domiddle)
947 {
948         size_t i;
949         size_t j;
950         int atbeg = 1;
951
952         for (i = j = 0; str[i]; ++i) {
953                 if ((str[i] == ' ' || str[i] == '\r') &&
954                     (atbeg || domiddle)) {
955                         continue;
956                 } else {
957                         atbeg = 0;
958                 }
959                 str[j] = str[i];
960                 ++j;
961         }
962         while (domiddle == 0 && j > 0 && (str[j-1] == ' ' || str[j-1] == '\r'))
963                 --j;
964         str[j] = 0;
965         if (domiddle == 0) {
966                 for (j = 0; str[j]; ++j) {
967                         if (str[j] == ' ')
968                                 str[j] = '_';
969                 }
970         }
971
972         return j;
973 }
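/*
 * Example (illustrative): a serial number of "  S1234   " cleans up to
 * "S1234" and a model string of "ACME NVMe Drive   " becomes
 * "ACME_NVMe_Drive" when domiddle is 0 (embedded spaces are kept but
 * converted to underscores); with domiddle non-zero all spaces and CRs
 * are removed outright.
 */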