nvme - Improve likelihood of dump success
[dragonfly.git] / sys / dev / disk / nvme / nvme.c
1 /*
2  * Copyright (c) 2016-2018 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 /*
35  * Most low-level chip related functions (other than attachment) reside in
36  * this module.  Most functions assume that the caller is already holding
37  * appropriate locks to prevent SMP collisions.
38  */
39
40 #include "nvme.h"
41
42 MALLOC_DEFINE(M_NVME, "NVMe Driver", "NVME");
43
44 /*
45  * DMA mapping callbacks.
46  */
47 static
48 void
49 nvme_dmamem_saveseg(void *info, bus_dma_segment_t *segs, int nsegs, int error)
50 {
51         KKASSERT(error == 0);
52         KKASSERT(nsegs == 1);
53         *(bus_addr_t *)info = segs->ds_addr;
54 }
55
56 /*
57  * Low-level chip enable/disable.
58  */
59 int
60 nvme_enable(nvme_softc_t *sc, int enable)
61 {
62         uint32_t reg;
63         int error = 0;
64         int base_ticks;
65
66         reg = nvme_read(sc, NVME_REG_CONFIG);
67         if (enable == 0 && (reg & NVME_CONFIG_EN)) {
68                 /*
69                  * Disable the chip so we can program it.
70                  */
71                 reg &= ~NVME_CONFIG_EN;
72                 nvme_write(sc, NVME_REG_CONFIG, reg);
73         } else if (enable && (reg & NVME_CONFIG_EN) == 0) {
74                 /*
75                  * Enable the chip once programmed.
76                  */
77                 reg |= NVME_CONFIG_EN;
78                 nvme_write(sc, NVME_REG_CONFIG, reg);
79         }
80         error = ENXIO;
81         base_ticks = ticks;
82         while ((int)(ticks - base_ticks) < sc->entimo) {
83                 reg = nvme_read(sc, NVME_REG_STATUS);
84                 if (enable == 0 && (reg & NVME_STATUS_RDY) == 0) {
85                         error = 0;
86                         break;
87                 }
88                 if (enable && (reg & NVME_STATUS_RDY)) {
89                         error = 0;
90                         break;
91                 }
92                 nvme_os_sleep(50);      /* 50ms poll */
93         }
94
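	/*
	 * The polling loop above is the CC.EN / CSTS.RDY handshake from the
	 * NVMe spec: flip the enable bit in the configuration register,
	 * then wait for the status register's RDY bit to agree with the
	 * requested state, giving up after sc->entimo ticks (a timeout
	 * presumably derived from the controller's advertised CAP.TO
	 * during attach).
	 */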
95         /*
96          * Interrupt masking (only applicable when MSI-X is not used; NVMe 3.1.3
97          * and 3.1.4 state that these registers should not be accessed with MSI-X)
98          */
99         if (error == 0 && sc->nirqs == 1) {
100                 if (enable) {
101                         nvme_write(sc, NVME_REG_INTSET, ~1);
102                         nvme_write(sc, NVME_REG_INTCLR, 1);
103                 } else {
104                         nvme_write(sc, NVME_REG_INTSET, ~1);
105                 }
106         }
107
108         if (error) {
109                 device_printf(sc->dev, "Cannot %s device\n",
110                               (enable ? "enable" : "disable"));
111         } else {
112 #if 0
113                 kprintf("gratuitous 15 second sleep\n");
114                 nvme_os_sleep(15000);
115                 kprintf("gratuitous 15 second sleep done\n");
116 #endif
117         }
118         return error;
119 }
120
121 /*
122  * Allocate submission and completion queues.  If qid is 0 we are allocating
123  * the ADMIN queues, otherwise we are allocating I/O queues.
124  */
125 int
126 nvme_alloc_subqueue(nvme_softc_t *sc, uint16_t qid)
127 {
128         nvme_subqueue_t *queue = &sc->subqueues[qid];
129         int error = 0;
130
131         /*
132  * For now, use the maximum queue size that was negotiated during
133  * the attach.
134          */
135         lockinit(&queue->lk, "nvqlk", 0, 0);
136         queue->sc = sc;
137         queue->nqe = sc->maxqe;
138         queue->qid = qid;
139         queue->subq_doorbell_reg = NVME_REG_SUBQ_BELL(qid, sc->dstrd4);
140
141         /*
142          * dma memory for the submission queue
143          */
144         if (error == 0) {
145                 error = bus_dmamem_alloc(sc->sque_tag, (void **)&queue->ksubq,
146                                          BUS_DMA_ZERO, &queue->sque_map);
147         }
148         if (error == 0) {
149                 error = bus_dmamap_load(sc->sque_tag, queue->sque_map,
150                                         queue->ksubq,
151                                         bus_dma_tag_getmaxsize(sc->sque_tag),
152                                         nvme_dmamem_saveseg, &queue->psubq,
153                                         0);
154         }
155
156         /*
157          * dma memory for enough PRPs to map MAXPHYS bytes of memory per
158          * request.  A MAXPHYS buffer which begins partially straddling
159  * a page boundary can still be accommodated because we have an
160          * additional PRP entry in cmd.head.
161          */
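	/*
	 * Worked example (illustrative numbers): with a MAXPHYS of 128KB
	 * and 4KB pages this reserves 32 PRP entries per request.  A
	 * transfer whose first byte is not page aligned consumes prp1 in
	 * cmd.head for the partial leading page, so the per-request table
	 * never needs more than MAXPHYS / PAGE_SIZE entries.
	 */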
162         if (error == 0) {
163                 error = bus_dmamem_alloc(sc->prps_tag, (void **)&queue->kprps,
164                                          BUS_DMA_ZERO, &queue->prps_map);
165         }
166         if (error == 0) {
167                 error = bus_dmamap_load(sc->prps_tag, queue->prps_map,
168                                         queue->kprps,
169                                         bus_dma_tag_getmaxsize(sc->prps_tag),
170                                         nvme_dmamem_saveseg, &queue->pprps,
171                                         0);
172         }
173
174         /*
175          * dma memory for admin data
176          */
177         if (qid == 0 && error == 0) {
178                 error = bus_dmamem_alloc(sc->adm_tag,
179                                          (void **)&queue->kdatapgs,
180                                          BUS_DMA_ZERO, &queue->adm_map);
181         }
182         if (qid == 0 && error == 0) {
183                 error = bus_dmamap_load(sc->adm_tag, queue->adm_map,
184                                         queue->kdatapgs,
185                                         bus_dma_tag_getmaxsize(sc->adm_tag),
186                                         nvme_dmamem_saveseg, &queue->pdatapgs,
187                                         0);
188         }
189
190         /*
191          * Driver request structures
192          */
193         if (error == 0) {
194                 nvme_request_t *req;
195                 uint32_t i;
196
197                 queue->reqary = kmalloc(sizeof(nvme_request_t) * queue->nqe,
198                                         M_NVME, M_WAITOK | M_ZERO);
199                 for (i = 0; i < queue->nqe; ++i) {
200                         req = &queue->reqary[i];
201                         if (i == 0) {
202                                 /*
203                                  * Set aside one request for dump operation
204                                  */
205                                 queue->dump_req = req;
206                         } else {
207                                 /*
208                                  * The rest go through the normal list
209                                  */
210                                 req->next_avail = queue->first_avail;
211                                 queue->first_avail = req;
212                         }
213                         req->subq = queue;
214                         req->comq = &sc->comqueues[queue->comqid];
215                         req->cmd_id = i;
216                         if (qid == 0) {
217                                 req->info = &queue->kdatapgs[i];
218                                 req->pinfo = queue->pdatapgs +
219                                              i * sizeof(nvme_admin_data_t);
220                         }
221                 }
222         }
223
224         /*
225          * Error handling
226          */
227         if (error)
228                 nvme_free_subqueue(sc, qid);
229         return error;
230 }
231
232 int
233 nvme_alloc_comqueue(nvme_softc_t *sc, uint16_t qid)
234 {
235         nvme_comqueue_t *queue = &sc->comqueues[qid];
236         int error = 0;
237
238         /*
239  * For now, use the maximum queue size that was negotiated during
240  * the attach.
241          */
242         lockinit(&queue->lk, "nvqlk", 0, 0);
243         queue->sc = sc;
244         queue->qid = qid;
245         queue->phase = NVME_COMQ_STATUS_PHASE;
246         queue->comq_doorbell_reg = NVME_REG_COMQ_BELL(qid, sc->dstrd4);
247
248         if (error == 0) {
249                 error = bus_dmamem_alloc(sc->cque_tag, (void **)&queue->kcomq,
250                                          BUS_DMA_ZERO, &queue->cque_map);
251         }
252         if (error == 0) {
253                 error = bus_dmamap_load(sc->cque_tag, queue->cque_map,
254                                         queue->kcomq,
255                                         bus_dma_tag_getmaxsize(sc->cque_tag),
256                                         nvme_dmamem_saveseg, &queue->pcomq,
257                                         0);
258         }
259
260         /*
261          * Set nqe last.  The comq polling loop tests this field and we
262          * do not want it to spuriously assume that the comq is initialized
263          * until it actually is.
264          */
265         if (error == 0)
266                 queue->nqe = sc->maxqe;
267
268         if (error)
269                 nvme_free_comqueue(sc, qid);
270         return error;
271 }
272
273 void
274 nvme_free_subqueue(nvme_softc_t *sc, uint16_t qid)
275 {
276         nvme_subqueue_t *queue = &sc->subqueues[qid];
277
278         queue->first_avail = NULL;
279         if (queue->reqary) {
280                 kfree(queue->reqary, M_NVME);
281                 queue->reqary = NULL;
282         }
283         if (queue->ksubq) {
284                 bus_dmamem_free(sc->sque_tag, queue->ksubq, queue->sque_map);
285                 bus_dmamap_unload(sc->sque_tag, queue->sque_map);
286                 bus_dmamap_destroy(sc->sque_tag, queue->sque_map);
287         }
288         if (queue->kprps) {
289                 bus_dmamem_free(sc->prps_tag, queue->kprps, queue->prps_map);
290                 bus_dmamap_unload(sc->prps_tag, queue->prps_map);
291                 bus_dmamap_destroy(sc->prps_tag, queue->prps_map);
292         }
293         if (queue->kdatapgs) {
294                 bus_dmamem_free(sc->adm_tag, queue->kdatapgs, queue->adm_map);
295                 bus_dmamap_unload(sc->adm_tag, queue->adm_map);
296                 bus_dmamap_destroy(sc->adm_tag, queue->adm_map);
297         }
298         bzero(queue, sizeof(*queue));
299 }
300
301 void
302 nvme_free_comqueue(nvme_softc_t *sc, uint16_t qid)
303 {
304         nvme_comqueue_t *queue = &sc->comqueues[qid];
305
306         /*
307          * Clear this field first so poll loops ignore the comq.
308          */
309         queue->nqe = 0;
310
311         if (queue->kcomq) {
312                 bus_dmamem_free(sc->cque_tag, queue->kcomq, queue->cque_map);
313                 bus_dmamap_unload(sc->cque_tag, queue->cque_map);
314                 bus_dmamap_destroy(sc->cque_tag, queue->cque_map);
315         }
316         bzero(queue, sizeof(*queue));
317 }
318
319 /*
320  * ADMIN AND I/O REQUEST HANDLING
321  */
322
323 /*
324  * Obtain a request and handle DMA mapping the supplied kernel buffer.
325  * Fields in cmd.head will be initialized and remaining fields will be zero'd.
326  * Caller is responsible for filling in remaining fields as appropriate.
327  *
328  * Caller must hold the queue lock.
329  */
330 nvme_request_t *
331 nvme_get_admin_request(nvme_softc_t *sc, uint8_t opcode)
332 {
333         nvme_request_t *req;
334
335         req = nvme_get_request(&sc->subqueues[0], opcode, NULL, 0);
336         req->cmd.head.prp1 = req->pinfo;
337         req->callback = NULL;
338
339         return req;
340 }
341
342 /*
343  * Common request initialization shared by the normal and dump request paths.
344  */
345
346 static __inline
347 void
348 _nvme_fill_request(nvme_subqueue_t *queue, uint8_t opcode,
349                    char *kva, size_t bytes,
350                    nvme_request_t *req)
351 {
352         /*
353          * Fill in the basic fields and do the DMA mapping.
354          */
355         req->next_avail = NULL;
356         KKASSERT(req->state == NVME_REQ_AVAIL);
357         req->state = NVME_REQ_ALLOCATED;
358         req->callback = NULL;
359         req->waiting = 0;
360
361         req->cmd.head.opcode = opcode;
362         req->cmd.head.flags = NVME_SUBQFLG_PRP | NVME_SUBQFLG_NORM;
363         req->cmd.head.cid = req->cmd_id;
364         req->cmd.head.nsid = 0;
365         req->cmd.head.mptr = 0;
366         req->cmd.head.prp1 = 0;
367         req->cmd.head.prp2 = 0;
368         req->cmd.dw10 = 0;
369         req->cmd.dw11 = 0;
370         req->cmd.dw12 = 0;
371         req->cmd.dw13 = 0;
372         req->cmd.dw14 = 0;
373         req->cmd.dw15 = 0;
374
375         if (kva) {
376                 size_t count = 0;
377                 size_t idx = 0;
378                 vm_paddr_t paddr;
379                 vm_paddr_t pprptab;
380                 uint64_t *kprptab;
381                 KKASSERT(bytes >= 0 && bytes <= MAXPHYS);
382
383                 kprptab = queue->kprps +
384                           (MAXPHYS / PAGE_SIZE) * req->cmd_id;
385                 pprptab = queue->pprps +
386                           (MAXPHYS / PAGE_SIZE) * req->cmd_id *
387                           sizeof(uint64_t);
388
389                 while (count < bytes) {
390                         paddr = vtophys(kva + count);
391                         if (idx == 0) {
392                                 KKASSERT((paddr & 3) == 0);
393                                 req->cmd.head.prp1 = paddr;
394                                 count += (((intptr_t)kva + PAGE_SIZE) &
395                                           ~(intptr_t)PAGE_MASK) -
396                                          (intptr_t)kva;
397                         } else if (idx == 1 && count + PAGE_SIZE >= bytes) {
398                                 KKASSERT((paddr & PAGE_MASK) == 0);
399                                 req->cmd.head.prp2 = paddr;
400                                 count += PAGE_SIZE;
401                         } else {
402                                 KKASSERT((paddr & PAGE_MASK) == 0);
403                                 /* if (idx == 1) -- not needed, just repeat */
404                                 req->cmd.head.prp2 = pprptab; /* repeat */
405                                 kprptab[idx - 1] = paddr;
406                                 count += PAGE_SIZE;
407                         }
408                         ++idx;
409                 }
410         }
411 }
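/*
 * PRP construction example (illustrative, assuming 4KB pages): a 16KB
 * transfer whose buffer starts 0x200 bytes into a page touches five
 * physical pages.  The loop above puts the first, partial page in prp1,
 * points prp2 at this request's slice of the PRP table, and fills the
 * table with the remaining four page addresses.  Only when everything
 * after the prp1 chunk fits in a single page is prp2 used as a direct
 * data pointer instead of a list pointer.
 */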
412
413
414 /*
415  * Obtain a request and handle DMA mapping the supplied kernel buffer.
416  * Fields in cmd.head will be initialized and remaining fields will be zero'd.
417  * Caller is responsible for filling in remaining fields as appropriate.
418  *
419  * May return NULL if no requests are available or if there is no room in
420  * the submission queue to handle it (should only be possible on an I/O queue,
421  * admin queue operations are managed).
422  *
423  * Caller should NOT hold the queue lock.
424  */
425 nvme_request_t *
426 nvme_get_request(nvme_subqueue_t *queue, uint8_t opcode,
427                  char *kva, size_t bytes)
428 {
429         nvme_request_t *req;
430         nvme_request_t *next;
431
432         /*
433          * No easy lockless way to pull a new request off.  We have to check
434          * for a number of conditions and there may be multiple threads
435          * making this call simultaneously, which complicates matters even
436          * more.
437          */
438         lockmgr(&queue->lk, LK_EXCLUSIVE);
439
440         /*
441          * Make sure the submission queue has room to accommodate the
442          * request.  Requests can be completed out of order so the
443          * submission ring could still be full even though we have
444          * requests available.
445          */
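	/*
	 * Worked example (illustrative numbers): with nqe = 16,
	 * subq_head = 3, subq_tail = 3 and unsubmitted = 15, the test is
	 * (3 + 15 + 1) % 16 == 3 == subq_head, so the ring is treated as
	 * full and the caller's BIO is deferred for requeue.
	 */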
446         if ((queue->subq_tail + queue->unsubmitted + 1) % queue->nqe ==
447             queue->subq_head) {
448                 lockmgr(&queue->lk, LK_RELEASE);
449                 KKASSERT(queue->qid != 0);
450                 atomic_swap_int(&queue->signal_requeue, 1);
451
452                 return NULL;
453         }
454
455         /*
456          * Pop the next available request off of the first_avail linked
457          * list.  An atomic op must be used here because nvme_put_request()
458          * returns requests to the list without holding queue->lk.
459          */
460         for (;;) {
461                 req = queue->first_avail;
462                 cpu_ccfence();
463                 if (req == NULL) {
464                         lockmgr(&queue->lk, LK_RELEASE);
465                         KKASSERT(queue->qid != 0);
466                         atomic_swap_int(&queue->signal_requeue, 1);
467
468                         return NULL;
469                 }
470                 next = req->next_avail;
471                 if (atomic_cmpset_ptr(&queue->first_avail, req, next))
472                         break;
473         }
474
475         /*
476          * We have to keep track of unsubmitted requests in order to be
477          * able to properly check whether the ring is full or not (check
478          * is done at the top of this procedure, above).
479          */
480         ++queue->unsubmitted;
481         lockmgr(&queue->lk, LK_RELEASE);
482
483         _nvme_fill_request(queue, opcode, kva, bytes, req);
484
485         return req;
486 }
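#if 0
/*
 * Reference-only sketch (not compiled) of the normal request life cycle
 * implemented by the routines above.  The real consumers live in the
 * disk front-end; the opcode is passed in here because the I/O opcode
 * constants are not named in this file.
 */
static int
nvme_example_io(nvme_subqueue_t *subq, uint8_t opcode, char *kva, size_t bytes)
{
	nvme_request_t *req;
	int status;

	req = nvme_get_request(subq, opcode, kva, bytes);
	if (req == NULL)		/* ring full, BIO should be requeued */
		return (EWOULDBLOCK);
	/* fill in req->cmd.head.nsid and req->cmd.dw10-dw15 as needed */
	lockmgr(&subq->lk, LK_EXCLUSIVE);
	nvme_submit_request(req);	/* rings the submission doorbell */
	lockmgr(&subq->lk, LK_RELEASE);
	status = nvme_wait_request(req);
	nvme_put_request(req);

	return (status);
}
#endif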
487
488 /*
489  * dump path only, cannot block.  Allow the lock to fail and bump
490  * queue->unsubmitted anyway.
491  */
492 nvme_request_t *
493 nvme_get_dump_request(nvme_subqueue_t *queue, uint8_t opcode,
494                  char *kva, size_t bytes)
495 {
496         nvme_request_t *req;
497         int error;
498
499         error = lockmgr(&queue->lk, LK_EXCLUSIVE | LK_NOWAIT);
500         req = queue->dump_req;
501         ++queue->unsubmitted;
502         if (error == 0)
503                 lockmgr(&queue->lk, LK_RELEASE);
504         _nvme_fill_request(queue, opcode, kva, bytes, req);
505
506         return req;
507 }
508
509 /*
510  * Submit request for execution.  This will doorbell the subq.
511  *
512  * Caller must hold the queue lock.
513  */
514 void
515 nvme_submit_request(nvme_request_t *req)
516 {
517         nvme_subqueue_t *queue = req->subq;
518         nvme_allcmd_t *cmd;
519
520         cmd = &queue->ksubq[queue->subq_tail];
521         --queue->unsubmitted;
522         if (++queue->subq_tail == queue->nqe)
523                 queue->subq_tail = 0;
524         KKASSERT(queue->subq_tail != queue->subq_head);
525         *cmd = req->cmd;
526         cpu_sfence();   /* needed? */
527         req->state = NVME_REQ_SUBMITTED;
528         nvme_write(queue->sc, queue->subq_doorbell_reg, queue->subq_tail);
529 }
530
531 /*
532  * Wait for a request to complete.
533  *
534  * Caller does not need to hold the queue lock.  This routine acquires
535  * the completion queue lock itself and may sleep on it while polling
536  * for the request to complete.
537  */
538 int
539 nvme_wait_request(nvme_request_t *req)
540 {
541         struct lock *lk;
542         int code;
543
544         req->waiting = 1;
545         if (req->state != NVME_REQ_COMPLETED) {
546                 lk = &req->comq->lk;
547                 cpu_lfence();
548                 lockmgr(lk, LK_EXCLUSIVE);
549                 while (req->state == NVME_REQ_SUBMITTED) {
550                         nvme_poll_completions(req->comq, lk);
551                         if (req->state != NVME_REQ_SUBMITTED)
552                                 break;
553                         lksleep(req, lk, 0, "nvwait", hz);
554                 }
555                 lockmgr(lk, LK_RELEASE);
556                 KKASSERT(req->state == NVME_REQ_COMPLETED);
557         }
558         cpu_lfence();
559         code = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);
560
561         return code;
562 }
563
564 /*
565  * dump path only, we cannot block, and the lock is allowed
566  * to fail.  But still try to play nice with interrupt threads.
567  */
568 int
569 nvme_poll_request(nvme_request_t *req)
570 {
571         struct lock *lk;
572         int code;
573         int didlock = 500;      /* 500uS max */
574
575         req->waiting = 1;
576         if (req->state != NVME_REQ_COMPLETED) {
577                 lk = &req->comq->lk;
578                 cpu_lfence();
579                 while (lockmgr(lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
580                         if (--didlock == 0)
581                                 break;
582                         tsc_delay(1000);        /* 1uS */
583                 }
584                 while (req->state == NVME_REQ_SUBMITTED) {
585                         nvme_poll_completions(req->comq, lk);
586                         if (req->state != NVME_REQ_SUBMITTED)
587                                 break;
588                         lwkt_switch();
589                 }
590                 if (didlock)
591                         lockmgr(lk, LK_RELEASE);
592                 KKASSERT(req->state == NVME_REQ_COMPLETED);
593         }
594         cpu_lfence();
595         code = NVME_COMQ_STATUS_CODE_GET(req->res.tail.status);
596
597         return code;
598 }
599
600 /*
601  * Put the request away, making it available for reuse.  If this is an admin
602  * request its auxiliary data page is also released for reuse.
603  *
604  * Caller does NOT have to hold the queue lock.
605  */
606 void
607 nvme_put_request(nvme_request_t *req)
608 {
609         nvme_subqueue_t *queue = req->subq;
610         nvme_request_t *next;
611
612         /*
613          * Insert on head for best cache reuse.
614          */
615         KKASSERT(req->state == NVME_REQ_COMPLETED);
616         req->state = NVME_REQ_AVAIL;
617         for (;;) {
618                 next = queue->first_avail;
619                 cpu_ccfence();
620                 req->next_avail = next;
621                 if (atomic_cmpset_ptr(&queue->first_avail, next, req))
622                         break;
623         }
624
625         /*
626          * If BIOs were deferred due to lack of request space, signal the
627          * admin thread to requeue them.  This is a bit messy and normally
628          * should not happen due to the large number of queue entries nvme
629          * usually has.  Let it race for now (admin has a 1hz tick).
630          */
631         if (atomic_swap_int(&queue->signal_requeue, 0)) {
632                 atomic_set_int(&queue->sc->admin_signal, ADMIN_SIG_REQUEUE);
633                 wakeup(&queue->sc->admin_signal);
634         }
635 }
636
637 /*
638  * dump path only.
639  */
640 void
641 nvme_put_dump_request(nvme_request_t *req)
642 {
643         KKASSERT(req->state == NVME_REQ_COMPLETED);
644         req->state = NVME_REQ_AVAIL;
645 }
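/*
 * Taken together, the dump-only variants give the crash dump path a
 * request cycle that mirrors the normal one without ever blocking:
 * nvme_get_dump_request() hands out the reserved dump_req,
 * nvme_submit_request() rings the doorbell as usual, nvme_poll_request()
 * spins/yields instead of sleeping, and nvme_put_dump_request() marks
 * dump_req available again.  The authoritative caller ordering lives in
 * the disk front-end's dump entry point; this is only a summary.
 */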
646
647 /*
648  * Poll for completions on queue, copy the 16-byte hw result entry
649  * into the request and poke the doorbell to update the controller's
650  * understanding of comq_head.
651  *
652  * If lk is non-NULL it will be passed to the callback which typically
653  * releases it temporarily when calling biodone() or doing other complex
654  * work on the result.
655  *
656  * Caller must usually hold comq->lk.
657  */
658 void
659 nvme_poll_completions(nvme_comqueue_t *comq, struct lock *lk)
660 {
661         nvme_softc_t *sc = comq->sc;
662         nvme_request_t *req;
663         nvme_subqueue_t *subq;
664         nvme_allres_t *res;
665 #if 0
666         int didwork = 0;
667 #endif
668
669         KKASSERT(comq->comq_tail < comq->nqe);
670         cpu_lfence();           /* needed prior to first phase test */
671         for (;;) {
672                 /*
673                  * WARNING! LOCK MAY HAVE BEEN TEMPORARILY LOST DURING LOOP.
674                  */
675                 res = &comq->kcomq[comq->comq_tail];
676                 if ((res->tail.status ^ comq->phase) & NVME_COMQ_STATUS_PHASE)
677                         break;
678
679                 /*
680                  * Process result on completion queue.
681                  *
682                  * Bump comq_tail, flip the phase detect when we roll-over.
683                  * doorbell every 1/4 queue and at the end of the loop.
684                  */
685                 if (++comq->comq_tail == comq->nqe) {
686                         comq->comq_tail = 0;
687                         comq->phase ^= NVME_COMQ_STATUS_PHASE;
688                 }
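		/*
		 * Phase note: the controller writes entries with the phase
		 * bit set on its first pass through the ring and inverts it
		 * on each subsequent pass, so flipping our expected phase at
		 * roll-over (above) keeps the validity test working without
		 * ever having to zero the completion ring.
		 */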
689
690                 /*
691                  * WARNING! I imploded the chip by reusing a command id
692                  *          before it was discarded in the completion queue
693                  *          via the doorbell, so for now we always write
694                  *          the doorbell before marking the request as
695                  *          COMPLETED (it can be reused instantly upon
696                  *          being marked).
697                  */
698 #if 0
699                 if (++didwork == (comq->nqe >> 2)) {
700                         didwork = 0;
701                         nvme_write(comq->sc, comq->comq_doorbell_reg,
702                                    comq->comq_tail);
703                 }
704 #endif
705                 cpu_lfence();   /* needed prior to content check */
706
707                 /*
708                  * Locate the request and related submission queue.  The
709                  * request could be on a different queue.  A submission
710                  * queue can have only one completion queue, so we can
711                  * update subq_head without locking the submission queue.
712                  */
713                 subq = &sc->subqueues[res->tail.subq_id];
714                 subq->subq_head = res->tail.subq_head_ptr;
715                 req = &subq->reqary[res->tail.cmd_id];
716
717                 /*
718                  * Copy the fields and wakeup anyone waiting on req.
719                  * The response field in the completion queue can be reused
720                  * once we doorbell which is why we make a copy.
721                  */
722                 KKASSERT(req->state == NVME_REQ_SUBMITTED &&
723                          req->comq == comq);
724                 req->res = *res;
725                 nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail);
726                 cpu_sfence();
727                 req->state = NVME_REQ_COMPLETED;
728                 if (req->callback) {
729                         req->callback(req, lk);
730                 } else if (req->waiting) {
731                         wakeup(req);
732                 }
733         }
734 #if 0
735         if (didwork)
736                 nvme_write(comq->sc, comq->comq_doorbell_reg, comq->comq_tail);
737 #endif
738 }
739
740 /*
741  * Core interrupt handler (called from dedicated interrupt thread, possibly
742  * preempts other threads).
743  *
744  * NOTE: For pin-based level interrupts, the chipset interrupt is cleared
745  *       automatically once all the head doorbells are updated.  However,
746  *       most chipsets assume MSI-X will be used and MAY NOT IMPLEMENT
747  *       pin-based interrupts properly.  I found the BPX card, for example,
748  *       is unable to clear a pin-based interrupt.
749  */
750 void
751 nvme_intr(void *arg)
752 {
753         nvme_comqueue_t *comq = arg;
754         nvme_softc_t *sc;
755         int i;
756         int skip;
757
758         /*
759          * Process all completion queues associated with this vector.  The
760          * interrupt is masked in the APIC.  Do NOT mess with the NVMe
761          * masking registers because (1) We don't need to and it wastes time,
762          * and (2) We aren't supposed to touch them if using MSI-X anyway.
763          */
764         sc = comq->sc;
765         if (sc->nirqs == 1)
766                 skip = 1;
767         else
768                 skip = sc->nirqs - 1;
769
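	/*
	 * Example (illustrative): with nirqs == 4, skip == 3, so the
	 * handler whose comq has qid 2 walks completion queues 2, 5, 8, ...
	 * stepping by three until it runs past niocomqs.  Each I/O vector
	 * thus services every (nirqs - 1)'th completion queue.
	 */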
770         for (i = comq->qid; i <= sc->niocomqs; i += skip) {
771                 if (comq->nqe) {
772                         lockmgr(&comq->lk, LK_EXCLUSIVE);
773                         nvme_poll_completions(comq, &comq->lk);
774                         lockmgr(&comq->lk, LK_RELEASE);
775                 }
776                 comq += skip;
777         }
778 }
779
780 /*
781  * ADMIN HELPER COMMAND ROLLUP FUNCTIONS
782  */
783 /*
784  * Issue command to create a submission queue.
785  */
786 int
787 nvme_create_subqueue(nvme_softc_t *sc, uint16_t qid)
788 {
789         nvme_request_t *req;
790         nvme_subqueue_t *subq = &sc->subqueues[qid];
791         int status;
792
793         req = nvme_get_admin_request(sc, NVME_OP_CREATE_SUBQ);
794         req->cmd.head.prp1 = subq->psubq;
795         req->cmd.crsub.subq_id = qid;
796         req->cmd.crsub.subq_size = subq->nqe - 1;       /* 0's based value */
797         req->cmd.crsub.flags = NVME_CREATESUB_PC | NVME_CREATESUB_PRI_URG;
798         req->cmd.crsub.comq_id = subq->comqid;
799
800         nvme_submit_request(req);
801         status = nvme_wait_request(req);
802         nvme_put_request(req);
803
804         return status;
805 }
806
807 /*
808  * Issue command to create a completion queue.
809  */
810 int
811 nvme_create_comqueue(nvme_softc_t *sc, uint16_t qid)
812 {
813         nvme_request_t *req;
814         nvme_comqueue_t *comq = &sc->comqueues[qid];
815         int status;
816         int error;
817         uint16_t ivect;
818
819         error = 0;
820         if (sc->nirqs > 1) {
821                 ivect = 1 + (qid - 1) % (sc->nirqs - 1);
822                 if (qid && ivect == qid) {
823                         error = bus_setup_intr(sc->dev, sc->irq[ivect],
824                                                 INTR_MPSAFE | INTR_HIFREQ,
825                                                 nvme_intr,
826                                                 &sc->comqueues[ivect],
827                                                 &sc->irq_handle[ivect],
828                                                 NULL);
829                 }
830         } else {
831                 ivect = 0;
832         }
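	/*
	 * Example (illustrative): with nirqs == 4, I/O queues 1-3 map to
	 * vectors 1-3 and queue 4 wraps back to vector 1.  The handler is
	 * only installed the first time a qid lands on its own vector
	 * (ivect == qid); later queues sharing the vector reuse it via the
	 * skip logic in nvme_intr().
	 */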
833         if (error)
834                 return error;
835
836         req = nvme_get_admin_request(sc, NVME_OP_CREATE_COMQ);
837         req->cmd.head.prp1 = comq->pcomq;
838         req->cmd.crcom.comq_id = qid;
839         req->cmd.crcom.comq_size = comq->nqe - 1;       /* 0's based value */
840         req->cmd.crcom.ivect = ivect;
841         req->cmd.crcom.flags = NVME_CREATECOM_PC | NVME_CREATECOM_IEN;
842
843         nvme_submit_request(req);
844         status = nvme_wait_request(req);
845         nvme_put_request(req);
846
847         return status;
848 }
849
850 /*
851  * Issue command to delete a submission queue.
852  */
853 int
854 nvme_delete_subqueue(nvme_softc_t *sc, uint16_t qid)
855 {
856         nvme_request_t *req;
857         /*nvme_subqueue_t *subq = &sc->subqueues[qid];*/
858         int status;
859
860         req = nvme_get_admin_request(sc, NVME_OP_DELETE_SUBQ);
861         req->cmd.head.prp1 = 0;
862         req->cmd.delete.qid = qid;
863
864         nvme_submit_request(req);
865         status = nvme_wait_request(req);
866         nvme_put_request(req);
867
868         return status;
869 }
870
871 /*
872  * Issue command to delete a completion queue.
873  */
874 int
875 nvme_delete_comqueue(nvme_softc_t *sc, uint16_t qid)
876 {
877         nvme_request_t *req;
878         /*nvme_comqueue_t *comq = &sc->comqueues[qid];*/
879         int status;
880         uint16_t ivect;
881
882         req = nvme_get_admin_request(sc, NVME_OP_DELETE_COMQ);
883         req->cmd.head.prp1 = 0;
884         req->cmd.delete.qid = qid;
885
886         nvme_submit_request(req);
887         status = nvme_wait_request(req);
888         nvme_put_request(req);
889
890         if (qid && sc->nirqs > 1) {
891                 ivect = 1 + (qid - 1) % (sc->nirqs - 1);
892                 if (ivect == qid) {
893                         bus_teardown_intr(sc->dev,
894                                           sc->irq[ivect],
895                                           sc->irq_handle[ivect]);
896                 }
897         }
898
899         return status;
900 }
901
902 /*
903  * Issue friendly shutdown to controller.
904  */
905 int
906 nvme_issue_shutdown(nvme_softc_t *sc, int dopoll)
907 {
908         uint32_t reg;
909         int base_ticks;
910         int error;
911
912         /*
913          * Put us in shutdown
914          */
915         reg = nvme_read(sc, NVME_REG_CONFIG);
916         reg &= ~NVME_CONFIG_SHUT_MASK;
917         reg |= NVME_CONFIG_SHUT_NORM;
918         nvme_write(sc, NVME_REG_CONFIG, reg);
919
920         /*
921          * Wait up to 10 seconds for acknowledgement
922          */
923         error = ENXIO;
924         base_ticks = ticks;
925         while ((int)(ticks - base_ticks) < 10 * 20) {
926                 reg = nvme_read(sc, NVME_REG_STATUS);
927                 if ((reg & NVME_STATUS_SHUT_MASK) & NVME_STATUS_SHUT_DONE) {
928                         error = 0;
929                         break;
930                 }
931                 if (dopoll == 0)
932                         nvme_os_sleep(50);      /* 50ms poll */
933         }
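	/*
	 * The loop above is the CC.SHN / CSTS.SHST shutdown handshake from
	 * the NVMe spec.  When dopoll is non-zero (dump/panic context) we
	 * avoid sleeping and simply busy-poll the status register instead.
	 */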
934         if (error)
935                 device_printf(sc->dev, "Unable to shutdown chip nicely\n");
936         else
937                 device_printf(sc->dev, "Normal chip shutdown succeeded\n");
938
939         return error;
940 }
941
942 /*
943  * Make space-padded serial and model number strings more readable.
944  */
945 size_t
946 string_cleanup(char *str, int domiddle)
947 {
948         size_t i;
949         size_t j;
950         int atbeg = 1;
951
952         for (i = j = 0; str[i]; ++i) {
953                 if ((str[i] == ' ' || str[i] == '\r') &&
954                     (atbeg || domiddle)) {
955                         continue;
956                 } else {
957                         atbeg = 0;
958                 }
959                 str[j] = str[i];
960                 ++j;
961         }
962         while (domiddle == 0 && j > 0 && (str[j-1] == ' ' || str[j-1] == '\r'))
963                 --j;
964         str[j] = 0;
965         if (domiddle == 0) {
966                 for (j = 0; str[j]; ++j) {
967                         if (str[j] == ' ')
968                                 str[j] = '_';
969                 }
970         }
971
972         return j;
973 }
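/*
 * Example (illustrative): a serial number of "  S1234   " cleans up to
 * "S1234" and a model string of "ACME NVMe Drive   " becomes
 * "ACME_NVMe_Drive" when domiddle is 0 (embedded spaces are kept but
 * converted to underscores); with domiddle non-zero all spaces and CRs
 * are removed outright.
 */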