FreeBSD/Linux Kernel Cross Reference
sys/dev/nvme/nvme_ctrlr.c

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (C) 2012-2016 Intel Corporation
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  */
   28 
   29 #include <sys/cdefs.h>
   30 __FBSDID("$FreeBSD$");
   31 
   32 #include "opt_cam.h"
   33 
   34 #include <sys/param.h>
   35 #include <sys/systm.h>
   36 #include <sys/buf.h>
   37 #include <sys/bus.h>
   38 #include <sys/conf.h>
   39 #include <sys/ioccom.h>
   40 #include <sys/proc.h>
   41 #include <sys/smp.h>
   42 #include <sys/uio.h>
   43 #include <sys/sbuf.h>
   44 #include <sys/endian.h>
   45 #include <machine/stdarg.h>
   46 #include <vm/vm.h>
   47 
   48 #include "nvme_private.h"
   49 
   50 #define B4_CHK_RDY_DELAY_MS     2300            /* work around controller bug */
   51 
   52 static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
   53                                                 struct nvme_async_event_request *aer);
   54 
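      /*
       * Log a controller message to the console and also emit a devctl(4)
       * notification of the given type (e.g. "critical", "RESET") so that
       * userland tools can react to controller events.
       */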
   55 static void
   56 nvme_ctrlr_devctl_log(struct nvme_controller *ctrlr, const char *type, const char *msg, ...)
   57 {
   58         struct sbuf sb;
   59         va_list ap;
   60         int error;
   61 
   62         if (sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND | SBUF_NOWAIT) == NULL)
   63                 return;
   64         sbuf_printf(&sb, "%s: ", device_get_nameunit(ctrlr->dev));
   65         va_start(ap, msg);
   66         sbuf_vprintf(&sb, msg, ap);
   67         va_end(ap);
   68         error = sbuf_finish(&sb);
   69         if (error == 0)
   70                 printf("%s\n", sbuf_data(&sb));
   71 
   72         sbuf_clear(&sb);
   73         sbuf_printf(&sb, "name=\"%s\" reason=\"", device_get_nameunit(ctrlr->dev));
   74         va_start(ap, msg);
   75         sbuf_vprintf(&sb, msg, ap);
   76         va_end(ap);
   77         sbuf_printf(&sb, "\"");
   78         error = sbuf_finish(&sb);
   79         if (error == 0)
   80                 devctl_notify("nvme", "controller", type, sbuf_data(&sb));
   81         sbuf_delete(&sb);
   82 }
   83 
   84 static int
   85 nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
   86 {
   87         struct nvme_qpair       *qpair;
   88         uint32_t                num_entries;
   89         int                     error;
   90 
   91         qpair = &ctrlr->adminq;
   92         qpair->id = 0;
   93         qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
   94         qpair->domain = ctrlr->domain;
   95 
   96         num_entries = NVME_ADMIN_ENTRIES;
   97         TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
   98         /*
   99          * If admin_entries was overridden to an invalid value, revert it
  100          *  to our default value.
  101          */
  102         if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
  103             num_entries > NVME_MAX_ADMIN_ENTRIES) {
  104                 nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d "
  105                     "specified\n", num_entries);
  106                 num_entries = NVME_ADMIN_ENTRIES;
  107         }
  108 
  109         /*
  110          * The admin queue's max xfer size is treated differently than the
  111          *  max I/O xfer size.  16KB is sufficient here - maybe even less?
  112          */
  113         error = nvme_qpair_construct(qpair, num_entries, NVME_ADMIN_TRACKERS,
  114              ctrlr);
  115         return (error);
  116 }
  117 
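      /*
       * Map CPU index 'c' to the I/O queue that serves it, spreading the
       * mp_ncpus CPUs evenly across the controller's I/O queues.
       */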
  118 #define QP(ctrlr, c)    ((c) * (ctrlr)->num_io_queues / mp_ncpus)
  119 
  120 static int
  121 nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
  122 {
  123         struct nvme_qpair       *qpair;
  124         uint32_t                cap_lo;
  125         uint16_t                mqes;
  126         int                     c, error, i, n;
  127         int                     num_entries, num_trackers, max_entries;
  128 
  129         /*
  130          * The NVMe spec sets a hard limit of 64K max entries, but devices may
  131          * specify a smaller limit, so we need to check the MQES field in the
  132          * capabilities register.  We also have to cap the number of entries to
  133          * what the current doorbell stride allows for in BAR 0/1; otherwise the
  134          * remaining entries are inaccessible.  MQES should already reflect this,
  135          * so this is just a fail-safe.
  136          */
  137         max_entries =
  138             (rman_get_size(ctrlr->resource) - nvme_mmio_offsetof(doorbell[0])) /
  139             (1 << (ctrlr->dstrd + 1));
  140         num_entries = NVME_IO_ENTRIES;
  141         TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);
  142         cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
  143         mqes = NVME_CAP_LO_MQES(cap_lo);
  144         num_entries = min(num_entries, mqes + 1);
  145         num_entries = min(num_entries, max_entries);
  146 
  147         num_trackers = NVME_IO_TRACKERS;
  148         TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);
  149 
  150         num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
  151         num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
  152         /*
  153          * No need to have more trackers than entries in the submission queue.
  154          * Note also that for a queue size of N, we can only have (N-1) commands
  155          * outstanding, hence the "-1" here.
  156          */
  157         num_trackers = min(num_trackers, (num_entries-1));
  158 
  159         /*
  160          * Our best estimate for the maximum number of I/Os that we should
  161          * normally have in flight at one time.  This should be viewed as a hint,
  162          * not a hard limit, and will need to be revisited when the upper layers
  163          * of the storage system grow multi-queue support.
  164          */
  165         ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4;
  166 
  167         ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
  168             M_NVME, M_ZERO | M_WAITOK);
  169 
  170         for (i = c = n = 0; i < ctrlr->num_io_queues; i++, c += n) {
  171                 qpair = &ctrlr->ioq[i];
  172 
  173                 /*
  174                  * The admin queue has ID 0; I/O queues start at ID 1 -
  175                  *  hence the 'i+1' here.
  176                  */
  177                 qpair->id = i + 1;
  178                 if (ctrlr->num_io_queues > 1) {
  179                         /* Find number of CPUs served by this queue. */
  180                         for (n = 1; QP(ctrlr, c + n) == i; n++)
  181                                 ;
  182                         /* Shuffle multiple NVMe devices between CPUs. */
  183                         qpair->cpu = c + (device_get_unit(ctrlr->dev)+n/2) % n;
  184                         qpair->domain = pcpu_find(qpair->cpu)->pc_domain;
  185                 } else {
  186                         qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
  187                         qpair->domain = ctrlr->domain;
  188                 }
  189 
  190                 /*
  191                  * For I/O queues, use the controller-wide max_xfer_size
  192                  *  calculated in nvme_attach().
  193                  */
  194                 error = nvme_qpair_construct(qpair, num_entries, num_trackers,
  195                     ctrlr);
  196                 if (error)
  197                         return (error);
  198 
  199                 /*
  200                  * Do not bother binding interrupts if we only have one I/O
  201                  *  interrupt thread for this controller.
  202                  */
  203                 if (ctrlr->num_io_queues > 1)
  204                         bus_bind_intr(ctrlr->dev, qpair->res, qpair->cpu);
  205         }
  206 
  207         return (0);
  208 }
  209 
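      /*
       * Mark the controller as failed, disable and fail the admin and I/O
       * queue pairs, and notify any registered consumers of the failure.
       */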
  210 static void
  211 nvme_ctrlr_fail(struct nvme_controller *ctrlr)
  212 {
  213         int i;
  214 
  215         ctrlr->is_failed = true;
  216         nvme_admin_qpair_disable(&ctrlr->adminq);
  217         nvme_qpair_fail(&ctrlr->adminq);
  218         if (ctrlr->ioq != NULL) {
  219                 for (i = 0; i < ctrlr->num_io_queues; i++) {
  220                         nvme_io_qpair_disable(&ctrlr->ioq[i]);
  221                         nvme_qpair_fail(&ctrlr->ioq[i]);
  222                 }
  223         }
  224         nvme_notify_fail_consumers(ctrlr);
  225 }
  226 
  227 void
  228 nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
  229     struct nvme_request *req)
  230 {
  231 
  232         mtx_lock(&ctrlr->lock);
  233         STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
  234         mtx_unlock(&ctrlr->lock);
  235         taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
  236 }
  237 
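      /*
       * Taskqueue handler that drains the controller's failed-request list,
       * manually completing each request with an ABORTED_BY_REQUEST status.
       */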
  238 static void
  239 nvme_ctrlr_fail_req_task(void *arg, int pending)
  240 {
  241         struct nvme_controller  *ctrlr = arg;
  242         struct nvme_request     *req;
  243 
  244         mtx_lock(&ctrlr->lock);
  245         while ((req = STAILQ_FIRST(&ctrlr->fail_req)) != NULL) {
  246                 STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq);
  247                 mtx_unlock(&ctrlr->lock);
  248                 nvme_qpair_manual_complete_request(req->qpair, req,
  249                     NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
  250                 mtx_lock(&ctrlr->lock);
  251         }
  252         mtx_unlock(&ctrlr->lock);
  253 }
  254 
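      /*
       * Poll CSTS.RDY until it reaches desired_val or the controller's ready
       * timeout expires.  Returns ENXIO on timeout or if the register reads
       * indicate the device is gone (hot unplug).
       */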
  255 static int
  256 nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val)
  257 {
  258         int timeout = ticks + (uint64_t)ctrlr->ready_timeout_in_ms * hz / 1000;
  259         uint32_t csts;
  260 
  261         while (1) {
  262                 csts = nvme_mmio_read_4(ctrlr, csts);
  263                 if (csts == NVME_GONE)          /* Hot unplug. */
  264                         return (ENXIO);
  265                 if (((csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK)
  266                     == desired_val)
  267                         break;
  268                 if (timeout - ticks < 0) {
  269                         nvme_printf(ctrlr, "controller ready did not become %d "
  270                             "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms);
  271                         return (ENXIO);
  272                 }
  273                 pause("nvmerdy", 1);
  274         }
  275 
  276         return (0);
  277 }
  278 
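      /*
       * Clear CC.EN and wait for CSTS.RDY to drop to 0, only transitioning
       * EN when the spec allows it and honoring quirks that require a delay
       * before RDY is checked.
       */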
  279 static int
  280 nvme_ctrlr_disable(struct nvme_controller *ctrlr)
  281 {
  282         uint32_t cc;
  283         uint32_t csts;
  284         uint8_t  en, rdy;
  285         int err;
  286 
  287         cc = nvme_mmio_read_4(ctrlr, cc);
  288         csts = nvme_mmio_read_4(ctrlr, csts);
  289 
  290         en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
  291         rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
  292 
  293         /*
  294          * Per section 3.1.5 of the NVMe 1.3 spec, transitioning CC.EN from 0
  295          * to 1 when CSTS.RDY is 1, or transitioning CC.EN from 1 to 0 when
  296          * CSTS.RDY is 0, "has undefined results".  So make sure that CSTS.RDY
  297          * isn't the desired value.  Short circuit if we're already disabled.
  298          */
  299         if (en == 1) {
  300                 if (rdy == 0) {
  301                         /* EN == 1, wait for RDY == 1 or fail */
  302                         err = nvme_ctrlr_wait_for_ready(ctrlr, 1);
  303                         if (err != 0)
  304                                 return (err);
  305                 }
  306         } else {
  307                 /* EN == 0 already; wait for RDY == 0 */
  308                 if (rdy == 0)
  309                         return (0);
  310                 else
  311                         return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
  312         }
  313 
  314         cc &= ~NVME_CC_REG_EN_MASK;
  315         nvme_mmio_write_4(ctrlr, cc, cc);
  316         /*
  317          * Some drives have issues with accessing the MMIO registers right
  318          * after we disable the controller, so delay for a bit after clearing
  319          * CC.EN to cope with these issues.
  320          */
  321         if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY)
  322                 pause("nvmeR", B4_CHK_RDY_DELAY_MS * hz / 1000);
  323         return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
  324 }
  325 
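      /*
       * Program the admin queue registers (ASQ, ACQ, AQA), set CC with the
       * queue entry sizes and memory page size and enable the controller,
       * then wait for CSTS.RDY to become 1.
       */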
  326 static int
  327 nvme_ctrlr_enable(struct nvme_controller *ctrlr)
  328 {
  329         uint32_t        cc;
  330         uint32_t        csts;
  331         uint32_t        aqa;
  332         uint32_t        qsize;
  333         uint8_t         en, rdy;
  334         int             err;
  335 
  336         cc = nvme_mmio_read_4(ctrlr, cc);
  337         csts = nvme_mmio_read_4(ctrlr, csts);
  338 
  339         en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
  340         rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
  341 
  342         /*
  343          * See note in nvme_ctrlr_disable. Short circuit if we're already enabled.
  344          */
  345         if (en == 1) {
  346                 if (rdy == 1)
  347                         return (0);
  348                 else
  349                         return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
  350         } else {
  351                 /* EN == 0 already; wait for RDY == 0 or fail */
  352                 err = nvme_ctrlr_wait_for_ready(ctrlr, 0);
  353                 if (err != 0)
  354                         return (err);
  355         }
  356 
  357         nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
  358         DELAY(5000);
  359         nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
  360         DELAY(5000);
  361 
  362         /* ACQS and ASQS are 0-based. */
  363         qsize = ctrlr->adminq.num_entries - 1;
  364 
  365         aqa = 0;
  366         aqa = (qsize & NVME_AQA_REG_ACQS_MASK) << NVME_AQA_REG_ACQS_SHIFT;
  367         aqa |= (qsize & NVME_AQA_REG_ASQS_MASK) << NVME_AQA_REG_ASQS_SHIFT;
  368         nvme_mmio_write_4(ctrlr, aqa, aqa);
  369         DELAY(5000);
  370 
  371         /* Initialization values for CC */
  372         cc = 0;
  373         cc |= 1 << NVME_CC_REG_EN_SHIFT;
  374         cc |= 0 << NVME_CC_REG_CSS_SHIFT;
  375         cc |= 0 << NVME_CC_REG_AMS_SHIFT;
  376         cc |= 0 << NVME_CC_REG_SHN_SHIFT;
  377         cc |= 6 << NVME_CC_REG_IOSQES_SHIFT; /* SQ entry size == 64 == 2^6 */
  378         cc |= 4 << NVME_CC_REG_IOCQES_SHIFT; /* CQ entry size == 16 == 2^4 */
  379 
  380         /* This evaluates to 0 for 4K pages, the MPS encoding required by the spec. */
  381         cc |= (PAGE_SIZE >> 13) << NVME_CC_REG_MPS_SHIFT;
  382 
  383         nvme_mmio_write_4(ctrlr, cc, cc);
  384 
  385         return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
  386 }
  387 
  388 static void
  389 nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
  390 {
  391         int i;
  392 
  393         nvme_admin_qpair_disable(&ctrlr->adminq);
  394         /*
  395          * I/O queues are not allocated before the initial HW
  396          *  reset, so do not try to disable them.  Use is_initialized
  397          *  to determine if this is the initial HW reset.
  398          */
  399         if (ctrlr->is_initialized) {
  400                 for (i = 0; i < ctrlr->num_io_queues; i++)
  401                         nvme_io_qpair_disable(&ctrlr->ioq[i]);
  402         }
  403 }
  404 
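      /*
       * Perform a full hardware reset: quiesce the queue pairs, disable the
       * controller, and then re-enable it.
       */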
  405 static int
  406 nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
  407 {
  408         int err;
  409 
  410         nvme_ctrlr_disable_qpairs(ctrlr);
  411 
  412         pause("nvmehwreset", hz / 10);
  413 
  414         err = nvme_ctrlr_disable(ctrlr);
  415         if (err != 0)
  416                 return err;
  417         return (nvme_ctrlr_enable(ctrlr));
  418 }
  419 
  420 void
  421 nvme_ctrlr_reset(struct nvme_controller *ctrlr)
  422 {
  423         int cmpset;
  424 
  425         cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);
  426 
  427         if (cmpset == 0 || ctrlr->is_failed)
  428                 /*
  429                  * Controller is already resetting or has failed.  Return
  430                  *  immediately since there is no need to kick off another
  431                  *  reset in these cases.
  432                  */
  433                 return;
  434 
  435         taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
  436 }
  437 
  438 static int
  439 nvme_ctrlr_identify(struct nvme_controller *ctrlr)
  440 {
  441         struct nvme_completion_poll_status      status;
  442 
  443         status.done = 0;
  444         nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
  445             nvme_completion_poll_cb, &status);
  446         nvme_completion_poll(&status);
  447         if (nvme_completion_is_error(&status.cpl)) {
  448                 nvme_printf(ctrlr, "nvme_identify_controller failed!\n");
  449                 return (ENXIO);
  450         }
  451 
  452         /* Convert data to host endian */
  453         nvme_controller_data_swapbytes(&ctrlr->cdata);
  454 
  455         /*
  456          * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
  457          *  controller supports.
  458          */
  459         if (ctrlr->cdata.mdts > 0)
  460                 ctrlr->max_xfer_size = min(ctrlr->max_xfer_size,
  461                     ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts)));
  462 
  463         return (0);
  464 }
  465 
  466 static int
  467 nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
  468 {
  469         struct nvme_completion_poll_status      status;
  470         int                                     cq_allocated, sq_allocated;
  471 
  472         status.done = 0;
  473         nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
  474             nvme_completion_poll_cb, &status);
  475         nvme_completion_poll(&status);
  476         if (nvme_completion_is_error(&status.cpl)) {
  477                 nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n");
  478                 return (ENXIO);
  479         }
  480 
  481         /*
  482          * Data in cdw0 is 0-based.
  483          * Lower 16-bits indicate number of submission queues allocated.
  484          * Upper 16-bits indicate number of completion queues allocated.
  485          */
  486         sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1;
  487         cq_allocated = (status.cpl.cdw0 >> 16) + 1;
  488 
  489         /*
  490          * Controller may allocate more queues than we requested,
  491          *  so use the minimum of the number requested and what was
  492          *  actually allocated.
  493          */
  494         ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated);
  495         ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated);
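              /*
               * Keep the I/O queue count a multiple of the number of memory
               * domains so the queues can be spread evenly across them.
               */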
  496         if (ctrlr->num_io_queues > vm_ndomains)
  497                 ctrlr->num_io_queues -= ctrlr->num_io_queues % vm_ndomains;
  498 
  499         return (0);
  500 }
  501 
  502 static int
  503 nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
  504 {
  505         struct nvme_completion_poll_status      status;
  506         struct nvme_qpair                       *qpair;
  507         int                                     i;
  508 
  509         for (i = 0; i < ctrlr->num_io_queues; i++) {
  510                 qpair = &ctrlr->ioq[i];
  511 
  512                 status.done = 0;
  513                 nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair,
  514                     nvme_completion_poll_cb, &status);
  515                 nvme_completion_poll(&status);
  516                 if (nvme_completion_is_error(&status.cpl)) {
  517                         nvme_printf(ctrlr, "nvme_create_io_cq failed!\n");
  518                         return (ENXIO);
  519                 }
  520 
  521                 status.done = 0;
  522                 nvme_ctrlr_cmd_create_io_sq(ctrlr, qpair,
  523                     nvme_completion_poll_cb, &status);
  524                 nvme_completion_poll(&status);
  525                 if (nvme_completion_is_error(&status.cpl)) {
  526                         nvme_printf(ctrlr, "nvme_create_io_sq failed!\n");
  527                         return (ENXIO);
  528                 }
  529         }
  530 
  531         return (0);
  532 }
  533 
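      /*
       * Tear down the I/O queue pairs on the controller by deleting each
       * submission queue and then its completion queue.
       */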
  534 static int
  535 nvme_ctrlr_delete_qpairs(struct nvme_controller *ctrlr)
  536 {
  537         struct nvme_completion_poll_status      status;
  538         struct nvme_qpair                       *qpair;
  539 
  540         for (int i = 0; i < ctrlr->num_io_queues; i++) {
  541                 qpair = &ctrlr->ioq[i];
  542 
  543                 status.done = 0;
  544                 nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair,
  545                     nvme_completion_poll_cb, &status);
  546                 nvme_completion_poll(&status);
  547                 if (nvme_completion_is_error(&status.cpl)) {
  548                         nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n");
  549                         return (ENXIO);
  550                 }
  551 
  552                 status.done = 0;
  553                 nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair,
  554                     nvme_completion_poll_cb, &status);
  555                 nvme_completion_poll(&status);
  556                 if (nvme_completion_is_error(&status.cpl)) {
  557                         nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n");
  558                         return (ENXIO);
  559                 }
  560         }
  561 
  562         return (0);
  563 }
  564 
  565 static int
  566 nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
  567 {
  568         struct nvme_namespace   *ns;
  569         uint32_t                i;
  570 
  571         for (i = 0; i < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); i++) {
  572                 ns = &ctrlr->ns[i];
  573                 nvme_ns_construct(ns, i+1, ctrlr);
  574         }
  575 
  576         return (0);
  577 }
  578 
  579 static bool
  580 is_log_page_id_valid(uint8_t page_id)
  581 {
  582 
  583         switch (page_id) {
  584         case NVME_LOG_ERROR:
  585         case NVME_LOG_HEALTH_INFORMATION:
  586         case NVME_LOG_FIRMWARE_SLOT:
  587         case NVME_LOG_CHANGED_NAMESPACE:
  588         case NVME_LOG_COMMAND_EFFECT:
  589         case NVME_LOG_RES_NOTIFICATION:
  590         case NVME_LOG_SANITIZE_STATUS:
  591                 return (true);
  592         }
  593 
  594         return (false);
  595 }
  596 
  597 static uint32_t
  598 nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id)
  599 {
  600         uint32_t        log_page_size;
  601 
  602         switch (page_id) {
  603         case NVME_LOG_ERROR:
  604                 log_page_size = min(
  605                     sizeof(struct nvme_error_information_entry) *
  606                     (ctrlr->cdata.elpe + 1), NVME_MAX_AER_LOG_SIZE);
  607                 break;
  608         case NVME_LOG_HEALTH_INFORMATION:
  609                 log_page_size = sizeof(struct nvme_health_information_page);
  610                 break;
  611         case NVME_LOG_FIRMWARE_SLOT:
  612                 log_page_size = sizeof(struct nvme_firmware_page);
  613                 break;
  614         case NVME_LOG_CHANGED_NAMESPACE:
  615                 log_page_size = sizeof(struct nvme_ns_list);
  616                 break;
  617         case NVME_LOG_COMMAND_EFFECT:
  618                 log_page_size = sizeof(struct nvme_command_effects_page);
  619                 break;
  620         case NVME_LOG_RES_NOTIFICATION:
  621                 log_page_size = sizeof(struct nvme_res_notification_page);
  622                 break;
  623         case NVME_LOG_SANITIZE_STATUS:
  624                 log_page_size = sizeof(struct nvme_sanitize_status_page);
  625                 break;
  626         default:
  627                 log_page_size = 0;
  628                 break;
  629         }
  630 
  631         return (log_page_size);
  632 }
  633 
  634 static void
  635 nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
  636     uint8_t state)
  637 {
  638 
  639         if (state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE)
  640                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  641                     "available spare space below threshold");
  642 
  643         if (state & NVME_CRIT_WARN_ST_TEMPERATURE)
  644                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  645                     "temperature above threshold");
  646 
  647         if (state & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY)
  648                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  649                     "device reliability degraded");
  650 
  651         if (state & NVME_CRIT_WARN_ST_READ_ONLY)
  652                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  653                     "media placed in read only mode");
  654 
  655         if (state & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP)
  656                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  657                     "volatile memory backup device failed");
  658 
  659         if (state & NVME_CRIT_WARN_ST_RESERVED_MASK)
  660                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  661                     "unknown critical warning(s): state = 0x%02x", state);
  662 }
  663 
  664 static void
  665 nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
  666 {
  667         struct nvme_async_event_request         *aer = arg;
  668         struct nvme_health_information_page     *health_info;
  669         struct nvme_ns_list                     *nsl;
  670         struct nvme_error_information_entry     *err;
  671         int i;
  672 
  673         /*
  674          * If the log page fetch for some reason completed with an error,
  675          *  don't pass log page data to the consumers.  In practice, this case
  676          *  should never happen.
  677          */
  678         if (nvme_completion_is_error(cpl))
  679                 nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
  680                     aer->log_page_id, NULL, 0);
  681         else {
  682                 /* Convert data to host endian */
  683                 switch (aer->log_page_id) {
  684                 case NVME_LOG_ERROR:
  685                         err = (struct nvme_error_information_entry *)aer->log_page_buffer;
  686                         for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
  687                                 nvme_error_information_entry_swapbytes(err++);
  688                         break;
  689                 case NVME_LOG_HEALTH_INFORMATION:
  690                         nvme_health_information_page_swapbytes(
  691                             (struct nvme_health_information_page *)aer->log_page_buffer);
  692                         break;
  693                 case NVME_LOG_FIRMWARE_SLOT:
  694                         nvme_firmware_page_swapbytes(
  695                             (struct nvme_firmware_page *)aer->log_page_buffer);
  696                         break;
  697                 case NVME_LOG_CHANGED_NAMESPACE:
  698                         nvme_ns_list_swapbytes(
  699                             (struct nvme_ns_list *)aer->log_page_buffer);
  700                         break;
  701                 case NVME_LOG_COMMAND_EFFECT:
  702                         nvme_command_effects_page_swapbytes(
  703                             (struct nvme_command_effects_page *)aer->log_page_buffer);
  704                         break;
  705                 case NVME_LOG_RES_NOTIFICATION:
  706                         nvme_res_notification_page_swapbytes(
  707                             (struct nvme_res_notification_page *)aer->log_page_buffer);
  708                         break;
  709                 case NVME_LOG_SANITIZE_STATUS:
  710                         nvme_sanitize_status_page_swapbytes(
  711                             (struct nvme_sanitize_status_page *)aer->log_page_buffer);
  712                         break;
  713                 case INTEL_LOG_TEMP_STATS:
  714                         intel_log_temp_stats_swapbytes(
  715                             (struct intel_log_temp_stats *)aer->log_page_buffer);
  716                         break;
  717                 default:
  718                         break;
  719                 }
  720 
  721                 if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
  722                         health_info = (struct nvme_health_information_page *)
  723                             aer->log_page_buffer;
  724                         nvme_ctrlr_log_critical_warnings(aer->ctrlr,
  725                             health_info->critical_warning);
  726                         /*
  727                          * Critical warnings reported through the
  728                          *  SMART/health log page are persistent, so
  729                          *  clear the associated bits in the async event
  730                          *  config so that we do not receive repeated
  731                          *  notifications for the same event.
  732                          */
  733                         aer->ctrlr->async_event_config &=
  734                             ~health_info->critical_warning;
  735                         nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
  736                             aer->ctrlr->async_event_config, NULL, NULL);
  737                 } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
  738                     !nvme_use_nvd) {
  739                         nsl = (struct nvme_ns_list *)aer->log_page_buffer;
  740                         for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
  741                                 if (nsl->ns[i] > NVME_MAX_NAMESPACES)
  742                                         break;
  743                                 nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
  744                         }
  745                 }
  746 
  747                 /*
  748                  * Pass the cpl data from the original async event completion,
  749                  *  not the log page fetch.
  750                  */
  751                 nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
  752                     aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
  753         }
  754 
  755         /*
  756          * Repost another asynchronous event request to replace the one
  757          *  that just completed.
  758          */
  759         nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
  760 }
  761 
  762 static void
  763 nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
  764 {
  765         struct nvme_async_event_request *aer = arg;
  766 
  767         if (nvme_completion_is_error(cpl)) {
  768                 /*
  769                  *  Do not retry failed async event requests.  This avoids
  770                  *  infinite loops where a new async event request is submitted
  771                  *  to replace the one that just failed, only to fail again and
  772                  *  perpetuate the loop.
  773                  */
  774                 return;
  775         }
  776 
  777         /* Associated log page is in bits 23:16 of completion entry dw0. */
  778         aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
  779 
  780         nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
  781             " page 0x%02x)\n", (cpl->cdw0 & 0x07), (cpl->cdw0 & 0xFF00) >> 8,
  782             aer->log_page_id);
  783 
  784         if (is_log_page_id_valid(aer->log_page_id)) {
  785                 aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
  786                     aer->log_page_id);
  787                 memcpy(&aer->cpl, cpl, sizeof(*cpl));
  788                 nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
  789                     NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
  790                     aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
  791                     aer);
  792                 /* Wait to notify consumers until after log page is fetched. */
  793         } else {
  794                 nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
  795                     NULL, 0);
  796 
  797                 /*
  798                  * Repost another asynchronous event request to replace the one
  799                  *  that just completed.
  800                  */
  801                 nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
  802         }
  803 }
  804 
  805 static void
  806 nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
  807     struct nvme_async_event_request *aer)
  808 {
  809         struct nvme_request *req;
  810 
  811         aer->ctrlr = ctrlr;
  812         req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer);
  813         aer->req = req;
  814 
  815         /*
  816          * Disable timeout here, since asynchronous event requests should by
  817          *  nature never be timed out.
  818          */
  819         req->timeout = false;
  820         req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
  821         nvme_ctrlr_submit_admin_request(ctrlr, req);
  822 }
  823 
  824 static void
  825 nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
  826 {
  827         struct nvme_completion_poll_status      status;
  828         struct nvme_async_event_request         *aer;
  829         uint32_t                                i;
  830 
  831         ctrlr->async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE |
  832             NVME_CRIT_WARN_ST_DEVICE_RELIABILITY |
  833             NVME_CRIT_WARN_ST_READ_ONLY |
  834             NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP;
  835         if (ctrlr->cdata.ver >= NVME_REV(1, 2))
  836                 ctrlr->async_event_config |= NVME_ASYNC_EVENT_NS_ATTRIBUTE |
  837                     NVME_ASYNC_EVENT_FW_ACTIVATE;
  838 
  839         status.done = 0;
  840         nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD,
  841             0, NULL, 0, nvme_completion_poll_cb, &status);
  842         nvme_completion_poll(&status);
  843         if (nvme_completion_is_error(&status.cpl) ||
  844             (status.cpl.cdw0 & 0xFFFF) == 0xFFFF ||
  845             (status.cpl.cdw0 & 0xFFFF) == 0x0000) {
  846                 nvme_printf(ctrlr, "temperature threshold not supported\n");
  847         } else
  848                 ctrlr->async_event_config |= NVME_CRIT_WARN_ST_TEMPERATURE;
  849 
  850         nvme_ctrlr_cmd_set_async_event_config(ctrlr,
  851             ctrlr->async_event_config, NULL, NULL);
  852 
  853         /* aerl is a zero-based value, so we need to add 1 here. */
  854         ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));
  855 
  856         for (i = 0; i < ctrlr->num_aers; i++) {
  857                 aer = &ctrlr->aer[i];
  858                 nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
  859         }
  860 }
  861 
  862 static void
  863 nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
  864 {
  865 
  866         ctrlr->int_coal_time = 0;
  867         TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
  868             &ctrlr->int_coal_time);
  869 
  870         ctrlr->int_coal_threshold = 0;
  871         TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
  872             &ctrlr->int_coal_threshold);
  873 
  874         nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
  875             ctrlr->int_coal_threshold, NULL, NULL);
  876 }
  877 
  878 static void
  879 nvme_ctrlr_hmb_free(struct nvme_controller *ctrlr)
  880 {
  881         struct nvme_hmb_chunk *hmbc;
  882         int i;
  883 
  884         if (ctrlr->hmb_desc_paddr) {
  885                 bus_dmamap_unload(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map);
  886                 bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
  887                     ctrlr->hmb_desc_map);
  888                 ctrlr->hmb_desc_paddr = 0;
  889         }
  890         if (ctrlr->hmb_desc_tag) {
  891                 bus_dma_tag_destroy(ctrlr->hmb_desc_tag);
  892                 ctrlr->hmb_desc_tag = NULL;
  893         }
  894         for (i = 0; i < ctrlr->hmb_nchunks; i++) {
  895                 hmbc = &ctrlr->hmb_chunks[i];
  896                 bus_dmamap_unload(ctrlr->hmb_tag, hmbc->hmbc_map);
  897                 bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
  898                     hmbc->hmbc_map);
  899         }
  900         ctrlr->hmb_nchunks = 0;
  901         if (ctrlr->hmb_tag) {
  902                 bus_dma_tag_destroy(ctrlr->hmb_tag);
  903                 ctrlr->hmb_tag = NULL;
  904         }
  905         if (ctrlr->hmb_chunks) {
  906                 free(ctrlr->hmb_chunks, M_NVME);
  907                 ctrlr->hmb_chunks = NULL;
  908         }
  909 }
  910 
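      /*
       * Allocate the host memory buffer: start from the controller's
       * preferred size (HMPRE), retry with smaller chunks if allocations
       * fail, give up if the minimum (HMMIN) cannot be met, and finally
       * build the descriptor list describing the allocated chunks.
       */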
  911 static void
  912 nvme_ctrlr_hmb_alloc(struct nvme_controller *ctrlr)
  913 {
  914         struct nvme_hmb_chunk *hmbc;
  915         size_t pref, min, minc, size;
  916         int err, i;
  917         uint64_t max;
  918 
  919         /* Limit HMB to 5% of RAM size per device by default. */
  920         max = (uint64_t)physmem * PAGE_SIZE / 20;
  921         TUNABLE_UINT64_FETCH("hw.nvme.hmb_max", &max);
  922 
  923         min = (long long unsigned)ctrlr->cdata.hmmin * 4096;
  924         if (max == 0 || max < min)
  925                 return;
  926         pref = MIN((long long unsigned)ctrlr->cdata.hmpre * 4096, max);
  927         minc = MAX(ctrlr->cdata.hmminds * 4096, PAGE_SIZE);
  928         if (min > 0 && ctrlr->cdata.hmmaxd > 0)
  929                 minc = MAX(minc, min / ctrlr->cdata.hmmaxd);
  930         ctrlr->hmb_chunk = pref;
  931 
  932 again:
  933         ctrlr->hmb_chunk = roundup2(ctrlr->hmb_chunk, PAGE_SIZE);
  934         ctrlr->hmb_nchunks = howmany(pref, ctrlr->hmb_chunk);
  935         if (ctrlr->cdata.hmmaxd > 0 && ctrlr->hmb_nchunks > ctrlr->cdata.hmmaxd)
  936                 ctrlr->hmb_nchunks = ctrlr->cdata.hmmaxd;
  937         ctrlr->hmb_chunks = malloc(sizeof(struct nvme_hmb_chunk) *
  938             ctrlr->hmb_nchunks, M_NVME, M_WAITOK);
  939         err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
  940             PAGE_SIZE, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
  941             ctrlr->hmb_chunk, 1, ctrlr->hmb_chunk, 0, NULL, NULL, &ctrlr->hmb_tag);
  942         if (err != 0) {
  943                 nvme_printf(ctrlr, "HMB tag create failed %d\n", err);
  944                 nvme_ctrlr_hmb_free(ctrlr);
  945                 return;
  946         }
  947 
  948         for (i = 0; i < ctrlr->hmb_nchunks; i++) {
  949                 hmbc = &ctrlr->hmb_chunks[i];
  950                 if (bus_dmamem_alloc(ctrlr->hmb_tag,
  951                     (void **)&hmbc->hmbc_vaddr, BUS_DMA_NOWAIT,
  952                     &hmbc->hmbc_map)) {
  953                         nvme_printf(ctrlr, "failed to alloc HMB\n");
  954                         break;
  955                 }
  956                 if (bus_dmamap_load(ctrlr->hmb_tag, hmbc->hmbc_map,
  957                     hmbc->hmbc_vaddr, ctrlr->hmb_chunk, nvme_single_map,
  958                     &hmbc->hmbc_paddr, BUS_DMA_NOWAIT) != 0) {
  959                         bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
  960                             hmbc->hmbc_map);
  961                         nvme_printf(ctrlr, "failed to load HMB\n");
  962                         break;
  963                 }
  964                 bus_dmamap_sync(ctrlr->hmb_tag, hmbc->hmbc_map,
  965                     BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
  966         }
  967 
  968         if (i < ctrlr->hmb_nchunks && i * ctrlr->hmb_chunk < min &&
  969             ctrlr->hmb_chunk / 2 >= minc) {
  970                 ctrlr->hmb_nchunks = i;
  971                 nvme_ctrlr_hmb_free(ctrlr);
  972                 ctrlr->hmb_chunk /= 2;
  973                 goto again;
  974         }
  975         ctrlr->hmb_nchunks = i;
  976         if (ctrlr->hmb_nchunks * ctrlr->hmb_chunk < min) {
  977                 nvme_ctrlr_hmb_free(ctrlr);
  978                 return;
  979         }
  980 
  981         size = sizeof(struct nvme_hmb_desc) * ctrlr->hmb_nchunks;
  982         err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
  983             16, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
  984             size, 1, size, 0, NULL, NULL, &ctrlr->hmb_desc_tag);
  985         if (err != 0) {
  986                 nvme_printf(ctrlr, "HMB desc tag create failed %d\n", err);
  987                 nvme_ctrlr_hmb_free(ctrlr);
  988                 return;
  989         }
  990         if (bus_dmamem_alloc(ctrlr->hmb_desc_tag,
  991             (void **)&ctrlr->hmb_desc_vaddr, BUS_DMA_WAITOK,
  992             &ctrlr->hmb_desc_map)) {
  993                 nvme_printf(ctrlr, "failed to alloc HMB desc\n");
  994                 nvme_ctrlr_hmb_free(ctrlr);
  995                 return;
  996         }
  997         if (bus_dmamap_load(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
  998             ctrlr->hmb_desc_vaddr, size, nvme_single_map,
  999             &ctrlr->hmb_desc_paddr, BUS_DMA_NOWAIT) != 0) {
 1000                 bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
 1001                     ctrlr->hmb_desc_map);
 1002                 nvme_printf(ctrlr, "failed to load HMB desc\n");
 1003                 nvme_ctrlr_hmb_free(ctrlr);
 1004                 return;
 1005         }
 1006 
 1007         for (i = 0; i < ctrlr->hmb_nchunks; i++) {
 1008                 ctrlr->hmb_desc_vaddr[i].addr =
 1009                     htole64(ctrlr->hmb_chunks[i].hmbc_paddr);
 1010                 ctrlr->hmb_desc_vaddr[i].size = htole32(ctrlr->hmb_chunk / 4096);
 1011         }
 1012         bus_dmamap_sync(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
 1013             BUS_DMASYNC_PREWRITE);
 1014 
 1015         nvme_printf(ctrlr, "Allocated %lluMB host memory buffer\n",
 1016             (long long unsigned)ctrlr->hmb_nchunks * ctrlr->hmb_chunk
 1017             / 1024 / 1024);
 1018 }
 1019 
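      /*
       * Enable (or hand back) the host memory buffer via the Host Memory
       * Buffer feature: cdw11 bit 0 enables the buffer and bit 1 indicates
       * that previously allocated host memory is being returned unchanged.
       */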
 1020 static void
 1021 nvme_ctrlr_hmb_enable(struct nvme_controller *ctrlr, bool enable, bool memret)
 1022 {
 1023         struct nvme_completion_poll_status      status;
 1024         uint32_t cdw11;
 1025 
 1026         cdw11 = 0;
 1027         if (enable)
 1028                 cdw11 |= 1;
 1029         if (memret)
 1030                 cdw11 |= 2;
 1031         status.done = 0;
 1032         nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_HOST_MEMORY_BUFFER, cdw11,
 1033             ctrlr->hmb_nchunks * ctrlr->hmb_chunk / 4096, ctrlr->hmb_desc_paddr,
 1034             ctrlr->hmb_desc_paddr >> 32, ctrlr->hmb_nchunks, NULL, 0,
 1035             nvme_completion_poll_cb, &status);
 1036         nvme_completion_poll(&status);
 1037         if (nvme_completion_is_error(&status.cpl))
 1038                 nvme_printf(ctrlr, "nvme_ctrlr_hmb_enable failed!\n");
 1039 }
 1040 
 1041 static void
 1042 nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
 1043 {
 1044         struct nvme_controller *ctrlr = ctrlr_arg;
 1045         uint32_t old_num_io_queues;
 1046         int i;
 1047 
 1048         /*
 1049          * Only reset adminq here when we are restarting the
 1050          *  controller after a reset.  During initialization,
 1051          *  we have already submitted admin commands to get
 1052          *  the number of I/O queues supported, so cannot reset
 1053          *  the adminq again here.
 1054          */
 1055         if (resetting) {
 1056                 nvme_qpair_reset(&ctrlr->adminq);
 1057                 nvme_admin_qpair_enable(&ctrlr->adminq);
 1058         }
 1059 
 1060         if (ctrlr->ioq != NULL) {
 1061                 for (i = 0; i < ctrlr->num_io_queues; i++)
 1062                         nvme_qpair_reset(&ctrlr->ioq[i]);
 1063         }
 1064 
 1065         /*
 1066          * If it was a reset due to an initialization command timeout, just
 1067          * return here and let the initialization code fail gracefully.
 1068          */
 1069         if (resetting && !ctrlr->is_initialized)
 1070                 return;
 1071 
 1072         if (resetting && nvme_ctrlr_identify(ctrlr) != 0) {
 1073                 nvme_ctrlr_fail(ctrlr);
 1074                 return;
 1075         }
 1076 
 1077         /*
 1078          * The number of qpairs is determined during controller initialization,
 1079          *  including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the
 1080          *  HW limit.  We call SET_FEATURES again here so that it gets called
 1081          *  after any reset for controllers that depend on the driver to
 1082          *  explicitly specify how many queues it will use.  This value should
 1083          *  never change between resets, so panic if it somehow does.
 1084          */
 1085         if (resetting) {
 1086                 old_num_io_queues = ctrlr->num_io_queues;
 1087                 if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
 1088                         nvme_ctrlr_fail(ctrlr);
 1089                         return;
 1090                 }
 1091 
 1092                 if (old_num_io_queues != ctrlr->num_io_queues) {
 1093                         panic("num_io_queues changed from %u to %u",
 1094                               old_num_io_queues, ctrlr->num_io_queues);
 1095                 }
 1096         }
 1097 
 1098         if (ctrlr->cdata.hmpre > 0 && ctrlr->hmb_nchunks == 0) {
 1099                 nvme_ctrlr_hmb_alloc(ctrlr);
 1100                 if (ctrlr->hmb_nchunks > 0)
 1101                         nvme_ctrlr_hmb_enable(ctrlr, true, false);
 1102         } else if (ctrlr->hmb_nchunks > 0)
 1103                 nvme_ctrlr_hmb_enable(ctrlr, true, true);
 1104 
 1105         if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
 1106                 nvme_ctrlr_fail(ctrlr);
 1107                 return;
 1108         }
 1109 
 1110         if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
 1111                 nvme_ctrlr_fail(ctrlr);
 1112                 return;
 1113         }
 1114 
 1115         nvme_ctrlr_configure_aer(ctrlr);
 1116         nvme_ctrlr_configure_int_coalescing(ctrlr);
 1117 
 1118         for (i = 0; i < ctrlr->num_io_queues; i++)
 1119                 nvme_io_qpair_enable(&ctrlr->ioq[i]);
 1120 }
 1121 
 1122 void
 1123 nvme_ctrlr_start_config_hook(void *arg)
 1124 {
 1125         struct nvme_controller *ctrlr = arg;
 1126 
 1127         /*
 1128          * Reset the controller twice to ensure we do a transition from cc.en==1
 1129          * to cc.en==0.  This is because we don't really know what state the
 1130          * controller was left in when boot handed off to the OS.  Linux doesn't
 1131          * do this, however.  If we adopt that policy, see also nvme_ctrlr_resume().
 1132          */
 1133         if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
 1134 fail:
 1135                 nvme_ctrlr_fail(ctrlr);
 1136                 config_intrhook_disestablish(&ctrlr->config_hook);
 1137                 return;
 1138         }
 1139 
 1140         if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 1141                 goto fail;
 1142 
 1143         nvme_qpair_reset(&ctrlr->adminq);
 1144         nvme_admin_qpair_enable(&ctrlr->adminq);
 1145 
 1146         if (nvme_ctrlr_identify(ctrlr) == 0 &&
 1147             nvme_ctrlr_set_num_qpairs(ctrlr) == 0 &&
 1148             nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
 1149                 nvme_ctrlr_start(ctrlr, false);
 1150         else
 1151                 goto fail;
 1152 
 1153         nvme_sysctl_initialize_ctrlr(ctrlr);
 1154         config_intrhook_disestablish(&ctrlr->config_hook);
 1155 
 1156         ctrlr->is_initialized = 1;
 1157         nvme_notify_new_controller(ctrlr);
 1158 }
 1159 
 1160 static void
 1161 nvme_ctrlr_reset_task(void *arg, int pending)
 1162 {
 1163         struct nvme_controller  *ctrlr = arg;
 1164         int                     status;
 1165 
 1166         nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
 1167         status = nvme_ctrlr_hw_reset(ctrlr);
 1168         /*
 1169          * Use pause instead of DELAY, so that we yield to any nvme interrupt
 1170          *  handlers on this CPU that were blocked on a qpair lock. We want
 1171          *  all nvme interrupts completed before proceeding with restarting the
 1172          *  controller.
 1173          *
 1174          * XXX - any way to guarantee the interrupt handlers have quiesced?
 1175          */
 1176         pause("nvmereset", hz / 10);
 1177         if (status == 0)
 1178                 nvme_ctrlr_start(ctrlr, true);
 1179         else
 1180                 nvme_ctrlr_fail(ctrlr);
 1181 
 1182         atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 1183 }
 1184 
 1185 /*
 1186  * Poll all the queues enabled on the device for completion.
 1187  */
 1188 void
 1189 nvme_ctrlr_poll(struct nvme_controller *ctrlr)
 1190 {
 1191         int i;
 1192 
 1193         nvme_qpair_process_completions(&ctrlr->adminq);
 1194 
 1195         for (i = 0; i < ctrlr->num_io_queues; i++)
 1196                 if (ctrlr->ioq && ctrlr->ioq[i].cpl)
 1197                         nvme_qpair_process_completions(&ctrlr->ioq[i]);
 1198 }
 1199 
 1200 /*
 1201  * Poll the single-vector interrupt case: num_io_queues will be 1 and
 1202  * there's only a single vector. While we're polling, we mask further
 1203  * interrupts in the controller.
 1204  */
 1205 void
 1206 nvme_ctrlr_shared_handler(void *arg)
 1207 {
 1208         struct nvme_controller *ctrlr = arg;
 1209 
 1210         nvme_mmio_write_4(ctrlr, intms, 1);
 1211         nvme_ctrlr_poll(ctrlr);
 1212         nvme_mmio_write_4(ctrlr, intmc, 1);
 1213 }
 1214 
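      /*
       * Completion callback for pass-through commands: copy the completion
       * (with the phase bit cleared) back to the caller's nvme_pt_command
       * and wake up the thread sleeping in nvme_ctrlr_passthrough_cmd().
       */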
 1215 static void
 1216 nvme_pt_done(void *arg, const struct nvme_completion *cpl)
 1217 {
 1218         struct nvme_pt_command *pt = arg;
 1219         struct mtx *mtx = pt->driver_lock;
 1220         uint16_t status;
 1221 
 1222         bzero(&pt->cpl, sizeof(pt->cpl));
 1223         pt->cpl.cdw0 = cpl->cdw0;
 1224 
 1225         status = cpl->status;
 1226         status &= ~NVME_STATUS_P_MASK;
 1227         pt->cpl.status = status;
 1228 
 1229         mtx_lock(mtx);
 1230         pt->driver_lock = NULL;
 1231         wakeup(pt);
 1232         mtx_unlock(mtx);
 1233 }
 1234 
 1235 int
 1236 nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
 1237     struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer,
 1238     int is_admin_cmd)
 1239 {
 1240         struct nvme_request     *req;
 1241         struct mtx              *mtx;
 1242         struct buf              *buf = NULL;
 1243         int                     ret = 0;
 1244 
 1245         if (pt->len > 0) {
 1246                 if (pt->len > ctrlr->max_xfer_size) {
 1247                         nvme_printf(ctrlr, "pt->len (%d) "
 1248                             "exceeds max_xfer_size (%d)\n", pt->len,
 1249                             ctrlr->max_xfer_size);
 1250                         return EIO;
 1251                 }
 1252                 if (is_user_buffer) {
 1253                         /*
 1254                          * Ensure the user buffer is wired for the duration of
 1255                          *  this pass-through command.
 1256                          */
 1257                         PHOLD(curproc);
 1258                         buf = uma_zalloc(pbuf_zone, M_WAITOK);
 1259                         buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
 1260                         if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) {
 1261                                 ret = EFAULT;
 1262                                 goto err;
 1263                         }
 1264                         req = nvme_allocate_request_vaddr(buf->b_data, pt->len, 
 1265                             nvme_pt_done, pt);
 1266                 } else
 1267                         req = nvme_allocate_request_vaddr(pt->buf, pt->len,
 1268                             nvme_pt_done, pt);
 1269         } else
 1270                 req = nvme_allocate_request_null(nvme_pt_done, pt);
 1271 
 1272         /* Assume user space already converted to little-endian */
 1273         req->cmd.opc = pt->cmd.opc;
 1274         req->cmd.fuse = pt->cmd.fuse;
 1275         req->cmd.rsvd2 = pt->cmd.rsvd2;
 1276         req->cmd.rsvd3 = pt->cmd.rsvd3;
 1277         req->cmd.cdw10 = pt->cmd.cdw10;
 1278         req->cmd.cdw11 = pt->cmd.cdw11;
 1279         req->cmd.cdw12 = pt->cmd.cdw12;
 1280         req->cmd.cdw13 = pt->cmd.cdw13;
 1281         req->cmd.cdw14 = pt->cmd.cdw14;
 1282         req->cmd.cdw15 = pt->cmd.cdw15;
 1283 
 1284         req->cmd.nsid = htole32(nsid);
 1285 
 1286         mtx = mtx_pool_find(mtxpool_sleep, pt);
 1287         pt->driver_lock = mtx;
 1288 
 1289         if (is_admin_cmd)
 1290                 nvme_ctrlr_submit_admin_request(ctrlr, req);
 1291         else
 1292                 nvme_ctrlr_submit_io_request(ctrlr, req);
 1293 
 1294         mtx_lock(mtx);
 1295         while (pt->driver_lock != NULL)
 1296                 mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0);
 1297         mtx_unlock(mtx);
 1298 
 1299 err:
 1300         if (buf != NULL) {
 1301                 uma_zfree(pbuf_zone, buf);
 1302                 PRELE(curproc);
 1303         }
 1304 
 1305         return (ret);
 1306 }
 1307 
 1308 static int
 1309 nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
 1310     struct thread *td)
 1311 {
 1312         struct nvme_controller                  *ctrlr;
 1313         struct nvme_pt_command                  *pt;
 1314 
 1315         ctrlr = cdev->si_drv1;
 1316 
 1317         switch (cmd) {
 1318         case NVME_RESET_CONTROLLER:
 1319                 nvme_ctrlr_reset(ctrlr);
 1320                 break;
 1321         case NVME_PASSTHROUGH_CMD:
 1322                 pt = (struct nvme_pt_command *)arg;
 1323                 return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, le32toh(pt->cmd.nsid),
 1324                     1 /* is_user_buffer */, 1 /* is_admin_cmd */));
 1325         case NVME_GET_NSID:
 1326         {
 1327                 struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
 1328                 strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
 1329                     sizeof(gnsid->cdev));
 1330                 gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
 1331                 gnsid->nsid = 0;
 1332                 break;
 1333         }
 1334         case NVME_GET_MAX_XFER_SIZE:
 1335                 *(uint64_t *)arg = ctrlr->max_xfer_size;
 1336                 break;
 1337         default:
 1338                 return (ENOTTY);
 1339         }
 1340 
 1341         return (0);
 1342 }
 1343 
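      /*
       * Illustrative sketch (not part of the driver): a userland caller might
       * use the NVME_PASSTHROUGH_CMD ioctl above roughly as follows to issue
       * an Identify Controller admin command.  The structure and macro names
       * are the public ones from <dev/nvme/nvme.h>; error handling and the
       * usual userland includes are omitted, and command fields are supplied
       * in little-endian form as the driver expects.
       *
       *      struct nvme_controller_data cdata;
       *      struct nvme_pt_command pt;
       *      int fd = open("/dev/nvme0", O_RDWR);
       *
       *      memset(&pt, 0, sizeof(pt));
       *      pt.cmd.opc = NVME_OPC_IDENTIFY;
       *      // CNS 1: identify the controller itself
       *      pt.cmd.cdw10 = htole32(1);
       *      pt.buf = &cdata;
       *      pt.len = sizeof(cdata);
       *      pt.is_read = 1;
       *      ioctl(fd, NVME_PASSTHROUGH_CMD, &pt);
       */
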
 1344 static struct cdevsw nvme_ctrlr_cdevsw = {
 1345         .d_version =    D_VERSION,
 1346         .d_flags =      0,
 1347         .d_ioctl =      nvme_ctrlr_ioctl
 1348 };
 1349 
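      /*
       * First-stage controller construction: the software-only setup done
       * before the controller is enabled.  Reads CAP (and, when bootverbose,
       * VS and PMRCAP), derives the doorbell stride, minimum page size and
       * ready timeout, fetches the hw.nvme.* tunables, constructs the admin
       * qpair, creates the reset/failure taskqueue, and creates the
       * /dev/nvmeX character device.
       */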
 1350 int
 1351 nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
 1352 {
 1353         struct make_dev_args    md_args;
 1354         uint32_t        cap_lo;
 1355         uint32_t        cap_hi;
 1356         uint32_t        to, vs, pmrcap;
 1357         uint8_t         mpsmin;
 1358         int             status, timeout_period;
 1359 
 1360         ctrlr->dev = dev;
 1361 
 1362         mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF);
 1363         if (bus_get_domain(dev, &ctrlr->domain) != 0)
 1364                 ctrlr->domain = 0;
 1365 
 1366         cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
 1367         if (bootverbose) {
 1368                 device_printf(dev, "CapLo: 0x%08x: MQES %u%s%s%s%s, TO %u\n",
 1369                     cap_lo, NVME_CAP_LO_MQES(cap_lo),
 1370                     NVME_CAP_LO_CQR(cap_lo) ? ", CQR" : "",
 1371                     NVME_CAP_LO_AMS(cap_lo) ? ", AMS" : "",
 1372                     (NVME_CAP_LO_AMS(cap_lo) & 0x1) ? " WRRwUPC" : "",
 1373                     (NVME_CAP_LO_AMS(cap_lo) & 0x2) ? " VS" : "",
 1374                     NVME_CAP_LO_TO(cap_lo));
 1375         }
 1376         cap_hi = nvme_mmio_read_4(ctrlr, cap_hi);
 1377         if (bootverbose) {
 1378                 device_printf(dev, "CapHi: 0x%08x: DSTRD %u%s, CSS %x%s, "
 1379                     "MPSMIN %u, MPSMAX %u%s%s\n", cap_hi,
 1380                     NVME_CAP_HI_DSTRD(cap_hi),
 1381                     NVME_CAP_HI_NSSRS(cap_hi) ? ", NSSRS" : "",
 1382                     NVME_CAP_HI_CSS(cap_hi),
 1383                     NVME_CAP_HI_BPS(cap_hi) ? ", BPS" : "",
 1384                     NVME_CAP_HI_MPSMIN(cap_hi),
 1385                     NVME_CAP_HI_MPSMAX(cap_hi),
 1386                     NVME_CAP_HI_PMRS(cap_hi) ? ", PMRS" : "",
 1387                     NVME_CAP_HI_CMBS(cap_hi) ? ", CMBS" : "");
 1388         }
 1389         if (bootverbose) {
 1390                 vs = nvme_mmio_read_4(ctrlr, vs);
 1391                 device_printf(dev, "Version: 0x%08x: %d.%d\n", vs,
 1392                     NVME_MAJOR(vs), NVME_MINOR(vs));
 1393         }
 1394         if (bootverbose && NVME_CAP_HI_PMRS(cap_hi)) {
 1395                 pmrcap = nvme_mmio_read_4(ctrlr, pmrcap);
 1396                 device_printf(dev, "PMRCap: 0x%08x: BIR %u%s%s, PMRTU %u, "
 1397                     "PMRWBM %x, PMRTO %u%s\n", pmrcap,
 1398                     NVME_PMRCAP_BIR(pmrcap),
 1399                     NVME_PMRCAP_RDS(pmrcap) ? ", RDS" : "",
 1400                     NVME_PMRCAP_WDS(pmrcap) ? ", WDS" : "",
 1401                     NVME_PMRCAP_PMRTU(pmrcap),
 1402                     NVME_PMRCAP_PMRWBM(pmrcap),
 1403                     NVME_PMRCAP_PMRTO(pmrcap),
 1404                     NVME_PMRCAP_CMSS(pmrcap) ? ", CMSS" : "");
 1405         }
 1406 
 1407         ctrlr->dstrd = NVME_CAP_HI_DSTRD(cap_hi) + 2;
 1408 
 1409         mpsmin = NVME_CAP_HI_MPSMIN(cap_hi);
 1410         ctrlr->min_page_size = 1 << (12 + mpsmin);
 1411 
 1412         /* Get ready timeout value from controller, in units of 500ms. */
 1413         to = NVME_CAP_LO_TO(cap_lo) + 1;
 1414         ctrlr->ready_timeout_in_ms = to * 500;
 1415 
 1416         timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD;
 1417         TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period);
 1418         timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
 1419         timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
 1420         ctrlr->timeout_period = timeout_period;
 1421 
 1422         nvme_retry_count = NVME_DEFAULT_RETRY_COUNT;
 1423         TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count);
 1424 
 1425         ctrlr->enable_aborts = 0;
 1426         TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);
 1427 
 1428         ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
 1429         if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0)
 1430                 return (ENXIO);
 1431 
 1432         /*
 1433          * Create 2 threads for the taskqueue. The reset thread, when it detects
 1434          * that the controller has failed, will block until all I/O has been
 1435          * failed up the stack. The fail_req task must still be able to run in
 1436          * that case to finish failing those requests.
 1437          *
 1438          * We could partially solve this race by draining the failed request
 1439          * queue before proceeding to free the sim, though nothing would stop
 1440          * new I/O from coming in after we do that drain, but before we reach
 1441          * cam_sim_free, so this big hammer is used instead.
 1442          */
 1443         ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
 1444             taskqueue_thread_enqueue, &ctrlr->taskqueue);
 1445         taskqueue_start_threads(&ctrlr->taskqueue, 2, PI_DISK, "nvme taskq");
 1446 
 1447         ctrlr->is_resetting = 0;
 1448         ctrlr->is_initialized = 0;
 1449         ctrlr->notification_sent = 0;
 1450         TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
 1451         TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr);
 1452         STAILQ_INIT(&ctrlr->fail_req);
 1453         ctrlr->is_failed = false;
 1454 
 1455         make_dev_args_init(&md_args);
 1456         md_args.mda_devsw = &nvme_ctrlr_cdevsw;
 1457         md_args.mda_uid = UID_ROOT;
 1458         md_args.mda_gid = GID_WHEEL;
 1459         md_args.mda_mode = 0600;
 1460         md_args.mda_unit = device_get_unit(dev);
 1461         md_args.mda_si_drv1 = (void *)ctrlr;
 1462         status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d",
 1463             device_get_unit(dev));
 1464         if (status != 0)
 1465                 return (ENXIO);
 1466 
 1467         return (0);
 1468 }
 1469 
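      /*
       * Tear down the controller on detach.  A CSTS read of all ones
       * (NVME_GONE) means the device was hot-unplugged, so the controller is
       * failed and all shutdown commands are skipped; otherwise consumers are
       * notified of the failure.  The namespaces, cdev, host memory buffer
       * and qpairs are then destroyed and the interrupt and register
       * resources released.
       */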
 1470 void
 1471 nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
 1472 {
 1473         int     gone, i;
 1474 
 1475         if (ctrlr->resource == NULL)
 1476                 goto nores;
 1477         if (!mtx_initialized(&ctrlr->adminq.lock))
 1478                 goto noadminq;
 1479 
 1480         /*
 1481          * Check whether it is a hot unplug or a clean driver detach.
 1482          * If device is not there any more, skip any shutdown commands.
 1483          */
 1484         gone = (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE);
 1485         if (gone)
 1486                 nvme_ctrlr_fail(ctrlr);
 1487         else
 1488                 nvme_notify_fail_consumers(ctrlr);
 1489 
 1490         for (i = 0; i < NVME_MAX_NAMESPACES; i++)
 1491                 nvme_ns_destruct(&ctrlr->ns[i]);
 1492 
 1493         if (ctrlr->cdev)
 1494                 destroy_dev(ctrlr->cdev);
 1495 
 1496         if (ctrlr->is_initialized) {
 1497                 if (!gone) {
 1498                         if (ctrlr->hmb_nchunks > 0)
 1499                                 nvme_ctrlr_hmb_enable(ctrlr, false, false);
 1500                         nvme_ctrlr_delete_qpairs(ctrlr);
 1501                 }
 1502                 nvme_ctrlr_hmb_free(ctrlr);
 1503         }
 1504         if (ctrlr->ioq != NULL) {
 1505                 for (i = 0; i < ctrlr->num_io_queues; i++)
 1506                         nvme_io_qpair_destroy(&ctrlr->ioq[i]);
 1507                 free(ctrlr->ioq, M_NVME);
 1508         }
 1509         nvme_admin_qpair_destroy(&ctrlr->adminq);
 1510 
 1511         /*
 1512          *  Notify the controller of a shutdown, even though this is due to
 1513          *   a driver unload, not a system shutdown (this path is not invoked
 1514          *   during shutdown).  This ensures the controller receives a
 1515          *   shutdown notification in case the system is shut down before
 1516          *   reloading the driver.
 1517          */
 1518         if (!gone)
 1519                 nvme_ctrlr_shutdown(ctrlr);
 1520 
 1521         if (!gone)
 1522                 nvme_ctrlr_disable(ctrlr);
 1523 
 1524 noadminq:
 1525         if (ctrlr->taskqueue)
 1526                 taskqueue_free(ctrlr->taskqueue);
 1527 
 1528         if (ctrlr->tag)
 1529                 bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
 1530 
 1531         if (ctrlr->res)
 1532                 bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
 1533                     rman_get_rid(ctrlr->res), ctrlr->res);
 1534 
 1535         if (ctrlr->bar4_resource != NULL) {
 1536                 bus_release_resource(dev, SYS_RES_MEMORY,
 1537                     ctrlr->bar4_resource_id, ctrlr->bar4_resource);
 1538         }
 1539 
 1540         bus_release_resource(dev, SYS_RES_MEMORY,
 1541             ctrlr->resource_id, ctrlr->resource);
 1542 
 1543 nores:
 1544         mtx_destroy(&ctrlr->lock);
 1545 }
 1546 
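      /*
       * Request a normal shutdown by setting CC.SHN, then poll CSTS.SHST
       * until shutdown processing completes, the device disappears, or the
       * timeout expires (the controller's reported RTD3 entry latency, or
       * 5 seconds if it reports none).
       */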
 1547 void
 1548 nvme_ctrlr_shutdown(struct nvme_controller *ctrlr)
 1549 {
 1550         uint32_t        cc;
 1551         uint32_t        csts;
 1552         int             timeout;
 1553 
 1554         cc = nvme_mmio_read_4(ctrlr, cc);
 1555         cc &= ~(NVME_CC_REG_SHN_MASK << NVME_CC_REG_SHN_SHIFT);
 1556         cc |= NVME_SHN_NORMAL << NVME_CC_REG_SHN_SHIFT;
 1557         nvme_mmio_write_4(ctrlr, cc, cc);
 1558 
 1559         timeout = ticks + (ctrlr->cdata.rtd3e == 0 ? 5 * hz :
 1560             ((uint64_t)ctrlr->cdata.rtd3e * hz + 999999) / 1000000);
 1561         while (1) {
 1562                 csts = nvme_mmio_read_4(ctrlr, csts);
 1563                 if (csts == NVME_GONE)          /* Hot unplug. */
 1564                         break;
 1565                 if (NVME_CSTS_GET_SHST(csts) == NVME_SHST_COMPLETE)
 1566                         break;
 1567                 if (timeout - ticks < 0) {
 1568                         nvme_printf(ctrlr, "shutdown timeout\n");
 1569                         break;
 1570                 }
 1571                 pause("nvmeshut", 1);
 1572         }
 1573 }
 1574 
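      /*
       * Admin commands are always submitted on the dedicated admin qpair.
       */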
 1575 void
 1576 nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
 1577     struct nvme_request *req)
 1578 {
 1579 
 1580         nvme_qpair_submit_request(&ctrlr->adminq, req);
 1581 }
 1582 
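      /*
       * I/O commands are spread across the controller's I/O qpairs: QP()
       * maps the submitting CPU to one of the available queues.
       */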
 1583 void
 1584 nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
 1585     struct nvme_request *req)
 1586 {
 1587         struct nvme_qpair       *qpair;
 1588 
 1589         qpair = &ctrlr->ioq[QP(ctrlr, curcpu)];
 1590         nvme_qpair_submit_request(qpair, req);
 1591 }
 1592 
 1593 device_t
 1594 nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
 1595 {
 1596 
 1597         return (ctrlr->dev);
 1598 }
 1599 
 1600 const struct nvme_controller_data *
 1601 nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
 1602 {
 1603 
 1604         return (&ctrlr->cdata);
 1605 }
 1606 
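      /*
       * Quiesce the controller for a system suspend: claim the is_resetting
       * flag so the reset task cannot run concurrently, disable the host
       * memory buffer, delete and disable the I/O qpairs, and shut the
       * controller down so cached metadata is flushed.
       */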
 1607 int
 1608 nvme_ctrlr_suspend(struct nvme_controller *ctrlr)
 1609 {
 1610         int to = hz;
 1611 
 1612         /*
 1613          * Can't touch failed controllers, so it's already suspended.
 1614          */
 1615         if (ctrlr->is_failed)
 1616                 return (0);
 1617 
 1618         /*
 1619          * We don't want the reset taskqueue running, since it does similar
 1620          * things, so prevent it from running after we start. Wait for any reset
 1621          * that may have been started to complete. The reset process we follow
 1622          * will ensure that any new I/O will queue and be given to the hardware
 1623          * after we resume (though there should be none).
 1624          */
 1625         while (atomic_cmpset_32(&ctrlr->is_resetting, 0, 1) == 0 && to-- > 0)
 1626                 pause("nvmesusp", 1);
 1627         if (to <= 0) {
 1628                 nvme_printf(ctrlr,
 1629                     "Competing reset task didn't finish. Try again later.\n");
 1630                 return (EWOULDBLOCK);
 1631         }
 1632 
 1633         if (ctrlr->hmb_nchunks > 0)
 1634                 nvme_ctrlr_hmb_enable(ctrlr, false, false);
 1635 
 1636         /*
 1637          * Per Section 7.6.2 of NVMe spec 1.4, to properly suspend, we need to
 1638          * delete the hardware I/O queues, and then shutdown. This properly
 1639          * flushes any metadata the drive may have stored so it can survive
 1640          * having its power removed and prevents the unsafe shutdown count from
 1641          * incriminating. Once we delete the qpairs, we have to disable them
 1642          * incrementing. Once we delete the qpairs, we have to disable them
 1643          * before shutting down. The delay mirrors the paranoia delay in
 1644          * nvme_ctrlr_hw_reset and is repeated here (though we should have no
 1645          * pending I/O that the delay needs to cope with).
 1646         nvme_ctrlr_delete_qpairs(ctrlr);
 1647         nvme_ctrlr_disable_qpairs(ctrlr);
 1648         pause("nvmesusp", hz / 10);
 1649         nvme_ctrlr_shutdown(ctrlr);
 1650 
 1651         return (0);
 1652 }
 1653 
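      /*
       * Resume counterpart to nvme_ctrlr_suspend(): reset the hardware twice
       * (as attach does), restart the controller and clear is_resetting.  If
       * the reset fails, the controller is failed but the resume itself still
       * reports success.
       */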
 1654 int
 1655 nvme_ctrlr_resume(struct nvme_controller *ctrlr)
 1656 {
 1657 
 1658         /*
 1659          * Can't touch failed controllers, so nothing to do to resume.
 1660          */
 1661         if (ctrlr->is_failed)
 1662                 return (0);
 1663 
 1664         /*
 1665          * Have to reset the hardware twice, just like we do on attach. See
 1666          * nvme_attach() for why.
 1667          */
 1668         if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 1669                 goto fail;
 1670         if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 1671                 goto fail;
 1672 
 1673         /*
 1674          * Now that we've reset the hardware, we can restart the controller. Any
 1675          * I/O that was pending is requeued. Any admin commands are aborted with
 1676          * an error. Once we've restarted, take the controller out of reset.
 1677          */
 1678         nvme_ctrlr_start(ctrlr, true);
 1679         (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 1680 
 1681         return (0);
 1682 fail:
 1683         /*
 1684          * Since we can't bring the controller out of reset, announce and fail
 1685          * the controller. However, we have to return success for the resume
 1686          * itself, due to questionable APIs.
 1687          */
 1688         nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
 1689         nvme_ctrlr_fail(ctrlr);
 1690         (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 1691         return (0);
 1692 }
