FreeBSD/Linux Kernel Cross Reference
sys/dev/nvme/nvme_ctrlr.c

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (C) 2012-2016 Intel Corporation
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  */
   28 
   29 #include <sys/cdefs.h>
   30 __FBSDID("$FreeBSD$");
   31 
   32 #include "opt_cam.h"
   33 #include "opt_nvme.h"
   34 
   35 #include <sys/param.h>
   36 #include <sys/systm.h>
   37 #include <sys/buf.h>
   38 #include <sys/bus.h>
   39 #include <sys/conf.h>
   40 #include <sys/ioccom.h>
   41 #include <sys/proc.h>
   42 #include <sys/smp.h>
   43 #include <sys/uio.h>
   44 #include <sys/sbuf.h>
   45 #include <sys/endian.h>
   46 #include <machine/stdarg.h>
   47 #include <vm/vm.h>
   48 
   49 #include "nvme_private.h"
   50 
   51 #define B4_CHK_RDY_DELAY_MS     2300            /* work around controller bug */
   52 
   53 static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
   54                                                 struct nvme_async_event_request *aer);
   55 
   56 static void
   57 nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags)
   58 {
   59         bus_barrier(ctrlr->resource, 0, rman_get_size(ctrlr->resource), flags);
   60 }
   61 
   62 static void
   63 nvme_ctrlr_devctl_log(struct nvme_controller *ctrlr, const char *type, const char *msg, ...)
   64 {
   65         struct sbuf sb;
   66         va_list ap;
   67         int error;
   68 
   69         if (sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND | SBUF_NOWAIT) == NULL)
   70                 return;
   71         sbuf_printf(&sb, "%s: ", device_get_nameunit(ctrlr->dev));
   72         va_start(ap, msg);
   73         sbuf_vprintf(&sb, msg, ap);
   74         va_end(ap);
   75         error = sbuf_finish(&sb);
   76         if (error == 0)
   77                 printf("%s\n", sbuf_data(&sb));
   78 
   79         sbuf_clear(&sb);
   80         sbuf_printf(&sb, "name=\"%s\" reason=\"", device_get_nameunit(ctrlr->dev));
   81         va_start(ap, msg);
   82         sbuf_vprintf(&sb, msg, ap);
   83         va_end(ap);
   84         sbuf_printf(&sb, "\"");
   85         error = sbuf_finish(&sb);
   86         if (error == 0)
   87                 devctl_notify("nvme", "controller", type, sbuf_data(&sb));
   88         sbuf_delete(&sb);
   89 }
   90 
   91 static int
   92 nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
   93 {
   94         struct nvme_qpair       *qpair;
   95         uint32_t                num_entries;
   96         int                     error;
   97 
   98         qpair = &ctrlr->adminq;
   99         qpair->id = 0;
  100         qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
  101         qpair->domain = ctrlr->domain;
  102 
  103         num_entries = NVME_ADMIN_ENTRIES;
  104         TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries);
   105         /*
   106          * If admin_entries was overridden to an invalid value, revert it
   107          *  to our default value.
   108          */
  109         if (num_entries < NVME_MIN_ADMIN_ENTRIES ||
  110             num_entries > NVME_MAX_ADMIN_ENTRIES) {
  111                 nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d "
  112                     "specified\n", num_entries);
  113                 num_entries = NVME_ADMIN_ENTRIES;
  114         }
  115 
  116         /*
  117          * The admin queue's max xfer size is treated differently than the
  118          *  max I/O xfer size.  16KB is sufficient here - maybe even less?
  119          */
  120         error = nvme_qpair_construct(qpair, num_entries, NVME_ADMIN_TRACKERS,
  121              ctrlr);
  122         return (error);
  123 }
  124 
  125 #define QP(ctrlr, c)    ((c) * (ctrlr)->num_io_queues / mp_ncpus)
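       /*
        * Worked example (illustrative, assumed values): with mp_ncpus == 16 and
        * num_io_queues == 4, QP() maps CPUs 0-3 to queue 0, CPUs 4-7 to queue 1,
        * CPUs 8-11 to queue 2 and CPUs 12-15 to queue 3, i.e. each I/O queue
        * serves a contiguous block of mp_ncpus / num_io_queues CPUs.
        */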
  126 
  127 static int
  128 nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
  129 {
  130         struct nvme_qpair       *qpair;
  131         uint32_t                cap_lo;
  132         uint16_t                mqes;
  133         int                     c, error, i, n;
  134         int                     num_entries, num_trackers, max_entries;
  135 
   136         /*
   137          * The NVMe spec sets a hard limit of 64K max entries, but devices may
   138          * specify a smaller limit, so we need to check the MQES field in the
   139          * capabilities register.  We also have to cap the number of entries to
   140          * what the current stride allows for in BAR 0/1; otherwise the remaining
   141          * entries are inaccessible.  MQES should reflect this, and this is just
   142          * a fail-safe.
   143          */
  144         max_entries =
  145             (rman_get_size(ctrlr->resource) - nvme_mmio_offsetof(doorbell[0])) /
  146             (1 << (ctrlr->dstrd + 1));
  147         num_entries = NVME_IO_ENTRIES;
  148         TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);
  149         cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
  150         mqes = NVME_CAP_LO_MQES(cap_lo);
  151         num_entries = min(num_entries, mqes + 1);
  152         num_entries = min(num_entries, max_entries);
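               /*
                * Example (illustrative): MQES is zero-based, so a controller
                * reporting MQES == 1023 supports up to 1024 entries per queue.
                * NVME_IO_ENTRIES (or the hw.nvme.io_entries override) is clamped
                * both to that limit and to what fits in the doorbell window
                * computed above.
                */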
  153 
  154         num_trackers = NVME_IO_TRACKERS;
  155         TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);
  156 
  157         num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
  158         num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
  159         /*
  160          * No need to have more trackers than entries in the submit queue.  Note
  161          * also that for a queue size of N, we can only have (N-1) commands
  162          * outstanding, hence the "-1" here.
  163          */
  164         num_trackers = min(num_trackers, (num_entries-1));
  165 
   166         /*
   167          * Our best estimate for the maximum number of I/Os that we should
   168          * normally have in flight at one time.  This should be viewed as a hint,
   169          * not a hard limit, and it will need to be revisited when the upper
   170          * layers of the storage system grow multi-queue support.
   171          */
  172         ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4;
  173 
  174         ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
  175             M_NVME, M_ZERO | M_WAITOK);
  176 
  177         for (i = c = n = 0; i < ctrlr->num_io_queues; i++, c += n) {
  178                 qpair = &ctrlr->ioq[i];
  179 
  180                 /*
  181                  * Admin queue has ID=0. IO queues start at ID=1 -
  182                  *  hence the 'i+1' here.
  183                  */
  184                 qpair->id = i + 1;
  185                 if (ctrlr->num_io_queues > 1) {
  186                         /* Find number of CPUs served by this queue. */
  187                         for (n = 1; QP(ctrlr, c + n) == i; n++)
  188                                 ;
  189                         /* Shuffle multiple NVMe devices between CPUs. */
  190                         qpair->cpu = c + (device_get_unit(ctrlr->dev)+n/2) % n;
  191                         qpair->domain = pcpu_find(qpair->cpu)->pc_domain;
  192                 } else {
  193                         qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1;
  194                         qpair->domain = ctrlr->domain;
  195                 }
  196 
  197                 /*
  198                  * For I/O queues, use the controller-wide max_xfer_size
  199                  *  calculated in nvme_attach().
  200                  */
  201                 error = nvme_qpair_construct(qpair, num_entries, num_trackers,
  202                     ctrlr);
  203                 if (error)
  204                         return (error);
  205 
  206                 /*
  207                  * Do not bother binding interrupts if we only have one I/O
  208                  *  interrupt thread for this controller.
  209                  */
  210                 if (ctrlr->num_io_queues > 1)
  211                         bus_bind_intr(ctrlr->dev, qpair->res, qpair->cpu);
  212         }
  213 
  214         return (0);
  215 }
  216 
  217 static void
  218 nvme_ctrlr_fail(struct nvme_controller *ctrlr)
  219 {
  220         int i;
  221 
  222         ctrlr->is_failed = true;
  223         nvme_admin_qpair_disable(&ctrlr->adminq);
  224         nvme_qpair_fail(&ctrlr->adminq);
  225         if (ctrlr->ioq != NULL) {
  226                 for (i = 0; i < ctrlr->num_io_queues; i++) {
  227                         nvme_io_qpair_disable(&ctrlr->ioq[i]);
  228                         nvme_qpair_fail(&ctrlr->ioq[i]);
  229                 }
  230         }
  231         nvme_notify_fail_consumers(ctrlr);
  232 }
  233 
  234 void
  235 nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
  236     struct nvme_request *req)
  237 {
  238 
  239         mtx_lock(&ctrlr->lock);
  240         STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
  241         mtx_unlock(&ctrlr->lock);
  242         if (!ctrlr->is_dying)
  243                 taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
  244 }
  245 
  246 static void
  247 nvme_ctrlr_fail_req_task(void *arg, int pending)
  248 {
  249         struct nvme_controller  *ctrlr = arg;
  250         struct nvme_request     *req;
  251 
  252         mtx_lock(&ctrlr->lock);
  253         while ((req = STAILQ_FIRST(&ctrlr->fail_req)) != NULL) {
  254                 STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq);
  255                 mtx_unlock(&ctrlr->lock);
  256                 nvme_qpair_manual_complete_request(req->qpair, req,
  257                     NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
  258                 mtx_lock(&ctrlr->lock);
  259         }
  260         mtx_unlock(&ctrlr->lock);
  261 }
  262 
  263 /*
  264  * Wait for RDY to change.
  265  *
   266  * Starts by sleeping for 1us and geometrically increases the delay the
   267  * longer we wait, capped at 1ms.
  268  */
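       /*
        * For example (illustrative), the resulting poll intervals are roughly
        * 1us, 1.5us, 2.25us, 3.4us, ... until the 1ms cap is reached, after
        * which CSTS is polled every 1ms until the timeout expires.
        */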
  269 static int
  270 nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val)
  271 {
  272         int timeout = ticks + MSEC_2_TICKS(ctrlr->ready_timeout_in_ms);
  273         sbintime_t delta_t = SBT_1US;
  274         uint32_t csts;
  275 
  276         while (1) {
  277                 csts = nvme_mmio_read_4(ctrlr, csts);
  278                 if (csts == NVME_GONE)          /* Hot unplug. */
  279                         return (ENXIO);
  280                 if (((csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK)
  281                     == desired_val)
  282                         break;
  283                 if (timeout - ticks < 0) {
  284                         nvme_printf(ctrlr, "controller ready did not become %d "
  285                             "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms);
  286                         return (ENXIO);
  287                 }
  288 
  289                 pause_sbt("nvmerdy", delta_t, 0, C_PREL(1));
  290                 delta_t = min(SBT_1MS, delta_t * 3 / 2);
  291         }
  292 
  293         return (0);
  294 }
  295 
  296 static int
  297 nvme_ctrlr_disable(struct nvme_controller *ctrlr)
  298 {
  299         uint32_t cc;
  300         uint32_t csts;
  301         uint8_t  en, rdy;
  302         int err;
  303 
  304         cc = nvme_mmio_read_4(ctrlr, cc);
  305         csts = nvme_mmio_read_4(ctrlr, csts);
  306 
  307         en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
  308         rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
  309 
   310         /*
   311          * Per section 3.1.5 of the NVMe 1.3 spec, transitioning CC.EN from 0 to 1
   312          * when CSTS.RDY is 1, or from 1 to 0 when CSTS.RDY is 0, "has undefined
   313          * results".  So wait for CSTS.RDY to match the current CC.EN value before
   314          * changing EN.  Short circuit if we're already disabled.
   315          */
  316         if (en == 0) {
  317                 /* Wait for RDY == 0 or timeout & fail */
  318                 if (rdy == 0)
  319                         return (0);
  320                 return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
  321         }
  322         if (rdy == 0) {
   323                 /* EN == 1, wait for RDY == 1 or timeout & fail */
  324                 err = nvme_ctrlr_wait_for_ready(ctrlr, 1);
  325                 if (err != 0)
  326                         return (err);
  327         }
  328 
  329         cc &= ~NVME_CC_REG_EN_MASK;
  330         nvme_mmio_write_4(ctrlr, cc, cc);
  331 
  332         /*
  333          * A few drives have firmware bugs that freeze the drive if we access
  334          * the mmio too soon after we disable.
  335          */
  336         if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY)
  337                 pause("nvmeR", MSEC_2_TICKS(B4_CHK_RDY_DELAY_MS));
  338         return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
  339 }
  340 
  341 static int
  342 nvme_ctrlr_enable(struct nvme_controller *ctrlr)
  343 {
  344         uint32_t        cc;
  345         uint32_t        csts;
  346         uint32_t        aqa;
  347         uint32_t        qsize;
  348         uint8_t         en, rdy;
  349         int             err;
  350 
  351         cc = nvme_mmio_read_4(ctrlr, cc);
  352         csts = nvme_mmio_read_4(ctrlr, csts);
  353 
  354         en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
  355         rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
  356 
  357         /*
  358          * See note in nvme_ctrlr_disable. Short circuit if we're already enabled.
  359          */
  360         if (en == 1) {
  361                 if (rdy == 1)
  362                         return (0);
  363                 return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
  364         }
  365 
   366         /* EN == 0 already; wait for RDY == 0 or timeout & fail */
  367         err = nvme_ctrlr_wait_for_ready(ctrlr, 0);
  368         if (err != 0)
  369                 return (err);
  370 
  371         nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
  372         nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
  373 
  374         /* acqs and asqs are 0-based. */
  375         qsize = ctrlr->adminq.num_entries - 1;
  376 
  377         aqa = 0;
  378         aqa = (qsize & NVME_AQA_REG_ACQS_MASK) << NVME_AQA_REG_ACQS_SHIFT;
  379         aqa |= (qsize & NVME_AQA_REG_ASQS_MASK) << NVME_AQA_REG_ASQS_SHIFT;
  380         nvme_mmio_write_4(ctrlr, aqa, aqa);
  381 
  382         /* Initialization values for CC */
  383         cc = 0;
  384         cc |= 1 << NVME_CC_REG_EN_SHIFT;
  385         cc |= 0 << NVME_CC_REG_CSS_SHIFT;
  386         cc |= 0 << NVME_CC_REG_AMS_SHIFT;
  387         cc |= 0 << NVME_CC_REG_SHN_SHIFT;
  388         cc |= 6 << NVME_CC_REG_IOSQES_SHIFT; /* SQ entry size == 64 == 2^6 */
  389         cc |= 4 << NVME_CC_REG_IOCQES_SHIFT; /* CQ entry size == 16 == 2^4 */
  390 
   391         /*
   392          * Use the Memory Page Size selected during device initialization.  Note
   393          * that the value stored in mps is suitable to use here without adjusting
   394          * by NVME_MPS_SHIFT.
   395          */
  396         cc |= ctrlr->mps << NVME_CC_REG_MPS_SHIFT;
  397 
  398         nvme_ctrlr_barrier(ctrlr, BUS_SPACE_BARRIER_WRITE);
  399         nvme_mmio_write_4(ctrlr, cc, cc);
  400 
  401         return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
  402 }
  403 
  404 static void
  405 nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
  406 {
  407         int i;
  408 
  409         nvme_admin_qpair_disable(&ctrlr->adminq);
  410         /*
  411          * I/O queues are not allocated before the initial HW
  412          *  reset, so do not try to disable them.  Use is_initialized
  413          *  to determine if this is the initial HW reset.
  414          */
  415         if (ctrlr->is_initialized) {
  416                 for (i = 0; i < ctrlr->num_io_queues; i++)
  417                         nvme_io_qpair_disable(&ctrlr->ioq[i]);
  418         }
  419 }
  420 
  421 static int
  422 nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
  423 {
  424         int err;
  425 
  426         TSENTER();
  427 
  428         nvme_ctrlr_disable_qpairs(ctrlr);
  429 
  430         err = nvme_ctrlr_disable(ctrlr);
  431         if (err != 0)
  432                 return err;
  433 
  434         err = nvme_ctrlr_enable(ctrlr);
  435         TSEXIT();
  436         return (err);
  437 }
  438 
  439 void
  440 nvme_ctrlr_reset(struct nvme_controller *ctrlr)
  441 {
  442         int cmpset;
  443 
  444         cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1);
  445 
  446         if (cmpset == 0 || ctrlr->is_failed)
  447                 /*
  448                  * Controller is already resetting or has failed.  Return
  449                  *  immediately since there is no need to kick off another
  450                  *  reset in these cases.
  451                  */
  452                 return;
  453 
  454         if (!ctrlr->is_dying)
  455                 taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
  456 }
  457 
  458 static int
  459 nvme_ctrlr_identify(struct nvme_controller *ctrlr)
  460 {
  461         struct nvme_completion_poll_status      status;
  462 
  463         status.done = 0;
  464         nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
  465             nvme_completion_poll_cb, &status);
  466         nvme_completion_poll(&status);
  467         if (nvme_completion_is_error(&status.cpl)) {
  468                 nvme_printf(ctrlr, "nvme_identify_controller failed!\n");
  469                 return (ENXIO);
  470         }
  471 
  472         /* Convert data to host endian */
  473         nvme_controller_data_swapbytes(&ctrlr->cdata);
  474 
  475         /*
  476          * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
  477          *  controller supports.
  478          */
  479         if (ctrlr->cdata.mdts > 0)
  480                 ctrlr->max_xfer_size = min(ctrlr->max_xfer_size,
  481                     1 << (ctrlr->cdata.mdts + NVME_MPS_SHIFT +
  482                         NVME_CAP_HI_MPSMIN(ctrlr->cap_hi)));
  483 
  484         return (0);
  485 }
  486 
  487 static int
  488 nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
  489 {
  490         struct nvme_completion_poll_status      status;
  491         int                                     cq_allocated, sq_allocated;
  492 
  493         status.done = 0;
  494         nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
  495             nvme_completion_poll_cb, &status);
  496         nvme_completion_poll(&status);
  497         if (nvme_completion_is_error(&status.cpl)) {
  498                 nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n");
  499                 return (ENXIO);
  500         }
  501 
  502         /*
  503          * Data in cdw0 is 0-based.
  504          * Lower 16-bits indicate number of submission queues allocated.
  505          * Upper 16-bits indicate number of completion queues allocated.
  506          */
  507         sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1;
  508         cq_allocated = (status.cpl.cdw0 >> 16) + 1;
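               /*
                * Example (illustrative): a completion with cdw0 == 0x001f001f
                * means the controller allocated 32 submission queues and 32
                * completion queues.
                */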
  509 
  510         /*
  511          * Controller may allocate more queues than we requested,
  512          *  so use the minimum of the number requested and what was
  513          *  actually allocated.
  514          */
  515         ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated);
  516         ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated);
  517         if (ctrlr->num_io_queues > vm_ndomains)
  518                 ctrlr->num_io_queues -= ctrlr->num_io_queues % vm_ndomains;
  519 
  520         return (0);
  521 }
  522 
  523 static int
  524 nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
  525 {
  526         struct nvme_completion_poll_status      status;
  527         struct nvme_qpair                       *qpair;
  528         int                                     i;
  529 
  530         for (i = 0; i < ctrlr->num_io_queues; i++) {
  531                 qpair = &ctrlr->ioq[i];
  532 
  533                 status.done = 0;
  534                 nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair,
  535                     nvme_completion_poll_cb, &status);
  536                 nvme_completion_poll(&status);
  537                 if (nvme_completion_is_error(&status.cpl)) {
  538                         nvme_printf(ctrlr, "nvme_create_io_cq failed!\n");
  539                         return (ENXIO);
  540                 }
  541 
  542                 status.done = 0;
  543                 nvme_ctrlr_cmd_create_io_sq(ctrlr, qpair,
  544                     nvme_completion_poll_cb, &status);
  545                 nvme_completion_poll(&status);
  546                 if (nvme_completion_is_error(&status.cpl)) {
  547                         nvme_printf(ctrlr, "nvme_create_io_sq failed!\n");
  548                         return (ENXIO);
  549                 }
  550         }
  551 
  552         return (0);
  553 }
  554 
  555 static int
  556 nvme_ctrlr_delete_qpairs(struct nvme_controller *ctrlr)
  557 {
  558         struct nvme_completion_poll_status      status;
  559         struct nvme_qpair                       *qpair;
  560 
  561         for (int i = 0; i < ctrlr->num_io_queues; i++) {
  562                 qpair = &ctrlr->ioq[i];
  563 
  564                 status.done = 0;
  565                 nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair,
  566                     nvme_completion_poll_cb, &status);
  567                 nvme_completion_poll(&status);
  568                 if (nvme_completion_is_error(&status.cpl)) {
  569                         nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n");
  570                         return (ENXIO);
  571                 }
  572 
  573                 status.done = 0;
  574                 nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair,
  575                     nvme_completion_poll_cb, &status);
  576                 nvme_completion_poll(&status);
  577                 if (nvme_completion_is_error(&status.cpl)) {
  578                         nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n");
  579                         return (ENXIO);
  580                 }
  581         }
  582 
  583         return (0);
  584 }
  585 
  586 static int
  587 nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
  588 {
  589         struct nvme_namespace   *ns;
  590         uint32_t                i;
  591 
  592         for (i = 0; i < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); i++) {
  593                 ns = &ctrlr->ns[i];
  594                 nvme_ns_construct(ns, i+1, ctrlr);
  595         }
  596 
  597         return (0);
  598 }
  599 
  600 static bool
  601 is_log_page_id_valid(uint8_t page_id)
  602 {
  603 
  604         switch (page_id) {
  605         case NVME_LOG_ERROR:
  606         case NVME_LOG_HEALTH_INFORMATION:
  607         case NVME_LOG_FIRMWARE_SLOT:
  608         case NVME_LOG_CHANGED_NAMESPACE:
  609         case NVME_LOG_COMMAND_EFFECT:
  610         case NVME_LOG_RES_NOTIFICATION:
  611         case NVME_LOG_SANITIZE_STATUS:
  612                 return (true);
  613         }
  614 
  615         return (false);
  616 }
  617 
  618 static uint32_t
  619 nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id)
  620 {
  621         uint32_t        log_page_size;
  622 
  623         switch (page_id) {
  624         case NVME_LOG_ERROR:
  625                 log_page_size = min(
  626                     sizeof(struct nvme_error_information_entry) *
  627                     (ctrlr->cdata.elpe + 1), NVME_MAX_AER_LOG_SIZE);
  628                 break;
  629         case NVME_LOG_HEALTH_INFORMATION:
  630                 log_page_size = sizeof(struct nvme_health_information_page);
  631                 break;
  632         case NVME_LOG_FIRMWARE_SLOT:
  633                 log_page_size = sizeof(struct nvme_firmware_page);
  634                 break;
  635         case NVME_LOG_CHANGED_NAMESPACE:
  636                 log_page_size = sizeof(struct nvme_ns_list);
  637                 break;
  638         case NVME_LOG_COMMAND_EFFECT:
  639                 log_page_size = sizeof(struct nvme_command_effects_page);
  640                 break;
  641         case NVME_LOG_RES_NOTIFICATION:
  642                 log_page_size = sizeof(struct nvme_res_notification_page);
  643                 break;
  644         case NVME_LOG_SANITIZE_STATUS:
  645                 log_page_size = sizeof(struct nvme_sanitize_status_page);
  646                 break;
  647         default:
  648                 log_page_size = 0;
  649                 break;
  650         }
  651 
  652         return (log_page_size);
  653 }
  654 
  655 static void
  656 nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
  657     uint8_t state)
  658 {
  659 
  660         if (state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE)
  661                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  662                     "available spare space below threshold");
  663 
  664         if (state & NVME_CRIT_WARN_ST_TEMPERATURE)
  665                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  666                     "temperature above threshold");
  667 
  668         if (state & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY)
  669                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  670                     "device reliability degraded");
  671 
  672         if (state & NVME_CRIT_WARN_ST_READ_ONLY)
  673                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  674                     "media placed in read only mode");
  675 
  676         if (state & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP)
  677                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  678                     "volatile memory backup device failed");
  679 
  680         if (state & NVME_CRIT_WARN_ST_RESERVED_MASK)
  681                 nvme_ctrlr_devctl_log(ctrlr, "critical",
  682                     "unknown critical warning(s): state = 0x%02x", state);
  683 }
  684 
  685 static void
  686 nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
  687 {
  688         struct nvme_async_event_request         *aer = arg;
  689         struct nvme_health_information_page     *health_info;
  690         struct nvme_ns_list                     *nsl;
  691         struct nvme_error_information_entry     *err;
  692         int i;
  693 
  694         /*
  695          * If the log page fetch for some reason completed with an error,
  696          *  don't pass log page data to the consumers.  In practice, this case
  697          *  should never happen.
  698          */
  699         if (nvme_completion_is_error(cpl))
  700                 nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
  701                     aer->log_page_id, NULL, 0);
  702         else {
  703                 /* Convert data to host endian */
  704                 switch (aer->log_page_id) {
  705                 case NVME_LOG_ERROR:
  706                         err = (struct nvme_error_information_entry *)aer->log_page_buffer;
  707                         for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
  708                                 nvme_error_information_entry_swapbytes(err++);
  709                         break;
  710                 case NVME_LOG_HEALTH_INFORMATION:
  711                         nvme_health_information_page_swapbytes(
  712                             (struct nvme_health_information_page *)aer->log_page_buffer);
  713                         break;
  714                 case NVME_LOG_FIRMWARE_SLOT:
  715                         nvme_firmware_page_swapbytes(
  716                             (struct nvme_firmware_page *)aer->log_page_buffer);
  717                         break;
  718                 case NVME_LOG_CHANGED_NAMESPACE:
  719                         nvme_ns_list_swapbytes(
  720                             (struct nvme_ns_list *)aer->log_page_buffer);
  721                         break;
  722                 case NVME_LOG_COMMAND_EFFECT:
  723                         nvme_command_effects_page_swapbytes(
  724                             (struct nvme_command_effects_page *)aer->log_page_buffer);
  725                         break;
  726                 case NVME_LOG_RES_NOTIFICATION:
  727                         nvme_res_notification_page_swapbytes(
  728                             (struct nvme_res_notification_page *)aer->log_page_buffer);
  729                         break;
  730                 case NVME_LOG_SANITIZE_STATUS:
  731                         nvme_sanitize_status_page_swapbytes(
  732                             (struct nvme_sanitize_status_page *)aer->log_page_buffer);
  733                         break;
  734                 case INTEL_LOG_TEMP_STATS:
  735                         intel_log_temp_stats_swapbytes(
  736                             (struct intel_log_temp_stats *)aer->log_page_buffer);
  737                         break;
  738                 default:
  739                         break;
  740                 }
  741 
  742                 if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
  743                         health_info = (struct nvme_health_information_page *)
  744                             aer->log_page_buffer;
  745                         nvme_ctrlr_log_critical_warnings(aer->ctrlr,
  746                             health_info->critical_warning);
  747                         /*
  748                          * Critical warnings reported through the
  749                          *  SMART/health log page are persistent, so
  750                          *  clear the associated bits in the async event
  751                          *  config so that we do not receive repeated
  752                          *  notifications for the same event.
  753                          */
  754                         aer->ctrlr->async_event_config &=
  755                             ~health_info->critical_warning;
  756                         nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
  757                             aer->ctrlr->async_event_config, NULL, NULL);
  758                 } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
  759                     !nvme_use_nvd) {
  760                         nsl = (struct nvme_ns_list *)aer->log_page_buffer;
  761                         for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
  762                                 if (nsl->ns[i] > NVME_MAX_NAMESPACES)
  763                                         break;
  764                                 nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
  765                         }
  766                 }
  767 
  768                 /*
  769                  * Pass the cpl data from the original async event completion,
  770                  *  not the log page fetch.
  771                  */
  772                 nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
  773                     aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
  774         }
  775 
  776         /*
  777          * Repost another asynchronous event request to replace the one
  778          *  that just completed.
  779          */
  780         nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
  781 }
  782 
  783 static void
  784 nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
  785 {
  786         struct nvme_async_event_request *aer = arg;
  787 
  788         if (nvme_completion_is_error(cpl)) {
  789                 /*
  790                  *  Do not retry failed async event requests.  This avoids
  791                  *  infinite loops where a new async event request is submitted
  792                  *  to replace the one just failed, only to fail again and
  793                  *  perpetuate the loop.
  794                  */
  795                 return;
  796         }
  797 
  798         /* Associated log page is in bits 23:16 of completion entry dw0. */
  799         aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
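               /*
                * Example (illustrative): cdw0 == 0x00020101 decodes as event
                * type 0x1 (bits 2:0), event info 0x01 (bits 15:8) and log page
                * 0x02 (bits 23:16), which the code below would then fetch via
                * Get Log Page.
                */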
  800 
  801         nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
  802             " page 0x%02x)\n", (cpl->cdw0 & 0x07), (cpl->cdw0 & 0xFF00) >> 8,
  803             aer->log_page_id);
  804 
  805         if (is_log_page_id_valid(aer->log_page_id)) {
  806                 aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
  807                     aer->log_page_id);
  808                 memcpy(&aer->cpl, cpl, sizeof(*cpl));
  809                 nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
  810                     NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
  811                     aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
  812                     aer);
  813                 /* Wait to notify consumers until after log page is fetched. */
  814         } else {
  815                 nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
  816                     NULL, 0);
  817 
  818                 /*
  819                  * Repost another asynchronous event request to replace the one
  820                  *  that just completed.
  821                  */
  822                 nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
  823         }
  824 }
  825 
  826 static void
  827 nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
  828     struct nvme_async_event_request *aer)
  829 {
  830         struct nvme_request *req;
  831 
  832         aer->ctrlr = ctrlr;
  833         req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer);
  834         aer->req = req;
  835 
  836         /*
  837          * Disable timeout here, since asynchronous event requests should by
  838          *  nature never be timed out.
  839          */
  840         req->timeout = false;
  841         req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
  842         nvme_ctrlr_submit_admin_request(ctrlr, req);
  843 }
  844 
  845 static void
  846 nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
  847 {
  848         struct nvme_completion_poll_status      status;
  849         struct nvme_async_event_request         *aer;
  850         uint32_t                                i;
  851 
  852         ctrlr->async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE |
  853             NVME_CRIT_WARN_ST_DEVICE_RELIABILITY |
  854             NVME_CRIT_WARN_ST_READ_ONLY |
  855             NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP;
  856         if (ctrlr->cdata.ver >= NVME_REV(1, 2))
  857                 ctrlr->async_event_config |= NVME_ASYNC_EVENT_NS_ATTRIBUTE |
  858                     NVME_ASYNC_EVENT_FW_ACTIVATE;
  859 
  860         status.done = 0;
  861         nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD,
  862             0, NULL, 0, nvme_completion_poll_cb, &status);
  863         nvme_completion_poll(&status);
  864         if (nvme_completion_is_error(&status.cpl) ||
  865             (status.cpl.cdw0 & 0xFFFF) == 0xFFFF ||
  866             (status.cpl.cdw0 & 0xFFFF) == 0x0000) {
  867                 nvme_printf(ctrlr, "temperature threshold not supported\n");
  868         } else
  869                 ctrlr->async_event_config |= NVME_CRIT_WARN_ST_TEMPERATURE;
  870 
  871         nvme_ctrlr_cmd_set_async_event_config(ctrlr,
  872             ctrlr->async_event_config, NULL, NULL);
  873 
  874         /* aerl is a zero-based value, so we need to add 1 here. */
  875         ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));
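               /*
                * Example (illustrative): a controller reporting aerl == 3
                * supports 4 concurrent requests, so up to min(4,
                * NVME_MAX_ASYNC_EVENTS) requests are posted below.
                */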
  876 
  877         for (i = 0; i < ctrlr->num_aers; i++) {
  878                 aer = &ctrlr->aer[i];
  879                 nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
  880         }
  881 }
  882 
  883 static void
  884 nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
  885 {
  886 
  887         ctrlr->int_coal_time = 0;
  888         TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
  889             &ctrlr->int_coal_time);
  890 
  891         ctrlr->int_coal_threshold = 0;
  892         TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
  893             &ctrlr->int_coal_threshold);
  894 
  895         nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
  896             ctrlr->int_coal_threshold, NULL, NULL);
  897 }
  898 
  899 static void
  900 nvme_ctrlr_hmb_free(struct nvme_controller *ctrlr)
  901 {
  902         struct nvme_hmb_chunk *hmbc;
  903         int i;
  904 
  905         if (ctrlr->hmb_desc_paddr) {
  906                 bus_dmamap_unload(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map);
  907                 bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
  908                     ctrlr->hmb_desc_map);
  909                 ctrlr->hmb_desc_paddr = 0;
  910         }
  911         if (ctrlr->hmb_desc_tag) {
  912                 bus_dma_tag_destroy(ctrlr->hmb_desc_tag);
  913                 ctrlr->hmb_desc_tag = NULL;
  914         }
  915         for (i = 0; i < ctrlr->hmb_nchunks; i++) {
  916                 hmbc = &ctrlr->hmb_chunks[i];
  917                 bus_dmamap_unload(ctrlr->hmb_tag, hmbc->hmbc_map);
  918                 bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
  919                     hmbc->hmbc_map);
  920         }
  921         ctrlr->hmb_nchunks = 0;
  922         if (ctrlr->hmb_tag) {
  923                 bus_dma_tag_destroy(ctrlr->hmb_tag);
  924                 ctrlr->hmb_tag = NULL;
  925         }
  926         if (ctrlr->hmb_chunks) {
  927                 free(ctrlr->hmb_chunks, M_NVME);
  928                 ctrlr->hmb_chunks = NULL;
  929         }
  930 }
  931 
  932 static void
  933 nvme_ctrlr_hmb_alloc(struct nvme_controller *ctrlr)
  934 {
  935         struct nvme_hmb_chunk *hmbc;
  936         size_t pref, min, minc, size;
  937         int err, i;
  938         uint64_t max;
  939 
  940         /* Limit HMB to 5% of RAM size per device by default. */
  941         max = (uint64_t)physmem * PAGE_SIZE / 20;
  942         TUNABLE_UINT64_FETCH("hw.nvme.hmb_max", &max);
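               /*
                * Example (illustrative): on a machine with 16GiB of RAM the
                * default cap works out to roughly 819MiB per controller; the
                * hw.nvme.hmb_max tunable can raise or lower it.
                */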
  943 
   944         /*
   945          * Host Memory Buffer sizes in the Identify info are always expressed
   946          * in 4KB units.
   947          */
  948         min = (long long unsigned)ctrlr->cdata.hmmin * NVME_HMB_UNITS;
  949         if (max == 0 || max < min)
  950                 return;
  951         pref = MIN((long long unsigned)ctrlr->cdata.hmpre * NVME_HMB_UNITS, max);
  952         minc = MAX(ctrlr->cdata.hmminds * NVME_HMB_UNITS, ctrlr->page_size);
  953         if (min > 0 && ctrlr->cdata.hmmaxd > 0)
  954                 minc = MAX(minc, min / ctrlr->cdata.hmmaxd);
  955         ctrlr->hmb_chunk = pref;
  956 
  957 again:
  958         /*
  959          * However, the chunk sizes, number of chunks, and alignment of chunks
  960          * are all based on the current MPS (ctrlr->page_size).
  961          */
  962         ctrlr->hmb_chunk = roundup2(ctrlr->hmb_chunk, ctrlr->page_size);
  963         ctrlr->hmb_nchunks = howmany(pref, ctrlr->hmb_chunk);
  964         if (ctrlr->cdata.hmmaxd > 0 && ctrlr->hmb_nchunks > ctrlr->cdata.hmmaxd)
  965                 ctrlr->hmb_nchunks = ctrlr->cdata.hmmaxd;
  966         ctrlr->hmb_chunks = malloc(sizeof(struct nvme_hmb_chunk) *
  967             ctrlr->hmb_nchunks, M_NVME, M_WAITOK);
  968         err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
  969             ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
  970             ctrlr->hmb_chunk, 1, ctrlr->hmb_chunk, 0, NULL, NULL, &ctrlr->hmb_tag);
  971         if (err != 0) {
  972                 nvme_printf(ctrlr, "HMB tag create failed %d\n", err);
  973                 nvme_ctrlr_hmb_free(ctrlr);
  974                 return;
  975         }
  976 
  977         for (i = 0; i < ctrlr->hmb_nchunks; i++) {
  978                 hmbc = &ctrlr->hmb_chunks[i];
  979                 if (bus_dmamem_alloc(ctrlr->hmb_tag,
  980                     (void **)&hmbc->hmbc_vaddr, BUS_DMA_NOWAIT,
  981                     &hmbc->hmbc_map)) {
  982                         nvme_printf(ctrlr, "failed to alloc HMB\n");
  983                         break;
  984                 }
  985                 if (bus_dmamap_load(ctrlr->hmb_tag, hmbc->hmbc_map,
  986                     hmbc->hmbc_vaddr, ctrlr->hmb_chunk, nvme_single_map,
  987                     &hmbc->hmbc_paddr, BUS_DMA_NOWAIT) != 0) {
  988                         bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr,
  989                             hmbc->hmbc_map);
  990                         nvme_printf(ctrlr, "failed to load HMB\n");
  991                         break;
  992                 }
  993                 bus_dmamap_sync(ctrlr->hmb_tag, hmbc->hmbc_map,
  994                     BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
  995         }
  996 
  997         if (i < ctrlr->hmb_nchunks && i * ctrlr->hmb_chunk < min &&
  998             ctrlr->hmb_chunk / 2 >= minc) {
  999                 ctrlr->hmb_nchunks = i;
 1000                 nvme_ctrlr_hmb_free(ctrlr);
 1001                 ctrlr->hmb_chunk /= 2;
 1002                 goto again;
 1003         }
 1004         ctrlr->hmb_nchunks = i;
 1005         if (ctrlr->hmb_nchunks * ctrlr->hmb_chunk < min) {
 1006                 nvme_ctrlr_hmb_free(ctrlr);
 1007                 return;
 1008         }
 1009 
 1010         size = sizeof(struct nvme_hmb_desc) * ctrlr->hmb_nchunks;
 1011         err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
 1012             16, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
 1013             size, 1, size, 0, NULL, NULL, &ctrlr->hmb_desc_tag);
 1014         if (err != 0) {
 1015                 nvme_printf(ctrlr, "HMB desc tag create failed %d\n", err);
 1016                 nvme_ctrlr_hmb_free(ctrlr);
 1017                 return;
 1018         }
 1019         if (bus_dmamem_alloc(ctrlr->hmb_desc_tag,
 1020             (void **)&ctrlr->hmb_desc_vaddr, BUS_DMA_WAITOK,
 1021             &ctrlr->hmb_desc_map)) {
 1022                 nvme_printf(ctrlr, "failed to alloc HMB desc\n");
 1023                 nvme_ctrlr_hmb_free(ctrlr);
 1024                 return;
 1025         }
 1026         if (bus_dmamap_load(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
 1027             ctrlr->hmb_desc_vaddr, size, nvme_single_map,
 1028             &ctrlr->hmb_desc_paddr, BUS_DMA_NOWAIT) != 0) {
 1029                 bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr,
 1030                     ctrlr->hmb_desc_map);
 1031                 nvme_printf(ctrlr, "failed to load HMB desc\n");
 1032                 nvme_ctrlr_hmb_free(ctrlr);
 1033                 return;
 1034         }
 1035 
 1036         for (i = 0; i < ctrlr->hmb_nchunks; i++) {
 1037                 ctrlr->hmb_desc_vaddr[i].addr =
 1038                     htole64(ctrlr->hmb_chunks[i].hmbc_paddr);
 1039                 ctrlr->hmb_desc_vaddr[i].size = htole32(ctrlr->hmb_chunk / ctrlr->page_size);
 1040         }
 1041         bus_dmamap_sync(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map,
 1042             BUS_DMASYNC_PREWRITE);
 1043 
 1044         nvme_printf(ctrlr, "Allocated %lluMB host memory buffer\n",
 1045             (long long unsigned)ctrlr->hmb_nchunks * ctrlr->hmb_chunk
 1046             / 1024 / 1024);
 1047 }
 1048 
 1049 static void
 1050 nvme_ctrlr_hmb_enable(struct nvme_controller *ctrlr, bool enable, bool memret)
 1051 {
 1052         struct nvme_completion_poll_status      status;
 1053         uint32_t cdw11;
 1054 
 1055         cdw11 = 0;
 1056         if (enable)
 1057                 cdw11 |= 1;
 1058         if (memret)
 1059                 cdw11 |= 2;
 1060         status.done = 0;
 1061         nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_HOST_MEMORY_BUFFER, cdw11,
 1062             ctrlr->hmb_nchunks * ctrlr->hmb_chunk / ctrlr->page_size,
 1063             ctrlr->hmb_desc_paddr, ctrlr->hmb_desc_paddr >> 32,
 1064             ctrlr->hmb_nchunks, NULL, 0,
 1065             nvme_completion_poll_cb, &status);
 1066         nvme_completion_poll(&status);
 1067         if (nvme_completion_is_error(&status.cpl))
 1068                 nvme_printf(ctrlr, "nvme_ctrlr_hmb_enable failed!\n");
 1069 }
 1070 
 1071 static void
 1072 nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
 1073 {
 1074         struct nvme_controller *ctrlr = ctrlr_arg;
 1075         uint32_t old_num_io_queues;
 1076         int i;
 1077 
 1078         TSENTER();
 1079 
  1080         /*
  1081          * Only reset adminq here when we are restarting the
  1082          *  controller after a reset.  During initialization,
  1083          *  we have already submitted admin commands to get
  1084          *  the number of I/O queues supported, so we cannot
  1085          *  reset the adminq again here.
  1086          */
 1087         if (resetting) {
 1088                 nvme_qpair_reset(&ctrlr->adminq);
 1089                 nvme_admin_qpair_enable(&ctrlr->adminq);
 1090         }
 1091 
 1092         if (ctrlr->ioq != NULL) {
 1093                 for (i = 0; i < ctrlr->num_io_queues; i++)
 1094                         nvme_qpair_reset(&ctrlr->ioq[i]);
 1095         }
 1096 
  1097         /*
  1098          * If this was a reset due to an initialization command timeout, just
  1099          * return here and let the initialization code fail gracefully.
  1100          */
 1101         if (resetting && !ctrlr->is_initialized)
 1102                 return;
 1103 
 1104         if (resetting && nvme_ctrlr_identify(ctrlr) != 0) {
 1105                 nvme_ctrlr_fail(ctrlr);
 1106                 return;
 1107         }
 1108 
  1109         /*
  1110          * The number of qpairs is determined during controller initialization,
  1111          *  including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the
  1112          *  HW limit.  We call SET_FEATURES again here so that it gets called
  1113          *  after any reset for controllers that depend on the driver to
  1114          *  explicitly specify how many queues they will use.  This value should
  1115          *  never change between resets, so panic if somehow that does happen.
  1116          */
 1117         if (resetting) {
 1118                 old_num_io_queues = ctrlr->num_io_queues;
 1119                 if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
 1120                         nvme_ctrlr_fail(ctrlr);
 1121                         return;
 1122                 }
 1123 
 1124                 if (old_num_io_queues != ctrlr->num_io_queues) {
 1125                         panic("num_io_queues changed from %u to %u",
 1126                               old_num_io_queues, ctrlr->num_io_queues);
 1127                 }
 1128         }
 1129 
 1130         if (ctrlr->cdata.hmpre > 0 && ctrlr->hmb_nchunks == 0) {
 1131                 nvme_ctrlr_hmb_alloc(ctrlr);
 1132                 if (ctrlr->hmb_nchunks > 0)
 1133                         nvme_ctrlr_hmb_enable(ctrlr, true, false);
 1134         } else if (ctrlr->hmb_nchunks > 0)
 1135                 nvme_ctrlr_hmb_enable(ctrlr, true, true);
 1136 
 1137         if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
 1138                 nvme_ctrlr_fail(ctrlr);
 1139                 return;
 1140         }
 1141 
 1142         if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
 1143                 nvme_ctrlr_fail(ctrlr);
 1144                 return;
 1145         }
 1146 
 1147         nvme_ctrlr_configure_aer(ctrlr);
 1148         nvme_ctrlr_configure_int_coalescing(ctrlr);
 1149 
 1150         for (i = 0; i < ctrlr->num_io_queues; i++)
 1151                 nvme_io_qpair_enable(&ctrlr->ioq[i]);
 1152         TSEXIT();
 1153 }
 1154 
 1155 void
 1156 nvme_ctrlr_start_config_hook(void *arg)
 1157 {
 1158         struct nvme_controller *ctrlr = arg;
 1159 
 1160         TSENTER();
 1161 
 1162         if (nvme_ctrlr_hw_reset(ctrlr) != 0) {
 1163 fail:
 1164                 nvme_ctrlr_fail(ctrlr);
 1165                 config_intrhook_disestablish(&ctrlr->config_hook);
 1166                 return;
 1167         }
 1168 
 1169 #ifdef NVME_2X_RESET
  1170         /*
  1171          * Reset the controller twice to ensure we do a transition from cc.en==1
  1172          * to cc.en==0.  This is because we don't really know what state the
  1173          * controller was left in when the boot code handed off to the OS.  Linux
  1174          * doesn't do this, however, and when the controller is left in state
  1175          * cc.en == 0, no I/O can happen.
  1176          */
 1177         if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 1178                 goto fail;
 1179 #endif
 1180 
 1181         nvme_qpair_reset(&ctrlr->adminq);
 1182         nvme_admin_qpair_enable(&ctrlr->adminq);
 1183 
 1184         if (nvme_ctrlr_identify(ctrlr) == 0 &&
 1185             nvme_ctrlr_set_num_qpairs(ctrlr) == 0 &&
 1186             nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
 1187                 nvme_ctrlr_start(ctrlr, false);
 1188         else
 1189                 goto fail;
 1190 
 1191         nvme_sysctl_initialize_ctrlr(ctrlr);
 1192         config_intrhook_disestablish(&ctrlr->config_hook);
 1193 
 1194         ctrlr->is_initialized = 1;
 1195         nvme_notify_new_controller(ctrlr);
 1196         TSEXIT();
 1197 }
 1198 
 1199 static void
 1200 nvme_ctrlr_reset_task(void *arg, int pending)
 1201 {
 1202         struct nvme_controller  *ctrlr = arg;
 1203         int                     status;
 1204 
 1205         nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller");
 1206         status = nvme_ctrlr_hw_reset(ctrlr);
 1207         /*
 1208          * Use pause instead of DELAY, so that we yield to any nvme interrupt
 1209          *  handlers on this CPU that were blocked on a qpair lock. We want
 1210          *  all nvme interrupts completed before proceeding with restarting the
 1211          *  controller.
 1212          *
 1213          * XXX - any way to guarantee the interrupt handlers have quiesced?
 1214          */
 1215         pause("nvmereset", hz / 10);
 1216         if (status == 0)
 1217                 nvme_ctrlr_start(ctrlr, true);
 1218         else
 1219                 nvme_ctrlr_fail(ctrlr);
 1220 
 1221         atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 1222 }
 1223 
 1224 /*
 1225  * Poll all the queues enabled on the device for completion.
 1226  */
 1227 void
 1228 nvme_ctrlr_poll(struct nvme_controller *ctrlr)
 1229 {
 1230         int i;
 1231 
 1232         nvme_qpair_process_completions(&ctrlr->adminq);
 1233 
 1234         for (i = 0; i < ctrlr->num_io_queues; i++)
 1235                 if (ctrlr->ioq && ctrlr->ioq[i].cpl)
 1236                         nvme_qpair_process_completions(&ctrlr->ioq[i]);
 1237 }
 1238 
 1239 /*
 1240  * Poll the single-vector interrupt case: num_io_queues will be 1 and
 1241  * there's only a single vector. While we're polling, we mask further
 1242  * interrupts in the controller.
 1243  */
 1244 void
 1245 nvme_ctrlr_shared_handler(void *arg)
 1246 {
 1247         struct nvme_controller *ctrlr = arg;
 1248 
 1249         nvme_mmio_write_4(ctrlr, intms, 1);
 1250         nvme_ctrlr_poll(ctrlr);
 1251         nvme_mmio_write_4(ctrlr, intmc, 1);
 1252 }
 1253 
 1254 static void
 1255 nvme_pt_done(void *arg, const struct nvme_completion *cpl)
 1256 {
 1257         struct nvme_pt_command *pt = arg;
 1258         struct mtx *mtx = pt->driver_lock;
 1259         uint16_t status;
 1260 
 1261         bzero(&pt->cpl, sizeof(pt->cpl));
 1262         pt->cpl.cdw0 = cpl->cdw0;
 1263 
 1264         status = cpl->status;
 1265         status &= ~NVME_STATUS_P_MASK;
 1266         pt->cpl.status = status;
 1267 
 1268         mtx_lock(mtx);
 1269         pt->driver_lock = NULL;
 1270         wakeup(pt);
 1271         mtx_unlock(mtx);
 1272 }
 1273 
 1274 int
 1275 nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
 1276     struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer,
 1277     int is_admin_cmd)
 1278 {
 1279         struct nvme_request     *req;
 1280         struct mtx              *mtx;
 1281         struct buf              *buf = NULL;
 1282         int                     ret = 0;
 1283 
 1284         if (pt->len > 0) {
 1285                 if (pt->len > ctrlr->max_xfer_size) {
 1286                         nvme_printf(ctrlr, "pt->len (%d) "
 1287                             "exceeds max_xfer_size (%d)\n", pt->len,
 1288                             ctrlr->max_xfer_size);
 1289                         return EIO;
 1290                 }
 1291                 if (is_user_buffer) {
 1292                         /*
 1293                          * Ensure the user buffer is wired for the duration of
 1294                          *  this pass-through command.
 1295                          */
 1296                         PHOLD(curproc);
 1297                         buf = uma_zalloc(pbuf_zone, M_WAITOK);
 1298                         buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
 1299                         if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) {
 1300                                 ret = EFAULT;
 1301                                 goto err;
 1302                         }
 1303                         req = nvme_allocate_request_vaddr(buf->b_data, pt->len, 
 1304                             nvme_pt_done, pt);
 1305                 } else
 1306                         req = nvme_allocate_request_vaddr(pt->buf, pt->len,
 1307                             nvme_pt_done, pt);
 1308         } else
 1309                 req = nvme_allocate_request_null(nvme_pt_done, pt);
 1310 
 1311         /* Assume user space already converted to little-endian */
 1312         req->cmd.opc = pt->cmd.opc;
 1313         req->cmd.fuse = pt->cmd.fuse;
 1314         req->cmd.rsvd2 = pt->cmd.rsvd2;
 1315         req->cmd.rsvd3 = pt->cmd.rsvd3;
 1316         req->cmd.cdw10 = pt->cmd.cdw10;
 1317         req->cmd.cdw11 = pt->cmd.cdw11;
 1318         req->cmd.cdw12 = pt->cmd.cdw12;
 1319         req->cmd.cdw13 = pt->cmd.cdw13;
 1320         req->cmd.cdw14 = pt->cmd.cdw14;
 1321         req->cmd.cdw15 = pt->cmd.cdw15;
 1322 
 1323         req->cmd.nsid = htole32(nsid);
 1324 
 1325         mtx = mtx_pool_find(mtxpool_sleep, pt);
 1326         pt->driver_lock = mtx;
 1327 
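              /*
               * pt->driver_lock doubles as the completion flag: nvme_pt_done()
               * clears it and calls wakeup() under the same pool mutex, so the
               * sleep loop below returns only once the command has completed.
               */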
 1328         if (is_admin_cmd)
 1329                 nvme_ctrlr_submit_admin_request(ctrlr, req);
 1330         else
 1331                 nvme_ctrlr_submit_io_request(ctrlr, req);
 1332 
 1333         mtx_lock(mtx);
 1334         while (pt->driver_lock != NULL)
 1335                 mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0);
 1336         mtx_unlock(mtx);
 1337 
 1338 err:
 1339         if (buf != NULL) {
 1340                 uma_zfree(pbuf_zone, buf);
 1341                 PRELE(curproc);
 1342         }
 1343 
 1344         return (ret);
 1345 }
 1346 
 1347 static int
 1348 nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
 1349     struct thread *td)
 1350 {
 1351         struct nvme_controller                  *ctrlr;
 1352         struct nvme_pt_command                  *pt;
 1353 
 1354         ctrlr = cdev->si_drv1;
 1355 
 1356         switch (cmd) {
 1357         case NVME_RESET_CONTROLLER:
 1358                 nvme_ctrlr_reset(ctrlr);
 1359                 break;
 1360         case NVME_PASSTHROUGH_CMD:
 1361                 pt = (struct nvme_pt_command *)arg;
 1362                 return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, le32toh(pt->cmd.nsid),
 1363                     1 /* is_user_buffer */, 1 /* is_admin_cmd */));
 1364         case NVME_GET_NSID:
 1365         {
 1366                 struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg;
 1367                 strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev),
 1368                     sizeof(gnsid->cdev));
 1369                 gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0';
 1370                 gnsid->nsid = 0;
 1371                 break;
 1372         }
 1373         case NVME_GET_MAX_XFER_SIZE:
 1374                 *(uint64_t *)arg = ctrlr->max_xfer_size;
 1375                 break;
 1376         default:
 1377                 return (ENOTTY);
 1378         }
 1379 
 1380         return (0);
 1381 }
 1382 
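      /*
       * Illustrative sketch only (not part of this file): how a userland
       * program, in the spirit of nvmecontrol(8), might drive the
       * NVME_PASSTHROUGH_CMD ioctl handled above to issue an admin Identify
       * Controller command.  The device path and the error handling are
       * assumptions made for the example.
       */
      #include <sys/endian.h>
      #include <sys/ioctl.h>
      #include <dev/nvme/nvme.h>
      #include <err.h>
      #include <fcntl.h>
      #include <stdlib.h>
      #include <string.h>
      #include <unistd.h>

      static void
      identify_controller_example(void)
      {
              struct nvme_pt_command pt;
              void *buf;
              int fd;

              /* Controller device node; the path is an assumption for the example. */
              fd = open("/dev/nvme0", O_RDWR);
              if (fd < 0)
                      err(1, "open");

              buf = calloc(1, sizeof(struct nvme_controller_data));
              if (buf == NULL)
                      err(1, "calloc");

              memset(&pt, 0, sizeof(pt));
              pt.cmd.opc = NVME_OPC_IDENTIFY;  /* admin opcode 0x06 */
              pt.cmd.cdw10 = htole32(1);       /* CNS=1: Identify Controller */
              pt.buf = buf;
              pt.len = sizeof(struct nvme_controller_data);
              pt.is_read = 1;                  /* controller-to-host transfer */

              /* The driver wires buf, builds the request, and sleeps until done. */
              if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) == -1)
                      err(1, "ioctl");
              if (nvme_completion_is_error(&pt.cpl))
                      errx(1, "identify failed, status 0x%04x", pt.cpl.status);

              /* buf now holds the controller's identify data. */
              free(buf);
              close(fd);
      }
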
 1383 static struct cdevsw nvme_ctrlr_cdevsw = {
 1384         .d_version =    D_VERSION,
 1385         .d_flags =      0,
 1386         .d_ioctl =      nvme_ctrlr_ioctl
 1387 };
 1388 
 1389 int
 1390 nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
 1391 {
 1392         struct make_dev_args    md_args;
 1393         uint32_t        cap_lo;
 1394         uint32_t        cap_hi;
 1395         uint32_t        to, vs, pmrcap;
 1396         int             status, timeout_period;
 1397 
 1398         ctrlr->dev = dev;
 1399 
 1400         mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF);
 1401         if (bus_get_domain(dev, &ctrlr->domain) != 0)
 1402                 ctrlr->domain = 0;
 1403 
 1404         ctrlr->cap_lo = cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
 1405         if (bootverbose) {
 1406                 device_printf(dev, "CapLo: 0x%08x: MQES %u%s%s%s%s, TO %u\n",
 1407                     cap_lo, NVME_CAP_LO_MQES(cap_lo),
 1408                     NVME_CAP_LO_CQR(cap_lo) ? ", CQR" : "",
 1409                     NVME_CAP_LO_AMS(cap_lo) ? ", AMS" : "",
 1410                     (NVME_CAP_LO_AMS(cap_lo) & 0x1) ? " WRRwUPC" : "",
 1411                     (NVME_CAP_LO_AMS(cap_lo) & 0x2) ? " VS" : "",
 1412                     NVME_CAP_LO_TO(cap_lo));
 1413         }
 1414         ctrlr->cap_hi = cap_hi = nvme_mmio_read_4(ctrlr, cap_hi);
 1415         if (bootverbose) {
 1416                 device_printf(dev, "CapHi: 0x%08x: DSTRD %u%s, CSS %x%s, "
 1417                     "MPSMIN %u, MPSMAX %u%s%s\n", cap_hi,
 1418                     NVME_CAP_HI_DSTRD(cap_hi),
 1419                     NVME_CAP_HI_NSSRS(cap_hi) ? ", NSSRS" : "",
 1420                     NVME_CAP_HI_CSS(cap_hi),
 1421                     NVME_CAP_HI_BPS(cap_hi) ? ", BPS" : "",
 1422                     NVME_CAP_HI_MPSMIN(cap_hi),
 1423                     NVME_CAP_HI_MPSMAX(cap_hi),
 1424                     NVME_CAP_HI_PMRS(cap_hi) ? ", PMRS" : "",
 1425                     NVME_CAP_HI_CMBS(cap_hi) ? ", CMBS" : "");
 1426         }
 1427         if (bootverbose) {
 1428                 vs = nvme_mmio_read_4(ctrlr, vs);
 1429                 device_printf(dev, "Version: 0x%08x: %d.%d\n", vs,
 1430                     NVME_MAJOR(vs), NVME_MINOR(vs));
 1431         }
 1432         if (bootverbose && NVME_CAP_HI_PMRS(cap_hi)) {
 1433                 pmrcap = nvme_mmio_read_4(ctrlr, pmrcap);
 1434                 device_printf(dev, "PMRCap: 0x%08x: BIR %u%s%s, PMRTU %u, "
 1435                     "PMRWBM %x, PMRTO %u%s\n", pmrcap,
 1436                     NVME_PMRCAP_BIR(pmrcap),
 1437                     NVME_PMRCAP_RDS(pmrcap) ? ", RDS" : "",
 1438                     NVME_PMRCAP_WDS(pmrcap) ? ", WDS" : "",
 1439                     NVME_PMRCAP_PMRTU(pmrcap),
 1440                     NVME_PMRCAP_PMRWBM(pmrcap),
 1441                     NVME_PMRCAP_PMRTO(pmrcap),
 1442                     NVME_PMRCAP_CMSS(pmrcap) ? ", CMSS" : "");
 1443         }
 1444 
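              /*
               * CAP.DSTRD encodes the doorbell stride as 2^(2 + DSTRD) bytes,
               * hence the +2 below; MPSMIN is likewise a power-of-two exponent
               * relative to the 4 KiB base page size (NVME_MPS_SHIFT), so
               * MPSMIN == 0 selects 4096-byte pages.
               */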
 1445         ctrlr->dstrd = NVME_CAP_HI_DSTRD(cap_hi) + 2;
 1446 
 1447         ctrlr->mps = NVME_CAP_HI_MPSMIN(cap_hi);
 1448         ctrlr->page_size = 1 << (NVME_MPS_SHIFT + ctrlr->mps);
 1449 
 1450         /* Get ready timeout value from controller, in units of 500ms. */
 1451         to = NVME_CAP_LO_TO(cap_lo) + 1;
 1452         ctrlr->ready_timeout_in_ms = to * 500;
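              /* For example, CAP.TO == 3 yields (3 + 1) * 500 = 2000 ms. */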
 1453 
 1454         timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD;
 1455         TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period);
 1456         timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
 1457         timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
 1458         ctrlr->timeout_period = timeout_period;
 1459 
 1460         nvme_retry_count = NVME_DEFAULT_RETRY_COUNT;
 1461         TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count);
 1462 
 1463         ctrlr->enable_aborts = 0;
 1464         TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);
 1465 
 1466         /* Cap transfers by the maximum addressable by page-sized PRP (4KB pages -> 2MB). */
 1467         ctrlr->max_xfer_size = MIN(maxphys, (ctrlr->page_size / 8 * ctrlr->page_size));
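              /*
               * Each PRP entry is 8 bytes, so one page-sized PRP list holds
               * page_size / 8 entries, each mapping one page: with 4 KiB pages
               * that is 512 * 4096 bytes = 2 MiB, further capped by maxphys.
               */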
 1468         if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0)
 1469                 return (ENXIO);
 1470 
 1471         /*
 1472          * Create 2 threads for the taskqueue. The reset thread will block when
 1473          * it detects that the controller has failed until all I/O has been
 1474          * failed up the stack. The fail_req task needs to be able to run in
 1475          * this case to finish the request failure for some cases.
 1476          *
 1477          * We could partially solve this race by draining the failed request
 1478          * queue before proceeding to free the sim, though nothing would stop
 1479          * new I/O from coming in after we do that drain, but before we reach
 1480          * cam_sim_free, so this big hammer is used instead.
 1481          */
 1482         ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
 1483             taskqueue_thread_enqueue, &ctrlr->taskqueue);
 1484         taskqueue_start_threads(&ctrlr->taskqueue, 2, PI_DISK, "nvme taskq");
 1485 
 1486         ctrlr->is_resetting = 0;
 1487         ctrlr->is_initialized = 0;
 1488         ctrlr->notification_sent = 0;
 1489         TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
 1490         TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr);
 1491         STAILQ_INIT(&ctrlr->fail_req);
 1492         ctrlr->is_failed = false;
 1493 
 1494         make_dev_args_init(&md_args);
 1495         md_args.mda_devsw = &nvme_ctrlr_cdevsw;
 1496         md_args.mda_uid = UID_ROOT;
 1497         md_args.mda_gid = GID_WHEEL;
 1498         md_args.mda_mode = 0600;
 1499         md_args.mda_unit = device_get_unit(dev);
 1500         md_args.mda_si_drv1 = (void *)ctrlr;
 1501         status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d",
 1502             device_get_unit(dev));
 1503         if (status != 0)
 1504                 return (ENXIO);
 1505 
 1506         return (0);
 1507 }
 1508 
 1509 void
 1510 nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
 1511 {
 1512         int     gone, i;
 1513 
 1514         ctrlr->is_dying = true;
 1515 
 1516         if (ctrlr->resource == NULL)
 1517                 goto nores;
 1518         if (!mtx_initialized(&ctrlr->adminq.lock))
 1519                 goto noadminq;
 1520 
 1521         /*
 1522          * Check whether it is a hot unplug or a clean driver detach.
 1523          * If the device is not there anymore, skip any shutdown commands.
 1524          */
 1525         gone = (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE);
 1526         if (gone)
 1527                 nvme_ctrlr_fail(ctrlr);
 1528         else
 1529                 nvme_notify_fail_consumers(ctrlr);
 1530 
 1531         for (i = 0; i < NVME_MAX_NAMESPACES; i++)
 1532                 nvme_ns_destruct(&ctrlr->ns[i]);
 1533 
 1534         if (ctrlr->cdev)
 1535                 destroy_dev(ctrlr->cdev);
 1536 
 1537         if (ctrlr->is_initialized) {
 1538                 if (!gone) {
 1539                         if (ctrlr->hmb_nchunks > 0)
 1540                                 nvme_ctrlr_hmb_enable(ctrlr, false, false);
 1541                         nvme_ctrlr_delete_qpairs(ctrlr);
 1542                 }
 1543                 nvme_ctrlr_hmb_free(ctrlr);
 1544         }
 1545         if (ctrlr->ioq != NULL) {
 1546                 for (i = 0; i < ctrlr->num_io_queues; i++)
 1547                         nvme_io_qpair_destroy(&ctrlr->ioq[i]);
 1548                 free(ctrlr->ioq, M_NVME);
 1549         }
 1550         nvme_admin_qpair_destroy(&ctrlr->adminq);
 1551 
 1552         /*
 1553          *  Notify the controller of a shutdown, even though this is due to
 1554          *   a driver unload, not a system shutdown (this path is not invoked
 1555          *   during shutdown).  This ensures the controller receives a
 1556          *   shutdown notification in case the system is shutdown before
 1557          *   reloading the driver.
 1558          */
 1559         if (!gone)
 1560                 nvme_ctrlr_shutdown(ctrlr);
 1561 
 1562         if (!gone)
 1563                 nvme_ctrlr_disable(ctrlr);
 1564 
 1565 noadminq:
 1566         if (ctrlr->taskqueue)
 1567                 taskqueue_free(ctrlr->taskqueue);
 1568 
 1569         if (ctrlr->tag)
 1570                 bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
 1571 
 1572         if (ctrlr->res)
 1573                 bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
 1574                     rman_get_rid(ctrlr->res), ctrlr->res);
 1575 
 1576         if (ctrlr->bar4_resource != NULL) {
 1577                 bus_release_resource(dev, SYS_RES_MEMORY,
 1578                     ctrlr->bar4_resource_id, ctrlr->bar4_resource);
 1579         }
 1580 
 1581         bus_release_resource(dev, SYS_RES_MEMORY,
 1582             ctrlr->resource_id, ctrlr->resource);
 1583 
 1584 nores:
 1585         mtx_destroy(&ctrlr->lock);
 1586 }
 1587 
 1588 void
 1589 nvme_ctrlr_shutdown(struct nvme_controller *ctrlr)
 1590 {
 1591         uint32_t        cc;
 1592         uint32_t        csts;
 1593         int             timeout;
 1594 
 1595         cc = nvme_mmio_read_4(ctrlr, cc);
 1596         cc &= ~(NVME_CC_REG_SHN_MASK << NVME_CC_REG_SHN_SHIFT);
 1597         cc |= NVME_SHN_NORMAL << NVME_CC_REG_SHN_SHIFT;
 1598         nvme_mmio_write_4(ctrlr, cc, cc);
 1599 
 1600         timeout = ticks + (ctrlr->cdata.rtd3e == 0 ? 5 * hz :
 1601             ((uint64_t)ctrlr->cdata.rtd3e * hz + 999999) / 1000000);
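              /*
               * cdata.rtd3e is the RTD3 Entry Latency in microseconds; the
               * division rounds up when converting to ticks, and 5 seconds is
               * the fallback when the controller does not report a value.
               */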
 1602         while (1) {
 1603                 csts = nvme_mmio_read_4(ctrlr, csts);
 1604                 if (csts == NVME_GONE)          /* Hot unplug. */
 1605                         break;
 1606                 if (NVME_CSTS_GET_SHST(csts) == NVME_SHST_COMPLETE)
 1607                         break;
 1608                 if (timeout - ticks < 0) {
 1609                         nvme_printf(ctrlr, "shutdown timeout\n");
 1610                         break;
 1611                 }
 1612                 pause("nvmeshut", 1);
 1613         }
 1614 }
 1615 
 1616 void
 1617 nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
 1618     struct nvme_request *req)
 1619 {
 1620 
 1621         nvme_qpair_submit_request(&ctrlr->adminq, req);
 1622 }
 1623 
 1624 void
 1625 nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
 1626     struct nvme_request *req)
 1627 {
 1628         struct nvme_qpair       *qpair;
 1629 
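              /*
               * QP() (see nvme_private.h) maps the submitting CPU to one of
               * the controller's I/O queue pairs, spreading requests from
               * different CPUs across the available queues.
               */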
 1630         qpair = &ctrlr->ioq[QP(ctrlr, curcpu)];
 1631         nvme_qpair_submit_request(qpair, req);
 1632 }
 1633 
 1634 device_t
 1635 nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
 1636 {
 1637 
 1638         return (ctrlr->dev);
 1639 }
 1640 
 1641 const struct nvme_controller_data *
 1642 nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
 1643 {
 1644 
 1645         return (&ctrlr->cdata);
 1646 }
 1647 
 1648 int
 1649 nvme_ctrlr_suspend(struct nvme_controller *ctrlr)
 1650 {
 1651         int to = hz;
 1652 
 1653         /*
 1654          * Can't touch failed controllers, so it's already suspended.
 1655          */
 1656         if (ctrlr->is_failed)
 1657                 return (0);
 1658 
 1659         /*
 1660          * We don't want the reset taskqueue running, since it does similar
 1661          * things, so prevent it from running after we start. Wait for any reset
 1662          * that may have been started to complete. The reset process we follow
 1663          * will ensure that any new I/O will queue and be given to the hardware
 1664          * after we resume (though there should be none).
 1665          */
 1666         while (atomic_cmpset_32(&ctrlr->is_resetting, 0, 1) == 0 && to-- > 0)
 1667                 pause("nvmesusp", 1);
 1668         if (to <= 0) {
 1669                 nvme_printf(ctrlr,
 1670                     "Competing reset task didn't finish. Try again later.\n");
 1671                 return (EWOULDBLOCK);
 1672         }
 1673 
 1674         if (ctrlr->hmb_nchunks > 0)
 1675                 nvme_ctrlr_hmb_enable(ctrlr, false, false);
 1676 
 1677         /*
 1678          * Per Section 7.6.2 of NVMe spec 1.4, to properly suspend, we need to
 1679          * delete the hardware I/O queues, and then shutdown. This properly
 1680          * flushes any metadata the drive may have stored so it can survive
 1681          * having its power removed and prevents the unsafe shutdown count from
 1682          * incrementing. Once we delete the qpairs, we have to disable them
 1683          * before shutting down.
 1684          */
 1685         nvme_ctrlr_delete_qpairs(ctrlr);
 1686         nvme_ctrlr_disable_qpairs(ctrlr);
 1687         nvme_ctrlr_shutdown(ctrlr);
 1688 
 1689         return (0);
 1690 }
 1691 
 1692 int
 1693 nvme_ctrlr_resume(struct nvme_controller *ctrlr)
 1694 {
 1695 
 1696         /*
 1697          * Can't touch failed controllers, so nothing to do to resume.
 1698          */
 1699         if (ctrlr->is_failed)
 1700                 return (0);
 1701 
 1702         if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 1703                 goto fail;
 1704 #ifdef NVME_2X_RESET
 1705         /*
 1706          * Prior to FreeBSD 13.1, FreeBSD's nvme driver reset the hardware twice
 1707          * to get it into a known good state. However, the hardware comes out of
 1708          * a single reset in a good state, so the second reset is not needed.
 1709          */
 1710         if (nvme_ctrlr_hw_reset(ctrlr) != 0)
 1711                 goto fail;
 1712 #endif
 1713 
 1714         /*
 1715          * Now that we've reset the hardware, we can restart the controller. Any
 1716          * I/O that was pending is requeued. Any admin commands are aborted with
 1717          * an error. Once we've restarted, take the controller out of reset.
 1718          */
 1719         nvme_ctrlr_start(ctrlr, true);
 1720         (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 1721 
 1722         return (0);
 1723 fail:
 1724         /*
 1725          * Since we can't bring the controller out of reset, announce and fail
 1726          * the controller. However, we have to return success for the resume
 1727          * itself, due to questionable APIs.
 1728          */
 1729         nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
 1730         nvme_ctrlr_fail(ctrlr);
 1731         (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 1732         return (0);
 1733 }
