FreeBSD/Linux Kernel Cross Reference
sys/dev/mpr/mpr.c


/*-
 * Copyright (c) 2009 Yahoo! Inc.
 * Copyright (c) 2011-2015 LSI Corp.
 * Copyright (c) 2013-2016 Avago Technologies
 * Copyright 2000-2020 Broadcom Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Broadcom Inc. (LSI) MPT-Fusion Host Adapter FreeBSD
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/* Communications core for Avago Technologies (LSI) MPT3 */

/* TODO Move headers to mprvar */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/selinfo.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/bio.h>
#include <sys/malloc.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#include <sys/queue.h>
#include <sys/kthread.h>
#include <sys/taskqueue.h>
#include <sys/endian.h>
#include <sys/eventhandler.h>
#include <sys/sbuf.h>
#include <sys/priv.h>

#include <machine/bus.h>
#include <machine/resource.h>
#include <sys/rman.h>
#include <sys/proc.h>

#include <dev/pci/pcivar.h>

#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <cam/scsi/scsi_all.h>

#include <dev/mpr/mpi/mpi2_type.h>
#include <dev/mpr/mpi/mpi2.h>
#include <dev/mpr/mpi/mpi2_ioc.h>
#include <dev/mpr/mpi/mpi2_sas.h>
#include <dev/mpr/mpi/mpi2_pci.h>
#include <dev/mpr/mpi/mpi2_cnfg.h>
#include <dev/mpr/mpi/mpi2_init.h>
#include <dev/mpr/mpi/mpi2_tool.h>
#include <dev/mpr/mpr_ioctl.h>
#include <dev/mpr/mprvar.h>
#include <dev/mpr/mpr_table.h>
#include <dev/mpr/mpr_sas.h>

static int mpr_diag_reset(struct mpr_softc *sc, int sleep_flag);
static int mpr_init_queues(struct mpr_softc *sc);
static void mpr_resize_queues(struct mpr_softc *sc);
static int mpr_message_unit_reset(struct mpr_softc *sc, int sleep_flag);
static int mpr_transition_operational(struct mpr_softc *sc);
static int mpr_iocfacts_allocate(struct mpr_softc *sc, uint8_t attaching);
static void mpr_iocfacts_free(struct mpr_softc *sc);
static void mpr_startup(void *arg);
static int mpr_send_iocinit(struct mpr_softc *sc);
static int mpr_alloc_queues(struct mpr_softc *sc);
static int mpr_alloc_hw_queues(struct mpr_softc *sc);
static int mpr_alloc_replies(struct mpr_softc *sc);
static int mpr_alloc_requests(struct mpr_softc *sc);
static int mpr_alloc_nvme_prp_pages(struct mpr_softc *sc);
static int mpr_attach_log(struct mpr_softc *sc);
static __inline void mpr_complete_command(struct mpr_softc *sc,
    struct mpr_command *cm);
static void mpr_dispatch_event(struct mpr_softc *sc, uintptr_t data,
    MPI2_EVENT_NOTIFICATION_REPLY *reply);
static void mpr_config_complete(struct mpr_softc *sc, struct mpr_command *cm);
static void mpr_periodic(void *);
static int mpr_reregister_events(struct mpr_softc *sc);
static void mpr_enqueue_request(struct mpr_softc *sc, struct mpr_command *cm);
static int mpr_get_iocfacts(struct mpr_softc *sc, MPI2_IOC_FACTS_REPLY *facts);
static int mpr_wait_db_ack(struct mpr_softc *sc, int timeout, int sleep_flag);
static int mpr_debug_sysctl(SYSCTL_HANDLER_ARGS);
static int mpr_dump_reqs(SYSCTL_HANDLER_ARGS);
static void mpr_parse_debug(struct mpr_softc *sc, char *list);
static void adjust_iocfacts_endianness(MPI2_IOC_FACTS_REPLY *facts);

SYSCTL_NODE(_hw, OID_AUTO, mpr, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "MPR Driver Parameters");

MALLOC_DEFINE(M_MPR, "mpr", "mpr driver memory");

/*
 * Do a "Diagnostic Reset" aka a hard reset.  This should get the chip out of
 * any state and back to its initialization state machine.
 */
static char mpt2_reset_magic[] = { 0x00, 0x0f, 0x04, 0x0b, 0x02, 0x07, 0x0d };

/*
 * This union is used to convert the 64-bit cm->cm_desc.Words value with
 * le64toh().  The compiler only accepts a plain uint64_t argument there;
 * passing the aggregate directly produces the error:
 * "aggregate value used where an integer was expected"
 */
typedef union {
        u64 word;
        struct {
                u32 low;
                u32 high;
        } u;
} request_descriptor_t;
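
/*
 * Illustrative sketch (not compiled in): why the union above is needed.
 * cm->cm_desc.Words is a two-member aggregate, so it cannot be passed
 * straight to le64toh(), which takes a plain uint64_t.  The union gives
 * both a 64-bit view for the byte swap and two 32-bit views for the
 * split register writes.  The names below are hypothetical.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

typedef union {
        uint64_t word;
        struct {
                uint32_t low;
                uint32_t high;
        } u;
} example_descriptor_t;

static void
example_split_descriptor(uint64_t desc_words)
{
        example_descriptor_t rd;

        rd.word = desc_words;   /* single 64-bit assignment after the swap */
        /* the two 32-bit views feed the low/high register writes */
        printf("low=0x%08x high=0x%08x\n", rd.u.low, rd.u.high);
}
#endif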

/* Rate limit chain-fail messages to 1 per minute */
static struct timeval mpr_chainfail_interval = { 60, 0 };
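
/*
 * Sketch (not compiled in) of how an interval like the one above is
 * typically consumed with ratecheck(9): the call returns non-zero at
 * most once per mpr_chainfail_interval, so the warning fires at most
 * once a minute.  The lasttime state variable here is hypothetical.
 */
#if 0
        static struct timeval lasttime;         /* hypothetical state */

        if (ratecheck(&lasttime, &mpr_chainfail_interval))
                mpr_dprint(sc, MPR_ERROR, "out of chain frames\n");
#endif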

/*
 * sleep_flag can be either CAN_SLEEP or NO_SLEEP.
 * If this function is called from process context it can sleep, and there
 * is no harm in sleeping.  If it is called from an interrupt handler, we
 * cannot sleep and the NO_SLEEP flag must be set.  Based on the sleep
 * flag, the driver will call msleep, pause or DELAY.  msleep and pause
 * behave the same way, but pause is used when mpr_mtx is not held by the
 * driver.
 */
static int
mpr_diag_reset(struct mpr_softc *sc, int sleep_flag)
{
        uint32_t reg;
        int i, error, tries = 0;
        uint8_t first_wait_done = FALSE;

        mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);

        /* Clear any pending interrupts */
        mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);

        /*
         * Force NO_SLEEP for threads that are prohibited from sleeping,
         * e.g. threads running in an interrupt handler.
         */
        if (curthread->td_no_sleeping)
                sleep_flag = NO_SLEEP;

        mpr_dprint(sc, MPR_INIT, "sequence start, sleep_flag=%d\n", sleep_flag);
        /* Push the magic sequence */
        error = ETIMEDOUT;
        while (tries++ < 20) {
                for (i = 0; i < sizeof(mpt2_reset_magic); i++)
                        mpr_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET,
                            mpt2_reset_magic[i]);

                /* wait 100 msec */
                if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
                        msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
                            "mprdiag", hz/10);
                else if (sleep_flag == CAN_SLEEP)
                        pause("mprdiag", hz/10);
                else
                        DELAY(100 * 1000);

                reg = mpr_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
                if (reg & MPI2_DIAG_DIAG_WRITE_ENABLE) {
                        error = 0;
                        break;
                }
        }
        if (error) {
                mpr_dprint(sc, MPR_INIT, "sequence failed, error=%d, exit\n",
                    error);
                return (error);
        }

        /* Send the actual reset.  XXX need to refresh the reg? */
        reg |= MPI2_DIAG_RESET_ADAPTER;
        mpr_dprint(sc, MPR_INIT, "sequence success, sending reset, reg= 0x%x\n",
            reg);
        mpr_regwrite(sc, MPI2_HOST_DIAGNOSTIC_OFFSET, reg);

        /* Wait up to 300 seconds in 50ms intervals */
        error = ETIMEDOUT;
        for (i = 0; i < 6000; i++) {
                /*
                 * Wait 50 msec. If this is the first time through, wait 256
                 * msec to satisfy Diag Reset timing requirements.
                 */
                if (first_wait_done) {
                        if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
                                msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
                                    "mprdiag", hz/20);
                        else if (sleep_flag == CAN_SLEEP)
                                pause("mprdiag", hz/20);
                        else
                                DELAY(50 * 1000);
                } else {
                        DELAY(256 * 1000);
                        first_wait_done = TRUE;
                }
                /*
                 * Check for the RESET_ADAPTER bit to be cleared first, then
                 * wait for the RESET state to be cleared, which takes a little
                 * longer.
                 */
                reg = mpr_regread(sc, MPI2_HOST_DIAGNOSTIC_OFFSET);
                if (reg & MPI2_DIAG_RESET_ADAPTER) {
                        continue;
                }
                reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
                if ((reg & MPI2_IOC_STATE_MASK) != MPI2_IOC_STATE_RESET) {
                        error = 0;
                        break;
                }
        }
        if (error) {
                mpr_dprint(sc, MPR_INIT, "reset failed, error= %d, exit\n",
                    error);
                return (error);
        }

        mpr_regwrite(sc, MPI2_WRITE_SEQUENCE_OFFSET, 0x0);
        mpr_dprint(sc, MPR_INIT, "diag reset success, exit\n");

        return (0);
}
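
/*
 * The msleep()/pause()/DELAY() dispatch used above recurs throughout this
 * file.  A minimal sketch (not compiled in) of it factored into a
 * standalone helper; the helper name is hypothetical, and ticks applies
 * to the sleeping cases:
 */
#if 0
static void
example_wait(struct mpr_softc *sc, int sleep_flag, int ticks)
{
        if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP) {
                /* Lock held: msleep() drops and reacquires mpr_mtx. */
                msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0, "mprwait",
                    ticks);
        } else if (sleep_flag == CAN_SLEEP) {
                /* Lock not held: pause() simply sleeps for the interval. */
                pause("mprwait", ticks);
        } else {
                /* Cannot sleep (e.g. interrupt context): busy-wait. */
                DELAY(ticks * 1000000 / hz);
        }
}
#endif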

static int
mpr_message_unit_reset(struct mpr_softc *sc, int sleep_flag)
{
        int error;

        MPR_FUNCTRACE(sc);

        mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);

        error = 0;
        mpr_regwrite(sc, MPI2_DOORBELL_OFFSET,
            MPI2_FUNCTION_IOC_MESSAGE_UNIT_RESET <<
            MPI2_DOORBELL_FUNCTION_SHIFT);

        if (mpr_wait_db_ack(sc, 5, sleep_flag) != 0) {
                mpr_dprint(sc, MPR_INIT|MPR_FAULT,
                    "Doorbell handshake failed\n");
                error = ETIMEDOUT;
        }

        mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
        return (error);
}

static int
mpr_transition_ready(struct mpr_softc *sc)
{
        uint32_t reg, state;
        int error, tries = 0;
        int sleep_flags;

        MPR_FUNCTRACE(sc);
        /* If we are in the attach call, do not sleep */
        sleep_flags = (sc->mpr_flags & MPR_FLAGS_ATTACH_DONE)
            ? CAN_SLEEP : NO_SLEEP;

        error = 0;

        mpr_dprint(sc, MPR_INIT, "%s entered, sleep_flags= %d\n",
            __func__, sleep_flags);

        while (tries++ < 1200) {
                reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
                mpr_dprint(sc, MPR_INIT, "  Doorbell= 0x%x\n", reg);

                /*
                 * Ensure the IOC is ready to talk.  If it's not, try
                 * resetting it.
                 */
                if (reg & MPI2_DOORBELL_USED) {
                        mpr_dprint(sc, MPR_INIT, "  Not ready, sending diag "
                            "reset\n");
                        mpr_diag_reset(sc, sleep_flags);
                        DELAY(50000);
                        continue;
                }

                /* Is the adapter owned by another peer? */
                if ((reg & MPI2_DOORBELL_WHO_INIT_MASK) ==
                    (MPI2_WHOINIT_PCI_PEER << MPI2_DOORBELL_WHO_INIT_SHIFT)) {
                        mpr_dprint(sc, MPR_INIT|MPR_FAULT, "IOC is under the "
                            "control of another peer host, aborting "
                            "initialization.\n");
                        error = ENXIO;
                        break;
                }

                state = reg & MPI2_IOC_STATE_MASK;
                if (state == MPI2_IOC_STATE_READY) {
                        /* Ready to go! */
                        error = 0;
                        break;
                } else if (state == MPI2_IOC_STATE_FAULT) {
                        mpr_dprint(sc, MPR_INIT|MPR_FAULT, "IOC in fault "
                            "state 0x%x, resetting\n",
                            state & MPI2_DOORBELL_FAULT_CODE_MASK);
                        mpr_diag_reset(sc, sleep_flags);
                } else if (state == MPI2_IOC_STATE_OPERATIONAL) {
                        /* Need to take ownership */
                        mpr_message_unit_reset(sc, sleep_flags);
                } else if (state == MPI2_IOC_STATE_RESET) {
                        /* Wait a bit, IOC might be in transition */
                        mpr_dprint(sc, MPR_INIT|MPR_FAULT,
                            "IOC in unexpected reset state\n");
                } else {
                        mpr_dprint(sc, MPR_INIT|MPR_FAULT,
                            "IOC in unknown state 0x%x\n", state);
                        error = EINVAL;
                        break;
                }

                /* Wait 50ms for things to settle down. */
                DELAY(50000);
        }

        if (error)
                mpr_dprint(sc, MPR_INIT|MPR_FAULT,
                    "Cannot transition IOC to ready\n");
        mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
        return (error);
}
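
/*
 * The Doorbell register polled in mpr_transition_ready() multiplexes
 * several fields.  A sketch (not compiled in) of decoding it with the
 * MPI2 masks used above; the helper name is hypothetical:
 */
#if 0
static void
example_decode_doorbell(uint32_t reg)
{
        uint32_t state = reg & MPI2_IOC_STATE_MASK;

        if (reg & MPI2_DOORBELL_USED)
                printf("doorbell in use, handshake in progress\n");
        if (state == MPI2_IOC_STATE_FAULT)
                printf("fault code 0x%04x\n",
                    reg & MPI2_DOORBELL_FAULT_CODE_MASK);
        printf("who-init field: %u\n",
            (reg & MPI2_DOORBELL_WHO_INIT_MASK) >>
            MPI2_DOORBELL_WHO_INIT_SHIFT);
}
#endif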

static int
mpr_transition_operational(struct mpr_softc *sc)
{
        uint32_t reg, state;
        int error;

        MPR_FUNCTRACE(sc);

        error = 0;
        reg = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
        mpr_dprint(sc, MPR_INIT, "%s entered, Doorbell= 0x%x\n", __func__, reg);

        state = reg & MPI2_IOC_STATE_MASK;
        if (state != MPI2_IOC_STATE_READY) {
                mpr_dprint(sc, MPR_INIT, "IOC not ready\n");
                if ((error = mpr_transition_ready(sc)) != 0) {
                        mpr_dprint(sc, MPR_INIT|MPR_FAULT,
                            "failed to transition ready, exit\n");
                        return (error);
                }
        }

        error = mpr_send_iocinit(sc);
        mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);

        return (error);
}

static void
mpr_resize_queues(struct mpr_softc *sc)
{
        u_int reqcr, prireqcr, maxio, sges_per_frame, chain_seg_size;

        /*
         * Size the queues. Since the reply queues always need one free
         * entry, we'll deduct one reply message here.  The LSI documents
         * suggest instead to add a count to the request queue, but I think
         * that it's better to deduct from the reply queue.
         */
        prireqcr = MAX(1, sc->max_prireqframes);
        prireqcr = MIN(prireqcr, sc->facts->HighPriorityCredit);

        reqcr = MAX(2, sc->max_reqframes);
        reqcr = MIN(reqcr, sc->facts->RequestCredit);

        sc->num_reqs = prireqcr + reqcr;
        sc->num_prireqs = prireqcr;
        sc->num_replies = MIN(sc->max_replyframes + sc->max_evtframes,
            sc->facts->MaxReplyDescriptorPostQueueDepth) - 1;

        /* Store the request frame size in bytes rather than as 32bit words */
        sc->reqframesz = sc->facts->IOCRequestFrameSize * 4;

        /*
         * Gen3 and beyond uses the IOCMaxChainSegmentSize from IOC Facts to
         * get the size of a Chain Frame.  Previous versions use the size of a
         * Request Frame as the Chain Frame size.  If IOCMaxChainSegmentSize
         * is 0, use the default value.  The IOCMaxChainSegmentSize is the
         * number of 16-byte elements that can fit in a Chain Frame, which is
         * the size of an IEEE Simple SGE.
         */
        if (sc->facts->MsgVersion >= MPI2_VERSION_02_05) {
                chain_seg_size = sc->facts->IOCMaxChainSegmentSize;
                if (chain_seg_size == 0)
                        chain_seg_size = MPR_DEFAULT_CHAIN_SEG_SIZE;
                sc->chain_frame_size = chain_seg_size *
                    MPR_MAX_CHAIN_ELEMENT_SIZE;
        } else {
                sc->chain_frame_size = sc->reqframesz;
        }

        /*
         * Max IO Size is Page Size * the following:
         * ((SGEs per frame - 1 for chain element) * Max Chain Depth)
         * + 1 for no chain needed in last frame
         *
         * If the user suggests a Max IO size to use, use the smaller of the
         * user's value and the calculated value as long as the user's
         * value is larger than 0. The user's value is in pages.
         */
        sges_per_frame = sc->chain_frame_size/sizeof(MPI2_IEEE_SGE_SIMPLE64)-1;
        maxio = (sges_per_frame * sc->facts->MaxChainDepth + 1) * PAGE_SIZE;
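
        /*
         * Worked example with assumed values: with 4KB pages, a 256-byte
         * chain frame, and sizeof(MPI2_IEEE_SGE_SIMPLE64) == 16,
         * sges_per_frame = 256 / 16 - 1 = 15.  If the IOC reports
         * MaxChainDepth = 128, then maxio = (15 * 128 + 1) * 4096 =
         * 7868416 bytes, roughly 7.5MB per I/O before any user limit.
         */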

        /*
         * If I/O size limitation requested then use it and pass up to CAM.
         * If not, use maxphys as an optimization hint, but report HW limit.
         */
        if (sc->max_io_pages > 0) {
                maxio = min(maxio, sc->max_io_pages * PAGE_SIZE);
                sc->maxio = maxio;
        } else {
                sc->maxio = maxio;
                maxio = min(maxio, maxphys);
        }

        sc->num_chains = (maxio / PAGE_SIZE + sges_per_frame - 2) /
            sges_per_frame * reqcr;
        if (sc->max_chains > 0 && sc->max_chains < sc->num_chains)
                sc->num_chains = sc->max_chains;

        /*
         * Figure out the number of MSIx-based queues.  If the firmware or
         * user has done something crazy and not allowed enough credit for
         * the queues to be useful then don't enable multi-queue.
         */
        if (sc->facts->MaxMSIxVectors < 2)
                sc->msi_msgs = 1;

        if (sc->msi_msgs > 1) {
                sc->msi_msgs = MIN(sc->msi_msgs, mp_ncpus);
                sc->msi_msgs = MIN(sc->msi_msgs, sc->facts->MaxMSIxVectors);
                if (sc->num_reqs / sc->msi_msgs < 2)
                        sc->msi_msgs = 1;
        }

        mpr_dprint(sc, MPR_INIT, "Sized queues to q=%d reqs=%d replies=%d\n",
            sc->msi_msgs, sc->num_reqs, sc->num_replies);
}
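
/*
 * Worked example with assumed values: on a 16-CPU system where the IOC
 * reports MaxMSIxVectors = 8, msi_msgs is clamped to 8.  With
 * num_reqs = 1024 that leaves 1024 / 8 = 128 requests per queue, well
 * above the two-per-queue floor, so multi-queue stays enabled.
 */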

/*
 * This is called during attach and when re-initializing due to a Diag Reset.
 * IOC Facts is used to allocate many of the structures needed by the driver.
 * If called from attach, de-allocation is not required because the driver has
 * not allocated any structures yet, but if called from a Diag Reset, previously
 * allocated structures based on IOC Facts will need to be freed and re-
 * allocated based on the latest IOC Facts.
 */
static int
mpr_iocfacts_allocate(struct mpr_softc *sc, uint8_t attaching)
{
        int error;
        Mpi2IOCFactsReply_t saved_facts;
        uint8_t saved_mode, reallocating;

        mpr_dprint(sc, MPR_INIT|MPR_TRACE, "%s entered\n", __func__);

        /* Save old IOC Facts and then only reallocate if Facts have changed */
        if (!attaching) {
                bcopy(sc->facts, &saved_facts, sizeof(MPI2_IOC_FACTS_REPLY));
        }

        /*
         * Get IOC Facts.  In all cases throughout this function, panic if doing
         * a re-initialization and only return the error if attaching so the OS
         * can handle it.
         */
        if ((error = mpr_get_iocfacts(sc, sc->facts)) != 0) {
                if (attaching) {
                        mpr_dprint(sc, MPR_INIT|MPR_FAULT, "Failed to get "
                            "IOC Facts with error %d, exit\n", error);
                        return (error);
                } else {
                        panic("%s failed to get IOC Facts with error %d\n",
                            __func__, error);
                }
        }

        MPR_DPRINT_PAGE(sc, MPR_XINFO, iocfacts, sc->facts);

        snprintf(sc->fw_version, sizeof(sc->fw_version),
            "%02d.%02d.%02d.%02d",
            sc->facts->FWVersion.Struct.Major,
            sc->facts->FWVersion.Struct.Minor,
            sc->facts->FWVersion.Struct.Unit,
            sc->facts->FWVersion.Struct.Dev);

        snprintf(sc->msg_version, sizeof(sc->msg_version), "%d.%d",
            (sc->facts->MsgVersion & MPI2_IOCFACTS_MSGVERSION_MAJOR_MASK) >>
            MPI2_IOCFACTS_MSGVERSION_MAJOR_SHIFT,
            (sc->facts->MsgVersion & MPI2_IOCFACTS_MSGVERSION_MINOR_MASK) >>
            MPI2_IOCFACTS_MSGVERSION_MINOR_SHIFT);

        mpr_dprint(sc, MPR_INFO, "Firmware: %s, Driver: %s\n", sc->fw_version,
            MPR_DRIVER_VERSION);
        mpr_dprint(sc, MPR_INFO,
            "IOCCapabilities: %b\n", sc->facts->IOCCapabilities,
            "\2" "\3ScsiTaskFull" "\4DiagTrace" "\5SnapBuf" "\6ExtBuf"
            "\7EEDP" "\10BiDirTarg" "\11Multicast" "\14TransRetry" "\15IR"
            "\16EventReplay" "\17RaidAccel" "\20MSIXIndex" "\21HostDisc"
            "\22FastPath" "\23RDPQArray" "\24AtomicReqDesc" "\25PCIeSRIOV");

        /*
         * If the chip doesn't support event replay then a hard reset will be
         * required to trigger a full discovery.  Do the reset here then
         * retransition to Ready.  A hard reset might have already been done,
         * but it doesn't hurt to do it again.  Only do this if attaching, not
         * for a Diag Reset.
         */
        if (attaching && ((sc->facts->IOCCapabilities &
            MPI2_IOCFACTS_CAPABILITY_EVENT_REPLAY) == 0)) {
                mpr_dprint(sc, MPR_INIT, "No event replay, resetting\n");
                mpr_diag_reset(sc, NO_SLEEP);
                if ((error = mpr_transition_ready(sc)) != 0) {
                        mpr_dprint(sc, MPR_INIT|MPR_FAULT, "Failed to "
                            "transition to ready with error %d, exit\n",
                            error);
                        return (error);
                }
        }

        /*
         * Set flag if IR Firmware is loaded.  If the RAID Capability has
         * changed from the previous IOC Facts, log a warning, but only if
         * checking this after a Diag Reset and not during attach.
         */
        saved_mode = sc->ir_firmware;
        if (sc->facts->IOCCapabilities &
            MPI2_IOCFACTS_CAPABILITY_INTEGRATED_RAID)
                sc->ir_firmware = 1;
        if (!attaching) {
                if (sc->ir_firmware != saved_mode) {
                        mpr_dprint(sc, MPR_INIT|MPR_FAULT, "new IR/IT mode "
                            "in IOC Facts does not match previous mode\n");
                }
        }

        /* Only deallocate and reallocate if relevant IOC Facts have changed */
        reallocating = FALSE;
        sc->mpr_flags &= ~MPR_FLAGS_REALLOCATED;

        if ((!attaching) &&
            ((saved_facts.MsgVersion != sc->facts->MsgVersion) ||
            (saved_facts.HeaderVersion != sc->facts->HeaderVersion) ||
            (saved_facts.MaxChainDepth != sc->facts->MaxChainDepth) ||
            (saved_facts.RequestCredit != sc->facts->RequestCredit) ||
            (saved_facts.ProductID != sc->facts->ProductID) ||
            (saved_facts.IOCCapabilities != sc->facts->IOCCapabilities) ||
            (saved_facts.IOCRequestFrameSize !=
            sc->facts->IOCRequestFrameSize) ||
            (saved_facts.IOCMaxChainSegmentSize !=
            sc->facts->IOCMaxChainSegmentSize) ||
            (saved_facts.MaxTargets != sc->facts->MaxTargets) ||
            (saved_facts.MaxSasExpanders != sc->facts->MaxSasExpanders) ||
            (saved_facts.MaxEnclosures != sc->facts->MaxEnclosures) ||
            (saved_facts.HighPriorityCredit != sc->facts->HighPriorityCredit) ||
            (saved_facts.MaxReplyDescriptorPostQueueDepth !=
            sc->facts->MaxReplyDescriptorPostQueueDepth) ||
            (saved_facts.ReplyFrameSize != sc->facts->ReplyFrameSize) ||
            (saved_facts.MaxVolumes != sc->facts->MaxVolumes) ||
            (saved_facts.MaxPersistentEntries !=
            sc->facts->MaxPersistentEntries))) {
                reallocating = TRUE;

                /* Record that we reallocated everything */
                sc->mpr_flags |= MPR_FLAGS_REALLOCATED;
        }

        /*
         * Some things should be done if attaching or re-allocating after a Diag
         * Reset, but are not needed after a Diag Reset if the FW has not
         * changed.
         */
        if (attaching || reallocating) {
                /*
                 * Check if controller supports FW diag buffers and set flag to
                 * enable each type.
                 */
                if (sc->facts->IOCCapabilities &
                    MPI2_IOCFACTS_CAPABILITY_DIAG_TRACE_BUFFER)
                        sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_TRACE].
                            enabled = TRUE;
                if (sc->facts->IOCCapabilities &
                    MPI2_IOCFACTS_CAPABILITY_SNAPSHOT_BUFFER)
                        sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_SNAPSHOT].
                            enabled = TRUE;
                if (sc->facts->IOCCapabilities &
                    MPI2_IOCFACTS_CAPABILITY_EXTENDED_BUFFER)
                        sc->fw_diag_buffer_list[MPI2_DIAG_BUF_TYPE_EXTENDED].
                            enabled = TRUE;

                /*
                 * Set flags for some supported items.
                 */
                if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_EEDP)
                        sc->eedp_enabled = TRUE;
                if (sc->facts->IOCCapabilities & MPI2_IOCFACTS_CAPABILITY_TLR)
                        sc->control_TLR = TRUE;
                if ((sc->facts->IOCCapabilities &
                    MPI26_IOCFACTS_CAPABILITY_ATOMIC_REQ) &&
                    (sc->mpr_flags & MPR_FLAGS_SEA_IOC))
                        sc->atomic_desc_capable = TRUE;

                mpr_resize_queues(sc);

                /*
                 * Initialize all Tail Queues
                 */
                TAILQ_INIT(&sc->req_list);
                TAILQ_INIT(&sc->high_priority_req_list);
                TAILQ_INIT(&sc->chain_list);
                TAILQ_INIT(&sc->prp_page_list);
                TAILQ_INIT(&sc->tm_list);
        }

        /*
         * If doing a Diag Reset and the FW is significantly different
         * (reallocating will be set above in IOC Facts comparison), then all
         * buffers based on the IOC Facts will need to be freed before they are
         * reallocated.
         */
        if (reallocating) {
                mpr_iocfacts_free(sc);
                mprsas_realloc_targets(sc, saved_facts.MaxTargets +
                    saved_facts.MaxVolumes);
        }

        /*
         * Any deallocation has been completed.  Now start reallocating
         * if needed.  Will only need to reallocate if attaching or if the new
         * IOC Facts are different from the previous IOC Facts after a Diag
         * Reset. Targets have already been allocated above if needed.
         */
        error = 0;
        while (attaching || reallocating) {
                if ((error = mpr_alloc_hw_queues(sc)) != 0)
                        break;
                if ((error = mpr_alloc_replies(sc)) != 0)
                        break;
                if ((error = mpr_alloc_requests(sc)) != 0)
                        break;
                if ((error = mpr_alloc_queues(sc)) != 0)
                        break;
                break;
        }
        if (error) {
                mpr_dprint(sc, MPR_INIT|MPR_ERROR,
                    "Failed to alloc queues with error %d\n", error);
                mpr_free(sc);
                return (error);
        }

        /* Always initialize the queues */
        bzero(sc->free_queue, sc->fqdepth * 4);
        mpr_init_queues(sc);

        /*
         * Always get the chip out of the reset state, but only panic if not
         * attaching.  If attaching and there is an error, that is handled by
         * the OS.
         */
        error = mpr_transition_operational(sc);
        if (error != 0) {
                mpr_dprint(sc, MPR_INIT|MPR_FAULT, "Failed to "
                    "transition to operational with error %d\n", error);
                mpr_free(sc);
                return (error);
        }

        /*
         * Finish the queue initialization.
         * These are set here instead of in mpr_init_queues() because the
         * IOC resets these values during the state transition in
         * mpr_transition_operational().  The free index is set to 1
         * because the corresponding index in the IOC is set to 0, and the
         * IOC treats the queues as full if both are set to the same value.
         * Hence the reason that the queue can't hold all of the possible
         * replies.
         */
        sc->replypostindex = 0;
        mpr_regwrite(sc, MPI2_REPLY_FREE_HOST_INDEX_OFFSET, sc->replyfreeindex);
        mpr_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET, 0);
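
        /*
         * Sketch (not compiled in) of the index convention described
         * above: the reply free queue counts as full when the host and
         * IOC indices are equal, so one slot is always sacrificed.  The
         * variable names are illustrative.
         */
#if 0
        /* Equal indices mean "full" to the IOC, hence the -1 capacity. */
        full = (host_index == ioc_index);
        /* The host produces free replies and wraps around the ring. */
        host_index = (host_index + 1) % sc->fqdepth;
#endif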

        /*
         * Attach the subsystems so they can prepare their event masks.
         * XXX Should be dynamic so that IM/IR and user modules can attach
         */
        error = 0;
        while (attaching) {
                mpr_dprint(sc, MPR_INIT, "Attaching subsystems\n");
                if ((error = mpr_attach_log(sc)) != 0)
                        break;
                if ((error = mpr_attach_sas(sc)) != 0)
                        break;
                if ((error = mpr_attach_user(sc)) != 0)
                        break;
                break;
        }
        if (error) {
                mpr_dprint(sc, MPR_INIT|MPR_ERROR,
                    "Failed to attach all subsystems: error %d\n", error);
                mpr_free(sc);
                return (error);
        }

        /*
         * XXX If the number of MSI-X vectors changes during re-init, this
         * won't see it and adjust.
         */
        if ((attaching || reallocating) && (error = mpr_pci_setup_interrupts(sc)) != 0) {
                mpr_dprint(sc, MPR_INIT|MPR_ERROR,
                    "Failed to setup interrupts\n");
                mpr_free(sc);
                return (error);
        }

        return (error);
}

/*
 * This is called when memory is being freed (during detach, for example) and
 * when buffers need to be reallocated due to a Diag Reset.
 */
static void
mpr_iocfacts_free(struct mpr_softc *sc)
{
        struct mpr_command *cm;
        int i;

        mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);

        if (sc->free_busaddr != 0)
                bus_dmamap_unload(sc->queues_dmat, sc->queues_map);
        if (sc->free_queue != NULL)
                bus_dmamem_free(sc->queues_dmat, sc->free_queue,
                    sc->queues_map);
        if (sc->queues_dmat != NULL)
                bus_dma_tag_destroy(sc->queues_dmat);

        if (sc->chain_frames != NULL) {
                bus_dmamap_unload(sc->chain_dmat, sc->chain_map);
                bus_dmamem_free(sc->chain_dmat, sc->chain_frames,
                    sc->chain_map);
        }
        if (sc->chain_dmat != NULL)
                bus_dma_tag_destroy(sc->chain_dmat);

        if (sc->sense_busaddr != 0)
                bus_dmamap_unload(sc->sense_dmat, sc->sense_map);
        if (sc->sense_frames != NULL)
                bus_dmamem_free(sc->sense_dmat, sc->sense_frames,
                    sc->sense_map);
        if (sc->sense_dmat != NULL)
                bus_dma_tag_destroy(sc->sense_dmat);

        if (sc->prp_page_busaddr != 0)
                bus_dmamap_unload(sc->prp_page_dmat, sc->prp_page_map);
        if (sc->prp_pages != NULL)
                bus_dmamem_free(sc->prp_page_dmat, sc->prp_pages,
                    sc->prp_page_map);
        if (sc->prp_page_dmat != NULL)
                bus_dma_tag_destroy(sc->prp_page_dmat);

        if (sc->reply_busaddr != 0)
                bus_dmamap_unload(sc->reply_dmat, sc->reply_map);
        if (sc->reply_frames != NULL)
                bus_dmamem_free(sc->reply_dmat, sc->reply_frames,
                    sc->reply_map);
        if (sc->reply_dmat != NULL)
                bus_dma_tag_destroy(sc->reply_dmat);

        if (sc->req_busaddr != 0)
                bus_dmamap_unload(sc->req_dmat, sc->req_map);
        if (sc->req_frames != NULL)
                bus_dmamem_free(sc->req_dmat, sc->req_frames, sc->req_map);
        if (sc->req_dmat != NULL)
                bus_dma_tag_destroy(sc->req_dmat);

        if (sc->chains != NULL)
                free(sc->chains, M_MPR);
        if (sc->prps != NULL)
                free(sc->prps, M_MPR);
        if (sc->commands != NULL) {
                for (i = 1; i < sc->num_reqs; i++) {
                        cm = &sc->commands[i];
                        bus_dmamap_destroy(sc->buffer_dmat, cm->cm_dmamap);
                }
                free(sc->commands, M_MPR);
        }
        if (sc->buffer_dmat != NULL)
                bus_dma_tag_destroy(sc->buffer_dmat);

        mpr_pci_free_interrupts(sc);
        free(sc->queues, M_MPR);
        sc->queues = NULL;
}

/*
 * The terms diag reset and hard reset are used interchangeably in the MPI
 * docs to mean resetting the controller chip.  In this code diag reset
 * cleans everything up, and the hard reset function just sends the reset
 * sequence to the chip.  This should probably be refactored so that every
 * subsystem gets a reset notification of some sort, and can clean up
 * appropriately.
 */
int
mpr_reinit(struct mpr_softc *sc)
{
        int error;
        struct mprsas_softc *sassc;

        sassc = sc->sassc;

        MPR_FUNCTRACE(sc);

        mtx_assert(&sc->mpr_mtx, MA_OWNED);

        mpr_dprint(sc, MPR_INIT|MPR_INFO, "Reinitializing controller\n");
        if (sc->mpr_flags & MPR_FLAGS_DIAGRESET) {
                mpr_dprint(sc, MPR_INIT, "Reset already in progress\n");
                return 0;
        }

        /*
         * Make sure the completion callbacks can recognize they're getting
         * a NULL cm_reply due to a reset.
         */
        sc->mpr_flags |= MPR_FLAGS_DIAGRESET;

        /*
         * Mask interrupts here.
         */
        mpr_dprint(sc, MPR_INIT, "Masking interrupts and resetting\n");
        mpr_mask_intr(sc);

        error = mpr_diag_reset(sc, CAN_SLEEP);
        if (error != 0) {
                panic("%s hard reset failed with error %d\n", __func__, error);
        }

        /* Restore the PCI state, including the MSI-X registers */
        mpr_pci_restore(sc);

        /* Give the I/O subsystem special priority to get itself prepared */
        mprsas_handle_reinit(sc);

        /*
         * Get IOC Facts and allocate all structures based on this information.
         * The attach function will also call mpr_iocfacts_allocate at startup.
         * If relevant values have changed in IOC Facts, this function will free
         * all of the memory based on IOC Facts and reallocate that memory.
         */
        if ((error = mpr_iocfacts_allocate(sc, FALSE)) != 0) {
                panic("%s IOC Facts based allocation failed with error %d\n",
                    __func__, error);
        }

        /*
         * Mapping structures will be re-allocated after getting IOC Page8, so
         * free these structures here.
         */
        mpr_mapping_exit(sc);

        /*
         * The static page function currently read is IOC Page8.  Others can be
         * added in future.  It's possible that the values in IOC Page8 have
         * changed after a Diag Reset due to user modification, so always read
         * these.  Interrupts are masked, so unmask them before getting config
         * pages.
         */
        mpr_unmask_intr(sc);
        sc->mpr_flags &= ~MPR_FLAGS_DIAGRESET;
        mpr_base_static_config_pages(sc);

        /*
         * Some mapping info is based in IOC Page8 data, so re-initialize the
         * mapping tables.
         */
        mpr_mapping_initialize(sc);

        /*
         * Restart will reload the event masks clobbered by the reset, and
         * then enable the port.
         */
        mpr_reregister_events(sc);

        /* the end of discovery will release the simq, so we're done. */
        mpr_dprint(sc, MPR_INIT|MPR_XINFO, "Finished sc %p post %u free %u\n",
            sc, sc->replypostindex, sc->replyfreeindex);
        mprsas_release_simq_reinit(sassc);
        mpr_dprint(sc, MPR_INIT, "%s exit error= %d\n", __func__, error);

        return 0;
}

/*
 * Wait for the chip to ACK a word that we've put into its FIFO.
 * Wait for up to <timeout> seconds.  Each iteration of the busy loop
 * waits 500 microseconds, so the total budget is
 * 0.5 * (2000 * <timeout>) milliseconds.
 */
static int
mpr_wait_db_ack(struct mpr_softc *sc, int timeout, int sleep_flag)
{
        u32 cntdn, count;
        u32 int_status;
        u32 doorbell;

        count = 0;
        cntdn = (sleep_flag == CAN_SLEEP) ? 1000*timeout : 2000*timeout;
        do {
                int_status = mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
                if (!(int_status & MPI2_HIS_SYS2IOC_DB_STATUS)) {
                        mpr_dprint(sc, MPR_TRACE, "%s: successful count(%d), "
                            "timeout(%d)\n", __func__, count, timeout);
                        return 0;
                } else if (int_status & MPI2_HIS_IOC2SYS_DB_STATUS) {
                        doorbell = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
                        if ((doorbell & MPI2_IOC_STATE_MASK) ==
                            MPI2_IOC_STATE_FAULT) {
                                mpr_dprint(sc, MPR_FAULT,
                                    "fault_state(0x%04x)!\n", doorbell);
                                return (EFAULT);
                        }
                } else if (int_status == 0xFFFFFFFF)
                        goto out;

                /*
                 * If it can sleep, sleep for 1 millisecond, else busy loop
                 * for 0.5 millisecond.
                 */
                if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP)
                        msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0, "mprdba",
                            hz/1000);
                else if (sleep_flag == CAN_SLEEP)
                        pause("mprdba", hz/1000);
                else
                        DELAY(500);
                count++;
        } while (--cntdn);

out:
        mpr_dprint(sc, MPR_FAULT, "%s: failed due to timeout count(%d), "
                "int_status(%x)!\n", __func__, count, int_status);
        return (ETIMEDOUT);
}
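
/*
 * Worked example: for the 5-second timeout used by the doorbell
 * handshake, NO_SLEEP gives cntdn = 2000 * 5 = 10000 iterations of a
 * 500us busy-wait, i.e. 5000ms total; CAN_SLEEP gives cntdn =
 * 1000 * 5 = 5000 iterations of a roughly 1ms sleep, the same budget.
 */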

/* Wait for the chip to signal that the next word in its FIFO can be fetched */
static int
mpr_wait_db_int(struct mpr_softc *sc)
{
        int retry;

        for (retry = 0; retry < MPR_DB_MAX_WAIT; retry++) {
                if ((mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET) &
                    MPI2_HIS_IOC2SYS_DB_STATUS) != 0)
                        return (0);
                DELAY(2000);
        }
        return (ETIMEDOUT);
}

/* Step through the synchronous command state machine, i.e. "Doorbell mode" */
static int
mpr_request_sync(struct mpr_softc *sc, void *req, MPI2_DEFAULT_REPLY *reply,
    int req_sz, int reply_sz, int timeout)
{
        uint32_t *data32;
        uint16_t *data16;
        int i, count, ioc_sz, residual;
        int sleep_flags = CAN_SLEEP;

        if (curthread->td_no_sleeping)
                sleep_flags = NO_SLEEP;

        /* Step 1 */
        mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);

        /* Step 2 */
        if (mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
                return (EBUSY);

        /* Step 3
         * Announce that a message is coming through the doorbell.  Messages
         * are pushed as 32-bit words, so round up if needed.
         */
        count = (req_sz + 3) / 4;
        mpr_regwrite(sc, MPI2_DOORBELL_OFFSET,
            (MPI2_FUNCTION_HANDSHAKE << MPI2_DOORBELL_FUNCTION_SHIFT) |
            (count << MPI2_DOORBELL_ADD_DWORDS_SHIFT));

        /* Step 4 */
        if (mpr_wait_db_int(sc) ||
            (mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED) == 0) {
                mpr_dprint(sc, MPR_FAULT, "Doorbell failed to activate\n");
                return (ENXIO);
        }
        mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
        if (mpr_wait_db_ack(sc, 5, sleep_flags) != 0) {
                mpr_dprint(sc, MPR_FAULT, "Doorbell handshake failed\n");
                return (ENXIO);
        }

        /* Step 5 */
        /* Clock out the message data synchronously in 32-bit dwords */
        data32 = (uint32_t *)req;
        for (i = 0; i < count; i++) {
                mpr_regwrite(sc, MPI2_DOORBELL_OFFSET, htole32(data32[i]));
                if (mpr_wait_db_ack(sc, 5, sleep_flags) != 0) {
                        mpr_dprint(sc, MPR_FAULT,
                            "Timeout while writing doorbell\n");
                        return (ENXIO);
                }
        }

        /* Step 6 */
        /* Clock in the reply in 16-bit words.  The total length of the
         * message is always in the 4th byte, so clock in the first 2 words
         * manually, then loop over the rest.
         */
        data16 = (uint16_t *)reply;
        if (mpr_wait_db_int(sc) != 0) {
                mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell 0\n");
                return (ENXIO);
        }

        /*
         * On a BE platform, swap bytes using le16toh so as not to
         * disturb 8-bit field neighbors in the destination structure
         * pointed to by data16.
         */
        data16[0] =
            le16toh(mpr_regread(sc, MPI2_DOORBELL_OFFSET)) & MPI2_DOORBELL_DATA_MASK;
        mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
        if (mpr_wait_db_int(sc) != 0) {
                mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell 1\n");
                return (ENXIO);
        }
        data16[1] =
            le16toh(mpr_regread(sc, MPI2_DOORBELL_OFFSET)) & MPI2_DOORBELL_DATA_MASK;
        mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);

        /* Number of 32bit words in the message */
        ioc_sz = reply->MsgLength;

        /*
         * Figure out how many 16bit words to clock in without overrunning.
         * The precision loss with dividing reply_sz can safely be
         * ignored because the messages can only be multiples of 32bits.
         */
        residual = 0;
        count = MIN((reply_sz / 4), ioc_sz) * 2;
        if (count < ioc_sz * 2) {
                residual = ioc_sz * 2 - count;
                mpr_dprint(sc, MPR_ERROR, "Driver error, throwing away %d "
                    "residual message words\n", residual);
        }

        for (i = 2; i < count; i++) {
                if (mpr_wait_db_int(sc) != 0) {
                        mpr_dprint(sc, MPR_FAULT,
                            "Timeout reading doorbell %d\n", i);
                        return (ENXIO);
                }
                data16[i] = le16toh(mpr_regread(sc, MPI2_DOORBELL_OFFSET)) &
                    MPI2_DOORBELL_DATA_MASK;
                mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
        }

        /*
         * Pull out residual words that won't fit into the provided buffer.
         * This keeps the chip from hanging due to a driver programming
         * error.
         */
        while (residual--) {
                if (mpr_wait_db_int(sc) != 0) {
                        mpr_dprint(sc, MPR_FAULT, "Timeout reading doorbell\n");
                        return (ENXIO);
                }
                (void)mpr_regread(sc, MPI2_DOORBELL_OFFSET);
                mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);
        }

        /* Step 7 */
        if (mpr_wait_db_int(sc) != 0) {
                mpr_dprint(sc, MPR_FAULT, "Timeout waiting to exit doorbell\n");
                return (ENXIO);
        }
        if (mpr_regread(sc, MPI2_DOORBELL_OFFSET) & MPI2_DOORBELL_USED)
                mpr_dprint(sc, MPR_FAULT, "Warning, doorbell still active\n");
        mpr_regwrite(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET, 0x0);

        return (0);
}

static void
mpr_enqueue_request(struct mpr_softc *sc, struct mpr_command *cm)
{
        request_descriptor_t rd;

        MPR_FUNCTRACE(sc);
        mpr_dprint(sc, MPR_TRACE, "SMID %u cm %p ccb %p\n",
            cm->cm_desc.Default.SMID, cm, cm->cm_ccb);

        if (sc->mpr_flags & MPR_FLAGS_ATTACH_DONE && !(sc->mpr_flags &
            MPR_FLAGS_SHUTDOWN))
                mtx_assert(&sc->mpr_mtx, MA_OWNED);

        if (++sc->io_cmds_active > sc->io_cmds_highwater)
                sc->io_cmds_highwater++;

        KASSERT(cm->cm_state == MPR_CM_STATE_BUSY,
            ("command not busy, state = %u\n", cm->cm_state));
        cm->cm_state = MPR_CM_STATE_INQUEUE;

        if (sc->atomic_desc_capable) {
                rd.u.low = cm->cm_desc.Words.Low;
                mpr_regwrite(sc, MPI26_ATOMIC_REQUEST_DESCRIPTOR_POST_OFFSET,
                    rd.u.low);
        } else {
                rd.u.low = htole32(cm->cm_desc.Words.Low);
                rd.u.high = htole32(cm->cm_desc.Words.High);
                mpr_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_LOW_OFFSET,
                    rd.u.low);
                mpr_regwrite(sc, MPI2_REQUEST_DESCRIPTOR_POST_HIGH_OFFSET,
                    rd.u.high);
        }
}

/*
 * IOC Facts are read in 16-bit words and stored with le16toh.  This takes
 * care of proper endianness for the U8 fields in MPI2_IOC_FACTS_REPLY, but
 * we still need to swap the U16 fields back to host order.
 */
static void
adjust_iocfacts_endianness(MPI2_IOC_FACTS_REPLY *facts)
{
        facts->HeaderVersion = le16toh(facts->HeaderVersion);
        facts->Reserved1 = le16toh(facts->Reserved1);
        facts->IOCExceptions = le16toh(facts->IOCExceptions);
        facts->IOCStatus = le16toh(facts->IOCStatus);
        facts->IOCLogInfo = le32toh(facts->IOCLogInfo);
        facts->RequestCredit = le16toh(facts->RequestCredit);
        facts->ProductID = le16toh(facts->ProductID);
        facts->IOCCapabilities = le32toh(facts->IOCCapabilities);
        facts->IOCRequestFrameSize = le16toh(facts->IOCRequestFrameSize);
        facts->IOCMaxChainSegmentSize = le16toh(facts->IOCMaxChainSegmentSize);
        facts->MaxInitiators = le16toh(facts->MaxInitiators);
        facts->MaxTargets = le16toh(facts->MaxTargets);
        facts->MaxSasExpanders = le16toh(facts->MaxSasExpanders);
        facts->MaxEnclosures = le16toh(facts->MaxEnclosures);
        facts->ProtocolFlags = le16toh(facts->ProtocolFlags);
        facts->HighPriorityCredit = le16toh(facts->HighPriorityCredit);
        facts->MaxReplyDescriptorPostQueueDepth =
            le16toh(facts->MaxReplyDescriptorPostQueueDepth);
        facts->MaxDevHandle = le16toh(facts->MaxDevHandle);
        facts->MaxPersistentEntries = le16toh(facts->MaxPersistentEntries);
        facts->MinDevHandle = le16toh(facts->MinDevHandle);
}
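
/*
 * Sketch (not compiled in) of the byte ordering adjust_iocfacts_endianness()
 * relies on.  Each 16-bit doorbell word is stored with le16toh(), which on
 * a big-endian host swaps the bytes so the reply sits in memory exactly as
 * the little-endian wire layout: U8 fields land at the right offsets, while
 * U16/U32 fields still need the le16toh()/le32toh() pass above.
 */
#if 0
        uint16_t word;  /* one doorbell data word; the low byte is the
                           first wire byte, the high byte the second */

        data16[i] = le16toh(word);      /* on BE this stores the first wire
                                           byte at the lower address */
        /* ...then, per multi-byte field, convert back to host order: */
        facts->RequestCredit = le16toh(facts->RequestCredit);
#endif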
 1191 
 1192 /*
 1193  * Just the FACTS, ma'am.
 1194  */
 1195 static int
 1196 mpr_get_iocfacts(struct mpr_softc *sc, MPI2_IOC_FACTS_REPLY *facts)
 1197 {
 1198         MPI2_DEFAULT_REPLY *reply;
 1199         MPI2_IOC_FACTS_REQUEST request;
 1200         int error, req_sz, reply_sz;
 1201 
 1202         MPR_FUNCTRACE(sc);
 1203         mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
 1204 
 1205         req_sz = sizeof(MPI2_IOC_FACTS_REQUEST);
 1206         reply_sz = sizeof(MPI2_IOC_FACTS_REPLY);
 1207         reply = (MPI2_DEFAULT_REPLY *)facts;
 1208 
 1209         bzero(&request, req_sz);
 1210         request.Function = MPI2_FUNCTION_IOC_FACTS;
 1211         error = mpr_request_sync(sc, &request, reply, req_sz, reply_sz, 5);
 1212 
 1213         adjust_iocfacts_endianness(facts);
 1214         mpr_dprint(sc, MPR_TRACE, "facts->IOCCapabilities 0x%x\n", facts->IOCCapabilities);
 1215 
 1216         mpr_dprint(sc, MPR_INIT, "%s exit, error= %d\n", __func__, error);
 1217         return (error);
 1218 }
 1219 
 1220 static int
 1221 mpr_send_iocinit(struct mpr_softc *sc)
 1222 {
 1223         MPI2_IOC_INIT_REQUEST   init;
 1224         MPI2_DEFAULT_REPLY      reply;
 1225         int req_sz, reply_sz, error;
 1226         struct timeval now;
 1227         uint64_t time_in_msec;
 1228 
 1229         MPR_FUNCTRACE(sc);
 1230         mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
 1231 
 1232         /* Do a quick sanity check on proper initialization */
 1233         if ((sc->pqdepth == 0) || (sc->fqdepth == 0) || (sc->reqframesz == 0)
 1234             || (sc->replyframesz == 0)) {
 1235                 mpr_dprint(sc, MPR_INIT|MPR_ERROR,
 1236                     "Driver not fully initialized for IOCInit\n");
 1237                 return (EINVAL);
 1238         }
 1239 
 1240         req_sz = sizeof(MPI2_IOC_INIT_REQUEST);
 1241         reply_sz = sizeof(MPI2_IOC_INIT_REPLY);
 1242         bzero(&init, req_sz);
 1243         bzero(&reply, reply_sz);
 1244 
 1245         /*
 1246          * Fill in the init block.  Note that most addresses are
 1247          * deliberately in the lower 32bits of memory.  This is a micro-
 1248          * optimzation for PCI/PCIX, though it's not clear if it helps PCIe.
 1249          */
 1250         init.Function = MPI2_FUNCTION_IOC_INIT;
 1251         init.WhoInit = MPI2_WHOINIT_HOST_DRIVER;
 1252         init.MsgVersion = htole16(MPI2_VERSION);
 1253         init.HeaderVersion = htole16(MPI2_HEADER_VERSION);
 1254         init.SystemRequestFrameSize = htole16((uint16_t)(sc->reqframesz / 4));
 1255         init.ReplyDescriptorPostQueueDepth = htole16(sc->pqdepth);
 1256         init.ReplyFreeQueueDepth = htole16(sc->fqdepth);
 1257         init.SenseBufferAddressHigh = 0;
 1258         init.SystemReplyAddressHigh = 0;
 1259         init.SystemRequestFrameBaseAddress.High = 0;
 1260         init.SystemRequestFrameBaseAddress.Low =
 1261             htole32((uint32_t)sc->req_busaddr);
 1262         init.ReplyDescriptorPostQueueAddress.High = 0;
 1263         init.ReplyDescriptorPostQueueAddress.Low =
 1264             htole32((uint32_t)sc->post_busaddr);
 1265         init.ReplyFreeQueueAddress.High = 0;
 1266         init.ReplyFreeQueueAddress.Low = htole32((uint32_t)sc->free_busaddr);
 1267         getmicrotime(&now);
 1268         time_in_msec = (uint64_t)now.tv_sec * 1000 + now.tv_usec / 1000;
 1269         init.TimeStamp.High = htole32((time_in_msec >> 32) & 0xFFFFFFFF);
 1270         init.TimeStamp.Low = htole32(time_in_msec & 0xFFFFFFFF);
 1271         init.HostPageSize = HOST_PAGE_SIZE_4K;
 1272 
 1273         error = mpr_request_sync(sc, &init, &reply, req_sz, reply_sz, 5);
 1274         if ((le16toh(reply.IOCStatus) & MPI2_IOCSTATUS_MASK) !=
                  MPI2_IOCSTATUS_SUCCESS)
 1275                 error = ENXIO;
 1276 
 1277         mpr_dprint(sc, MPR_INIT, "IOCInit status= 0x%x\n",
                  le16toh(reply.IOCStatus));
 1278         mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
 1279         return (error);
 1280 }
 1281 
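      /*
       * Trivial busdma load callback: record the bus address of the single
       * segment in the caller-provided pointer.
       */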
 1282 void
 1283 mpr_memaddr_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 1284 {
 1285         bus_addr_t *addr;
 1286 
 1287         addr = arg;
 1288         *addr = segs[0].ds_addr;
 1289 }
 1290 
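      /*
       * Busdma load callback for callers that sleep on a possibly-deferred
       * load.  If the sleeper gave up and marked the context abandoned, the
       * mapping is unloaded and the context freed here rather than waking a
       * waiter that no longer exists.
       */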
 1291 void
 1292 mpr_memaddr_wait_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 1293 {
 1294         struct mpr_busdma_context *ctx;
 1295         int need_unload, need_free;
 1296 
 1297         ctx = (struct mpr_busdma_context *)arg;
 1298         need_unload = 0;
 1299         need_free = 0;
 1300 
 1301         mpr_lock(ctx->softc);
 1302         ctx->error = error;
 1303         ctx->completed = 1;
 1304         if ((error == 0) && (ctx->abandoned == 0)) {
 1305                 *ctx->addr = segs[0].ds_addr;
 1306         } else {
 1307                 if (nsegs != 0)
 1308                         need_unload = 1;
 1309                 if (ctx->abandoned != 0)
 1310                         need_free = 1;
 1311         }
 1312         if (need_free == 0)
 1313                 wakeup(ctx);
 1314 
 1315         mpr_unlock(ctx->softc);
 1316 
 1317         if (need_unload != 0) {
 1318                 bus_dmamap_unload(ctx->buffer_dmat,
 1319                                   ctx->buffer_dmamap);
 1320                 *ctx->addr = 0;
 1321         }
 1322 
 1323         if (need_free != 0)
 1324                 free(ctx, M_MPR);
 1325 }
 1326 
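      /*
       * Allocate the per-interrupt queue bookkeeping, one queue for each
       * MSI-X vector that was negotiated.
       */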
 1327 static int
 1328 mpr_alloc_queues(struct mpr_softc *sc)
 1329 {
 1330         struct mpr_queue *q;
 1331         int nq, i;
 1332 
 1333         nq = sc->msi_msgs;
 1334         mpr_dprint(sc, MPR_INIT|MPR_XINFO, "Allocating %d I/O queues\n", nq);
 1335 
 1336         sc->queues = malloc(sizeof(struct mpr_queue) * nq, M_MPR,
 1337              M_NOWAIT|M_ZERO);
 1338         if (sc->queues == NULL)
 1339                 return (ENOMEM);
 1340 
 1341         for (i = 0; i < nq; i++) {
 1342                 q = &sc->queues[i];
 1343                 mpr_dprint(sc, MPR_INIT, "Configuring queue %d %p\n", i, q);
 1344                 q->sc = sc;
 1345                 q->qnum = i;
 1346         }
 1347         return (0);
 1348 }
 1349 
 1350 static int
 1351 mpr_alloc_hw_queues(struct mpr_softc *sc)
 1352 {
 1353         bus_dma_template_t t;
 1354         bus_addr_t queues_busaddr;
 1355         uint8_t *queues;
 1356         int qsize, fqsize, pqsize;
 1357 
 1358         /*
 1359          * The reply free queue contains 4 byte entries in multiples of 16 and
 1360          * aligned on a 16 byte boundary. There must always be an unused entry.
 1361          * This queue supplies fresh reply frames for the firmware to use.
 1362          *
 1363          * The reply descriptor post queue contains 8 byte entries in
 1364          * multiples of 16 and aligned on a 16 byte boundary.  This queue
 1365          * contains filled-in reply frames sent from the firmware to the host.
 1366          *
 1367          * These two queues are allocated together for simplicity.
 1368          */
 1369         sc->fqdepth = roundup2(sc->num_replies + 1, 16);
 1370         sc->pqdepth = roundup2(sc->num_replies + 1, 16);
 1371         fqsize = sc->fqdepth * 4;
 1372         pqsize = sc->pqdepth * 8;
 1373         qsize = fqsize + pqsize;
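              /*
               * Worked example (hypothetical sizes): with num_replies == 1023,
               * both depths round up to 1024, giving fqsize = 4096 bytes and
               * pqsize = 8192 bytes, i.e. a single 12KB allocation below.
               */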
 1374 
 1375         bus_dma_template_init(&t, sc->mpr_parent_dmat);
 1376         BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(16), BD_MAXSIZE(qsize),
 1377             BD_MAXSEGSIZE(qsize), BD_NSEGMENTS(1),
 1378             BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
 1379         if (bus_dma_template_tag(&t, &sc->queues_dmat)) {
 1380                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate queues DMA tag\n");
 1381                 return (ENOMEM);
 1382         }
 1383         if (bus_dmamem_alloc(sc->queues_dmat, (void **)&queues, BUS_DMA_NOWAIT,
 1384             &sc->queues_map)) {
 1385                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate queues memory\n");
 1386                 return (ENOMEM);
 1387         }
 1388         bzero(queues, qsize);
 1389         bus_dmamap_load(sc->queues_dmat, sc->queues_map, queues, qsize,
 1390             mpr_memaddr_cb, &queues_busaddr, 0);
 1391 
 1392         sc->free_queue = (uint32_t *)queues;
 1393         sc->free_busaddr = queues_busaddr;
 1394         sc->post_queue = (MPI2_REPLY_DESCRIPTORS_UNION *)(queues + fqsize);
 1395         sc->post_busaddr = queues_busaddr + fqsize;
 1396         mpr_dprint(sc, MPR_INIT, "free queue busaddr= %#016jx size= %d\n",
 1397             (uintmax_t)sc->free_busaddr, fqsize);
 1398         mpr_dprint(sc, MPR_INIT, "reply queue busaddr= %#016jx size= %d\n",
 1399             (uintmax_t)sc->post_busaddr, pqsize);
 1400 
 1401         return (0);
 1402 }
 1403 
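      /*
       * Allocate the DMA-safe memory backing the reply frames that are
       * handed to the IOC through the reply free queue.
       */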
 1404 static int
 1405 mpr_alloc_replies(struct mpr_softc *sc)
 1406 {
 1407         bus_dma_template_t t;
 1408         int rsize, num_replies;
 1409 
 1410         /* Store the reply frame size in bytes rather than as 32bit words */
 1411         sc->replyframesz = sc->facts->ReplyFrameSize * 4;
 1412 
 1413         /*
 1414          * sc->num_replies should be one less than sc->fqdepth.  We need to
 1415          * allocate space for sc->fqdepth replies, but only sc->num_replies
 1416          * replies can be used at once.
 1417          */
 1418         num_replies = max(sc->fqdepth, sc->num_replies);
 1419 
 1420         rsize = sc->replyframesz * num_replies; 
 1421         bus_dma_template_init(&t, sc->mpr_parent_dmat);
 1422         BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(4), BD_MAXSIZE(rsize),
 1423             BD_MAXSEGSIZE(rsize), BD_NSEGMENTS(1),
 1424             BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
 1425         if (bus_dma_template_tag(&t, &sc->reply_dmat)) {
 1426                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate replies DMA tag\n");
 1427                 return (ENOMEM);
 1428         }
 1429         if (bus_dmamem_alloc(sc->reply_dmat, (void **)&sc->reply_frames,
 1430             BUS_DMA_NOWAIT, &sc->reply_map)) {
 1431                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate replies memory\n");
 1432                 return (ENOMEM);
 1433         }
 1434         bzero(sc->reply_frames, rsize);
 1435         bus_dmamap_load(sc->reply_dmat, sc->reply_map, sc->reply_frames, rsize,
 1436             mpr_memaddr_cb, &sc->reply_busaddr, 0);
 1437         mpr_dprint(sc, MPR_INIT, "reply frames busaddr= %#016jx size= %d\n",
 1438             (uintmax_t)sc->reply_busaddr, rsize);
 1439 
 1440         return (0);
 1441 }
 1442 
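      /*
       * Busdma load callback for the chain frames.  Walk each returned
       * segment, carve it into chain_frame_size pieces, and put each piece
       * on the free chain list; a segment tail too small to hold a whole
       * frame is skipped.
       */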
 1443 static void
 1444 mpr_load_chains_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 1445 {
 1446         struct mpr_softc *sc = arg;
 1447         struct mpr_chain *chain;
 1448         bus_size_t bo;
 1449         int i, o, s;
 1450 
 1451         if (error != 0)
 1452                 return;
 1453 
 1454         for (i = 0, o = 0, s = 0; s < nsegs; s++) {
 1455                 for (bo = 0; bo + sc->chain_frame_size <= segs[s].ds_len;
 1456                     bo += sc->chain_frame_size) {
 1457                         chain = &sc->chains[i++];
 1458                         chain->chain =
                                  (MPI2_SGE_IO_UNION *)(sc->chain_frames + o);
 1459                         chain->chain_busaddr = segs[s].ds_addr + bo;
 1460                         o += sc->chain_frame_size;
 1461                         mpr_free_chain(sc, chain);
 1462                 }
 1463                 if (bo != segs[s].ds_len)
 1464                         o += segs[s].ds_len - bo;
 1465         }
 1466         sc->chain_free_lowwater = i;
 1467 }
 1468 
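      /*
       * Allocate DMA memory and bookkeeping for the request, chain, and
       * sense buffers, NVMe PRP pages when the IOC supports NVMe devices,
       * the data buffer DMA tag, and the per-command structures, seeding
       * the free command lists along the way.
       */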
 1469 static int
 1470 mpr_alloc_requests(struct mpr_softc *sc)
 1471 {
 1472         bus_dma_template_t t;
 1473         struct mpr_command *cm;
 1474         int i, rsize, nsegs;
 1475 
 1476         rsize = sc->reqframesz * sc->num_reqs;
 1477         bus_dma_template_init(&t, sc->mpr_parent_dmat);
 1478         BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(16), BD_MAXSIZE(rsize),
 1479             BD_MAXSEGSIZE(rsize), BD_NSEGMENTS(1),
 1480             BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
 1481         if (bus_dma_template_tag(&t, &sc->req_dmat)) {
 1482                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate request DMA tag\n");
 1483                 return (ENOMEM);
 1484         }
 1485         if (bus_dmamem_alloc(sc->req_dmat, (void **)&sc->req_frames,
 1486             BUS_DMA_NOWAIT, &sc->req_map)) {
 1487                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate request memory\n");
 1488                 return (ENOMEM);
 1489         }
 1490         bzero(sc->req_frames, rsize);
 1491         bus_dmamap_load(sc->req_dmat, sc->req_map, sc->req_frames, rsize,
 1492             mpr_memaddr_cb, &sc->req_busaddr, 0);
 1493         mpr_dprint(sc, MPR_INIT, "request frames busaddr= %#016jx size= %d\n",
 1494             (uintmax_t)sc->req_busaddr, rsize);
 1495 
 1496         sc->chains = malloc(sizeof(struct mpr_chain) * sc->num_chains, M_MPR,
 1497             M_NOWAIT | M_ZERO);
 1498         if (!sc->chains) {
 1499                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate chain memory\n");
 1500                 return (ENOMEM);
 1501         }
 1502         rsize = sc->chain_frame_size * sc->num_chains;
 1503         bus_dma_template_init(&t, sc->mpr_parent_dmat);
 1504         BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(16), BD_MAXSIZE(rsize),
 1505             BD_MAXSEGSIZE(rsize), BD_NSEGMENTS((howmany(rsize, PAGE_SIZE))));
 1506         if (bus_dma_template_tag(&t, &sc->chain_dmat)) {
 1507                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate chain DMA tag\n");
 1508                 return (ENOMEM);
 1509         }
 1510         if (bus_dmamem_alloc(sc->chain_dmat, (void **)&sc->chain_frames,
 1511             BUS_DMA_NOWAIT | BUS_DMA_ZERO, &sc->chain_map)) {
 1512                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate chain memory\n");
 1513                 return (ENOMEM);
 1514         }
 1515         if (bus_dmamap_load(sc->chain_dmat, sc->chain_map, sc->chain_frames,
 1516             rsize, mpr_load_chains_cb, sc, BUS_DMA_NOWAIT)) {
 1517                 mpr_dprint(sc, MPR_ERROR, "Cannot load chain memory\n");
 1518                 bus_dmamem_free(sc->chain_dmat, sc->chain_frames,
 1519                     sc->chain_map);
 1520                 return (ENOMEM);
 1521         }
 1522 
 1523         rsize = MPR_SENSE_LEN * sc->num_reqs;
 1524         bus_dma_template_clone(&t, sc->req_dmat);
 1525         BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(1), BD_MAXSIZE(rsize),
 1526             BD_MAXSEGSIZE(rsize));
 1527         if (bus_dma_template_tag(&t, &sc->sense_dmat)) {
 1528                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate sense DMA tag\n");
 1529                 return (ENOMEM);
 1530         }
 1531         if (bus_dmamem_alloc(sc->sense_dmat, (void **)&sc->sense_frames,
 1532             BUS_DMA_NOWAIT, &sc->sense_map)) {
 1533                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate sense memory\n");
 1534                 return (ENOMEM);
 1535         }
 1536         bzero(sc->sense_frames, rsize);
 1537         bus_dmamap_load(sc->sense_dmat, sc->sense_map, sc->sense_frames, rsize,
 1538             mpr_memaddr_cb, &sc->sense_busaddr, 0);
 1539         mpr_dprint(sc, MPR_INIT, "sense frames busaddr= %#016jx size= %d\n",
 1540             (uintmax_t)sc->sense_busaddr, rsize);
 1541 
 1542         /*
 1543          * Allocate NVMe PRP Pages for NVMe SGL support only if the FW supports
 1544          * these devices.
 1545          */
 1546         if ((sc->facts->MsgVersion >= MPI2_VERSION_02_06) &&
 1547             (sc->facts->ProtocolFlags & MPI2_IOCFACTS_PROTOCOL_NVME_DEVICES)) {
 1548                 if (mpr_alloc_nvme_prp_pages(sc) == ENOMEM)
 1549                         return (ENOMEM);
 1550         }
 1551 
 1552         nsegs = (sc->maxio / PAGE_SIZE) + 1;
 1553         bus_dma_template_init(&t, sc->mpr_parent_dmat);
 1554         BUS_DMA_TEMPLATE_FILL(&t, BD_MAXSIZE(BUS_SPACE_MAXSIZE_32BIT),
 1555             BD_NSEGMENTS(nsegs), BD_MAXSEGSIZE(BUS_SPACE_MAXSIZE_32BIT),
 1556             BD_FLAGS(BUS_DMA_ALLOCNOW), BD_LOCKFUNC(busdma_lock_mutex),
 1557             BD_LOCKFUNCARG(&sc->mpr_mtx));
 1558         if (bus_dma_template_tag(&t, &sc->buffer_dmat)) {
 1559                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate buffer DMA tag\n");
 1560                 return (ENOMEM);
 1561         }
 1562 
 1563         /*
 1564          * SMID 0 cannot be used as a free command per the firmware spec.
 1565          * Just drop that command instead of risking accounting bugs.
 1566          */
 1567         sc->commands = malloc(sizeof(struct mpr_command) * sc->num_reqs,
 1568             M_MPR, M_WAITOK | M_ZERO);
 1569         for (i = 1; i < sc->num_reqs; i++) {
 1570                 cm = &sc->commands[i];
 1571                 cm->cm_req = sc->req_frames + i * sc->reqframesz;
 1572                 cm->cm_req_busaddr = sc->req_busaddr + i * sc->reqframesz;
 1573                 cm->cm_sense = &sc->sense_frames[i];
 1574                 cm->cm_sense_busaddr = sc->sense_busaddr + i * MPR_SENSE_LEN;
 1575                 cm->cm_desc.Default.SMID = htole16(i);
 1576                 cm->cm_sc = sc;
 1577                 cm->cm_state = MPR_CM_STATE_BUSY;
 1578                 TAILQ_INIT(&cm->cm_chain_list);
 1579                 TAILQ_INIT(&cm->cm_prp_page_list);
 1580                 callout_init_mtx(&cm->cm_callout, &sc->mpr_mtx, 0);
 1581 
 1582                 /* XXX Is a failure here a critical problem? */
 1583                 if (bus_dmamap_create(sc->buffer_dmat, 0, &cm->cm_dmamap)
 1584                     == 0) {
 1585                         if (i <= sc->num_prireqs)
 1586                                 mpr_free_high_priority_command(sc, cm);
 1587                         else
 1588                                 mpr_free_command(sc, cm);
 1589                 } else {
 1590                         panic("failed to allocate command %d\n", i);
 1593                 }
 1594         }
 1595 
 1596         return (0);
 1597 }
 1598 
 1599 /*
 1600  * Allocate contiguous buffers for building native PRPs, the form of
 1601  * scatter/gather list that PCIe NVMe devices use.
 1602  *
 1603  * This buffer must be contiguous due to the nature of how NVMe PRPs are built
 1604  * and translated by FW.
 1605  *
 1606  * Returns ENOMEM if memory could not be allocated, otherwise returns 0.
 1607  */
 1608 static int
 1609 mpr_alloc_nvme_prp_pages(struct mpr_softc *sc)
 1610 {
 1611         bus_dma_template_t t;
 1612         struct mpr_prp_page *prp_page;
 1613         int PRPs_per_page, PRPs_required, pages_required;
 1614         int rsize, i;
 1615 
 1616         /*
 1617          * Assuming a MAX_IO_SIZE of 1MB and a PAGE_SIZE of 4k, the max number
 1618          * of PRPs (NVMe's Scatter/Gather Element) needed per I/O is:
 1619          * MAX_IO_SIZE / PAGE_SIZE = 256
 1620          * 
 1621          * 1 PRP entry in main frame for PRP list pointer still leaves 255 PRPs
 1622          * required for the remainder of the 1MB I/O. 512 PRPs can fit into one
 1623          * page (4096 / 8 = 512), so only one page is required for each I/O.
 1624          *
 1625          * Each of these buffers will need to be contiguous. For simplicity,
 1626          * only one buffer is allocated here, which has all of the space
 1627          * required for the NVMe Queue Depth. If there are problems allocating
 1628          * this one buffer, this function will need to change to allocate
 1629          * individual, contiguous NVME_QDEPTH buffers.
 1630          *
 1631          * The real calculation will use the real max io size. Above is just an
 1632          * example.
 1633          *
 1634          */
 1635         PRPs_required = sc->maxio / PAGE_SIZE;
 1636         PRPs_per_page = (PAGE_SIZE / PRP_ENTRY_SIZE) - 1;
 1637         pages_required = (PRPs_required / PRPs_per_page) + 1;
 1638 
 1639         sc->prp_buffer_size = PAGE_SIZE * pages_required; 
 1640         rsize = sc->prp_buffer_size * NVME_QDEPTH; 
 1641         bus_dma_template_init(&t, sc->mpr_parent_dmat);
 1642         BUS_DMA_TEMPLATE_FILL(&t, BD_ALIGNMENT(4), BD_MAXSIZE(rsize),
 1643             BD_MAXSEGSIZE(rsize), BD_NSEGMENTS(1),
 1644             BD_LOWADDR(BUS_SPACE_MAXADDR_32BIT));
 1645         if (bus_dma_template_tag(&t, &sc->prp_page_dmat)) {
 1646                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate NVMe PRP DMA "
 1647                     "tag\n");
 1648                 return (ENOMEM);
 1649         }
 1650         if (bus_dmamem_alloc(sc->prp_page_dmat, (void **)&sc->prp_pages,
 1651             BUS_DMA_NOWAIT, &sc->prp_page_map)) {
 1652                 mpr_dprint(sc, MPR_ERROR, "Cannot allocate NVMe PRP memory\n");
 1653                 return (ENOMEM);
 1654         }
 1655         bzero(sc->prp_pages, rsize);
 1656         bus_dmamap_load(sc->prp_page_dmat, sc->prp_page_map, sc->prp_pages,
 1657             rsize, mpr_memaddr_cb, &sc->prp_page_busaddr, 0);
 1658 
 1659         sc->prps = malloc(sizeof(struct mpr_prp_page) * NVME_QDEPTH, M_MPR,
 1660             M_WAITOK | M_ZERO);
 1661         for (i = 0; i < NVME_QDEPTH; i++) {
 1662                 prp_page = &sc->prps[i];
 1663                 prp_page->prp_page = (uint64_t *)(sc->prp_pages +
 1664                     i * sc->prp_buffer_size);
 1665                 prp_page->prp_page_busaddr = (uint64_t)(sc->prp_page_busaddr +
 1666                     i * sc->prp_buffer_size);
 1667                 mpr_free_prp_page(sc, prp_page);
 1668                 sc->prp_pages_free_lowwater++;
 1669         }
 1670 
 1671         return (0);
 1672 }
 1673 
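      /*
       * Prime the hardware queues: poison the post queue with 0xff so all
       * descriptors initially look unused, and fill the reply free queue
       * with the bus addresses of the allocated reply frames.
       */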
 1674 static int
 1675 mpr_init_queues(struct mpr_softc *sc)
 1676 {
 1677         int i;
 1678 
 1679         memset((uint8_t *)sc->post_queue, 0xff, sc->pqdepth * 8);
 1680 
 1681         /*
 1682          * According to the spec, we need to use one less reply than we
 1683          * have space for on the queue.  So sc->num_replies (the number we
 1684          * use) should be less than sc->fqdepth (allocated size).
 1685          */
 1686         if (sc->num_replies >= sc->fqdepth)
 1687                 return (EINVAL);
 1688 
 1689         /*
 1690          * Initialize all of the free queue entries.
 1691          */
 1692         for (i = 0; i < sc->fqdepth; i++) {
 1693                 sc->free_queue[i] =
                          htole32(sc->reply_busaddr + (i * sc->replyframesz));
 1694         }
 1695         sc->replyfreeindex = sc->num_replies;
 1696 
 1697         return (0);
 1698 }
 1699 
 1700 /*
 1701  * Get the driver parameter tunables.  Lowest priority are the driver
 1702  * defaults.  Next are the global settings, if they exist.  Highest are the
 1703  * per-unit settings, if they exist.
       */
 1704 void
 1705 mpr_get_tunables(struct mpr_softc *sc)
 1706 {
 1707         char tmpstr[80], mpr_debug[80];
 1708 
 1709         /* XXX default to some debugging for now */
 1710         sc->mpr_debug = MPR_INFO | MPR_FAULT;
 1711         sc->disable_msix = 0;
 1712         sc->disable_msi = 0;
 1713         sc->max_msix = MPR_MSIX_MAX;
 1714         sc->max_chains = MPR_CHAIN_FRAMES;
 1715         sc->max_io_pages = MPR_MAXIO_PAGES;
 1716         sc->enable_ssu = MPR_SSU_ENABLE_SSD_DISABLE_HDD;
 1717         sc->spinup_wait_time = DEFAULT_SPINUP_WAIT;
 1718         sc->use_phynum = 1;
 1719         sc->max_reqframes = MPR_REQ_FRAMES;
 1720         sc->max_prireqframes = MPR_PRI_REQ_FRAMES;
 1721         sc->max_replyframes = MPR_REPLY_FRAMES;
 1722         sc->max_evtframes = MPR_EVT_REPLY_FRAMES;
 1723 
 1724         /*
 1725          * Grab the global variables.
 1726          */
 1727         bzero(mpr_debug, sizeof(mpr_debug));
 1728         if (TUNABLE_STR_FETCH("hw.mpr.debug_level", mpr_debug,
                  sizeof(mpr_debug)) != 0)
 1729                 mpr_parse_debug(sc, mpr_debug);
 1730         TUNABLE_INT_FETCH("hw.mpr.disable_msix", &sc->disable_msix);
 1731         TUNABLE_INT_FETCH("hw.mpr.disable_msi", &sc->disable_msi);
 1732         TUNABLE_INT_FETCH("hw.mpr.max_msix", &sc->max_msix);
 1733         TUNABLE_INT_FETCH("hw.mpr.max_chains", &sc->max_chains);
 1734         TUNABLE_INT_FETCH("hw.mpr.max_io_pages", &sc->max_io_pages);
 1735         TUNABLE_INT_FETCH("hw.mpr.enable_ssu", &sc->enable_ssu);
 1736         TUNABLE_INT_FETCH("hw.mpr.spinup_wait_time", &sc->spinup_wait_time);
 1737         TUNABLE_INT_FETCH("hw.mpr.use_phy_num", &sc->use_phynum);
 1738         TUNABLE_INT_FETCH("hw.mpr.max_reqframes", &sc->max_reqframes);
 1739         TUNABLE_INT_FETCH("hw.mpr.max_prireqframes", &sc->max_prireqframes);
 1740         TUNABLE_INT_FETCH("hw.mpr.max_replyframes", &sc->max_replyframes);
 1741         TUNABLE_INT_FETCH("hw.mpr.max_evtframes", &sc->max_evtframes);
 1742 
 1743         /* Grab the unit-instance variables */
 1744         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.debug_level",
 1745             device_get_unit(sc->mpr_dev));
 1746         bzero(mpr_debug, sizeof(mpr_debug));
 1747         if (TUNABLE_STR_FETCH(tmpstr, mpr_debug, sizeof(mpr_debug)) != 0)
 1748                 mpr_parse_debug(sc, mpr_debug);
 1749 
 1750         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.disable_msix",
 1751             device_get_unit(sc->mpr_dev));
 1752         TUNABLE_INT_FETCH(tmpstr, &sc->disable_msix);
 1753 
 1754         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.disable_msi",
 1755             device_get_unit(sc->mpr_dev));
 1756         TUNABLE_INT_FETCH(tmpstr, &sc->disable_msi);
 1757 
 1758         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_msix",
 1759             device_get_unit(sc->mpr_dev));
 1760         TUNABLE_INT_FETCH(tmpstr, &sc->max_msix);
 1761 
 1762         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_chains",
 1763             device_get_unit(sc->mpr_dev));
 1764         TUNABLE_INT_FETCH(tmpstr, &sc->max_chains);
 1765 
 1766         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_io_pages",
 1767             device_get_unit(sc->mpr_dev));
 1768         TUNABLE_INT_FETCH(tmpstr, &sc->max_io_pages);
 1769 
 1770         bzero(sc->exclude_ids, sizeof(sc->exclude_ids));
 1771         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.exclude_ids",
 1772             device_get_unit(sc->mpr_dev));
 1773         TUNABLE_STR_FETCH(tmpstr, sc->exclude_ids, sizeof(sc->exclude_ids));
 1774 
 1775         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.enable_ssu",
 1776             device_get_unit(sc->mpr_dev));
 1777         TUNABLE_INT_FETCH(tmpstr, &sc->enable_ssu);
 1778 
 1779         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.spinup_wait_time",
 1780             device_get_unit(sc->mpr_dev));
 1781         TUNABLE_INT_FETCH(tmpstr, &sc->spinup_wait_time);
 1782 
 1783         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.use_phy_num",
 1784             device_get_unit(sc->mpr_dev));
 1785         TUNABLE_INT_FETCH(tmpstr, &sc->use_phynum);
 1786 
 1787         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_reqframes",
 1788             device_get_unit(sc->mpr_dev));
 1789         TUNABLE_INT_FETCH(tmpstr, &sc->max_reqframes);
 1790 
 1791         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_prireqframes",
 1792             device_get_unit(sc->mpr_dev));
 1793         TUNABLE_INT_FETCH(tmpstr, &sc->max_prireqframes);
 1794 
 1795         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_replyframes",
 1796             device_get_unit(sc->mpr_dev));
 1797         TUNABLE_INT_FETCH(tmpstr, &sc->max_replyframes);
 1798 
 1799         snprintf(tmpstr, sizeof(tmpstr), "dev.mpr.%d.max_evtframes",
 1800             device_get_unit(sc->mpr_dev));
 1801         TUNABLE_INT_FETCH(tmpstr, &sc->max_evtframes);
 1802 }
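      /*
       * As an illustration (values are examples, not recommendations), the
       * tunables fetched above are normally set in loader.conf(5):
       *
       *      hw.mpr.debug_level="info,fault"
       *      dev.mpr.0.max_chains="4096"
       */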
 1803 
 1804 static void
 1805 mpr_setup_sysctl(struct mpr_softc *sc)
 1806 {
 1807         struct sysctl_ctx_list  *sysctl_ctx = NULL;
 1808         struct sysctl_oid       *sysctl_tree = NULL;
 1809         char tmpstr[80], tmpstr2[80];
 1810 
 1811         /*
 1812          * Setup the sysctl variable so the user can change the debug level
 1813          * on the fly.
 1814          */
 1815         snprintf(tmpstr, sizeof(tmpstr), "MPR controller %d",
 1816             device_get_unit(sc->mpr_dev));
 1817         snprintf(tmpstr2, sizeof(tmpstr2), "%d", device_get_unit(sc->mpr_dev));
 1818 
 1819         sysctl_ctx = device_get_sysctl_ctx(sc->mpr_dev);
 1820         if (sysctl_ctx != NULL)
 1821                 sysctl_tree = device_get_sysctl_tree(sc->mpr_dev);
 1822 
 1823         if (sysctl_tree == NULL) {
 1824                 sysctl_ctx_init(&sc->sysctl_ctx);
 1825                 sc->sysctl_tree = SYSCTL_ADD_NODE(&sc->sysctl_ctx,
 1826                     SYSCTL_STATIC_CHILDREN(_hw_mpr), OID_AUTO, tmpstr2,
 1827                     CTLFLAG_RD | CTLFLAG_MPSAFE, 0, tmpstr);
 1828                 if (sc->sysctl_tree == NULL)
 1829                         return;
 1830                 sysctl_ctx = &sc->sysctl_ctx;
 1831                 sysctl_tree = sc->sysctl_tree;
 1832         }
 1833 
 1834         SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1835             OID_AUTO, "debug_level", CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE,
 1836             sc, 0, mpr_debug_sysctl, "A", "mpr debug level");
 1837 
 1838         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1839             OID_AUTO, "disable_msix", CTLFLAG_RD, &sc->disable_msix, 0,
 1840             "Disable the use of MSI-X interrupts");
 1841 
 1842         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1843             OID_AUTO, "max_msix", CTLFLAG_RD, &sc->max_msix, 0,
 1844             "User-defined maximum number of MSIX queues");
 1845 
 1846         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1847             OID_AUTO, "msix_msgs", CTLFLAG_RD, &sc->msi_msgs, 0,
 1848             "Negotiated number of MSIX queues");
 1849 
 1850         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1851             OID_AUTO, "max_reqframes", CTLFLAG_RD, &sc->max_reqframes, 0,
 1852             "Total number of allocated request frames");
 1853 
 1854         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1855             OID_AUTO, "max_prireqframes", CTLFLAG_RD, &sc->max_prireqframes, 0,
 1856             "Total number of allocated high priority request frames");
 1857 
 1858         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1859             OID_AUTO, "max_replyframes", CTLFLAG_RD, &sc->max_replyframes, 0,
 1860             "Total number of allocated reply frames");
 1861 
 1862         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1863             OID_AUTO, "max_evtframes", CTLFLAG_RD, &sc->max_evtframes, 0,
 1864             "Total number of event frames allocated");
 1865 
 1866         SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1867             OID_AUTO, "firmware_version", CTLFLAG_RD, sc->fw_version,
 1868             strlen(sc->fw_version), "firmware version");
 1869 
 1870         SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1871             OID_AUTO, "driver_version", CTLFLAG_RD, MPR_DRIVER_VERSION,
 1872             strlen(MPR_DRIVER_VERSION), "driver version");
 1873 
 1874         SYSCTL_ADD_STRING(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1875             OID_AUTO, "msg_version", CTLFLAG_RD, sc->msg_version,
 1876             strlen(sc->msg_version), "message interface version");
 1877 
 1878         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1879             OID_AUTO, "io_cmds_active", CTLFLAG_RD,
 1880             &sc->io_cmds_active, 0, "number of currently active commands");
 1881 
 1882         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1883             OID_AUTO, "io_cmds_highwater", CTLFLAG_RD,
 1884             &sc->io_cmds_highwater, 0, "maximum active commands seen");
 1885 
 1886         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1887             OID_AUTO, "chain_free", CTLFLAG_RD,
 1888             &sc->chain_free, 0, "number of free chain elements");
 1889 
 1890         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1891             OID_AUTO, "chain_free_lowwater", CTLFLAG_RD,
 1892             &sc->chain_free_lowwater, 0,"lowest number of free chain elements");
 1893 
 1894         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1895             OID_AUTO, "max_chains", CTLFLAG_RD,
 1896             &sc->max_chains, 0,"maximum chain frames that will be allocated");
 1897 
 1898         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1899             OID_AUTO, "max_io_pages", CTLFLAG_RD,
 1900             &sc->max_io_pages, 0,"maximum pages to allow per I/O (if <1 use "
 1901             "IOCFacts)");
 1902 
 1903         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1904             OID_AUTO, "enable_ssu", CTLFLAG_RW, &sc->enable_ssu, 0,
 1905             "enable SSU to SATA SSD/HDD at shutdown");
 1906 
 1907         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1908             OID_AUTO, "chain_alloc_fail", CTLFLAG_RD,
 1909             &sc->chain_alloc_fail, "chain allocation failures");
 1910 
 1911         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1912             OID_AUTO, "spinup_wait_time", CTLFLAG_RD,
 1913             &sc->spinup_wait_time, DEFAULT_SPINUP_WAIT, "seconds to wait for "
 1914             "spinup after SATA ID error");
 1915 
 1916         SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1917             OID_AUTO, "dump_reqs",
 1918             CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
 1919             sc, 0, mpr_dump_reqs, "I", "Dump Active Requests");
 1920 
 1921         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1922             OID_AUTO, "dump_reqs_alltypes", CTLFLAG_RW,
 1923             &sc->dump_reqs_alltypes, 0,
 1924             "dump all request types, not just in-queue");
 1925 
 1926         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1927             OID_AUTO, "use_phy_num", CTLFLAG_RD, &sc->use_phynum, 0,
 1928             "Use the phy number for enumeration");
 1929 
 1930         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1931             OID_AUTO, "prp_pages_free", CTLFLAG_RD,
 1932             &sc->prp_pages_free, 0, "number of free PRP pages");
 1933 
 1934         SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1935             OID_AUTO, "prp_pages_free_lowwater", CTLFLAG_RD,
 1936             &sc->prp_pages_free_lowwater, 0,"lowest number of free PRP pages");
 1937 
 1938         SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree),
 1939             OID_AUTO, "prp_page_alloc_fail", CTLFLAG_RD,
 1940             &sc->prp_page_alloc_fail, "PRP page allocation failures");
 1941 }
 1942 
 1943 static struct mpr_debug_string {
 1944         char *name;
 1945         int flag;
 1946 } mpr_debug_strings[] = {
 1947         {"info", MPR_INFO},
 1948         {"fault", MPR_FAULT},
 1949         {"event", MPR_EVENT},
 1950         {"log", MPR_LOG},
 1951         {"recovery", MPR_RECOVERY},
 1952         {"error", MPR_ERROR},
 1953         {"init", MPR_INIT},
 1954         {"xinfo", MPR_XINFO},
 1955         {"user", MPR_USER},
 1956         {"mapping", MPR_MAPPING},
 1957         {"trace", MPR_TRACE}
 1958 };
 1959 
 1960 enum mpr_debug_level_combiner {
 1961         COMB_NONE,
 1962         COMB_ADD,
 1963         COMB_SUB
 1964 };
 1965 
 1966 static int
 1967 mpr_debug_sysctl(SYSCTL_HANDLER_ARGS)
 1968 {
 1969         struct mpr_softc *sc;
 1970         struct mpr_debug_string *string;
 1971         struct sbuf *sbuf;
 1972         char *buffer;
 1973         size_t sz;
 1974         int i, len, debug, error;
 1975 
 1976         sc = (struct mpr_softc *)arg1;
 1977 
 1978         error = sysctl_wire_old_buffer(req, 0);
 1979         if (error != 0)
 1980                 return (error);
 1981 
 1982         sbuf = sbuf_new_for_sysctl(NULL, NULL, 128, req);
 1983         debug = sc->mpr_debug;
 1984 
 1985         sbuf_printf(sbuf, "%#x", debug);
 1986 
 1987         sz = sizeof(mpr_debug_strings) / sizeof(mpr_debug_strings[0]);
 1988         for (i = 0; i < sz; i++) {
 1989                 string = &mpr_debug_strings[i];
 1990                 if (debug & string->flag) 
 1991                         sbuf_printf(sbuf, ",%s", string->name);
 1992         }
 1993 
 1994         error = sbuf_finish(sbuf);
 1995         sbuf_delete(sbuf);
 1996 
 1997         if (error || req->newptr == NULL)
 1998                 return (error);
 1999 
 2000         len = req->newlen - req->newidx;
 2001         if (len == 0)
 2002                 return (0);
 2003 
              /* Allocate one extra byte so the string is always NUL-terminated. */
 2004         buffer = malloc(len + 1, M_MPR, M_ZERO|M_WAITOK);
 2005         error = SYSCTL_IN(req, buffer, len);
 2006 
 2007         if (error == 0)
                      mpr_parse_debug(sc, buffer);
 2008 
 2009         free(buffer, M_MPR);
 2010         return (error);
 2011 }
 2012 
 2013 static void
 2014 mpr_parse_debug(struct mpr_softc *sc, char *list)
 2015 {
 2016         struct mpr_debug_string *string;
 2017         enum mpr_debug_level_combiner op;
 2018         char *token, *endtoken;
 2019         size_t sz;
 2020         int flags, i;
 2021 
 2022         if (list == NULL || *list == '\0')
 2023                 return;
 2024 
 2025         if (*list == '+') {
 2026                 op = COMB_ADD;
 2027                 list++;
 2028         } else if (*list == '-') {
 2029                 op = COMB_SUB;
 2030                 list++;
 2031         } else
 2032                 op = COMB_NONE;
 2033         if (*list == '\0')
 2034                 return;
 2035 
 2036         flags = 0;
 2037         sz = sizeof(mpr_debug_strings) / sizeof(mpr_debug_strings[0]);
 2038         while ((token = strsep(&list, ":,")) != NULL) {
 2039                 /* Handle integer flags */
 2040                 flags |= strtol(token, &endtoken, 0);
 2041                 if (token != endtoken)
 2042                         continue;
 2043 
 2044                 /* Handle text flags */
 2045                 for (i = 0; i < sz; i++) {
 2046                         string = &mpr_debug_strings[i];
 2047                         if (strcasecmp(token, string->name) == 0) {
 2048                                 flags |= string->flag;
 2049                                 break;
 2050                         }
 2051                 }
 2052         }
 2053 
 2054         switch (op) {
 2055         case COMB_NONE:
 2056                 sc->mpr_debug = flags;
 2057                 break;
 2058         case COMB_ADD:
 2059                 sc->mpr_debug |= flags;
 2060                 break;
 2061         case COMB_SUB:
 2062                 sc->mpr_debug &= (~flags);
 2063                 break;
 2064         }
 2065         return;
 2066 }
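      /*
       * Illustrative debug_level strings accepted by the parser above:
       * "info,fault" replaces the whole mask, "+trace" adds MPR_TRACE to
       * the current mask, "-xinfo" clears MPR_XINFO, and numeric tokens
       * such as "0x200" are accepted alongside the names.
       */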
 2067 
 2068 struct mpr_dumpreq_hdr {
 2069         uint32_t        smid;
 2070         uint32_t        state;
 2071         uint32_t        numframes;
 2072         uint32_t        deschi;
 2073         uint32_t        desclo;
 2074 };
 2075 
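      /*
       * Sysctl handler that streams out the in-queue commands (or all
       * commands if dump_reqs_alltypes is set), each as an mpr_dumpreq_hdr
       * followed by the raw request frame and any chain frames.
       */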
 2076 static int
 2077 mpr_dump_reqs(SYSCTL_HANDLER_ARGS)
 2078 {
 2079         struct mpr_softc *sc;
 2080         struct mpr_chain *chain, *chain1;
 2081         struct mpr_command *cm;
 2082         struct mpr_dumpreq_hdr hdr;
 2083         struct sbuf *sb;
 2084         uint32_t smid, state;
 2085         int i, numreqs, error = 0;
 2086 
 2087         sc = (struct mpr_softc *)arg1;
 2088 
 2089         if ((error = priv_check(curthread, PRIV_DRIVER)) != 0) {
 2090                 printf("priv check error %d\n", error);
 2091                 return (error);
 2092         }
 2093 
 2094         state = MPR_CM_STATE_INQUEUE;
 2095         smid = 1;
 2096         numreqs = sc->num_reqs;
 2097 
 2098         if (req->newptr != NULL)
 2099                 return (EINVAL);
 2100 
 2101         if (smid == 0 || smid > sc->num_reqs)
 2102                 return (EINVAL);
 2103         if (numreqs <= 0 || (numreqs + smid > sc->num_reqs))
 2104                 numreqs = sc->num_reqs;
 2105         sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
 2106 
 2107         /* Best effort, no locking */
 2108         for (i = smid; i < numreqs; i++) {
 2109                 cm = &sc->commands[i];
 2110                 if ((sc->dump_reqs_alltypes == 0) && (cm->cm_state != state))
 2111                         continue;
 2112                 hdr.smid = i;
 2113                 hdr.state = cm->cm_state;
 2114                 hdr.numframes = 1;
 2115                 hdr.deschi = cm->cm_desc.Words.High;
 2116                 hdr.desclo = cm->cm_desc.Words.Low;
 2117                 TAILQ_FOREACH_SAFE(chain, &cm->cm_chain_list, chain_link,
 2118                    chain1)
 2119                         hdr.numframes++;
 2120                 sbuf_bcat(sb, &hdr, sizeof(hdr));
 2121                 sbuf_bcat(sb, cm->cm_req, 128);
 2122                 TAILQ_FOREACH_SAFE(chain, &cm->cm_chain_list, chain_link,
 2123                     chain1)
 2124                         sbuf_bcat(sb, chain->chain, 128);
 2125         }
 2126 
 2127         error = sbuf_finish(sb);
 2128         sbuf_delete(sb);
 2129         return (error);
 2130 }
 2131 
 2132 int
 2133 mpr_attach(struct mpr_softc *sc)
 2134 {
 2135         int error;
 2136 
 2137         MPR_FUNCTRACE(sc);
 2138         mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
 2139 
 2140         mtx_init(&sc->mpr_mtx, "MPR lock", NULL, MTX_DEF);
 2141         callout_init_mtx(&sc->periodic, &sc->mpr_mtx, 0);
 2142         callout_init_mtx(&sc->device_check_callout, &sc->mpr_mtx, 0);
 2143         TAILQ_INIT(&sc->event_list);
 2144         timevalclear(&sc->lastfail);
 2145 
 2146         if ((error = mpr_transition_ready(sc)) != 0) {
 2147                 mpr_dprint(sc, MPR_INIT|MPR_FAULT,
 2148                     "Failed to transition ready\n");
 2149                 return (error);
 2150         }
 2151 
 2152         sc->facts = malloc(sizeof(MPI2_IOC_FACTS_REPLY), M_MPR,
 2153             M_ZERO|M_NOWAIT);
 2154         if (!sc->facts) {
 2155                 mpr_dprint(sc, MPR_INIT|MPR_FAULT,
 2156                     "Cannot allocate memory, exit\n");
 2157                 return (ENOMEM);
 2158         }
 2159 
 2160         /*
 2161          * Get IOC Facts and allocate all structures based on this information.
 2162          * A Diag Reset will also call mpr_iocfacts_allocate and re-read the IOC
 2163          * Facts. If relevant values have changed in IOC Facts, this function
 2164          * will free all of the memory based on IOC Facts and reallocate that
 2165          * memory.  If this fails, any allocated memory should already be freed.
 2166          */
 2167         if ((error = mpr_iocfacts_allocate(sc, TRUE)) != 0) {
 2168                 mpr_dprint(sc, MPR_INIT|MPR_FAULT, "IOC Facts allocation "
 2169                     "failed with error %d\n", error);
 2170                 return (error);
 2171         }
 2172 
 2173         /* Start the periodic watchdog check on the IOC Doorbell */
 2174         mpr_periodic(sc);
 2175 
 2176         /*
 2177          * The portenable will kick off discovery events that will drive the
 2178          * rest of the initialization process.  The CAM/SAS module will
 2179          * hold up the boot sequence until discovery is complete.
 2180          */
 2181         sc->mpr_ich.ich_func = mpr_startup;
 2182         sc->mpr_ich.ich_arg = sc;
 2183         if (config_intrhook_establish(&sc->mpr_ich) != 0) {
 2184                 mpr_dprint(sc, MPR_INIT|MPR_ERROR,
 2185                     "Cannot establish MPR config hook\n");
 2186                 error = EINVAL;
 2187         }
 2188 
 2189         /*
 2190          * Allow IR to shutdown gracefully when shutdown occurs.
 2191          */
 2192         sc->shutdown_eh = EVENTHANDLER_REGISTER(shutdown_final,
 2193             mprsas_ir_shutdown, sc, SHUTDOWN_PRI_DEFAULT);
 2194 
 2195         if (sc->shutdown_eh == NULL)
 2196                 mpr_dprint(sc, MPR_INIT|MPR_ERROR,
 2197                     "shutdown event registration failed\n");
 2198 
 2199         mpr_setup_sysctl(sc);
 2200 
 2201         sc->mpr_flags |= MPR_FLAGS_ATTACH_DONE;
 2202         mpr_dprint(sc, MPR_INIT, "%s exit error= %d\n", __func__, error);
 2203 
 2204         return (error);
 2205 }
 2206 
 2207 /* Run through any late-start handlers. */
 2208 static void
 2209 mpr_startup(void *arg)
 2210 {
 2211         struct mpr_softc *sc;
 2212 
 2213         sc = (struct mpr_softc *)arg;
 2214         mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
 2215 
 2216         mpr_lock(sc);
 2217         mpr_unmask_intr(sc);
 2218 
 2219         /* initialize device mapping tables */
 2220         mpr_base_static_config_pages(sc);
 2221         mpr_mapping_initialize(sc);
 2222         mprsas_startup(sc);
 2223         mpr_unlock(sc);
 2224 
 2225         mpr_dprint(sc, MPR_INIT, "disestablish config intrhook\n");
 2226         config_intrhook_disestablish(&sc->mpr_ich);
 2227         sc->mpr_ich.ich_arg = NULL;
 2228 
 2229         mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
 2230 }
 2231 
 2232 /* Periodic watchdog.  Is called with the driver lock already held. */
 2233 static void
 2234 mpr_periodic(void *arg)
 2235 {
 2236         struct mpr_softc *sc;
 2237         uint32_t db;
 2238 
 2239         sc = (struct mpr_softc *)arg;
 2240         if (sc->mpr_flags & MPR_FLAGS_SHUTDOWN)
 2241                 return;
 2242 
 2243         db = mpr_regread(sc, MPI2_DOORBELL_OFFSET);
 2244         if ((db & MPI2_IOC_STATE_MASK) == MPI2_IOC_STATE_FAULT) {
 2245                 if ((db & MPI2_DOORBELL_FAULT_CODE_MASK) ==
 2246                     IFAULT_IOP_OVER_TEMP_THRESHOLD_EXCEEDED) {
 2247                         panic("TEMPERATURE FAULT: STOPPING.");
 2248                 }
 2249                 mpr_dprint(sc, MPR_FAULT, "IOC Fault 0x%08x, Resetting\n", db);
 2250                 mpr_reinit(sc);
 2251         }
 2252 
 2253         callout_reset_sbt(&sc->periodic, MPR_PERIODIC_DELAY * SBT_1S, 0,
 2254             mpr_periodic, sc, C_PREL(1));
 2255 }
 2256 
 2257 static void
 2258 mpr_log_evt_handler(struct mpr_softc *sc, uintptr_t data,
 2259     MPI2_EVENT_NOTIFICATION_REPLY *event)
 2260 {
 2261         MPI2_EVENT_DATA_LOG_ENTRY_ADDED *entry;
 2262 
 2263         MPR_DPRINT_EVENT(sc, generic, event);
 2264 
 2265         switch (event->Event) {
 2266         case MPI2_EVENT_LOG_DATA:
 2267                 mpr_dprint(sc, MPR_EVENT, "MPI2_EVENT_LOG_DATA:\n");
 2268                 if (sc->mpr_debug & MPR_EVENT)
 2269                         hexdump(event->EventData, event->EventDataLength, NULL,
 2270                             0);
 2271                 break;
 2272         case MPI2_EVENT_LOG_ENTRY_ADDED:
 2273                 entry = (MPI2_EVENT_DATA_LOG_ENTRY_ADDED *)event->EventData;
 2274                 mpr_dprint(sc, MPR_EVENT, "MPI2_EVENT_LOG_ENTRY_ADDED event "
 2275                     "0x%x Sequence %d:\n", entry->LogEntryQualifier,
 2276                      entry->LogSequence);
 2277                 break;
 2278         default:
 2279                 break;
 2280         }
 2281         return;
 2282 }
 2283 
 2284 static int
 2285 mpr_attach_log(struct mpr_softc *sc)
 2286 {
 2287         uint8_t events[16];
 2288 
 2289         bzero(events, 16);
 2290         setbit(events, MPI2_EVENT_LOG_DATA);
 2291         setbit(events, MPI2_EVENT_LOG_ENTRY_ADDED);
 2292 
 2293         mpr_register_events(sc, events, mpr_log_evt_handler, NULL,
 2294             &sc->mpr_log_eh);
 2295 
 2296         return (0);
 2297 }
 2298 
 2299 static int
 2300 mpr_detach_log(struct mpr_softc *sc)
 2301 {
 2302 
 2303         if (sc->mpr_log_eh != NULL)
 2304                 mpr_deregister_events(sc, sc->mpr_log_eh);
 2305         return (0);
 2306 }
 2307 
 2308 /*
 2309  * Free all of the driver resources and detach submodules.  Should be called
 2310  * without the lock held.
 2311  */
 2312 int
 2313 mpr_free(struct mpr_softc *sc)
 2314 {
 2315         int error;
 2316 
 2317         mpr_dprint(sc, MPR_INIT, "%s entered\n", __func__);
 2318         /* Turn off the watchdog */
 2319         mpr_lock(sc);
 2320         sc->mpr_flags |= MPR_FLAGS_SHUTDOWN;
 2321         mpr_unlock(sc);
 2322         /* Lock must not be held for this */
 2323         callout_drain(&sc->periodic);
 2324         callout_drain(&sc->device_check_callout);
 2325 
 2326         if (((error = mpr_detach_log(sc)) != 0) ||
 2327             ((error = mpr_detach_sas(sc)) != 0)) {
 2328                 mpr_dprint(sc, MPR_INIT|MPR_FAULT, "failed to detach "
 2329                     "subsystems, error= %d, exit\n", error);
 2330                 return (error);
 2331         }
 2332 
 2333         mpr_detach_user(sc);
 2334 
 2335         /* Put the IOC back in the READY state. */
 2336         mpr_lock(sc);
 2337         if ((error = mpr_transition_ready(sc)) != 0) {
 2338                 mpr_unlock(sc);
 2339                 return (error);
 2340         }
 2341         mpr_unlock(sc);
 2342 
 2343         if (sc->facts != NULL)
 2344                 free(sc->facts, M_MPR);
 2345 
 2346         /*
 2347          * Free all buffers that are based on IOC Facts.  A Diag Reset may need
 2348          * to free these buffers too.
 2349          */
 2350         mpr_iocfacts_free(sc);
 2351 
 2352         if (sc->sysctl_tree != NULL)
 2353                 sysctl_ctx_free(&sc->sysctl_ctx);
 2354 
 2355         /* Deregister the shutdown function */
 2356         if (sc->shutdown_eh != NULL)
 2357                 EVENTHANDLER_DEREGISTER(shutdown_final, sc->shutdown_eh);
 2358 
 2359         mtx_destroy(&sc->mpr_mtx);
 2360         mpr_dprint(sc, MPR_INIT, "%s exit\n", __func__);
 2361 
 2362         return (0);
 2363 }
 2364 
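      /*
       * Complete a command: return it to the busy state, invoke its
       * completion handler if one is set, wake any sleeping waiter, and
       * keep the active-command accounting honest.
       */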
 2365 static __inline void
 2366 mpr_complete_command(struct mpr_softc *sc, struct mpr_command *cm)
 2367 {
 2368         MPR_FUNCTRACE(sc);
 2369 
 2370         if (cm == NULL) {
 2371                 mpr_dprint(sc, MPR_ERROR, "Completing NULL command\n");
 2372                 return;
 2373         }
 2374 
 2375         KASSERT(cm->cm_state == MPR_CM_STATE_INQUEUE,
 2376             ("command not inqueue, state = %u\n", cm->cm_state));
 2377         cm->cm_state = MPR_CM_STATE_BUSY;
 2378         if (cm->cm_flags & MPR_CM_FLAGS_POLLED)
 2379                 cm->cm_flags |= MPR_CM_FLAGS_COMPLETE;
 2380 
 2381         if (cm->cm_complete != NULL) {
 2382                 mpr_dprint(sc, MPR_TRACE,
 2383                     "%s cm %p calling cm_complete %p data %p reply %p\n",
 2384                     __func__, cm, cm->cm_complete, cm->cm_complete_data,
 2385                     cm->cm_reply);
 2386                 cm->cm_complete(sc, cm);
 2387         }
 2388 
 2389         if (cm->cm_flags & MPR_CM_FLAGS_WAKEUP) {
 2390                 mpr_dprint(sc, MPR_TRACE, "waking up %p\n", cm);
 2391                 wakeup(cm);
 2392         }
 2393 
 2394         if (sc->io_cmds_active != 0) {
 2395                 sc->io_cmds_active--;
 2396         } else {
 2397                 mpr_dprint(sc, MPR_ERROR, "Warning: io_cmds_active is "
 2398                     "out of sync - resynching to 0\n");
 2399         }
 2400 }
 2401 
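      /*
       * Decode and log the firmware's IOCLogInfo word for SAS bus types,
       * suppressing a few chatty codes (nexus loss and task aborts).
       */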
 2402 static void
 2403 mpr_sas_log_info(struct mpr_softc *sc, u32 log_info)
 2404 {
 2405         union loginfo_type {
 2406                 u32     loginfo;
 2407                 struct {
 2408                         u32     subcode:16;
 2409                         u32     code:8;
 2410                         u32     originator:4;
 2411                         u32     bus_type:4;
 2412                 } dw;
 2413         };
 2414         union loginfo_type sas_loginfo;
 2415         char *originator_str = NULL;
 2416 
 2417         sas_loginfo.loginfo = log_info;
 2418         if (sas_loginfo.dw.bus_type != 3 /*SAS*/)
 2419                 return;
 2420 
 2421         /* Eat the nexus loss loginfo */
 2422         if (log_info == 0x31170000)
 2423                 return;
 2424 
 2425         /* eat the loginfos associated with task aborts */
 2426         if ((log_info == 0x30050000) || (log_info == 0x31140000) ||
 2427             (log_info == 0x31130000))
 2428                 return;
 2429 
 2430         switch (sas_loginfo.dw.originator) {
 2431         case 0:
 2432                 originator_str = "IOP";
 2433                 break;
 2434         case 1:
 2435                 originator_str = "PL";
 2436                 break;
 2437         case 2:
 2438                 originator_str = "IR";
 2439                 break;
 2440         }
 2441 
 2442         mpr_dprint(sc, MPR_LOG, "log_info(0x%08x): originator(%s), "
 2443             "code(0x%02x), sub_code(0x%04x)\n", log_info, originator_str,
 2444             sas_loginfo.dw.code, sas_loginfo.dw.subcode);
 2445 }
 2446 
 2447 static void
 2448 mpr_display_reply_info(struct mpr_softc *sc, uint8_t *reply)
 2449 {
 2450         MPI2DefaultReply_t *mpi_reply;
 2451         u16 sc_status;
 2452 
 2453         mpi_reply = (MPI2DefaultReply_t*)reply;
 2454         sc_status = le16toh(mpi_reply->IOCStatus);
 2455         if (sc_status & MPI2_IOCSTATUS_FLAG_LOG_INFO_AVAILABLE)
 2456                 mpr_sas_log_info(sc, le32toh(mpi_reply->IOCLogInfo));
 2457 }
 2458 
 2459 void
 2460 mpr_intr(void *data)
 2461 {
 2462         struct mpr_softc *sc;
 2463         uint32_t status;
 2464 
 2465         sc = (struct mpr_softc *)data;
 2466         mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
 2467 
 2468         /*
 2469          * Check interrupt status register to flush the bus.  This is
 2470          * needed for both INTx interrupts and driver-driven polling
 2471          */
 2472         status = mpr_regread(sc, MPI2_HOST_INTERRUPT_STATUS_OFFSET);
 2473         if ((status & MPI2_HIS_REPLY_DESCRIPTOR_INTERRUPT) == 0)
 2474                 return;
 2475 
 2476         mpr_lock(sc);
 2477         mpr_intr_locked(data);
 2478         mpr_unlock(sc);
 2479         return;
 2480 }
 2481 
 2482 /*
 2483  * In theory, MSI/MSIX interrupts shouldn't need to read any registers on the
 2484  * chip.  Hopefully this theory is correct.
 2485  */
 2486 void
 2487 mpr_intr_msi(void *data)
 2488 {
 2489         struct mpr_softc *sc;
 2490 
 2491         sc = (struct mpr_softc *)data;
 2492         mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
 2493         mpr_lock(sc);
 2494         mpr_intr_locked(data);
 2495         mpr_unlock(sc);
 2496         return;
 2497 }
 2498 
 2499 /*
 2500  * The locking is overly broad and simplistic, but easy to deal with for now.
 2501  */
 2502 void
 2503 mpr_intr_locked(void *data)
 2504 {
 2505         MPI2_REPLY_DESCRIPTORS_UNION *desc;
 2506         MPI2_DIAG_RELEASE_REPLY *rel_rep;
 2507         mpr_fw_diagnostic_buffer_t *pBuffer;
 2508         struct mpr_softc *sc;
 2509         uint64_t tdesc;
 2510         struct mpr_command *cm = NULL;
 2511         uint8_t flags;
 2512         u_int pq;
 2513 
 2514         sc = (struct mpr_softc *)data;
 2515 
 2516         pq = sc->replypostindex;
 2517         mpr_dprint(sc, MPR_TRACE,
 2518             "%s sc %p starting with replypostindex %u\n", 
 2519             __func__, sc, sc->replypostindex);
 2520 
 2521         for ( ;; ) {
 2522                 cm = NULL;
 2523                 desc = &sc->post_queue[sc->replypostindex];
 2524 
 2525                 /*
 2526                  * Copy and clear out the descriptor so that any reentry will
 2527                  * immediately know that this descriptor has already been
 2528                  * looked at.  There is unfortunate casting magic because the
 2529                  * MPI API doesn't have a cardinal 64bit type.
 2530                  */
 2531                 tdesc = 0xffffffffffffffff;
 2532                 tdesc = atomic_swap_64((uint64_t *)desc, tdesc);
 2533                 desc = (MPI2_REPLY_DESCRIPTORS_UNION *)&tdesc;
 2534 
 2535                 flags = desc->Default.ReplyFlags &
 2536                     MPI2_RPY_DESCRIPT_FLAGS_TYPE_MASK;
 2537                 if ((flags == MPI2_RPY_DESCRIPT_FLAGS_UNUSED) ||
 2538                     (le32toh(desc->Words.High) == 0xffffffff))
 2539                         break;
 2540 
 2541                 /* increment the replypostindex now, so that event handlers
 2542                  * and cm completion handlers which decide to do a diag
 2543                  * reset can zero it without it getting incremented again
 2544                  * afterwards, and we break out of this loop on the next
 2545                  * iteration since the reply post queue has been cleared to
 2546                  * 0xFF and all descriptors look unused (which they are).
 2547                  */
 2548                 if (++sc->replypostindex >= sc->pqdepth)
 2549                         sc->replypostindex = 0;
 2550 
 2551                 switch (flags) {
 2552                 case MPI2_RPY_DESCRIPT_FLAGS_SCSI_IO_SUCCESS:
 2553                 case MPI25_RPY_DESCRIPT_FLAGS_FAST_PATH_SCSI_IO_SUCCESS:
 2554                 case MPI26_RPY_DESCRIPT_FLAGS_PCIE_ENCAPSULATED_SUCCESS:
 2555                         cm = &sc->commands[le16toh(desc->SCSIIOSuccess.SMID)];
 2556                         cm->cm_reply = NULL;
 2557                         break;
 2558                 case MPI2_RPY_DESCRIPT_FLAGS_ADDRESS_REPLY:
 2559                 {
 2560                         uint32_t baddr;
 2561                         uint8_t *reply;
 2562 
 2563                         /*
 2564                          * Re-compose the reply address from the address
 2565                          * sent back from the chip.  The ReplyFrameAddress
 2566                          * is the lower 32 bits of the physical address of
 2567                          * particular reply frame.  Convert that address to
 2568                          * host format, and then use that to provide the
 2569                          * offset against the virtual address base
 2570                          * (sc->reply_frames).
 2571                          */
 2572                         baddr = le32toh(desc->AddressReply.ReplyFrameAddress);
 2573                         reply = sc->reply_frames +
 2574                                 (baddr - ((uint32_t)sc->reply_busaddr));
 2575                         /*
 2576                          * Make sure the reply we got back is in a valid
 2577                          * range.  If not, go ahead and panic here, since
 2578                          * we'll probably panic as soon as we dereference the
 2579                          * reply pointer anyway.
 2580                          */
 2581                         if ((reply < sc->reply_frames)
 2582                          || (reply > (sc->reply_frames +
 2583                              (sc->fqdepth * sc->replyframesz)))) {
 2584                                 printf("%s: WARNING: reply %p out of range!\n",
 2585                                        __func__, reply);
 2586                                 printf("%s: reply_frames %p, fqdepth %d, "
 2587                                        "frame size %d\n", __func__,
 2588                                        sc->reply_frames, sc->fqdepth,
 2589                                        sc->replyframesz);
 2590                                 printf("%s: baddr %#x\n", __func__, baddr);
 2591                                 /* LSI-TODO. See Linux Code for Graceful exit */
 2592                                 panic("Reply address out of range");
 2593                         }
 2594                         if (le16toh(desc->AddressReply.SMID) == 0) {
 2595                                 if (((MPI2_DEFAULT_REPLY *)reply)->Function ==
 2596                                     MPI2_FUNCTION_DIAG_BUFFER_POST) {
 2597                                         /*
 2598                                          * If SMID is 0 for Diag Buffer Post,
 2599                                          * this implies that the reply is due to
 2600                                          * a release function with a status that
 2601                                          * the buffer has been released.  Set
 2602                                          * the buffer flags accordingly.
 2603                                          */
 2604                                         rel_rep =
 2605                                             (MPI2_DIAG_RELEASE_REPLY *)reply;
 2606                                         if ((le16toh(rel_rep->IOCStatus) &
 2607                                             MPI2_IOCSTATUS_MASK) ==
 2608                                             MPI2_IOCSTATUS_DIAGNOSTIC_RELEASED)
 2609                                         {
 2610                                                 pBuffer =
 2611                                                     &sc->fw_diag_buffer_list[
 2612                                                     rel_rep->BufferType];
 2613                                                 pBuffer->valid_data = TRUE;
 2614                                                 pBuffer->owned_by_firmware =
 2615                                                     FALSE;
 2616                                                 pBuffer->immediate = FALSE;
 2617                                         }
 2618                                 } else
 2619                                         mpr_dispatch_event(sc, baddr,
 2620                                             (MPI2_EVENT_NOTIFICATION_REPLY *)
 2621                                             reply);
 2622                         } else {
 2623                                 cm = &sc->commands[
 2624                                     le16toh(desc->AddressReply.SMID)];
 2625                                 if (cm->cm_state == MPR_CM_STATE_INQUEUE) {
 2626                                         cm->cm_reply = reply;
 2627                                         cm->cm_reply_data =
 2628                                             le32toh(desc->AddressReply.
 2629                                                 ReplyFrameAddress);
 2630                                 } else {
 2631                                         mpr_dprint(sc, MPR_RECOVERY,
 2632                                             "Bad state for ADDRESS_REPLY status,"
 2633                                             " ignoring state %d cm %p\n",
 2634                                             cm->cm_state, cm);
 2635                                 }
 2636                         }
 2637                         break;
 2638                 }
 2639                 case MPI2_RPY_DESCRIPT_FLAGS_TARGETASSIST_SUCCESS:
 2640                 case MPI2_RPY_DESCRIPT_FLAGS_TARGET_COMMAND_BUFFER:
 2641                 case MPI2_RPY_DESCRIPT_FLAGS_RAID_ACCELERATOR_SUCCESS:
 2642                 default:
 2643                         /* Unhandled */
 2644                         mpr_dprint(sc, MPR_ERROR, "Unhandled reply 0x%x\n",
 2645                             desc->Default.ReplyFlags);
 2646                         cm = NULL;
 2647                         break;
 2648                 }
 2649 
 2650                 if (cm != NULL) {
 2651                         // Print error reply frame
 2652                         if (cm->cm_reply)
 2653                                 mpr_display_reply_info(sc, cm->cm_reply);
 2654                         mpr_complete_command(sc, cm);
 2655                 }
 2656         }
 2657 
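              /*
               * pq snapshotted replypostindex on entry; ring the doorbell only
               * if descriptors were actually consumed above.
               */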
 2658         if (pq != sc->replypostindex) {
 2659                 mpr_dprint(sc, MPR_TRACE, "%s sc %p writing postindex %d\n",
 2660                     __func__, sc, sc->replypostindex);
 2661                 mpr_regwrite(sc, MPI2_REPLY_POST_HOST_INDEX_OFFSET,
 2662                     sc->replypostindex);
 2663         }
 2664 
 2665         return;
 2666 }
 2667 
 2668 static void
 2669 mpr_dispatch_event(struct mpr_softc *sc, uintptr_t data,
 2670     MPI2_EVENT_NOTIFICATION_REPLY *reply)
 2671 {
 2672         struct mpr_event_handle *eh;
 2673         int event, handled = 0;
 2674 
 2675         event = le16toh(reply->Event);
 2676         TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
 2677                 if (isset(eh->mask, event)) {
 2678                         eh->callback(sc, data, reply);
 2679                         handled++;
 2680                 }
 2681         }
 2682 
 2683         if (handled == 0)
 2684                 mpr_dprint(sc, MPR_EVENT, "Unhandled event 0x%x\n",
 2685                     event);
 2686 
 2687         /*
 2688          * This is the only place that the event/reply should be freed.
 2689          * Anything wanting to hold onto the event data should have
 2690          * already copied it into their own storage.
 2691          */
 2692         mpr_free_reply(sc, data);
 2693 }
 2694 
 2695 static void
 2696 mpr_reregister_events_complete(struct mpr_softc *sc, struct mpr_command *cm)
 2697 {
 2698         mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
 2699 
 2700         if (cm->cm_reply)
 2701                 MPR_DPRINT_EVENT(sc, generic,
 2702                         (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply);
 2703 
 2704         mpr_free_command(sc, cm);
 2705 
 2706         /* next, send a port enable */
 2707         mprsas_startup(sc);
 2708 }
 2709 
 2710 /*
 2711  * For both register_events and update_events, the caller supplies a bitmap
 2712  * of events that it _wants_.  These functions then turn that into a bitmask
 2713  * suitable for the controller.
 2714  */
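      /*
       * Illustrative registration sketch (the handler name and storage are
       * hypothetical; the mask covers 16 bytes, i.e. 128 event bits):
       *
       *	uint8_t events[16];
       *	struct mpr_event_handle *eh;
       *
       *	bzero(events, sizeof(events));
       *	setbit(events, MPI2_EVENT_SAS_TOPOLOGY_CHANGE_LIST);
       *	mpr_register_events(sc, events, my_evt_callback, sc, &eh);
       */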
 2715 int
 2716 mpr_register_events(struct mpr_softc *sc, uint8_t *mask,
 2717     mpr_evt_callback_t *cb, void *data, struct mpr_event_handle **handle)
 2718 {
 2719         struct mpr_event_handle *eh;
 2720         int error = 0;
 2721 
 2722         eh = malloc(sizeof(struct mpr_event_handle), M_MPR, M_WAITOK|M_ZERO);
 2723         eh->callback = cb;
 2724         eh->data = data;
 2725         TAILQ_INSERT_TAIL(&sc->event_list, eh, eh_list);
 2726         if (mask != NULL)
 2727                 error = mpr_update_events(sc, eh, mask);
 2728         *handle = eh;
 2729 
 2730         return (error);
 2731 }
 2732 
 2733 int
 2734 mpr_update_events(struct mpr_softc *sc, struct mpr_event_handle *handle,
 2735     uint8_t *mask)
 2736 {
 2737         MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
 2738         MPI2_EVENT_NOTIFICATION_REPLY *reply = NULL;
 2739         struct mpr_command *cm = NULL;
 2740         struct mpr_event_handle *eh;
 2741         int error, i;
 2742 
 2743         mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
 2744 
 2745         if ((mask != NULL) && (handle != NULL))
 2746                 bcopy(mask, &handle->mask[0], 16);
 2747         memset(sc->event_mask, 0xff, 16);
 2748 
 2749         TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
 2750                 for (i = 0; i < 16; i++)
 2751                         sc->event_mask[i] &= ~eh->mask[i];
 2752         }
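              /*
               * A set bit in sc->event_mask masks that event off at the
               * controller, so the loop above clears the bits that any
               * registered handler wants.
               */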
 2753 
 2754         if ((cm = mpr_alloc_command(sc)) == NULL)
 2755                 return (EBUSY);
 2756         evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
 2757         evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
 2758         evtreq->MsgFlags = 0;
 2759         evtreq->SASBroadcastPrimitiveMasks = 0;
 2760 #ifdef MPR_DEBUG_ALL_EVENTS
 2761         {
 2762                 u_char fullmask[16];
 2763                 memset(fullmask, 0x00, 16);
 2764                 bcopy(fullmask, (uint8_t *)&evtreq->EventMasks, 16);
 2765         }
 2766 #else
 2767         for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
 2768                 evtreq->EventMasks[i] = htole32(sc->event_mask[i]);
 2769 #endif
 2770         cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
 2771         cm->cm_data = NULL;
 2772 
 2773         error = mpr_request_polled(sc, &cm);
 2774         if (cm != NULL)
 2775                 reply = (MPI2_EVENT_NOTIFICATION_REPLY *)cm->cm_reply;
 2776         if ((reply == NULL) || (le16toh(reply->IOCStatus) &
 2777             MPI2_IOCSTATUS_MASK) != MPI2_IOCSTATUS_SUCCESS)
 2778                 error = ENXIO;
 2779 
 2780         if (reply)
 2781                 MPR_DPRINT_EVENT(sc, generic, reply);
 2782 
 2783         mpr_dprint(sc, MPR_TRACE, "%s finished error %d\n", __func__, error);
 2784 
 2785         if (cm != NULL)
 2786                 mpr_free_command(sc, cm);
 2787         return (error);
 2788 }
 2789 
 2790 static int
 2791 mpr_reregister_events(struct mpr_softc *sc)
 2792 {
 2793         MPI2_EVENT_NOTIFICATION_REQUEST *evtreq;
 2794         struct mpr_command *cm;
 2795         struct mpr_event_handle *eh;
 2796         int error, i;
 2797 
 2798         mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
 2799 
 2800         /* first, reregister events */
 2801 
 2802         memset(sc->event_mask, 0xff, 16);
 2803 
 2804         TAILQ_FOREACH(eh, &sc->event_list, eh_list) {
 2805                 for (i = 0; i < 16; i++)
 2806                         sc->event_mask[i] &= ~eh->mask[i];
 2807         }
 2808 
 2809         if ((cm = mpr_alloc_command(sc)) == NULL)
 2810                 return (EBUSY);
 2811         evtreq = (MPI2_EVENT_NOTIFICATION_REQUEST *)cm->cm_req;
 2812         evtreq->Function = MPI2_FUNCTION_EVENT_NOTIFICATION;
 2813         evtreq->MsgFlags = 0;
 2814         evtreq->SASBroadcastPrimitiveMasks = 0;
 2815 #ifdef MPR_DEBUG_ALL_EVENTS
 2816         {
 2817                 u_char fullmask[16];
 2818                 memset(fullmask, 0x00, 16);
 2819                 bcopy(fullmask, (uint8_t *)&evtreq->EventMasks, 16);
 2820         }
 2821 #else
 2822         for (i = 0; i < MPI2_EVENT_NOTIFY_EVENTMASK_WORDS; i++)
 2823                 evtreq->EventMasks[i] = htole32(sc->event_mask[i]);
 2824 #endif
 2825         cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
 2826         cm->cm_data = NULL;
 2827         cm->cm_complete = mpr_reregister_events_complete;
 2828 
 2829         error = mpr_map_command(sc, cm);
 2830 
 2831         mpr_dprint(sc, MPR_TRACE, "%s finished with error %d\n", __func__,
 2832             error);
 2833         return (error);
 2834 }
 2835 
 2836 int
 2837 mpr_deregister_events(struct mpr_softc *sc, struct mpr_event_handle *handle)
 2838 {
 2839 
 2840         TAILQ_REMOVE(&sc->event_list, handle, eh_list);
 2841         free(handle, M_MPR);
 2842         return (mpr_update_events(sc, NULL, NULL));
 2843 }
 2844 
 2845 /**
 2846 * mpr_build_nvme_prp - This function is called for NVMe end devices to build a
 2847 * native SGL (NVMe PRP). The native SGL is built starting in the first PRP entry
 2848 * of the NVMe message (PRP1). If the data buffer is small enough to be described
 2849 * entirely using PRP1, then PRP2 is not used. If needed, PRP2 is used to
 2850 * describe a larger data buffer. If the data buffer is too large to describe
 2851 * using the two PRP entries inside the NVMe message, then PRP1 describes the
 2852 * first data memory segment, and PRP2 contains a pointer to a PRP list located
 2853 * elsewhere in memory to describe the remaining data memory segments. The PRP
 2854 * list will be contiguous.
 2855 *
 2856 * The native SGL for NVMe devices is a Physical Region Page (PRP). A PRP
 2857 * consists of a list of PRP entries to describe a number of noncontiguous
 2858 * physical memory segments as a single memory buffer, just as a SGL does. Note,
 2859 * however, that this function is only used by the IOCTL call, so the memory
 2860 * given is guaranteed to be contiguous. There is no need to translate a
 2861 * non-contiguous SGL into a PRP in this case. All PRPs will describe contiguous
 2862 * space that is one page size each.
 2863 *
 2864 * Each NVMe message contains two PRP entries. The first (PRP1) either contains
 2865 * a PRP list pointer or a PRP element, depending upon the command. PRP2 contains
 2866 * the second PRP element if the memory being described fits within 2 PRP
 2867 * entries, or a PRP list pointer if the PRP spans more than two entries.
 2868 *
 2869 * A PRP list pointer contains the address of a PRP list, structured as a linear
 2870 * array of PRP entries. Each PRP entry in this list describes a segment of
 2871 * physical memory.
 2872 *
 2873 * Each 64-bit PRP entry comprises an address and an offset field. The address
 2874 * always points to the beginning of a PAGE_SIZE physical memory page, and the
 2875 * offset describes where within that page the memory segment begins. Only the
 2876 * first element in a PRP list may contain a non-zero offset, implying that all
 2877 * memory segments following the first begin at the start of a PAGE_SIZE page.
 2878 *
 2879 * Each PRP element normally describes a chunk of PAGE_SIZE physical memory,
 2880 * with exceptions for the first and last elements in the list. If the memory
 2881 * being described by the list begins at a non-zero offset within the first page,
 2882 * then the first PRP element will contain a non-zero offset indicating where the
 2883 * region begins within the page. The last memory segment may end before the end
 2884 * of the PAGE_SIZE segment, depending upon the overall size of the memory being
 2885 * described by the PRP list. 
 2886 *
 2887 * Since PRP entries lack any indication of size, the overall data buffer length
 2888 * is used to determine where the end of the data memory buffer is located, and
 2889 * how many PRP entries are required to describe it.
 2890 *
 2891 * Returns nothing.
 2892 */
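      /*
       * Worked example (illustrative, PAGE_SIZE = 4096): a 16 KiB buffer
       * starting 0x200 bytes into its first page needs 5 PRP entries of 3584,
       * 4096, 4096, 4096, and 512 bytes.  Since more than 2 entries are
       * needed, PRP1 holds the first entry and PRP2 points to a PRP list
       * holding the remaining 4.
       */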
 2893 void 
 2894 mpr_build_nvme_prp(struct mpr_softc *sc, struct mpr_command *cm,
 2895     Mpi26NVMeEncapsulatedRequest_t *nvme_encap_request, void *data,
 2896     uint32_t data_in_sz, uint32_t data_out_sz)
 2897 {
 2898         int                     prp_size = PRP_ENTRY_SIZE;
 2899         uint64_t                *prp_entry, *prp1_entry, *prp2_entry;
 2900         uint64_t                *prp_entry_phys, *prp_page, *prp_page_phys;
 2901         uint32_t                offset, entry_len, page_mask_result, page_mask;
 2902         bus_addr_t              paddr;
 2903         size_t                  length;
 2904         struct mpr_prp_page     *prp_page_info = NULL;
 2905 
 2906         /*
 2907          * Not all commands require a data transfer. If no data, just return
 2908          * without constructing any PRP.
 2909          */
 2910         if (!data_in_sz && !data_out_sz)
 2911                 return;
 2912 
 2913         /*
 2914          * Set pointers to PRP1 and PRP2, which are in the NVMe command. PRP1 is
 2915          * located at a 24 byte offset from the start of the NVMe command. Then
 2916          * set the current PRP entry pointer to PRP1.
 2917          */
 2918         prp1_entry = (uint64_t *)(nvme_encap_request->NVMe_Command +
 2919             NVME_CMD_PRP1_OFFSET);
 2920         prp2_entry = (uint64_t *)(nvme_encap_request->NVMe_Command +
 2921             NVME_CMD_PRP2_OFFSET);
 2922         prp_entry = prp1_entry;
 2923 
 2924         /*
 2925          * For the PRP entries, use the specially allocated buffer of
 2926          * contiguous memory. PRP Page allocation failures should not happen
 2927          * because there should be enough PRP page buffers to account for the
 2928          * possible NVMe QDepth.
 2929          */
 2930         prp_page_info = mpr_alloc_prp_page(sc);
 2931         KASSERT(prp_page_info != NULL, ("%s: There are no PRP Pages left to be "
 2932             "used for building a native NVMe SGL.\n", __func__));
 2933         prp_page = (uint64_t *)prp_page_info->prp_page;
 2934         prp_page_phys = (uint64_t *)(uintptr_t)prp_page_info->prp_page_busaddr;
 2935 
 2936         /*
 2937          * Insert the allocated PRP page into the command's PRP page list. This
 2938          * will be freed when the command is freed.
 2939          */
 2940         TAILQ_INSERT_TAIL(&cm->cm_prp_page_list, prp_page_info, prp_page_link);
 2941 
 2942         /*
 2943          * Check if we are within 1 entry of a page boundary; we don't
 2944          * want our first entry to be a PRP List entry.
 2945          */
 2946         page_mask = PAGE_SIZE - 1;
 2947         page_mask_result = (uintptr_t)((uint8_t *)prp_page + prp_size) &
 2948             page_mask;
 2949         if (!page_mask_result)
 2950         {
 2951                 /* Bump up to next page boundary. */
 2952                 prp_page = (uint64_t *)((uint8_t *)prp_page + prp_size);
 2953                 prp_page_phys = (uint64_t *)((uint8_t *)prp_page_phys +
 2954                     prp_size);
 2955         }
 2956 
 2957         /*
 2958          * Set PRP physical pointer, which initially points to the current PRP
 2959          * DMA memory page.
 2960          */
 2961         prp_entry_phys = prp_page_phys;
 2962 
 2963         /* Get physical address and length of the data buffer. */
 2964         paddr = (bus_addr_t)(uintptr_t)data;
 2965         if (data_in_sz)
 2966                 length = data_in_sz;
 2967         else
 2968                 length = data_out_sz;
 2969 
 2970         /* Loop while the length is not zero. */
 2971         while (length)
 2972         {
 2973                 /*
 2974                  * Check if we need to put a list pointer here, i.e. if we
 2975                  * are at (page boundary - prp_size), 8 bytes short of a page.
 2976                  */
 2977                 page_mask_result = (uintptr_t)((uint8_t *)prp_entry_phys +
 2978                     prp_size) & page_mask;
 2979                 if (!page_mask_result)
 2980                 {
 2981                         /*
 2982                          * This is the last entry in a PRP List, so we need to
 2983                          * put a PRP list pointer here. What this does is:
 2984                          *   - bump the current memory pointer to the next
 2985                          *     address, which will be the next full page.
 2986                          *   - set the PRP Entry to point to that page. This is
 2987                          *     now the PRP List pointer.
 2988                          *   - bump the PRP Entry pointer to the start of the next
 2989                          *     page. Since all of this PRP memory is contiguous,
 2990                          *     no need to get a new page - it's just the next
 2991                          *     address.
 2992                          */
 2993                         prp_entry_phys++;
 2994                         *prp_entry =
 2995                             htole64((uint64_t)(uintptr_t)prp_entry_phys);
 2996                         prp_entry++;
 2997                 }
 2998 
 2999                 /* Handle an entry that covers only part of a page. */
 3000                 offset = (uint32_t)paddr & page_mask;
 3001                 entry_len = PAGE_SIZE - offset;
 3002 
 3003                 if (prp_entry == prp1_entry)
 3004                 {
 3005                         /*
 3006                          * Must fill in the first PRP pointer (PRP1) before
 3007                          * moving on.
 3008                          */
 3009                         *prp1_entry = htole64((uint64_t)paddr);
 3010 
 3011                         /*
 3012                          * Now point to the second PRP entry within the
 3013                          * command (PRP2).
 3014                          */
 3015                         prp_entry = prp2_entry;
 3016                 }
 3017                 else if (prp_entry == prp2_entry)
 3018                 {
 3019                         /*
 3020                          * Should the PRP2 entry be a PRP List pointer or just a
 3021                          * regular PRP pointer? If there is more than one more
 3022                          * page of data, must use a PRP List pointer.
 3023                          */
 3024                         if (length > PAGE_SIZE)
 3025                         {
 3026                                 /*
 3027                                  * PRP2 will contain a PRP List pointer because
 3028                                  * more PRP's are needed with this command. The
 3029                                  * list will start at the beginning of the
 3030                                  * contiguous buffer.
 3031                                  */
 3032                                 *prp2_entry =
 3033                                     htole64(
 3034                                     (uint64_t)(uintptr_t)prp_entry_phys);
 3035 
 3036                                 /*
 3037                                  * The next PRP Entry will be the start of the
 3038                                  * first PRP List.
 3039                                  */
 3040                                 prp_entry = prp_page;
 3041                         }
 3042                         else
 3043                         {
 3044                                 /*
 3045                                  * After this, the PRP Entries are complete.
 3046                                  * This command uses 2 PRP's and no PRP list.
 3047                                  */
 3048                                 *prp2_entry = htole64((uint64_t)paddr);
 3049                         }
 3050                 }
 3051                 else
 3052                 {
 3053                         /*
 3054                          * Put entry in list and bump the addresses.
 3055                          *
 3056                          * After PRP1 and PRP2 are filled in, this will fill in
 3057                          * all remaining PRP entries in a PRP List, one per each
 3058                          * time through the loop.
 3059                          */
 3060                         *prp_entry = htole64((uint64_t)paddr);
 3061                         prp_entry++;
 3062                         prp_entry_phys++;
 3063                 }
 3064 
 3065                 /*
 3066                  * Bump the phys address of the command's data buffer by the
 3067                  * entry_len.
 3068                  */
 3069                 paddr += entry_len;
 3070 
 3071                 /* Decrement length accounting for last partial page. */
 3072                 if (entry_len > length)
 3073                         length = 0;
 3074                 else
 3075                         length -= entry_len;
 3076         }
 3077 }
 3078 
 3079 /*
 3080  * mpr_check_pcie_native_sgl - This function is called for PCIe end devices to
 3081  * determine if the driver needs to build a native SGL. If so, that native SGL
 3082  * is built in the contiguous buffers allocated especially for PCIe SGL
 3083  * creation. If the driver will not build a native SGL, return TRUE and a
 3084  * normal IEEE SGL will be built. Currently this routine supports NVMe devices
 3085  * only.
 3086  *
 3087  * Returns FALSE (0) if a native SGL was built, TRUE (1) if one was not.
 3088  */
 3089 static int
 3090 mpr_check_pcie_native_sgl(struct mpr_softc *sc, struct mpr_command *cm,
 3091     bus_dma_segment_t *segs, int segs_left)
 3092 {
 3093         uint32_t                i, sge_dwords, length, offset, entry_len;
 3094         uint32_t                num_entries, buff_len = 0, sges_in_segment;
 3095         uint32_t                page_mask, page_mask_result, *curr_buff;
 3096         uint32_t                *ptr_sgl, *ptr_first_sgl, first_page_offset;
 3097         uint32_t                first_page_data_size, end_residual;
 3098         uint64_t                *msg_phys;
 3099         bus_addr_t              paddr;
 3100         int                     build_native_sgl = 0, first_prp_entry;
 3101         int                     prp_size = PRP_ENTRY_SIZE;
 3102         Mpi25IeeeSgeChain64_t   *main_chain_element = NULL;
 3103         struct mpr_prp_page     *prp_page_info = NULL;
 3104 
 3105         mpr_dprint(sc, MPR_TRACE, "%s\n", __func__);
 3106 
 3107         /*
 3108          * Add up the sizes of each segment length to get the total transfer
 3109          * size, which will be checked against the Maximum Data Transfer Size.
 3110          * If the data transfer length exceeds the MDTS for this device, just
 3111          * return 1 so a normal IEEE SGL will be built.  F/W will break the
 3112          * I/O up into multiple I/Os.  [nvme_mdts = 0 means unlimited]
 3113          */
 3114         for (i = 0; i < segs_left; i++)
 3115                 buff_len += segs[i].ds_len;     /* ds_len is host-endian */
 3116         if ((cm->cm_targ->MDTS > 0) && (buff_len > cm->cm_targ->MDTS))
 3117                 return 1;
 3118 
 3119         /* Create page_mask (to get offset within page) */
 3120         page_mask = PAGE_SIZE - 1;
 3121 
 3122         /*
 3123          * Check if the number of elements exceeds the max number that can be
 3124          * put in the main message frame (H/W can only translate an SGL that
 3125          * is contained entirely in the main message frame).
 3126          */
 3127         sges_in_segment = (sc->reqframesz -
 3128             offsetof(Mpi25SCSIIORequest_t, SGL)) / sizeof(MPI25_SGE_IO_UNION);
 3129         if (segs_left > sges_in_segment)
 3130                 build_native_sgl = 1;
 3131         else
 3132         {
 3133                 /*
 3134                  * NVMe uses one PRP for each physical page (or part of physical
 3135                  * page).
 3136                  *    if 4 pages or less then IEEE is OK
 3137                  *    if > 5 pages then we need to build a native SGL
 3138                  *    if > 4 and <= 5 pages, then check the physical address
 3139                  *      of the first SG entry; if the amount of data in the
 3140                  *      first page is >= the residual beyond 4 pages then use
 3141                  *      IEEE, otherwise use a native SGL
 3142                  */
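                      /*
                       * Worked example (illustrative, PAGE_SIZE = 4096): for
                       * buff_len = 18000 (between 4 and 5 pages), end_residual
                       * = 18000 % 4096 = 1616.  If the first segment begins
                       * 3000 bytes into its page, first_page_data_size = 1096
                       * < 1616, so a native SGL must be built.
                       */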
 3143                 if (buff_len > (PAGE_SIZE * 5))
 3144                         build_native_sgl = 1;
 3145                 else if ((buff_len > (PAGE_SIZE * 4)) &&
 3146                     (buff_len <= (PAGE_SIZE * 5)))
 3147                 {
 3148                         msg_phys = (uint64_t *)(uintptr_t)segs[0].ds_addr;
 3149                         first_page_offset =
 3150                             ((uint32_t)(uint64_t)(uintptr_t)msg_phys &
 3151                             page_mask);
 3152                         first_page_data_size = PAGE_SIZE - first_page_offset;
 3153                         end_residual = buff_len % PAGE_SIZE;
 3154 
 3155                         /*
 3156                          * If offset into first page pushes the end of the data
 3157                          * beyond end of the 5th page, we need the extra PRP
 3158                          * list.
 3159                          */
 3160                         if (first_page_data_size < end_residual)
 3161                                 build_native_sgl = 1;
 3162 
 3163                         /*
 3164                          * Check if first SG entry size is < residual beyond 4
 3165                          * pages.
 3166                          */
 3167                         if (segs[0].ds_len <
 3168                             (buff_len - (PAGE_SIZE * 4)))
 3169                                 build_native_sgl = 1;
 3170                 }
 3171         }
 3172 
 3173         /* If a native SGL is not needed, fall back to a normal IEEE SGL. */
 3174         if (!build_native_sgl)
 3175                 return 1;
 3176 
 3177         /*
 3178          * Native SGL is needed.
 3179          * Put a chain element in main message frame that points to the first
 3180          * chain buffer.
 3181          *
 3182          * NOTE:  The ChainOffset field must be 0 when using a chain pointer to
 3183          *        a native SGL.
 3184          */
 3185 
 3186         /* Set main message chain element pointer */
 3187         main_chain_element = (pMpi25IeeeSgeChain64_t)cm->cm_sge;
 3188 
 3189         /*
 3190          * For NVMe the chain element needs to be the 2nd SGL entry in the main
 3191          * message.
 3192          */
 3193         main_chain_element = (Mpi25IeeeSgeChain64_t *)
 3194             ((uint8_t *)main_chain_element + sizeof(MPI25_IEEE_SGE_CHAIN64));
 3195 
 3196         /*
 3197          * For the PRP entries, use the specially allocated buffer of
 3198          * contiguous memory. PRP Page allocation failures should not happen
 3199          * because there should be enough PRP page buffers to account for the
 3200          * possible NVMe QDepth.
 3201          */
 3202         prp_page_info = mpr_alloc_prp_page(sc);
 3203         KASSERT(prp_page_info != NULL, ("%s: There are no PRP Pages left to be "
 3204             "used for building a native NVMe SGL.\n", __func__));
 3205         curr_buff = (uint32_t *)prp_page_info->prp_page;
 3206         msg_phys = (uint64_t *)(uintptr_t)prp_page_info->prp_page_busaddr;
 3207 
 3208         /*
 3209          * Insert the allocated PRP page into the command's PRP page list. This
 3210          * will be freed when the command is freed.
 3211          */
 3212         TAILQ_INSERT_TAIL(&cm->cm_prp_page_list, prp_page_info, prp_page_link);
 3213 
 3214         /*
 3215          * Check if we are within 1 entry of a page boundary; we don't
 3216          * want our first entry to be a PRP List entry.
 3217          */
 3218         page_mask_result = (uintptr_t)((uint8_t *)curr_buff + prp_size) &
 3219             page_mask;
 3220         if (!page_mask_result) {
 3221                 /* Bump up to next page boundary. */
 3222                 curr_buff = (uint32_t *)((uint8_t *)curr_buff + prp_size);
 3223                 msg_phys = (uint64_t *)((uint8_t *)msg_phys + prp_size);
 3224         }
 3225 
 3226         /* Fill in the chain element and make it an NVMe segment type. */
 3227         main_chain_element->Address.High =
 3228             htole32((uint32_t)((uint64_t)(uintptr_t)msg_phys >> 32));
 3229         main_chain_element->Address.Low =
 3230             htole32((uint32_t)(uintptr_t)msg_phys);
 3231         main_chain_element->NextChainOffset = 0;
 3232         main_chain_element->Flags = MPI2_IEEE_SGE_FLAGS_CHAIN_ELEMENT |
 3233             MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR |
 3234             MPI26_IEEE_SGE_FLAGS_NSF_NVME_PRP;
 3235 
 3236         /* Set SGL pointer to start of contiguous PCIe buffer. */
 3237         ptr_sgl = curr_buff;
 3238         sge_dwords = 2;
 3239         num_entries = 0;
 3240 
 3241         /*
 3242          * NVMe has a very convoluted PRP format. One PRP is required for each
 3243          * page or partial page. We need to split up OS SG entries if they are
 3244          * longer than one page or cross a page boundary. We also have to insert
 3245          * a PRP list pointer entry as the last entry in each physical page of
 3246          * the PRP list.
 3247          *
 3248          * NOTE: The first PRP "entry" is actually placed in the first SGL entry
 3249          * in the main message in IEEE 64 format. The 2nd entry in the main
 3250          * message is the chain element, and the rest of the PRP entries are
 3251          * built in the contiguous PCIe buffer.
 3252          */
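              /*
               * Rough layout sketch (illustrative):
               *
               *   main message SGL area           contiguous PRP page
               *   [0] IEEE simple (first PRP)     [0] PRP entry
               *   [1] IEEE chain element  ----->  [1] PRP entry
               *                                   ...
               *                                   [last slot] PRP list pointer
               */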
 3253         first_prp_entry = 1;
 3254         ptr_first_sgl = (uint32_t *)cm->cm_sge;
 3255 
 3256         for (i = 0; i < segs_left; i++) {
 3257                 /* Get physical address and length of this SG entry. */
 3258                 paddr = segs[i].ds_addr;
 3259                 length = segs[i].ds_len;
 3260 
 3261                 /*
 3262                  * Check whether this SGE begins on a page boundary. Apart
 3263                  * from the first SGE, every segment is expected to start
 3264                  * page-aligned; if not, fall back and have F/W build the SGL.
 3265                  */
 3266                 if ((i != 0) && (((uint32_t)paddr & page_mask) != 0)) {
 3267                         mpr_dprint(sc, MPR_ERROR, "Unaligned SGE while "
 3268                             "building NVMe PRPs, low address is 0x%x\n",
 3269                             (uint32_t)paddr);
 3270                         return 1;
 3271                 }
 3272 
 3273                 /* Apart from the last SGE, if any SGE boundary is not page
 3274                  * aligned then a hole exists in the buffer.  A hole leads
 3275                  * to data corruption, so fall back to IEEE SGEs.
 3276                  */
 3277                 if (i != (segs_left - 1)) {
 3278                         if (((uint32_t)paddr + length) & page_mask) {
 3279                                 mpr_dprint(sc, MPR_ERROR, "Unaligned SGE "
 3280                                     "boundary while building NVMe PRPs, low "
 3281                                     "address: 0x%x and length: %u\n",
 3282                                     (uint32_t)paddr, length);
 3283                                 return 1;
 3284                         }
 3285                 }
 3286 
 3287                 /* Loop while the length is not zero. */
 3288                 while (length) {
 3289                         /*
 3290                          * Check if we need to put a list pointer here if we are
 3291                          * at page boundary - prp_size.
 3292                          */
 3293                         page_mask_result = (uintptr_t)((uint8_t *)ptr_sgl +
 3294                             prp_size) & page_mask;
 3295                         if (!page_mask_result) {
 3296                                 /*
 3297                                  * Need to put a PRP list pointer here.
 3298                                  */
 3299                                 msg_phys = (uint64_t *)((uint8_t *)msg_phys +
 3300                                     prp_size);
 3301                                 *ptr_sgl = htole32((uintptr_t)msg_phys);
 3302                                 *(ptr_sgl+1) = htole32((uint64_t)(uintptr_t)
 3303                                     msg_phys >> 32);
 3304                                 ptr_sgl += sge_dwords;
 3305                                 num_entries++;
 3306                         }
 3307 
 3308                         /* Handle an entry that covers only part of a page. */
 3309                         offset = (uint32_t)paddr & page_mask;
 3310                         entry_len = PAGE_SIZE - offset;
 3311                         if (first_prp_entry) {
 3312                                 /*
 3313                                  * Put IEEE entry in first SGE in main message.
 3314                                  * (Simple element, System addr, not end of
 3315                                  * list.)
 3316                                  */
 3317                                 *ptr_first_sgl = htole32((uint32_t)paddr);
 3318                                 *(ptr_first_sgl + 1) =
 3319                                     htole32((uint32_t)((uint64_t)paddr >> 32));
 3320                                 *(ptr_first_sgl + 2) = htole32(entry_len);
 3321                                 *(ptr_first_sgl + 3) = 0;
 3322 
 3323                                 /* No longer the first PRP entry. */
 3324                                 first_prp_entry = 0;
 3325                         } else {
 3326                                 /* Put entry in list. */
 3327                                 *ptr_sgl = htole32((uint32_t)paddr);
 3328                                 *(ptr_sgl + 1) =
 3329                                     htole32((uint32_t)((uint64_t)paddr >> 32));
 3330 
 3331                                 /* Bump ptr_sgl, msg_phys, and num_entries. */
 3332                                 ptr_sgl += sge_dwords;
 3333                                 msg_phys = (uint64_t *)((uint8_t *)msg_phys +
 3334                                     prp_size);
 3335                                 num_entries++;
 3336                         }
 3337 
 3338                         /* Bump the phys address by the entry_len. */
 3339                         paddr += entry_len;
 3340 
 3341                         /* Decrement length accounting for last partial page. */
 3342                         if (entry_len > length)
 3343                                 length = 0;
 3344                         else
 3345                                 length -= entry_len;
 3346                 }
 3347         }
 3348 
 3349         /* Set chain element Length. */
 3350         main_chain_element->Length = htole32(num_entries * prp_size);
 3351 
 3352         /* Return 0, indicating we built a native SGL. */
 3353         return 0;
 3354 }
 3355 
 3356 /*
 3357  * Add a chain element as the next SGE for the specified command.
 3358  * Reset cm_sge and cm_sglsize to indicate all the available space. Chains are
 3359  * only required for IEEE commands.  Therefore there is no code for commands
 3360  * that have the MPR_CM_FLAGS_SGE_SIMPLE flag set (and those commands
 3361  * shouldn't be requesting chains).
 3362  */
 3363 static int
 3364 mpr_add_chain(struct mpr_command *cm, int segsleft)
 3365 {
 3366         struct mpr_softc *sc = cm->cm_sc;
 3367         MPI2_REQUEST_HEADER *req;
 3368         MPI25_IEEE_SGE_CHAIN64 *ieee_sgc;
 3369         struct mpr_chain *chain;
 3370         int sgc_size, current_segs, rem_segs, segs_per_frame;
 3371         uint8_t next_chain_offset = 0;
 3372 
 3373         /*
 3374          * Fail if a command is requesting a chain for SIMPLE SGE's.  For SAS3
 3375          * only IEEE commands should be requesting chains.  Return some error
 3376          * code other than 0.
 3377          */
 3378         if (cm->cm_flags & MPR_CM_FLAGS_SGE_SIMPLE) {
 3379                 mpr_dprint(sc, MPR_ERROR, "A chain element cannot be added to "
 3380                     "an MPI SGL.\n");
 3381                 return(ENOBUFS);
 3382         }
 3383 
 3384         sgc_size = sizeof(MPI25_IEEE_SGE_CHAIN64);
 3385         if (cm->cm_sglsize < sgc_size)
 3386                 panic("MPR: Need SGE Error Code\n");
 3387 
 3388         chain = mpr_alloc_chain(cm->cm_sc);
 3389         if (chain == NULL)
 3390                 return (ENOBUFS);
 3391 
 3392         /*
 3393          * Note: a doubly-linked list is used to make it easier to walk for
 3394          * debugging.
 3395          */
 3396         TAILQ_INSERT_TAIL(&cm->cm_chain_list, chain, chain_link);
 3397 
 3398         /*
 3399          * Need to know if the number of frames left is more than 1 or not.  If
 3400          * more than 1 frame is required, NextChainOffset will need to be set,
 3401          * which will just be the last segment of the frame.
 3402          */
 3403         rem_segs = 0;
 3404         if (cm->cm_sglsize < (sgc_size * segsleft)) {
 3405                 /*
 3406                  * rem_segs is the number of segments remaining after the
 3407                  * segments that will go into the current frame.  Since it is
 3408                  * known that at least one more frame is required, account for
 3409                  * the chain element.  To know if more than one more frame is
 3410                  * required, just check if there will be a remainder after using
 3411                  * the current frame (with this chain) and the next frame.  If
 3412                  * so the NextChainOffset must be the last element of the next
 3413                  * frame.
 3414                  */
 3415                 current_segs = (cm->cm_sglsize / sgc_size) - 1;
 3416                 rem_segs = segsleft - current_segs;
 3417                 segs_per_frame = sc->chain_frame_size / sgc_size;
 3418                 if (rem_segs > segs_per_frame) {
 3419                         next_chain_offset = segs_per_frame - 1;
 3420                 }
 3421         }
 3422         ieee_sgc = &((MPI25_SGE_IO_UNION *)cm->cm_sge)->IeeeChain;
 3423         ieee_sgc->Length = next_chain_offset ?
 3424             htole32((uint32_t)sc->chain_frame_size) :
 3425             htole32((uint32_t)rem_segs * (uint32_t)sgc_size);
 3426         ieee_sgc->NextChainOffset = next_chain_offset;
 3427         ieee_sgc->Flags = (MPI2_IEEE_SGE_FLAGS_CHAIN_ELEMENT |
 3428             MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
 3429         ieee_sgc->Address.Low = htole32(chain->chain_busaddr);
 3430         ieee_sgc->Address.High = htole32(chain->chain_busaddr >> 32);
 3431         cm->cm_sge = &((MPI25_SGE_IO_UNION *)chain->chain)->IeeeSimple;
 3432         req = (MPI2_REQUEST_HEADER *)cm->cm_req;
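              /*
               * ChainOffset is expressed in 16-byte units (hence the shift by
               * 4) and here points at the last 16-byte slot of a chain frame.
               */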
 3433         req->ChainOffset = (sc->chain_frame_size - sgc_size) >> 4;
 3434 
 3435         cm->cm_sglsize = sc->chain_frame_size;
 3436         return (0);
 3437 }
 3438 
 3439 /*
 3440  * Add one scatter-gather element to the scatter-gather list for a command.
 3441  * Maintain cm_sglsize and cm_sge as the remaining size and pointer to the
 3442  * next SGE to fill in, respectively.  In Gen3, the MPI SGL does not have a
 3443  * chain, so don't consider any chain additions.
 3444  */
 3445 int
 3446 mpr_push_sge(struct mpr_command *cm, MPI2_SGE_SIMPLE64 *sge, size_t len,
 3447     int segsleft)
 3448 {
 3449         uint32_t saved_buf_len, saved_address_low, saved_address_high;
 3450         u32 sge_flags;
 3451 
 3452         /*
 3453          * case 1: >=1 more segment, no room for anything (error)
 3454          * case 2: 1 more segment and enough room for it
 3455          */
 3456 
 3457         if (cm->cm_sglsize < (segsleft * sizeof(MPI2_SGE_SIMPLE64))) {
 3458                 mpr_dprint(cm->cm_sc, MPR_ERROR,
 3459                     "%s: warning: Not enough room for MPI SGL in frame.\n",
 3460                     __func__);
 3461                 return(ENOBUFS);
 3462         }
 3463 
 3464         KASSERT(segsleft == 1,
 3465             ("segsleft cannot be more than 1 for an MPI SGL; segsleft = %d\n",
 3466             segsleft));
 3467 
 3468         /*
 3469          * There is one more segment left to add for the MPI SGL and there is
 3470          * enough room in the frame to add it.  This is the normal case because
 3471          * MPI SGL's don't have chains, otherwise something is wrong.
 3472          *
 3473          * If this is a bi-directional request, need to account for that
 3474          * here.  Save the pre-filled sge values.  These will be used
 3475          * either for the 2nd SGL or for a single direction SGL.  If
 3476          * cm_out_len is non-zero, this is a bi-directional request, so
 3477          * fill in the OUT SGL first, then the IN SGL, otherwise just
 3478          * fill in the IN SGL.  Note that at this time, when filling in
 3479          * 2 SGL's for a bi-directional request, they both use the same
 3480          * DMA buffer (same cm command).
 3481          */
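              /*
               * MPI2_SGE_SIMPLE64 packs the flags into bits 31:24 of
               * FlagsLength (MPI2_SGE_FLAGS_SHIFT is 24) and the length into
               * bits 23:0, hence the 0x00FFFFFF mask.
               */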
 3482         saved_buf_len = sge->FlagsLength & 0x00FFFFFF;
 3483         saved_address_low = sge->Address.Low;
 3484         saved_address_high = sge->Address.High;
 3485         if (cm->cm_out_len) {
 3486                 sge->FlagsLength = cm->cm_out_len |
 3487                     ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
 3488                     MPI2_SGE_FLAGS_END_OF_BUFFER |
 3489                     MPI2_SGE_FLAGS_HOST_TO_IOC |
 3490                     MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
 3491                     MPI2_SGE_FLAGS_SHIFT);
 3492                 cm->cm_sglsize -= len;
 3493                 /* Endian Safe code */
 3494                 sge_flags = sge->FlagsLength;
 3495                 sge->FlagsLength = htole32(sge_flags);
 3496                 bcopy(sge, cm->cm_sge, len);
 3497                 cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
 3498         }
 3499         sge->FlagsLength = saved_buf_len |
 3500             ((uint32_t)(MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
 3501             MPI2_SGE_FLAGS_END_OF_BUFFER |
 3502             MPI2_SGE_FLAGS_LAST_ELEMENT |
 3503             MPI2_SGE_FLAGS_END_OF_LIST |
 3504             MPI2_SGE_FLAGS_64_BIT_ADDRESSING) <<
 3505             MPI2_SGE_FLAGS_SHIFT);
 3506         if (cm->cm_flags & MPR_CM_FLAGS_DATAIN) {
 3507                 sge->FlagsLength |=
 3508                     ((uint32_t)(MPI2_SGE_FLAGS_IOC_TO_HOST) <<
 3509                     MPI2_SGE_FLAGS_SHIFT);
 3510         } else {
 3511                 sge->FlagsLength |=
 3512                     ((uint32_t)(MPI2_SGE_FLAGS_HOST_TO_IOC) <<
 3513                     MPI2_SGE_FLAGS_SHIFT);
 3514         }
 3515         sge->Address.Low = saved_address_low;
 3516         sge->Address.High = saved_address_high;
 3517 
 3518         cm->cm_sglsize -= len;
 3519         /* Endian Safe code */
 3520         sge_flags = sge->FlagsLength;
 3521         sge->FlagsLength = htole32(sge_flags);
 3522         bcopy(sge, cm->cm_sge, len);
 3523         cm->cm_sge = (MPI2_SGE_IO_UNION *)((uintptr_t)cm->cm_sge + len);
 3524         return (0);
 3525 }
 3526 
 3527 /*
 3528  * Add one IEEE scatter-gather element (chain or simple) to the IEEE scatter-
 3529  * gather list for a command.  Maintain cm_sglsize and cm_sge as the
 3530  * remaining size and pointer to the next SGE to fill in, respectively.
 3531  */
 3532 int
 3533 mpr_push_ieee_sge(struct mpr_command *cm, void *sgep, int segsleft)
 3534 {
 3535         MPI2_IEEE_SGE_SIMPLE64 *sge = sgep;
 3536         int error, ieee_sge_size = sizeof(MPI25_SGE_IO_UNION);
 3537         uint32_t saved_buf_len, saved_address_low, saved_address_high;
 3538         uint32_t sge_length;
 3539 
 3540         /*
 3541          * case 1: No room for chain or segment (error).
 3542          * case 2: Two or more segments left but only room for chain.
 3543          * case 3: Last segment and room for it, so set flags.
 3544          */
 3545 
 3546         /*
 3547          * There should be room for at least one element, or there is a big
 3548          * problem.
 3549          */
 3550         if (cm->cm_sglsize < ieee_sge_size)
 3551                 panic("MPR: Need SGE Error Code\n");
 3552 
 3553         if ((segsleft >= 2) && (cm->cm_sglsize < (ieee_sge_size * 2))) {
 3554                 if ((error = mpr_add_chain(cm, segsleft)) != 0)
 3555                         return (error);
 3556         }
 3557 
 3558         if (segsleft == 1) {
 3559                 /*
 3560                  * If this is a bi-directional request, need to account for that
 3561                  * here.  Save the pre-filled sge values.  These will be used
 3562                  * either for the 2nd SGL or for a single direction SGL.  If
 3563                  * cm_out_len is non-zero, this is a bi-directional request, so
 3564                  * fill in the OUT SGL first, then the IN SGL, otherwise just
 3565                  * fill in the IN SGL.  Note that at this time, when filling in
 3566                  * 2 SGL's for a bi-directional request, they both use the same
 3567                  * DMA buffer (same cm command).
 3568                  */
 3569                 saved_buf_len = sge->Length;
 3570                 saved_address_low = sge->Address.Low;
 3571                 saved_address_high = sge->Address.High;
 3572                 if (cm->cm_out_len) {
 3573                         sge->Length = cm->cm_out_len;
 3574                         sge->Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
 3575                             MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
 3576                         cm->cm_sglsize -= ieee_sge_size;
 3577                         /* Endian Safe code */
 3578                         sge_length = sge->Length;
 3579                         sge->Length = htole32(sge_length);
 3580                         bcopy(sgep, cm->cm_sge, ieee_sge_size);
 3581                         cm->cm_sge =
 3582                             (MPI25_SGE_IO_UNION *)((uintptr_t)cm->cm_sge +
 3583                             ieee_sge_size);
 3584                 }
 3585                 sge->Length = saved_buf_len;
 3586                 sge->Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
 3587                     MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR |
 3588                     MPI25_IEEE_SGE_FLAGS_END_OF_LIST);
 3589                 sge->Address.Low = saved_address_low;
 3590                 sge->Address.High = saved_address_high;
 3591         }
 3592 
 3593         cm->cm_sglsize -= ieee_sge_size;
 3594         /* Endian Safe code */
 3595         sge_length = sge->Length;
 3596         sge->Length = htole32(sge_length);
 3597         bcopy(sgep, cm->cm_sge, ieee_sge_size);
 3598         cm->cm_sge = (MPI25_SGE_IO_UNION *)((uintptr_t)cm->cm_sge +
 3599             ieee_sge_size);
 3600         return (0);
 3601 }
 3602 
 3603 /*
 3604  * Add one dma segment to the scatter-gather list for a command.
 3605  */
 3606 int
 3607 mpr_add_dmaseg(struct mpr_command *cm, vm_paddr_t pa, size_t len, u_int flags,
 3608     int segsleft)
 3609 {
 3610         MPI2_SGE_SIMPLE64 sge;
 3611         MPI2_IEEE_SGE_SIMPLE64 ieee_sge;
 3612 
 3613         if (!(cm->cm_flags & MPR_CM_FLAGS_SGE_SIMPLE)) {
 3614                 ieee_sge.Flags = (MPI2_IEEE_SGE_FLAGS_SIMPLE_ELEMENT |
 3615                     MPI2_IEEE_SGE_FLAGS_SYSTEM_ADDR);
 3616                 ieee_sge.Length = len;
 3617                 mpr_from_u64(pa, &ieee_sge.Address);
 3618 
 3619                 return (mpr_push_ieee_sge(cm, &ieee_sge, segsleft));
 3620         } else {
 3621                 /*
 3622                  * This driver always uses 64-bit address elements for
 3623                  * simplicity.
 3624                  */
 3625                 flags |= MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
 3626                     MPI2_SGE_FLAGS_64_BIT_ADDRESSING;
 3627                 /* Endian conversion is handled in mpr_push_sge */
 3628                 sge.FlagsLength = len | (flags << MPI2_SGE_FLAGS_SHIFT);
 3629                 mpr_from_u64(pa, &sge.Address);
 3630 
 3631                 return (mpr_push_sge(cm, &sge, sizeof sge, segsleft));
 3632         }
 3633 }
 3634 
 3635 static void
 3636 mpr_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 3637 {
 3638         struct mpr_softc *sc;
 3639         struct mpr_command *cm;
 3640         u_int i, dir, sflags;
 3641 
 3642         cm = (struct mpr_command *)arg;
 3643         sc = cm->cm_sc;
 3644 
 3645         /*
 3646          * If busdma returns more segments than allowed, just print a
 3647          * warning and let the chip tell the user they did the wrong thing.
 3648          */
 3649         if ((cm->cm_max_segs != 0) && (nsegs > cm->cm_max_segs)) {
 3650                 mpr_dprint(sc, MPR_ERROR, "%s: warning: busdma returned %d "
 3651                     "segments, more than the %d allowed\n", __func__, nsegs,
 3652                     cm->cm_max_segs);
 3653         }
 3654 
 3655         /*
 3656          * Set up DMA direction flags.  Bi-directional requests are also handled
 3657          * here.  In that case, both direction flags will be set.
 3658          */
 3659         sflags = 0;
 3660         if (cm->cm_flags & MPR_CM_FLAGS_SMP_PASS) {
 3661                 /*
 3662                  * We have to add a special case for SMP passthrough, there
 3663                  * is no easy way to generically handle it.  The first
 3664                  * S/G element is used for the command (therefore the
 3665                  * direction bit needs to be set).  The second one is used
 3666                  * for the reply.  We'll leave it to the caller to make
 3667                  * sure we only have two buffers.
 3668                  */
 3669                 /*
 3670                  * Even though the busdma man page says it doesn't make
 3671                  * sense to have both direction flags, it does in this case.
 3672                  * We have one s/g element being accessed in each direction.
 3673                  */
 3674                 dir = BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD;
 3675 
 3676                 /*
 3677                  * Set the direction flag on the first buffer in the SMP
 3678                  * passthrough request.  We'll clear it for the second one.
 3679                  */
 3680                 sflags |= MPI2_SGE_FLAGS_DIRECTION |
 3681                           MPI2_SGE_FLAGS_END_OF_BUFFER;
 3682         } else if (cm->cm_flags & MPR_CM_FLAGS_DATAOUT) {
 3683                 sflags |= MPI2_SGE_FLAGS_HOST_TO_IOC;
 3684                 dir = BUS_DMASYNC_PREWRITE;
 3685         } else
 3686                 dir = BUS_DMASYNC_PREREAD;
 3687 
 3688         /* Check if a native SG list is needed for an NVMe PCIe device. */
 3689         if (cm->cm_targ && cm->cm_targ->is_nvme &&
 3690             mpr_check_pcie_native_sgl(sc, cm, segs, nsegs) == 0) {
 3691                 /* A native SG list was built, skip to end. */
 3692                 goto out;
 3693         }
 3694 
 3695         for (i = 0; i < nsegs; i++) {
 3696                 if ((cm->cm_flags & MPR_CM_FLAGS_SMP_PASS) && (i != 0)) {
 3697                         sflags &= ~MPI2_SGE_FLAGS_DIRECTION;
 3698                 }
 3699                 error = mpr_add_dmaseg(cm, segs[i].ds_addr, segs[i].ds_len,
 3700                     sflags, nsegs - i);
 3701                 if (error != 0) {
 3702                         /* Resource shortage, roll back! */
 3703                         if (ratecheck(&sc->lastfail, &mpr_chainfail_interval))
 3704                                 mpr_dprint(sc, MPR_INFO, "Out of chain frames, "
 3705                                     "consider increasing hw.mpr.max_chains.\n");
 3706                         cm->cm_flags |= MPR_CM_FLAGS_CHAIN_FAILED;
 3707                         /*
 3708                          * mpr_complete_command can only be called on commands
 3709                          * that are in the queue. Since this is an error path
 3710                          * which gets called before we enqueue, update the state
 3711                          * to meet this requirement before we complete it.
 3712                          */
 3713                         cm->cm_state = MPR_CM_STATE_INQUEUE;
 3714                         mpr_complete_command(sc, cm);
 3715                         return;
 3716                 }
 3717         }
 3718 
 3719 out:
 3720         bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap, dir);
 3721         mpr_enqueue_request(sc, cm);
 3722 
 3723         return;
 3724 }
 3725 
 3726 static void
 3727 mpr_data_cb2(void *arg, bus_dma_segment_t *segs, int nsegs, bus_size_t mapsize,
 3728              int error)
 3729 {
 3730         mpr_data_cb(arg, segs, nsegs, error);
 3731 }
 3732 
 3733 /*
 3734  * This is the routine to enqueue commands asynchronously.
 3735  * Note that the only error path here is from bus_dmamap_load(), which can
 3736  * return EINPROGRESS if it is waiting for resources.  Other than this, it's
 3737  * assumed that if you have a command in-hand, then you have enough credits
 3738  * to use it.
 3739  */
 3740 int
 3741 mpr_map_command(struct mpr_softc *sc, struct mpr_command *cm)
 3742 {
 3743         int error = 0;
 3744 
 3745         if (cm->cm_flags & MPR_CM_FLAGS_USE_UIO) {
 3746                 error = bus_dmamap_load_uio(sc->buffer_dmat, cm->cm_dmamap,
 3747                     &cm->cm_uio, mpr_data_cb2, cm, 0);
 3748         } else if (cm->cm_flags & MPR_CM_FLAGS_USE_CCB) {
 3749                 error = bus_dmamap_load_ccb(sc->buffer_dmat, cm->cm_dmamap,
 3750                     cm->cm_data, mpr_data_cb, cm, 0);
 3751         } else if ((cm->cm_data != NULL) && (cm->cm_length != 0)) {
 3752                 error = bus_dmamap_load(sc->buffer_dmat, cm->cm_dmamap,
 3753                     cm->cm_data, cm->cm_length, mpr_data_cb, cm, 0);
 3754         } else {
 3755                 /* Add a zero-length element as needed */
 3756                 if (cm->cm_sge != NULL)
 3757                         mpr_add_dmaseg(cm, 0, 0, 0, 1);
 3758                 mpr_enqueue_request(sc, cm);
 3759         }
 3760 
 3761         return (error);
 3762 }
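
/*
 * Editorial sketch, not part of the original source: a hypothetical
 * asynchronous caller of mpr_map_command().  The field usage mirrors
 * mpr_read_config_page() below; my_done_cb() and my_submit() are
 * illustrative names only, and request framing plus the cm_sge/cm_sglsize
 * setup are omitted for brevity.
 */
static void
my_done_cb(struct mpr_softc *sc, struct mpr_command *cm)
{
        /* Called from the driver's completion path; reclaim the command. */
        mpr_free_command(sc, cm);
}

static int
my_submit(struct mpr_softc *sc, void *buf, u_int len)
{
        struct mpr_command *cm;

        if ((cm = mpr_alloc_command(sc)) == NULL)
                return (EBUSY);
        cm->cm_data = buf;
        cm->cm_length = len;
        cm->cm_flags = MPR_CM_FLAGS_SGE_SIMPLE | MPR_CM_FLAGS_DATAIN;
        cm->cm_complete = my_done_cb;
        /* EINPROGRESS from the DMA load still completes asynchronously. */
        return (mpr_map_command(sc, cm));
}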
 3763 
 3764 /*
 3765  * This is the routine to enqueue commands synchronously.  An error of
 3766  * EINPROGRESS from mpr_map_command() is ignored since the command will
 3767  * be executed and enqueued automatically.  Other errors come from msleep().
 3768  */
 3769 int
 3770 mpr_wait_command(struct mpr_softc *sc, struct mpr_command **cmp, int timeout,
 3771     int sleep_flag)
 3772 {
 3773         int error, rc;
 3774         struct timeval cur_time, start_time;
 3775         struct mpr_command *cm = *cmp;
 3776 
 3777         if (sc->mpr_flags & MPR_FLAGS_DIAGRESET)
 3778                 return (EBUSY);
 3779 
 3780         cm->cm_complete = NULL;
 3781         cm->cm_flags |= (MPR_CM_FLAGS_WAKEUP | MPR_CM_FLAGS_POLLED);
 3782         error = mpr_map_command(sc, cm);
 3783         if ((error != 0) && (error != EINPROGRESS))
 3784                 return (error);
 3785 
 3786         /* Check for context and wait 50 mSec at a time until the time
 3787          * has expired or the command has finished.  If msleep can't be
 3788          * used, we have to poll. */
 3789         if (curthread->td_no_sleeping)
 3790                 sleep_flag = NO_SLEEP;
 3791         getmicrouptime(&start_time);
 3792         if (mtx_owned(&sc->mpr_mtx) && sleep_flag == CAN_SLEEP) {
 3793                 error = msleep(cm, &sc->mpr_mtx, 0, "mprwait", timeout*hz);
 3794                 if (error == EWOULDBLOCK) {
 3795                         /*
 3796                          * Record the actual elapsed time in the case of a
 3797                          * timeout for the message below.
 3798                          */
 3799                         getmicrouptime(&cur_time);
 3800                         timevalsub(&cur_time, &start_time);
 3801                 }
 3802         } else {
 3803                 while ((cm->cm_flags & MPR_CM_FLAGS_COMPLETE) == 0) {
 3804                         mpr_intr_locked(sc);
 3805                         if (sleep_flag == CAN_SLEEP)
 3806                                 pause("mprwait", hz/20);
 3807                         else
 3808                                 DELAY(50000);
 3809 
 3810                         getmicrouptime(&cur_time);
 3811                         timevalsub(&cur_time, &start_time);
 3812                         if (cur_time.tv_sec > timeout) {
 3813                                 error = EWOULDBLOCK;
 3814                                 break;
 3815                         }
 3816                 }
 3817         }
 3818 
 3819         if (error == EWOULDBLOCK) {
 3820                 if (cm->cm_timeout_handler == NULL) {
 3821                         mpr_dprint(sc, MPR_FAULT, "Calling Reinit from %s, timeout=%d,"
 3822                             " elapsed=%jd\n", __func__, timeout,
 3823                             (intmax_t)cur_time.tv_sec);
 3824                         rc = mpr_reinit(sc);
 3825                         mpr_dprint(sc, MPR_FAULT, "Reinit %s\n", (rc == 0) ? "success" :
 3826                             "failed");
 3827                 } else
 3828                         cm->cm_timeout_handler(sc, cm);
 3829                 if (sc->mpr_flags & MPR_FLAGS_REALLOCATED) {
 3830                         /*
 3831                          * Tell the caller that we freed the command in a
 3832                          * reinit.
 3833                          */
 3834                         *cmp = NULL;
 3835                 }
 3836                 error = ETIMEDOUT;
 3837         }
 3838         return (error);
 3839 }
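
/*
 * Editorial sketch, not part of the original source: a hypothetical
 * synchronous caller.  The command pointer is passed by reference because
 * a timeout-driven reinit can free it and NULL it out, as handled above;
 * the 30-second timeout is illustrative.
 */
static int
my_sync_call(struct mpr_softc *sc, struct mpr_command *cm)
{
        int error;

        error = mpr_wait_command(sc, &cm, 30, CAN_SLEEP);
        if (error != 0) {
                if (cm != NULL)
                        mpr_free_command(sc, cm);
                return (error);
        }
        /* On success the reply, if any, is at cm->cm_reply. */
        mpr_free_command(sc, cm);
        return (0);
}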
 3840 
 3841 /*
 3842  * This is the routine to enqueue a command synchronously and poll for
 3843  * completion.  Its use should be rare.
 3844  */
 3845 int
 3846 mpr_request_polled(struct mpr_softc *sc, struct mpr_command **cmp)
 3847 {
 3848         int error, rc;
 3849         struct timeval cur_time, start_time;
 3850         struct mpr_command *cm = *cmp;
 3851 
 3852         error = 0;
 3853 
 3854         cm->cm_flags |= MPR_CM_FLAGS_POLLED;
 3855         cm->cm_complete = NULL;
 3856         mpr_map_command(sc, cm);
 3857 
 3858         getmicrouptime(&start_time);
 3859         while ((cm->cm_flags & MPR_CM_FLAGS_COMPLETE) == 0) {
 3860                 mpr_intr_locked(sc);
 3861 
 3862                 if (mtx_owned(&sc->mpr_mtx))
 3863                         msleep(&sc->msleep_fake_chan, &sc->mpr_mtx, 0,
 3864                             "mprpoll", hz/20);
 3865                 else
 3866                         pause("mprpoll", hz/20);
 3867 
 3868                 /*
 3869                  * Check for real-time timeout and fail if more than 60 seconds.
 3870                  */
 3871                 getmicrouptime(&cur_time);
 3872                 timevalsub(&cur_time, &start_time);
 3873                 if (cur_time.tv_sec > 60) {
 3874                         mpr_dprint(sc, MPR_FAULT, "polling failed\n");
 3875                         error = ETIMEDOUT;
 3876                         break;
 3877                 }
 3878         }
 3879         cm->cm_state = MPR_CM_STATE_BUSY;
 3880         if (error) {
 3881                 mpr_dprint(sc, MPR_FAULT, "Calling Reinit from %s\n", __func__);
 3882                 rc = mpr_reinit(sc);
 3883                 mpr_dprint(sc, MPR_FAULT, "Reinit %s\n", (rc == 0) ? "success" :
 3884                     "failed");
 3885 
 3886                 if (sc->mpr_flags & MPR_FLAGS_REALLOCATED) {
 3887                         /*
 3888                          * Tell the caller that we freed the command in a
 3889                          * reinit.
 3890                          */
 3891                         *cmp = NULL;
 3892                 }
 3893         }
 3894         return (error);
 3895 }
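
/*
 * Editorial sketch, not part of the original source: a hypothetical polled
 * caller, of the kind used before interrupts are live.  The same
 * double-pointer contract as mpr_wait_command() applies: a reinit may have
 * freed the command and NULLed the pointer.
 */
static int
my_polled_call(struct mpr_softc *sc, struct mpr_command *cm)
{
        int error;

        error = mpr_request_polled(sc, &cm);
        if (cm != NULL)
                mpr_free_command(sc, cm);
        return (error);
}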
 3896 
 3897 /*
 3898  * The MPT driver had a verbose interface for config pages.  In this driver,
 3899  * reduce it to much simpler terms, similar to the Linux driver.
 3900  */
 3901 int
 3902 mpr_read_config_page(struct mpr_softc *sc, struct mpr_config_params *params)
 3903 {
 3904         MPI2_CONFIG_REQUEST *req;
 3905         struct mpr_command *cm;
 3906         int error;
 3907 
 3908         if (sc->mpr_flags & MPR_FLAGS_BUSY) {
 3909                 return (EBUSY);
 3910         }
 3911 
 3912         cm = mpr_alloc_command(sc);
 3913         if (cm == NULL) {
 3914                 return (EBUSY);
 3915         }
 3916 
 3917         req = (MPI2_CONFIG_REQUEST *)cm->cm_req;
 3918         req->Function = MPI2_FUNCTION_CONFIG;
 3919         req->Action = params->action;
 3920         req->SGLFlags = 0;
 3921         req->ChainOffset = 0;
 3922         req->PageAddress = params->page_address;
 3923         if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) {
 3924                 MPI2_CONFIG_EXTENDED_PAGE_HEADER *hdr;
 3925 
 3926                 hdr = &params->hdr.Ext;
 3927                 req->ExtPageType = hdr->ExtPageType;
 3928                 req->ExtPageLength = hdr->ExtPageLength;
 3929                 req->Header.PageType = MPI2_CONFIG_PAGETYPE_EXTENDED;
 3930                 req->Header.PageLength = 0; /* Must be set to zero */
 3931                 req->Header.PageNumber = hdr->PageNumber;
 3932                 req->Header.PageVersion = hdr->PageVersion;
 3933         } else {
 3934                 MPI2_CONFIG_PAGE_HEADER *hdr;
 3935 
 3936                 hdr = &params->hdr.Struct;
 3937                 req->Header.PageType = hdr->PageType;
 3938                 req->Header.PageNumber = hdr->PageNumber;
 3939                 req->Header.PageLength = hdr->PageLength;
 3940                 req->Header.PageVersion = hdr->PageVersion;
 3941         }
 3942 
 3943         cm->cm_data = params->buffer;
 3944         cm->cm_length = params->length;
 3945         if (cm->cm_data != NULL) {
 3946                 cm->cm_sge = &req->PageBufferSGE;
 3947                 cm->cm_sglsize = sizeof(MPI2_SGE_IO_UNION);
 3948                 cm->cm_flags = MPR_CM_FLAGS_SGE_SIMPLE | MPR_CM_FLAGS_DATAIN;
 3949         } else
 3950                 cm->cm_sge = NULL;
 3951         cm->cm_desc.Default.RequestFlags = MPI2_REQ_DESCRIPT_FLAGS_DEFAULT_TYPE;
 3952 
 3953         cm->cm_complete_data = params;
 3954         if (params->callback != NULL) {
 3955                 cm->cm_complete = mpr_config_complete;
 3956                 return (mpr_map_command(sc, cm));
 3957         } else {
 3958                 error = mpr_wait_command(sc, &cm, 0, CAN_SLEEP);
 3959                 if (error) {
 3960                         mpr_dprint(sc, MPR_FAULT,
 3961                             "Error %d reading config page\n", error);
 3962                         if (cm != NULL)
 3963                                 mpr_free_command(sc, cm);
 3964                         return (error);
 3965                 }
 3966                 mpr_config_complete(sc, cm);
 3967         }
 3968 
 3969         return (0);
 3970 }
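
/*
 * Editorial sketch, not part of the original source: a hypothetical
 * synchronous page read through mpr_read_config_page().  The MPI2_*
 * constants are from the MPI 2.x headers; a real caller would first fetch
 * the page header to learn the correct length and version.
 */
static int
my_read_manufacturing_page0(struct mpr_softc *sc, void *buf, u_int len)
{
        struct mpr_config_params params;

        bzero(&params, sizeof(params));
        params.action = MPI2_CONFIG_ACTION_PAGE_READ_CURRENT;
        params.page_address = 0;
        params.hdr.Struct.PageType = MPI2_CONFIG_PAGETYPE_MANUFACTURING;
        params.hdr.Struct.PageNumber = 0;
        params.hdr.Struct.PageLength = len / 4; /* length is in dwords */
        params.buffer = buf;
        params.length = len;
        params.callback = NULL;         /* NULL selects the synchronous path */
        return (mpr_read_config_page(sc, &params));
}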
 3971 
 3972 int
 3973 mpr_write_config_page(struct mpr_softc *sc, struct mpr_config_params *params)
 3974 {
 3975         return (EINVAL);
 3976 }
 3977 
 3978 static void
 3979 mpr_config_complete(struct mpr_softc *sc, struct mpr_command *cm)
 3980 {
 3981         MPI2_CONFIG_REPLY *reply;
 3982         struct mpr_config_params *params;
 3983 
 3984         MPR_FUNCTRACE(sc);
 3985         params = cm->cm_complete_data;
 3986 
 3987         if (cm->cm_data != NULL) {
 3988                 bus_dmamap_sync(sc->buffer_dmat, cm->cm_dmamap,
 3989                     BUS_DMASYNC_POSTREAD);
 3990                 bus_dmamap_unload(sc->buffer_dmat, cm->cm_dmamap);
 3991         }
 3992 
 3993         /*
 3994          * XXX KDM need to do more error recovery?  This results in the
 3995          * device in question not getting probed.
 3996          */
 3997         if ((cm->cm_flags & MPR_CM_FLAGS_ERROR_MASK) != 0) {
 3998                 params->status = MPI2_IOCSTATUS_BUSY;
 3999                 goto done;
 4000         }
 4001 
 4002         reply = (MPI2_CONFIG_REPLY *)cm->cm_reply;
 4003         if (reply == NULL) {
 4004                 params->status = MPI2_IOCSTATUS_BUSY;
 4005                 goto done;
 4006         }
 4007         params->status = reply->IOCStatus;
 4008         if (params->hdr.Struct.PageType == MPI2_CONFIG_PAGETYPE_EXTENDED) {
 4009                 params->hdr.Ext.ExtPageType = reply->ExtPageType;
 4010                 params->hdr.Ext.ExtPageLength = reply->ExtPageLength;
 4011                 params->hdr.Ext.PageType = reply->Header.PageType;
 4012                 params->hdr.Ext.PageNumber = reply->Header.PageNumber;
 4013                 params->hdr.Ext.PageVersion = reply->Header.PageVersion;
 4014         } else {
 4015                 params->hdr.Struct.PageType = reply->Header.PageType;
 4016                 params->hdr.Struct.PageNumber = reply->Header.PageNumber;
 4017                 params->hdr.Struct.PageLength = reply->Header.PageLength;
 4018                 params->hdr.Struct.PageVersion = reply->Header.PageVersion;
 4019         }
 4020 
 4021 done:
 4022         mpr_free_command(sc, cm);
 4023         if (params->callback != NULL)
 4024                 params->callback(sc, params);
 4025 
 4026         return;
 4027 }
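
/*
 * Editorial sketch, not part of the original source: the asynchronous
 * variant.  mpr_config_complete() above invokes params->callback after
 * filling in params->status, so a hypothetical handler only needs to
 * inspect that field.
 */
static void
my_config_cb(struct mpr_softc *sc, struct mpr_config_params *params)
{
        if (params->status != MPI2_IOCSTATUS_SUCCESS)
                mpr_dprint(sc, MPR_FAULT, "config page read failed: 0x%x\n",
                    params->status);
}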
