FreeBSD/Linux Kernel Cross Reference
sys/net/iflib.c


    1 /*-
    2  * Copyright (c) 2014-2018, Matthew Macy <mmacy@mattmacy.io>
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions are met:
    7  *
    8  *  1. Redistributions of source code must retain the above copyright notice,
    9  *     this list of conditions and the following disclaimer.
   10  *
   11  *  2. Neither the name of Matthew Macy nor the names of its
   12  *     contributors may be used to endorse or promote products derived from
   13  *     this software without specific prior written permission.
   14  *
   15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   16  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   18  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
   19  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   20  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   21  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   22  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   23  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   24  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   25  * POSSIBILITY OF SUCH DAMAGE.
   26  */
   27 
   28 #include <sys/cdefs.h>
   29 __FBSDID("$FreeBSD$");
   30 
   31 #include "opt_inet.h"
   32 #include "opt_inet6.h"
   33 #include "opt_acpi.h"
   34 #include "opt_sched.h"
   35 
   36 #include <sys/param.h>
   37 #include <sys/types.h>
   38 #include <sys/bus.h>
   39 #include <sys/eventhandler.h>
   40 #include <sys/jail.h>
   41 #include <sys/kernel.h>
   42 #include <sys/lock.h>
   43 #include <sys/md5.h>
   44 #include <sys/mutex.h>
   45 #include <sys/module.h>
   46 #include <sys/kobj.h>
   47 #include <sys/rman.h>
   48 #include <sys/proc.h>
   49 #include <sys/sbuf.h>
   50 #include <sys/smp.h>
   51 #include <sys/socket.h>
   52 #include <sys/sockio.h>
   53 #include <sys/sysctl.h>
   54 #include <sys/syslog.h>
   55 #include <sys/taskqueue.h>
   56 #include <sys/limits.h>
   57 
   58 #include <net/if.h>
   59 #include <net/if_var.h>
   60 #include <net/if_types.h>
   61 #include <net/if_media.h>
   62 #include <net/bpf.h>
   63 #include <net/ethernet.h>
   64 #include <net/mp_ring.h>
   65 #include <net/vnet.h>
   66 
   67 #include <netinet/in.h>
   68 #include <netinet/in_pcb.h>
   69 #include <netinet/tcp_lro.h>
   70 #include <netinet/in_systm.h>
   71 #include <netinet/if_ether.h>
   72 #include <netinet/ip.h>
   73 #include <netinet/ip6.h>
   74 #include <netinet/tcp.h>
   75 #include <netinet/ip_var.h>
   76 #include <netinet/netdump/netdump.h>
   77 #include <netinet6/ip6_var.h>
   78 
   79 #include <machine/bus.h>
   80 #include <machine/in_cksum.h>
   81 
   82 #include <vm/vm.h>
   83 #include <vm/pmap.h>
   84 
   85 #include <dev/led/led.h>
   86 #include <dev/pci/pcireg.h>
   87 #include <dev/pci/pcivar.h>
   88 #include <dev/pci/pci_private.h>
   89 
   90 #include <net/iflib.h>
   91 #include <net/iflib_private.h>
   92 
   93 #include "ifdi_if.h"
   94 
   95 #ifdef PCI_IOV
   96 #include <dev/pci/pci_iov.h>
   97 #endif
   98 
   99 #include <sys/bitstring.h>
  100 /*
   101  * enable accounting of every mbuf as it comes into and goes out of
  102  * iflib's software descriptor references
  103  */
  104 #define MEMORY_LOGGING 0
  105 /*
  106  * Enable mbuf vectors for compressing long mbuf chains
  107  */
  108 
  109 /*
  110  * NB:
  111  * - Prefetching in tx cleaning should perhaps be a tunable. The distance ahead
  112  *   we prefetch needs to be determined by the time spent in m_free vis a vis
  113  *   the cost of a prefetch. This will of course vary based on the workload:
  114  *      - NFLX's m_free path is dominated by vm-based M_EXT manipulation which
  115  *        is quite expensive, thus suggesting very little prefetch.
  116  *      - small packet forwarding which is just returning a single mbuf to
  117  *        UMA will typically be very fast vis a vis the cost of a memory
  118  *        access.
  119  */
  120 
  121 
  122 /*
  123  * File organization:
  124  *  - private structures
  125  *  - iflib private utility functions
  126  *  - ifnet functions
  127  *  - vlan registry and other exported functions
  128  *  - iflib public core functions
  129  *
  130  *
  131  */
  132 MALLOC_DEFINE(M_IFLIB, "iflib", "ifnet library");
  133 
  134 #define IFLIB_RXEOF_MORE (1U << 0)
  135 #define IFLIB_RXEOF_EMPTY (2U << 0)
  136 
  137 struct iflib_txq;
  138 typedef struct iflib_txq *iflib_txq_t;
  139 struct iflib_rxq;
  140 typedef struct iflib_rxq *iflib_rxq_t;
  141 struct iflib_fl;
  142 typedef struct iflib_fl *iflib_fl_t;
  143 
  144 struct iflib_ctx;
  145 
  146 static void iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid);
  147 static void iflib_timer(void *arg);
  148 static void iflib_tqg_detach(if_ctx_t ctx);
  149 
  150 typedef struct iflib_filter_info {
  151         driver_filter_t *ifi_filter;
  152         void *ifi_filter_arg;
  153         struct grouptask *ifi_task;
  154         void *ifi_ctx;
  155 } *iflib_filter_info_t;
  156 
  157 struct iflib_ctx {
  158         KOBJ_FIELDS;
  159         /*
  160          * Pointer to hardware driver's softc
  161          */
  162         void *ifc_softc;
  163         device_t ifc_dev;
  164         if_t ifc_ifp;
  165 
  166         cpuset_t ifc_cpus;
  167         if_shared_ctx_t ifc_sctx;
  168         struct if_softc_ctx ifc_softc_ctx;
  169 
  170         struct sx ifc_ctx_sx;
  171         struct mtx ifc_state_mtx;
  172 
  173         iflib_txq_t ifc_txqs;
  174         iflib_rxq_t ifc_rxqs;
  175         uint32_t ifc_if_flags;
  176         uint32_t ifc_flags;
  177         uint32_t ifc_max_fl_buf_size;
  178         uint32_t ifc_rx_mbuf_sz;
  179 
  180         int ifc_link_state;
  181         int ifc_watchdog_events;
  182         struct cdev *ifc_led_dev;
  183         struct resource *ifc_msix_mem;
  184 
  185         struct if_irq ifc_legacy_irq;
  186         struct grouptask ifc_admin_task;
  187         struct grouptask ifc_vflr_task;
  188         struct iflib_filter_info ifc_filter_info;
  189         struct ifmedia  ifc_media;
  190 
  191         struct sysctl_oid *ifc_sysctl_node;
  192         uint16_t ifc_sysctl_ntxqs;
  193         uint16_t ifc_sysctl_nrxqs;
  194         uint16_t ifc_sysctl_qs_eq_override;
  195         uint16_t ifc_sysctl_rx_budget;
  196         uint16_t ifc_sysctl_tx_abdicate;
  197         uint16_t ifc_sysctl_core_offset;
  198 #define CORE_OFFSET_UNSPECIFIED 0xffff
  199         uint8_t  ifc_sysctl_separate_txrx;
  200         uint8_t  ifc_sysctl_use_logical_cores;
  201         bool     ifc_cpus_are_physical_cores;
  202 
  203         qidx_t ifc_sysctl_ntxds[8];
  204         qidx_t ifc_sysctl_nrxds[8];
  205         struct if_txrx ifc_txrx;
  206 #define isc_txd_encap  ifc_txrx.ift_txd_encap
  207 #define isc_txd_flush  ifc_txrx.ift_txd_flush
  208 #define isc_txd_credits_update  ifc_txrx.ift_txd_credits_update
  209 #define isc_rxd_available ifc_txrx.ift_rxd_available
  210 #define isc_rxd_pkt_get ifc_txrx.ift_rxd_pkt_get
  211 #define isc_rxd_refill ifc_txrx.ift_rxd_refill
  212 #define isc_rxd_flush ifc_txrx.ift_rxd_flush
  213 #define isc_legacy_intr ifc_txrx.ift_legacy_intr
  214 #define isc_txq_select ifc_txrx.ift_txq_select
  215         eventhandler_tag ifc_vlan_attach_event;
  216         eventhandler_tag ifc_vlan_detach_event;
  217         uint8_t ifc_mac[ETHER_ADDR_LEN];
  218 };
  219 
  220 void *
  221 iflib_get_softc(if_ctx_t ctx)
  222 {
  223 
  224         return (ctx->ifc_softc);
  225 }
  226 
  227 device_t
  228 iflib_get_dev(if_ctx_t ctx)
  229 {
  230 
  231         return (ctx->ifc_dev);
  232 }
  233 
  234 if_t
  235 iflib_get_ifp(if_ctx_t ctx)
  236 {
  237 
  238         return (ctx->ifc_ifp);
  239 }
  240 
  241 struct ifmedia *
  242 iflib_get_media(if_ctx_t ctx)
  243 {
  244 
  245         return (&ctx->ifc_media);
  246 }
  247 
  248 uint32_t
  249 iflib_get_flags(if_ctx_t ctx)
  250 {
  251         return (ctx->ifc_flags);
  252 }
  253 
  254 void
  255 iflib_set_mac(if_ctx_t ctx, uint8_t mac[ETHER_ADDR_LEN])
  256 {
  257 
  258         bcopy(mac, ctx->ifc_mac, ETHER_ADDR_LEN);
  259 }
  260 
  261 if_softc_ctx_t
  262 iflib_get_softc_ctx(if_ctx_t ctx)
  263 {
  264 
  265         return (&ctx->ifc_softc_ctx);
  266 }
  267 
  268 if_shared_ctx_t
  269 iflib_get_sctx(if_ctx_t ctx)
  270 {
  271 
  272         return (ctx->ifc_sctx);
  273 }
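/*
 * Illustrative sketch: how a hardware driver typically consumes the
 * accessors above from its attach path.  The "mydrv" names and softc
 * layout below are hypothetical and not part of iflib.
 */
#if 0
struct mydrv_softc {
        device_t        dev;
        if_t            ifp;
        if_softc_ctx_t  scctx;
        if_shared_ctx_t sctx;
        struct ifmedia  *media;
};

static int
mydrv_if_attach_pre(if_ctx_t ctx)
{
        struct mydrv_softc *sc = iflib_get_softc(ctx);

        sc->dev = iflib_get_dev(ctx);
        sc->ifp = iflib_get_ifp(ctx);
        sc->media = iflib_get_media(ctx);
        sc->scctx = iflib_get_softc_ctx(ctx);
        sc->sctx = iflib_get_sctx(ctx);
        return (0);
}
#endif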
  274 
  275 #define IP_ALIGNED(m) ((((uintptr_t)(m)->m_data) & 0x3) == 0x2)
  276 #define CACHE_PTR_INCREMENT (CACHE_LINE_SIZE/sizeof(void*))
   277 #define CACHE_PTR_NEXT(ptr) ((void *)(((uintptr_t)(ptr)+CACHE_LINE_SIZE-1) & ~((uintptr_t)(CACHE_LINE_SIZE-1))))
  278 
  279 #define LINK_ACTIVE(ctx) ((ctx)->ifc_link_state == LINK_STATE_UP)
  280 #define CTX_IS_VF(ctx) ((ctx)->ifc_sctx->isc_flags & IFLIB_IS_VF)
  281 
  282 typedef struct iflib_sw_rx_desc_array {
  283         bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
  284         struct mbuf     **ifsd_m;           /* pkthdr mbufs */
  285         caddr_t         *ifsd_cl;          /* direct cluster pointer for rx */
  286         bus_addr_t      *ifsd_ba;          /* bus addr of cluster for rx */
  287 } iflib_rxsd_array_t;
  288 
  289 typedef struct iflib_sw_tx_desc_array {
  290         bus_dmamap_t    *ifsd_map;         /* bus_dma maps for packet */
  291         bus_dmamap_t    *ifsd_tso_map;     /* bus_dma maps for TSO packet */
  292         struct mbuf    **ifsd_m;           /* pkthdr mbufs */
  293 } if_txsd_vec_t;
  294 
  295 /* magic number that should be high enough for any hardware */
  296 #define IFLIB_MAX_TX_SEGS               128
  297 #define IFLIB_RX_COPY_THRESH            128
  298 #define IFLIB_MAX_RX_REFRESH            32
  299 /* The minimum descriptors per second before we start coalescing */
  300 #define IFLIB_MIN_DESC_SEC              16384
  301 #define IFLIB_DEFAULT_TX_UPDATE_FREQ    16
  302 #define IFLIB_QUEUE_IDLE                0
  303 #define IFLIB_QUEUE_HUNG                1
  304 #define IFLIB_QUEUE_WORKING             2
  305 /* maximum number of txqs that can share an rx interrupt */
  306 #define IFLIB_MAX_TX_SHARED_INTR        4
  307 
  308 /* this should really scale with ring size - this is a fairly arbitrary value */
  309 #define TX_BATCH_SIZE                   32
  310 
  311 #define IFLIB_RESTART_BUDGET            8
  312 
  313 #define CSUM_OFFLOAD            (CSUM_IP_TSO|CSUM_IP6_TSO|CSUM_IP| \
  314                                  CSUM_IP_UDP|CSUM_IP_TCP|CSUM_IP_SCTP| \
  315                                  CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP)
  316 
  317 struct iflib_txq {
  318         qidx_t          ift_in_use;
  319         qidx_t          ift_cidx;
  320         qidx_t          ift_cidx_processed;
  321         qidx_t          ift_pidx;
  322         uint8_t         ift_gen;
  323         uint8_t         ift_br_offset;
  324         uint16_t        ift_npending;
  325         uint16_t        ift_db_pending;
  326         uint16_t        ift_rs_pending;
  327         /* implicit pad */
  328         uint8_t         ift_txd_size[8];
  329         uint64_t        ift_processed;
  330         uint64_t        ift_cleaned;
  331         uint64_t        ift_cleaned_prev;
  332 #if MEMORY_LOGGING
  333         uint64_t        ift_enqueued;
  334         uint64_t        ift_dequeued;
  335 #endif
  336         uint64_t        ift_no_tx_dma_setup;
  337         uint64_t        ift_no_desc_avail;
  338         uint64_t        ift_mbuf_defrag_failed;
  339         uint64_t        ift_mbuf_defrag;
  340         uint64_t        ift_map_failed;
  341         uint64_t        ift_txd_encap_efbig;
  342         uint64_t        ift_pullups;
  343         uint64_t        ift_last_timer_tick;
  344 
  345         struct mtx      ift_mtx;
  346         struct mtx      ift_db_mtx;
  347 
  348         /* constant values */
  349         if_ctx_t        ift_ctx;
  350         struct ifmp_ring        *ift_br;
  351         struct grouptask        ift_task;
  352         qidx_t          ift_size;
  353         uint16_t        ift_id;
  354         struct callout  ift_timer;
  355 #ifdef DEV_NETMAP
  356         struct callout  ift_netmap_timer;
  357 #endif /* DEV_NETMAP */
  358 
  359         if_txsd_vec_t   ift_sds;
  360         uint8_t         ift_qstatus;
  361         uint8_t         ift_closed;
  362         uint8_t         ift_update_freq;
  363         struct iflib_filter_info ift_filter_info;
  364         bus_dma_tag_t   ift_buf_tag;
  365         bus_dma_tag_t   ift_tso_buf_tag;
  366         iflib_dma_info_t        ift_ifdi;
  367 #define MTX_NAME_LEN    32
  368         char                    ift_mtx_name[MTX_NAME_LEN];
  369         bus_dma_segment_t       ift_segs[IFLIB_MAX_TX_SEGS]  __aligned(CACHE_LINE_SIZE);
  370 #ifdef IFLIB_DIAGNOSTICS
  371         uint64_t ift_cpu_exec_count[256];
  372 #endif
  373 } __aligned(CACHE_LINE_SIZE);
  374 
  375 struct iflib_fl {
  376         qidx_t          ifl_cidx;
  377         qidx_t          ifl_pidx;
  378         qidx_t          ifl_credits;
  379         uint8_t         ifl_gen;
  380         uint8_t         ifl_rxd_size;
  381 #if MEMORY_LOGGING
  382         uint64_t        ifl_m_enqueued;
  383         uint64_t        ifl_m_dequeued;
  384         uint64_t        ifl_cl_enqueued;
  385         uint64_t        ifl_cl_dequeued;
  386 #endif
  387         /* implicit pad */
  388 
  389         bitstr_t        *ifl_rx_bitmap;
  390         qidx_t          ifl_fragidx;
  391         /* constant */
  392         qidx_t          ifl_size;
  393         uint16_t        ifl_buf_size;
  394         uint16_t        ifl_cltype;
  395         uma_zone_t      ifl_zone;
  396         iflib_rxsd_array_t      ifl_sds;
  397         iflib_rxq_t     ifl_rxq;
  398         uint8_t         ifl_id;
  399         bus_dma_tag_t   ifl_buf_tag;
  400         iflib_dma_info_t        ifl_ifdi;
  401         uint64_t        ifl_bus_addrs[IFLIB_MAX_RX_REFRESH] __aligned(CACHE_LINE_SIZE);
  402         qidx_t          ifl_rxd_idxs[IFLIB_MAX_RX_REFRESH];
  403 }  __aligned(CACHE_LINE_SIZE);
  404 
  405 static inline qidx_t
  406 get_inuse(int size, qidx_t cidx, qidx_t pidx, uint8_t gen)
  407 {
  408         qidx_t used;
  409 
  410         if (pidx > cidx)
  411                 used = pidx - cidx;
  412         else if (pidx < cidx)
  413                 used = size - cidx + pidx;
  414         else if (gen == 0 && pidx == cidx)
  415                 used = 0;
  416         else if (gen == 1 && pidx == cidx)
  417                 used = size;
  418         else
  419                 panic("bad state");
  420 
  421         return (used);
  422 }
  423 
  424 #define TXQ_AVAIL(txq) (txq->ift_size - get_inuse(txq->ift_size, txq->ift_cidx, txq->ift_pidx, txq->ift_gen))
  425 
  426 #define IDXDIFF(head, tail, wrap) \
  427         ((head) >= (tail) ? (head) - (tail) : (wrap) - (tail) + (head))
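/*
 * Worked example (illustrative) for the ring arithmetic above, assuming a
 * 1024-descriptor ring: with cidx = 1000 and pidx = 8 the producer has
 * wrapped, so get_inuse() returns 1024 - 1000 + 8 = 32 and TXQ_AVAIL() is
 * 1024 - 32 = 992.  When pidx == cidx, the generation bit disambiguates the
 * two remaining cases: gen == 0 means the ring is empty and gen == 1 means
 * it is completely full.  IDXDIFF() is the same wrap-aware subtraction for
 * an arbitrary head/tail pair.
 */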
  428 
  429 struct iflib_rxq {
  430         if_ctx_t        ifr_ctx;
  431         iflib_fl_t      ifr_fl;
  432         uint64_t        ifr_rx_irq;
  433         /*
  434          * If there is a separate completion queue (IFLIB_HAS_RXCQ), this is
  435          * the completion queue consumer index.  Otherwise it's unused.
  436          */
  437         qidx_t          ifr_cq_cidx;
  438         uint16_t        ifr_id;
  439         uint8_t         ifr_nfl;
  440         uint8_t         ifr_ntxqirq;
  441         uint8_t         ifr_txqid[IFLIB_MAX_TX_SHARED_INTR];
  442         uint8_t         ifr_fl_offset;
  443         struct lro_ctrl                 ifr_lc;
  444         struct grouptask        ifr_task;
  445         struct callout          ifr_watchdog;
  446         struct iflib_filter_info ifr_filter_info;
  447         iflib_dma_info_t                ifr_ifdi;
  448 
  449         /* dynamically allocate if any drivers need a value substantially larger than this */
  450         struct if_rxd_frag      ifr_frags[IFLIB_MAX_RX_SEGS] __aligned(CACHE_LINE_SIZE);
  451 #ifdef IFLIB_DIAGNOSTICS
  452         uint64_t ifr_cpu_exec_count[256];
  453 #endif
  454 }  __aligned(CACHE_LINE_SIZE);
  455 
  456 typedef struct if_rxsd {
  457         caddr_t *ifsd_cl;
  458         struct mbuf **ifsd_m;
  459         iflib_fl_t ifsd_fl;
  460 } *if_rxsd_t;
  461 
  462 /* multiple of word size */
  463 #ifdef __LP64__
  464 #define PKT_INFO_SIZE   6
  465 #define RXD_INFO_SIZE   5
  466 #define PKT_TYPE uint64_t
  467 #else
  468 #define PKT_INFO_SIZE   11
  469 #define RXD_INFO_SIZE   8
  470 #define PKT_TYPE uint32_t
  471 #endif
  472 #define PKT_LOOP_BOUND  ((PKT_INFO_SIZE/3)*3)
  473 #define RXD_LOOP_BOUND  ((RXD_INFO_SIZE/4)*4)
  474 
  475 typedef struct if_pkt_info_pad {
  476         PKT_TYPE pkt_val[PKT_INFO_SIZE];
  477 } *if_pkt_info_pad_t;
  478 typedef struct if_rxd_info_pad {
  479         PKT_TYPE rxd_val[RXD_INFO_SIZE];
  480 } *if_rxd_info_pad_t;
  481 
  482 CTASSERT(sizeof(struct if_pkt_info_pad) == sizeof(struct if_pkt_info));
  483 CTASSERT(sizeof(struct if_rxd_info_pad) == sizeof(struct if_rxd_info));
  484 
  485 
  486 static inline void
  487 pkt_info_zero(if_pkt_info_t pi)
  488 {
  489         if_pkt_info_pad_t pi_pad;
  490 
  491         pi_pad = (if_pkt_info_pad_t)pi;
  492         pi_pad->pkt_val[0] = 0; pi_pad->pkt_val[1] = 0; pi_pad->pkt_val[2] = 0;
  493         pi_pad->pkt_val[3] = 0; pi_pad->pkt_val[4] = 0; pi_pad->pkt_val[5] = 0;
  494 #ifndef __LP64__
  495         pi_pad->pkt_val[6] = 0; pi_pad->pkt_val[7] = 0; pi_pad->pkt_val[8] = 0;
  496         pi_pad->pkt_val[9] = 0; pi_pad->pkt_val[10] = 0;
  497 #endif  
  498 }
  499 
  500 static device_method_t iflib_pseudo_methods[] = {
  501         DEVMETHOD(device_attach, noop_attach),
  502         DEVMETHOD(device_detach, iflib_pseudo_detach),
  503         DEVMETHOD_END
  504 };
  505 
  506 driver_t iflib_pseudodriver = {
  507         "iflib_pseudo", iflib_pseudo_methods, sizeof(struct iflib_ctx),
  508 };
  509 
  510 static inline void
  511 rxd_info_zero(if_rxd_info_t ri)
  512 {
  513         if_rxd_info_pad_t ri_pad;
  514         int i;
  515 
  516         ri_pad = (if_rxd_info_pad_t)ri;
  517         for (i = 0; i < RXD_LOOP_BOUND; i += 4) {
  518                 ri_pad->rxd_val[i] = 0;
  519                 ri_pad->rxd_val[i+1] = 0;
  520                 ri_pad->rxd_val[i+2] = 0;
  521                 ri_pad->rxd_val[i+3] = 0;
  522         }
  523 #ifdef __LP64__
  524         ri_pad->rxd_val[RXD_INFO_SIZE-1] = 0;
  525 #endif
  526 }
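/*
 * Worked example (illustrative): the CTASSERTs above let pkt_info_zero()
 * and rxd_info_zero() treat the packet/descriptor info structures as flat
 * arrays of word-sized values.  On LP64, RXD_INFO_SIZE is 5, so
 * RXD_LOOP_BOUND is (5 / 4) * 4 = 4: the unrolled loop clears
 * rxd_val[0..3] and the trailing __LP64__ store clears rxd_val[4].
 */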
  527 
  528 /*
   529  * Only allow a single packet to take up at most 1/nth of the tx ring
  530  */
  531 #define MAX_SINGLE_PACKET_FRACTION 12
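/*
 * For example (illustrative), with a 1024-descriptor transmit ring a single
 * packet is limited to roughly 1024 / 12 = 85 descriptors.
 */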
  532 #define IF_BAD_DMA (bus_addr_t)-1
  533 
  534 #define CTX_ACTIVE(ctx) ((if_getdrvflags((ctx)->ifc_ifp) & IFF_DRV_RUNNING))
  535 
  536 #define CTX_LOCK_INIT(_sc)  sx_init(&(_sc)->ifc_ctx_sx, "iflib ctx lock")
  537 #define CTX_LOCK(ctx) sx_xlock(&(ctx)->ifc_ctx_sx)
  538 #define CTX_UNLOCK(ctx) sx_xunlock(&(ctx)->ifc_ctx_sx)
  539 #define CTX_LOCK_DESTROY(ctx) sx_destroy(&(ctx)->ifc_ctx_sx)
  540 
  541 #define STATE_LOCK_INIT(_sc, _name)  mtx_init(&(_sc)->ifc_state_mtx, _name, "iflib state lock", MTX_DEF)
  542 #define STATE_LOCK(ctx) mtx_lock(&(ctx)->ifc_state_mtx)
  543 #define STATE_UNLOCK(ctx) mtx_unlock(&(ctx)->ifc_state_mtx)
  544 #define STATE_LOCK_DESTROY(ctx) mtx_destroy(&(ctx)->ifc_state_mtx)
  545 
  546 #define CALLOUT_LOCK(txq)       mtx_lock(&txq->ift_mtx)
  547 #define CALLOUT_UNLOCK(txq)     mtx_unlock(&txq->ift_mtx)
  548 
  549 void
  550 iflib_set_detach(if_ctx_t ctx)
  551 {
  552         STATE_LOCK(ctx);
  553         ctx->ifc_flags |= IFC_IN_DETACH;
  554         STATE_UNLOCK(ctx);
  555 }
  556 
  557 /* Our boot-time initialization hook */
  558 static int      iflib_module_event_handler(module_t, int, void *);
  559 
  560 static moduledata_t iflib_moduledata = {
  561         "iflib",
  562         iflib_module_event_handler,
  563         NULL
  564 };
  565 
  566 DECLARE_MODULE(iflib, iflib_moduledata, SI_SUB_INIT_IF, SI_ORDER_ANY);
  567 MODULE_VERSION(iflib, 1);
  568 
  569 MODULE_DEPEND(iflib, pci, 1, 1, 1);
  570 MODULE_DEPEND(iflib, ether, 1, 1, 1);
  571 
  572 TASKQGROUP_DEFINE(if_io_tqg, mp_ncpus, 1);
  573 TASKQGROUP_DEFINE(if_config_tqg, 1, 1);
  574 
  575 #ifndef IFLIB_DEBUG_COUNTERS
  576 #ifdef INVARIANTS
  577 #define IFLIB_DEBUG_COUNTERS 1
  578 #else
  579 #define IFLIB_DEBUG_COUNTERS 0
  580 #endif /* !INVARIANTS */
  581 #endif
  582 
  583 static SYSCTL_NODE(_net, OID_AUTO, iflib, CTLFLAG_RD, 0,
  584                    "iflib driver parameters");
  585 
  586 /*
  587  * XXX need to ensure that this can't accidentally cause the head to be moved backwards 
  588  */
  589 static int iflib_min_tx_latency = 0;
  590 SYSCTL_INT(_net_iflib, OID_AUTO, min_tx_latency, CTLFLAG_RW,
  591                    &iflib_min_tx_latency, 0, "minimize transmit latency at the possible expense of throughput");
  592 static int iflib_no_tx_batch = 0;
  593 SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW,
   594                    &iflib_no_tx_batch, 0, "avoid batching transmits, at the possible expense of throughput");
  595 static int iflib_timer_default = 1000;
  596 SYSCTL_INT(_net_iflib, OID_AUTO, timer_default, CTLFLAG_RW,
  597                    &iflib_timer_default, 0, "number of ticks between iflib_timer calls");
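/*
 * All three knobs above are read/write sysctls and may be changed at
 * runtime, e.g. (illustrative):
 *
 *      sysctl net.iflib.min_tx_latency=1
 *      sysctl net.iflib.timer_default=500
 */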
  598 
  599 
  600 
  601 #if IFLIB_DEBUG_COUNTERS
  602 
  603 static int iflib_tx_seen;
  604 static int iflib_tx_sent;
  605 static int iflib_tx_encap;
  606 static int iflib_rx_allocs;
  607 static int iflib_fl_refills;
  608 static int iflib_fl_refills_large;
  609 static int iflib_tx_frees;
  610 
  611 SYSCTL_INT(_net_iflib, OID_AUTO, tx_seen, CTLFLAG_RD,
  612                    &iflib_tx_seen, 0, "# TX mbufs seen");
  613 SYSCTL_INT(_net_iflib, OID_AUTO, tx_sent, CTLFLAG_RD,
  614                    &iflib_tx_sent, 0, "# TX mbufs sent");
  615 SYSCTL_INT(_net_iflib, OID_AUTO, tx_encap, CTLFLAG_RD,
  616                    &iflib_tx_encap, 0, "# TX mbufs encapped");
  617 SYSCTL_INT(_net_iflib, OID_AUTO, tx_frees, CTLFLAG_RD,
  618                    &iflib_tx_frees, 0, "# TX frees");
  619 SYSCTL_INT(_net_iflib, OID_AUTO, rx_allocs, CTLFLAG_RD,
  620                    &iflib_rx_allocs, 0, "# RX allocations");
  621 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills, CTLFLAG_RD,
  622                    &iflib_fl_refills, 0, "# refills");
  623 SYSCTL_INT(_net_iflib, OID_AUTO, fl_refills_large, CTLFLAG_RD,
  624                    &iflib_fl_refills_large, 0, "# large refills");
  625 
  626 
  627 static int iflib_txq_drain_flushing;
  628 static int iflib_txq_drain_oactive;
  629 static int iflib_txq_drain_notready;
  630 
  631 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD,
  632                    &iflib_txq_drain_flushing, 0, "# drain flushes");
  633 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_oactive, CTLFLAG_RD,
  634                    &iflib_txq_drain_oactive, 0, "# drain oactives");
  635 SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD,
  636                    &iflib_txq_drain_notready, 0, "# drain notready");
  637 
  638 
  639 static int iflib_encap_load_mbuf_fail;
  640 static int iflib_encap_pad_mbuf_fail;
  641 static int iflib_encap_txq_avail_fail;
  642 static int iflib_encap_txd_encap_fail;
  643 
  644 SYSCTL_INT(_net_iflib, OID_AUTO, encap_load_mbuf_fail, CTLFLAG_RD,
  645                    &iflib_encap_load_mbuf_fail, 0, "# busdma load failures");
  646 SYSCTL_INT(_net_iflib, OID_AUTO, encap_pad_mbuf_fail, CTLFLAG_RD,
  647                    &iflib_encap_pad_mbuf_fail, 0, "# runt frame pad failures");
  648 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txq_avail_fail, CTLFLAG_RD,
  649                    &iflib_encap_txq_avail_fail, 0, "# txq avail failures");
  650 SYSCTL_INT(_net_iflib, OID_AUTO, encap_txd_encap_fail, CTLFLAG_RD,
  651                    &iflib_encap_txd_encap_fail, 0, "# driver encap failures");
  652 
  653 static int iflib_task_fn_rxs;
  654 static int iflib_rx_intr_enables;
  655 static int iflib_fast_intrs;
  656 static int iflib_rx_unavail;
  657 static int iflib_rx_ctx_inactive;
  658 static int iflib_rx_if_input;
  659 static int iflib_rx_mbuf_null;
  660 static int iflib_rxd_flush;
  661 
  662 static int iflib_verbose_debug;
  663 
  664 SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD,
  665                    &iflib_task_fn_rxs, 0, "# task_fn_rx calls");
  666 SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD,
  667                    &iflib_rx_intr_enables, 0, "# RX intr enables");
  668 SYSCTL_INT(_net_iflib, OID_AUTO, fast_intrs, CTLFLAG_RD,
  669                    &iflib_fast_intrs, 0, "# fast_intr calls");
  670 SYSCTL_INT(_net_iflib, OID_AUTO, rx_unavail, CTLFLAG_RD,
  671                    &iflib_rx_unavail, 0, "# times rxeof called with no available data");
  672 SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD,
  673                    &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context");
  674 SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD,
  675                    &iflib_rx_if_input, 0, "# times rxeof called if_input");
  676 SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD,
  677                    &iflib_rx_mbuf_null, 0, "# times rxeof got null mbuf");
  678 SYSCTL_INT(_net_iflib, OID_AUTO, rxd_flush, CTLFLAG_RD,
  679                  &iflib_rxd_flush, 0, "# times rxd_flush called");
  680 SYSCTL_INT(_net_iflib, OID_AUTO, verbose_debug, CTLFLAG_RW,
  681                    &iflib_verbose_debug, 0, "enable verbose debugging");
  682 
  683 #define DBG_COUNTER_INC(name) atomic_add_int(&(iflib_ ## name), 1)
  684 static void
  685 iflib_debug_reset(void)
  686 {
  687         iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs =
  688                 iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees =
  689                 iflib_txq_drain_flushing = iflib_txq_drain_oactive =
  690                 iflib_txq_drain_notready =
  691                 iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail =
  692                 iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail =
  693                 iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs =
  694                 iflib_rx_unavail =
  695                 iflib_rx_ctx_inactive = iflib_rx_if_input =
  696                 iflib_rx_mbuf_null = iflib_rxd_flush = 0;
  697 }
  698 
  699 #else
  700 #define DBG_COUNTER_INC(name)
  701 static void iflib_debug_reset(void) {}
  702 #endif
  703 
  704 #define IFLIB_DEBUG 0
  705 
  706 static void iflib_tx_structures_free(if_ctx_t ctx);
  707 static void iflib_rx_structures_free(if_ctx_t ctx);
  708 static int iflib_queues_alloc(if_ctx_t ctx);
  709 static int iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq);
  710 static int iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget);
  711 static int iflib_qset_structures_setup(if_ctx_t ctx);
  712 static int iflib_msix_init(if_ctx_t ctx);
  713 static int iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filterarg, int *rid, const char *str);
  714 static void iflib_txq_check_drain(iflib_txq_t txq, int budget);
  715 static uint32_t iflib_txq_can_drain(struct ifmp_ring *);
  716 #ifdef ALTQ
  717 static void iflib_altq_if_start(if_t ifp);
  718 static int iflib_altq_if_transmit(if_t ifp, struct mbuf *m);
  719 #endif
  720 static int iflib_register(if_ctx_t);
  721 static void iflib_deregister(if_ctx_t);
  722 static void iflib_unregister_vlan_handlers(if_ctx_t ctx);
  723 static uint16_t iflib_get_mbuf_size_for(unsigned int size);
  724 static void iflib_init_locked(if_ctx_t ctx);
  725 static void iflib_add_device_sysctl_pre(if_ctx_t ctx);
  726 static void iflib_add_device_sysctl_post(if_ctx_t ctx);
  727 static void iflib_ifmp_purge(iflib_txq_t txq);
  728 static void _iflib_pre_assert(if_softc_ctx_t scctx);
  729 static void iflib_if_init_locked(if_ctx_t ctx);
  730 static void iflib_free_intr_mem(if_ctx_t ctx);
  731 #ifndef __NO_STRICT_ALIGNMENT
  732 static struct mbuf * iflib_fixup_rx(struct mbuf *m);
  733 #endif
  734 
  735 static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
  736     SLIST_HEAD_INITIALIZER(cpu_offsets);
  737 struct cpu_offset {
  738         SLIST_ENTRY(cpu_offset) entries;
  739         cpuset_t        set;
  740         unsigned int    refcount;
  741         uint16_t        next_cpuid;
  742 };
  743 static struct mtx cpu_offset_mtx;
  744 MTX_SYSINIT(iflib_cpu_offset, &cpu_offset_mtx, "iflib_cpu_offset lock",
  745     MTX_DEF);
  746 
  747 NETDUMP_DEFINE(iflib);
  748 
  749 static int
  750 iflib_num_rx_descs(if_ctx_t ctx)
  751 {
  752         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
  753         if_shared_ctx_t sctx = ctx->ifc_sctx;
  754         uint16_t first_rxq = (sctx->isc_flags & IFLIB_HAS_RXCQ) ? 1 : 0;
  755 
  756         return scctx->isc_nrxd[first_rxq];
  757 }
  758 
  759 static int
  760 iflib_num_tx_descs(if_ctx_t ctx)
  761 {
  762         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
  763         if_shared_ctx_t sctx = ctx->ifc_sctx;
  764         uint16_t first_txq = (sctx->isc_flags & IFLIB_HAS_TXCQ) ? 1 : 0;
  765 
  766         return scctx->isc_ntxd[first_txq];
  767 }
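/*
 * Illustrative note: when a driver sets IFLIB_HAS_RXCQ/IFLIB_HAS_TXCQ,
 * index 0 of isc_nrxd/isc_ntxd describes the completion queue, so the
 * helpers above take the descriptor count from index 1 instead; e.g.
 * isc_nrxd[] = { 2048, 1024 } (hypothetical values) would report 1024
 * RX descriptors.
 */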
  768 
  769 #ifdef DEV_NETMAP
  770 #include <sys/selinfo.h>
  771 #include <net/netmap.h>
  772 #include <dev/netmap/netmap_kern.h>
  773 
  774 MODULE_DEPEND(iflib, netmap, 1, 1, 1);
  775 
  776 static int netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init);
  777 static void iflib_netmap_timer(void *arg);
  778 
  779 /*
  780  * device-specific sysctl variables:
  781  *
   782  * iflib_crcstrip: 0: keep CRC in rx frames, 1: strip it (default).
  783  *      During regular operations the CRC is stripped, but on some
  784  *      hardware reception of frames not multiple of 64 is slower,
  785  *      so using crcstrip=0 helps in benchmarks.
  786  *
  787  * iflib_rx_miss, iflib_rx_miss_bufs:
  788  *      count packets that might be missed due to lost interrupts.
  789  */
  790 SYSCTL_DECL(_dev_netmap);
  791 /*
  792  * The xl driver by default strips CRCs and we do not override it.
  793  */
  794 
  795 int iflib_crcstrip = 1;
  796 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_crcstrip,
  797     CTLFLAG_RW, &iflib_crcstrip, 1, "strip CRC on RX frames");
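/*
 * For example (illustrative), CRC stripping can be disabled for such
 * benchmarks at runtime with:
 *
 *      sysctl dev.netmap.iflib_crcstrip=0
 */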
  798 
  799 int iflib_rx_miss, iflib_rx_miss_bufs;
  800 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss,
  801     CTLFLAG_RW, &iflib_rx_miss, 0, "potentially missed RX intr");
  802 SYSCTL_INT(_dev_netmap, OID_AUTO, iflib_rx_miss_bufs,
  803     CTLFLAG_RW, &iflib_rx_miss_bufs, 0, "potentially missed RX intr bufs");
  804 
  805 /*
  806  * Register/unregister. We are already under netmap lock.
  807  * Only called on the first register or the last unregister.
  808  */
  809 static int
  810 iflib_netmap_register(struct netmap_adapter *na, int onoff)
  811 {
  812         if_t ifp = na->ifp;
  813         if_ctx_t ctx = ifp->if_softc;
  814         int status;
  815 
  816         CTX_LOCK(ctx);
  817         if (!CTX_IS_VF(ctx))
  818                 IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip);
  819 
  820         iflib_stop(ctx);
  821 
  822         /*
  823          * Enable (or disable) netmap flags, and intercept (or restore)
  824          * ifp->if_transmit. This is done once the device has been stopped
  825          * to prevent race conditions. Also, this must be done after
  826          * calling netmap_disable_all_rings() and before calling
  827          * netmap_enable_all_rings(), so that these two functions see the
  828          * updated state of the NAF_NETMAP_ON bit.
  829          */
  830         if (onoff) {
  831                 nm_set_native_flags(na);
  832         } else {
  833                 nm_clear_native_flags(na);
  834         }
  835 
  836         iflib_init_locked(ctx);
  837         IFDI_CRCSTRIP_SET(ctx, onoff, iflib_crcstrip); // XXX why twice ?
  838         status = ifp->if_drv_flags & IFF_DRV_RUNNING ? 0 : 1;
  839         if (status)
  840                 nm_clear_native_flags(na);
  841         CTX_UNLOCK(ctx);
  842         return (status);
  843 }
  844 
  845 static int
  846 iflib_netmap_config(struct netmap_adapter *na, struct nm_config_info *info)
  847 {
  848         if_t ifp = na->ifp;
  849         if_ctx_t ctx = ifp->if_softc;
  850         iflib_rxq_t rxq = &ctx->ifc_rxqs[0];
  851         iflib_fl_t fl = &rxq->ifr_fl[0];
  852 
  853         info->num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
  854         info->num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
  855         info->num_tx_descs = iflib_num_tx_descs(ctx);
  856         info->num_rx_descs = iflib_num_rx_descs(ctx);
  857         info->rx_buf_maxsize = fl->ifl_buf_size;
  858         nm_prinf("txr %u rxr %u txd %u rxd %u rbufsz %u",
  859                 info->num_tx_rings, info->num_rx_rings, info->num_tx_descs,
  860                 info->num_rx_descs, info->rx_buf_maxsize);
  861 
  862         return 0;
  863 }
  864 
  865 static int
  866 netmap_fl_refill(iflib_rxq_t rxq, struct netmap_kring *kring, bool init)
  867 {
  868         struct netmap_adapter *na = kring->na;
  869         u_int const lim = kring->nkr_num_slots - 1;
  870         struct netmap_ring *ring = kring->ring;
  871         bus_dmamap_t *map;
  872         struct if_rxd_update iru;
  873         if_ctx_t ctx = rxq->ifr_ctx;
  874         iflib_fl_t fl = &rxq->ifr_fl[0];
  875         u_int nic_i_first, nic_i;
  876         u_int nm_i;
  877         int i, n;
  878 #if IFLIB_DEBUG_COUNTERS
  879         int rf_count = 0;
  880 #endif
  881 
  882         /*
  883          * This function is used both at initialization and in rxsync.
  884          * At initialization we need to prepare (with isc_rxd_refill())
  885          * all the netmap buffers currently owned by the kernel, in
  886          * such a way to keep fl->ifl_pidx and kring->nr_hwcur in sync
  887          * (except for kring->nkr_hwofs). These may be less than
  888          * kring->nkr_num_slots if netmap_reset() was called while
   889  * an application that was using the kring still owned some
   890  * buffers.
  891          * At rxsync time, both indexes point to the next buffer to be
  892          * refilled.
  893          * In any case we publish (with isc_rxd_flush()) up to
   894  * (fl->ifl_pidx - 1) % N (included), to keep the NIC tail/prod
   895  * pointer from overrunning the head/cons pointer, although this is
  896          * not necessary for some NICs (e.g. vmx).
  897          */
  898         if (__predict_false(init)) {
  899                 n = kring->nkr_num_slots - nm_kr_rxspace(kring);
  900         } else {
  901                 n = kring->rhead - kring->nr_hwcur;
  902                 if (n == 0)
  903                         return (0); /* Nothing to do. */
  904                 if (n < 0)
  905                         n += kring->nkr_num_slots;
  906         }
  907 
  908         iru_init(&iru, rxq, 0 /* flid */);
  909         map = fl->ifl_sds.ifsd_map;
  910         nic_i = fl->ifl_pidx;
  911         nm_i = netmap_idx_n2k(kring, nic_i);
  912         if (__predict_false(init)) {
  913                 /*
  914                  * On init/reset, nic_i must be 0, and we must
  915                  * start to refill from hwtail (see netmap_reset()).
  916                  */
  917                 MPASS(nic_i == 0);
  918                 MPASS(nm_i == kring->nr_hwtail);
  919         } else
  920                 MPASS(nm_i == kring->nr_hwcur);
  921         DBG_COUNTER_INC(fl_refills);
  922         while (n > 0) {
  923 #if IFLIB_DEBUG_COUNTERS
  924                 if (++rf_count == 9)
  925                         DBG_COUNTER_INC(fl_refills_large);
  926 #endif
  927                 nic_i_first = nic_i;
  928                 for (i = 0; n > 0 && i < IFLIB_MAX_RX_REFRESH; n--, i++) {
  929                         struct netmap_slot *slot = &ring->slot[nm_i];
  930                         void *addr = PNMB(na, slot, &fl->ifl_bus_addrs[i]);
  931 
  932                         MPASS(i < IFLIB_MAX_RX_REFRESH);
  933 
  934                         if (addr == NETMAP_BUF_BASE(na)) /* bad buf */
  935                                 return netmap_ring_reinit(kring);
  936 
  937                         fl->ifl_rxd_idxs[i] = nic_i;
  938 
  939                         if (__predict_false(init)) {
  940                                 netmap_load_map(na, fl->ifl_buf_tag,
  941                                     map[nic_i], addr);
  942                         } else if (slot->flags & NS_BUF_CHANGED) {
  943                                 /* buffer has changed, reload map */
  944                                 netmap_reload_map(na, fl->ifl_buf_tag,
  945                                     map[nic_i], addr);
  946                         }
  947                         bus_dmamap_sync(fl->ifl_buf_tag, map[nic_i],
  948                             BUS_DMASYNC_PREREAD);
  949                         slot->flags &= ~NS_BUF_CHANGED;
  950 
  951                         nm_i = nm_next(nm_i, lim);
  952                         nic_i = nm_next(nic_i, lim);
  953                 }
  954 
  955                 iru.iru_pidx = nic_i_first;
  956                 iru.iru_count = i;
  957                 ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
  958         }
  959         fl->ifl_pidx = nic_i;
  960         /*
  961          * At the end of the loop we must have refilled everything
  962          * we could possibly refill.
  963          */
  964         MPASS(nm_i == kring->rhead);
  965         kring->nr_hwcur = nm_i;
  966 
  967         bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
  968             BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
  969         ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id,
  970             nm_prev(nic_i, lim));
  971         DBG_COUNTER_INC(rxd_flush);
  972 
  973         return (0);
  974 }
  975 
  976 #define NETMAP_TX_TIMER_US      90
  977 
  978 /*
  979  * Reconcile kernel and user view of the transmit ring.
  980  *
  981  * All information is in the kring.
  982  * Userspace wants to send packets up to the one before kring->rhead,
  983  * kernel knows kring->nr_hwcur is the first unsent packet.
  984  *
  985  * Here we push packets out (as many as possible), and possibly
  986  * reclaim buffers from previously completed transmission.
  987  *
  988  * The caller (netmap) guarantees that there is only one instance
  989  * running at any time. Any interference with other driver
  990  * methods should be handled by the individual drivers.
  991  */
  992 static int
  993 iflib_netmap_txsync(struct netmap_kring *kring, int flags)
  994 {
  995         struct netmap_adapter *na = kring->na;
  996         if_t ifp = na->ifp;
  997         struct netmap_ring *ring = kring->ring;
  998         u_int nm_i;     /* index into the netmap kring */
  999         u_int nic_i;    /* index into the NIC ring */
 1000         u_int const lim = kring->nkr_num_slots - 1;
 1001         u_int const head = kring->rhead;
 1002         struct if_pkt_info pi;
 1003         int tx_pkts = 0, tx_bytes = 0;
 1004 
 1005         /*
 1006          * interrupts on every tx packet are expensive so request
 1007          * them every half ring, or where NS_REPORT is set
 1008          */
 1009         u_int report_frequency = kring->nkr_num_slots >> 1;
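        /*
         * Illustrative example: with a 2048-slot kring, report_frequency is
         * 1024, so IPI_TX_INTR is requested at slot 0, at slot 1024, and on
         * any slot that carries NS_REPORT (see the flags computation below).
         */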
 1010         /* device-specific */
 1011         if_ctx_t ctx = ifp->if_softc;
 1012         iflib_txq_t txq = &ctx->ifc_txqs[kring->ring_id];
 1013 
 1014         bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 1015             BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 1016 
 1017         /*
 1018          * First part: process new packets to send.
 1019          * nm_i is the current index in the netmap kring,
 1020          * nic_i is the corresponding index in the NIC ring.
 1021          *
 1022          * If we have packets to send (nm_i != head)
 1023          * iterate over the netmap ring, fetch length and update
 1024          * the corresponding slot in the NIC ring. Some drivers also
 1025          * need to update the buffer's physical address in the NIC slot
  1026          * even if NS_BUF_CHANGED is not set (PNMB computes the addresses).
 1027          *
  1028          * The netmap_reload_map() call is especially expensive,
  1029          * even when (as in this case) the tag is 0, so do it only
 1030          * when the buffer has actually changed.
 1031          *
 1032          * If possible do not set the report/intr bit on all slots,
 1033          * but only a few times per ring or when NS_REPORT is set.
 1034          *
 1035          * Finally, on 10G and faster drivers, it might be useful
 1036          * to prefetch the next slot and txr entry.
 1037          */
 1038 
 1039         nm_i = kring->nr_hwcur;
 1040         if (nm_i != head) {     /* we have new packets to send */
 1041                 uint32_t pkt_len = 0, seg_idx = 0;
 1042                 int nic_i_start = -1, flags = 0;
 1043                 pkt_info_zero(&pi);
 1044                 pi.ipi_segs = txq->ift_segs;
 1045                 pi.ipi_qsidx = kring->ring_id;
 1046                 nic_i = netmap_idx_k2n(kring, nm_i);
 1047 
 1048                 __builtin_prefetch(&ring->slot[nm_i]);
 1049                 __builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i]);
 1050                 __builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i]);
 1051 
 1052                 while (nm_i != head) {
 1053                         struct netmap_slot *slot = &ring->slot[nm_i];
 1054                         u_int len = slot->len;
 1055                         uint64_t paddr;
 1056                         void *addr = PNMB(na, slot, &paddr);
 1057 
 1058                         flags |= (slot->flags & NS_REPORT ||
 1059                                 nic_i == 0 || nic_i == report_frequency) ?
 1060                                 IPI_TX_INTR : 0;
 1061 
 1062                         /*
 1063                          * If this is the first packet fragment, save the
 1064                          * index of the first NIC slot for later.
 1065                          */
 1066                         if (nic_i_start < 0)
 1067                                 nic_i_start = nic_i;
 1068 
 1069                         pi.ipi_segs[seg_idx].ds_addr = paddr;
 1070                         pi.ipi_segs[seg_idx].ds_len = len;
 1071                         if (len) {
 1072                                 pkt_len += len;
 1073                                 seg_idx++;
 1074                         }
 1075 
 1076                         if (!(slot->flags & NS_MOREFRAG)) {
 1077                                 pi.ipi_len = pkt_len;
 1078                                 pi.ipi_nsegs = seg_idx;
 1079                                 pi.ipi_pidx = nic_i_start;
 1080                                 pi.ipi_ndescs = 0;
 1081                                 pi.ipi_flags = flags;
 1082 
 1083                                 /* Prepare the NIC TX ring. */
 1084                                 ctx->isc_txd_encap(ctx->ifc_softc, &pi);
 1085                                 DBG_COUNTER_INC(tx_encap);
 1086 
 1087                                 /* Update transmit counters */
 1088                                 tx_bytes += pi.ipi_len;
 1089                                 tx_pkts++;
 1090 
 1091                                 /* Reinit per-packet info for the next one. */
 1092                                 flags = seg_idx = pkt_len = 0;
 1093                                 nic_i_start = -1;
 1094                         }
 1095 
 1096                         /* prefetch for next round */
 1097                         __builtin_prefetch(&ring->slot[nm_i + 1]);
 1098                         __builtin_prefetch(&txq->ift_sds.ifsd_m[nic_i + 1]);
 1099                         __builtin_prefetch(&txq->ift_sds.ifsd_map[nic_i + 1]);
 1100 
 1101                         NM_CHECK_ADDR_LEN(na, addr, len);
 1102 
 1103                         if (slot->flags & NS_BUF_CHANGED) {
 1104                                 /* buffer has changed, reload map */
 1105                                 netmap_reload_map(na, txq->ift_buf_tag,
 1106                                     txq->ift_sds.ifsd_map[nic_i], addr);
 1107                         }
 1108                         /* make sure changes to the buffer are synced */
 1109                         bus_dmamap_sync(txq->ift_buf_tag,
 1110                             txq->ift_sds.ifsd_map[nic_i],
 1111                             BUS_DMASYNC_PREWRITE);
 1112 
 1113                         slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED | NS_MOREFRAG);
 1114                         nm_i = nm_next(nm_i, lim);
 1115                         nic_i = nm_next(nic_i, lim);
 1116                 }
 1117                 kring->nr_hwcur = nm_i;
 1118 
 1119                 /* synchronize the NIC ring */
 1120                 bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 1121                     BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 1122 
 1123                 /* (re)start the tx unit up to slot nic_i (excluded) */
 1124                 ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, nic_i);
 1125         }
 1126 
 1127         /*
 1128          * Second part: reclaim buffers for completed transmissions.
 1129          *
 1130          * If there are unclaimed buffers, attempt to reclaim them.
 1131          * If we don't manage to reclaim them all, and TX IRQs are not in use,
 1132          * trigger a per-tx-queue timer to try again later.
 1133          */
 1134         if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
 1135                 if (iflib_tx_credits_update(ctx, txq)) {
 1136                         /* some tx completed, increment avail */
 1137                         nic_i = txq->ift_cidx_processed;
 1138                         kring->nr_hwtail = nm_prev(netmap_idx_n2k(kring, nic_i), lim);
 1139                 }
 1140         }
 1141 
 1142         if (!(ctx->ifc_flags & IFC_NETMAP_TX_IRQ))
 1143                 if (kring->nr_hwtail != nm_prev(kring->nr_hwcur, lim)) {
 1144                         callout_reset_sbt_on(&txq->ift_netmap_timer,
 1145                             NETMAP_TX_TIMER_US * SBT_1US, SBT_1US,
 1146                             iflib_netmap_timer, txq,
 1147                             txq->ift_netmap_timer.c_cpu, 0);
 1148                 }
 1149 
 1150         if_inc_counter(ifp, IFCOUNTER_OBYTES, tx_bytes);
 1151         if_inc_counter(ifp, IFCOUNTER_OPACKETS, tx_pkts);
 1152 
 1153         return (0);
 1154 }
 1155 
 1156 /*
 1157  * Reconcile kernel and user view of the receive ring.
 1158  * Same as for the txsync, this routine must be efficient.
  1159  * The caller guarantees a single invocation, but races against
 1160  * the rest of the driver should be handled here.
 1161  *
 1162  * On call, kring->rhead is the first packet that userspace wants
 1163  * to keep, and kring->rcur is the wakeup point.
 1164  * The kernel has previously reported packets up to kring->rtail.
 1165  *
 1166  * If (flags & NAF_FORCE_READ) also check for incoming packets irrespective
 1167  * of whether or not we received an interrupt.
 1168  */
 1169 static int
 1170 iflib_netmap_rxsync(struct netmap_kring *kring, int flags)
 1171 {
 1172         struct netmap_adapter *na = kring->na;
 1173         struct netmap_ring *ring = kring->ring;
 1174         if_t ifp = na->ifp;
 1175         uint32_t nm_i;  /* index into the netmap ring */
 1176         uint32_t nic_i; /* index into the NIC ring */
 1177         u_int n;
 1178         u_int const lim = kring->nkr_num_slots - 1;
 1179         int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
 1180         int i = 0, rx_bytes = 0, rx_pkts = 0;
 1181 
 1182         if_ctx_t ctx = ifp->if_softc;
 1183         if_shared_ctx_t sctx = ctx->ifc_sctx;
 1184         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 1185         iflib_rxq_t rxq = &ctx->ifc_rxqs[kring->ring_id];
 1186         iflib_fl_t fl = &rxq->ifr_fl[0];
 1187         struct if_rxd_info ri;
 1188         qidx_t *cidxp;
 1189 
 1190         /*
 1191          * netmap only uses free list 0, to avoid out of order consumption
 1192          * of receive buffers
 1193          */
 1194 
 1195         bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 1196             BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 1197 
 1198         /*
 1199          * First part: import newly received packets.
 1200          *
 1201          * nm_i is the index of the next free slot in the netmap ring,
 1202          * nic_i is the index of the next received packet in the NIC ring
 1203          * (or in the free list 0 if IFLIB_HAS_RXCQ is set), and they may
 1204          * differ in case if_init() has been called while
 1205          * in netmap mode. For the receive ring we have
 1206          *
 1207          *      nic_i = fl->ifl_cidx;
 1208          *      nm_i = kring->nr_hwtail (previous)
 1209          * and
 1210          *      nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 1211          *
 1212          * fl->ifl_cidx is set to 0 on a ring reinit
 1213          */
 1214         if (netmap_no_pendintr || force_update) {
 1215                 uint32_t hwtail_lim = nm_prev(kring->nr_hwcur, lim);
 1216                 bool have_rxcq = sctx->isc_flags & IFLIB_HAS_RXCQ;
 1217                 int crclen = iflib_crcstrip ? 0 : 4;
 1218                 int error, avail;
 1219 
 1220                 /*
 1221                  * For the free list consumer index, we use the same
 1222                  * logic as in iflib_rxeof().
 1223                  */
 1224                 if (have_rxcq)
 1225                         cidxp = &rxq->ifr_cq_cidx;
 1226                 else
 1227                         cidxp = &fl->ifl_cidx;
 1228                 avail = ctx->isc_rxd_available(ctx->ifc_softc,
 1229                     rxq->ifr_id, *cidxp, USHRT_MAX);
 1230 
 1231                 nic_i = fl->ifl_cidx;
 1232                 nm_i = netmap_idx_n2k(kring, nic_i);
 1233                 MPASS(nm_i == kring->nr_hwtail);
 1234                 for (n = 0; avail > 0 && nm_i != hwtail_lim; n++, avail--) {
 1235                         rxd_info_zero(&ri);
 1236                         ri.iri_frags = rxq->ifr_frags;
 1237                         ri.iri_qsidx = kring->ring_id;
 1238                         ri.iri_ifp = ctx->ifc_ifp;
 1239                         ri.iri_cidx = *cidxp;
 1240 
 1241                         error = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
 1242                         for (i = 0; i < ri.iri_nfrags; i++) {
 1243                                 if (error) {
 1244                                         ring->slot[nm_i].len = 0;
 1245                                         ring->slot[nm_i].flags = 0;
 1246                                 } else {
 1247                                         ring->slot[nm_i].len = ri.iri_frags[i].irf_len;
 1248                                         if (i == (ri.iri_nfrags - 1)) {
 1249                                                 ring->slot[nm_i].len -= crclen;
 1250                                                 ring->slot[nm_i].flags = 0;
 1251 
 1252                                                 /* Update receive counters */
 1253                                                 rx_bytes += ri.iri_len;
 1254                                                 rx_pkts++;
 1255                                         } else
 1256                                                 ring->slot[nm_i].flags = NS_MOREFRAG;
 1257                                 }
 1258 
 1259                                 bus_dmamap_sync(fl->ifl_buf_tag,
 1260                                     fl->ifl_sds.ifsd_map[nic_i], BUS_DMASYNC_POSTREAD);
 1261                                 nm_i = nm_next(nm_i, lim);
 1262                                 fl->ifl_cidx = nic_i = nm_next(nic_i, lim);
 1263                         }
 1264 
 1265                         if (have_rxcq) {
 1266                                 *cidxp = ri.iri_cidx;
 1267                                 while (*cidxp >= scctx->isc_nrxd[0])
 1268                                         *cidxp -= scctx->isc_nrxd[0];
 1269                         }
 1270 
 1271                 }
 1272                 if (n) { /* update the state variables */
 1273                         if (netmap_no_pendintr && !force_update) {
 1274                                 /* diagnostics */
 1275                                 iflib_rx_miss ++;
 1276                                 iflib_rx_miss_bufs += n;
 1277                         }
 1278                         kring->nr_hwtail = nm_i;
 1279                 }
 1280                 kring->nr_kflags &= ~NKR_PENDINTR;
 1281         }
 1282         /*
 1283          * Second part: skip past packets that userspace has released.
 1284          * (kring->nr_hwcur to head excluded),
 1285          * and make the buffers available for reception.
 1286          * As usual nm_i is the index in the netmap ring,
 1287          * nic_i is the index in the NIC ring, and
 1288          * nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 1289          */
 1290         netmap_fl_refill(rxq, kring, false);
 1291 
 1292         if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
 1293         if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
 1294 
 1295         return (0);
 1296 }
 1297 
 1298 static void
 1299 iflib_netmap_intr(struct netmap_adapter *na, int onoff)
 1300 {
 1301         if_ctx_t ctx = na->ifp->if_softc;
 1302 
 1303         CTX_LOCK(ctx);
 1304         if (onoff) {
 1305                 IFDI_INTR_ENABLE(ctx);
 1306         } else {
 1307                 IFDI_INTR_DISABLE(ctx);
 1308         }
 1309         CTX_UNLOCK(ctx);
 1310 }
 1311 
 1312 
 1313 static int
 1314 iflib_netmap_attach(if_ctx_t ctx)
 1315 {
 1316         struct netmap_adapter na;
 1317 
 1318         bzero(&na, sizeof(na));
 1319 
 1320         na.ifp = ctx->ifc_ifp;
 1321         na.na_flags = NAF_BDG_MAYSLEEP | NAF_MOREFRAG;
 1322         MPASS(ctx->ifc_softc_ctx.isc_ntxqsets);
 1323         MPASS(ctx->ifc_softc_ctx.isc_nrxqsets);
 1324 
 1325         na.num_tx_desc = iflib_num_tx_descs(ctx);
 1326         na.num_rx_desc = iflib_num_rx_descs(ctx);
 1327         na.nm_txsync = iflib_netmap_txsync;
 1328         na.nm_rxsync = iflib_netmap_rxsync;
 1329         na.nm_register = iflib_netmap_register;
 1330         na.nm_intr = iflib_netmap_intr;
 1331         na.nm_config = iflib_netmap_config;
 1332         na.num_tx_rings = ctx->ifc_softc_ctx.isc_ntxqsets;
 1333         na.num_rx_rings = ctx->ifc_softc_ctx.isc_nrxqsets;
 1334         return (netmap_attach(&na));
 1335 }
 1336 
 1337 static int
 1338 iflib_netmap_txq_init(if_ctx_t ctx, iflib_txq_t txq)
 1339 {
 1340         struct netmap_adapter *na = NA(ctx->ifc_ifp);
 1341         struct netmap_slot *slot;
 1342 
 1343         slot = netmap_reset(na, NR_TX, txq->ift_id, 0);
 1344         if (slot == NULL)
 1345                 return (0);
 1346         for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxd[0]; i++) {
 1347 
 1348                 /*
 1349                  * In netmap mode, set the map for the packet buffer.
 1350                  * NOTE: Some drivers (not this one) also need to set
 1351                  * the physical buffer address in the NIC ring.
 1352                  * netmap_idx_n2k() maps a nic index, i, into the corresponding
 1353                  * netmap slot index, si
 1354                  */
 1355                 int si = netmap_idx_n2k(na->tx_rings[txq->ift_id], i);
 1356                 netmap_load_map(na, txq->ift_buf_tag, txq->ift_sds.ifsd_map[i],
 1357                     NMB(na, slot + si));
 1358         }
 1359         return (1);
 1360 }
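/*
 * Worked example for the netmap_idx_n2k() mapping used above, a sketch
 * assuming the relation nm_i == (nic_i + kring->nkr_hwofs) % ring_size
 * quoted in the rxsync comment: with 1024 slots and nkr_hwofs == 3, NIC
 * index i == 1022 maps to netmap slot si == (1022 + 3) % 1024 == 1.
 */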
 1361 
 1362 static int
 1363 iflib_netmap_rxq_init(if_ctx_t ctx, iflib_rxq_t rxq)
 1364 {
 1365         struct netmap_adapter *na = NA(ctx->ifc_ifp);
 1366         struct netmap_kring *kring;
 1367         struct netmap_slot *slot;
 1368 
 1369         slot = netmap_reset(na, NR_RX, rxq->ifr_id, 0);
 1370         if (slot == NULL)
 1371                 return (0);
 1372         kring = na->rx_rings[rxq->ifr_id];
 1373         netmap_fl_refill(rxq, kring, true);
 1374         return (1);
 1375 }
 1376 
 1377 static void
 1378 iflib_netmap_timer(void *arg)
 1379 {
 1380         iflib_txq_t txq = arg;
 1381         if_ctx_t ctx = txq->ift_ctx;
 1382 
 1383         /*
 1384          * Wake up the netmap application, to give it a chance to
 1385          * call txsync and reclaim more completed TX buffers.
 1386          */
 1387         netmap_tx_irq(ctx->ifc_ifp, txq->ift_id);
 1388 }
 1389 
 1390 #define iflib_netmap_detach(ifp) netmap_detach(ifp)
 1391 
 1392 #else
 1393 #define iflib_netmap_txq_init(ctx, txq) (0)
 1394 #define iflib_netmap_rxq_init(ctx, rxq) (0)
 1395 #define iflib_netmap_detach(ifp)
 1396 #define netmap_enable_all_rings(ifp)
 1397 #define netmap_disable_all_rings(ifp)
 1398 
 1399 #define iflib_netmap_attach(ctx) (0)
 1400 #define netmap_rx_irq(ifp, qid, budget) (0)
 1401 
 1402 #endif
 1403 
 1404 #if defined(__i386__) || defined(__amd64__)
 1405 static __inline void
 1406 prefetch(void *x)
 1407 {
 1408         __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
 1409 }
 1410 static __inline void
 1411 prefetch2cachelines(void *x)
 1412 {
 1413         __asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x));
 1414 #if (CACHE_LINE_SIZE < 128)
 1415         __asm volatile("prefetcht0 %0" :: "m" (*(((unsigned long *)x)+CACHE_LINE_SIZE/(sizeof(unsigned long)))));
 1416 #endif
 1417 }
 1418 #else
 1419 #define prefetch(x)
 1420 #define prefetch2cachelines(x)
 1421 #endif
 1422 
 1423 static void
 1424 iflib_gen_mac(if_ctx_t ctx)
 1425 {
 1426         struct thread *td;
 1427         MD5_CTX mdctx;
 1428         char uuid[HOSTUUIDLEN+1];
 1429         char buf[HOSTUUIDLEN+16];
 1430         uint8_t *mac;
 1431         unsigned char digest[16];
 1432 
 1433         td = curthread;
 1434         mac = ctx->ifc_mac;
 1435         uuid[HOSTUUIDLEN] = 0;
 1436         bcopy(td->td_ucred->cr_prison->pr_hostuuid, uuid, HOSTUUIDLEN);
 1437         snprintf(buf, HOSTUUIDLEN+16, "%s-%s", uuid, device_get_nameunit(ctx->ifc_dev));
 1438         /*
 1439          * Generate a pseudo-random, deterministic MAC
 1440          * address based on the UUID and unit number.
 1441          * The FreeBSD Foundation OUI of 58-9C-FC is used.
 1442          */
 1443         MD5Init(&mdctx);
 1444         MD5Update(&mdctx, buf, strlen(buf));
 1445         MD5Final(digest, &mdctx);
 1446 
 1447         mac[0] = 0x58;
 1448         mac[1] = 0x9C;
 1449         mac[2] = 0xFC;
 1450         mac[3] = digest[0];
 1451         mac[4] = digest[1];
 1452         mac[5] = digest[2];
 1453 }
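/*
 * Illustrative userland sketch (not part of iflib) of the same MAC
 * derivation as iflib_gen_mac() above, assuming libmd's MD5 API and
 * linking with -lmd; the example_ names are hypothetical.
 */
#if 0
#include <md5.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void
example_gen_mac(const char *hostuuid, const char *nameunit, uint8_t mac[6])
{
        MD5_CTX mdctx;
        unsigned char digest[16];
        char buf[128];

        /* Same "<uuid>-<nameunit>" input string as iflib_gen_mac(). */
        snprintf(buf, sizeof(buf), "%s-%s", hostuuid, nameunit);
        MD5Init(&mdctx);
        MD5Update(&mdctx, buf, strlen(buf));
        MD5Final(digest, &mdctx);

        mac[0] = 0x58;          /* FreeBSD Foundation OUI 58-9C-FC */
        mac[1] = 0x9C;
        mac[2] = 0xFC;
        mac[3] = digest[0];
        mac[4] = digest[1];
        mac[5] = digest[2];
}
#endif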
 1454 
 1455 static void
 1456 iru_init(if_rxd_update_t iru, iflib_rxq_t rxq, uint8_t flid)
 1457 {
 1458         iflib_fl_t fl;
 1459 
 1460         fl = &rxq->ifr_fl[flid];
 1461         iru->iru_paddrs = fl->ifl_bus_addrs;
 1462         iru->iru_idxs = fl->ifl_rxd_idxs;
 1463         iru->iru_qsidx = rxq->ifr_id;
 1464         iru->iru_buf_size = fl->ifl_buf_size;
 1465         iru->iru_flidx = fl->ifl_id;
 1466 }
 1467 
 1468 static void
 1469 _iflib_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err)
 1470 {
 1471         if (err)
 1472                 return;
 1473         *(bus_addr_t *) arg = segs[0].ds_addr;
 1474 }
 1475 
 1476 int
 1477 iflib_dma_alloc_align(if_ctx_t ctx, int size, int align, iflib_dma_info_t dma, int mapflags)
 1478 {
 1479         int err;
 1480         device_t dev = ctx->ifc_dev;
 1481 
 1482         err = bus_dma_tag_create(bus_get_dma_tag(dev),  /* parent */
 1483                                 align, 0,               /* alignment, bounds */
 1484                                 BUS_SPACE_MAXADDR,      /* lowaddr */
 1485                                 BUS_SPACE_MAXADDR,      /* highaddr */
 1486                                 NULL, NULL,             /* filter, filterarg */
 1487                                 size,                   /* maxsize */
 1488                                 1,                      /* nsegments */
 1489                                 size,                   /* maxsegsize */
 1490                                 BUS_DMA_ALLOCNOW,       /* flags */
 1491                                 NULL,                   /* lockfunc */
 1492                                 NULL,                   /* lockarg */
 1493                                 &dma->idi_tag);
 1494         if (err) {
 1495                 device_printf(dev,
 1496                     "%s: bus_dma_tag_create failed: %d\n",
 1497                     __func__, err);
 1498                 goto fail_0;
 1499         }
 1500 
 1501         err = bus_dmamem_alloc(dma->idi_tag, (void**) &dma->idi_vaddr,
 1502             BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->idi_map);
 1503         if (err) {
 1504                 device_printf(dev,
 1505                     "%s: bus_dmamem_alloc(%ju) failed: %d\n",
 1506                     __func__, (uintmax_t)size, err);
 1507                 goto fail_1;
 1508         }
 1509 
 1510         dma->idi_paddr = IF_BAD_DMA;
 1511         err = bus_dmamap_load(dma->idi_tag, dma->idi_map, dma->idi_vaddr,
 1512             size, _iflib_dmamap_cb, &dma->idi_paddr, mapflags | BUS_DMA_NOWAIT);
 1513         if (err || dma->idi_paddr == IF_BAD_DMA) {
 1514                 device_printf(dev,
 1515                     "%s: bus_dmamap_load failed: %d\n",
 1516                     __func__, err);
 1517                 goto fail_2;
 1518         }
 1519 
 1520         dma->idi_size = size;
 1521         return (0);
 1522 
 1523 fail_2:
 1524         bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
 1525 fail_1:
 1526         bus_dma_tag_destroy(dma->idi_tag);
 1527 fail_0:
 1528         dma->idi_tag = NULL;
 1529 
 1530         return (err);
 1531 }
 1532 
 1533 int
 1534 iflib_dma_alloc(if_ctx_t ctx, int size, iflib_dma_info_t dma, int mapflags)
 1535 {
 1536         if_shared_ctx_t sctx = ctx->ifc_sctx;
 1537 
 1538         KASSERT(sctx->isc_q_align != 0, ("alignment value not initialized"));
 1539 
 1540         return (iflib_dma_alloc_align(ctx, size, sctx->isc_q_align, dma, mapflags));
 1541 }
 1542 
 1543 int
 1544 iflib_dma_alloc_multi(if_ctx_t ctx, int *sizes, iflib_dma_info_t *dmalist, int mapflags, int count)
 1545 {
 1546         int i, err = 0;
 1547         iflib_dma_info_t *dmaiter;
 1548 
 1549         dmaiter = dmalist;
 1550         for (i = 0; i < count; i++, dmaiter++) {
 1551                 if ((err = iflib_dma_alloc(ctx, sizes[i], *dmaiter, mapflags)) != 0)
 1552                         break;
 1553         }
 1554         if (err)
 1555                 iflib_dma_free_multi(dmalist, i);
 1556         return (err);
 1557 }
 1558 
 1559 void
 1560 iflib_dma_free(iflib_dma_info_t dma)
 1561 {
 1562         if (dma->idi_tag == NULL)
 1563                 return;
 1564         if (dma->idi_paddr != IF_BAD_DMA) {
 1565                 bus_dmamap_sync(dma->idi_tag, dma->idi_map,
 1566                     BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 1567                 bus_dmamap_unload(dma->idi_tag, dma->idi_map);
 1568                 dma->idi_paddr = IF_BAD_DMA;
 1569         }
 1570         if (dma->idi_vaddr != NULL) {
 1571                 bus_dmamem_free(dma->idi_tag, dma->idi_vaddr, dma->idi_map);
 1572                 dma->idi_vaddr = NULL;
 1573         }
 1574         bus_dma_tag_destroy(dma->idi_tag);
 1575         dma->idi_tag = NULL;
 1576 }
 1577 
 1578 void
 1579 iflib_dma_free_multi(iflib_dma_info_t *dmalist, int count)
 1580 {
 1581         int i;
 1582         iflib_dma_info_t *dmaiter = dmalist;
 1583 
 1584         for (i = 0; i < count; i++, dmaiter++)
 1585                 iflib_dma_free(*dmaiter);
 1586 }
 1587 
 1588 #ifdef EARLY_AP_STARTUP
 1589 static const int iflib_started = 1;
 1590 #else
 1591 /*
 1592  * We used to abuse the smp_started flag to decide if the queues have been
 1593  * fully initialized (by late taskqgroup_adjust() calls in a SYSINIT()).
 1594  * That gave bad races, since the SYSINIT() runs strictly after smp_started
 1595  * is set.  Run a SYSINIT() strictly after that to just set a usable
 1596  * completion flag.
 1597  */
 1598 
 1599 static int iflib_started;
 1600 
 1601 static void
 1602 iflib_record_started(void *arg)
 1603 {
 1604         iflib_started = 1;
 1605 }
 1606 
 1607 SYSINIT(iflib_record_started, SI_SUB_SMP + 1, SI_ORDER_FIRST,
 1608         iflib_record_started, NULL);
 1609 #endif
 1610 
 1611 static int
 1612 iflib_fast_intr(void *arg)
 1613 {
 1614         iflib_filter_info_t info = arg;
 1615         struct grouptask *gtask = info->ifi_task;
 1616         int result;
 1617 
 1618         if (!iflib_started)
 1619                 return (FILTER_STRAY);
 1620 
 1621         DBG_COUNTER_INC(fast_intrs);
 1622         if (info->ifi_filter != NULL) {
 1623                 result = info->ifi_filter(info->ifi_filter_arg);
 1624                 if ((result & FILTER_SCHEDULE_THREAD) == 0)
 1625                         return (result);
 1626         }
 1627 
 1628         GROUPTASK_ENQUEUE(gtask);
 1629         return (FILTER_HANDLED);
 1630 }
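/*
 * Sketch of the driver filter contract assumed by iflib_fast_intr()
 * above: return FILTER_STRAY when the device did not interrupt, or a
 * result with FILTER_SCHEDULE_THREAD set when the grouptask should run.
 * The example_ softc and helpers below are hypothetical.
 */
#if 0
static int
example_driver_filter(void *arg)
{
        struct example_softc *sc = arg;

        if (!example_intr_pending(sc))
                return (FILTER_STRAY);
        example_intr_ack(sc);           /* quiesce the interrupt source */
        return (FILTER_SCHEDULE_THREAD);
}
#endif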
 1631 
 1632 static int
 1633 iflib_fast_intr_rxtx(void *arg)
 1634 {
 1635         iflib_filter_info_t info = arg;
 1636         struct grouptask *gtask = info->ifi_task;
 1637         if_ctx_t ctx;
 1638         iflib_rxq_t rxq = (iflib_rxq_t)info->ifi_ctx;
 1639         iflib_txq_t txq;
 1640         void *sc;
 1641         int i, cidx, result;
 1642         qidx_t txqid;
 1643         bool intr_enable, intr_legacy;
 1644 
 1645         if (!iflib_started)
 1646                 return (FILTER_STRAY);
 1647 
 1648         DBG_COUNTER_INC(fast_intrs);
 1649         if (info->ifi_filter != NULL) {
 1650                 result = info->ifi_filter(info->ifi_filter_arg);
 1651                 if ((result & FILTER_SCHEDULE_THREAD) == 0)
 1652                         return (result);
 1653         }
 1654 
 1655         ctx = rxq->ifr_ctx;
 1656         sc = ctx->ifc_softc;
 1657         intr_enable = false;
 1658         intr_legacy = !!(ctx->ifc_flags & IFC_LEGACY);
 1659         MPASS(rxq->ifr_ntxqirq);
 1660         for (i = 0; i < rxq->ifr_ntxqirq; i++) {
 1661                 txqid = rxq->ifr_txqid[i];
 1662                 txq = &ctx->ifc_txqs[txqid];
 1663                 bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 1664                     BUS_DMASYNC_POSTREAD);
 1665                 if (!ctx->isc_txd_credits_update(sc, txqid, false)) {
 1666                         if (intr_legacy)
 1667                                 intr_enable = true;
 1668                         else
 1669                                 IFDI_TX_QUEUE_INTR_ENABLE(ctx, txqid);
 1670                         continue;
 1671                 }
 1672                 GROUPTASK_ENQUEUE(&txq->ift_task);
 1673         }
 1674         if (ctx->ifc_sctx->isc_flags & IFLIB_HAS_RXCQ)
 1675                 cidx = rxq->ifr_cq_cidx;
 1676         else
 1677                 cidx = rxq->ifr_fl[0].ifl_cidx;
 1678         if (iflib_rxd_avail(ctx, rxq, cidx, 1))
 1679                 GROUPTASK_ENQUEUE(gtask);
 1680         else {
 1681                 if (intr_legacy)
 1682                         intr_enable = true;
 1683                 else
 1684                         IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
 1685                 DBG_COUNTER_INC(rx_intr_enables);
 1686         }
 1687         if (intr_enable)
 1688                 IFDI_INTR_ENABLE(ctx);
 1689         return (FILTER_HANDLED);
 1690 }
 1691 
 1692 
 1693 static int
 1694 iflib_fast_intr_ctx(void *arg)
 1695 {
 1696         iflib_filter_info_t info = arg;
 1697         struct grouptask *gtask = info->ifi_task;
 1698         int result;
 1699 
 1700         if (!iflib_started)
 1701                 return (FILTER_STRAY);
 1702 
 1703         DBG_COUNTER_INC(fast_intrs);
 1704         if (info->ifi_filter != NULL) {
 1705                 result = info->ifi_filter(info->ifi_filter_arg);
 1706                 if ((result & FILTER_SCHEDULE_THREAD) == 0)
 1707                         return (result);
 1708         }
 1709 
 1710         GROUPTASK_ENQUEUE(gtask);
 1711         return (FILTER_HANDLED);
 1712 }
 1713 
 1714 static int
 1715 _iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 1716                  driver_filter_t filter, driver_intr_t handler, void *arg,
 1717                  const char *name)
 1718 {
 1719         int rc, flags;
 1720         struct resource *res;
 1721         void *tag = NULL;
 1722         device_t dev = ctx->ifc_dev;
 1723 
 1724         flags = RF_ACTIVE;
 1725         if (ctx->ifc_flags & IFC_LEGACY)
 1726                 flags |= RF_SHAREABLE;
 1727         MPASS(rid < 512);
 1728         irq->ii_rid = rid;
 1729         res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq->ii_rid, flags);
 1730         if (res == NULL) {
 1731                 device_printf(dev,
 1732                     "failed to allocate IRQ for rid %d, name %s.\n", rid, name);
 1733                 return (ENOMEM);
 1734         }
 1735         irq->ii_res = res;
 1736         KASSERT(filter == NULL || handler == NULL, ("filter and handler can't both be non-NULL"));
 1737         rc = bus_setup_intr(dev, res, INTR_MPSAFE | INTR_TYPE_NET,
 1738                                                 filter, handler, arg, &tag);
 1739         if (rc != 0) {
 1740                 device_printf(dev,
 1741                     "failed to setup interrupt for rid %d, name %s: %d\n",
 1742                                           rid, name ? name : "unknown", rc);
 1743                 return (rc);
 1744         } else if (name)
 1745                 bus_describe_intr(dev, res, tag, "%s", name);
 1746 
 1747         irq->ii_tag = tag;
 1748         return (0);
 1749 }
 1750 
 1751 
 1752 /*********************************************************************
 1753  *
 1754  *  Allocate DMA resources for TX buffers as well as memory for the TX
 1755  *  mbuf map.  TX DMA maps (non-TSO/TSO) and TX mbuf map are kept in an
 1756  *  iflib_sw_tx_desc_array structure, storing all the information that
 1757  *  is needed to transmit a packet on the wire.  This is called only
 1758  *  once at attach; setup is done on every reset.
 1759  *
 1760  **********************************************************************/
 1761 static int
 1762 iflib_txsd_alloc(iflib_txq_t txq)
 1763 {
 1764         if_ctx_t ctx = txq->ift_ctx;
 1765         if_shared_ctx_t sctx = ctx->ifc_sctx;
 1766         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 1767         device_t dev = ctx->ifc_dev;
 1768         bus_size_t tsomaxsize;
 1769         int err, nsegments, ntsosegments;
 1770         bool tso;
 1771 
 1772         nsegments = scctx->isc_tx_nsegments;
 1773         ntsosegments = scctx->isc_tx_tso_segments_max;
 1774         tsomaxsize = scctx->isc_tx_tso_size_max;
 1775         if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_VLAN_MTU)
 1776                 tsomaxsize += sizeof(struct ether_vlan_header);
 1777         MPASS(scctx->isc_ntxd[0] > 0);
 1778         MPASS(scctx->isc_ntxd[txq->ift_br_offset] > 0);
 1779         MPASS(nsegments > 0);
 1780         if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) {
 1781                 MPASS(ntsosegments > 0);
 1782                 MPASS(sctx->isc_tso_maxsize >= tsomaxsize);
 1783         }
 1784 
 1785         /*
 1786          * Set up DMA tags for TX buffers.
 1787          */
 1788         if ((err = bus_dma_tag_create(bus_get_dma_tag(dev),
 1789                                1, 0,                    /* alignment, bounds */
 1790                                BUS_SPACE_MAXADDR,       /* lowaddr */
 1791                                BUS_SPACE_MAXADDR,       /* highaddr */
 1792                                NULL, NULL,              /* filter, filterarg */
 1793                                sctx->isc_tx_maxsize,            /* maxsize */
 1794                                nsegments,       /* nsegments */
 1795                                sctx->isc_tx_maxsegsize, /* maxsegsize */
 1796                                0,                       /* flags */
 1797                                NULL,                    /* lockfunc */
 1798                                NULL,                    /* lockfuncarg */
 1799                                &txq->ift_buf_tag))) {
 1800                 device_printf(dev, "Unable to allocate TX DMA tag: %d\n", err);
 1801                 device_printf(dev, "maxsize: %ju nsegments: %d maxsegsize: %ju\n",
 1802                     (uintmax_t)sctx->isc_tx_maxsize, nsegments, (uintmax_t)sctx->isc_tx_maxsegsize);
 1803                 goto fail;
 1804         }
 1805         tso = (if_getcapabilities(ctx->ifc_ifp) & IFCAP_TSO) != 0;
 1806         if (tso && (err = bus_dma_tag_create(bus_get_dma_tag(dev),
 1807                                1, 0,                    /* alignment, bounds */
 1808                                BUS_SPACE_MAXADDR,       /* lowaddr */
 1809                                BUS_SPACE_MAXADDR,       /* highaddr */
 1810                                NULL, NULL,              /* filter, filterarg */
 1811                                tsomaxsize,              /* maxsize */
 1812                                ntsosegments,    /* nsegments */
 1813                                sctx->isc_tso_maxsegsize,/* maxsegsize */
 1814                                0,                       /* flags */
 1815                                NULL,                    /* lockfunc */
 1816                                NULL,                    /* lockfuncarg */
 1817                                &txq->ift_tso_buf_tag))) {
 1818                 device_printf(dev, "Unable to allocate TSO TX DMA tag: %d\n",
 1819                     err);
 1820                 goto fail;
 1821         }
 1822 
 1823         /* Allocate memory for the TX mbuf map. */
 1824         if (!(txq->ift_sds.ifsd_m =
 1825             (struct mbuf **) malloc(sizeof(struct mbuf *) *
 1826             scctx->isc_ntxd[txq->ift_br_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 1827                 device_printf(dev, "Unable to allocate TX mbuf map memory\n");
 1828                 err = ENOMEM;
 1829                 goto fail;
 1830         }
 1831 
 1832         /*
 1833          * Create the DMA maps for TX buffers.
 1834          */
 1835         if ((txq->ift_sds.ifsd_map = (bus_dmamap_t *)malloc(
 1836             sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
 1837             M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 1838                 device_printf(dev,
 1839                     "Unable to allocate TX buffer DMA map memory\n");
 1840                 err = ENOMEM;
 1841                 goto fail;
 1842         }
 1843         if (tso && (txq->ift_sds.ifsd_tso_map = (bus_dmamap_t *)malloc(
 1844             sizeof(bus_dmamap_t) * scctx->isc_ntxd[txq->ift_br_offset],
 1845             M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 1846                 device_printf(dev,
 1847                     "Unable to allocate TSO TX buffer map memory\n");
 1848                 err = ENOMEM;
 1849                 goto fail;
 1850         }
 1851         for (int i = 0; i < scctx->isc_ntxd[txq->ift_br_offset]; i++) {
 1852                 err = bus_dmamap_create(txq->ift_buf_tag, 0,
 1853                     &txq->ift_sds.ifsd_map[i]);
 1854                 if (err != 0) {
 1855                         device_printf(dev, "Unable to create TX DMA map\n");
 1856                         goto fail;
 1857                 }
 1858                 if (!tso)
 1859                         continue;
 1860                 err = bus_dmamap_create(txq->ift_tso_buf_tag, 0,
 1861                     &txq->ift_sds.ifsd_tso_map[i]);
 1862                 if (err != 0) {
 1863                         device_printf(dev, "Unable to create TSO TX DMA map\n");
 1864                         goto fail;
 1865                 }
 1866         }
 1867         return (0);
 1868 fail:
 1869         /* Free everything; this handles the case where we failed partway through. */
 1870         iflib_tx_structures_free(ctx);
 1871         return (err);
 1872 }
 1873 
 1874 static void
 1875 iflib_txsd_destroy(if_ctx_t ctx, iflib_txq_t txq, int i)
 1876 {
 1877         bus_dmamap_t map;
 1878 
 1879         if (txq->ift_sds.ifsd_map != NULL) {
 1880                 map = txq->ift_sds.ifsd_map[i];
 1881                 bus_dmamap_sync(txq->ift_buf_tag, map, BUS_DMASYNC_POSTWRITE);
 1882                 bus_dmamap_unload(txq->ift_buf_tag, map);
 1883                 bus_dmamap_destroy(txq->ift_buf_tag, map);
 1884                 txq->ift_sds.ifsd_map[i] = NULL;
 1885         }
 1886 
 1887         if (txq->ift_sds.ifsd_tso_map != NULL) {
 1888                 map = txq->ift_sds.ifsd_tso_map[i];
 1889                 bus_dmamap_sync(txq->ift_tso_buf_tag, map,
 1890                     BUS_DMASYNC_POSTWRITE);
 1891                 bus_dmamap_unload(txq->ift_tso_buf_tag, map);
 1892                 bus_dmamap_destroy(txq->ift_tso_buf_tag, map);
 1893                 txq->ift_sds.ifsd_tso_map[i] = NULL;
 1894         }
 1895 }
 1896 
 1897 static void
 1898 iflib_txq_destroy(iflib_txq_t txq)
 1899 {
 1900         if_ctx_t ctx = txq->ift_ctx;
 1901 
 1902         for (int i = 0; i < txq->ift_size; i++)
 1903                 iflib_txsd_destroy(ctx, txq, i);
 1904 
 1905         if (txq->ift_br != NULL) {
 1906                 ifmp_ring_free(txq->ift_br);
 1907                 txq->ift_br = NULL;
 1908         }
 1909 
 1910         mtx_destroy(&txq->ift_mtx);
 1911 
 1912         if (txq->ift_sds.ifsd_map != NULL) {
 1913                 free(txq->ift_sds.ifsd_map, M_IFLIB);
 1914                 txq->ift_sds.ifsd_map = NULL;
 1915         }
 1916         if (txq->ift_sds.ifsd_tso_map != NULL) {
 1917                 free(txq->ift_sds.ifsd_tso_map, M_IFLIB);
 1918                 txq->ift_sds.ifsd_tso_map = NULL;
 1919         }
 1920         if (txq->ift_sds.ifsd_m != NULL) {
 1921                 free(txq->ift_sds.ifsd_m, M_IFLIB);
 1922                 txq->ift_sds.ifsd_m = NULL;
 1923         }
 1924         if (txq->ift_buf_tag != NULL) {
 1925                 bus_dma_tag_destroy(txq->ift_buf_tag);
 1926                 txq->ift_buf_tag = NULL;
 1927         }
 1928         if (txq->ift_tso_buf_tag != NULL) {
 1929                 bus_dma_tag_destroy(txq->ift_tso_buf_tag);
 1930                 txq->ift_tso_buf_tag = NULL;
 1931         }
 1932         if (txq->ift_ifdi != NULL) {
 1933                 free(txq->ift_ifdi, M_IFLIB);
 1934         }
 1935 }
 1936 
 1937 static void
 1938 iflib_txsd_free(if_ctx_t ctx, iflib_txq_t txq, int i)
 1939 {
 1940         struct mbuf **mp;
 1941 
 1942         mp = &txq->ift_sds.ifsd_m[i];
 1943         if (*mp == NULL)
 1944                 return;
 1945 
 1946         if (txq->ift_sds.ifsd_map != NULL) {
 1947                 bus_dmamap_sync(txq->ift_buf_tag,
 1948                     txq->ift_sds.ifsd_map[i], BUS_DMASYNC_POSTWRITE);
 1949                 bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[i]);
 1950         }
 1951         if (txq->ift_sds.ifsd_tso_map != NULL) {
 1952                 bus_dmamap_sync(txq->ift_tso_buf_tag,
 1953                     txq->ift_sds.ifsd_tso_map[i], BUS_DMASYNC_POSTWRITE);
 1954                 bus_dmamap_unload(txq->ift_tso_buf_tag,
 1955                     txq->ift_sds.ifsd_tso_map[i]);
 1956         }
 1957         m_freem(*mp);
 1958         DBG_COUNTER_INC(tx_frees);
 1959         *mp = NULL;
 1960 }
 1961 
 1962 static int
 1963 iflib_txq_setup(iflib_txq_t txq)
 1964 {
 1965         if_ctx_t ctx = txq->ift_ctx;
 1966         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 1967         if_shared_ctx_t sctx = ctx->ifc_sctx;
 1968         iflib_dma_info_t di;
 1969         int i;
 1970 
 1971         /* Set number of descriptors available */
 1972         txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 1973         /* XXX make configurable */
 1974         txq->ift_update_freq = IFLIB_DEFAULT_TX_UPDATE_FREQ;
 1975 
 1976         /* Reset indices */
 1977         txq->ift_cidx_processed = 0;
 1978         txq->ift_pidx = txq->ift_cidx = txq->ift_npending = 0;
 1979         txq->ift_size = scctx->isc_ntxd[txq->ift_br_offset];
 1980 
 1981         for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
 1982                 bzero((void *)di->idi_vaddr, di->idi_size);
 1983 
 1984         IFDI_TXQ_SETUP(ctx, txq->ift_id);
 1985         for (i = 0, di = txq->ift_ifdi; i < sctx->isc_ntxqs; i++, di++)
 1986                 bus_dmamap_sync(di->idi_tag, di->idi_map,
 1987                     BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 1988         return (0);
 1989 }
 1990 
 1991 /*********************************************************************
 1992  *
 1993  *  Allocate DMA resources for RX buffers as well as memory for the RX
 1994  *  mbuf map, direct RX cluster pointer map and RX cluster bus address
 1995  *  map.  RX DMA map, RX mbuf map, direct RX cluster pointer map and
 1996  *  RX cluster map are kept in an iflib_sw_rx_desc_array structure.
 1997  *  Since we use one entry in iflib_sw_rx_desc_array per received
 1998  *  packet, the maximum number of entries we'll need is equal to the
 1999  *  number of hardware receive descriptors that we've allocated.
 2000  *
 2001  **********************************************************************/
 2002 static int
 2003 iflib_rxsd_alloc(iflib_rxq_t rxq)
 2004 {
 2005         if_ctx_t ctx = rxq->ifr_ctx;
 2006         if_shared_ctx_t sctx = ctx->ifc_sctx;
 2007         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 2008         device_t dev = ctx->ifc_dev;
 2009         iflib_fl_t fl;
 2010         int                     err;
 2011 
 2012         MPASS(scctx->isc_nrxd[0] > 0);
 2013         MPASS(scctx->isc_nrxd[rxq->ifr_fl_offset] > 0);
 2014 
 2015         fl = rxq->ifr_fl;
 2016         for (int i = 0; i <  rxq->ifr_nfl; i++, fl++) {
 2017                 fl->ifl_size = scctx->isc_nrxd[rxq->ifr_fl_offset]; /* not necessarily the same as isc_nrxd[0] */
 2018                 /* Set up DMA tag for RX buffers. */
 2019                 err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */
 2020                                          1, 0,                  /* alignment, bounds */
 2021                                          BUS_SPACE_MAXADDR,     /* lowaddr */
 2022                                          BUS_SPACE_MAXADDR,     /* highaddr */
 2023                                          NULL, NULL,            /* filter, filterarg */
 2024                                          sctx->isc_rx_maxsize,  /* maxsize */
 2025                                          sctx->isc_rx_nsegments,        /* nsegments */
 2026                                          sctx->isc_rx_maxsegsize,       /* maxsegsize */
 2027                                          0,                     /* flags */
 2028                                          NULL,                  /* lockfunc */
 2029                                          NULL,                  /* lockarg */
 2030                                          &fl->ifl_buf_tag);
 2031                 if (err) {
 2032                         device_printf(dev,
 2033                             "Unable to allocate RX DMA tag: %d\n", err);
 2034                         goto fail;
 2035                 }
 2036 
 2037                 /* Allocate memory for the RX mbuf map. */
 2038                 if (!(fl->ifl_sds.ifsd_m =
 2039                       (struct mbuf **) malloc(sizeof(struct mbuf *) *
 2040                                               scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 2041                         device_printf(dev,
 2042                             "Unable to allocate RX mbuf map memory\n");
 2043                         err = ENOMEM;
 2044                         goto fail;
 2045                 }
 2046 
 2047                 /* Allocate memory for the direct RX cluster pointer map. */
 2048                 if (!(fl->ifl_sds.ifsd_cl =
 2049                       (caddr_t *) malloc(sizeof(caddr_t) *
 2050                                               scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 2051                         device_printf(dev,
 2052                             "Unable to allocate RX cluster map memory\n");
 2053                         err = ENOMEM;
 2054                         goto fail;
 2055                 }
 2056 
 2057                 /* Allocate memory for the RX cluster bus address map. */
 2058                 if (!(fl->ifl_sds.ifsd_ba =
 2059                       (bus_addr_t *) malloc(sizeof(bus_addr_t) *
 2060                                               scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 2061                         device_printf(dev,
 2062                             "Unable to allocate RX bus address map memory\n");
 2063                         err = ENOMEM;
 2064                         goto fail;
 2065                 }
 2066 
 2067                 /*
 2068                  * Create the DMA maps for RX buffers.
 2069                  */
 2070                 if (!(fl->ifl_sds.ifsd_map =
 2071                       (bus_dmamap_t *) malloc(sizeof(bus_dmamap_t) * scctx->isc_nrxd[rxq->ifr_fl_offset], M_IFLIB, M_NOWAIT | M_ZERO))) {
 2072                         device_printf(dev,
 2073                             "Unable to allocate RX buffer DMA map memory\n");
 2074                         err = ENOMEM;
 2075                         goto fail;
 2076                 }
 2077                 for (int i = 0; i < scctx->isc_nrxd[rxq->ifr_fl_offset]; i++) {
 2078                         err = bus_dmamap_create(fl->ifl_buf_tag, 0,
 2079                             &fl->ifl_sds.ifsd_map[i]);
 2080                         if (err != 0) {
 2081                                 device_printf(dev, "Unable to create RX buffer DMA map\n");
 2082                                 goto fail;
 2083                         }
 2084                 }
 2085         }
 2086         return (0);
 2087 
 2088 fail:
 2089         iflib_rx_structures_free(ctx);
 2090         return (err);
 2091 }
 2092 
 2093 
 2094 /*
 2095  * Internal service routines
 2096  */
 2097 
 2098 struct rxq_refill_cb_arg {
 2099         int               error;
 2100         bus_dma_segment_t seg;
 2101         int               nseg;
 2102 };
 2103 
 2104 static void
 2105 _rxq_refill_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 2106 {
 2107         struct rxq_refill_cb_arg *cb_arg = arg;
 2108 
 2109         cb_arg->error = error;
 2110         cb_arg->seg = segs[0];
 2111         cb_arg->nseg = nseg;
 2112 }
 2113 
 2114 /**
 2115  * iflib_fl_refill - refill an rxq free-buffer list
 2116  * @ctx: the iflib context
 2117  * @fl: the free list to refill
 2118  * @count: the number of new buffers to allocate
 2119  *
 2120  * (Re)populate an rxq free-buffer list with up to @count new packet buffers.
 2121  * The caller must ensure that @count does not exceed the queue's capacity
 2122  * minus one (since we always leave a descriptor unavailable).
 2123  */
 2124 static uint8_t
 2125 iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
 2126 {
 2127         struct if_rxd_update iru;
 2128         struct rxq_refill_cb_arg cb_arg;
 2129         struct mbuf *m;
 2130         caddr_t cl, *sd_cl;
 2131         struct mbuf **sd_m;
 2132         bus_dmamap_t *sd_map;
 2133         bus_addr_t bus_addr, *sd_ba;
 2134         int err, frag_idx, i, idx, n, pidx;
 2135         qidx_t credits;
 2136 
 2137         MPASS(count <= fl->ifl_size - fl->ifl_credits - 1);
 2138 
 2139         sd_m = fl->ifl_sds.ifsd_m;
 2140         sd_map = fl->ifl_sds.ifsd_map;
 2141         sd_cl = fl->ifl_sds.ifsd_cl;
 2142         sd_ba = fl->ifl_sds.ifsd_ba;
 2143         pidx = fl->ifl_pidx;
 2144         idx = pidx;
 2145         frag_idx = fl->ifl_fragidx;
 2146         credits = fl->ifl_credits;
 2147 
 2148         i = 0;
 2149         n = count;
 2150         MPASS(n > 0);
 2151         MPASS(credits + n <= fl->ifl_size);
 2152 
 2153         if (pidx < fl->ifl_cidx)
 2154                 MPASS(pidx + n <= fl->ifl_cidx);
 2155         if (pidx == fl->ifl_cidx && (credits < fl->ifl_size))
 2156                 MPASS(fl->ifl_gen == 0);
 2157         if (pidx > fl->ifl_cidx)
 2158                 MPASS(n <= fl->ifl_size - pidx + fl->ifl_cidx);
 2159 
 2160         DBG_COUNTER_INC(fl_refills);
 2161         if (n > 8)
 2162                 DBG_COUNTER_INC(fl_refills_large);
 2163         iru_init(&iru, fl->ifl_rxq, fl->ifl_id);
 2164         while (n-- > 0) {
 2165                 /*
 2166                  * We allocate an uninitialized mbuf + cluster; the mbuf is
 2167                  * initialized after rx.
 2168                  *
 2169                  * If the cluster is still set then we know a minimum-sized
 2170                  * packet was received.
 2171                  */
 2172                 bit_ffc_at(fl->ifl_rx_bitmap, frag_idx, fl->ifl_size,
 2173                     &frag_idx);
 2174                 if (frag_idx < 0)
 2175                         bit_ffc(fl->ifl_rx_bitmap, fl->ifl_size, &frag_idx);
 2176                 MPASS(frag_idx >= 0);
 2177                 if ((cl = sd_cl[frag_idx]) == NULL) {
 2178                         cl = uma_zalloc(fl->ifl_zone, M_NOWAIT);
 2179                         if (__predict_false(cl == NULL))
 2180                                 break;
 2181 
 2182                         cb_arg.error = 0;
 2183                         MPASS(sd_map != NULL);
 2184                         err = bus_dmamap_load(fl->ifl_buf_tag, sd_map[frag_idx],
 2185                             cl, fl->ifl_buf_size, _rxq_refill_cb, &cb_arg,
 2186                             BUS_DMA_NOWAIT);
 2187                         if (__predict_false(err != 0 || cb_arg.error)) {
 2188                                 uma_zfree(fl->ifl_zone, cl);
 2189                                 break;
 2190                         }
 2191 
 2192                         sd_ba[frag_idx] = bus_addr = cb_arg.seg.ds_addr;
 2193                         sd_cl[frag_idx] = cl;
 2194 #if MEMORY_LOGGING
 2195                         fl->ifl_cl_enqueued++;
 2196 #endif
 2197                 } else {
 2198                         bus_addr = sd_ba[frag_idx];
 2199                 }
 2200                 bus_dmamap_sync(fl->ifl_buf_tag, sd_map[frag_idx],
 2201                     BUS_DMASYNC_PREREAD);
 2202 
 2203                 MPASS(sd_m[frag_idx] == NULL);
 2204                 m = m_gethdr(M_NOWAIT, MT_NOINIT);
 2205                 if (__predict_false(m == NULL))
 2206                         break;
 2207                 sd_m[frag_idx] = m;
 2208                 bit_set(fl->ifl_rx_bitmap, frag_idx);
 2209 #if MEMORY_LOGGING
 2210                 fl->ifl_m_enqueued++;
 2211 #endif
 2212 
 2213                 DBG_COUNTER_INC(rx_allocs);
 2214                 fl->ifl_rxd_idxs[i] = frag_idx;
 2215                 fl->ifl_bus_addrs[i] = bus_addr;
 2216                 credits++;
 2217                 i++;
 2218                 MPASS(credits <= fl->ifl_size);
 2219                 if (++idx == fl->ifl_size) {
 2220 #ifdef INVARIANTS
 2221                         fl->ifl_gen = 1;
 2222 #endif
 2223                         idx = 0;
 2224                 }
 2225                 if (n == 0 || i == IFLIB_MAX_RX_REFRESH) {
 2226                         iru.iru_pidx = pidx;
 2227                         iru.iru_count = i;
 2228                         ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 2229                         fl->ifl_pidx = idx;
 2230                         fl->ifl_credits = credits;
 2231                         pidx = idx;
 2232                         i = 0;
 2233                 }
 2234         }
 2235 
 2236         if (n < count - 1) {
 2237                 if (i != 0) {
 2238                         iru.iru_pidx = pidx;
 2239                         iru.iru_count = i;
 2240                         ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 2241                         fl->ifl_pidx = idx;
 2242                         fl->ifl_credits = credits;
 2243                 }
 2244                 DBG_COUNTER_INC(rxd_flush);
 2245                 bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 2246                     BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 2247                 ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id,
 2248                     fl->ifl_id, fl->ifl_pidx);
 2249                 if (__predict_true(bit_test(fl->ifl_rx_bitmap, frag_idx))) {
 2250                         fl->ifl_fragidx = frag_idx + 1;
 2251                         if (fl->ifl_fragidx == fl->ifl_size)
 2252                                 fl->ifl_fragidx = 0;
 2253                 } else {
 2254                         fl->ifl_fragidx = frag_idx;
 2255                 }
 2256         }
 2257 
 2258         return (n == -1 ? 0 : IFLIB_RXEOF_EMPTY);
 2259 }
 2260 
 2261 static inline uint8_t
 2262 iflib_fl_refill_all(if_ctx_t ctx, iflib_fl_t fl)
 2263 {
 2264         /*
 2265          * We leave an unused descriptor to prevent pidx from catching up
 2266          * with cidx, since pidx == cidx confuses most NICs. For instance,
 2267          * Intel NICs have (per receive ring) RDH and RDT registers, where
 2268          * RDH points to the next receive descriptor to be used by the NIC,
 2269          * and RDT for the next receive descriptor to be published by the
 2270          * driver to the NIC (RDT - 1 is thus the last valid one).
 2271          * The condition RDH == RDT means no descriptors are available to
 2272          * the NIC, and thus it would be ambiguous if it also meant that
 2273          * all the descriptors are available to the NIC.
 2274          */
 2275         int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
 2276 #ifdef INVARIANTS
 2277         int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
 2278 #endif
 2279 
 2280         MPASS(fl->ifl_credits <= fl->ifl_size);
 2281         MPASS(reclaimable == delta);
 2282 
 2283         if (reclaimable > 0)
 2284                 return (iflib_fl_refill(ctx, fl, reclaimable));
 2285         return (0);
 2286 }
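/*
 * Worked example of the computation above: with ifl_size == 1024 and
 * ifl_credits == 200, reclaimable == 1024 - 200 - 1 == 823, so at most
 * 823 buffers are posted, always keeping one descriptor unavailable to
 * the NIC.
 */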
 2287 
 2288 uint8_t
 2289 iflib_in_detach(if_ctx_t ctx)
 2290 {
 2291         bool in_detach;
 2292 
 2293         STATE_LOCK(ctx);
 2294         in_detach = !!(ctx->ifc_flags & IFC_IN_DETACH);
 2295         STATE_UNLOCK(ctx);
 2296         return (in_detach);
 2297 }
 2298 
 2299 static void
 2300 iflib_fl_bufs_free(iflib_fl_t fl)
 2301 {
 2302         iflib_dma_info_t idi = fl->ifl_ifdi;
 2303         bus_dmamap_t sd_map;
 2304         uint32_t i;
 2305 
 2306         for (i = 0; i < fl->ifl_size; i++) {
 2307                 struct mbuf **sd_m = &fl->ifl_sds.ifsd_m[i];
 2308                 caddr_t *sd_cl = &fl->ifl_sds.ifsd_cl[i];
 2309 
 2310                 if (*sd_cl != NULL) {
 2311                         sd_map = fl->ifl_sds.ifsd_map[i];
 2312                         bus_dmamap_sync(fl->ifl_buf_tag, sd_map,
 2313                             BUS_DMASYNC_POSTREAD);
 2314                         bus_dmamap_unload(fl->ifl_buf_tag, sd_map);
 2315                         uma_zfree(fl->ifl_zone, *sd_cl);
 2316                         *sd_cl = NULL;
 2317                         if (*sd_m != NULL) {
 2318                                 m_init(*sd_m, M_NOWAIT, MT_DATA, 0);
 2319                                 uma_zfree(zone_mbuf, *sd_m);
 2320                                 *sd_m = NULL;
 2321                         }
 2322                 } else {
 2323                         MPASS(*sd_m == NULL);
 2324                 }
 2325 #if MEMORY_LOGGING
 2326                 fl->ifl_m_dequeued++;
 2327                 fl->ifl_cl_dequeued++;
 2328 #endif
 2329         }
 2330 #ifdef INVARIANTS
 2331         for (i = 0; i < fl->ifl_size; i++) {
 2332                 MPASS(fl->ifl_sds.ifsd_cl[i] == NULL);
 2333                 MPASS(fl->ifl_sds.ifsd_m[i] == NULL);
 2334         }
 2335 #endif
 2336         /*
 2337          * Reset free list values
 2338          */
 2339         fl->ifl_credits = fl->ifl_cidx = fl->ifl_pidx = fl->ifl_gen = fl->ifl_fragidx = 0;
 2340         bzero(idi->idi_vaddr, idi->idi_size);
 2341 }
 2342 
 2343 /*********************************************************************
 2344  *
 2345  *  Initialize a free list and its buffers.
 2346  *
 2347  **********************************************************************/
 2348 static int
 2349 iflib_fl_setup(iflib_fl_t fl)
 2350 {
 2351         iflib_rxq_t rxq = fl->ifl_rxq;
 2352         if_ctx_t ctx = rxq->ifr_ctx;
 2353         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 2354         int qidx;
 2355 
 2356         bit_nclear(fl->ifl_rx_bitmap, 0, fl->ifl_size - 1);
 2357         /*
 2358         ** Free current RX buffer structs and their mbufs
 2359         */
 2360         iflib_fl_bufs_free(fl);
 2361         /* Now replenish the mbufs */
 2362         MPASS(fl->ifl_credits == 0);
 2363         qidx = rxq->ifr_fl_offset + fl->ifl_id;
 2364         if (scctx->isc_rxd_buf_size[qidx] != 0)
 2365                 fl->ifl_buf_size = scctx->isc_rxd_buf_size[qidx];
 2366         else
 2367                 fl->ifl_buf_size = ctx->ifc_rx_mbuf_sz;
 2368         /*
 2369          * ifl_buf_size may be a driver-supplied value, so pull it up
 2370          * to the selected mbuf size.
 2371          */
 2372         fl->ifl_buf_size = iflib_get_mbuf_size_for(fl->ifl_buf_size);
 2373         if (fl->ifl_buf_size > ctx->ifc_max_fl_buf_size)
 2374                 ctx->ifc_max_fl_buf_size = fl->ifl_buf_size;
 2375         fl->ifl_cltype = m_gettype(fl->ifl_buf_size);
 2376         fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 2377 
 2378 
 2379         /*
 2380          * Avoid pre-allocating zillions of clusters for an idle card,
 2381          * which also potentially speeds up attach. In any case make sure
 2382          * to leave a descriptor unavailable. See the comment in
 2383          * iflib_fl_refill_all().
 2384          */
 2385         MPASS(fl->ifl_size > 0);
 2386         (void)iflib_fl_refill(ctx, fl, min(128, fl->ifl_size - 1));
 2387         if (min(128, fl->ifl_size - 1) != fl->ifl_credits)
 2388                 return (ENOBUFS);
 2389         /*
 2390          * Refill succeeded: sync the descriptor area for the hardware.
 2391          */
 2392         MPASS(rxq != NULL);
 2393         MPASS(fl->ifl_ifdi != NULL);
 2394         bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 2395             BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 2396         return (0);
 2397 }
 2398 
 2399 /*********************************************************************
 2400  *
 2401  *  Free receive ring data structures
 2402  *
 2403  **********************************************************************/
 2404 static void
 2405 iflib_rx_sds_free(iflib_rxq_t rxq)
 2406 {
 2407         iflib_fl_t fl;
 2408         int i, j;
 2409 
 2410         if (rxq->ifr_fl != NULL) {
 2411                 for (i = 0; i < rxq->ifr_nfl; i++) {
 2412                         fl = &rxq->ifr_fl[i];
 2413                         if (fl->ifl_buf_tag != NULL) {
 2414                                 if (fl->ifl_sds.ifsd_map != NULL) {
 2415                                         for (j = 0; j < fl->ifl_size; j++) {
 2416                                                 bus_dmamap_sync(
 2417                                                     fl->ifl_buf_tag,
 2418                                                     fl->ifl_sds.ifsd_map[j],
 2419                                                     BUS_DMASYNC_POSTREAD);
 2420                                                 bus_dmamap_unload(
 2421                                                     fl->ifl_buf_tag,
 2422                                                     fl->ifl_sds.ifsd_map[j]);
 2423                                                 bus_dmamap_destroy(
 2424                                                     fl->ifl_buf_tag,
 2425                                                     fl->ifl_sds.ifsd_map[j]);
 2426                                         }
 2427                                 }
 2428                                 bus_dma_tag_destroy(fl->ifl_buf_tag);
 2429                                 fl->ifl_buf_tag = NULL;
 2430                         }
 2431                         free(fl->ifl_sds.ifsd_m, M_IFLIB);
 2432                         free(fl->ifl_sds.ifsd_cl, M_IFLIB);
 2433                         free(fl->ifl_sds.ifsd_ba, M_IFLIB);
 2434                         free(fl->ifl_sds.ifsd_map, M_IFLIB);
 2435                         free(fl->ifl_rx_bitmap, M_IFLIB);
 2436                         fl->ifl_sds.ifsd_m = NULL;
 2437                         fl->ifl_sds.ifsd_cl = NULL;
 2438                         fl->ifl_sds.ifsd_ba = NULL;
 2439                         fl->ifl_sds.ifsd_map = NULL;
 2440                         fl->ifl_rx_bitmap = NULL;
 2441                 }
 2442                 free(rxq->ifr_fl, M_IFLIB);
 2443                 rxq->ifr_fl = NULL;
 2444                 free(rxq->ifr_ifdi, M_IFLIB);
 2445                 rxq->ifr_ifdi = NULL;
 2446                 rxq->ifr_cq_cidx = 0;
 2447         }
 2448 }
 2449 
 2450 /*
 2451  * Timer routine
 2452  */
 2453 static void
 2454 iflib_timer(void *arg)
 2455 {
 2456         iflib_txq_t txq = arg;
 2457         if_ctx_t ctx = txq->ift_ctx;
 2458         if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 2459         uint64_t this_tick = ticks;
 2460 
 2461         if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING))
 2462                 return;
 2463 
 2464         /*
 2465         ** Check on the state of the TX queue(s); this
 2466         ** can be done without the lock because it's RO
 2467         ** and the HUNG state will be static if set.
 2468         */
 2469         if (this_tick - txq->ift_last_timer_tick >= iflib_timer_default) {
 2470                 txq->ift_last_timer_tick = this_tick;
 2471                 IFDI_TIMER(ctx, txq->ift_id);
 2472                 if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) &&
 2473                     ((txq->ift_cleaned_prev == txq->ift_cleaned) ||
 2474                      (sctx->isc_pause_frames == 0)))
 2475                         goto hung;
 2476 
 2477                 if (txq->ift_qstatus != IFLIB_QUEUE_IDLE &&
 2478                     ifmp_ring_is_stalled(txq->ift_br)) {
 2479                         KASSERT(ctx->ifc_link_state == LINK_STATE_UP,
 2480                             ("queue can't be marked as hung if interface is down"));
 2481                         txq->ift_qstatus = IFLIB_QUEUE_HUNG;
 2482                 }
 2483                 txq->ift_cleaned_prev = txq->ift_cleaned;
 2484         }
 2485         /* handle any laggards */
 2486         if (txq->ift_db_pending)
 2487                 GROUPTASK_ENQUEUE(&txq->ift_task);
 2488 
 2489         sctx->isc_pause_frames = 0;
 2490         if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) 
 2491                 callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer,
 2492                     txq, txq->ift_timer.c_cpu);
 2493         return;
 2494 
 2495  hung:
 2496         device_printf(ctx->ifc_dev,
 2497             "Watchdog timeout (TX: %d desc avail: %d pidx: %d) -- resetting\n",
 2498             txq->ift_id, TXQ_AVAIL(txq), txq->ift_pidx);
 2499         STATE_LOCK(ctx);
 2500         if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 2501         ctx->ifc_flags |= (IFC_DO_WATCHDOG|IFC_DO_RESET);
 2502         iflib_admin_intr_deferred(ctx);
 2503         STATE_UNLOCK(ctx);
 2504 }
 2505 
 2506 static uint16_t
 2507 iflib_get_mbuf_size_for(unsigned int size)
 2508 {
 2509 
 2510         if (size <= MCLBYTES)
 2511                 return (MCLBYTES);
 2512         else
 2513                 return (MJUMPAGESIZE);
 2514 }
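/*
 * Example of the selection above (a sketch assuming the usual values
 * MCLBYTES == 2048 and MJUMPAGESIZE == PAGE_SIZE): a 1500-byte maximum
 * frame size selects 2 KB clusters, while anything larger, e.g. a
 * 9000-byte jumbo frame, selects page-sized jumbo clusters.
 */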
 2515 
 2516 static void
 2517 iflib_calc_rx_mbuf_sz(if_ctx_t ctx)
 2518 {
 2519         if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 2520 
 2521         /*
 2522          * XXX don't set the max_frame_size to larger
 2523          * than the hardware can handle
 2524          */
 2525         ctx->ifc_rx_mbuf_sz =
 2526             iflib_get_mbuf_size_for(sctx->isc_max_frame_size);
 2527 }
 2528 
 2529 uint32_t
 2530 iflib_get_rx_mbuf_sz(if_ctx_t ctx)
 2531 {
 2532 
 2533         return (ctx->ifc_rx_mbuf_sz);
 2534 }
 2535 
 2536 static void
 2537 iflib_init_locked(if_ctx_t ctx)
 2538 {
 2539         if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 2540         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 2541         if_t ifp = ctx->ifc_ifp;
 2542         iflib_fl_t fl;
 2543         iflib_txq_t txq;
 2544         iflib_rxq_t rxq;
 2545         int i, j, tx_ip_csum_flags, tx_ip6_csum_flags;
 2546 
 2547         if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 2548         IFDI_INTR_DISABLE(ctx);
 2549 
 2550         /*
 2551          * See iflib_stop(). Useful in case iflib_init_locked() is
 2552          * called without first calling iflib_stop().
 2553          */
 2554         netmap_disable_all_rings(ifp);
 2555 
 2556         tx_ip_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP | CSUM_SCTP);
 2557         tx_ip6_csum_flags = scctx->isc_tx_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_IP6_SCTP);
 2558         /* Set hardware offload abilities */
 2559         if_clearhwassist(ifp);
 2560         if (if_getcapenable(ifp) & IFCAP_TXCSUM)
 2561                 if_sethwassistbits(ifp, tx_ip_csum_flags, 0);
 2562         if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6)
 2563                 if_sethwassistbits(ifp,  tx_ip6_csum_flags, 0);
 2564         if (if_getcapenable(ifp) & IFCAP_TSO4)
 2565                 if_sethwassistbits(ifp, CSUM_IP_TSO, 0);
 2566         if (if_getcapenable(ifp) & IFCAP_TSO6)
 2567                 if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
 2568 
 2569         for (i = 0, txq = ctx->ifc_txqs; i < sctx->isc_ntxqsets; i++, txq++) {
 2570                 CALLOUT_LOCK(txq);
 2571                 callout_stop(&txq->ift_timer);
 2572 #ifdef DEV_NETMAP
 2573                 callout_stop(&txq->ift_netmap_timer);
 2574 #endif /* DEV_NETMAP */
 2575                 CALLOUT_UNLOCK(txq);
 2576                 iflib_netmap_txq_init(ctx, txq);
 2577         }
 2578 
 2579         /*
 2580          * Calculate a suitable Rx mbuf size prior to calling IFDI_INIT, so
 2581          * that drivers can use the value when setting up the hardware receive
 2582          * buffers.
 2583          */
 2584         iflib_calc_rx_mbuf_sz(ctx);
 2585 
 2586 #ifdef INVARIANTS
 2587         i = if_getdrvflags(ifp);
 2588 #endif
 2589         IFDI_INIT(ctx);
 2590         MPASS(if_getdrvflags(ifp) == i);
 2591         for (i = 0, rxq = ctx->ifc_rxqs; i < sctx->isc_nrxqsets; i++, rxq++) {
 2592                 if (iflib_netmap_rxq_init(ctx, rxq) > 0) {
 2593                         /* This rxq is in netmap mode. Skip normal init. */
 2594                         continue;
 2595                 }
 2596                 for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 2597                         if (iflib_fl_setup(fl)) {
 2598                                 device_printf(ctx->ifc_dev,
 2599                                     "setting up free list %d failed - "
 2600                                     "check cluster settings\n", j);
 2601                                 goto done;
 2602                         }
 2603                 }
 2604         }
 2605 done:
 2606         if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE);
 2607         IFDI_INTR_ENABLE(ctx);
 2608         txq = ctx->ifc_txqs;
 2609         for (i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 2610                 callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
 2611                         txq->ift_timer.c_cpu);
 2612 
 2613         /* Re-enable txsync/rxsync. */
 2614         netmap_enable_all_rings(ifp);
 2615 }
 2616 
 2617 static int
 2618 iflib_media_change(if_t ifp)
 2619 {
 2620         if_ctx_t ctx = if_getsoftc(ifp);
 2621         int err;
 2622 
 2623         CTX_LOCK(ctx);
 2624         if ((err = IFDI_MEDIA_CHANGE(ctx)) == 0)
 2625                 iflib_if_init_locked(ctx);
 2626         CTX_UNLOCK(ctx);
 2627         return (err);
 2628 }
 2629 
 2630 static void
 2631 iflib_media_status(if_t ifp, struct ifmediareq *ifmr)
 2632 {
 2633         if_ctx_t ctx = if_getsoftc(ifp);
 2634 
 2635         CTX_LOCK(ctx);
 2636         IFDI_UPDATE_ADMIN_STATUS(ctx);
 2637         IFDI_MEDIA_STATUS(ctx, ifmr);
 2638         CTX_UNLOCK(ctx);
 2639 }
 2640 
 2641 void
 2642 iflib_stop(if_ctx_t ctx)
 2643 {
 2644         iflib_txq_t txq = ctx->ifc_txqs;
 2645         iflib_rxq_t rxq = ctx->ifc_rxqs;
 2646         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 2647         if_shared_ctx_t sctx = ctx->ifc_sctx;
 2648         iflib_dma_info_t di;
 2649         iflib_fl_t fl;
 2650         int i, j;
 2651 
 2652         /* Tell the stack that the interface is no longer active */
 2653         if_setdrvflagbits(ctx->ifc_ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 2654 
 2655         IFDI_INTR_DISABLE(ctx);
 2656         DELAY(1000);
 2657         IFDI_STOP(ctx);
 2658         DELAY(1000);
 2659 
 2660         /*
 2661          * Stop any pending txsync/rxsync and prevent new ones
 2662          * from starting. Processes blocked in poll() will get
 2663          * POLLERR.
 2664          */
 2665         netmap_disable_all_rings(ctx->ifc_ifp);
 2666 
 2667         iflib_debug_reset();
 2668         /* Wait for current tx queue users to exit to disarm watchdog timer. */
 2669         for (i = 0; i < scctx->isc_ntxqsets; i++, txq++) {
 2670                 /* make sure all transmitters have completed before proceeding XXX */
 2671 
 2672                 CALLOUT_LOCK(txq);
 2673                 callout_stop(&txq->ift_timer);
 2674 #ifdef DEV_NETMAP
 2675                 callout_stop(&txq->ift_netmap_timer);
 2676 #endif /* DEV_NETMAP */
 2677                 CALLOUT_UNLOCK(txq);
 2678 
 2679                 /* clean any enqueued buffers */
 2680                 iflib_ifmp_purge(txq);
 2681                 /* Free any existing tx buffers. */
 2682                 for (j = 0; j < txq->ift_size; j++) {
 2683                         iflib_txsd_free(ctx, txq, j);
 2684                 }
 2685                 txq->ift_processed = txq->ift_cleaned = txq->ift_cidx_processed = 0;
 2686                 txq->ift_in_use = txq->ift_gen = txq->ift_cidx = txq->ift_pidx = txq->ift_no_desc_avail = 0;
 2687                 txq->ift_closed = txq->ift_mbuf_defrag = txq->ift_mbuf_defrag_failed = 0;
 2688                 txq->ift_no_tx_dma_setup = txq->ift_txd_encap_efbig = txq->ift_map_failed = 0;
 2689                 txq->ift_pullups = 0;
 2690                 ifmp_ring_reset_stats(txq->ift_br);
 2691                 for (j = 0, di = txq->ift_ifdi; j < sctx->isc_ntxqs; j++, di++)
 2692                         bzero((void *)di->idi_vaddr, di->idi_size);
 2693         }
 2694         for (i = 0; i < scctx->isc_nrxqsets; i++, rxq++) {
 2695                 gtaskqueue_drain(rxq->ifr_task.gt_taskqueue,
 2696                     &rxq->ifr_task.gt_task);
 2697 
 2698                 rxq->ifr_cq_cidx = 0;
 2699                 for (j = 0, di = rxq->ifr_ifdi; j < sctx->isc_nrxqs; j++, di++)
 2700                         bzero((void *)di->idi_vaddr, di->idi_size);
 2701                 /* also resets the free lists pidx/cidx */
 2702                 for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++)
 2703                         iflib_fl_bufs_free(fl);
 2704         }
 2705 }
 2706 
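      /*
       * Return a pointer to the next cache line's worth of rx descriptor
       * memory, wrapping back to the start of the ring, for prefetching.
       */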
 2707 static inline caddr_t
 2708 calc_next_rxd(iflib_fl_t fl, int cidx)
 2709 {
 2710         qidx_t size;
 2711         int nrxd;
 2712         caddr_t start, end, cur, next;
 2713 
 2714         nrxd = fl->ifl_size;
 2715         size = fl->ifl_rxd_size;
 2716         start = fl->ifl_ifdi->idi_vaddr;
 2717 
 2718         if (__predict_false(size == 0))
 2719                 return (start);
 2720         cur = start + size*cidx;
 2721         end = start + size*nrxd;
 2722         next = CACHE_PTR_NEXT(cur);
 2723         return (next < end ? next : start);
 2724 }
 2725 
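      /*
       * Prefetch the software-descriptor arrays and the upcoming hardware
       * descriptors around cidx to warm the cache before packet processing.
       */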
 2726 static inline void
 2727 prefetch_pkts(iflib_fl_t fl, int cidx)
 2728 {
 2729         int nextptr;
 2730         int nrxd = fl->ifl_size;
 2731         caddr_t next_rxd;
 2732 
 2733 
 2734         nextptr = (cidx + CACHE_PTR_INCREMENT) & (nrxd-1);
 2735         prefetch(&fl->ifl_sds.ifsd_m[nextptr]);
 2736         prefetch(&fl->ifl_sds.ifsd_cl[nextptr]);
 2737         next_rxd = calc_next_rxd(fl, cidx);
 2738         prefetch(next_rxd);
 2739         prefetch(fl->ifl_sds.ifsd_m[(cidx + 1) & (nrxd-1)]);
 2740         prefetch(fl->ifl_sds.ifsd_m[(cidx + 2) & (nrxd-1)]);
 2741         prefetch(fl->ifl_sds.ifsd_m[(cidx + 3) & (nrxd-1)]);
 2742         prefetch(fl->ifl_sds.ifsd_m[(cidx + 4) & (nrxd-1)]);
 2743         prefetch(fl->ifl_sds.ifsd_cl[(cidx + 1) & (nrxd-1)]);
 2744         prefetch(fl->ifl_sds.ifsd_cl[(cidx + 2) & (nrxd-1)]);
 2745         prefetch(fl->ifl_sds.ifsd_cl[(cidx + 3) & (nrxd-1)]);
 2746         prefetch(fl->ifl_sds.ifsd_cl[(cidx + 4) & (nrxd-1)]);
 2747 }
 2748 
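      /*
       * Translate a received fragment into its software-descriptor state:
       * record the free list, mbuf, and cluster pointers, sync (and
       * optionally unload) the DMA map, and advance the free list cidx.
       */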
 2749 static void
 2750 rxd_frag_to_sd(iflib_rxq_t rxq, if_rxd_frag_t irf, int unload, if_rxsd_t sd)
 2751 {
 2752         int flid, cidx;
 2753         bus_dmamap_t map;
 2754         iflib_fl_t fl;
 2755         int next;
 2756 
 2757         map = NULL;
 2758         flid = irf->irf_flid;
 2759         cidx = irf->irf_idx;
 2760         fl = &rxq->ifr_fl[flid];
 2761         sd->ifsd_fl = fl;
 2762         sd->ifsd_m = &fl->ifl_sds.ifsd_m[cidx];
 2763         sd->ifsd_cl = &fl->ifl_sds.ifsd_cl[cidx];
 2764         fl->ifl_credits--;
 2765 #if MEMORY_LOGGING
 2766         fl->ifl_m_dequeued++;
 2767 #endif
 2768         if (rxq->ifr_ctx->ifc_flags & IFC_PREFETCH)
 2769                 prefetch_pkts(fl, cidx);
 2770         next = (cidx + CACHE_PTR_INCREMENT) & (fl->ifl_size-1);
 2771         prefetch(&fl->ifl_sds.ifsd_map[next]);
 2772         map = fl->ifl_sds.ifsd_map[cidx];
 2773 
 2774         bus_dmamap_sync(fl->ifl_buf_tag, map, BUS_DMASYNC_POSTREAD);
 2775 
 2776         if (unload && irf->irf_len != 0)
 2777                 bus_dmamap_unload(fl->ifl_buf_tag, map);
 2778         fl->ifl_cidx = (fl->ifl_cidx + 1) & (fl->ifl_size-1);
 2779         if (__predict_false(fl->ifl_cidx == 0))
 2780                 fl->ifl_gen = 0;
 2781         bit_clear(fl->ifl_rx_bitmap, cidx);
 2782 }
 2783 
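      /*
       * Build an mbuf chain from the fragments described in ri, attaching
       * each cluster to a freshly initialized mbuf and applying any pad
       * offset to the first fragment.  Returns the head of the chain.
       */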
 2784 static struct mbuf *
 2785 assemble_segments(iflib_rxq_t rxq, if_rxd_info_t ri, if_rxsd_t sd)
 2786 {
 2787         int i, padlen, flags;
 2788         struct mbuf *m, *mh, *mt;
 2789         caddr_t cl;
 2790 
 2791         i = 0;
 2792         mh = NULL;
 2793         do {
 2794                 rxd_frag_to_sd(rxq, &ri->iri_frags[i], TRUE, sd);
 2795 
 2796                 MPASS(*sd->ifsd_cl != NULL);
 2797                 MPASS(*sd->ifsd_m != NULL);
 2798 
 2799                 /* Don't include zero-length frags */
 2800                 if (ri->iri_frags[i].irf_len == 0) {
 2801                         /* XXX we can save the cluster here, but not the mbuf */
 2802                         m_init(*sd->ifsd_m, M_NOWAIT, MT_DATA, 0);
 2803                         m_free(*sd->ifsd_m);
 2804                         *sd->ifsd_m = NULL;
 2805                         continue;
 2806                 }
 2807                 m = *sd->ifsd_m;
 2808                 *sd->ifsd_m = NULL;
 2809                 if (mh == NULL) {
 2810                         flags = M_PKTHDR|M_EXT;
 2811                         mh = mt = m;
 2812                         padlen = ri->iri_pad;
 2813                 } else {
 2814                         flags = M_EXT;
 2815                         mt->m_next = m;
 2816                         mt = m;
 2817                         /* assuming padding is only on the first fragment */
 2818                         padlen = 0;
 2819                 }
 2820                 cl = *sd->ifsd_cl;
 2821                 *sd->ifsd_cl = NULL;
 2822 
 2823                 /* Can these two be made one? */
 2824                 m_init(m, M_NOWAIT, MT_DATA, flags);
 2825                 m_cljset(m, cl, sd->ifsd_fl->ifl_cltype);
 2826                 /*
 2827                  * These must follow m_init and m_cljset
 2828                  */
 2829                 m->m_data += padlen;
 2830                 ri->iri_len -= padlen;
 2831                 m->m_len = ri->iri_frags[i].irf_len;
 2832         } while (++i < ri->iri_nfrags);
 2833 
 2834         return (mh);
 2835 }
 2836 
 2837 /*
 2838  * Process one software descriptor
 2839  */
 2840 static struct mbuf *
 2841 iflib_rxd_pkt_get(iflib_rxq_t rxq, if_rxd_info_t ri)
 2842 {
 2843         struct if_rxsd sd;
 2844         struct mbuf *m;
 2845 
 2846         /* should I merge this back in now that the two paths are basically duplicated? */
 2847         if (ri->iri_nfrags == 1 &&
 2848             ri->iri_frags[0].irf_len != 0 &&
 2849             ri->iri_frags[0].irf_len <= MIN(IFLIB_RX_COPY_THRESH, MHLEN)) {
 2850                 rxd_frag_to_sd(rxq, &ri->iri_frags[0], FALSE, &sd);
 2851                 m = *sd.ifsd_m;
 2852                 *sd.ifsd_m = NULL;
 2853                 m_init(m, M_NOWAIT, MT_DATA, M_PKTHDR);
 2854 #ifndef __NO_STRICT_ALIGNMENT
 2855                 if (!IP_ALIGNED(m))
 2856                         m->m_data += 2;
 2857 #endif
 2858                 memcpy(m->m_data, *sd.ifsd_cl, ri->iri_len);
 2859                 m->m_len = ri->iri_frags[0].irf_len;
 2860         } else {
 2861                 m = assemble_segments(rxq, ri, &sd);
 2862                 if (m == NULL)
 2863                         return (NULL);
 2864         }
 2865         m->m_pkthdr.len = ri->iri_len;
 2866         m->m_pkthdr.rcvif = ri->iri_ifp;
 2867         m->m_flags |= ri->iri_flags;
 2868         m->m_pkthdr.ether_vtag = ri->iri_vtag;
 2869         m->m_pkthdr.flowid = ri->iri_flowid;
 2870         M_HASHTYPE_SET(m, ri->iri_rsstype);
 2871         m->m_pkthdr.csum_flags = ri->iri_csum_flags;
 2872         m->m_pkthdr.csum_data = ri->iri_csum_data;
 2873         return (m);
 2874 }
 2875 
 2876 #if defined(INET6) || defined(INET)
 2877 static void
 2878 iflib_get_ip_forwarding(struct lro_ctrl *lc, bool *v4, bool *v6)
 2879 {
 2880         CURVNET_SET(lc->ifp->if_vnet);
 2881 #if defined(INET6)
 2882         *v6 = V_ip6_forwarding;
 2883 #endif
 2884 #if defined(INET)
 2885         *v4 = V_ipforwarding;
 2886 #endif
 2887         CURVNET_RESTORE();
 2888 }
 2889 
 2890 /*
 2891  * Returns true if it's possible this packet could be LROed.
 2892  * If it returns false, it is guaranteed that tcp_lro_rx()
 2893  * would not return zero.
 2894  */
 2895 static bool
 2896 iflib_check_lro_possible(struct mbuf *m, bool v4_forwarding, bool v6_forwarding)
 2897 {
 2898         struct ether_header *eh;
 2899 
 2900         eh = mtod(m, struct ether_header *);
 2901         switch (eh->ether_type) {
 2902 #if defined(INET6)
 2903                 case htons(ETHERTYPE_IPV6):
 2904                         return (!v6_forwarding);
 2905 #endif
 2906 #if defined (INET)
 2907                 case htons(ETHERTYPE_IP):
 2908                         return (!v4_forwarding);
 2909 #endif
 2910         }
 2911 
 2912         return false;
 2913 }
 2914 #else
 2915 static void
 2916 iflib_get_ip_forwarding(struct lro_ctrl *lc __unused, bool *v4 __unused, bool *v6 __unused)
 2917 {
 2918 }
 2919 #endif
 2920 
 2921 static void
 2922 _task_fn_rx_watchdog(void *context)
 2923 {
 2924         iflib_rxq_t rxq = context;
 2925 
 2926         GROUPTASK_ENQUEUE(&rxq->ifr_task);
 2927 }
 2928 
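      /*
       * Service a receive queue: pull up to 'budget' packets from the
       * hardware, refill the free lists, and hand completed packets to LRO
       * or if_input.  Returns IFLIB_RXEOF_* flags indicating whether more
       * work remains.
       */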
 2929 static uint8_t
 2930 iflib_rxeof(iflib_rxq_t rxq, qidx_t budget)
 2931 {
 2932         if_t ifp;
 2933         if_ctx_t ctx = rxq->ifr_ctx;
 2934         if_shared_ctx_t sctx = ctx->ifc_sctx;
 2935         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 2936         int avail, i;
 2937         qidx_t *cidxp;
 2938         struct if_rxd_info ri;
 2939         int err, budget_left, rx_bytes, rx_pkts;
 2940         iflib_fl_t fl;
 2941         int lro_enabled;
 2942         bool v4_forwarding, v6_forwarding, lro_possible;
 2943         uint8_t retval = 0;
 2944 
 2945         /*
 2946          * XXX early demux data packets so that if_input processing only handles
 2947          * acks in interrupt context
 2948          */
 2949         struct mbuf *m, *mh, *mt, *mf;
 2950 
 2951         lro_possible = v4_forwarding = v6_forwarding = false;
 2952         ifp = ctx->ifc_ifp;
 2953         mh = mt = NULL;
 2954         MPASS(budget > 0);
 2955         rx_pkts = rx_bytes = 0;
 2956         if (sctx->isc_flags & IFLIB_HAS_RXCQ)
 2957                 cidxp = &rxq->ifr_cq_cidx;
 2958         else
 2959                 cidxp = &rxq->ifr_fl[0].ifl_cidx;
 2960         if ((avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget)) == 0) {
 2961                 for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 2962                         retval |= iflib_fl_refill_all(ctx, fl);
 2963                 DBG_COUNTER_INC(rx_unavail);
 2964                 return (retval);
 2965         }
 2966 
 2967         for (budget_left = budget; budget_left > 0 && avail > 0;) {
 2968                 if (__predict_false(!CTX_ACTIVE(ctx))) {
 2969                         DBG_COUNTER_INC(rx_ctx_inactive);
 2970                         break;
 2971                 }
 2972                 /*
 2973                  * Reset client set fields to their default values
 2974                  */
 2975                 rxd_info_zero(&ri);
 2976                 ri.iri_qsidx = rxq->ifr_id;
 2977                 ri.iri_cidx = *cidxp;
 2978                 ri.iri_ifp = ifp;
 2979                 ri.iri_frags = rxq->ifr_frags;
 2980                 err = ctx->isc_rxd_pkt_get(ctx->ifc_softc, &ri);
 2981 
 2982                 if (err)
 2983                         goto err;
 2984                 if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 2985                         *cidxp = ri.iri_cidx;
 2986                         /* Update our consumer index */
 2987                         /* XXX NB: shurd - check if this is still safe */
 2988                         while (rxq->ifr_cq_cidx >= scctx->isc_nrxd[0])
 2989                                 rxq->ifr_cq_cidx -= scctx->isc_nrxd[0];
 2990                         /* was this only a completion queue message? */
 2991                         if (__predict_false(ri.iri_nfrags == 0))
 2992                                 continue;
 2993                 }
 2994                 MPASS(ri.iri_nfrags != 0);
 2995                 MPASS(ri.iri_len != 0);
 2996 
 2997                 /* will advance the cidx on the corresponding free lists */
 2998                 m = iflib_rxd_pkt_get(rxq, &ri);
 2999                 avail--;
 3000                 budget_left--;
 3001                 if (avail == 0 && budget_left)
 3002                         avail = iflib_rxd_avail(ctx, rxq, *cidxp, budget_left);
 3003 
 3004                 if (__predict_false(m == NULL)) {
 3005                         DBG_COUNTER_INC(rx_mbuf_null);
 3006                         continue;
 3007                 }
 3008                 /* imm_pkt: -- cxgb */
 3009                 if (mh == NULL)
 3010                         mh = mt = m;
 3011                 else {
 3012                         mt->m_nextpkt = m;
 3013                         mt = m;
 3014                 }
 3015         }
 3016         /* make sure that we can refill faster than drain */
 3017         for (i = 0, fl = &rxq->ifr_fl[0]; i < sctx->isc_nfl; i++, fl++)
 3018                 retval |= iflib_fl_refill_all(ctx, fl);
 3019 
 3020         lro_enabled = (if_getcapenable(ifp) & IFCAP_LRO);
 3021         if (lro_enabled)
 3022                 iflib_get_ip_forwarding(&rxq->ifr_lc, &v4_forwarding, &v6_forwarding);
 3023         mt = mf = NULL;
 3024         while (mh != NULL) {
 3025                 m = mh;
 3026                 mh = mh->m_nextpkt;
 3027                 m->m_nextpkt = NULL;
 3028 #ifndef __NO_STRICT_ALIGNMENT
 3029                 if (!IP_ALIGNED(m) && (m = iflib_fixup_rx(m)) == NULL)
 3030                         continue;
 3031 #endif
 3032                 rx_bytes += m->m_pkthdr.len;
 3033                 rx_pkts++;
 3034 #if defined(INET6) || defined(INET)
 3035                 if (lro_enabled) {
 3036                         if (!lro_possible) {
 3037                                 lro_possible = iflib_check_lro_possible(m, v4_forwarding, v6_forwarding);
 3038                                 if (lro_possible && mf != NULL) {
 3039                                         ifp->if_input(ifp, mf);
 3040                                         DBG_COUNTER_INC(rx_if_input);
 3041                                         mt = mf = NULL;
 3042                                 }
 3043                         }
 3044                         if ((m->m_pkthdr.csum_flags & (CSUM_L4_CALC|CSUM_L4_VALID)) ==
 3045                             (CSUM_L4_CALC|CSUM_L4_VALID)) {
 3046                                 if (lro_possible && tcp_lro_rx(&rxq->ifr_lc, m, 0) == 0)
 3047                                         continue;
 3048                         }
 3049                 }
 3050 #endif
 3051                 if (lro_possible) {
 3052                         ifp->if_input(ifp, m);
 3053                         DBG_COUNTER_INC(rx_if_input);
 3054                         continue;
 3055                 }
 3056 
 3057                 if (mf == NULL)
 3058                         mf = m;
 3059                 if (mt != NULL)
 3060                         mt->m_nextpkt = m;
 3061                 mt = m;
 3062         }
 3063         if (mf != NULL) {
 3064                 ifp->if_input(ifp, mf);
 3065                 DBG_COUNTER_INC(rx_if_input);
 3066         }
 3067 
 3068         if_inc_counter(ifp, IFCOUNTER_IBYTES, rx_bytes);
 3069         if_inc_counter(ifp, IFCOUNTER_IPACKETS, rx_pkts);
 3070 
 3071         /*
 3072          * Flush any outstanding LRO work
 3073          */
 3074 #if defined(INET6) || defined(INET)
 3075         tcp_lro_flush_all(&rxq->ifr_lc);
 3076 #endif
 3077         if (avail != 0 || iflib_rxd_avail(ctx, rxq, *cidxp, 1) != 0)
 3078                 retval |= IFLIB_RXEOF_MORE;
 3079         return (retval);
 3080 err:
 3081         STATE_LOCK(ctx);
 3082         ctx->ifc_flags |= IFC_DO_RESET;
 3083         iflib_admin_intr_deferred(ctx);
 3084         STATE_UNLOCK(ctx);
 3085         return (0);
 3086 }
 3087 
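      /*
       * Doorbell / report-status pacing helpers: the busier the ring, the
       * more descriptor updates may be deferred before a notification is
       * forced.
       */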
 3088 #define TXD_NOTIFY_COUNT(txq) (((txq)->ift_size / (txq)->ift_update_freq)-1)
 3089 static inline qidx_t
 3090 txq_max_db_deferred(iflib_txq_t txq, qidx_t in_use)
 3091 {
 3092         qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
 3093         qidx_t minthresh = txq->ift_size / 8;
 3094         if (in_use > 4*minthresh)
 3095                 return (notify_count);
 3096         if (in_use > 2*minthresh)
 3097                 return (notify_count >> 1);
 3098         if (in_use > minthresh)
 3099                 return (notify_count >> 3);
 3100         return (0);
 3101 }
 3102 
 3103 static inline qidx_t
 3104 txq_max_rs_deferred(iflib_txq_t txq)
 3105 {
 3106         qidx_t notify_count = TXD_NOTIFY_COUNT(txq);
 3107         qidx_t minthresh = txq->ift_size / 8;
 3108         if (txq->ift_in_use > 4*minthresh)
 3109                 return (notify_count);
 3110         if (txq->ift_in_use > 2*minthresh)
 3111                 return (notify_count >> 1);
 3112         if (txq->ift_in_use > minthresh)
 3113                 return (notify_count >> 2);
 3114         return (2);
 3115 }
 3116 
 3117 #define M_CSUM_FLAGS(m) ((m)->m_pkthdr.csum_flags)
 3118 #define M_HAS_VLANTAG(m) (m->m_flags & M_VLANTAG)
 3119 
 3120 #define TXQ_MAX_DB_DEFERRED(txq, in_use) txq_max_db_deferred((txq), (in_use))
 3121 #define TXQ_MAX_RS_DEFERRED(txq) txq_max_rs_deferred(txq)
 3122 #define TXQ_MAX_DB_CONSUMED(size) (size >> 4)
 3123 
 3124 /* forward compatibility for cxgb */
 3125 #define FIRST_QSET(ctx) 0
 3126 #define NTXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_ntxqsets)
 3127 #define NRXQSETS(ctx) ((ctx)->ifc_softc_ctx.isc_nrxqsets)
 3128 #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
 3129 #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
 3130 
 3131 /* XXX we should be setting this to something other than zero */
 3132 #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
 3133 #define MAX_TX_DESC(ctx) MAX((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \
 3134     (ctx)->ifc_softc_ctx.isc_tx_nsegments)
 3135 
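      /*
       * Ring the doorbell if forced ('ring'), if enough updates are pending,
       * or if the ring is nearly full; returns true if the doorbell was
       * written.
       */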
 3136 static inline bool
 3137 iflib_txd_db_check(iflib_txq_t txq, int ring)
 3138 {
 3139         if_ctx_t ctx = txq->ift_ctx;
 3140         qidx_t dbval, max;
 3141 
 3142         max = TXQ_MAX_DB_DEFERRED(txq, txq->ift_in_use);
 3143 
 3144         /* force || threshold exceeded || at the edge of the ring */
 3145         if (ring || (txq->ift_db_pending >= max) || (TXQ_AVAIL(txq) <= MAX_TX_DESC(ctx) + 2)) {
 3146 
 3147                 /*
 3148                  * 'npending' is used if the card's doorbell is in terms of the number of descriptors
 3149                  * pending flush (BRCM). 'pidx' is used in cases where the card's doorbell uses the
 3150                  * producer index explicitly (INTC).
 3151                  */
 3152                 dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx;
 3153                 bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 3154                     BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 3155                 ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval);
 3156 
 3157                 /*
 3158                  * Absent bugs, there are zero packets pending, so reset the pending counts to zero.
 3159                  */
 3160                 txq->ift_db_pending = txq->ift_npending = 0;
 3161                 return (true);
 3162         }
 3163         return (false);
 3164 }
 3165 
 3166 #ifdef PKT_DEBUG
 3167 static void
 3168 print_pkt(if_pkt_info_t pi)
 3169 {
 3170         printf("pi len:  %d qsidx: %d nsegs: %d ndescs: %d flags: %x pidx: %d\n",
 3171                pi->ipi_len, pi->ipi_qsidx, pi->ipi_nsegs, pi->ipi_ndescs, pi->ipi_flags, pi->ipi_pidx);
 3172         printf("pi new_pidx: %d csum_flags: %lx tso_segsz: %d mflags: %x vtag: %d\n",
 3173                pi->ipi_new_pidx, pi->ipi_csum_flags, pi->ipi_tso_segsz, pi->ipi_mflags, pi->ipi_vtag);
 3174         printf("pi etype: %d ehdrlen: %d ip_hlen: %d ipproto: %d\n",
 3175                pi->ipi_etype, pi->ipi_ehdrlen, pi->ipi_ip_hlen, pi->ipi_ipproto);
 3176 }
 3177 #endif
 3178 
 3179 #define IS_TSO4(pi) ((pi)->ipi_csum_flags & CSUM_IP_TSO)
 3180 #define IS_TX_OFFLOAD4(pi) ((pi)->ipi_csum_flags & (CSUM_IP_TCP | CSUM_IP_TSO))
 3181 #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO)
 3182 #define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO))
 3183 
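      /*
       * Parse the Ethernet/IP/TCP headers of an outbound packet and fill in
       * the if_pkt_info fields the driver needs for checksum and TSO
       * offload, pulling up or duplicating the mbuf as required.
       */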
 3184 static int
 3185 iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp)
 3186 {
 3187         if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx;
 3188         struct ether_vlan_header *eh;
 3189         struct mbuf *m;
 3190 
 3191         m = *mp;
 3192         if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) &&
 3193             M_WRITABLE(m) == 0) {
 3194                 if ((m = m_dup(m, M_NOWAIT)) == NULL) {
 3195                         return (ENOMEM);
 3196                 } else {
 3197                         m_freem(*mp);
 3198                         DBG_COUNTER_INC(tx_frees);
 3199                         *mp = m;
 3200                 }
 3201         }
 3202 
 3203         /*
 3204          * Determine where frame payload starts.
 3205          * Jump over vlan headers if already present,
 3206          * helpful for QinQ too.
 3207          */
 3208         if (__predict_false(m->m_len < sizeof(*eh))) {
 3209                 txq->ift_pullups++;
 3210                 if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL))
 3211                         return (ENOMEM);
 3212         }
 3213         eh = mtod(m, struct ether_vlan_header *);
 3214         if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
 3215                 pi->ipi_etype = ntohs(eh->evl_proto);
 3216                 pi->ipi_ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
 3217         } else {
 3218                 pi->ipi_etype = ntohs(eh->evl_encap_proto);
 3219                 pi->ipi_ehdrlen = ETHER_HDR_LEN;
 3220         }
 3221 
 3222         switch (pi->ipi_etype) {
 3223 #ifdef INET
 3224         case ETHERTYPE_IP:
 3225         {
 3226                 struct mbuf *n;
 3227                 struct ip *ip = NULL;
 3228                 struct tcphdr *th = NULL;
 3229                 int minthlen;
 3230 
 3231                 minthlen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip) + sizeof(*th));
 3232                 if (__predict_false(m->m_len < minthlen)) {
 3233                         /*
 3234                          * if this code bloat is causing too much of a hit
 3235                          * move it to a separate function and mark it noinline
 3236                          */
 3237                         if (m->m_len == pi->ipi_ehdrlen) {
 3238                                 n = m->m_next;
 3239                                 MPASS(n);
 3240                                 if (n->m_len >= sizeof(*ip))  {
 3241                                         ip = (struct ip *)n->m_data;
 3242                                         if (n->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 3243                                                 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 3244                                 } else {
 3245                                         txq->ift_pullups++;
 3246                                         if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
 3247                                                 return (ENOMEM);
 3248                                         ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 3249                                 }
 3250                         } else {
 3251                                 txq->ift_pullups++;
 3252                                 if (__predict_false((m = m_pullup(m, minthlen)) == NULL))
 3253                                         return (ENOMEM);
 3254                                 ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 3255                                 if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 3256                                         th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 3257                         }
 3258                 } else {
 3259                         ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen);
 3260                         if (m->m_len >= (ip->ip_hl << 2) + sizeof(*th))
 3261                                 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 3262                 }
 3263                 pi->ipi_ip_hlen = ip->ip_hl << 2;
 3264                 pi->ipi_ipproto = ip->ip_p;
 3265                 pi->ipi_flags |= IPI_TX_IPV4;
 3266 
 3267                 /* TCP checksum offload may require TCP header length */
 3268                 if (IS_TX_OFFLOAD4(pi)) {
 3269                         if (__predict_true(pi->ipi_ipproto == IPPROTO_TCP)) {
 3270                                 if (__predict_false(th == NULL)) {
 3271                                         txq->ift_pullups++;
 3272                                         if (__predict_false((m = m_pullup(m, (ip->ip_hl << 2) + sizeof(*th))) == NULL))
 3273                                                 return (ENOMEM);
 3274                                         th = (struct tcphdr *)((caddr_t)ip + pi->ipi_ip_hlen);
 3275                                 }
 3276                                 pi->ipi_tcp_hflags = th->th_flags;
 3277                                 pi->ipi_tcp_hlen = th->th_off << 2;
 3278                                 pi->ipi_tcp_seq = th->th_seq;
 3279                         }
 3280                         if (IS_TSO4(pi)) {
 3281                                 if (__predict_false(ip->ip_p != IPPROTO_TCP))
 3282                                         return (ENXIO);
 3283                                 /*
 3284                                  * TSO always requires hardware checksum offload.
 3285                                  */
 3286                                 pi->ipi_csum_flags |= (CSUM_IP_TCP | CSUM_IP);
 3287                                 th->th_sum = in_pseudo(ip->ip_src.s_addr,
 3288                                                        ip->ip_dst.s_addr, htons(IPPROTO_TCP));
 3289                                 pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
 3290                                 if (sctx->isc_flags & IFLIB_TSO_INIT_IP) {
 3291                                         ip->ip_sum = 0;
 3292                                         ip->ip_len = htons(pi->ipi_ip_hlen + pi->ipi_tcp_hlen + pi->ipi_tso_segsz);
 3293                                 }
 3294                         }
 3295                 }
 3296                 if ((sctx->isc_flags & IFLIB_NEED_ZERO_CSUM) && (pi->ipi_csum_flags & CSUM_IP))
 3297                        ip->ip_sum = 0;
 3298 
 3299                 break;
 3300         }
 3301 #endif
 3302 #ifdef INET6
 3303         case ETHERTYPE_IPV6:
 3304         {
 3305                 struct ip6_hdr *ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen);
 3306                 struct tcphdr *th;
 3307                 pi->ipi_ip_hlen = sizeof(struct ip6_hdr);
 3308 
 3309                 if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) {
 3310                         txq->ift_pullups++;
 3311                         if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL))
 3312                                 return (ENOMEM);
 3313                 }
 3314                 th = (struct tcphdr *)((caddr_t)ip6 + pi->ipi_ip_hlen);
 3315 
 3316                 /* XXX-BZ this will go badly in case of ext hdrs. */
 3317                 pi->ipi_ipproto = ip6->ip6_nxt;
 3318                 pi->ipi_flags |= IPI_TX_IPV6;
 3319 
 3320                 /* TCP checksum offload may require TCP header length */
 3321                 if (IS_TX_OFFLOAD6(pi)) {
 3322                         if (pi->ipi_ipproto == IPPROTO_TCP) {
 3323                                 if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) {
 3324                                         txq->ift_pullups++;
 3325                                         if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr) + sizeof(struct tcphdr))) == NULL))
 3326                                                 return (ENOMEM);
 3327                                 }
 3328                                 pi->ipi_tcp_hflags = th->th_flags;
 3329                                 pi->ipi_tcp_hlen = th->th_off << 2;
 3330                                 pi->ipi_tcp_seq = th->th_seq;
 3331                         }
 3332                         if (IS_TSO6(pi)) {
 3333                                 if (__predict_false(ip6->ip6_nxt != IPPROTO_TCP))
 3334                                         return (ENXIO);
 3335                                 /*
 3336                                  * TSO always requires hardware checksum offload.
 3337                                  */
 3338                                 pi->ipi_csum_flags |= CSUM_IP6_TCP;
 3339                                 th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0);
 3340                                 pi->ipi_tso_segsz = m->m_pkthdr.tso_segsz;
 3341                         }
 3342                 }
 3343                 break;
 3344         }
 3345 #endif
 3346         default:
 3347                 pi->ipi_csum_flags &= ~CSUM_OFFLOAD;
 3348                 pi->ipi_ip_hlen = 0;
 3349                 break;
 3350         }
 3351         *mp = m;
 3352 
 3353         return (0);
 3354 }
 3355 
 3356 /*
 3357  * If dodgy hardware rejects the scatter-gather chain we've handed it,
 3358  * we'll need to remove the mbuf chain from ifsd_m[] before we can add
 3359  * the m_defrag'd mbufs.
 3360  */
 3361 static __noinline struct mbuf *
 3362 iflib_remove_mbuf(iflib_txq_t txq)
 3363 {
 3364         int ntxd, pidx;
 3365         struct mbuf *m, **ifsd_m;
 3366 
 3367         ifsd_m = txq->ift_sds.ifsd_m;
 3368         ntxd = txq->ift_size;
 3369         pidx = txq->ift_pidx & (ntxd - 1);
 3371         m = ifsd_m[pidx];
 3372         ifsd_m[pidx] = NULL;
 3373         bus_dmamap_unload(txq->ift_buf_tag, txq->ift_sds.ifsd_map[pidx]);
 3374         if (txq->ift_sds.ifsd_tso_map != NULL)
 3375                 bus_dmamap_unload(txq->ift_tso_buf_tag,
 3376                     txq->ift_sds.ifsd_tso_map[pidx]);
 3377 #if MEMORY_LOGGING
 3378         txq->ift_dequeued++;
 3379 #endif
 3380         return (m);
 3381 }
 3382 
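      /*
       * Return a pointer to the next cache line of tx descriptor memory for
       * queue 'qid', wrapping back to the start of the ring, for prefetching.
       */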
 3383 static inline caddr_t
 3384 calc_next_txd(iflib_txq_t txq, int cidx, uint8_t qid)
 3385 {
 3386         qidx_t size;
 3387         int ntxd;
 3388         caddr_t start, end, cur, next;
 3389 
 3390         ntxd = txq->ift_size;
 3391         size = txq->ift_txd_size[qid];
 3392         start = txq->ift_ifdi[qid].idi_vaddr;
 3393 
 3394         if (__predict_false(size == 0))
 3395                 return (start);
 3396         cur = start + size*cidx;
 3397         end = start + size*ntxd;
 3398         next = CACHE_PTR_NEXT(cur);
 3399         return (next < end ? next : start);
 3400 }
 3401 
 3402 /*
 3403  * Pad an mbuf to ensure a minimum Ethernet frame size.
 3404  * min_frame_size is the frame size (less CRC) to pad the mbuf to.
 3405  */
 3406 static __noinline int
 3407 iflib_ether_pad(device_t dev, struct mbuf **m_head, uint16_t min_frame_size)
 3408 {
 3409         /*
 3410          * 18 is enough bytes to pad an ARP packet to 46 bytes, and
 3411  * an ARP message is the smallest common payload I can think of.
 3412          */
 3413         static char pad[18];    /* just zeros */
 3414         int n;
 3415         struct mbuf *new_head;
 3416 
 3417         if (!M_WRITABLE(*m_head)) {
 3418                 new_head = m_dup(*m_head, M_NOWAIT);
 3419                 if (new_head == NULL) {
 3420                         m_freem(*m_head);
 3421                         device_printf(dev, "cannot pad short frame, m_dup() failed\n");
 3422                         DBG_COUNTER_INC(encap_pad_mbuf_fail);
 3423                         DBG_COUNTER_INC(tx_frees);
 3424                         return ENOMEM;
 3425                 }
 3426                 m_freem(*m_head);
 3427                 *m_head = new_head;
 3428         }
 3429 
 3430         for (n = min_frame_size - (*m_head)->m_pkthdr.len;
 3431              n > 0; n -= sizeof(pad))
 3432                 if (!m_append(*m_head, min(n, sizeof(pad)), pad))
 3433                         break;
 3434 
 3435         if (n > 0) {
 3436                 m_freem(*m_head);
 3437                 device_printf(dev, "cannot pad short frame\n");
 3438                 DBG_COUNTER_INC(encap_pad_mbuf_fail);
 3439                 DBG_COUNTER_INC(tx_frees);
 3440                 return (ENOBUFS);
 3441         }
 3442 
 3443         return 0;
 3444 }
 3445 
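      /*
       * Encapsulate one mbuf chain for transmission: DMA-map it (collapsing
       * or defragmenting on EFBIG), fill in the packet info, and hand it to
       * the driver's txd_encap routine, updating ring accounting on success.
       */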
 3446 static int
 3447 iflib_encap(iflib_txq_t txq, struct mbuf **m_headp)
 3448 {
 3449         if_ctx_t                ctx;
 3450         if_shared_ctx_t         sctx;
 3451         if_softc_ctx_t          scctx;
 3452         bus_dma_tag_t           buf_tag;
 3453         bus_dma_segment_t       *segs;
 3454         struct mbuf             *m_head, **ifsd_m;
 3455         void                    *next_txd;
 3456         bus_dmamap_t            map;
 3457         struct if_pkt_info      pi;
 3458         int remap = 0;
 3459         int err, nsegs, ndesc, max_segs, pidx, cidx, next, ntxd;
 3460 
 3461         ctx = txq->ift_ctx;
 3462         sctx = ctx->ifc_sctx;
 3463         scctx = &ctx->ifc_softc_ctx;
 3464         segs = txq->ift_segs;
 3465         ntxd = txq->ift_size;
 3466         m_head = *m_headp;
 3467         map = NULL;
 3468 
 3469         /*
 3470          * If we're doing TSO the next descriptor to clean may be quite far ahead
 3471          */
 3472         cidx = txq->ift_cidx;
 3473         pidx = txq->ift_pidx;
 3474         if (ctx->ifc_flags & IFC_PREFETCH) {
 3475                 next = (cidx + CACHE_PTR_INCREMENT) & (ntxd-1);
 3476                 if (!(ctx->ifc_flags & IFLIB_HAS_TXCQ)) {
 3477                         next_txd = calc_next_txd(txq, cidx, 0);
 3478                         prefetch(next_txd);
 3479                 }
 3480 
 3481                 /* prefetch the next cache line of mbuf pointers and flags */
 3482                 prefetch(&txq->ift_sds.ifsd_m[next]);
 3483                 prefetch(&txq->ift_sds.ifsd_map[next]);
 3484                 next = (cidx + CACHE_LINE_SIZE) & (ntxd-1);
 3485         }
 3486         map = txq->ift_sds.ifsd_map[pidx];
 3487         ifsd_m = txq->ift_sds.ifsd_m;
 3488 
 3489         if (m_head->m_pkthdr.csum_flags & CSUM_TSO) {
 3490                 buf_tag = txq->ift_tso_buf_tag;
 3491                 max_segs = scctx->isc_tx_tso_segments_max;
 3492                 map = txq->ift_sds.ifsd_tso_map[pidx];
 3493                 MPASS(buf_tag != NULL);
 3494                 MPASS(max_segs > 0);
 3495         } else {
 3496                 buf_tag = txq->ift_buf_tag;
 3497                 max_segs = scctx->isc_tx_nsegments;
 3498                 map = txq->ift_sds.ifsd_map[pidx];
 3499         }
 3500         if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) &&
 3501             __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) {
 3502                 err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size);
 3503                 if (err) {
 3504                         DBG_COUNTER_INC(encap_txd_encap_fail);
 3505                         return err;
 3506                 }
 3507         }
 3508         m_head = *m_headp;
 3509 
 3510         pkt_info_zero(&pi);
 3511         pi.ipi_mflags = (m_head->m_flags & (M_VLANTAG|M_BCAST|M_MCAST));
 3512         pi.ipi_pidx = pidx;
 3513         pi.ipi_qsidx = txq->ift_id;
 3514         pi.ipi_len = m_head->m_pkthdr.len;
 3515         pi.ipi_csum_flags = m_head->m_pkthdr.csum_flags;
 3516         pi.ipi_vtag = M_HAS_VLANTAG(m_head) ? m_head->m_pkthdr.ether_vtag : 0;
 3517 
 3518         /* deliberate bitwise OR to make one condition */
 3519         if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) {
 3520                 if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) {
 3521                         DBG_COUNTER_INC(encap_txd_encap_fail);
 3522                         return (err);
 3523                 }
 3524                 m_head = *m_headp;
 3525         }
 3526 
 3527 retry:
 3528         err = bus_dmamap_load_mbuf_sg(buf_tag, map, m_head, segs, &nsegs,
 3529             BUS_DMA_NOWAIT);
 3530 defrag:
 3531         if (__predict_false(err)) {
 3532                 switch (err) {
 3533                 case EFBIG:
 3534                         /* try collapse once and defrag once */
 3535                         if (remap == 0) {
 3536                                 m_head = m_collapse(*m_headp, M_NOWAIT, max_segs);
 3537                                 /* try defrag if collapsing fails */
 3538                                 if (m_head == NULL)
 3539                                         remap++;
 3540                         }
 3541                         if (remap == 1) {
 3542                                 txq->ift_mbuf_defrag++;
 3543                                 m_head = m_defrag(*m_headp, M_NOWAIT);
 3544                         }
 3545                         /*
 3546                          * remap should never be >1 unless bus_dmamap_load_mbuf_sg
 3547                          * failed to map an mbuf that was run through m_defrag
 3548                          */
 3549                         MPASS(remap <= 1);
 3550                         if (__predict_false(m_head == NULL || remap > 1))
 3551                                 goto defrag_failed;
 3552                         remap++;
 3553                         *m_headp = m_head;
 3554                         goto retry;
 3555                         break;
 3556                 case ENOMEM:
 3557                         txq->ift_no_tx_dma_setup++;
 3558                         break;
 3559                 default:
 3560                         txq->ift_no_tx_dma_setup++;
 3561                         m_freem(*m_headp);
 3562                         DBG_COUNTER_INC(tx_frees);
 3563                         *m_headp = NULL;
 3564                         break;
 3565                 }
 3566                 txq->ift_map_failed++;
 3567                 DBG_COUNTER_INC(encap_load_mbuf_fail);
 3568                 DBG_COUNTER_INC(encap_txd_encap_fail);
 3569                 return (err);
 3570         }
 3571         ifsd_m[pidx] = m_head;
 3572         /*
 3573          * XXX assumes a 1 to 1 relationship between segments and
 3574          *        descriptors - this does not hold true on all drivers, e.g.
 3575          *        cxgb
 3576          */
 3577         if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
 3578                 txq->ift_no_desc_avail++;
 3579                 bus_dmamap_unload(buf_tag, map);
 3580                 DBG_COUNTER_INC(encap_txq_avail_fail);
 3581                 DBG_COUNTER_INC(encap_txd_encap_fail);
 3582                 if ((txq->ift_task.gt_task.ta_flags & TASK_ENQUEUED) == 0)
 3583                         GROUPTASK_ENQUEUE(&txq->ift_task);
 3584                 return (ENOBUFS);
 3585         }
 3586         /*
 3587          * On Intel cards we can greatly reduce the number of TX interrupts
 3588          * we see by only setting report status on every Nth descriptor.
 3589          * However, this also means that the driver will need to keep track
 3590          * of the descriptors that RS was set on to check them for the DD bit.
 3591          */
 3592         txq->ift_rs_pending += nsegs + 1;
 3593         if (txq->ift_rs_pending > TXQ_MAX_RS_DEFERRED(txq) ||
 3594              iflib_no_tx_batch || (TXQ_AVAIL(txq) - nsegs) <= MAX_TX_DESC(ctx) + 2) {
 3595                 pi.ipi_flags |= IPI_TX_INTR;
 3596                 txq->ift_rs_pending = 0;
 3597         }
 3598 
 3599         pi.ipi_segs = segs;
 3600         pi.ipi_nsegs = nsegs;
 3601 
 3602         MPASS(pidx >= 0 && pidx < txq->ift_size);
 3603 #ifdef PKT_DEBUG
 3604         print_pkt(&pi);
 3605 #endif
 3606         if ((err = ctx->isc_txd_encap(ctx->ifc_softc, &pi)) == 0) {
 3607                 bus_dmamap_sync(buf_tag, map, BUS_DMASYNC_PREWRITE);
 3608                 DBG_COUNTER_INC(tx_encap);
 3609                 MPASS(pi.ipi_new_pidx < txq->ift_size);
 3610 
 3611                 ndesc = pi.ipi_new_pidx - pi.ipi_pidx;
 3612                 if (pi.ipi_new_pidx < pi.ipi_pidx) {
 3613                         ndesc += txq->ift_size;
 3614                         txq->ift_gen = 1;
 3615                 }
 3616                 /*
 3617                  * drivers can need as many as 
 3618                  * two sentinels
 3619                  */
 3620                 MPASS(ndesc <= pi.ipi_nsegs + 2);
 3621                 MPASS(pi.ipi_new_pidx != pidx);
 3622                 MPASS(ndesc > 0);
 3623                 txq->ift_in_use += ndesc;
 3624                 txq->ift_db_pending += ndesc;
 3625 
 3626                 /*
 3627                  * We update the last software descriptor again here because there may
 3628                  * be a sentinel and/or there may be more mbufs than segments
 3629                  */
 3630                 txq->ift_pidx = pi.ipi_new_pidx;
 3631                 txq->ift_npending += pi.ipi_ndescs;
 3632         } else {
 3633                 *m_headp = m_head = iflib_remove_mbuf(txq);
 3634                 if (err == EFBIG) {
 3635                         txq->ift_txd_encap_efbig++;
 3636                         if (remap < 2) {
 3637                                 remap = 1;
 3638                                 goto defrag;
 3639                         }
 3640                 }
 3641                 goto defrag_failed;
 3642         }
 3643         /*
 3644          * err can't possibly be non-zero here, so we don't need to test it
 3645          * to see if we need to DBG_COUNTER_INC(encap_txd_encap_fail).
 3646          */
 3647         return (err);
 3648 
 3649 defrag_failed:
 3650         txq->ift_mbuf_defrag_failed++;
 3651         txq->ift_map_failed++;
 3652         m_freem(*m_headp);
 3653         DBG_COUNTER_INC(tx_frees);
 3654         *m_headp = NULL;
 3655         DBG_COUNTER_INC(encap_txd_encap_fail);
 3656         return (ENOMEM);
 3657 }
 3658 
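      /*
       * Reclaim 'n' completed tx descriptors: unload their DMA maps and free
       * the associated mbufs, advancing the consumer index.
       */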
 3659 static void
 3660 iflib_tx_desc_free(iflib_txq_t txq, int n)
 3661 {
 3662         uint32_t qsize, cidx, mask, gen;
 3663         struct mbuf *m, **ifsd_m;
 3664         bool do_prefetch;
 3665 
 3666         cidx = txq->ift_cidx;
 3667         gen = txq->ift_gen;
 3668         qsize = txq->ift_size;
 3669         mask = qsize-1;
 3670         ifsd_m = txq->ift_sds.ifsd_m;
 3671         do_prefetch = (txq->ift_ctx->ifc_flags & IFC_PREFETCH);
 3672 
 3673         while (n-- > 0) {
 3674                 if (do_prefetch) {
 3675                         prefetch(ifsd_m[(cidx + 3) & mask]);
 3676                         prefetch(ifsd_m[(cidx + 4) & mask]);
 3677                 }
 3678                 if ((m = ifsd_m[cidx]) != NULL) {
 3679                         prefetch(&ifsd_m[(cidx + CACHE_PTR_INCREMENT) & mask]);
 3680                         if (m->m_pkthdr.csum_flags & CSUM_TSO) {
 3681                                 bus_dmamap_sync(txq->ift_tso_buf_tag,
 3682                                     txq->ift_sds.ifsd_tso_map[cidx],
 3683                                     BUS_DMASYNC_POSTWRITE);
 3684                                 bus_dmamap_unload(txq->ift_tso_buf_tag,
 3685                                     txq->ift_sds.ifsd_tso_map[cidx]);
 3686                         } else {
 3687                                 bus_dmamap_sync(txq->ift_buf_tag,
 3688                                     txq->ift_sds.ifsd_map[cidx],
 3689                                     BUS_DMASYNC_POSTWRITE);
 3690                                 bus_dmamap_unload(txq->ift_buf_tag,
 3691                                     txq->ift_sds.ifsd_map[cidx]);
 3692                         }
 3693                         /* XXX we don't support any drivers that batch packets yet */
 3694                         MPASS(m->m_nextpkt == NULL);
 3695                         m_freem(m);
 3696                         ifsd_m[cidx] = NULL;
 3697 #if MEMORY_LOGGING
 3698                         txq->ift_dequeued++;
 3699 #endif
 3700                         DBG_COUNTER_INC(tx_frees);
 3701                 }
 3702                 if (__predict_false(++cidx == qsize)) {
 3703                         cidx = 0;
 3704                         gen = 0;
 3705                 }
 3706         }
 3707         txq->ift_cidx = cidx;
 3708         txq->ift_gen = gen;
 3709 }
 3710 
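      /*
       * Update tx credits from the hardware and, if more than 'thresh'
       * descriptors are reclaimable, free them and adjust the ring
       * accounting.  Returns the number of descriptors reclaimed.
       */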
 3711 static __inline int
 3712 iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
 3713 {
 3714         int reclaim;
 3715         if_ctx_t ctx = txq->ift_ctx;
 3716 
 3717         KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
 3718         MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
 3719 
 3720         /*
 3721          * Need a rate-limiting check so that this isn't called every time
 3722          */
 3723         iflib_tx_credits_update(ctx, txq);
 3724         reclaim = DESC_RECLAIMABLE(txq);
 3725 
 3726         if (reclaim <= thresh /* + MAX_TX_DESC(txq->ift_ctx) */) {
 3727 #ifdef INVARIANTS
 3728                 if (iflib_verbose_debug) {
 3729                         printf("%s processed=%ju cleaned=%ju tx_nsegments=%d reclaim=%d thresh=%d\n", __FUNCTION__,
 3730                                txq->ift_processed, txq->ift_cleaned, txq->ift_ctx->ifc_softc_ctx.isc_tx_nsegments,
 3731                                reclaim, thresh);
 3732 
 3733                 }
 3734 #endif
 3735                 return (0);
 3736         }
 3737         iflib_tx_desc_free(txq, reclaim);
 3738         txq->ift_cleaned += reclaim;
 3739         txq->ift_in_use -= reclaim;
 3740 
 3741         return (reclaim);
 3742 }
 3743 
 3744 static struct mbuf **
 3745 _ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
 3746 {
 3747         int next, size;
 3748         struct mbuf **items;
 3749 
 3750         size = r->size;
 3751         next = (cidx + CACHE_PTR_INCREMENT) & (size-1);
 3752         items = __DEVOLATILE(struct mbuf **, &r->items[0]);
 3753 
 3754         prefetch(items[(cidx + offset) & (size-1)]);
 3755         if (remaining > 1) {
 3756                 prefetch2cachelines(&items[next]);
 3757                 prefetch2cachelines(items[(cidx + offset + 1) & (size-1)]);
 3758                 prefetch2cachelines(items[(cidx + offset + 2) & (size-1)]);
 3759                 prefetch2cachelines(items[(cidx + offset + 3) & (size-1)]);
 3760         }
 3761         return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)]));
 3762 }
 3763 
 3764 static void
 3765 iflib_txq_check_drain(iflib_txq_t txq, int budget)
 3766 {
 3767 
 3768         ifmp_ring_check_drainage(txq->ift_br, budget);
 3769 }
 3770 
 3771 static uint32_t
 3772 iflib_txq_can_drain(struct ifmp_ring *r)
 3773 {
 3774         iflib_txq_t txq = r->cookie;
 3775         if_ctx_t ctx = txq->ift_ctx;
 3776 
 3777         if (TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2)
 3778                 return (1);
 3779         bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 3780             BUS_DMASYNC_POSTREAD);
 3781         return (ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id,
 3782             false));
 3783 }
 3784 
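      /*
       * mp_ring drain callback: reclaim completed descriptors, then
       * encapsulate and send as many queued mbufs as descriptor space
       * allows, ringing the doorbell and updating the interface counters.
       */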
 3785 static uint32_t
 3786 iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 3787 {
 3788         iflib_txq_t txq = r->cookie;
 3789         if_ctx_t ctx = txq->ift_ctx;
 3790         if_t ifp = ctx->ifc_ifp;
 3791         struct mbuf *m, **mp;
 3792         int avail, bytes_sent, skipped, count, err, i;
 3793         int mcast_sent, pkt_sent, reclaimed;
 3794         bool do_prefetch, rang, ring;
 3795 
 3796         if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
 3797                             !LINK_ACTIVE(ctx))) {
 3798                 DBG_COUNTER_INC(txq_drain_notready);
 3799                 return (0);
 3800         }
 3801         reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
 3802         rang = iflib_txd_db_check(txq, reclaimed && txq->ift_db_pending);
 3803         avail = IDXDIFF(pidx, cidx, r->size);
 3804 
 3805         if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
 3806                 /*
 3807                  * The driver is unloading so we need to free all pending packets.
 3808                  */
 3809                 DBG_COUNTER_INC(txq_drain_flushing);
 3810                 for (i = 0; i < avail; i++) {
 3811                         if (__predict_true(r->items[(cidx + i) & (r->size-1)] != (void *)txq))
 3812                                 m_freem(r->items[(cidx + i) & (r->size-1)]);
 3813                         r->items[(cidx + i) & (r->size-1)] = NULL;
 3814                 }
 3815                 return (avail);
 3816         }
 3817 
 3818         if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
 3819                 txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 3820                 CALLOUT_LOCK(txq);
 3821                 callout_stop(&txq->ift_timer);
 3822                 CALLOUT_UNLOCK(txq);
 3823                 DBG_COUNTER_INC(txq_drain_oactive);
 3824                 return (0);
 3825         }
 3826 
 3827         /*
 3828          * If we've reclaimed any packets this queue cannot be hung.
 3829          */
 3830         if (reclaimed)
 3831                 txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 3832         skipped = mcast_sent = bytes_sent = pkt_sent = 0;
 3833         count = MIN(avail, TX_BATCH_SIZE);
 3834 #ifdef INVARIANTS
 3835         if (iflib_verbose_debug)
 3836                 printf("%s avail=%d ifc_flags=%x txq_avail=%d ", __FUNCTION__,
 3837                        avail, ctx->ifc_flags, TXQ_AVAIL(txq));
 3838 #endif
 3839         do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
 3840         err = 0;
 3841         for (i = 0; i < count && TXQ_AVAIL(txq) >= MAX_TX_DESC(ctx) + 2; i++) {
 3842                 int rem = do_prefetch ? count - i : 0;
 3843 
 3844                 mp = _ring_peek_one(r, cidx, i, rem);
 3845                 MPASS(mp != NULL && *mp != NULL);
 3846 
 3847                 /*
 3848                  * Completion interrupts will use the address of the txq
 3849                  * as a sentinel to enqueue _something_ in order to acquire
 3850                  * the lock on the mp_ring (there's no direct lock call).
 3851                  * We obviously have to check for these sentinel cases
 3852                  * and skip them.
 3853                  */
 3854                 if (__predict_false(*mp == (struct mbuf *)txq)) {
 3855                         skipped++;
 3856                         continue;
 3857                 }
 3858                 err = iflib_encap(txq, mp);
 3859                 if (__predict_false(err)) {
 3860                         /* no room - bail out */
 3861                         if (err == ENOBUFS)
 3862                                 break;
 3863                         skipped++;
 3864                         /* we can't send this packet - skip it */
 3865                         continue;
 3866                 }
 3867                 pkt_sent++;
 3868                 m = *mp;
 3869                 DBG_COUNTER_INC(tx_sent);
 3870                 bytes_sent += m->m_pkthdr.len;
 3871                 mcast_sent += !!(m->m_flags & M_MCAST);
 3872 
 3873                 if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING)))
 3874                         break;
 3875                 ETHER_BPF_MTAP(ifp, m);
 3876                 rang = iflib_txd_db_check(txq, false);
 3877         }
 3878 
 3879         /* deliberate use of bitwise or to avoid gratuitous short-circuit */
 3880         ring = rang ? false  : (iflib_min_tx_latency | err);
 3881         iflib_txd_db_check(txq, ring);
 3882         if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
 3883         if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
 3884         if (mcast_sent)
 3885                 if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent);
 3886 #ifdef INVARIANTS
 3887         if (iflib_verbose_debug)
 3888                 printf("consumed=%d\n", skipped + pkt_sent);
 3889 #endif
 3890         return (skipped + pkt_sent);
 3891 }
 3892 
 3893 static uint32_t
 3894 iflib_txq_drain_always(struct ifmp_ring *r)
 3895 {
 3896         return (1);
 3897 }
 3898 
 3899 static uint32_t
 3900 iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 3901 {
 3902         int i, avail;
 3903         struct mbuf **mp;
 3904         iflib_txq_t txq;
 3905 
 3906         txq = r->cookie;
 3907 
 3908         txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 3909         CALLOUT_LOCK(txq);
 3910         callout_stop(&txq->ift_timer);
 3911         CALLOUT_UNLOCK(txq);
 3912 
 3913         avail = IDXDIFF(pidx, cidx, r->size);
 3914         for (i = 0; i < avail; i++) {
 3915                 mp = _ring_peek_one(r, cidx, i, avail - i);
 3916                 if (__predict_false(*mp == (struct mbuf *)txq))
 3917                         continue;
 3918                 m_freem(*mp);
 3919                 DBG_COUNTER_INC(tx_frees);
 3920         }
 3921         MPASS(ifmp_ring_is_stalled(r) == 0);
 3922         return (avail);
 3923 }
 3924 
 3925 static void
 3926 iflib_ifmp_purge(iflib_txq_t txq)
 3927 {
 3928         struct ifmp_ring *r;
 3929 
 3930         r = txq->ift_br;
 3931         r->drain = iflib_txq_drain_free;
 3932         r->can_drain = iflib_txq_drain_always;
 3933 
 3934         ifmp_ring_check_drainage(r, r->size);
 3935 
 3936         r->drain = iflib_txq_drain;
 3937         r->can_drain = iflib_txq_can_drain;
 3938 }
 3939 
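      /*
       * Deferred tx task: give netmap a chance to consume the interrupt,
       * kick the mp_ring (or check its drainage), and re-enable the tx
       * interrupt for this queue.
       */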
 3940 static void
 3941 _task_fn_tx(void *context)
 3942 {
 3943         iflib_txq_t txq = context;
 3944         if_ctx_t ctx = txq->ift_ctx;
 3945         if_t ifp = ctx->ifc_ifp;
 3946         int abdicate = ctx->ifc_sysctl_tx_abdicate;
 3947 
 3948 #ifdef IFLIB_DIAGNOSTICS
 3949         txq->ift_cpu_exec_count[curcpu]++;
 3950 #endif
 3951         if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
 3952                 return;
 3953 #ifdef DEV_NETMAP
 3954         if ((if_getcapenable(ifp) & IFCAP_NETMAP) &&
 3955             netmap_tx_irq(ifp, txq->ift_id))
 3956                 goto skip_ifmp;
 3957 #endif
 3958 #ifdef ALTQ
 3959         if (ALTQ_IS_ENABLED(&ifp->if_snd))
 3960                 iflib_altq_if_start(ifp);
 3961 #endif
 3962         if (txq->ift_db_pending)
 3963                 ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate);
 3964         else if (!abdicate)
 3965                 ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 3966         /*
 3967          * When abdicating, we always need to check drainage, not just when we don't enqueue
 3968          */
 3969         if (abdicate)
 3970                 ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 3971 #ifdef DEV_NETMAP
 3972 skip_ifmp:
 3973 #endif
 3974         if (ctx->ifc_flags & IFC_LEGACY)
 3975                 IFDI_INTR_ENABLE(ctx);
 3976         else
 3977                 IFDI_TX_QUEUE_INTR_ENABLE(ctx, txq->ift_id);
 3978 }
 3979 
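      /*
       * Deferred rx task: let netmap claim the interrupt if active,
       * otherwise run iflib_rxeof with the configured budget; re-enable the
       * rx interrupt or reschedule if more work is pending.
       */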
 3980 static void
 3981 _task_fn_rx(void *context)
 3982 {
 3983         iflib_rxq_t rxq = context;
 3984         if_ctx_t ctx = rxq->ifr_ctx;
 3985         uint8_t more;
 3986         uint16_t budget;
 3987 #ifdef DEV_NETMAP
 3988         u_int work = 0;
 3989         int nmirq;
 3990 #endif
 3991 
 3992 #ifdef IFLIB_DIAGNOSTICS
 3993         rxq->ifr_cpu_exec_count[curcpu]++;
 3994 #endif
 3995         DBG_COUNTER_INC(task_fn_rxs);
 3996         if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 3997                 return;
 3998 #ifdef DEV_NETMAP
 3999         nmirq = netmap_rx_irq(ctx->ifc_ifp, rxq->ifr_id, &work);
 4000         if (nmirq != NM_IRQ_PASS) {
 4001                 more = (nmirq == NM_IRQ_RESCHED) ? IFLIB_RXEOF_MORE : 0;
 4002                 goto skip_rxeof;
 4003         }
 4004 #endif
 4005         budget = ctx->ifc_sysctl_rx_budget;
 4006         if (budget == 0)
 4007                 budget = 16;    /* XXX */
 4008         more = iflib_rxeof(rxq, budget);
 4009 #ifdef DEV_NETMAP
 4010 skip_rxeof:
 4011 #endif
 4012         if ((more & IFLIB_RXEOF_MORE) == 0) {
 4013                 if (ctx->ifc_flags & IFC_LEGACY)
 4014                         IFDI_INTR_ENABLE(ctx);
 4015                 else
 4016                         IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id);
 4017                 DBG_COUNTER_INC(rx_intr_enables);
 4018         }
 4019         if (__predict_false(!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING)))
 4020                 return;
 4021 
 4022         if (more & IFLIB_RXEOF_MORE)
 4023                 GROUPTASK_ENQUEUE(&rxq->ifr_task);
 4024         else if (more & IFLIB_RXEOF_EMPTY)
 4025                 callout_reset_curcpu(&rxq->ifr_watchdog, 1, &_task_fn_rx_watchdog, rxq);
 4026 }
 4027 
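      /*
       * Deferred admin task: handle watchdog resets and admin/link status
       * updates, restart the tx timers, and reinitialize the interface if a
       * reset was requested.
       */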
 4028 static void
 4029 _task_fn_admin(void *context)
 4030 {
 4031         if_ctx_t ctx = context;
 4032         if_softc_ctx_t sctx = &ctx->ifc_softc_ctx;
 4033         iflib_txq_t txq;
 4034         int i;
 4035         bool oactive, running, do_reset, do_watchdog, in_detach;
 4036 
 4037         STATE_LOCK(ctx);
 4038         running = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING);
 4039         oactive = (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE);
 4040         do_reset = (ctx->ifc_flags & IFC_DO_RESET);
 4041         do_watchdog = (ctx->ifc_flags & IFC_DO_WATCHDOG);
 4042         in_detach = (ctx->ifc_flags & IFC_IN_DETACH);
 4043         ctx->ifc_flags &= ~(IFC_DO_RESET|IFC_DO_WATCHDOG);
 4044         STATE_UNLOCK(ctx);
 4045 
 4046         if ((!running && !oactive) && !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
 4047                 return;
 4048         if (in_detach)
 4049                 return;
 4050 
 4051         CTX_LOCK(ctx);
 4052         for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
 4053                 CALLOUT_LOCK(txq);
 4054                 callout_stop(&txq->ift_timer);
 4055                 CALLOUT_UNLOCK(txq);
 4056         }
 4057         if (do_watchdog) {
 4058                 ctx->ifc_watchdog_events++;
 4059                 IFDI_WATCHDOG_RESET(ctx);
 4060         }
 4061         IFDI_UPDATE_ADMIN_STATUS(ctx);
 4062         for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) {
 4063                 callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq,
 4064                     txq->ift_timer.c_cpu);
 4065         }
 4066         IFDI_LINK_INTR_ENABLE(ctx);
 4067         if (do_reset)
 4068                 iflib_if_init_locked(ctx);
 4069         CTX_UNLOCK(ctx);
 4070 
 4071         if (LINK_ACTIVE(ctx) == 0)
 4072                 return;
 4073         for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++)
 4074                 iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
 4075 }
 4076 
 4077 
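      /*
       * Deferred SR-IOV task: forward VF function-level reset (FLR) events
       * to the driver via IFDI_VFLR_HANDLE().
       */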
 4078 static void
 4079 _task_fn_iov(void *context)
 4080 {
 4081         if_ctx_t ctx = context;
 4082 
 4083         if (!(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) &&
 4084             !(ctx->ifc_sctx->isc_flags & IFLIB_ADMIN_ALWAYS_RUN))
 4085                 return;
 4086 
 4087         CTX_LOCK(ctx);
 4088         IFDI_VFLR_HANDLE(ctx);
 4089         CTX_UNLOCK(ctx);
 4090 }
 4091 
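      /*
       * Sysctl handler for driver interrupt-moderation values: record the
       * request in the per-sysctl info structure and let the driver do the
       * real work via IFDI_SYSCTL_INT_DELAY().
       */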
 4092 static int
 4093 iflib_sysctl_int_delay(SYSCTL_HANDLER_ARGS)
 4094 {
 4095         int err;
 4096         if_int_delay_info_t info;
 4097         if_ctx_t ctx;
 4098 
 4099         info = (if_int_delay_info_t)arg1;
 4100         ctx = info->iidi_ctx;
 4101         info->iidi_req = req;
 4102         info->iidi_oidp = oidp;
 4103         CTX_LOCK(ctx);
 4104         err = IFDI_SYSCTL_INT_DELAY(ctx, info);
 4105         CTX_UNLOCK(ctx);
 4106         return (err);
 4107 }
 4108 
 4109 /*********************************************************************
 4110  *
 4111  *  IFNET FUNCTIONS
 4112  *
 4113  **********************************************************************/
 4114 
 4115 static void
 4116 iflib_if_init_locked(if_ctx_t ctx)
 4117 {
 4118         iflib_stop(ctx);
 4119         iflib_init_locked(ctx);
 4120 }
 4121 
 4122 
 4123 static void
 4124 iflib_if_init(void *arg)
 4125 {
 4126         if_ctx_t ctx = arg;
 4127 
 4128         CTX_LOCK(ctx);
 4129         iflib_if_init_locked(ctx);
 4130         CTX_UNLOCK(ctx);
 4131 }
 4132 
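      /*
       * if_transmit method shared by all iflib drivers: select a TX queue
       * (driver-supplied selection method, flow hash, or queue 0), enqueue
       * the mbuf on that queue's mp_ring, and schedule the TX task as
       * needed.  On enqueue failure the mbuf is freed and the error is
       * returned to the stack.
       */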
 4133 static int
 4134 iflib_if_transmit(if_t ifp, struct mbuf *m)
 4135 {
 4136         if_ctx_t        ctx = if_getsoftc(ifp);
 4137 
 4138         iflib_txq_t txq;
 4139         int err, qidx;
 4140         int abdicate = ctx->ifc_sysctl_tx_abdicate;
 4141 
 4142         if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
 4143                 DBG_COUNTER_INC(tx_frees);
 4144                 m_freem(m);
 4145                 return (ENETDOWN);
 4146         }
 4147 
 4148         MPASS(m->m_nextpkt == NULL);
 4149         /* ALTQ-enabled interfaces always use queue 0. */
 4150         qidx = 0;
 4151         /* Use driver-supplied queue selection method if it exists */
 4152         if (ctx->isc_txq_select)
 4153                 qidx = ctx->isc_txq_select(ctx->ifc_softc, m);
 4154         /* If not, use iflib's standard method */
 4155         else if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !ALTQ_IS_ENABLED(&ifp->if_snd))
 4156                 qidx = QIDX(ctx, m);
 4157 
 4158         /* Set TX queue */
 4159         txq = &ctx->ifc_txqs[qidx];
 4160 
 4161 #ifdef DRIVER_BACKPRESSURE
 4162         if (txq->ift_closed) {
 4163                 while (m != NULL) {
 4164                         next = m->m_nextpkt;
 4165                         m->m_nextpkt = NULL;
 4166                         m_freem(m);
 4167                         DBG_COUNTER_INC(tx_frees);
 4168                         m = next;
 4169                 }
 4170                 return (ENOBUFS);
 4171         }
 4172 #endif
 4173 #ifdef notyet
 4174         qidx = count = 0;
 4175         mp = marr;
 4176         next = m;
 4177         do {
 4178                 count++;
 4179                 next = next->m_nextpkt;
 4180         } while (next != NULL);
 4181 
 4182         if (count > nitems(marr))
 4183                 if ((mp = malloc(count*sizeof(struct mbuf *), M_IFLIB, M_NOWAIT)) == NULL) {
 4184                         /* XXX check nextpkt */
 4185                         m_freem(m);
 4186                         /* XXX simplify for now */
 4187                         DBG_COUNTER_INC(tx_frees);
 4188                         return (ENOBUFS);
 4189                 }
 4190         for (next = m, i = 0; next != NULL; i++) {
 4191                 mp[i] = next;
 4192                 next = next->m_nextpkt;
 4193                 mp[i]->m_nextpkt = NULL;
 4194         }
 4195 #endif
 4196         DBG_COUNTER_INC(tx_seen);
 4197         err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate);
 4198 
 4199         if (abdicate)
 4200                 GROUPTASK_ENQUEUE(&txq->ift_task);
 4201         if (err) {
 4202                 if (!abdicate)
 4203                         GROUPTASK_ENQUEUE(&txq->ift_task);
 4204                 /* support forthcoming later */
 4205 #ifdef DRIVER_BACKPRESSURE
 4206                 txq->ift_closed = TRUE;
 4207 #endif
 4208                 ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
 4209                 m_freem(m);
 4210                 DBG_COUNTER_INC(tx_frees);
 4211         }
 4212 
 4213         return (err);
 4214 }
 4215 
 4216 #ifdef ALTQ
 4217 /*
 4218  * The overall approach to integrating iflib with ALTQ is to continue to use
 4219  * the iflib mp_ring machinery between the ALTQ queue(s) and the hardware
 4220  * ring.  Technically, when using ALTQ, queueing to an intermediate mp_ring
 4221  * is redundant/unnecessary, but doing so minimizes the amount of
 4222  * ALTQ-specific code required in iflib.  It is assumed that the overhead of
 4223  * redundantly queueing to an intermediate mp_ring is swamped by the
 4224  * performance limitations inherent in using ALTQ.
 4225  *
 4226  * When ALTQ support is compiled in, all iflib drivers will use a transmit
 4227  * routine, iflib_altq_if_transmit(), that checks if ALTQ is enabled for the
 4228  * given interface.  If ALTQ is enabled for an interface, then all
 4229  * transmitted packets for that interface will be submitted to the ALTQ
 4230  * subsystem via IFQ_ENQUEUE().  We don't use the legacy if_transmit()
 4231  * implementation because it uses IFQ_HANDOFF(), which will duplicatively
 4232  * update stats that the iflib machinery handles, and which is sensitive to
 4233  * the disused IFF_DRV_OACTIVE flag.  Additionally, iflib_altq_if_start()
 4234  * will be installed as the start routine for use by ALTQ facilities that
 4235  * need to trigger queue drains on a scheduled basis.
 4236  *
 4237  */
 4238 static void
 4239 iflib_altq_if_start(if_t ifp)
 4240 {
 4241         struct ifaltq *ifq = &ifp->if_snd;
 4242         struct mbuf *m;
 4243         
 4244         IFQ_LOCK(ifq);
 4245         IFQ_DEQUEUE_NOLOCK(ifq, m);
 4246         while (m != NULL) {
 4247                 iflib_if_transmit(ifp, m);
 4248                 IFQ_DEQUEUE_NOLOCK(ifq, m);
 4249         }
 4250         IFQ_UNLOCK(ifq);
 4251 }
 4252 
 4253 static int
 4254 iflib_altq_if_transmit(if_t ifp, struct mbuf *m)
 4255 {
 4256         int err;
 4257 
 4258         if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
 4259                 IFQ_ENQUEUE(&ifp->if_snd, m, err);
 4260                 if (err == 0)
 4261                         iflib_altq_if_start(ifp);
 4262         } else
 4263                 err = iflib_if_transmit(ifp, m);
 4264 
 4265         return (err);
 4266 }
 4267 #endif /* ALTQ */
 4268 
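      /*
       * if_qflush method: flag the context as flushing, drain every TX
       * mp_ring until it is idle or stalled, and then let if_qflush() purge
       * whatever the stack (including ALTQ) still holds.
       */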
 4269 static void
 4270 iflib_if_qflush(if_t ifp)
 4271 {
 4272         if_ctx_t ctx = if_getsoftc(ifp);
 4273         iflib_txq_t txq = ctx->ifc_txqs;
 4274         int i;
 4275 
 4276         STATE_LOCK(ctx);
 4277         ctx->ifc_flags |= IFC_QFLUSH;
 4278         STATE_UNLOCK(ctx);
 4279         for (i = 0; i < NTXQSETS(ctx); i++, txq++)
 4280                 while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
 4281                         iflib_txq_check_drain(txq, 0);
 4282         STATE_LOCK(ctx);
 4283         ctx->ifc_flags &= ~IFC_QFLUSH;
 4284         STATE_UNLOCK(ctx);
 4285 
 4286         /*
 4287          * When ALTQ is enabled, this will also take care of purging the
 4288          * ALTQ queue(s).
 4289          */
 4290         if_qflush(ifp);
 4291 }
 4292 
 4293 
 4294 #define IFCAP_FLAGS (IFCAP_HWCSUM_IPV6 | IFCAP_HWCSUM | IFCAP_LRO | \
 4295                      IFCAP_TSO | IFCAP_VLAN_HWTAGGING | IFCAP_HWSTATS | \
 4296                      IFCAP_VLAN_MTU | IFCAP_VLAN_HWFILTER | \
 4297                      IFCAP_VLAN_HWTSO | IFCAP_VLAN_HWCSUM)
 4298 
 4299 static int
 4300 iflib_if_ioctl(if_t ifp, u_long command, caddr_t data)
 4301 {
 4302         if_ctx_t ctx = if_getsoftc(ifp);
 4303         struct ifreq    *ifr = (struct ifreq *)data;
 4304 #if defined(INET) || defined(INET6)
 4305         struct ifaddr   *ifa = (struct ifaddr *)data;
 4306 #endif
 4307         bool            avoid_reset = false;
 4308         int             err = 0, reinit = 0, bits;
 4309 
 4310         switch (command) {
 4311         case SIOCSIFADDR:
 4312 #ifdef INET
 4313                 if (ifa->ifa_addr->sa_family == AF_INET)
 4314                         avoid_reset = true;
 4315 #endif
 4316 #ifdef INET6
 4317                 if (ifa->ifa_addr->sa_family == AF_INET6)
 4318                         avoid_reset = true;
 4319 #endif
 4320                 /*
 4321                 ** Calling init results in link renegotiation,
 4322                 ** so we avoid doing it when possible.
 4323                 */
 4324                 if (avoid_reset) {
 4325                         if_setflagbits(ifp, IFF_UP, 0);
 4326                         if (!(if_getdrvflags(ifp) & IFF_DRV_RUNNING))
 4327                                 reinit = 1;
 4328 #ifdef INET
 4329                         if (!(if_getflags(ifp) & IFF_NOARP))
 4330                                 arp_ifinit(ifp, ifa);
 4331 #endif
 4332                 } else
 4333                         err = ether_ioctl(ifp, command, data);
 4334                 break;
 4335         case SIOCSIFMTU:
 4336                 CTX_LOCK(ctx);
 4337                 if (ifr->ifr_mtu == if_getmtu(ifp)) {
 4338                         CTX_UNLOCK(ctx);
 4339                         break;
 4340                 }
 4341                 bits = if_getdrvflags(ifp);
 4342                 /* stop the driver and free any clusters before proceeding */
 4343                 iflib_stop(ctx);
 4344 
 4345                 if ((err = IFDI_MTU_SET(ctx, ifr->ifr_mtu)) == 0) {
 4346                         STATE_LOCK(ctx);
 4347                         if (ifr->ifr_mtu > ctx->ifc_max_fl_buf_size)
 4348                                 ctx->ifc_flags |= IFC_MULTISEG;
 4349                         else
 4350                                 ctx->ifc_flags &= ~IFC_MULTISEG;
 4351                         STATE_UNLOCK(ctx);
 4352                         err = if_setmtu(ifp, ifr->ifr_mtu);
 4353                 }
 4354                 iflib_init_locked(ctx);
 4355                 STATE_LOCK(ctx);
 4356                 if_setdrvflags(ifp, bits);
 4357                 STATE_UNLOCK(ctx);
 4358                 CTX_UNLOCK(ctx);
 4359                 break;
 4360         case SIOCSIFFLAGS:
 4361                 CTX_LOCK(ctx);
 4362                 if (if_getflags(ifp) & IFF_UP) {
 4363                         if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 4364                                 if ((if_getflags(ifp) ^ ctx->ifc_if_flags) &
 4365                                     (IFF_PROMISC | IFF_ALLMULTI)) {
 4366                                         err = IFDI_PROMISC_SET(ctx, if_getflags(ifp));
 4367                                 }
 4368                         } else
 4369                                 reinit = 1;
 4370                 } else if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 4371                         iflib_stop(ctx);
 4372                 }
 4373                 ctx->ifc_if_flags = if_getflags(ifp);
 4374                 CTX_UNLOCK(ctx);
 4375                 break;
 4376         case SIOCADDMULTI:
 4377         case SIOCDELMULTI:
 4378                 if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) {
 4379                         CTX_LOCK(ctx);
 4380                         IFDI_INTR_DISABLE(ctx);
 4381                         IFDI_MULTI_SET(ctx);
 4382                         IFDI_INTR_ENABLE(ctx);
 4383                         CTX_UNLOCK(ctx);
 4384                 }
 4385                 break;
 4386         case SIOCSIFMEDIA:
 4387                 CTX_LOCK(ctx);
 4388                 IFDI_MEDIA_SET(ctx);
 4389                 CTX_UNLOCK(ctx);
 4390                 /* FALLTHROUGH */
 4391         case SIOCGIFMEDIA:
 4392         case SIOCGIFXMEDIA:
 4393                 err = ifmedia_ioctl(ifp, ifr, &ctx->ifc_media, command);
 4394                 break;
 4395         case SIOCGI2C:
 4396         {
 4397                 struct ifi2creq i2c;
 4398 
 4399                 err = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c));
 4400                 if (err != 0)
 4401                         break;
 4402                 if (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2) {
 4403                         err = EINVAL;
 4404                         break;
 4405                 }
 4406                 if (i2c.len > sizeof(i2c.data)) {
 4407                         err = EINVAL;
 4408                         break;
 4409                 }
 4410 
 4411                 if ((err = IFDI_I2C_REQ(ctx, &i2c)) == 0)
 4412                         err = copyout(&i2c, ifr_data_get_ptr(ifr),
 4413                             sizeof(i2c));
 4414                 break;
 4415         }
 4416         case SIOCSIFCAP:
 4417         {
 4418                 int mask, setmask, oldmask;
 4419 
 4420                 oldmask = if_getcapenable(ifp);
 4421                 mask = ifr->ifr_reqcap ^ oldmask;
 4422                 mask &= ctx->ifc_softc_ctx.isc_capabilities;
 4423                 setmask = 0;
 4424 #ifdef TCP_OFFLOAD
 4425                 setmask |= mask & (IFCAP_TOE4|IFCAP_TOE6);
 4426 #endif
 4427                 setmask |= (mask & IFCAP_FLAGS);
 4428                 setmask |= (mask & IFCAP_WOL);
 4429 
 4430                 /*
 4431                  * If any RX csum has changed, change all the ones that
 4432                  * are supported by the driver.
 4433                  */
 4434                 if (setmask & (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6)) {
 4435                         setmask |= ctx->ifc_softc_ctx.isc_capabilities &
 4436                             (IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6);
 4437                 }
 4438 
 4439                 /*
 4440                  * We want to ensure that traffic has stopped before we change any of the flags.
 4441                  */
 4442                 if (setmask) {
 4443                         CTX_LOCK(ctx);
 4444                         bits = if_getdrvflags(ifp);
 4445                         if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
 4446                                 iflib_stop(ctx);
 4447                         STATE_LOCK(ctx);
 4448                         if_togglecapenable(ifp, setmask);
 4449                         ctx->ifc_softc_ctx.isc_capenable ^= setmask;
 4450                         STATE_UNLOCK(ctx);
 4451                         if (bits & IFF_DRV_RUNNING && setmask & ~IFCAP_WOL)
 4452                                 iflib_init_locked(ctx);
 4453                         STATE_LOCK(ctx);
 4454                         if_setdrvflags(ifp, bits);
 4455                         STATE_UNLOCK(ctx);
 4456                         CTX_UNLOCK(ctx);
 4457                 }
 4458                 if_vlancap(ifp);
 4459                 break;
 4460         }
 4461         case SIOCGPRIVATE_0:
 4462         case SIOCSDRVSPEC:
 4463         case SIOCGDRVSPEC:
 4464                 CTX_LOCK(ctx);
 4465                 err = IFDI_PRIV_IOCTL(ctx, command, data);
 4466                 CTX_UNLOCK(ctx);
 4467                 break;
 4468         default:
 4469                 err = ether_ioctl(ifp, command, data);
 4470                 break;
 4471         }
 4472         if (reinit)
 4473                 iflib_if_init(ctx);
 4474         return (err);
 4475 }
 4476 
 4477 static uint64_t
 4478 iflib_if_get_counter(if_t ifp, ift_counter cnt)
 4479 {
 4480         if_ctx_t ctx = if_getsoftc(ifp);
 4481 
 4482         return (IFDI_GET_COUNTER(ctx, cnt));
 4483 }
 4484 
 4485 /*********************************************************************
 4486  *
 4487  *  OTHER FUNCTIONS EXPORTED TO THE STACK
 4488  *
 4489  **********************************************************************/
 4490 
 4491 static void
 4492 iflib_vlan_register(void *arg, if_t ifp, uint16_t vtag)
 4493 {
 4494         if_ctx_t ctx = if_getsoftc(ifp);
 4495 
 4496         if ((void *)ctx != arg)
 4497                 return;
 4498 
 4499         if ((vtag == 0) || (vtag > 4095))
 4500                 return;
 4501 
 4502         if (iflib_in_detach(ctx))
 4503                 return;
 4504 
 4505         CTX_LOCK(ctx);
 4506         /* Driver may need all untagged packets to be flushed */
 4507         if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
 4508                 iflib_stop(ctx);
 4509         IFDI_VLAN_REGISTER(ctx, vtag);
 4510         /* Re-init to load the changes, if required */
 4511         if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
 4512                 iflib_init_locked(ctx);
 4513         CTX_UNLOCK(ctx);
 4514 }
 4515 
 4516 static void
 4517 iflib_vlan_unregister(void *arg, if_t ifp, uint16_t vtag)
 4518 {
 4519         if_ctx_t ctx = if_getsoftc(ifp);
 4520 
 4521         if ((void *)ctx != arg)
 4522                 return;
 4523 
 4524         if ((vtag == 0) || (vtag > 4095))
 4525                 return;
 4526 
 4527         CTX_LOCK(ctx);
 4528         /* Driver may need all tagged packets to be flushed */
 4529         if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
 4530                 iflib_stop(ctx);
 4531         IFDI_VLAN_UNREGISTER(ctx, vtag);
 4532         /* Re-init to load the changes, if required */
 4533         if (IFDI_NEEDS_RESTART(ctx, IFLIB_RESTART_VLAN_CONFIG))
 4534                 iflib_init_locked(ctx);
 4535         CTX_UNLOCK(ctx);
 4536 }
 4537 
 4538 static void
 4539 iflib_led_func(void *arg, int onoff)
 4540 {
 4541         if_ctx_t ctx = arg;
 4542 
 4543         CTX_LOCK(ctx);
 4544         IFDI_LED_FUNC(ctx, onoff);
 4545         CTX_UNLOCK(ctx);
 4546 }
 4547 
 4548 /*********************************************************************
 4549  *
 4550  *  BUS FUNCTION DEFINITIONS
 4551  *
 4552  **********************************************************************/
 4553 
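      /*
       * Generic bus probe routine: fetch the driver's shared context via
       * DEVICE_REGISTER() and walk its PCI vendor info table looking for an
       * entry matching the device's vendor/device/subvendor/subdevice/rev
       * IDs.  A zero subvendor, subdevice, or revision in a table entry
       * acts as a wildcard.  As an illustrative sketch (the names below are
       * hypothetical, not from this file), a driver typically defines the
       * table with the PVID() macros and points isc_vendor_info at it in
       * its if_shared_ctx:
       *
       *	static pci_vendor_info_t foo_vendor_info[] = {
       *		PVID(0x8086, 0x1234, "Foo Ethernet Adapter"),
       *		PVID_END
       *	};
       */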
 4554 int
 4555 iflib_device_probe(device_t dev)
 4556 {
 4557         pci_vendor_info_t *ent;
 4558 
 4559         uint16_t        pci_vendor_id, pci_device_id;
 4560         uint16_t        pci_subvendor_id, pci_subdevice_id;
 4561         uint16_t        pci_rev_id;
 4562         if_shared_ctx_t sctx;
 4563 
 4564         if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
 4565                 return (ENOTSUP);
 4566 
 4567         pci_vendor_id = pci_get_vendor(dev);
 4568         pci_device_id = pci_get_device(dev);
 4569         pci_subvendor_id = pci_get_subvendor(dev);
 4570         pci_subdevice_id = pci_get_subdevice(dev);
 4571         pci_rev_id = pci_get_revid(dev);
 4572         if (sctx->isc_parse_devinfo != NULL)
 4573                 sctx->isc_parse_devinfo(&pci_device_id, &pci_subvendor_id, &pci_subdevice_id, &pci_rev_id);
 4574 
 4575         ent = sctx->isc_vendor_info;
 4576         while (ent->pvi_vendor_id != 0) {
 4577                 if (pci_vendor_id != ent->pvi_vendor_id) {
 4578                         ent++;
 4579                         continue;
 4580                 }
 4581                 if ((pci_device_id == ent->pvi_device_id) &&
 4582                     ((pci_subvendor_id == ent->pvi_subvendor_id) ||
 4583                      (ent->pvi_subvendor_id == 0)) &&
 4584                     ((pci_subdevice_id == ent->pvi_subdevice_id) ||
 4585                      (ent->pvi_subdevice_id == 0)) &&
 4586                     ((pci_rev_id == ent->pvi_rev_id) ||
 4587                      (ent->pvi_rev_id == 0))) {
 4588 
 4589                         device_set_desc_copy(dev, ent->pvi_name);
 4590                         /* This needs to be changed to zero if the bus probing code
 4591                          * ever stops re-probing on best match, because the sctx
 4592                          * may have its values overwritten by register calls
 4593                          * in subsequent probes.
 4594                          */
 4595                         return (BUS_PROBE_DEFAULT);
 4596                 }
 4597                 ent++;
 4598         }
 4599         return (ENXIO);
 4600 }
 4601 
 4602 int
 4603 iflib_device_probe_vendor(device_t dev)
 4604 {
 4605         int probe;
 4606 
 4607         probe = iflib_device_probe(dev);
 4608         if (probe == BUS_PROBE_DEFAULT)
 4609                 return (BUS_PROBE_VENDOR);
 4610         else
 4611                 return (probe);
 4612 }
 4613 
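      /*
       * Seed the softc context's queue and descriptor counts from the
       * driver defaults, apply any sysctl/tunable overrides, and clamp the
       * descriptor counts to the driver-supplied min/max, falling back to
       * the default when a requested value is not a power of two.
       */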
 4614 static void
 4615 iflib_reset_qvalues(if_ctx_t ctx)
 4616 {
 4617         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 4618         if_shared_ctx_t sctx = ctx->ifc_sctx;
 4619         device_t dev = ctx->ifc_dev;
 4620         int i;
 4621 
 4622         scctx->isc_txrx_budget_bytes_max = IFLIB_MAX_TX_BYTES;
 4623         scctx->isc_tx_qdepth = IFLIB_DEFAULT_TX_QDEPTH;
 4624         if (ctx->ifc_sysctl_ntxqs != 0)
 4625                 scctx->isc_ntxqsets = ctx->ifc_sysctl_ntxqs;
 4626         if (ctx->ifc_sysctl_nrxqs != 0)
 4627                 scctx->isc_nrxqsets = ctx->ifc_sysctl_nrxqs;
 4628 
 4629         for (i = 0; i < sctx->isc_ntxqs; i++) {
 4630                 if (ctx->ifc_sysctl_ntxds[i] != 0)
 4631                         scctx->isc_ntxd[i] = ctx->ifc_sysctl_ntxds[i];
 4632                 else
 4633                         scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
 4634         }
 4635 
 4636         for (i = 0; i < sctx->isc_nrxqs; i++) {
 4637                 if (ctx->ifc_sysctl_nrxds[i] != 0)
 4638                         scctx->isc_nrxd[i] = ctx->ifc_sysctl_nrxds[i];
 4639                 else
 4640                         scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
 4641         }
 4642 
 4643         for (i = 0; i < sctx->isc_nrxqs; i++) {
 4644                 if (scctx->isc_nrxd[i] < sctx->isc_nrxd_min[i]) {
 4645                         device_printf(dev, "nrxd%d: %d less than nrxd_min %d - resetting to min\n",
 4646                                       i, scctx->isc_nrxd[i], sctx->isc_nrxd_min[i]);
 4647                         scctx->isc_nrxd[i] = sctx->isc_nrxd_min[i];
 4648                 }
 4649                 if (scctx->isc_nrxd[i] > sctx->isc_nrxd_max[i]) {
 4650                         device_printf(dev, "nrxd%d: %d greater than nrxd_max %d - resetting to max\n",
 4651                                       i, scctx->isc_nrxd[i], sctx->isc_nrxd_max[i]);
 4652                         scctx->isc_nrxd[i] = sctx->isc_nrxd_max[i];
 4653                 }
 4654                 if (!powerof2(scctx->isc_nrxd[i])) {
 4655                         device_printf(dev, "nrxd%d: %d is not a power of 2 - using default value of %d\n",
 4656                                       i, scctx->isc_nrxd[i], sctx->isc_nrxd_default[i]);
 4657                         scctx->isc_nrxd[i] = sctx->isc_nrxd_default[i];
 4658                 }
 4659         }
 4660 
 4661         for (i = 0; i < sctx->isc_ntxqs; i++) {
 4662                 if (scctx->isc_ntxd[i] < sctx->isc_ntxd_min[i]) {
 4663                         device_printf(dev, "ntxd%d: %d less than ntxd_min %d - resetting to min\n",
 4664                                       i, scctx->isc_ntxd[i], sctx->isc_ntxd_min[i]);
 4665                         scctx->isc_ntxd[i] = sctx->isc_ntxd_min[i];
 4666                 }
 4667                 if (scctx->isc_ntxd[i] > sctx->isc_ntxd_max[i]) {
 4668                         device_printf(dev, "ntxd%d: %d greater than ntxd_max %d - resetting to max\n",
 4669                                       i, scctx->isc_ntxd[i], sctx->isc_ntxd_max[i]);
 4670                         scctx->isc_ntxd[i] = sctx->isc_ntxd_max[i];
 4671                 }
 4672                 if (!powerof2(scctx->isc_ntxd[i])) {
 4673                         device_printf(dev, "ntxd%d: %d is not a power of 2 - using default value of %d\n",
 4674                                       i, scctx->isc_ntxd[i], sctx->isc_ntxd_default[i]);
 4675                         scctx->isc_ntxd[i] = sctx->isc_ntxd_default[i];
 4676                 }
 4677         }
 4678 }
 4679 
 4680 /*
 4681  * Advance forward by n members of the cpuset ctx->ifc_cpus starting from
 4682  * cpuid and wrapping as necessary.
 4683  */
 4684 static unsigned int
 4685 cpuid_advance(if_ctx_t ctx, unsigned int cpuid, unsigned int n)
 4686 {
 4687         unsigned int first_valid;
 4688         unsigned int last_valid;
 4689 
 4690         /* cpuid should always be in the valid set */
 4691         MPASS(CPU_ISSET(cpuid, &ctx->ifc_cpus));
 4692 
 4693         /* valid set should never be empty */
 4694         MPASS(!CPU_EMPTY(&ctx->ifc_cpus));
 4695 
 4696         first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
 4697         last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
 4698         n = n % CPU_COUNT(&ctx->ifc_cpus);
 4699         while (n > 0) {
 4700                 do {
 4701                         cpuid++;
 4702                         if (cpuid > last_valid)
 4703                                 cpuid = first_valid;
 4704                 } while (!CPU_ISSET(cpuid, &ctx->ifc_cpus));
 4705                 n--;
 4706         }
 4707 
 4708         return (cpuid);
 4709 }
 4710 
 4711 #if defined(SMP) && defined(SCHED_ULE)
 4712 extern struct cpu_group *cpu_top;              /* CPU topology */
 4713 
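      /*
       * Return the index of the child of CPU group 'grp' whose mask
       * contains the given CPU, or -1 if 'grp' has no children or no child
       * contains it.
       */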
 4714 static int
 4715 find_child_with_core(int cpu, struct cpu_group *grp)
 4716 {
 4717         int i;
 4718 
 4719         if (grp->cg_children == 0)
 4720                 return -1;
 4721 
 4722         MPASS(grp->cg_child);
 4723         for (i = 0; i < grp->cg_children; i++) {
 4724                 if (CPU_ISSET(cpu, &grp->cg_child[i].cg_mask))
 4725                         return i;
 4726         }
 4727 
 4728         return -1;
 4729 }
 4730 
 4731 
 4732 /*
 4733  * Find an L2 neighbor of the given CPU or return -1 if none found.  This
 4734  * does not distinguish among multiple L2 neighbors if the given CPU has
 4735  * more than one (it will always return the same result in that case).
 4736  */
 4737 static int
 4738 find_l2_neighbor(int cpu)
 4739 {
 4740         struct cpu_group *grp;
 4741         int i;
 4742 
 4743         grp = cpu_top;
 4744         if (grp == NULL)
 4745                 return -1;
 4746 
 4747         /*
 4748          * Find the smallest CPU group that contains the given core.
 4749          */
 4750         i = 0;
 4751         while ((i = find_child_with_core(cpu, grp)) != -1) {
 4752                 /*
 4753                  * If the smallest group containing the given CPU has fewer
 4754                  * than two members, we conclude the given CPU has no
 4755                  * L2 neighbor.
 4756                  */
 4757                 if (grp->cg_child[i].cg_count <= 1)
 4758                         return (-1);
 4759                 grp = &grp->cg_child[i];
 4760         }
 4761 
 4762         /* Must share L2. */
 4763         if (grp->cg_level > CG_SHARE_L2 || grp->cg_level == CG_SHARE_NONE)
 4764                 return -1;
 4765 
 4766         /*
 4767          * Select the first member of the set that isn't the reference
 4768          * CPU, which at this point is guaranteed to exist.
 4769          */
 4770         for (i = 0; i < CPU_SETSIZE; i++) {
 4771                 if (CPU_ISSET(i, &grp->cg_mask) && i != cpu)
 4772                         return (i);
 4773         }
 4774 
 4775         /* Should never be reached */
 4776         return (-1);
 4777 }
 4778 
 4779 #else
 4780 static int
 4781 find_l2_neighbor(int cpu)
 4782 {
 4783 
 4784         return (-1);
 4785 }
 4786 #endif
 4787 
 4788 /*
 4789  * CPU mapping behaviors
 4790  * ---------------------
 4791  * 'separate txrx' refers to the separate_txrx sysctl
 4792  * 'use logical' refers to the use_logical_cores sysctl
 4793  * 'INTR CPUS' indicates whether bus_get_cpus(INTR_CPUS) succeeded
 4794  *
 4795  *  separate     use     INTR
 4796  *    txrx     logical   CPUS   result
 4797  * ---------- --------- ------ ------------------------------------------------
 4798  *     -          -       X     RX and TX queues mapped to consecutive physical
 4799  *                              cores with RX/TX pairs on same core and excess
 4800  *                              of either following
 4801  *     -          X       X     RX and TX queues mapped to consecutive cores
 4802  *                              of any type with RX/TX pairs on same core and
 4803  *                              excess of either following
 4804  *     X          -       X     RX and TX queues mapped to consecutive physical
 4805  *                              cores; all RX then all TX
 4806  *     X          X       X     RX queues mapped to consecutive physical cores
 4807  *                              first, then TX queues mapped to L2 neighbor of
 4808  *                              the corresponding RX queue if one exists,
 4809  *                              otherwise to consecutive physical cores
 4810  *     -         n/a      -     RX and TX queues mapped to consecutive cores of
 4811  *                              any type with RX/TX pairs on same core and excess
 4812  *                              of either following
 4813  *     X         n/a      -     RX and TX queues mapped to consecutive cores of
 4814  *                              any type; all RX then all TX
 4815  */
 4816 static unsigned int
 4817 get_cpuid_for_queue(if_ctx_t ctx, unsigned int base_cpuid, unsigned int qid,
 4818     bool is_tx)
 4819 {
 4820         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 4821         unsigned int core_index;
 4822 
 4823         if (ctx->ifc_sysctl_separate_txrx) {
 4824                 /*
 4825                  * When using separate CPUs for TX and RX, the assignment
 4826                  * will always be of a consecutive CPU out of the set of
 4827                  * context CPUs, except for the specific case where the
 4828                  * context CPUs are physical cores, the use of logical cores
 4829                  * has been enabled, the assignment is for TX, the TX qid
 4830                  * corresponds to an RX qid, and the CPU assigned to the
 4831                  * corresponding RX queue has an L2 neighbor.
 4832                  */
 4833                 if (ctx->ifc_sysctl_use_logical_cores &&
 4834                     ctx->ifc_cpus_are_physical_cores &&
 4835                     is_tx && qid < scctx->isc_nrxqsets) {
 4836                         int l2_neighbor;
 4837                         unsigned int rx_cpuid;
 4838 
 4839                         rx_cpuid = cpuid_advance(ctx, base_cpuid, qid);
 4840                         l2_neighbor = find_l2_neighbor(rx_cpuid);
 4841                         if (l2_neighbor != -1) {
 4842                                 return (l2_neighbor);
 4843                         }
 4844                         /*
 4845                          * ... else fall through to the normal
 4846                          * consecutive-after-RX assignment scheme.
 4847                          *
 4848                          * Note that we are assuming that all RX queue CPUs
 4849                          * have an L2 neighbor, or all do not.  If a mixed
 4850                          * scenario is possible, we will have to keep track
 4851                          * separately of how many queues prior to this one
 4852                          * were not able to be assigned to an L2 neighbor.
 4853                          */
 4854                 }
 4855                 if (is_tx)
 4856                         core_index = scctx->isc_nrxqsets + qid;
 4857                 else
 4858                         core_index = qid;
 4859         } else {
 4860                 core_index = qid;
 4861         }
 4862 
 4863         return (cpuid_advance(ctx, base_cpuid, core_index));
 4864 }
 4865 
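      /*
       * Determine the base CPU for this context's queue-to-CPU assignments:
       * honor a user-specified core offset (aligned into the set of valid
       * CPUs), otherwise continue from where the previous device sharing
       * the same CPU set left off, as tracked in the global cpu_offsets
       * list.
       */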
 4866 static uint16_t
 4867 get_ctx_core_offset(if_ctx_t ctx)
 4868 {
 4869         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 4870         struct cpu_offset *op;
 4871         cpuset_t assigned_cpus;
 4872         unsigned int cores_consumed;
 4873         unsigned int base_cpuid = ctx->ifc_sysctl_core_offset;
 4874         unsigned int first_valid;
 4875         unsigned int last_valid;
 4876         unsigned int i;
 4877 
 4878         first_valid = CPU_FFS(&ctx->ifc_cpus) - 1;
 4879         last_valid = CPU_FLS(&ctx->ifc_cpus) - 1;
 4880 
 4881         if (base_cpuid != CORE_OFFSET_UNSPECIFIED) {
 4882                 /*
 4883                  * Align the user-chosen base CPU ID to the next valid CPU
 4884                  * for this device.  If the chosen base CPU ID is smaller
 4885                  * than the first valid CPU or larger than the last valid
 4886                  * CPU, we assume the user does not know what the valid
 4887                  * range is for this device and is thinking in terms of a
 4888                  * zero-based reference frame, and so we shift the given
 4889                  * value into the valid range (and wrap accordingly) so the
 4890                  * intent is translated to the proper frame of reference.
 4891                  * If the base CPU ID is within the valid first/last, but
 4892                  * does not correspond to a valid CPU, it is advanced to the
 4893                  * next valid CPU (wrapping if necessary).
 4894                  */
 4895                 if (base_cpuid < first_valid || base_cpuid > last_valid) {
 4896                         /* shift from zero-based to first_valid-based */
 4897                         base_cpuid += first_valid;
 4898                         /* wrap to range [first_valid, last_valid] */
 4899                         base_cpuid = (base_cpuid - first_valid) %
 4900                             (last_valid - first_valid + 1);
 4901                 }
 4902                 if (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus)) {
 4903                         /*
 4904                          * base_cpuid is in [first_valid, last_valid], but
 4905                          * not a member of the valid set.  In this case,
 4906                          * there will always be a member of the valid set
 4907                          * with a CPU ID that is greater than base_cpuid,
 4908                          * and we simply advance to it.
 4909                          */
 4910                         while (!CPU_ISSET(base_cpuid, &ctx->ifc_cpus))
 4911                                 base_cpuid++;
 4912                 }
 4913                 return (base_cpuid);
 4914         }
 4915 
 4916         /*
 4917          * Determine how many cores will be consumed by performing the CPU
 4918          * assignments and counting how many of the assigned CPUs correspond
 4919          * to CPUs in the set of context CPUs.  This is done using the CPU
 4920          * ID first_valid as the base CPU ID, as the base CPU must be within
 4921          * the set of context CPUs.
 4922          *
 4923          * Note that not all assigned CPUs will be in the set of context
 4924          * CPUs when separate CPUs are being allocated to TX and RX queues,
 4925          * assignment to logical cores has been enabled, the set of context
 4926          * CPUs contains only physical CPUs, and TX queues are mapped to L2
 4927          * neighbors of the CPUs that RX queues have been mapped to.  In
 4928          * that case we only want to count how many CPUs in the set of
 4929          * context CPUs have been consumed, as that determines the next CPU
 4930          * in that set to start allocating at for the next device for which
 4931          * core_offset is not set.
 4932          */
 4933         CPU_ZERO(&assigned_cpus);
 4934         for (i = 0; i < scctx->isc_ntxqsets; i++)
 4935                 CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, true),
 4936                     &assigned_cpus);
 4937         for (i = 0; i < scctx->isc_nrxqsets; i++)
 4938                 CPU_SET(get_cpuid_for_queue(ctx, first_valid, i, false),
 4939                     &assigned_cpus);
 4940         CPU_AND(&assigned_cpus, &ctx->ifc_cpus);
 4941         cores_consumed = CPU_COUNT(&assigned_cpus);
 4942 
 4943         mtx_lock(&cpu_offset_mtx);
 4944         SLIST_FOREACH(op, &cpu_offsets, entries) {
 4945                 if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
 4946                         base_cpuid = op->next_cpuid;
 4947                         op->next_cpuid = cpuid_advance(ctx, op->next_cpuid,
 4948                             cores_consumed);
 4949                         MPASS(op->refcount < UINT_MAX);
 4950                         op->refcount++;
 4951                         break;
 4952                 }
 4953         }
 4954         if (base_cpuid == CORE_OFFSET_UNSPECIFIED) {
 4955                 base_cpuid = first_valid;
 4956                 op = malloc(sizeof(struct cpu_offset), M_IFLIB,
 4957                     M_NOWAIT | M_ZERO);
 4958                 if (op == NULL) {
 4959                         device_printf(ctx->ifc_dev,
 4960                             "allocation for cpu offset failed.\n");
 4961                 } else {
 4962                         op->next_cpuid = cpuid_advance(ctx, base_cpuid,
 4963                             cores_consumed);
 4964                         op->refcount = 1;
 4965                         CPU_COPY(&ctx->ifc_cpus, &op->set);
 4966                         SLIST_INSERT_HEAD(&cpu_offsets, op, entries);
 4967                 }
 4968         }
 4969         mtx_unlock(&cpu_offset_mtx);
 4970 
 4971         return (base_cpuid);
 4972 }
 4973 
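      /*
       * Drop this context's reference on the cpu_offset entry tracking the
       * next base CPU for its CPU set, freeing the entry when the last
       * reference goes away.
       */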
 4974 static void
 4975 unref_ctx_core_offset(if_ctx_t ctx)
 4976 {
 4977         struct cpu_offset *op, *top;
 4978 
 4979         mtx_lock(&cpu_offset_mtx);
 4980         SLIST_FOREACH_SAFE(op, &cpu_offsets, entries, top) {
 4981                 if (CPU_CMP(&ctx->ifc_cpus, &op->set) == 0) {
 4982                         MPASS(op->refcount > 0);
 4983                         op->refcount--;
 4984                         if (op->refcount == 0) {
 4985                                 SLIST_REMOVE(&cpu_offsets, op, cpu_offset, entries);
 4986                                 free(op, M_IFLIB);
 4987                         }
 4988                         break;
 4989                 }
 4990         }
 4991         mtx_unlock(&cpu_offset_mtx);
 4992 }
 4993 
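      /*
       * Core attach routine for iflib PCI drivers: allocate the iflib
       * context, register the ifnet, run the driver's IFDI_ATTACH_PRE, set
       * up capabilities, TSO limits, MSI/MSI-X vectors, queues, and
       * interrupts, attach the interface to the network stack, and finish
       * with IFDI_ATTACH_POST.  On failure, everything allocated so far is
       * torn down.
       */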
 4994 int
 4995 iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ctxp)
 4996 {
 4997         if_ctx_t ctx;
 4998         if_t ifp;
 4999         if_softc_ctx_t scctx;
 5000         kobjop_desc_t kobj_desc;
 5001         kobj_method_t *kobj_method;
 5002         int err, msix, rid;
 5003         int num_txd, num_rxd;
 5004 
 5005         ctx = malloc(sizeof(* ctx), M_IFLIB, M_WAITOK|M_ZERO);
 5006 
 5007         if (sc == NULL) {
 5008                 sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
 5009                 device_set_softc(dev, ctx);
 5010                 ctx->ifc_flags |= IFC_SC_ALLOCATED;
 5011         }
 5012 
 5013         ctx->ifc_sctx = sctx;
 5014         ctx->ifc_dev = dev;
 5015         ctx->ifc_softc = sc;
 5016 
 5017         if ((err = iflib_register(ctx)) != 0) {
 5018                 device_printf(dev, "iflib_register failed %d\n", err);
 5019                 goto fail_ctx_free;
 5020         }
 5021         iflib_add_device_sysctl_pre(ctx);
 5022 
 5023         scctx = &ctx->ifc_softc_ctx;
 5024         ifp = ctx->ifc_ifp;
 5025 
 5026         iflib_reset_qvalues(ctx);
 5027         IFNET_WLOCK();
 5028         CTX_LOCK(ctx);
 5029         if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
 5030                 device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
 5031                 goto fail_unlock;
 5032         }
 5033         _iflib_pre_assert(scctx);
 5034         ctx->ifc_txrx = *scctx->isc_txrx;
 5035 
 5036 #ifdef INVARIANTS
 5037         if (scctx->isc_capabilities & IFCAP_TXCSUM)
 5038                 MPASS(scctx->isc_tx_csum_flags);
 5039 #endif
 5040 
 5041         if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS);
 5042         if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS);
 5043 
 5044         if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
 5045                 scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
 5046         if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
 5047                 scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
 5048 
 5049         num_txd = iflib_num_tx_descs(ctx);
 5050         num_rxd = iflib_num_rx_descs(ctx);
 5051 
 5052         /* XXX change for per-queue sizes */
 5053         device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
 5054             num_txd, num_rxd);
 5055 
 5056         if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
 5057                 scctx->isc_tx_nsegments = max(1, num_txd /
 5058                     MAX_SINGLE_PACKET_FRACTION);
 5059         if (scctx->isc_tx_tso_segments_max > num_txd /
 5060             MAX_SINGLE_PACKET_FRACTION)
 5061                 scctx->isc_tx_tso_segments_max = max(1,
 5062                     num_txd / MAX_SINGLE_PACKET_FRACTION);
 5063 
 5064         /* TSO parameters - dig these out of the data sheet - simply correspond to tag setup */
 5065         if (if_getcapabilities(ifp) & IFCAP_TSO) {
 5066                 /*
 5067                  * The stack can't handle a TSO size larger than IP_MAXPACKET,
 5068                  * but some MACs do.
 5069                  */
 5070                 if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
 5071                     IP_MAXPACKET));
 5072                 /*
 5073                  * Take maximum number of m_pullup(9)'s in iflib_parse_header()
 5074                  * into account.  In the worst case, each of these calls will
 5075                  * add another mbuf and, thus, the requirement for another DMA
 5076                  * segment.  So for best performance, it doesn't make sense to
 5077                  * advertise a maximum of TSO segments that typically will
 5078                  * require defragmentation in iflib_encap().
 5079                  */
 5080                 if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
 5081                 if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
 5082         }
 5083         if (scctx->isc_rss_table_size == 0)
 5084                 scctx->isc_rss_table_size = 64;
 5085         scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
 5086 
 5087         GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
 5088         /* XXX format name */
 5089         taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
 5090             -1, "admin");
 5091 
 5092         /* Set up cpu set.  If it fails, use the set of all CPUs. */
 5093         if (bus_get_cpus(dev, INTR_CPUS, sizeof(ctx->ifc_cpus), &ctx->ifc_cpus) != 0) {
 5094                 device_printf(dev, "Unable to fetch CPU list\n");
 5095                 CPU_COPY(&all_cpus, &ctx->ifc_cpus);
 5096                 ctx->ifc_cpus_are_physical_cores = false;
 5097         } else
 5098                 ctx->ifc_cpus_are_physical_cores = true;
 5099         MPASS(CPU_COUNT(&ctx->ifc_cpus) > 0);
 5100 
 5101         /*
 5102         ** Now set up MSI or MSI-X; this should return the number of supported
 5103         ** vectors (1 for a legacy interrupt or MSI).
 5104         */
 5105         if (sctx->isc_flags & IFLIB_SKIP_MSIX) {
 5106                 msix = scctx->isc_vectors;
 5107         } else if (scctx->isc_msix_bar != 0)
 5108                /*
 5109                 * The simple fact that isc_msix_bar is not 0 does not mean
 5110                 * we have a good value there that is known to work.
 5111                 */
 5112                 msix = iflib_msix_init(ctx);
 5113         else {
 5114                 scctx->isc_vectors = 1;
 5115                 scctx->isc_ntxqsets = 1;
 5116                 scctx->isc_nrxqsets = 1;
 5117                 scctx->isc_intr = IFLIB_INTR_LEGACY;
 5118                 msix = 0;
 5119         }
 5120         /* Get memory for the station queues */
 5121         if ((err = iflib_queues_alloc(ctx))) {
 5122                 device_printf(dev, "Unable to allocate queue memory\n");
 5123                 goto fail_intr_free;
 5124         }
 5125 
 5126         if ((err = iflib_qset_structures_setup(ctx)))
 5127                 goto fail_queues;
 5128 
 5129         /*
 5130          * Now that we know how many queues there are, get the core offset.
 5131          */
 5132         ctx->ifc_sysctl_core_offset = get_ctx_core_offset(ctx);
 5133 
 5134         /*
 5135          * Group taskqueues aren't properly set up until SMP is started,
 5136          * so we disable interrupts until we can handle them post
 5137          * SI_SUB_SMP.
 5138          *
 5139          * XXX: disabling interrupts doesn't actually work, at least for
 5140          * the non-MSI case.  When they occur before SI_SUB_SMP completes,
 5141          * we do null handling and depend on this not causing too large an
 5142          * interrupt storm.
 5143          */
 5144         IFDI_INTR_DISABLE(ctx);
 5145 
 5146         if (msix > 1) {
 5147                 /*
 5148                  * When using MSI-X, ensure that ifdi_{r,t}x_queue_intr_enable
 5149                  * aren't the default NULL implementation.
 5150                  */
 5151                 kobj_desc = &ifdi_rx_queue_intr_enable_desc;
 5152                 kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
 5153                     kobj_desc);
 5154                 if (kobj_method == &kobj_desc->deflt) {
 5155                         device_printf(dev,
 5156                             "MSI-X requires ifdi_rx_queue_intr_enable method");
 5157                         err = EOPNOTSUPP;
 5158                         goto fail_queues;
 5159                 }
 5160                 kobj_desc = &ifdi_tx_queue_intr_enable_desc;
 5161                 kobj_method = kobj_lookup_method(((kobj_t)ctx)->ops->cls, NULL,
 5162                     kobj_desc);
 5163                 if (kobj_method == &kobj_desc->deflt) {
 5164                         device_printf(dev,
 5165                             "MSI-X requires ifdi_tx_queue_intr_enable method");
 5166                         err = EOPNOTSUPP;
 5167                         goto fail_queues;
 5168                 }
 5169 
 5170                 /*
 5171                  * Assign the MSI-X vectors.
 5172                  * Note that the default NULL ifdi_msix_intr_assign method will
 5173                  * fail here, too.
 5174                  */
 5175                 err = IFDI_MSIX_INTR_ASSIGN(ctx, msix);
 5176                 if (err != 0) {
 5177                         device_printf(dev, "IFDI_MSIX_INTR_ASSIGN failed %d\n",
 5178                             err);
 5179                         goto fail_queues;
 5180                 }
 5181         } else if (scctx->isc_intr != IFLIB_INTR_MSIX) {
 5182                 rid = 0;
 5183                 if (scctx->isc_intr == IFLIB_INTR_MSI) {
 5184                         MPASS(msix == 1);
 5185                         rid = 1;
 5186                 }
 5187                 if ((err = iflib_legacy_setup(ctx, ctx->isc_legacy_intr, ctx->ifc_softc, &rid, "irq0")) != 0) {
 5188                         device_printf(dev, "iflib_legacy_setup failed %d\n", err);
 5189                         goto fail_queues;
 5190                 }
 5191         } else {
 5192                 device_printf(dev,
 5193                     "Cannot use iflib with only 1 MSI-X interrupt!\n");
 5194                 err = ENODEV;
 5195                 goto fail_queues;
 5196         }
 5197 
 5198         ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac);
 5199 
 5200         if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 5201                 device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 5202                 goto fail_detach;
 5203         }
 5204 
 5205         /*
 5206          * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 5207          * This must appear after the call to ether_ifattach() because
 5208          * ether_ifattach() sets if_hdrlen to the default value.
 5209          */
 5210         if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 5211                 if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
 5212 
 5213         if ((err = iflib_netmap_attach(ctx))) {
 5214                 device_printf(ctx->ifc_dev, "netmap attach failed: %d\n", err);
 5215                 goto fail_detach;
 5216         }
 5217         *ctxp = ctx;
 5218 
 5219         NETDUMP_SET(ctx->ifc_ifp, iflib);
 5220 
 5221         if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 5222         iflib_add_device_sysctl_post(ctx);
 5223         ctx->ifc_flags |= IFC_INIT_DONE;
 5224         CTX_UNLOCK(ctx);
 5225         IFNET_WUNLOCK();
 5226 
 5227         return (0);
 5228 
 5229 fail_detach:
 5230         ether_ifdetach(ctx->ifc_ifp);
 5231 fail_queues:
 5232         iflib_tqg_detach(ctx);
 5233         iflib_tx_structures_free(ctx);
 5234         iflib_rx_structures_free(ctx);
 5235         IFDI_DETACH(ctx);
 5236         IFDI_QUEUES_FREE(ctx);
 5237 fail_intr_free:
 5238         iflib_free_intr_mem(ctx);
 5239 fail_unlock:
 5240         CTX_UNLOCK(ctx);
 5241         IFNET_WUNLOCK();
 5242         iflib_deregister(ctx);
 5243 fail_ctx_free:
 5244         device_set_softc(ctx->ifc_dev, NULL);
 5245         if (ctx->ifc_flags & IFC_SC_ALLOCATED)
 5246                 free(ctx->ifc_softc, M_IFLIB);
 5247         free(ctx, M_IFLIB);
 5248         return (err);
 5249 }
 5250 
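      /*
       * Attach path for pseudo (cloned/virtual) iflib devices: allocate the
       * context, run IFDI_ATTACH_PRE and IFDI_CLONEATTACH, set up media,
       * queues, and TSO limits, and attach the interface.  No MSI/MSI-X
       * setup is performed here.
       */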
 5251 int
 5252 iflib_pseudo_register(device_t dev, if_shared_ctx_t sctx, if_ctx_t *ctxp,
 5253                                           struct iflib_cloneattach_ctx *clctx)
 5254 {
 5255         int num_txd, num_rxd;
 5256         int err;
 5257         if_ctx_t ctx;
 5258         if_t ifp;
 5259         if_softc_ctx_t scctx;
 5260         int i;
 5261         void *sc;
 5262 
 5263         ctx = malloc(sizeof(*ctx), M_IFLIB, M_WAITOK|M_ZERO);
 5264         sc = malloc(sctx->isc_driver->size, M_IFLIB, M_WAITOK|M_ZERO);
 5265         ctx->ifc_flags |= IFC_SC_ALLOCATED;
 5266         if (sctx->isc_flags & (IFLIB_PSEUDO|IFLIB_VIRTUAL))
 5267                 ctx->ifc_flags |= IFC_PSEUDO;
 5268 
 5269         ctx->ifc_sctx = sctx;
 5270         ctx->ifc_softc = sc;
 5271         ctx->ifc_dev = dev;
 5272 
 5273         if ((err = iflib_register(ctx)) != 0) {
 5274                 device_printf(dev, "%s: iflib_register failed %d\n", __func__, err);
 5275                 goto fail_ctx_free;
 5276         }
 5277         iflib_add_device_sysctl_pre(ctx);
 5278 
 5279         scctx = &ctx->ifc_softc_ctx;
 5280         ifp = ctx->ifc_ifp;
 5281 
 5282         iflib_reset_qvalues(ctx);
 5283         CTX_LOCK(ctx);
 5284         if ((err = IFDI_ATTACH_PRE(ctx)) != 0) {
 5285                 device_printf(dev, "IFDI_ATTACH_PRE failed %d\n", err);
 5286                 goto fail_unlock;
 5287         }
 5288         if (sctx->isc_flags & IFLIB_GEN_MAC)
 5289                 iflib_gen_mac(ctx);
 5290         if ((err = IFDI_CLONEATTACH(ctx, clctx->cc_ifc, clctx->cc_name,
 5291                                                                 clctx->cc_params)) != 0) {
 5292                 device_printf(dev, "IFDI_CLONEATTACH failed %d\n", err);
 5293                 goto fail_unlock;
 5294         }
 5295 
 5296 #ifdef INVARIANTS
 5297         if (scctx->isc_capabilities & IFCAP_TXCSUM)
 5298                 MPASS(scctx->isc_tx_csum_flags);
 5299 #endif
 5300 
 5301         if_setcapabilities(ifp, scctx->isc_capabilities | IFCAP_HWSTATS | IFCAP_LINKSTATE);
 5302         if_setcapenable(ifp, scctx->isc_capenable | IFCAP_HWSTATS | IFCAP_LINKSTATE);
 5303 
 5304         ifp->if_flags |= IFF_NOGROUP;
 5305         if (sctx->isc_flags & IFLIB_PSEUDO) {
 5306                 ifmedia_add(&ctx->ifc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 5307                 ifmedia_set(&ctx->ifc_media, IFM_ETHER | IFM_AUTO);
 5308 
 5309                 if (sctx->isc_flags & IFLIB_PSEUDO_ETHER) {
 5310                         ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac);
 5311                 } else {
 5312                         if_attach(ctx->ifc_ifp);
 5313                         bpfattach(ctx->ifc_ifp, DLT_NULL, sizeof(u_int32_t));
 5314                 }
 5315 
 5316                 if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 5317                         device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 5318                         goto fail_detach;
 5319                 }
 5320                 *ctxp = ctx;
 5321 
 5322                 /*
 5323                  * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 5324                  * This must appear after the call to ether_ifattach() because
 5325                  * ether_ifattach() sets if_hdrlen to the default value.
 5326                  */
 5327                 if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 5328                         if_setifheaderlen(ifp,
 5329                             sizeof(struct ether_vlan_header));
 5330 
 5331                 if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 5332                 iflib_add_device_sysctl_post(ctx);
 5333                 ctx->ifc_flags |= IFC_INIT_DONE;
 5334                 CTX_UNLOCK(ctx);
 5335                 return (0);
 5336         }
 5337         ifmedia_add(&ctx->ifc_media, IFM_ETHER | IFM_1000_T | IFM_FDX, 0, NULL);
 5338         ifmedia_add(&ctx->ifc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
 5339         ifmedia_set(&ctx->ifc_media, IFM_ETHER | IFM_AUTO);
 5340 
 5341         _iflib_pre_assert(scctx);
 5342         ctx->ifc_txrx = *scctx->isc_txrx;
 5343 
 5344         if (scctx->isc_ntxqsets == 0 || (scctx->isc_ntxqsets_max && scctx->isc_ntxqsets_max < scctx->isc_ntxqsets))
 5345                 scctx->isc_ntxqsets = scctx->isc_ntxqsets_max;
 5346         if (scctx->isc_nrxqsets == 0 || (scctx->isc_nrxqsets_max && scctx->isc_nrxqsets_max < scctx->isc_nrxqsets))
 5347                 scctx->isc_nrxqsets = scctx->isc_nrxqsets_max;
 5348 
 5349         num_txd = iflib_num_tx_descs(ctx);
 5350         num_rxd = iflib_num_rx_descs(ctx);
 5351 
 5352         /* XXX change for per-queue sizes */
 5353         device_printf(dev, "Using %d TX descriptors and %d RX descriptors\n",
 5354             num_txd, num_rxd);
 5355 
 5356         if (scctx->isc_tx_nsegments > num_txd / MAX_SINGLE_PACKET_FRACTION)
 5357                 scctx->isc_tx_nsegments = max(1, num_txd /
 5358                     MAX_SINGLE_PACKET_FRACTION);
 5359         if (scctx->isc_tx_tso_segments_max > num_txd /
 5360             MAX_SINGLE_PACKET_FRACTION)
 5361                 scctx->isc_tx_tso_segments_max = max(1,
 5362                     num_txd / MAX_SINGLE_PACKET_FRACTION);
 5363 
 5364         /* TSO parameters: take these from the hardware data sheet; they simply correspond to the tag setup */
 5365         if (if_getcapabilities(ifp) & IFCAP_TSO) {
 5366                 /*
 5367                  * The stack can't handle a TSO size larger than IP_MAXPACKET,
 5368                  * but some MACs do.
 5369                  */
 5370                 if_sethwtsomax(ifp, min(scctx->isc_tx_tso_size_max,
 5371                     IP_MAXPACKET));
 5372                 /*
 5373                  * Take maximum number of m_pullup(9)'s in iflib_parse_header()
 5374                  * into account.  In the worst case, each of these calls will
 5375                  * add another mbuf and, thus, the requirement for another DMA
 5376                  * segment.  So for best performance, it doesn't make sense to
 5377                  * advertise a maximum of TSO segments that typically will
 5378                  * require defragmentation in iflib_encap().
 5379                  */
 5380                 if_sethwtsomaxsegcount(ifp, scctx->isc_tx_tso_segments_max - 3);
 5381                 if_sethwtsomaxsegsize(ifp, scctx->isc_tx_tso_segsize_max);
 5382         }
 5383         if (scctx->isc_rss_table_size == 0)
 5384                 scctx->isc_rss_table_size = 64;
 5385         scctx->isc_rss_table_mask = scctx->isc_rss_table_size-1;
 5386 
 5387         GROUPTASK_INIT(&ctx->ifc_admin_task, 0, _task_fn_admin, ctx);
 5388         /* XXX format name */
 5389         taskqgroup_attach(qgroup_if_config_tqg, &ctx->ifc_admin_task, ctx,
 5390             -1, "admin");
 5391 
 5392         /* XXX --- can support > 1 -- but keep it simple for now */
 5393         scctx->isc_intr = IFLIB_INTR_LEGACY;
 5394 
 5395         /* Get memory for the station queues */
 5396         if ((err = iflib_queues_alloc(ctx))) {
 5397                 device_printf(dev, "Unable to allocate queue memory\n");
 5398                 goto fail_iflib_detach;
 5399         }
 5400 
 5401         if ((err = iflib_qset_structures_setup(ctx))) {
 5402                 device_printf(dev, "qset structure setup failed %d\n", err);
 5403                 goto fail_queues;
 5404         }
 5405 
 5406         /*
 5407          * XXX What, if anything, do we want to do about interrupts?
 5408          */
 5409         ether_ifattach(ctx->ifc_ifp, ctx->ifc_mac);
 5410         if ((err = IFDI_ATTACH_POST(ctx)) != 0) {
 5411                 device_printf(dev, "IFDI_ATTACH_POST failed %d\n", err);
 5412                 goto fail_detach;
 5413         }
 5414 
 5415         /*
 5416          * Tell the upper layer(s) if IFCAP_VLAN_MTU is supported.
 5417          * This must appear after the call to ether_ifattach() because
 5418          * ether_ifattach() sets if_hdrlen to the default value.
 5419          */
 5420         if (if_getcapabilities(ifp) & IFCAP_VLAN_MTU)
 5421                 if_setifheaderlen(ifp, sizeof(struct ether_vlan_header));
 5422 
 5423         /* XXX handle more than one queue */
 5424         for (i = 0; i < scctx->isc_nrxqsets; i++)
 5425                 IFDI_RX_CLSET(ctx, 0, i, ctx->ifc_rxqs[i].ifr_fl[0].ifl_sds.ifsd_cl);
 5426 
 5427         *ctxp = ctx;
 5428 
 5429         if_setgetcounterfn(ctx->ifc_ifp, iflib_if_get_counter);
 5430         iflib_add_device_sysctl_post(ctx);
 5431         ctx->ifc_flags |= IFC_INIT_DONE;
 5432         CTX_UNLOCK(ctx);
 5433 
 5434         return (0);
 5435 fail_detach:
 5436         ether_ifdetach(ctx->ifc_ifp);
 5437 fail_queues:
 5438         iflib_tqg_detach(ctx);
 5439         iflib_tx_structures_free(ctx);
 5440         iflib_rx_structures_free(ctx);
 5441 fail_iflib_detach:
 5442         IFDI_DETACH(ctx);
 5443         IFDI_QUEUES_FREE(ctx);
 5444 fail_unlock:
 5445         CTX_UNLOCK(ctx);
 5446         iflib_deregister(ctx);
 5447 fail_ctx_free:
 5448         free(ctx->ifc_softc, M_IFLIB);
 5449         free(ctx, M_IFLIB);
 5450         return (err);
 5451 }
 5452 
 5453 int
 5454 iflib_pseudo_deregister(if_ctx_t ctx)
 5455 {
 5456         if_t ifp = ctx->ifc_ifp;
 5457         if_shared_ctx_t sctx = ctx->ifc_sctx;
 5458 
 5459         /* Unregister VLAN event handlers early */
 5460         iflib_unregister_vlan_handlers(ctx);
 5461 
 5462         if ((sctx->isc_flags & IFLIB_PSEUDO)  &&
 5463                 (sctx->isc_flags & IFLIB_PSEUDO_ETHER) == 0) {
 5464                 bpfdetach(ifp);
 5465                 if_detach(ifp);
 5466         } else {
 5467                 ether_ifdetach(ifp);
 5468         }
 5469 
 5470         iflib_tqg_detach(ctx);
 5471         iflib_tx_structures_free(ctx);
 5472         iflib_rx_structures_free(ctx);
 5473         IFDI_DETACH(ctx);
 5474         IFDI_QUEUES_FREE(ctx);
 5475 
 5476         iflib_deregister(ctx);
 5477 
 5478         if (ctx->ifc_flags & IFC_SC_ALLOCATED)
 5479                 free(ctx->ifc_softc, M_IFLIB);
 5480         free(ctx, M_IFLIB);
 5481         return (0);
 5482 }
 5483 
 5484 int
 5485 iflib_device_attach(device_t dev)
 5486 {
 5487         if_ctx_t ctx;
 5488         if_shared_ctx_t sctx;
 5489 
 5490         if ((sctx = DEVICE_REGISTER(dev)) == NULL || sctx->isc_magic != IFLIB_MAGIC)
 5491                 return (ENOTSUP);
 5492 
 5493         pci_enable_busmaster(dev);
 5494 
 5495         return (iflib_device_register(dev, NULL, sctx, &ctx));
 5496 }
 5497 
 5498 int
 5499 iflib_device_deregister(if_ctx_t ctx)
 5500 {
 5501         if_t ifp = ctx->ifc_ifp;
 5502         device_t dev = ctx->ifc_dev;
 5503 
 5504         /* Make sure VLANS are not using driver */
 5505         if (if_vlantrunkinuse(ifp)) {
 5506                 device_printf(dev, "Vlan in use, detach first\n");
 5507                 return (EBUSY);
 5508         }
 5509 #ifdef PCI_IOV
 5510         if (!CTX_IS_VF(ctx) && pci_iov_detach(dev) != 0) {
 5511                 device_printf(dev, "SR-IOV in use; detach first.\n");
 5512                 return (EBUSY);
 5513         }
 5514 #endif
 5515 
 5516         STATE_LOCK(ctx);
 5517         ctx->ifc_flags |= IFC_IN_DETACH;
 5518         STATE_UNLOCK(ctx);
 5519 
 5520         /* Unregister VLAN handlers before calling iflib_stop() */
 5521         iflib_unregister_vlan_handlers(ctx);
 5522 
 5523         iflib_netmap_detach(ifp);
 5524         ether_ifdetach(ifp);
 5525 
 5526         CTX_LOCK(ctx);
 5527         iflib_stop(ctx);
 5528         CTX_UNLOCK(ctx);
 5529 
 5530         if (ctx->ifc_led_dev != NULL)
 5531                 led_destroy(ctx->ifc_led_dev);
 5532 
 5533         iflib_tqg_detach(ctx);
 5534         iflib_tx_structures_free(ctx);
 5535         iflib_rx_structures_free(ctx);
 5536 
 5537         CTX_LOCK(ctx);
 5538         IFDI_DETACH(ctx);
 5539         IFDI_QUEUES_FREE(ctx);
 5540         CTX_UNLOCK(ctx);
 5541 
 5542         /* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
 5543         iflib_free_intr_mem(ctx);
 5544 
 5545         bus_generic_detach(dev);
 5546 
 5547         iflib_deregister(ctx);
 5548 
 5549         device_set_softc(ctx->ifc_dev, NULL);
 5550         if (ctx->ifc_flags & IFC_SC_ALLOCATED)
 5551                 free(ctx->ifc_softc, M_IFLIB);
 5552         unref_ctx_core_offset(ctx);
 5553         free(ctx, M_IFLIB);
 5554         return (0);
 5555 }
 5556 
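      /*
       * Drain the per-TX-queue callouts and detach the TX, RX, admin, and
       * VFLR grouptasks from their taskqgroups.
       */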
 5557 static void
 5558 iflib_tqg_detach(if_ctx_t ctx)
 5559 {
 5560         iflib_txq_t txq;
 5561         iflib_rxq_t rxq;
 5562         int i;
 5563         struct taskqgroup *tqg;
 5564 
 5565         /* XXX drain any dependent tasks */
 5566         tqg = qgroup_if_io_tqg;
 5567         for (txq = ctx->ifc_txqs, i = 0; i < NTXQSETS(ctx); i++, txq++) {
 5568                 callout_drain(&txq->ift_timer);
 5569 #ifdef DEV_NETMAP
 5570                 callout_drain(&txq->ift_netmap_timer);
 5571 #endif /* DEV_NETMAP */
 5572                 if (txq->ift_task.gt_uniq != NULL)
 5573                         taskqgroup_detach(tqg, &txq->ift_task);
 5574         }
 5575         for (i = 0, rxq = ctx->ifc_rxqs; i < NRXQSETS(ctx); i++, rxq++) {
 5576                 if (rxq->ifr_task.gt_uniq != NULL)
 5577                         taskqgroup_detach(tqg, &rxq->ifr_task);
 5578         }
 5579         tqg = qgroup_if_config_tqg;
 5580         if (ctx->ifc_admin_task.gt_uniq != NULL)
 5581                 taskqgroup_detach(tqg, &ctx->ifc_admin_task);
 5582         if (ctx->ifc_vflr_task.gt_uniq != NULL)
 5583                 taskqgroup_detach(tqg, &ctx->ifc_vflr_task);
 5584 }
 5585 
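      /*
       * Release interrupt resources: the legacy IRQ when MSI-X is not in
       * use, any allocated MSI/MSI-X messages, and the memory resource
       * backing the MSI-X table.
       */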
 5586 static void
 5587 iflib_free_intr_mem(if_ctx_t ctx)
 5588 {
 5589 
 5590         if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_MSIX) {
 5591                 iflib_irq_free(ctx, &ctx->ifc_legacy_irq);
 5592         }
 5593         if (ctx->ifc_softc_ctx.isc_intr != IFLIB_INTR_LEGACY) {
 5594                 pci_release_msi(ctx->ifc_dev);
 5595         }
 5596         if (ctx->ifc_msix_mem != NULL) {
 5597                 bus_release_resource(ctx->ifc_dev, SYS_RES_MEMORY,
 5598                     rman_get_rid(ctx->ifc_msix_mem), ctx->ifc_msix_mem);
 5599                 ctx->ifc_msix_mem = NULL;
 5600         }
 5601 }
 5602 
 5603 int
 5604 iflib_device_detach(device_t dev)
 5605 {
 5606         if_ctx_t ctx = device_get_softc(dev);
 5607 
 5608         return (iflib_device_deregister(ctx));
 5609 }
 5610 
 5611 int
 5612 iflib_device_suspend(device_t dev)
 5613 {
 5614         if_ctx_t ctx = device_get_softc(dev);
 5615 
 5616         CTX_LOCK(ctx);
 5617         IFDI_SUSPEND(ctx);
 5618         CTX_UNLOCK(ctx);
 5619 
 5620         return (bus_generic_suspend(dev));
 5621 }
 5622 int
 5623 iflib_device_shutdown(device_t dev)
 5624 {
 5625         if_ctx_t ctx = device_get_softc(dev);
 5626 
 5627         CTX_LOCK(ctx);
 5628         IFDI_SHUTDOWN(ctx);
 5629         CTX_UNLOCK(ctx);
 5630 
 5631         return (bus_generic_suspend(dev));
 5632 }
 5633 
 5634 
 5635 int
 5636 iflib_device_resume(device_t dev)
 5637 {
 5638         if_ctx_t ctx = device_get_softc(dev);
 5639         iflib_txq_t txq = ctx->ifc_txqs;
 5640 
 5641         CTX_LOCK(ctx);
 5642         IFDI_RESUME(ctx);
 5643         iflib_if_init_locked(ctx);
 5644         CTX_UNLOCK(ctx);
 5645         for (int i = 0; i < NTXQSETS(ctx); i++, txq++)
 5646                 iflib_txq_check_drain(txq, IFLIB_RESTART_BUDGET);
 5647 
 5648         return (bus_generic_resume(dev));
 5649 }
 5650 
 5651 int
 5652 iflib_device_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params)
 5653 {
 5654         int error;
 5655         if_ctx_t ctx = device_get_softc(dev);
 5656 
 5657         CTX_LOCK(ctx);
 5658         error = IFDI_IOV_INIT(ctx, num_vfs, params);
 5659         CTX_UNLOCK(ctx);
 5660 
 5661         return (error);
 5662 }
 5663 
 5664 void
 5665 iflib_device_iov_uninit(device_t dev)
 5666 {
 5667         if_ctx_t ctx = device_get_softc(dev);
 5668 
 5669         CTX_LOCK(ctx);
 5670         IFDI_IOV_UNINIT(ctx);
 5671         CTX_UNLOCK(ctx);
 5672 }
 5673 
 5674 int
 5675 iflib_device_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params)
 5676 {
 5677         int error;
 5678         if_ctx_t ctx = device_get_softc(dev);
 5679 
 5680         CTX_LOCK(ctx);
 5681         error = IFDI_IOV_VF_ADD(ctx, vfnum, params);
 5682         CTX_UNLOCK(ctx);
 5683 
 5684         return (error);
 5685 }
 5686 
 5687 /*********************************************************************
 5688  *
 5689  *  MODULE FUNCTION DEFINITIONS
 5690  *
 5691  **********************************************************************/
 5692 
 5693 /*
 5694  * - Start a fast taskqueue thread for each core
 5695  * - Start a taskqueue for control operations
 5696  */
 5697 static int
 5698 iflib_module_init(void)
 5699 {
 5700         iflib_timer_default = hz / 2;
 5701         return (0);
 5702 }
 5703 
 5704 static int
 5705 iflib_module_event_handler(module_t mod, int what, void *arg)
 5706 {
 5707         int err;
 5708 
 5709         switch (what) {
 5710         case MOD_LOAD:
 5711                 if ((err = iflib_module_init()) != 0)
 5712                         return (err);
 5713                 break;
 5714         case MOD_UNLOAD:
 5715                 return (EBUSY);
 5716         default:
 5717                 return (EOPNOTSUPP);
 5718         }
 5719 
 5720         return (0);
 5721 }
 5722 
 5723 /*********************************************************************
 5724  *
 5725  *  PUBLIC FUNCTION DEFINITIONS
 5726  *     ordered as in iflib.h
 5727  *
 5728  **********************************************************************/
 5729 
 5730 
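      /*
       * Sanity-check the shared context supplied by the driver: the DMA
       * segment limits must be set, and every min/max/default descriptor
       * count must be a non-zero power of two.
       */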
 5731 static void
 5732 _iflib_assert(if_shared_ctx_t sctx)
 5733 {
 5734         int i;
 5735 
 5736         MPASS(sctx->isc_tx_maxsize);
 5737         MPASS(sctx->isc_tx_maxsegsize);
 5738 
 5739         MPASS(sctx->isc_rx_maxsize);
 5740         MPASS(sctx->isc_rx_nsegments);
 5741         MPASS(sctx->isc_rx_maxsegsize);
 5742 
 5743         MPASS(sctx->isc_nrxqs >= 1 && sctx->isc_nrxqs <= 8);
 5744         for (i = 0; i < sctx->isc_nrxqs; i++) {
 5745                 MPASS(sctx->isc_nrxd_min[i]);
 5746                 MPASS(powerof2(sctx->isc_nrxd_min[i]));
 5747                 MPASS(sctx->isc_nrxd_max[i]);
 5748                 MPASS(powerof2(sctx->isc_nrxd_max[i]));
 5749                 MPASS(sctx->isc_nrxd_default[i]);
 5750                 MPASS(powerof2(sctx->isc_nrxd_default[i]));
 5751         }
 5752 
 5753         MPASS(sctx->isc_ntxqs >= 1 && sctx->isc_ntxqs <= 8);
 5754         for (i = 0; i < sctx->isc_ntxqs; i++) {
 5755                 MPASS(sctx->isc_ntxd_min[i]);
 5756                 MPASS(powerof2(sctx->isc_ntxd_min[i]));
 5757                 MPASS(sctx->isc_ntxd_max[i]);
 5758                 MPASS(powerof2(sctx->isc_ntxd_max[i]));
 5759                 MPASS(sctx->isc_ntxd_default[i]);
 5760                 MPASS(powerof2(sctx->isc_ntxd_default[i]));
 5761         }
 5762 }
 5763 
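      /*
       * Verify that the driver supplied all of the txrx methods iflib will
       * call into.
       */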
 5764 static void
 5765 _iflib_pre_assert(if_softc_ctx_t scctx)
 5766 {
 5767 
 5768         MPASS(scctx->isc_txrx->ift_txd_encap);
 5769         MPASS(scctx->isc_txrx->ift_txd_flush);
 5770         MPASS(scctx->isc_txrx->ift_txd_credits_update);
 5771         MPASS(scctx->isc_txrx->ift_rxd_available);
 5772         MPASS(scctx->isc_txrx->ift_rxd_pkt_get);
 5773         MPASS(scctx->isc_txrx->ift_rxd_refill);
 5774         MPASS(scctx->isc_txrx->ift_rxd_flush);
 5775 }
 5776 
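      /*
       * Allocate the ifnet for this context and wire it up: initialize the
       * context locks, compile the driver's kobj method table, install the
       * iflib if_* handlers, register VLAN config/unconfig event handlers,
       * and initialize the media list.
       */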
 5777 static int
 5778 iflib_register(if_ctx_t ctx)
 5779 {
 5780         if_shared_ctx_t sctx = ctx->ifc_sctx;
 5781         driver_t *driver = sctx->isc_driver;
 5782         device_t dev = ctx->ifc_dev;
 5783         if_t ifp;
 5784         u_char type;
 5785         int iflags;
 5786 
 5787         if ((sctx->isc_flags & IFLIB_PSEUDO) == 0)
 5788                 _iflib_assert(sctx);
 5789 
 5790         CTX_LOCK_INIT(ctx);
 5791         STATE_LOCK_INIT(ctx, device_get_nameunit(ctx->ifc_dev));
 5792         if (sctx->isc_flags & IFLIB_PSEUDO) {
 5793                 if (sctx->isc_flags & IFLIB_PSEUDO_ETHER)
 5794                         type = IFT_ETHER;
 5795                 else
 5796                         type = IFT_PPP;
 5797         } else
 5798                 type = IFT_ETHER;
 5799         ifp = ctx->ifc_ifp = if_alloc(type);
 5800         if (ifp == NULL) {
 5801                 device_printf(dev, "can not allocate ifnet structure\n");
 5802                 return (ENOMEM);
 5803         }
 5804 
 5805         /*
 5806          * Initialize our context's device specific methods
 5807          */
 5808         kobj_init((kobj_t) ctx, (kobj_class_t) driver);
 5809         kobj_class_compile((kobj_class_t) driver);
 5810 
 5811         if_initname(ifp, device_get_name(dev), device_get_unit(dev));
 5812         if_setsoftc(ifp, ctx);
 5813         if_setdev(ifp, dev);
 5814         if_setinitfn(ifp, iflib_if_init);
 5815         if_setioctlfn(ifp, iflib_if_ioctl);
 5816 #ifdef ALTQ
 5817         if_setstartfn(ifp, iflib_altq_if_start);
 5818         if_settransmitfn(ifp, iflib_altq_if_transmit);
 5819         if_setsendqready(ifp);
 5820 #else
 5821         if_settransmitfn(ifp, iflib_if_transmit);
 5822 #endif
 5823         if_setqflushfn(ifp, iflib_if_qflush);
 5824         iflags = IFF_MULTICAST;
 5825 
 5826         if ((sctx->isc_flags & IFLIB_PSEUDO) &&
 5827                 (sctx->isc_flags & IFLIB_PSEUDO_ETHER) == 0)
 5828                 iflags |= IFF_POINTOPOINT;
 5829         else
 5830                 iflags |= IFF_BROADCAST | IFF_SIMPLEX;
 5831         if_setflags(ifp, iflags);
 5832         ctx->ifc_vlan_attach_event =
 5833                 EVENTHANDLER_REGISTER(vlan_config, iflib_vlan_register, ctx,
 5834                                                           EVENTHANDLER_PRI_FIRST);
 5835         ctx->ifc_vlan_detach_event =
 5836                 EVENTHANDLER_REGISTER(vlan_unconfig, iflib_vlan_unregister, ctx,
 5837                                                           EVENTHANDLER_PRI_FIRST);
 5838 
 5839         ifmedia_init(&ctx->ifc_media, IFM_IMASK,
 5840                                          iflib_media_change, iflib_media_status);
 5841 
 5842         return (0);
 5843 }
 5844 
 5845 static void
 5846 iflib_unregister_vlan_handlers(if_ctx_t ctx)
 5847 {
 5848         /* Unregister VLAN events */
 5849         if (ctx->ifc_vlan_attach_event != NULL) {
 5850                 EVENTHANDLER_DEREGISTER(vlan_config, ctx->ifc_vlan_attach_event);
 5851                 ctx->ifc_vlan_attach_event = NULL;
 5852         }
 5853         if (ctx->ifc_vlan_detach_event != NULL) {
 5854                 EVENTHANDLER_DEREGISTER(vlan_unconfig, ctx->ifc_vlan_detach_event);
 5855                 ctx->ifc_vlan_detach_event = NULL;
 5856         }
 5857 
 5858 }
 5859 
 5860 static void
 5861 iflib_deregister(if_ctx_t ctx)
 5862 {
 5863         if_t ifp = ctx->ifc_ifp;
 5864 
 5865         /* Remove all media */
 5866         ifmedia_removeall(&ctx->ifc_media);
 5867 
 5868         /* Ensure that VLAN event handlers are unregistered */
 5869         iflib_unregister_vlan_handlers(ctx);
 5870 
 5871         /* Release kobject reference */
 5872         kobj_delete((kobj_t) ctx, NULL);
 5873 
 5874         /* Free the ifnet structure */
 5875         if_free(ifp);
 5876 
 5877         STATE_LOCK_DESTROY(ctx);
 5878 
 5879         /* ether_ifdetach calls if_qflush - lock must be destroyed afterwards */
 5880         CTX_LOCK_DESTROY(ctx);
 5881 }
 5882 
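      /*
       * Allocate the software and DMA state for every TX and RX queue set:
       * the queue structures, descriptor DMA rings, transmit buffer maps,
       * receive free lists, and the per-TX-queue mp_ring, then pass the
       * ring addresses to the driver via IFDI_TX_QUEUES_ALLOC() and
       * IFDI_RX_QUEUES_ALLOC().
       */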
 5883 static int
 5884 iflib_queues_alloc(if_ctx_t ctx)
 5885 {
 5886         if_shared_ctx_t sctx = ctx->ifc_sctx;
 5887         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 5888         device_t dev = ctx->ifc_dev;
 5889         int nrxqsets = scctx->isc_nrxqsets;
 5890         int ntxqsets = scctx->isc_ntxqsets;
 5891         iflib_txq_t txq;
 5892         iflib_rxq_t rxq;
 5893         iflib_fl_t fl = NULL;
 5894         int i, j, cpu, err, txconf, rxconf;
 5895         iflib_dma_info_t ifdip;
 5896         uint32_t *rxqsizes = scctx->isc_rxqsizes;
 5897         uint32_t *txqsizes = scctx->isc_txqsizes;
 5898         uint8_t nrxqs = sctx->isc_nrxqs;
 5899         uint8_t ntxqs = sctx->isc_ntxqs;
 5900         int nfree_lists = sctx->isc_nfl ? sctx->isc_nfl : 1;
 5901         int fl_offset = (sctx->isc_flags & IFLIB_HAS_RXCQ ? 1 : 0);
 5902         caddr_t *vaddrs;
 5903         uint64_t *paddrs;
 5904 
 5905         KASSERT(ntxqs > 0, ("number of queues per qset must be at least 1"));
 5906         KASSERT(nrxqs > 0, ("number of queues per qset must be at least 1"));
 5907         KASSERT(nrxqs >= fl_offset + nfree_lists,
 5908            ("there must be at least a rxq for each free list"));
 5909 
 5910         /* Allocate the TX ring struct memory */
 5911         if (!(ctx->ifc_txqs =
 5912             (iflib_txq_t) malloc(sizeof(struct iflib_txq) *
 5913             ntxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 5914                 device_printf(dev, "Unable to allocate TX ring memory\n");
 5915                 err = ENOMEM;
 5916                 goto fail;
 5917         }
 5918 
 5919         /* Now allocate the RX */
 5920         if (!(ctx->ifc_rxqs =
 5921             (iflib_rxq_t) malloc(sizeof(struct iflib_rxq) *
 5922             nrxqsets, M_IFLIB, M_NOWAIT | M_ZERO))) {
 5923                 device_printf(dev, "Unable to allocate RX ring memory\n");
 5924                 err = ENOMEM;
 5925                 goto rx_fail;
 5926         }
 5927 
 5928         txq = ctx->ifc_txqs;
 5929         rxq = ctx->ifc_rxqs;
 5930 
 5931         /*
 5932          * XXX handle allocation failure
 5933          */
 5934         for (txconf = i = 0, cpu = CPU_FIRST(); i < ntxqsets; i++, txconf++, txq++, cpu = CPU_NEXT(cpu)) {
 5935                 /* Set up some basics */
 5936 
 5937                 if ((ifdip = malloc(sizeof(struct iflib_dma_info) * ntxqs,
 5938                     M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 5939                         device_printf(dev,
 5940                             "Unable to allocate TX DMA info memory\n");
 5941                         err = ENOMEM;
 5942                         goto err_tx_desc;
 5943                 }
 5944                 txq->ift_ifdi = ifdip;
 5945                 for (j = 0; j < ntxqs; j++, ifdip++) {
 5946                         if (iflib_dma_alloc(ctx, txqsizes[j], ifdip, 0)) {
 5947                                 device_printf(dev,
 5948                                     "Unable to allocate TX descriptors\n");
 5949                                 err = ENOMEM;
 5950                                 goto err_tx_desc;
 5951                         }
 5952                         txq->ift_txd_size[j] = scctx->isc_txd_size[j];
 5953                         bzero((void *)ifdip->idi_vaddr, txqsizes[j]);
 5954                 }
 5955                 txq->ift_ctx = ctx;
 5956                 txq->ift_id = i;
 5957                 if (sctx->isc_flags & IFLIB_HAS_TXCQ) {
 5958                         txq->ift_br_offset = 1;
 5959                 } else {
 5960                         txq->ift_br_offset = 0;
 5961                 }
 5962 
 5963                 if (iflib_txsd_alloc(txq)) {
 5964                         device_printf(dev, "Critical Failure setting up TX buffers\n");
 5965                         err = ENOMEM;
 5966                         goto err_tx_desc;
 5967                 }
 5968 
 5969                 /* Initialize the TX lock */
 5970                 snprintf(txq->ift_mtx_name, MTX_NAME_LEN, "%s:TX(%d):callout",
 5971                     device_get_nameunit(dev), txq->ift_id);
 5972                 mtx_init(&txq->ift_mtx, txq->ift_mtx_name, NULL, MTX_DEF);
 5973                 callout_init_mtx(&txq->ift_timer, &txq->ift_mtx, 0);
 5974                 txq->ift_timer.c_cpu = cpu;
 5975 #ifdef DEV_NETMAP
 5976                 callout_init_mtx(&txq->ift_netmap_timer, &txq->ift_mtx, 0);
 5977                 txq->ift_netmap_timer.c_cpu = cpu;
 5978 #endif /* DEV_NETMAP */
 5979 
 5980                 err = ifmp_ring_alloc(&txq->ift_br, 2048, txq, iflib_txq_drain,
 5981                                       iflib_txq_can_drain, M_IFLIB, M_WAITOK);
 5982                 if (err) {
 5983                         /* XXX free any allocated rings */
 5984                         device_printf(dev, "Unable to allocate buf_ring\n");
 5985                         goto err_tx_desc;
 5986                 }
 5987         }
 5988 
 5989         for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
 5990                 /* Set up some basics */
 5991                 callout_init(&rxq->ifr_watchdog, 1);
 5992 
 5993                 if ((ifdip = malloc(sizeof(struct iflib_dma_info) * nrxqs,
 5994                    M_IFLIB, M_NOWAIT | M_ZERO)) == NULL) {
 5995                         device_printf(dev,
 5996                             "Unable to allocate RX DMA info memory\n");
 5997                         err = ENOMEM;
 5998                         goto err_tx_desc;
 5999                 }
 6000 
 6001                 rxq->ifr_ifdi = ifdip;
 6002                 /* XXX this needs to be changed if #rx queues != #tx queues */
 6003                 rxq->ifr_ntxqirq = 1;
 6004                 rxq->ifr_txqid[0] = i;
 6005                 for (j = 0; j < nrxqs; j++, ifdip++) {
 6006                         if (iflib_dma_alloc(ctx, rxqsizes[j], ifdip, 0)) {
 6007                                 device_printf(dev,
 6008                                     "Unable to allocate RX descriptors\n");
 6009                                 err = ENOMEM;
 6010                                 goto err_tx_desc;
 6011                         }
 6012                         bzero((void *)ifdip->idi_vaddr, rxqsizes[j]);
 6013                 }
 6014                 rxq->ifr_ctx = ctx;
 6015                 rxq->ifr_id = i;
 6016                 rxq->ifr_fl_offset = fl_offset;
 6017                 rxq->ifr_nfl = nfree_lists;
 6018                 if (!(fl =
 6019                           (iflib_fl_t) malloc(sizeof(struct iflib_fl) * nfree_lists, M_IFLIB, M_NOWAIT | M_ZERO))) {
 6020                         device_printf(dev, "Unable to allocate free list memory\n");
 6021                         err = ENOMEM;
 6022                         goto err_tx_desc;
 6023                 }
 6024                 rxq->ifr_fl = fl;
 6025                 for (j = 0; j < nfree_lists; j++) {
 6026                         fl[j].ifl_rxq = rxq;
 6027                         fl[j].ifl_id = j;
 6028                         fl[j].ifl_ifdi = &rxq->ifr_ifdi[j + rxq->ifr_fl_offset];
 6029                         fl[j].ifl_rxd_size = scctx->isc_rxd_size[j];
 6030                 }
 6031                 /* Allocate receive buffers for the ring */
 6032                 if (iflib_rxsd_alloc(rxq)) {
 6033                         device_printf(dev,
 6034                             "Critical Failure setting up receive buffers\n");
 6035                         err = ENOMEM;
 6036                         goto err_rx_desc;
 6037                 }
 6038 
 6039                 for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) 
 6040                         fl->ifl_rx_bitmap = bit_alloc(fl->ifl_size, M_IFLIB,
 6041                             M_WAITOK);
 6042         }
 6043 
 6044         /* TXQs */
 6045         vaddrs = malloc(sizeof(caddr_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 6046         paddrs = malloc(sizeof(uint64_t)*ntxqsets*ntxqs, M_IFLIB, M_WAITOK);
 6047         for (i = 0; i < ntxqsets; i++) {
 6048                 iflib_dma_info_t di = ctx->ifc_txqs[i].ift_ifdi;
 6049 
 6050                 for (j = 0; j < ntxqs; j++, di++) {
 6051                         vaddrs[i*ntxqs + j] = di->idi_vaddr;
 6052                         paddrs[i*ntxqs + j] = di->idi_paddr;
 6053                 }
 6054         }
 6055         if ((err = IFDI_TX_QUEUES_ALLOC(ctx, vaddrs, paddrs, ntxqs, ntxqsets)) != 0) {
 6056                 device_printf(ctx->ifc_dev,
 6057                     "Unable to allocate device TX queue\n");
 6058                 iflib_tx_structures_free(ctx);
 6059                 free(vaddrs, M_IFLIB);
 6060                 free(paddrs, M_IFLIB);
 6061                 goto err_rx_desc;
 6062         }
 6063         free(vaddrs, M_IFLIB);
 6064         free(paddrs, M_IFLIB);
 6065 
 6066         /* RXQs */
 6067         vaddrs = malloc(sizeof(caddr_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 6068         paddrs = malloc(sizeof(uint64_t)*nrxqsets*nrxqs, M_IFLIB, M_WAITOK);
 6069         for (i = 0; i < nrxqsets; i++) {
 6070                 iflib_dma_info_t di = ctx->ifc_rxqs[i].ifr_ifdi;
 6071 
 6072                 for (j = 0; j < nrxqs; j++, di++) {
 6073                         vaddrs[i*nrxqs + j] = di->idi_vaddr;
 6074                         paddrs[i*nrxqs + j] = di->idi_paddr;
 6075                 }
 6076         }
 6077         if ((err = IFDI_RX_QUEUES_ALLOC(ctx, vaddrs, paddrs, nrxqs, nrxqsets)) != 0) {
 6078                 device_printf(ctx->ifc_dev,
 6079                     "Unable to allocate device RX queue\n");
 6080                 iflib_tx_structures_free(ctx);
 6081                 free(vaddrs, M_IFLIB);
 6082                 free(paddrs, M_IFLIB);
 6083                 goto err_rx_desc;
 6084         }
 6085         free(vaddrs, M_IFLIB);
 6086         free(paddrs, M_IFLIB);
 6087 
 6088         return (0);
 6089 
 6090 /* XXX handle allocation failure changes */
 6091 err_rx_desc:
 6092 err_tx_desc:
 6093 rx_fail:
 6094         if (ctx->ifc_rxqs != NULL)
 6095                 free(ctx->ifc_rxqs, M_IFLIB);
 6096         ctx->ifc_rxqs = NULL;
 6097         if (ctx->ifc_txqs != NULL)
 6098                 free(ctx->ifc_txqs, M_IFLIB);
 6099         ctx->ifc_txqs = NULL;
 6100 fail:
 6101         return (err);
 6102 }
 6103 
 6104 static int
 6105 iflib_tx_structures_setup(if_ctx_t ctx)
 6106 {
 6107         iflib_txq_t txq = ctx->ifc_txqs;
 6108         int i;
 6109 
 6110         for (i = 0; i < NTXQSETS(ctx); i++, txq++)
 6111                 iflib_txq_setup(txq);
 6112 
 6113         return (0);
 6114 }
 6115 
 6116 static void
 6117 iflib_tx_structures_free(if_ctx_t ctx)
 6118 {
 6119         iflib_txq_t txq = ctx->ifc_txqs;
 6120         if_shared_ctx_t sctx = ctx->ifc_sctx;
 6121         int i, j;
 6122 
 6123         for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
 6124                 for (j = 0; j < sctx->isc_ntxqs; j++)
 6125                         iflib_dma_free(&txq->ift_ifdi[j]);
 6126                 iflib_txq_destroy(txq);
 6127         }
 6128         free(ctx->ifc_txqs, M_IFLIB);
 6129         ctx->ifc_txqs = NULL;
 6130 }
 6131 
 6132 /*********************************************************************
 6133  *
 6134  *  Initialize all receive rings.
 6135  *
 6136  **********************************************************************/
 6137 static int
 6138 iflib_rx_structures_setup(if_ctx_t ctx)
 6139 {
 6140         iflib_rxq_t rxq = ctx->ifc_rxqs;
 6141         int q;
 6142 #if defined(INET6) || defined(INET)
 6143         int err, i;
 6144 #endif
 6145 
 6146         for (q = 0; q < ctx->ifc_softc_ctx.isc_nrxqsets; q++, rxq++) {
 6147 #if defined(INET6) || defined(INET)
 6148                 if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO) {
 6149                         err = tcp_lro_init_args(&rxq->ifr_lc, ctx->ifc_ifp,
 6150                             TCP_LRO_ENTRIES, min(1024,
 6151                             ctx->ifc_softc_ctx.isc_nrxd[rxq->ifr_fl_offset]));
 6152                         if (err != 0) {
 6153                                 device_printf(ctx->ifc_dev,
 6154                                     "LRO Initialization failed!\n");
 6155                                 goto fail;
 6156                         }
 6157                 }
 6158 #endif
 6159                 IFDI_RXQ_SETUP(ctx, rxq->ifr_id);
 6160         }
 6161         return (0);
 6162 #if defined(INET6) || defined(INET)
 6163 fail:
 6164         /*
 6165          * Free the LRO resources allocated so far; we only handle the
 6166          * rings that completed, since the failing ring has already
 6167          * cleaned up after itself.  'q' failed, so it is the terminus.
 6168          */
 6169         rxq = ctx->ifc_rxqs;
 6170         for (i = 0; i < q; ++i, rxq++) {
 6171                 if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO)
 6172                         tcp_lro_free(&rxq->ifr_lc);
 6173         }
 6174         return (err);
 6175 #endif
 6176 }
 6177 
 6178 /*********************************************************************
 6179  *
 6180  *  Free all receive rings.
 6181  *
 6182  **********************************************************************/
 6183 static void
 6184 iflib_rx_structures_free(if_ctx_t ctx)
 6185 {
 6186         iflib_rxq_t rxq = ctx->ifc_rxqs;
 6187         if_shared_ctx_t sctx = ctx->ifc_sctx;
 6188         int i, j;
 6189 
 6190         for (i = 0; i < ctx->ifc_softc_ctx.isc_nrxqsets; i++, rxq++) {
 6191                 for (j = 0; j < sctx->isc_nrxqs; j++)
 6192                         iflib_dma_free(&rxq->ifr_ifdi[j]);
 6193                 iflib_rx_sds_free(rxq);
 6194 #if defined(INET6) || defined(INET)
 6195                 if (if_getcapabilities(ctx->ifc_ifp) & IFCAP_LRO)
 6196                         tcp_lro_free(&rxq->ifr_lc);
 6197 #endif
 6198         }
 6199         free(ctx->ifc_rxqs, M_IFLIB);
 6200         ctx->ifc_rxqs = NULL;
 6201 }
 6202 
 6203 static int
 6204 iflib_qset_structures_setup(if_ctx_t ctx)
 6205 {
 6206         int err;
 6207 
 6208         /*
 6209          * It is expected that the caller takes care of freeing queues if this
 6210          * fails.
 6211          */
 6212         if ((err = iflib_tx_structures_setup(ctx)) != 0) {
 6213                 device_printf(ctx->ifc_dev, "iflib_tx_structures_setup failed: %d\n", err);
 6214                 return (err);
 6215         }
 6216 
 6217         if ((err = iflib_rx_structures_setup(ctx)) != 0)
 6218                 device_printf(ctx->ifc_dev, "iflib_rx_structures_setup failed: %d\n", err);
 6219 
 6220         return (err);
 6221 }
 6222 
 6223 int
 6224 iflib_irq_alloc(if_ctx_t ctx, if_irq_t irq, int rid,
 6225                 driver_filter_t filter, void *filter_arg, driver_intr_t handler, void *arg, const char *name)
 6226 {
 6227 
 6228         return (_iflib_irq_alloc(ctx, irq, rid, filter, handler, arg, name));
 6229 }
 6230 
 6231 /* Just to avoid copy/paste */
 6232 static inline int
 6233 iflib_irq_set_affinity(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type,
 6234     int qid, struct grouptask *gtask, struct taskqgroup *tqg, void *uniq,
 6235     const char *name)
 6236 {
 6237         device_t dev;
 6238         unsigned int base_cpuid, cpuid;
 6239         int err;
 6240 
 6241         dev = ctx->ifc_dev;
 6242         base_cpuid = ctx->ifc_sysctl_core_offset;
 6243         cpuid = get_cpuid_for_queue(ctx, base_cpuid, qid, type == IFLIB_INTR_TX);
 6244         err = taskqgroup_attach_cpu(tqg, gtask, uniq, cpuid,
 6245             irq ? rman_get_start(irq->ii_res) : -1, name);
 6246         if (err) {
 6247                 device_printf(dev, "taskqgroup_attach_cpu failed %d\n", err);
 6248                 return (err);
 6249         }
 6250 #ifdef notyet
 6251         if (cpuid > ctx->ifc_cpuid_highest)
 6252                 ctx->ifc_cpuid_highest = cpuid;
 6253 #endif
 6254         return (0);
 6255 }
 6256 
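      /*
       * Allocate an interrupt of the requested type and hook up its filter
       * info and grouptask; for TX/RX/RXTX interrupts the task is also
       * attached to the I/O taskqgroup, normally bound to a CPU chosen by
       * iflib_irq_set_affinity().
       */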
 6257 int
 6258 iflib_irq_alloc_generic(if_ctx_t ctx, if_irq_t irq, int rid,
 6259                         iflib_intr_type_t type, driver_filter_t *filter,
 6260                         void *filter_arg, int qid, const char *name)
 6261 {
 6262         device_t dev;
 6263         struct grouptask *gtask;
 6264         struct taskqgroup *tqg;
 6265         iflib_filter_info_t info;
 6266         gtask_fn_t *fn;
 6267         int tqrid, err;
 6268         driver_filter_t *intr_fast;
 6269         void *q;
 6270 
 6271         info = &ctx->ifc_filter_info;
 6272         tqrid = rid;
 6273 
 6274         switch (type) {
 6275         /* XXX merge tx/rx for netmap? */
 6276         case IFLIB_INTR_TX:
 6277                 q = &ctx->ifc_txqs[qid];
 6278                 info = &ctx->ifc_txqs[qid].ift_filter_info;
 6279                 gtask = &ctx->ifc_txqs[qid].ift_task;
 6280                 tqg = qgroup_if_io_tqg;
 6281                 fn = _task_fn_tx;
 6282                 intr_fast = iflib_fast_intr;
 6283                 GROUPTASK_INIT(gtask, 0, fn, q);
 6284                 ctx->ifc_flags |= IFC_NETMAP_TX_IRQ;
 6285                 break;
 6286         case IFLIB_INTR_RX:
 6287                 q = &ctx->ifc_rxqs[qid];
 6288                 info = &ctx->ifc_rxqs[qid].ifr_filter_info;
 6289                 gtask = &ctx->ifc_rxqs[qid].ifr_task;
 6290                 tqg = qgroup_if_io_tqg;
 6291                 fn = _task_fn_rx;
 6292                 intr_fast = iflib_fast_intr;
 6293                 GROUPTASK_INIT(gtask, 0, fn, q);
 6294                 break;
 6295         case IFLIB_INTR_RXTX:
 6296                 q = &ctx->ifc_rxqs[qid];
 6297                 info = &ctx->ifc_rxqs[qid].ifr_filter_info;
 6298                 gtask = &ctx->ifc_rxqs[qid].ifr_task;
 6299                 tqg = qgroup_if_io_tqg;
 6300                 fn = _task_fn_rx;
 6301                 intr_fast = iflib_fast_intr_rxtx;
 6302                 GROUPTASK_INIT(gtask, 0, fn, q);
 6303                 break;
 6304         case IFLIB_INTR_ADMIN:
 6305                 q = ctx;
 6306                 tqrid = -1;
 6307                 info = &ctx->ifc_filter_info;
 6308                 gtask = &ctx->ifc_admin_task;
 6309                 tqg = qgroup_if_config_tqg;
 6310                 fn = _task_fn_admin;
 6311                 intr_fast = iflib_fast_intr_ctx;
 6312                 break;
 6313         default:
 6314                 device_printf(ctx->ifc_dev, "%s: unknown net intr type\n",
 6315                     __func__);
 6316                 return (EINVAL);
 6317         }
 6318 
 6319         info->ifi_filter = filter;
 6320         info->ifi_filter_arg = filter_arg;
 6321         info->ifi_task = gtask;
 6322         info->ifi_ctx = q;
 6323 
 6324         dev = ctx->ifc_dev;
 6325         err = _iflib_irq_alloc(ctx, irq, rid, intr_fast, NULL, info,  name);
 6326         if (err != 0) {
 6327                 device_printf(dev, "_iflib_irq_alloc failed %d\n", err);
 6328                 return (err);
 6329         }
 6330         if (type == IFLIB_INTR_ADMIN)
 6331                 return (0);
 6332 
 6333         if (tqrid != -1) {
 6334                 err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q,
 6335                     name);
 6336                 if (err)
 6337                         return (err);
 6338         } else {
 6339                 taskqgroup_attach(tqg, gtask, q, rman_get_start(irq->ii_res),
 6340                     name);
 6341         }
 6342 
 6343         return (0);
 6344 }
 6345 
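      /*
       * Set up a grouptask for a TX/RX queue (or the VFLR handler) that is
       * driven in software rather than by its own interrupt, using the same
       * CPU affinity logic as hardware interrupts.
       */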
 6346 void
 6347 iflib_softirq_alloc_generic(if_ctx_t ctx, if_irq_t irq, iflib_intr_type_t type, void *arg, int qid, const char *name)
 6348 {
 6349         device_t dev;
 6350         struct grouptask *gtask;
 6351         struct taskqgroup *tqg;
 6352         gtask_fn_t *fn;
 6353         void *q;
 6354         int err;
 6355 
 6356         switch (type) {
 6357         case IFLIB_INTR_TX:
 6358                 q = &ctx->ifc_txqs[qid];
 6359                 gtask = &ctx->ifc_txqs[qid].ift_task;
 6360                 tqg = qgroup_if_io_tqg;
 6361                 fn = _task_fn_tx;
 6362                 break;
 6363         case IFLIB_INTR_RX:
 6364                 q = &ctx->ifc_rxqs[qid];
 6365                 gtask = &ctx->ifc_rxqs[qid].ifr_task;
 6366                 tqg = qgroup_if_io_tqg;
 6367                 fn = _task_fn_rx;
 6368                 break;
 6369         case IFLIB_INTR_IOV:
 6370                 q = ctx;
 6371                 gtask = &ctx->ifc_vflr_task;
 6372                 tqg = qgroup_if_config_tqg;
 6373                 fn = _task_fn_iov;
 6374                 break;
 6375         default:
 6376                 panic("unknown net intr type");
 6377         }
 6378         GROUPTASK_INIT(gtask, 0, fn, q);
 6379         err = iflib_irq_set_affinity(ctx, irq, type, qid, gtask, tqg, q, name);
 6380         if (err) {
 6381                 dev = ctx->ifc_dev;
 6382                 taskqgroup_attach(tqg, gtask, q, irq ? rman_get_start(irq->ii_res) : -1,
 6383                     name);
 6384         }
 6385 }
 6386 
 6387 void
 6388 iflib_irq_free(if_ctx_t ctx, if_irq_t irq)
 6389 {
 6390 
 6391         if (irq->ii_tag)
 6392                 bus_teardown_intr(ctx->ifc_dev, irq->ii_res, irq->ii_tag);
 6393 
 6394         if (irq->ii_res)
 6395                 bus_release_resource(ctx->ifc_dev, SYS_RES_IRQ,
 6396                     rman_get_rid(irq->ii_res), irq->ii_res);
 6397 }
 6398 
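      /*
       * Set up a single shared (INTx/MSI) interrupt: allocate one IRQ whose
       * fast handler services the first RX queue (or the whole context when
       * the driver asks for RX-only filtering), then attach the RX and TX
       * grouptasks to the I/O taskqgroup.
       */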
 6399 static int
 6400 iflib_legacy_setup(if_ctx_t ctx, driver_filter_t filter, void *filter_arg, int *rid, const char *name)
 6401 {
 6402         iflib_txq_t txq = ctx->ifc_txqs;
 6403         iflib_rxq_t rxq = ctx->ifc_rxqs;
 6404         if_irq_t irq = &ctx->ifc_legacy_irq;
 6405         iflib_filter_info_t info;
 6406         struct grouptask *gtask;
 6407         struct taskqgroup *tqg;
 6408         gtask_fn_t *fn;
 6409         int tqrid;
 6410         void *q;
 6411         int err;
 6412         bool rx_only;
 6413 
 6414         q = &ctx->ifc_rxqs[0];
 6415         info = &rxq[0].ifr_filter_info;
 6416         gtask = &rxq[0].ifr_task;
 6417         tqg = qgroup_if_io_tqg;
 6418         tqrid = irq->ii_rid = *rid;
 6419         fn = _task_fn_rx;
 6420         rx_only = (ctx->ifc_sctx->isc_flags & IFLIB_SINGLE_IRQ_RX_ONLY) != 0;
 6421 
 6422         ctx->ifc_flags |= IFC_LEGACY;
 6423         info->ifi_filter = filter;
 6424         info->ifi_filter_arg = filter_arg;
 6425         info->ifi_task = gtask;
 6426         info->ifi_ctx = rx_only ? ctx : q;
 6427 
 6428         /* We allocate a single interrupt resource */
 6429         err = _iflib_irq_alloc(ctx, irq, tqrid, rx_only ? iflib_fast_intr_ctx :
 6430             iflib_fast_intr_rxtx, NULL, info, name);
 6431         if (err != 0)
 6432                 return (err);
 6433         GROUPTASK_INIT(gtask, 0, fn, q);
 6434         taskqgroup_attach(tqg, gtask, q, rman_get_start(irq->ii_res), name);
 6435 
 6436         GROUPTASK_INIT(&txq->ift_task, 0, _task_fn_tx, txq);
 6437         taskqgroup_attach(qgroup_if_io_tqg, &txq->ift_task, txq,
 6438             rman_get_start(irq->ii_res), "tx");
 6439         return (0);
 6440 }
 6441 
 6442 void
 6443 iflib_led_create(if_ctx_t ctx)
 6444 {
 6445 
 6446         ctx->ifc_led_dev = led_create(iflib_led_func, ctx,
 6447             device_get_nameunit(ctx->ifc_dev));
 6448 }
 6449 
 6450 void
 6451 iflib_tx_intr_deferred(if_ctx_t ctx, int txqid)
 6452 {
 6453 
 6454         GROUPTASK_ENQUEUE(&ctx->ifc_txqs[txqid].ift_task);
 6455 }
 6456 
 6457 void
 6458 iflib_rx_intr_deferred(if_ctx_t ctx, int rxqid)
 6459 {
 6460 
 6461         GROUPTASK_ENQUEUE(&ctx->ifc_rxqs[rxqid].ifr_task);
 6462 }
 6463 
 6464 void
 6465 iflib_admin_intr_deferred(if_ctx_t ctx)
 6466 {
 6467 #ifdef INVARIANTS
 6468         struct grouptask *gtask;
 6469 
 6470         gtask = &ctx->ifc_admin_task;
 6471         MPASS(gtask != NULL && gtask->gt_taskqueue != NULL);
 6472 #endif
 6473 
 6474         GROUPTASK_ENQUEUE(&ctx->ifc_admin_task);
 6475 }
 6476 
 6477 void
 6478 iflib_iov_intr_deferred(if_ctx_t ctx)
 6479 {
 6480 
 6481         GROUPTASK_ENQUEUE(&ctx->ifc_vflr_task);
 6482 }
 6483 
 6484 void
 6485 iflib_io_tqg_attach(struct grouptask *gt, void *uniq, int cpu, char *name)
 6486 {
 6487 
 6488         taskqgroup_attach_cpu(qgroup_if_io_tqg, gt, uniq, cpu, -1, name);
 6489 }
 6490 
 6491 void
 6492 iflib_config_gtask_init(void *ctx, struct grouptask *gtask, gtask_fn_t *fn,
 6493         const char *name)
 6494 {
 6495 
 6496         GROUPTASK_INIT(gtask, 0, fn, ctx);
 6497         taskqgroup_attach(qgroup_if_config_tqg, gtask, gtask, -1, name);
 6498 }
 6499 
 6500 void
 6501 iflib_config_gtask_deinit(struct grouptask *gtask)
 6502 {
 6503 
 6504         taskqgroup_detach(qgroup_if_config_tqg, gtask); 
 6505 }
 6506 
 6507 void
 6508 iflib_link_state_change(if_ctx_t ctx, int link_state, uint64_t baudrate)
 6509 {
 6510         if_t ifp = ctx->ifc_ifp;
 6511         iflib_txq_t txq = ctx->ifc_txqs;
 6512 
 6513         if_setbaudrate(ifp, baudrate);
 6514         if (baudrate >= IF_Gbps(10)) {
 6515                 STATE_LOCK(ctx);
 6516                 ctx->ifc_flags |= IFC_PREFETCH;
 6517                 STATE_UNLOCK(ctx);
 6518         }
 6519         /* If link down, disable watchdog */
 6520         if ((ctx->ifc_link_state == LINK_STATE_UP) && (link_state == LINK_STATE_DOWN)) {
 6521                 for (int i = 0; i < ctx->ifc_softc_ctx.isc_ntxqsets; i++, txq++)
 6522                         txq->ift_qstatus = IFLIB_QUEUE_IDLE;
 6523         }
 6524         ctx->ifc_link_state = link_state;
 6525         if_link_state_change(ifp, link_state);
 6526 }
 6527 
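      /*
       * Ask the driver how many TX descriptors the hardware has completed
       * and advance the queue's processed and consumer indices accordingly.
       */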
 6528 static int
 6529 iflib_tx_credits_update(if_ctx_t ctx, iflib_txq_t txq)
 6530 {
 6531         int credits;
 6532 #ifdef INVARIANTS
 6533         int credits_pre = txq->ift_cidx_processed;
 6534 #endif
 6535 
 6536         bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map,
 6537             BUS_DMASYNC_POSTREAD);
 6538         if ((credits = ctx->isc_txd_credits_update(ctx->ifc_softc, txq->ift_id, true)) == 0)
 6539                 return (0);
 6540 
 6541         txq->ift_processed += credits;
 6542         txq->ift_cidx_processed += credits;
 6543 
 6544         MPASS(credits_pre + credits == txq->ift_cidx_processed);
 6545         if (txq->ift_cidx_processed >= txq->ift_size)
 6546                 txq->ift_cidx_processed -= txq->ift_size;
 6547         return (credits);
 6548 }
 6549 
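      /*
       * Sync the free list descriptor rings and ask the driver how many RX
       * descriptors are available, up to the given budget.
       */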
 6550 static int
 6551 iflib_rxd_avail(if_ctx_t ctx, iflib_rxq_t rxq, qidx_t cidx, qidx_t budget)
 6552 {
 6553         iflib_fl_t fl;
 6554         u_int i;
 6555 
 6556         for (i = 0, fl = &rxq->ifr_fl[0]; i < rxq->ifr_nfl; i++, fl++)
 6557                 bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 6558                     BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 6559         return (ctx->isc_rxd_available(ctx->ifc_softc, rxq->ifr_id, cidx,
 6560             budget));
 6561 }
 6562 
 6563 void
 6564 iflib_add_int_delay_sysctl(if_ctx_t ctx, const char *name,
 6565         const char *description, if_int_delay_info_t info,
 6566         int offset, int value)
 6567 {
 6568         info->iidi_ctx = ctx;
 6569         info->iidi_offset = offset;
 6570         info->iidi_value = value;
 6571         SYSCTL_ADD_PROC(device_get_sysctl_ctx(ctx->ifc_dev),
 6572             SYSCTL_CHILDREN(device_get_sysctl_tree(ctx->ifc_dev)),
 6573             OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW,
 6574             info, 0, iflib_sysctl_int_delay, "I", description);
 6575 }
 6576 
 6577 struct sx *
 6578 iflib_ctx_lock_get(if_ctx_t ctx)
 6579 {
 6580 
 6581         return (&ctx->ifc_ctx_sx);
 6582 }
 6583 
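      /*
       * Work out how many MSI-X vectors to use: start from the device's
       * vector count, reserve the admin vectors, clamp the queue count to
       * the number of CPUs (and RSS buckets, when enabled) and to any user
       * overrides, then fall back to MSI or a legacy interrupt if MSI-X
       * cannot be allocated.
       */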
 6584 static int
 6585 iflib_msix_init(if_ctx_t ctx)
 6586 {
 6587         device_t dev = ctx->ifc_dev;
 6588         if_shared_ctx_t sctx = ctx->ifc_sctx;
 6589         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 6590         int admincnt, bar, err, iflib_num_rx_queues, iflib_num_tx_queues;
 6591         int msgs, queuemsgs, queues, rx_queues, tx_queues, vectors;
 6592 
 6593         iflib_num_tx_queues = ctx->ifc_sysctl_ntxqs;
 6594         iflib_num_rx_queues = ctx->ifc_sysctl_nrxqs;
 6595 
 6596         if (bootverbose)
 6597                 device_printf(dev, "msix_init qsets capped at %d\n",
 6598                     imax(scctx->isc_ntxqsets, scctx->isc_nrxqsets));
 6599 
 6600         /* Override by tuneable */
 6601         if (scctx->isc_disable_msix)
 6602                 goto msi;
 6603 
 6604         /* First try MSI-X */
 6605         if ((msgs = pci_msix_count(dev)) == 0) {
 6606                 if (bootverbose)
 6607                         device_printf(dev, "MSI-X not supported or disabled\n");
 6608                 goto msi;
 6609         }
 6610 
 6611         bar = ctx->ifc_softc_ctx.isc_msix_bar;
 6612         /*
 6613          * bar == -1 => "trust me I know what I'm doing"
 6614          * Some drivers are for hardware that is so shoddily
 6615          * documented that no one knows which bars are which
 6616          * so the developer has to map all bars. This hack
 6617          * allows shoddy garbage to use MSI-X in this framework.
 6618          */
 6619         if (bar != -1) {
 6620                 ctx->ifc_msix_mem = bus_alloc_resource_any(dev,
 6621                     SYS_RES_MEMORY, &bar, RF_ACTIVE);
 6622                 if (ctx->ifc_msix_mem == NULL) {
 6623                         device_printf(dev, "Unable to map MSI-X table\n");
 6624                         goto msi;
 6625                 }
 6626         }
 6627 
 6628         admincnt = sctx->isc_admin_intrcnt;
 6629 #if IFLIB_DEBUG
 6630         /* use only 1 qset in debug mode */
 6631         queuemsgs = min(msgs - admincnt, 1);
 6632 #else
 6633         queuemsgs = msgs - admincnt;
 6634 #endif
 6635 #ifdef RSS
 6636         queues = imin(queuemsgs, rss_getnumbuckets());
 6637 #else
 6638         queues = queuemsgs;
 6639 #endif
 6640         queues = imin(CPU_COUNT(&ctx->ifc_cpus), queues);
 6641         if (bootverbose)
 6642                 device_printf(dev,
 6643                     "intr CPUs: %d queue msgs: %d admincnt: %d\n",
 6644                     CPU_COUNT(&ctx->ifc_cpus), queuemsgs, admincnt);
 6645 #ifdef  RSS
 6646         /* If we're doing RSS, clamp at the number of RSS buckets */
 6647         if (queues > rss_getnumbuckets())
 6648                 queues = rss_getnumbuckets();
 6649 #endif
 6650         if (iflib_num_rx_queues > 0 && iflib_num_rx_queues < queuemsgs - admincnt)
 6651                 rx_queues = iflib_num_rx_queues;
 6652         else
 6653                 rx_queues = queues;
 6654 
 6655         if (rx_queues > scctx->isc_nrxqsets)
 6656                 rx_queues = scctx->isc_nrxqsets;
 6657 
 6658         /*
 6659          * We want this to be all logical CPUs by default
 6660          */
 6661         if (iflib_num_tx_queues > 0 && iflib_num_tx_queues < queues)
 6662                 tx_queues = iflib_num_tx_queues;
 6663         else
 6664                 tx_queues = mp_ncpus;
 6665 
 6666         if (tx_queues > scctx->isc_ntxqsets)
 6667                 tx_queues = scctx->isc_ntxqsets;
 6668 
 6669         if (ctx->ifc_sysctl_qs_eq_override == 0) {
 6670 #ifdef INVARIANTS
 6671                 if (tx_queues != rx_queues)
 6672                         device_printf(dev,
 6673                             "queue equality override not set, capping rx_queues at %d and tx_queues at %d\n",
 6674                             min(rx_queues, tx_queues), min(rx_queues, tx_queues));
 6675 #endif
 6676                 tx_queues = min(rx_queues, tx_queues);
 6677                 rx_queues = min(rx_queues, tx_queues);
 6678         }
 6679 
 6680         vectors = rx_queues + admincnt;
 6681         if (msgs < vectors) {
 6682                 device_printf(dev,
 6683                     "insufficient number of MSI-X vectors "
 6684                     "(supported %d, need %d)\n", msgs, vectors);
 6685                 goto msi;
 6686         }
 6687 
 6688         device_printf(dev, "Using %d RX queues %d TX queues\n", rx_queues,
 6689             tx_queues);
 6690         msgs = vectors;
 6691         if ((err = pci_alloc_msix(dev, &vectors)) == 0) {
 6692                 if (vectors != msgs) {
 6693                         device_printf(dev,
 6694                             "Unable to allocate sufficient MSI-X vectors "
 6695                             "(got %d, need %d)\n", vectors, msgs);
 6696                         pci_release_msi(dev);
 6697                         if (bar != -1) {
 6698                                 bus_release_resource(dev, SYS_RES_MEMORY, bar,
 6699                                     ctx->ifc_msix_mem);
 6700                                 ctx->ifc_msix_mem = NULL;
 6701                         }
 6702                         goto msi;
 6703                 }
 6704                 device_printf(dev, "Using MSI-X interrupts with %d vectors\n",
 6705                     vectors);
 6706                 scctx->isc_vectors = vectors;
 6707                 scctx->isc_nrxqsets = rx_queues;
 6708                 scctx->isc_ntxqsets = tx_queues;
 6709                 scctx->isc_intr = IFLIB_INTR_MSIX;
 6710 
 6711                 return (vectors);
 6712         } else {
 6713                 device_printf(dev,
 6714                     "failed to allocate %d MSI-X vectors, err: %d\n", vectors,
 6715                     err);
 6716                 if (bar != -1) {
 6717                         bus_release_resource(dev, SYS_RES_MEMORY, bar,
 6718                             ctx->ifc_msix_mem);
 6719                         ctx->ifc_msix_mem = NULL;
 6720                 }
 6721         }
 6722 
 6723 msi:
 6724         vectors = pci_msi_count(dev);
 6725         scctx->isc_nrxqsets = 1;
 6726         scctx->isc_ntxqsets = 1;
 6727         scctx->isc_vectors = vectors;
 6728         if (vectors == 1 && pci_alloc_msi(dev, &vectors) == 0) {
 6729                 device_printf(dev, "Using an MSI interrupt\n");
 6730                 scctx->isc_intr = IFLIB_INTR_MSI;
 6731         } else {
 6732                 scctx->isc_vectors = 1;
 6733                 device_printf(dev, "Using a Legacy interrupt\n");
 6734                 scctx->isc_intr = IFLIB_INTR_LEGACY;
 6735         }
 6736 
 6737         return (vectors);
 6738 }
 6739 
 6740 static const char *ring_states[] = { "IDLE", "BUSY", "STALLED", "ABDICATED" };
 6741 
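      /*
       * Sysctl handler that reports an mp_ring's producer/consumer indices
       * and its current state as a human-readable string.
       */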
 6742 static int
 6743 mp_ring_state_handler(SYSCTL_HANDLER_ARGS)
 6744 {
 6745         int rc;
 6746         uint16_t *state = ((uint16_t *)oidp->oid_arg1);
 6747         struct sbuf *sb;
 6748         const char *ring_state = "UNKNOWN";
 6749 
 6750         /* XXX needed ? */
 6751         rc = sysctl_wire_old_buffer(req, 0);
 6752         MPASS(rc == 0);
 6753         if (rc != 0)
 6754                 return (rc);
 6755         sb = sbuf_new_for_sysctl(NULL, NULL, 80, req);
 6756         MPASS(sb != NULL);
 6757         if (sb == NULL)
 6758                 return (ENOMEM);
 6759         if (state[3] <= 3)
 6760                 ring_state = ring_states[state[3]];
 6761 
 6762         sbuf_printf(sb, "pidx_head: %04hd pidx_tail: %04hd cidx: %04hd state: %s",
 6763                     state[0], state[1], state[2], ring_state);
 6764         rc = sbuf_finish(sb);
 6765         sbuf_delete(sb);
 6766         return (rc);
 6767 }
 6768 
 6769 enum iflib_ndesc_handler {
 6770         IFLIB_NTXD_HANDLER,
 6771         IFLIB_NRXD_HANDLER,
 6772 };
 6773 
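      /*
       * Sysctl handler for the override_ntxds/override_nrxds tunables:
       * report the per-queue descriptor counts as a comma-separated list
       * and parse a new list on write.
       */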
 6774 static int
 6775 mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
 6776 {
 6777         if_ctx_t ctx = (void *)arg1;
 6778         enum iflib_ndesc_handler type = arg2;
 6779         char buf[256] = {0};
 6780         qidx_t *ndesc;
 6781         char *p, *next;
 6782         int nqs, rc, i;
 6783 
 6784         nqs = 8;
 6785         switch (type) {
 6786         case IFLIB_NTXD_HANDLER:
 6787                 ndesc = ctx->ifc_sysctl_ntxds;
 6788                 if (ctx->ifc_sctx)
 6789                         nqs = ctx->ifc_sctx->isc_ntxqs;
 6790                 break;
 6791         case IFLIB_NRXD_HANDLER:
 6792                 ndesc = ctx->ifc_sysctl_nrxds;
 6793                 if (ctx->ifc_sctx)
 6794                         nqs = ctx->ifc_sctx->isc_nrxqs;
 6795                 break;
 6796         default:
 6797                 printf("%s: unhandled type\n", __func__);
 6798                 return (EINVAL);
 6799         }
 6800         if (nqs == 0)
 6801                 nqs = 8;
 6802 
 6803         for (i = 0; i < 8; i++) {
 6804                 if (i >= nqs)
 6805                         break;
 6806                 if (i)
 6807                         strcat(buf, ",");
 6808                 sprintf(strchr(buf, 0), "%d", ndesc[i]);
 6809         }
 6810 
 6811         rc = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 6812         if (rc || req->newptr == NULL)
 6813                 return (rc);
 6814 
 6815         for (i = 0, next = buf, p = strsep(&next, " ,"); i < 8 && p;
 6816             i++, p = strsep(&next, " ,")) {
 6817                 ndesc[i] = strtoul(p, NULL, 10);
 6818         }
 6819 
 6820         return (rc);
 6821 }
 6822 
 6823 #define NAME_BUFLEN 32
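      /*
       * Create the per-device "iflib" sysctl node and the tunables that must
       * be evaluated before queues are allocated: queue and descriptor count
       * overrides, MSI-X disable, RX budget, TX abdicate, and core placement
       * controls.
       */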
 6824 static void
 6825 iflib_add_device_sysctl_pre(if_ctx_t ctx)
 6826 {
 6827         device_t dev = iflib_get_dev(ctx);
 6828         struct sysctl_oid_list *child, *oid_list;
 6829         struct sysctl_ctx_list *ctx_list;
 6830         struct sysctl_oid *node;
 6831 
 6832         ctx_list = device_get_sysctl_ctx(dev);
 6833         child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
 6834         ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, "iflib",
 6835                                                       CTLFLAG_RD, NULL, "IFLIB fields");
 6836         oid_list = SYSCTL_CHILDREN(node);
 6837 
 6838         SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
 6839                        CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version,
 6840                        "driver version");
 6841 
 6842         SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
 6843                        CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
 6844                         "# of txqs to use, 0 => use default #");
 6845         SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
 6846                        CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
 6847                         "# of rxqs to use, 0 => use default #");
 6848         SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
 6849                        CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
 6850                        "permit #txq != #rxq");
 6851         SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
 6852                       CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
 6853                       "disable MSI-X (default 0)");
 6854         SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget",
 6855                        CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0,
 6856                        "set the RX budget");
 6857         SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "tx_abdicate",
 6858                        CTLFLAG_RWTUN, &ctx->ifc_sysctl_tx_abdicate, 0,
 6859                        "cause TX to abdicate instead of running to completion");
 6860         ctx->ifc_sysctl_core_offset = CORE_OFFSET_UNSPECIFIED;
 6861         SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "core_offset",
 6862                        CTLFLAG_RDTUN, &ctx->ifc_sysctl_core_offset, 0,
 6863                        "offset to start using cores at");
 6864         SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx",
 6865                        CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0,
 6866                        "use separate cores for TX and RX");
 6867         SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "use_logical_cores",
 6868                       CTLFLAG_RDTUN, &ctx->ifc_sysctl_use_logical_cores, 0,
 6869                       "try to make use of logical cores for TX and RX");
 6870 
 6871         /* XXX change for per-queue sizes */
 6872         SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
 6873                        CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NTXD_HANDLER,
 6874                        mp_ndesc_handler, "A",
 6875                        "list of # of TX descriptors to use, 0 = use default #");
 6876         SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
 6877                        CTLTYPE_STRING|CTLFLAG_RWTUN, ctx, IFLIB_NRXD_HANDLER,
 6878                        mp_ndesc_handler, "A",
 6879                        "list of # of RX descriptors to use, 0 = use default #");
 6880 }
 6881 
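      /*
       * Once the queues exist, add a sysctl node per TX and RX queue (and per
       * free list) exposing CPU binding, descriptor indices, defrag/pullup and
       * DMA-failure counters, mp_ring statistics, and free-list state, plus the
       * MEMORY_LOGGING counters when that option is compiled in.
       */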
 6882 static void
 6883 iflib_add_device_sysctl_post(if_ctx_t ctx)
 6884 {
 6885         if_shared_ctx_t sctx = ctx->ifc_sctx;
 6886         if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
 6887         device_t dev = iflib_get_dev(ctx);
 6888         struct sysctl_oid_list *child;
 6889         struct sysctl_ctx_list *ctx_list;
 6890         iflib_fl_t fl;
 6891         iflib_txq_t txq;
 6892         iflib_rxq_t rxq;
 6893         int i, j;
 6894         char namebuf[NAME_BUFLEN];
 6895         char *qfmt;
 6896         struct sysctl_oid *queue_node, *fl_node, *node;
 6897         struct sysctl_oid_list *queue_list, *fl_list;
 6898         ctx_list = device_get_sysctl_ctx(dev);
 6899 
 6900         node = ctx->ifc_sysctl_node;
 6901         child = SYSCTL_CHILDREN(node);
 6902 
 6903         if (scctx->isc_ntxqsets > 100)
 6904                 qfmt = "txq%03d";
 6905         else if (scctx->isc_ntxqsets > 10)
 6906                 qfmt = "txq%02d";
 6907         else
 6908                 qfmt = "txq%d";
 6909         for (i = 0, txq = ctx->ifc_txqs; i < scctx->isc_ntxqsets; i++, txq++) {
 6910                 snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 6911                 queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 6912                                              CTLFLAG_RD, NULL, "Queue Name");
 6913                 queue_list = SYSCTL_CHILDREN(queue_node);
 6914                 SYSCTL_ADD_S16(ctx_list, queue_list, OID_AUTO, "cpu",
 6915                                CTLFLAG_RD,
 6916                                &txq->ift_task.gt_cpu, 0, "cpu this queue is bound to");
 6917 #if MEMORY_LOGGING
 6918                 SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_dequeued",
 6919                                 CTLFLAG_RD,
 6920                                 &txq->ift_dequeued, "total mbufs freed");
 6921                 SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_enqueued",
 6922                                 CTLFLAG_RD,
 6923                                 &txq->ift_enqueued, "total mbufs enqueued");
 6924 #endif
 6925                 SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag",
 6926                                    CTLFLAG_RD,
 6927                                    &txq->ift_mbuf_defrag, "# of times m_defrag was called");
 6928                 SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "m_pullups",
 6929                                    CTLFLAG_RD,
 6930                                    &txq->ift_pullups, "# of times m_pullup was called");
 6931                 SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "mbuf_defrag_failed",
 6932                                    CTLFLAG_RD,
 6933                                    &txq->ift_mbuf_defrag_failed, "# of times m_defrag failed");
 6934                 SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_desc_avail",
 6935                                    CTLFLAG_RD,
 6936                                    &txq->ift_no_desc_avail, "# of times no descriptors were available");
 6937                 SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "tx_map_failed",
 6938                                    CTLFLAG_RD,
 6939                                    &txq->ift_map_failed, "# of times DMA map failed");
 6940                 SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txd_encap_efbig",
 6941                                    CTLFLAG_RD,
 6942                                    &txq->ift_txd_encap_efbig, "# of times txd_encap returned EFBIG");
 6943                 SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "no_tx_dma_setup",
 6944                                    CTLFLAG_RD,
 6945                                    &txq->ift_no_tx_dma_setup, "# of times map failed for other than EFBIG");
 6946                 SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_pidx",
 6947                                    CTLFLAG_RD,
 6948                                    &txq->ift_pidx, 1, "Producer Index");
 6949                 SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx",
 6950                                    CTLFLAG_RD,
 6951                                    &txq->ift_cidx, 1, "Consumer Index");
 6952                 SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_cidx_processed",
 6953                                    CTLFLAG_RD,
 6954                                    &txq->ift_cidx_processed, 1, "Consumer Index seen by credit update");
 6955                 SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "txq_in_use",
 6956                                    CTLFLAG_RD,
 6957                                    &txq->ift_in_use, 1, "descriptors in use");
 6958                 SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_processed",
 6959                                    CTLFLAG_RD,
 6960                                    &txq->ift_processed, "descriptors processed for clean");
 6961                 SYSCTL_ADD_QUAD(ctx_list, queue_list, OID_AUTO, "txq_cleaned",
 6962                                    CTLFLAG_RD,
 6963                                    &txq->ift_cleaned, "total cleaned");
 6964                 SYSCTL_ADD_PROC(ctx_list, queue_list, OID_AUTO, "ring_state",
 6965                                 CTLTYPE_STRING | CTLFLAG_RD, __DEVOLATILE(uint64_t *, &txq->ift_br->state),
 6966                                 0, mp_ring_state_handler, "A", "soft ring state");
 6967                 SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_enqueues",
 6968                                        CTLFLAG_RD, &txq->ift_br->enqueues,
 6969                                        "# of enqueues to the mp_ring for this queue");
 6970                 SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_drops",
 6971                                        CTLFLAG_RD, &txq->ift_br->drops,
 6972                                        "# of drops in the mp_ring for this queue");
 6973                 SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_starts",
 6974                                        CTLFLAG_RD, &txq->ift_br->starts,
 6975                                        "# of normal consumer starts in the mp_ring for this queue");
 6976                 SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_stalls",
 6977                                        CTLFLAG_RD, &txq->ift_br->stalls,
 6978                                                "# of consumer stalls in the mp_ring for this queue");
 6979                 SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_restarts",
 6980                                CTLFLAG_RD, &txq->ift_br->restarts,
 6981                                        "# of consumer restarts in the mp_ring for this queue");
 6982                 SYSCTL_ADD_COUNTER_U64(ctx_list, queue_list, OID_AUTO, "r_abdications",
 6983                                        CTLFLAG_RD, &txq->ift_br->abdications,
 6984                                        "# of consumer abdications in the mp_ring for this queue");
 6985         }
 6986 
 6987         if (scctx->isc_nrxqsets > 100)
 6988                 qfmt = "rxq%03d";
 6989         else if (scctx->isc_nrxqsets > 10)
 6990                 qfmt = "rxq%02d";
 6991         else
 6992                 qfmt = "rxq%d";
 6993         for (i = 0, rxq = ctx->ifc_rxqs; i < scctx->isc_nrxqsets; i++, rxq++) {
 6994                 snprintf(namebuf, NAME_BUFLEN, qfmt, i);
 6995                 queue_node = SYSCTL_ADD_NODE(ctx_list, child, OID_AUTO, namebuf,
 6996                                              CTLFLAG_RD, NULL, "Queue Name");
 6997                 queue_list = SYSCTL_CHILDREN(queue_node);
 6998                 SYSCTL_ADD_S16(ctx_list, queue_list, OID_AUTO, "cpu",
 6999                                CTLFLAG_RD,
 7000                                &rxq->ifr_task.gt_cpu, 0, "cpu this queue is bound to");
 7001                 if (sctx->isc_flags & IFLIB_HAS_RXCQ) {
 7002                         SYSCTL_ADD_U16(ctx_list, queue_list, OID_AUTO, "rxq_cq_cidx",
 7003                                        CTLFLAG_RD,
 7004                                        &rxq->ifr_cq_cidx, 1, "Consumer Index");
 7005                 }
 7006 
 7007                 for (j = 0, fl = rxq->ifr_fl; j < rxq->ifr_nfl; j++, fl++) {
 7008                         snprintf(namebuf, NAME_BUFLEN, "rxq_fl%d", j);
 7009                         fl_node = SYSCTL_ADD_NODE(ctx_list, queue_list, OID_AUTO, namebuf,
 7010                                                      CTLFLAG_RD, NULL, "freelist Name");
 7011                         fl_list = SYSCTL_CHILDREN(fl_node);
 7012                         SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "pidx",
 7013                                        CTLFLAG_RD,
 7014                                        &fl->ifl_pidx, 1, "Producer Index");
 7015                         SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "cidx",
 7016                                        CTLFLAG_RD,
 7017                                        &fl->ifl_cidx, 1, "Consumer Index");
 7018                         SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "credits",
 7019                                        CTLFLAG_RD,
 7020                                        &fl->ifl_credits, 1, "credits available");
 7021                         SYSCTL_ADD_U16(ctx_list, fl_list, OID_AUTO, "buf_size",
 7022                                        CTLFLAG_RD,
 7023                                        &fl->ifl_buf_size, 1, "buffer size");
 7024 #if MEMORY_LOGGING
 7025                         SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_enqueued",
 7026                                         CTLFLAG_RD,
 7027                                         &fl->ifl_m_enqueued, "mbufs allocated");
 7028                         SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_m_dequeued",
 7029                                         CTLFLAG_RD,
 7030                                         &fl->ifl_m_dequeued, "mbufs freed");
 7031                         SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_enqueued",
 7032                                         CTLFLAG_RD,
 7033                                         &fl->ifl_cl_enqueued, "clusters allocated");
 7034                         SYSCTL_ADD_QUAD(ctx_list, fl_list, OID_AUTO, "fl_cl_dequeued",
 7035                                         CTLFLAG_RD,
 7036                                         &fl->ifl_cl_dequeued, "clusters freed");
 7037 #endif
 7038 
 7039                 }
 7040         }
 7041 
 7042 }
 7043 
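      /*
       * Mark the context so that the admin task reinitializes the interface the
       * next time it runs.
       */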
 7044 void
 7045 iflib_request_reset(if_ctx_t ctx)
 7046 {
 7047 
 7048         STATE_LOCK(ctx);
 7049         ctx->ifc_flags |= IFC_DO_RESET;
 7050         STATE_UNLOCK(ctx);
 7051 }
 7052 
 7053 #ifndef __NO_STRICT_ALIGNMENT
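      /*
       * On strict-alignment architectures, re-align a received frame so that the
       * payload following the 14-byte Ethernet header sits on a 32-bit boundary:
       * either shift the frame forward within its buffer or split the Ethernet
       * header off into a freshly allocated mbuf prepended to the chain.
       */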
 7054 static struct mbuf *
 7055 iflib_fixup_rx(struct mbuf *m)
 7056 {
 7057         struct mbuf *n;
 7058 
 7059         if (m->m_len <= (MCLBYTES - ETHER_HDR_LEN)) {
 7060                 bcopy(m->m_data, m->m_data + ETHER_HDR_LEN, m->m_len);
 7061                 m->m_data += ETHER_HDR_LEN;
 7062                 n = m;
 7063         } else {
 7064                 MGETHDR(n, M_NOWAIT, MT_DATA);
 7065                 if (n == NULL) {
 7066                         m_freem(m);
 7067                         return (NULL);
 7068                 }
 7069                 bcopy(m->m_data, n->m_data, ETHER_HDR_LEN);
 7070                 m->m_data += ETHER_HDR_LEN;
 7071                 m->m_len -= ETHER_HDR_LEN;
 7072                 n->m_len = ETHER_HDR_LEN;
 7073                 M_MOVE_PKTHDR(n, m);
 7074                 n->m_next = m;
 7075         }
 7076         return (n);
 7077 }
 7078 #endif
 7079 
 7080 #ifdef NETDUMP
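      /*
       * Netdump support: report the RX ring count and the cluster count and size
       * of the first free list so the netdump code can size its buffers.
       */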
 7081 static void
 7082 iflib_netdump_init(if_t ifp, int *nrxr, int *ncl, int *clsize)
 7083 {
 7084         if_ctx_t ctx;
 7085 
 7086         ctx = if_getsoftc(ifp);
 7087         CTX_LOCK(ctx);
 7088         *nrxr = NRXQSETS(ctx);
 7089         *ncl = ctx->ifc_rxqs[0].ifr_fl->ifl_size;
 7090         *clsize = ctx->ifc_rxqs[0].ifr_fl->ifl_buf_size;
 7091         CTX_UNLOCK(ctx);
 7092 }
 7093 
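      /*
       * Netdump event hook: on NETDUMP_START, cache the mbuf cluster zone for
       * each free list and disable TX batching so that packets are pushed out
       * immediately while dumping.
       */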
 7094 static void
 7095 iflib_netdump_event(if_t ifp, enum netdump_ev event)
 7096 {
 7097         if_ctx_t ctx;
 7098         if_softc_ctx_t scctx;
 7099         iflib_fl_t fl;
 7100         iflib_rxq_t rxq;
 7101         int i, j;
 7102 
 7103         ctx = if_getsoftc(ifp);
 7104         scctx = &ctx->ifc_softc_ctx;
 7105 
 7106         switch (event) {
 7107         case NETDUMP_START:
 7108                 for (i = 0; i < scctx->isc_nrxqsets; i++) {
 7109                         rxq = &ctx->ifc_rxqs[i];
 7110                         for (j = 0; j < rxq->ifr_nfl; j++) {
 7111                                 fl = &rxq->ifr_fl[j];
 7112                                 fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 7113                         }
 7114                 }
 7115                 iflib_no_tx_batch = 1;
 7116                 break;
 7117         default:
 7118                 break;
 7119         }
 7120 }
 7121 
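      /*
       * Transmit a single netdump packet on TX queue 0, ringing the doorbell on
       * success.  Fails with EBUSY unless the interface is running and not
       * marked OACTIVE.
       */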
 7122 static int
 7123 iflib_netdump_transmit(if_t ifp, struct mbuf *m)
 7124 {
 7125         if_ctx_t ctx;
 7126         iflib_txq_t txq;
 7127         int error;
 7128 
 7129         ctx = if_getsoftc(ifp);
 7130         if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 7131             IFF_DRV_RUNNING)
 7132                 return (EBUSY);
 7133 
 7134         txq = &ctx->ifc_txqs[0];
 7135         error = iflib_encap(txq, &m);
 7136         if (error == 0)
 7137                 (void)iflib_txd_db_check(txq, true);
 7138         return (error);
 7139 }
 7140 
 7141 static int
 7142 iflib_netdump_poll(if_t ifp, int count)
 7143 {
 7144         if_ctx_t ctx;
 7145         if_softc_ctx_t scctx;
 7146         iflib_txq_t txq;
 7147         int i;
 7148 
 7149         ctx = if_getsoftc(ifp);
 7150         scctx = &ctx->ifc_softc_ctx;
 7151 
 7152         if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
 7153             IFF_DRV_RUNNING)
 7154                 return (EBUSY);
 7155 
 7156         txq = &ctx->ifc_txqs[0];
 7157         (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
 7158 
 7159         for (i = 0; i < scctx->isc_nrxqsets; i++)
 7160                 (void)iflib_rxeof(&ctx->ifc_rxqs[i], 16 /* XXX */);
 7161         return (0);
 7162 }
 7163 #endif /* NETDUMP */
