The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/dev/sfxge/sfxge_tx.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2010-2016 Solarflare Communications Inc.
    5  * All rights reserved.
    6  *
    7  * This software was developed in part by Philip Paeps under contract for
    8  * Solarflare Communications, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions are met:
   12  *
   13  * 1. Redistributions of source code must retain the above copyright notice,
   14  *    this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright notice,
   16  *    this list of conditions and the following disclaimer in the documentation
   17  *    and/or other materials provided with the distribution.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
   21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
   23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
   26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
   27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
   28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
   29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   30  *
   31  * The views and conclusions contained in the software and documentation are
   32  * those of the authors and should not be interpreted as representing official
   33  * policies, either expressed or implied, of the FreeBSD Project.
   34  */
   35 
   36 /* Theory of operation:
   37  *
   38  * Tx queues allocation and mapping on Siena
   39  *
   40  * One Tx queue with enabled checksum offload is allocated per Rx channel
   41  * (event queue).  Also 2 Tx queues (one without checksum offload and one
   42  * with IP checksum offload only) are allocated and bound to event queue 0.
   43  * sfxge_txq_type is used as Tx queue label.
   44  *
   45  * So, event queue plus label mapping to Tx queue index is:
   46  *      if event queue index is 0, TxQ-index = TxQ-label * [0..SFXGE_TXQ_NTYPES)
   47  *      else TxQ-index = SFXGE_TXQ_NTYPES + EvQ-index - 1
   48  * See sfxge_get_txq_by_label() sfxge_ev.c
   49  *
   50  * Tx queue allocation and mapping on EF10
   51  *
   52  * One Tx queue with enabled checksum offload is allocated per Rx
   53  * channel (event queue). Checksum offload on all Tx queues is enabled or
   54  * disabled dynamically by inserting option descriptors, so the additional
   55  * queues used on Siena are not required.
   56  *
   57  * TxQ label is always set to zero on EF10 hardware.
   58  * So, event queue to Tx queue mapping is simple:
   59  * TxQ-index = EvQ-index
   60  */
   61 
   62 #include <sys/cdefs.h>
   63 __FBSDID("$FreeBSD$");
   64 
   65 #include "opt_rss.h"
   66 
   67 #include <sys/param.h>
   68 #include <sys/malloc.h>
   69 #include <sys/mbuf.h>
   70 #include <sys/smp.h>
   71 #include <sys/socket.h>
   72 #include <sys/sysctl.h>
   73 #include <sys/syslog.h>
   74 #include <sys/limits.h>
   75 
   76 #include <net/bpf.h>
   77 #include <net/ethernet.h>
   78 #include <net/if.h>
   79 #include <net/if_vlan_var.h>
   80 
   81 #include <netinet/in.h>
   82 #include <netinet/ip.h>
   83 #include <netinet/ip6.h>
   84 #include <netinet/tcp.h>
   85 
   86 #ifdef RSS
   87 #include <net/rss_config.h>
   88 #endif
   89 
   90 #include "common/efx.h"
   91 
   92 #include "sfxge.h"
   93 #include "sfxge_tx.h"
   94 
   95 #define SFXGE_PARAM_TX_DPL_GET_MAX      SFXGE_PARAM(tx_dpl_get_max)
   96 static int sfxge_tx_dpl_get_max = SFXGE_TX_DPL_GET_PKT_LIMIT_DEFAULT;
   97 TUNABLE_INT(SFXGE_PARAM_TX_DPL_GET_MAX, &sfxge_tx_dpl_get_max);
   98 SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_get_max, CTLFLAG_RDTUN,
   99            &sfxge_tx_dpl_get_max, 0,
  100            "Maximum number of any packets in deferred packet get-list");
  101 
  102 #define SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX \
  103         SFXGE_PARAM(tx_dpl_get_non_tcp_max)
  104 static int sfxge_tx_dpl_get_non_tcp_max =
  105         SFXGE_TX_DPL_GET_NON_TCP_PKT_LIMIT_DEFAULT;
  106 TUNABLE_INT(SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX, &sfxge_tx_dpl_get_non_tcp_max);
  107 SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_get_non_tcp_max, CTLFLAG_RDTUN,
  108            &sfxge_tx_dpl_get_non_tcp_max, 0,
  109            "Maximum number of non-TCP packets in deferred packet get-list");
  110 
  111 #define SFXGE_PARAM_TX_DPL_PUT_MAX      SFXGE_PARAM(tx_dpl_put_max)
  112 static int sfxge_tx_dpl_put_max = SFXGE_TX_DPL_PUT_PKT_LIMIT_DEFAULT;
  113 TUNABLE_INT(SFXGE_PARAM_TX_DPL_PUT_MAX, &sfxge_tx_dpl_put_max);
  114 SYSCTL_INT(_hw_sfxge, OID_AUTO, tx_dpl_put_max, CTLFLAG_RDTUN,
  115            &sfxge_tx_dpl_put_max, 0,
  116            "Maximum number of any packets in deferred packet put-list");
  117 
  118 #define SFXGE_PARAM_TSO_FW_ASSISTED     SFXGE_PARAM(tso_fw_assisted)
  119 static int sfxge_tso_fw_assisted = (SFXGE_FATSOV1 | SFXGE_FATSOV2);
  120 TUNABLE_INT(SFXGE_PARAM_TSO_FW_ASSISTED, &sfxge_tso_fw_assisted);
  121 SYSCTL_INT(_hw_sfxge, OID_AUTO, tso_fw_assisted, CTLFLAG_RDTUN,
  122            &sfxge_tso_fw_assisted, 0,
  123            "Bitmask of FW-assisted TSO allowed to use if supported by NIC firmware");
  124 
  125 static const struct {
  126         const char *name;
  127         size_t offset;
  128 } sfxge_tx_stats[] = {
  129 #define SFXGE_TX_STAT(name, member) \
  130         { #name, offsetof(struct sfxge_txq, member) }
  131         SFXGE_TX_STAT(tso_bursts, tso_bursts),
  132         SFXGE_TX_STAT(tso_packets, tso_packets),
  133         SFXGE_TX_STAT(tso_long_headers, tso_long_headers),
  134         SFXGE_TX_STAT(tso_pdrop_too_many, tso_pdrop_too_many),
  135         SFXGE_TX_STAT(tso_pdrop_no_rsrc, tso_pdrop_no_rsrc),
  136         SFXGE_TX_STAT(tx_collapses, collapses),
  137         SFXGE_TX_STAT(tx_drops, drops),
  138         SFXGE_TX_STAT(tx_get_overflow, get_overflow),
  139         SFXGE_TX_STAT(tx_get_non_tcp_overflow, get_non_tcp_overflow),
  140         SFXGE_TX_STAT(tx_put_overflow, put_overflow),
  141         SFXGE_TX_STAT(tx_netdown_drops, netdown_drops),
  142 };
  143 
  144 /* Forward declarations. */
  145 static void sfxge_tx_qdpl_service(struct sfxge_txq *txq);
  146 static void sfxge_tx_qlist_post(struct sfxge_txq *txq);
  147 static void sfxge_tx_qunblock(struct sfxge_txq *txq);
  148 static int sfxge_tx_queue_tso(struct sfxge_txq *txq, struct mbuf *mbuf,
  149                               const bus_dma_segment_t *dma_seg, int n_dma_seg,
  150                               int n_extra_descs);
  151 
  152 static inline void
  153 sfxge_next_stmp(struct sfxge_txq *txq, struct sfxge_tx_mapping **pstmp)
  154 {
  155         KASSERT((*pstmp)->flags == 0, ("stmp flags are not 0"));
  156         if (__predict_false(*pstmp ==
  157                             &txq->stmp[txq->ptr_mask]))
  158                 *pstmp = &txq->stmp[0];
  159         else
  160                 (*pstmp)++;
  161 }
  162 
  163 static int
  164 sfxge_tx_maybe_toggle_cksum_offload(struct sfxge_txq *txq, struct mbuf *mbuf,
  165                                     struct sfxge_tx_mapping **pstmp)
  166 {
  167         uint16_t new_hw_cksum_flags;
  168         efx_desc_t *desc;
  169 
  170         if (mbuf->m_pkthdr.csum_flags &
  171             (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6 | CSUM_TSO)) {
  172                 /*
  173                  * We always set EFX_TXQ_CKSUM_IPV4 here because this
  174                  * configuration is the most useful, and this won't
  175                  * cause any trouble in case of IPv6 traffic anyway.
  176                  */
  177                 new_hw_cksum_flags = EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP;
  178         } else if (mbuf->m_pkthdr.csum_flags & CSUM_DELAY_IP) {
  179                 new_hw_cksum_flags = EFX_TXQ_CKSUM_IPV4;
  180         } else {
  181                 new_hw_cksum_flags = 0;
  182         }
  183 
  184         if (new_hw_cksum_flags == txq->hw_cksum_flags)
  185                 return (0);
  186 
  187         desc = &txq->pend_desc[txq->n_pend_desc];
  188         efx_tx_qdesc_checksum_create(txq->common, new_hw_cksum_flags, desc);
  189         txq->hw_cksum_flags = new_hw_cksum_flags;
  190         txq->n_pend_desc++;
  191 
  192         sfxge_next_stmp(txq, pstmp);
  193 
  194         return (1);
  195 }
  196 
  197 static int
  198 sfxge_tx_maybe_insert_tag(struct sfxge_txq *txq, struct mbuf *mbuf,
  199                           struct sfxge_tx_mapping **pstmp)
  200 {
  201         uint16_t this_tag = ((mbuf->m_flags & M_VLANTAG) ?
  202                              mbuf->m_pkthdr.ether_vtag :
  203                              0);
  204         efx_desc_t *desc;
  205 
  206         if (this_tag == txq->hw_vlan_tci)
  207                 return (0);
  208 
  209         desc = &txq->pend_desc[txq->n_pend_desc];
  210         efx_tx_qdesc_vlantci_create(txq->common, bswap16(this_tag), desc);
  211         txq->hw_vlan_tci = this_tag;
  212         txq->n_pend_desc++;
  213 
  214         sfxge_next_stmp(txq, pstmp);
  215 
  216         return (1);
  217 }
  218 
  219 void
  220 sfxge_tx_qcomplete(struct sfxge_txq *txq, struct sfxge_evq *evq)
  221 {
  222         unsigned int completed;
  223 
  224         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
  225 
  226         completed = txq->completed;
  227         while (completed != txq->pending) {
  228                 struct sfxge_tx_mapping *stmp;
  229                 unsigned int id;
  230 
  231                 id = completed++ & txq->ptr_mask;
  232 
  233                 stmp = &txq->stmp[id];
  234                 if (stmp->flags & TX_BUF_UNMAP) {
  235                         bus_dmamap_unload(txq->packet_dma_tag, stmp->map);
  236                         if (stmp->flags & TX_BUF_MBUF) {
  237                                 struct mbuf *m = stmp->u.mbuf;
  238                                 do
  239                                         m = m_free(m);
  240                                 while (m != NULL);
  241                         } else {
  242                                 free(stmp->u.heap_buf, M_SFXGE);
  243                         }
  244                         stmp->flags = 0;
  245                 }
  246         }
  247         txq->completed = completed;
  248 
  249         /* Check whether we need to unblock the queue. */
  250         mb();
  251         if (txq->blocked) {
  252                 unsigned int level;
  253 
  254                 level = txq->added - txq->completed;
  255                 if (level <= SFXGE_TXQ_UNBLOCK_LEVEL(txq->entries))
  256                         sfxge_tx_qunblock(txq);
  257         }
  258 }
  259 
  260 static unsigned int
  261 sfxge_is_mbuf_non_tcp(struct mbuf *mbuf)
  262 {
  263         /* Absence of TCP checksum flags does not mean that it is non-TCP
  264          * but it should be true if user wants to achieve high throughput.
  265          */
  266         return (!(mbuf->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)));
  267 }
  268 
  269 /*
  270  * Reorder the put list and append it to the get list.
  271  */
  272 static void
  273 sfxge_tx_qdpl_swizzle(struct sfxge_txq *txq)
  274 {
  275         struct sfxge_tx_dpl *stdp;
  276         struct mbuf *mbuf, *get_next, **get_tailp;
  277         volatile uintptr_t *putp;
  278         uintptr_t put;
  279         unsigned int count;
  280         unsigned int non_tcp_count;
  281 
  282         SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
  283 
  284         stdp = &txq->dpl;
  285 
  286         /* Acquire the put list. */
  287         putp = &stdp->std_put;
  288         put = atomic_readandclear_ptr(putp);
  289         mbuf = (void *)put;
  290 
  291         if (mbuf == NULL)
  292                 return;
  293 
  294         /* Reverse the put list. */
  295         get_tailp = &mbuf->m_nextpkt;
  296         get_next = NULL;
  297 
  298         count = 0;
  299         non_tcp_count = 0;
  300         do {
  301                 struct mbuf *put_next;
  302 
  303                 non_tcp_count += sfxge_is_mbuf_non_tcp(mbuf);
  304                 put_next = mbuf->m_nextpkt;
  305                 mbuf->m_nextpkt = get_next;
  306                 get_next = mbuf;
  307                 mbuf = put_next;
  308 
  309                 count++;
  310         } while (mbuf != NULL);
  311 
  312         if (count > stdp->std_put_hiwat)
  313                 stdp->std_put_hiwat = count;
  314 
  315         /* Append the reversed put list to the get list. */
  316         KASSERT(*get_tailp == NULL, ("*get_tailp != NULL"));
  317         *stdp->std_getp = get_next;
  318         stdp->std_getp = get_tailp;
  319         stdp->std_get_count += count;
  320         stdp->std_get_non_tcp_count += non_tcp_count;
  321 }
  322 
  323 static void
  324 sfxge_tx_qreap(struct sfxge_txq *txq)
  325 {
  326         SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
  327 
  328         txq->reaped = txq->completed;
  329 }
  330 
  331 static void
  332 sfxge_tx_qlist_post(struct sfxge_txq *txq)
  333 {
  334         unsigned int old_added __diagused;
  335         unsigned int block_level;
  336         unsigned int level;
  337         int rc __diagused;
  338 
  339         SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
  340 
  341         KASSERT(txq->n_pend_desc != 0, ("txq->n_pend_desc == 0"));
  342         KASSERT(txq->n_pend_desc <= txq->max_pkt_desc,
  343                 ("txq->n_pend_desc too large"));
  344         KASSERT(!txq->blocked, ("txq->blocked"));
  345 
  346         old_added = txq->added;
  347 
  348         /* Post the fragment list. */
  349         rc = efx_tx_qdesc_post(txq->common, txq->pend_desc, txq->n_pend_desc,
  350                           txq->reaped, &txq->added);
  351         KASSERT(rc == 0, ("efx_tx_qdesc_post() failed"));
  352 
  353         /* If efx_tx_qdesc_post() had to refragment, our information about
  354          * buffers to free may be associated with the wrong
  355          * descriptors.
  356          */
  357         KASSERT(txq->added - old_added == txq->n_pend_desc,
  358                 ("efx_tx_qdesc_post() refragmented descriptors"));
  359 
  360         level = txq->added - txq->reaped;
  361         KASSERT(level <= txq->entries, ("overfilled TX queue"));
  362 
  363         /* Clear the fragment list. */
  364         txq->n_pend_desc = 0;
  365 
  366         /*
  367          * Set the block level to ensure there is space to generate a
  368          * large number of descriptors for TSO.
  369          */
  370         block_level = EFX_TXQ_LIMIT(txq->entries) - txq->max_pkt_desc;
  371 
  372         /* Have we reached the block level? */
  373         if (level < block_level)
  374                 return;
  375 
  376         /* Reap, and check again */
  377         sfxge_tx_qreap(txq);
  378         level = txq->added - txq->reaped;
  379         if (level < block_level)
  380                 return;
  381 
  382         txq->blocked = 1;
  383 
  384         /*
  385          * Avoid a race with completion interrupt handling that could leave
  386          * the queue blocked.
  387          */
  388         mb();
  389         sfxge_tx_qreap(txq);
  390         level = txq->added - txq->reaped;
  391         if (level < block_level) {
  392                 mb();
  393                 txq->blocked = 0;
  394         }
  395 }
  396 
  397 static int sfxge_tx_queue_mbuf(struct sfxge_txq *txq, struct mbuf *mbuf)
  398 {
  399         bus_dmamap_t *used_map;
  400         bus_dmamap_t map;
  401         bus_dma_segment_t dma_seg[SFXGE_TX_MAPPING_MAX_SEG];
  402         unsigned int id;
  403         struct sfxge_tx_mapping *stmp;
  404         efx_desc_t *desc;
  405         int n_dma_seg;
  406         int rc;
  407         int i;
  408         int eop;
  409         uint16_t hw_cksum_flags_prev;
  410         uint16_t hw_vlan_tci_prev;
  411         int n_extra_descs;
  412 
  413         KASSERT(!txq->blocked, ("txq->blocked"));
  414 
  415 #if SFXGE_TX_PARSE_EARLY
  416         /*
  417          * If software TSO is used, we still need to copy packet header,
  418          * even if we have already parsed it early before enqueue.
  419          */
  420         if ((mbuf->m_pkthdr.csum_flags & CSUM_TSO) &&
  421             (txq->tso_fw_assisted == 0))
  422                 prefetch_read_many(mbuf->m_data);
  423 #else
  424         /*
  425          * Prefetch packet header since we need to parse it and extract
  426          * IP ID, TCP sequence number and flags.
  427          */
  428         if (mbuf->m_pkthdr.csum_flags & CSUM_TSO)
  429                 prefetch_read_many(mbuf->m_data);
  430 #endif
  431 
  432         if (__predict_false(txq->init_state != SFXGE_TXQ_STARTED)) {
  433                 rc = EINTR;
  434                 goto reject;
  435         }
  436 
  437         /* Load the packet for DMA. */
  438         id = txq->added & txq->ptr_mask;
  439         stmp = &txq->stmp[id];
  440         rc = bus_dmamap_load_mbuf_sg(txq->packet_dma_tag, stmp->map,
  441                                      mbuf, dma_seg, &n_dma_seg, 0);
  442         if (rc == EFBIG) {
  443                 /* Try again. */
  444                 struct mbuf *new_mbuf = m_collapse(mbuf, M_NOWAIT,
  445                                                    SFXGE_TX_MAPPING_MAX_SEG);
  446                 if (new_mbuf == NULL)
  447                         goto reject;
  448                 ++txq->collapses;
  449                 mbuf = new_mbuf;
  450                 rc = bus_dmamap_load_mbuf_sg(txq->packet_dma_tag,
  451                                              stmp->map, mbuf,
  452                                              dma_seg, &n_dma_seg, 0);
  453         }
  454         if (rc != 0)
  455                 goto reject;
  456 
  457         /* Make the packet visible to the hardware. */
  458         bus_dmamap_sync(txq->packet_dma_tag, stmp->map, BUS_DMASYNC_PREWRITE);
  459 
  460         used_map = &stmp->map;
  461 
  462         hw_cksum_flags_prev = txq->hw_cksum_flags;
  463         hw_vlan_tci_prev = txq->hw_vlan_tci;
  464 
  465         /*
  466          * The order of option descriptors, which are used to leverage VLAN tag
  467          * and checksum offloads, might be important. Changing checksum offload
  468          * between VLAN option and packet descriptors probably does not work.
  469          */
  470         n_extra_descs = sfxge_tx_maybe_toggle_cksum_offload(txq, mbuf, &stmp);
  471         n_extra_descs += sfxge_tx_maybe_insert_tag(txq, mbuf, &stmp);
  472 
  473         if (mbuf->m_pkthdr.csum_flags & CSUM_TSO) {
  474                 rc = sfxge_tx_queue_tso(txq, mbuf, dma_seg, n_dma_seg,
  475                                         n_extra_descs);
  476                 if (rc < 0)
  477                         goto reject_mapped;
  478                 stmp = &txq->stmp[(rc - 1) & txq->ptr_mask];
  479         } else {
  480                 /* Add the mapping to the fragment list, and set flags
  481                  * for the buffer.
  482                  */
  483 
  484                 i = 0;
  485                 for (;;) {
  486                         desc = &txq->pend_desc[i + n_extra_descs];
  487                         eop = (i == n_dma_seg - 1);
  488                         efx_tx_qdesc_dma_create(txq->common,
  489                                                 dma_seg[i].ds_addr,
  490                                                 dma_seg[i].ds_len,
  491                                                 eop,
  492                                                 desc);
  493                         if (eop)
  494                                 break;
  495                         i++;
  496                         sfxge_next_stmp(txq, &stmp);
  497                 }
  498                 txq->n_pend_desc = n_dma_seg + n_extra_descs;
  499         }
  500 
  501         /*
  502          * If the mapping required more than one descriptor
  503          * then we need to associate the DMA map with the last
  504          * descriptor, not the first.
  505          */
  506         if (used_map != &stmp->map) {
  507                 map = stmp->map;
  508                 stmp->map = *used_map;
  509                 *used_map = map;
  510         }
  511 
  512         stmp->u.mbuf = mbuf;
  513         stmp->flags = TX_BUF_UNMAP | TX_BUF_MBUF;
  514 
  515         /* Post the fragment list. */
  516         sfxge_tx_qlist_post(txq);
  517 
  518         return (0);
  519 
  520 reject_mapped:
  521         txq->hw_vlan_tci = hw_vlan_tci_prev;
  522         txq->hw_cksum_flags = hw_cksum_flags_prev;
  523         bus_dmamap_unload(txq->packet_dma_tag, *used_map);
  524 reject:
  525         /* Drop the packet on the floor. */
  526         m_freem(mbuf);
  527         ++txq->drops;
  528 
  529         return (rc);
  530 }
  531 
  532 /*
  533  * Drain the deferred packet list into the transmit queue.
  534  */
  535 static void
  536 sfxge_tx_qdpl_drain(struct sfxge_txq *txq)
  537 {
  538         struct sfxge_softc *sc;
  539         struct sfxge_tx_dpl *stdp;
  540         struct mbuf *mbuf, *next;
  541         unsigned int count;
  542         unsigned int non_tcp_count;
  543         unsigned int pushed;
  544         int rc;
  545 
  546         SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
  547 
  548         sc = txq->sc;
  549         stdp = &txq->dpl;
  550         pushed = txq->added;
  551 
  552         if (__predict_true(txq->init_state == SFXGE_TXQ_STARTED)) {
  553                 prefetch_read_many(sc->enp);
  554                 prefetch_read_many(txq->common);
  555         }
  556 
  557         mbuf = stdp->std_get;
  558         count = stdp->std_get_count;
  559         non_tcp_count = stdp->std_get_non_tcp_count;
  560 
  561         if (count > stdp->std_get_hiwat)
  562                 stdp->std_get_hiwat = count;
  563 
  564         while (count != 0) {
  565                 KASSERT(mbuf != NULL, ("mbuf == NULL"));
  566 
  567                 next = mbuf->m_nextpkt;
  568                 mbuf->m_nextpkt = NULL;
  569 
  570                 ETHER_BPF_MTAP(sc->ifnet, mbuf); /* packet capture */
  571 
  572                 if (next != NULL)
  573                         prefetch_read_many(next);
  574 
  575                 rc = sfxge_tx_queue_mbuf(txq, mbuf);
  576                 --count;
  577                 non_tcp_count -= sfxge_is_mbuf_non_tcp(mbuf);
  578                 mbuf = next;
  579                 if (rc != 0)
  580                         continue;
  581 
  582                 if (txq->blocked)
  583                         break;
  584 
  585                 /* Push the fragments to the hardware in batches. */
  586                 if (txq->added - pushed >= SFXGE_TX_BATCH) {
  587                         efx_tx_qpush(txq->common, txq->added, pushed);
  588                         pushed = txq->added;
  589                 }
  590         }
  591 
  592         if (count == 0) {
  593                 KASSERT(mbuf == NULL, ("mbuf != NULL"));
  594                 KASSERT(non_tcp_count == 0,
  595                         ("inconsistent TCP/non-TCP detection"));
  596                 stdp->std_get = NULL;
  597                 stdp->std_get_count = 0;
  598                 stdp->std_get_non_tcp_count = 0;
  599                 stdp->std_getp = &stdp->std_get;
  600         } else {
  601                 stdp->std_get = mbuf;
  602                 stdp->std_get_count = count;
  603                 stdp->std_get_non_tcp_count = non_tcp_count;
  604         }
  605 
  606         if (txq->added != pushed)
  607                 efx_tx_qpush(txq->common, txq->added, pushed);
  608 
  609         KASSERT(txq->blocked || stdp->std_get_count == 0,
  610                 ("queue unblocked but count is non-zero"));
  611 }
  612 
  613 #define SFXGE_TX_QDPL_PENDING(_txq)     ((_txq)->dpl.std_put != 0)
  614 
  615 /*
  616  * Service the deferred packet list.
  617  *
  618  * NOTE: drops the txq mutex!
  619  */
  620 static void
  621 sfxge_tx_qdpl_service(struct sfxge_txq *txq)
  622 {
  623         SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
  624 
  625         do {
  626                 if (SFXGE_TX_QDPL_PENDING(txq))
  627                         sfxge_tx_qdpl_swizzle(txq);
  628 
  629                 if (!txq->blocked)
  630                         sfxge_tx_qdpl_drain(txq);
  631 
  632                 SFXGE_TXQ_UNLOCK(txq);
  633         } while (SFXGE_TX_QDPL_PENDING(txq) &&
  634                  SFXGE_TXQ_TRYLOCK(txq));
  635 }
  636 
  637 /*
  638  * Put a packet on the deferred packet get-list.
  639  */
  640 static int
  641 sfxge_tx_qdpl_put_locked(struct sfxge_txq *txq, struct mbuf *mbuf)
  642 {
  643         struct sfxge_tx_dpl *stdp;
  644 
  645         stdp = &txq->dpl;
  646 
  647         KASSERT(mbuf->m_nextpkt == NULL, ("mbuf->m_nextpkt != NULL"));
  648 
  649         SFXGE_TXQ_LOCK_ASSERT_OWNED(txq);
  650 
  651         if (stdp->std_get_count >= stdp->std_get_max) {
  652                 txq->get_overflow++;
  653                 return (ENOBUFS);
  654         }
  655         if (sfxge_is_mbuf_non_tcp(mbuf)) {
  656                 if (stdp->std_get_non_tcp_count >=
  657                     stdp->std_get_non_tcp_max) {
  658                         txq->get_non_tcp_overflow++;
  659                         return (ENOBUFS);
  660                 }
  661                 stdp->std_get_non_tcp_count++;
  662         }
  663 
  664         *(stdp->std_getp) = mbuf;
  665         stdp->std_getp = &mbuf->m_nextpkt;
  666         stdp->std_get_count++;
  667 
  668         return (0);
  669 }
  670 
  671 /*
  672  * Put a packet on the deferred packet put-list.
  673  *
  674  * We overload the csum_data field in the mbuf to keep track of this length
  675  * because there is no cheap alternative to avoid races.
  676  */
  677 static int
  678 sfxge_tx_qdpl_put_unlocked(struct sfxge_txq *txq, struct mbuf *mbuf)
  679 {
  680         struct sfxge_tx_dpl *stdp;
  681         volatile uintptr_t *putp;
  682         uintptr_t old;
  683         uintptr_t new;
  684         unsigned int put_count;
  685 
  686         KASSERT(mbuf->m_nextpkt == NULL, ("mbuf->m_nextpkt != NULL"));
  687 
  688         SFXGE_TXQ_LOCK_ASSERT_NOTOWNED(txq);
  689 
  690         stdp = &txq->dpl;
  691         putp = &stdp->std_put;
  692         new = (uintptr_t)mbuf;
  693 
  694         do {
  695                 old = *putp;
  696                 if (old != 0) {
  697                         struct mbuf *mp = (struct mbuf *)old;
  698                         put_count = mp->m_pkthdr.csum_data;
  699                 } else
  700                         put_count = 0;
  701                 if (put_count >= stdp->std_put_max) {
  702                         atomic_add_long(&txq->put_overflow, 1);
  703                         return (ENOBUFS);
  704                 }
  705                 mbuf->m_pkthdr.csum_data = put_count + 1;
  706                 mbuf->m_nextpkt = (void *)old;
  707         } while (atomic_cmpset_ptr(putp, old, new) == 0);
  708 
  709         return (0);
  710 }
  711 
  712 /*
  713  * Called from if_transmit - will try to grab the txq lock and enqueue to the
  714  * put list if it succeeds, otherwise try to push onto the defer list if space.
  715  */
  716 static int
  717 sfxge_tx_packet_add(struct sfxge_txq *txq, struct mbuf *m)
  718 {
  719         int rc;
  720 
  721         if (!SFXGE_LINK_UP(txq->sc)) {
  722                 atomic_add_long(&txq->netdown_drops, 1);
  723                 return (ENETDOWN);
  724         }
  725 
  726         /*
  727          * Try to grab the txq lock.  If we are able to get the lock,
  728          * the packet will be appended to the "get list" of the deferred
  729          * packet list.  Otherwise, it will be pushed on the "put list".
  730          */
  731         if (SFXGE_TXQ_TRYLOCK(txq)) {
  732                 /* First swizzle put-list to get-list to keep order */
  733                 sfxge_tx_qdpl_swizzle(txq);
  734 
  735                 rc = sfxge_tx_qdpl_put_locked(txq, m);
  736 
  737                 /* Try to service the list. */
  738                 sfxge_tx_qdpl_service(txq);
  739                 /* Lock has been dropped. */
  740         } else {
  741                 rc = sfxge_tx_qdpl_put_unlocked(txq, m);
  742 
  743                 /*
  744                  * Try to grab the lock again.
  745                  *
  746                  * If we are able to get the lock, we need to process
  747                  * the deferred packet list.  If we are not able to get
  748                  * the lock, another thread is processing the list.
  749                  */
  750                 if ((rc == 0) && SFXGE_TXQ_TRYLOCK(txq)) {
  751                         sfxge_tx_qdpl_service(txq);
  752                         /* Lock has been dropped. */
  753                 }
  754         }
  755 
  756         SFXGE_TXQ_LOCK_ASSERT_NOTOWNED(txq);
  757 
  758         return (rc);
  759 }
  760 
  761 static void
  762 sfxge_tx_qdpl_flush(struct sfxge_txq *txq)
  763 {
  764         struct sfxge_tx_dpl *stdp = &txq->dpl;
  765         struct mbuf *mbuf, *next;
  766 
  767         SFXGE_TXQ_LOCK(txq);
  768 
  769         sfxge_tx_qdpl_swizzle(txq);
  770         for (mbuf = stdp->std_get; mbuf != NULL; mbuf = next) {
  771                 next = mbuf->m_nextpkt;
  772                 m_freem(mbuf);
  773         }
  774         stdp->std_get = NULL;
  775         stdp->std_get_count = 0;
  776         stdp->std_get_non_tcp_count = 0;
  777         stdp->std_getp = &stdp->std_get;
  778 
  779         SFXGE_TXQ_UNLOCK(txq);
  780 }
  781 
  782 void
  783 sfxge_if_qflush(struct ifnet *ifp)
  784 {
  785         struct sfxge_softc *sc;
  786         unsigned int i;
  787 
  788         sc = ifp->if_softc;
  789 
  790         for (i = 0; i < sc->txq_count; i++)
  791                 sfxge_tx_qdpl_flush(sc->txq[i]);
  792 }
  793 
  794 #if SFXGE_TX_PARSE_EARLY
  795 
  796 /* There is little space for user data in mbuf pkthdr, so we
  797  * use l*hlen fields which are not used by the driver otherwise
  798  * to store header offsets.
  799  * The fields are 8-bit, but it's ok, no header may be longer than 255 bytes.
  800  */
  801 
  802 #define TSO_MBUF_PROTO(_mbuf)    ((_mbuf)->m_pkthdr.PH_loc.sixteen[0])
  803 /* We abuse l5hlen here because PH_loc can hold only 64 bits of data */
  804 #define TSO_MBUF_FLAGS(_mbuf)    ((_mbuf)->m_pkthdr.l5hlen)
  805 #define TSO_MBUF_PACKETID(_mbuf) ((_mbuf)->m_pkthdr.PH_loc.sixteen[1])
  806 #define TSO_MBUF_SEQNUM(_mbuf)   ((_mbuf)->m_pkthdr.PH_loc.thirtytwo[1])
  807 
  808 static void sfxge_parse_tx_packet(struct mbuf *mbuf)
  809 {
  810         struct ether_header *eh = mtod(mbuf, struct ether_header *);
  811         const struct tcphdr *th;
  812         struct tcphdr th_copy;
  813 
  814         /* Find network protocol and header */
  815         TSO_MBUF_PROTO(mbuf) = eh->ether_type;
  816         if (TSO_MBUF_PROTO(mbuf) == htons(ETHERTYPE_VLAN)) {
  817                 struct ether_vlan_header *veh =
  818                         mtod(mbuf, struct ether_vlan_header *);
  819                 TSO_MBUF_PROTO(mbuf) = veh->evl_proto;
  820                 mbuf->m_pkthdr.l2hlen = sizeof(*veh);
  821         } else {
  822                 mbuf->m_pkthdr.l2hlen = sizeof(*eh);
  823         }
  824 
  825         /* Find TCP header */
  826         if (TSO_MBUF_PROTO(mbuf) == htons(ETHERTYPE_IP)) {
  827                 const struct ip *iph = (const struct ip *)mtodo(mbuf, mbuf->m_pkthdr.l2hlen);
  828 
  829                 KASSERT(iph->ip_p == IPPROTO_TCP,
  830                         ("TSO required on non-TCP packet"));
  831                 mbuf->m_pkthdr.l3hlen = mbuf->m_pkthdr.l2hlen + 4 * iph->ip_hl;
  832                 TSO_MBUF_PACKETID(mbuf) = iph->ip_id;
  833         } else {
  834                 KASSERT(TSO_MBUF_PROTO(mbuf) == htons(ETHERTYPE_IPV6),
  835                         ("TSO required on non-IP packet"));
  836                 KASSERT(((const struct ip6_hdr *)mtodo(mbuf, mbuf->m_pkthdr.l2hlen))->ip6_nxt ==
  837                         IPPROTO_TCP,
  838                         ("TSO required on non-TCP packet"));
  839                 mbuf->m_pkthdr.l3hlen = mbuf->m_pkthdr.l2hlen + sizeof(struct ip6_hdr);
  840                 TSO_MBUF_PACKETID(mbuf) = 0;
  841         }
  842 
  843         KASSERT(mbuf->m_len >= mbuf->m_pkthdr.l3hlen,
  844                 ("network header is fragmented in mbuf"));
  845 
  846         /* We need TCP header including flags (window is the next) */
  847         if (mbuf->m_len < mbuf->m_pkthdr.l3hlen + offsetof(struct tcphdr, th_win)) {
  848                 m_copydata(mbuf, mbuf->m_pkthdr.l3hlen, sizeof(th_copy),
  849                            (caddr_t)&th_copy);
  850                 th = &th_copy;
  851         } else {
  852                 th = (const struct tcphdr *)mtodo(mbuf, mbuf->m_pkthdr.l3hlen);
  853         }
  854 
  855         mbuf->m_pkthdr.l4hlen = mbuf->m_pkthdr.l3hlen + 4 * th->th_off;
  856         TSO_MBUF_SEQNUM(mbuf) = ntohl(th->th_seq);
  857 
  858         /* These flags must not be duplicated */
  859         /*
  860          * RST should not be duplicated as well, but FreeBSD kernel
  861          * generates TSO packets with RST flag. So, do not assert
  862          * its absence.
  863          */
  864         KASSERT(!(th->th_flags & (TH_URG | TH_SYN)),
  865                 ("incompatible TCP flag 0x%x on TSO packet",
  866                  th->th_flags & (TH_URG | TH_SYN)));
  867         TSO_MBUF_FLAGS(mbuf) = th->th_flags;
  868 }
  869 #endif
  870 
  871 /*
  872  * TX start -- called by the stack.
  873  */
  874 int
  875 sfxge_if_transmit(struct ifnet *ifp, struct mbuf *m)
  876 {
  877         struct sfxge_softc *sc;
  878         struct sfxge_txq *txq;
  879         int rc;
  880 
  881         sc = (struct sfxge_softc *)ifp->if_softc;
  882 
  883         /*
  884          * Transmit may be called when interface is up from the kernel
  885          * point of view, but not yet up (in progress) from the driver
  886          * point of view. I.e. link aggregation bring up.
  887          * Transmit may be called when interface is up from the driver
  888          * point of view, but already down from the kernel point of
  889          * view. I.e. Rx when interface shutdown is in progress.
  890          */
  891         KASSERT((ifp->if_flags & IFF_UP) || (sc->if_flags & IFF_UP),
  892                 ("interface not up"));
  893 
  894         /* Pick the desired transmit queue. */
  895         if (sc->txq_dynamic_cksum_toggle_supported |
  896             (m->m_pkthdr.csum_flags &
  897              (CSUM_DELAY_DATA | CSUM_TCP_IPV6 | CSUM_UDP_IPV6 | CSUM_TSO))) {
  898                 int index = 0;
  899 
  900 #ifdef RSS
  901                 uint32_t bucket_id;
  902 
  903                 /*
  904                  * Select a TX queue which matches the corresponding
  905                  * RX queue for the hash in order to assign both
  906                  * TX and RX parts of the flow to the same CPU
  907                  */
  908                 if (rss_m2bucket(m, &bucket_id) == 0)
  909                         index = bucket_id % (sc->txq_count - (SFXGE_TXQ_NTYPES - 1));
  910 #else
  911                 /* check if flowid is set */
  912                 if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) {
  913                         uint32_t hash = m->m_pkthdr.flowid;
  914                         uint32_t idx = hash % nitems(sc->rx_indir_table);
  915 
  916                         index = sc->rx_indir_table[idx];
  917                 }
  918 #endif
  919 #if SFXGE_TX_PARSE_EARLY
  920                 if (m->m_pkthdr.csum_flags & CSUM_TSO)
  921                         sfxge_parse_tx_packet(m);
  922 #endif
  923                 index += (sc->txq_dynamic_cksum_toggle_supported == B_FALSE) ?
  924                          SFXGE_TXQ_IP_TCP_UDP_CKSUM : 0;
  925                 txq = sc->txq[index];
  926         } else if (m->m_pkthdr.csum_flags & CSUM_DELAY_IP) {
  927                 txq = sc->txq[SFXGE_TXQ_IP_CKSUM];
  928         } else {
  929                 txq = sc->txq[SFXGE_TXQ_NON_CKSUM];
  930         }
  931 
  932         rc = sfxge_tx_packet_add(txq, m);
  933         if (rc != 0)
  934                 m_freem(m);
  935 
  936         return (rc);
  937 }
  938 
  939 /*
  940  * Software "TSO".  Not quite as good as doing it in hardware, but
  941  * still faster than segmenting in the stack.
  942  */
  943 
  944 struct sfxge_tso_state {
  945         /* Output position */
  946         unsigned out_len;       /* Remaining length in current segment */
  947         unsigned seqnum;        /* Current sequence number */
  948         unsigned packet_space;  /* Remaining space in current packet */
  949         unsigned segs_space;    /* Remaining number of DMA segments
  950                                    for the packet (FATSOv2 only) */
  951 
  952         /* Input position */
  953         uint64_t dma_addr;      /* DMA address of current position */
  954         unsigned in_len;        /* Remaining length in current mbuf */
  955 
  956         const struct mbuf *mbuf; /* Input mbuf (head of chain) */
  957         u_short protocol;       /* Network protocol (after VLAN decap) */
  958         ssize_t nh_off;         /* Offset of network header */
  959         ssize_t tcph_off;       /* Offset of TCP header */
  960         unsigned header_len;    /* Number of bytes of header */
  961         unsigned seg_size;      /* TCP segment size */
  962         int fw_assisted;        /* Use FW-assisted TSO */
  963         u_short packet_id;      /* IPv4 packet ID from the original packet */
  964         uint8_t tcp_flags;      /* TCP flags */
  965         efx_desc_t header_desc; /* Precomputed header descriptor for
  966                                  * FW-assisted TSO */
  967 };
  968 
  969 #if !SFXGE_TX_PARSE_EARLY
  970 static const struct ip *tso_iph(const struct sfxge_tso_state *tso)
  971 {
  972         KASSERT(tso->protocol == htons(ETHERTYPE_IP),
  973                 ("tso_iph() in non-IPv4 state"));
  974         return (const struct ip *)(tso->mbuf->m_data + tso->nh_off);
  975 }
  976 
  977 static __unused const struct ip6_hdr *tso_ip6h(const struct sfxge_tso_state *tso)
  978 {
  979         KASSERT(tso->protocol == htons(ETHERTYPE_IPV6),
  980                 ("tso_ip6h() in non-IPv6 state"));
  981         return (const struct ip6_hdr *)(tso->mbuf->m_data + tso->nh_off);
  982 }
  983 
  984 static const struct tcphdr *tso_tcph(const struct sfxge_tso_state *tso)
  985 {
  986         return (const struct tcphdr *)(tso->mbuf->m_data + tso->tcph_off);
  987 }
  988 #endif
  989 
  990 /* Size of preallocated TSO header buffers.  Larger blocks must be
  991  * allocated from the heap.
  992  */
  993 #define TSOH_STD_SIZE   128
  994 
  995 /* At most half the descriptors in the queue at any time will refer to
  996  * a TSO header buffer, since they must always be followed by a
  997  * payload descriptor referring to an mbuf.
  998  */
  999 #define TSOH_COUNT(_txq_entries)        ((_txq_entries) / 2u)
 1000 #define TSOH_PER_PAGE   (PAGE_SIZE / TSOH_STD_SIZE)
 1001 #define TSOH_PAGE_COUNT(_txq_entries)   \
 1002         howmany(TSOH_COUNT(_txq_entries), TSOH_PER_PAGE)
 1003 
 1004 static int tso_init(struct sfxge_txq *txq)
 1005 {
 1006         struct sfxge_softc *sc = txq->sc;
 1007         unsigned int tsoh_page_count = TSOH_PAGE_COUNT(sc->txq_entries);
 1008         int i, rc;
 1009 
 1010         /* Allocate TSO header buffers */
 1011         txq->tsoh_buffer = malloc(tsoh_page_count * sizeof(txq->tsoh_buffer[0]),
 1012                                   M_SFXGE, M_WAITOK);
 1013 
 1014         for (i = 0; i < tsoh_page_count; i++) {
 1015                 rc = sfxge_dma_alloc(sc, PAGE_SIZE, &txq->tsoh_buffer[i]);
 1016                 if (rc != 0)
 1017                         goto fail;
 1018         }
 1019 
 1020         return (0);
 1021 
 1022 fail:
 1023         while (i-- > 0)
 1024                 sfxge_dma_free(&txq->tsoh_buffer[i]);
 1025         free(txq->tsoh_buffer, M_SFXGE);
 1026         txq->tsoh_buffer = NULL;
 1027         return (rc);
 1028 }
 1029 
 1030 static void tso_fini(struct sfxge_txq *txq)
 1031 {
 1032         int i;
 1033 
 1034         if (txq->tsoh_buffer != NULL) {
 1035                 for (i = 0; i < TSOH_PAGE_COUNT(txq->sc->txq_entries); i++)
 1036                         sfxge_dma_free(&txq->tsoh_buffer[i]);
 1037                 free(txq->tsoh_buffer, M_SFXGE);
 1038         }
 1039 }
 1040 
 1041 static void tso_start(struct sfxge_txq *txq, struct sfxge_tso_state *tso,
 1042                       const bus_dma_segment_t *hdr_dma_seg,
 1043                       struct mbuf *mbuf)
 1044 {
 1045         const efx_nic_cfg_t *encp = efx_nic_cfg_get(txq->sc->enp);
 1046 #if !SFXGE_TX_PARSE_EARLY
 1047         struct ether_header *eh = mtod(mbuf, struct ether_header *);
 1048         const struct tcphdr *th;
 1049         struct tcphdr th_copy;
 1050 #endif
 1051 
 1052         tso->fw_assisted = txq->tso_fw_assisted;
 1053         tso->mbuf = mbuf;
 1054 
 1055         /* Find network protocol and header */
 1056 #if !SFXGE_TX_PARSE_EARLY
 1057         tso->protocol = eh->ether_type;
 1058         if (tso->protocol == htons(ETHERTYPE_VLAN)) {
 1059                 struct ether_vlan_header *veh =
 1060                         mtod(mbuf, struct ether_vlan_header *);
 1061                 tso->protocol = veh->evl_proto;
 1062                 tso->nh_off = sizeof(*veh);
 1063         } else {
 1064                 tso->nh_off = sizeof(*eh);
 1065         }
 1066 #else
 1067         tso->protocol = TSO_MBUF_PROTO(mbuf);
 1068         tso->nh_off = mbuf->m_pkthdr.l2hlen;
 1069         tso->tcph_off = mbuf->m_pkthdr.l3hlen;
 1070         tso->packet_id = ntohs(TSO_MBUF_PACKETID(mbuf));
 1071 #endif
 1072 
 1073 #if !SFXGE_TX_PARSE_EARLY
 1074         /* Find TCP header */
 1075         if (tso->protocol == htons(ETHERTYPE_IP)) {
 1076                 KASSERT(tso_iph(tso)->ip_p == IPPROTO_TCP,
 1077                         ("TSO required on non-TCP packet"));
 1078                 tso->tcph_off = tso->nh_off + 4 * tso_iph(tso)->ip_hl;
 1079                 tso->packet_id = ntohs(tso_iph(tso)->ip_id);
 1080         } else {
 1081                 KASSERT(tso->protocol == htons(ETHERTYPE_IPV6),
 1082                         ("TSO required on non-IP packet"));
 1083                 KASSERT(tso_ip6h(tso)->ip6_nxt == IPPROTO_TCP,
 1084                         ("TSO required on non-TCP packet"));
 1085                 tso->tcph_off = tso->nh_off + sizeof(struct ip6_hdr);
 1086                 tso->packet_id = 0;
 1087         }
 1088 #endif
 1089 
 1090         if (tso->fw_assisted &&
 1091             __predict_false(tso->tcph_off >
 1092                             encp->enc_tx_tso_tcp_header_offset_limit)) {
 1093                 tso->fw_assisted = 0;
 1094         }
 1095 
 1096 #if !SFXGE_TX_PARSE_EARLY
 1097         KASSERT(mbuf->m_len >= tso->tcph_off,
 1098                 ("network header is fragmented in mbuf"));
 1099         /* We need TCP header including flags (window is the next) */
 1100         if (mbuf->m_len < tso->tcph_off + offsetof(struct tcphdr, th_win)) {
 1101                 m_copydata(tso->mbuf, tso->tcph_off, sizeof(th_copy),
 1102                            (caddr_t)&th_copy);
 1103                 th = &th_copy;
 1104         } else {
 1105                 th = tso_tcph(tso);
 1106         }
 1107         tso->header_len = tso->tcph_off + 4 * th->th_off;
 1108 #else
 1109         tso->header_len = mbuf->m_pkthdr.l4hlen;
 1110 #endif
 1111         tso->seg_size = mbuf->m_pkthdr.tso_segsz;
 1112 
 1113 #if !SFXGE_TX_PARSE_EARLY
 1114         tso->seqnum = ntohl(th->th_seq);
 1115 
 1116         /* These flags must not be duplicated */
 1117         /*
 1118          * RST should not be duplicated as well, but FreeBSD kernel
 1119          * generates TSO packets with RST flag. So, do not assert
 1120          * its absence.
 1121          */
 1122         KASSERT(!(th->th_flags & (TH_URG | TH_SYN)),
 1123                 ("incompatible TCP flag 0x%x on TSO packet",
 1124                  th->th_flags & (TH_URG | TH_SYN)));
 1125         tso->tcp_flags = th->th_flags;
 1126 #else
 1127         tso->seqnum = TSO_MBUF_SEQNUM(mbuf);
 1128         tso->tcp_flags = TSO_MBUF_FLAGS(mbuf);
 1129 #endif
 1130 
 1131         tso->out_len = mbuf->m_pkthdr.len - tso->header_len;
 1132 
 1133         if (tso->fw_assisted) {
 1134                 if (hdr_dma_seg->ds_len >= tso->header_len)
 1135                         efx_tx_qdesc_dma_create(txq->common,
 1136                                                 hdr_dma_seg->ds_addr,
 1137                                                 tso->header_len,
 1138                                                 B_FALSE,
 1139                                                 &tso->header_desc);
 1140                 else
 1141                         tso->fw_assisted = 0;
 1142         }
 1143 }
 1144 
 1145 /*
 1146  * tso_fill_packet_with_fragment - form descriptors for the current fragment
 1147  *
 1148  * Form descriptors for the current fragment, until we reach the end
 1149  * of fragment or end-of-packet.  Return 0 on success, 1 if not enough
 1150  * space.
 1151  */
 1152 static void tso_fill_packet_with_fragment(struct sfxge_txq *txq,
 1153                                           struct sfxge_tso_state *tso)
 1154 {
 1155         efx_desc_t *desc;
 1156         int n;
 1157         uint64_t dma_addr = tso->dma_addr;
 1158         boolean_t eop;
 1159 
 1160         if (tso->in_len == 0 || tso->packet_space == 0)
 1161                 return;
 1162 
 1163         KASSERT(tso->in_len > 0, ("TSO input length went negative"));
 1164         KASSERT(tso->packet_space > 0, ("TSO packet space went negative"));
 1165 
 1166         if (tso->fw_assisted & SFXGE_FATSOV2) {
 1167                 n = tso->in_len;
 1168                 tso->out_len -= n;
 1169                 tso->seqnum += n;
 1170                 tso->in_len = 0;
 1171                 if (n < tso->packet_space) {
 1172                         tso->packet_space -= n;
 1173                         tso->segs_space--;
 1174                 } else {
 1175                         tso->packet_space = tso->seg_size -
 1176                             (n - tso->packet_space) % tso->seg_size;
 1177                         tso->segs_space =
 1178                             EFX_TX_FATSOV2_DMA_SEGS_PER_PKT_MAX - 1 -
 1179                             (tso->packet_space != tso->seg_size);
 1180                 }
 1181         } else {
 1182                 n = min(tso->in_len, tso->packet_space);
 1183                 tso->packet_space -= n;
 1184                 tso->out_len -= n;
 1185                 tso->dma_addr += n;
 1186                 tso->in_len -= n;
 1187         }
 1188 
 1189         /*
 1190          * It is OK to use binary OR below to avoid extra branching
 1191          * since all conditions may always be checked.
 1192          */
 1193         eop = (tso->out_len == 0) | (tso->packet_space == 0) |
 1194             (tso->segs_space == 0);
 1195 
 1196         desc = &txq->pend_desc[txq->n_pend_desc++];
 1197         efx_tx_qdesc_dma_create(txq->common, dma_addr, n, eop, desc);
 1198 }
 1199 
 1200 /* Callback from bus_dmamap_load() for long TSO headers. */
 1201 static void tso_map_long_header(void *dma_addr_ret,
 1202                                 bus_dma_segment_t *segs, int nseg,
 1203                                 int error)
 1204 {
 1205         *(uint64_t *)dma_addr_ret = ((__predict_true(error == 0) &&
 1206                                       __predict_true(nseg == 1)) ?
 1207                                      segs->ds_addr : 0);
 1208 }
 1209 
 1210 /*
 1211  * tso_start_new_packet - generate a new header and prepare for the new packet
 1212  *
 1213  * Generate a new header and prepare for the new packet.  Return 0 on
 1214  * success, or an error code if failed to alloc header.
 1215  */
 1216 static int tso_start_new_packet(struct sfxge_txq *txq,
 1217                                 struct sfxge_tso_state *tso,
 1218                                 unsigned int *idp)
 1219 {
 1220         unsigned int id = *idp;
 1221         struct tcphdr *tsoh_th;
 1222         unsigned ip_length;
 1223         caddr_t header;
 1224         uint64_t dma_addr;
 1225         bus_dmamap_t map;
 1226         efx_desc_t *desc;
 1227         int rc;
 1228 
 1229         if (tso->fw_assisted) {
 1230                 if (tso->fw_assisted & SFXGE_FATSOV2) {
 1231                         /* Add 2 FATSOv2 option descriptors */
 1232                         desc = &txq->pend_desc[txq->n_pend_desc];
 1233                         efx_tx_qdesc_tso2_create(txq->common,
 1234                                                  tso->packet_id,
 1235                                                  0,
 1236                                                  tso->seqnum,
 1237                                                  tso->seg_size,
 1238                                                  desc,
 1239                                                  EFX_TX_FATSOV2_OPT_NDESCS);
 1240                         desc += EFX_TX_FATSOV2_OPT_NDESCS;
 1241                         txq->n_pend_desc += EFX_TX_FATSOV2_OPT_NDESCS;
 1242                         KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
 1243                         id = (id + EFX_TX_FATSOV2_OPT_NDESCS) & txq->ptr_mask;
 1244 
 1245                         tso->segs_space =
 1246                             EFX_TX_FATSOV2_DMA_SEGS_PER_PKT_MAX - 1;
 1247                 } else {
 1248                         uint8_t tcp_flags = tso->tcp_flags;
 1249 
 1250                         if (tso->out_len > tso->seg_size)
 1251                                 tcp_flags &= ~(TH_FIN | TH_PUSH);
 1252 
 1253                         /* Add FATSOv1 option descriptor */
 1254                         desc = &txq->pend_desc[txq->n_pend_desc++];
 1255                         efx_tx_qdesc_tso_create(txq->common,
 1256                                                 tso->packet_id,
 1257                                                 tso->seqnum,
 1258                                                 tcp_flags,
 1259                                                 desc++);
 1260                         KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
 1261                         id = (id + 1) & txq->ptr_mask;
 1262 
 1263                         tso->seqnum += tso->seg_size;
 1264                         tso->segs_space = UINT_MAX;
 1265                 }
 1266 
 1267                 /* Header DMA descriptor */
 1268                 *desc = tso->header_desc;
 1269                 txq->n_pend_desc++;
 1270                 KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
 1271                 id = (id + 1) & txq->ptr_mask;
 1272         } else {
 1273                 /* Allocate a DMA-mapped header buffer. */
 1274                 if (__predict_true(tso->header_len <= TSOH_STD_SIZE)) {
 1275                         unsigned int page_index = (id / 2) / TSOH_PER_PAGE;
 1276                         unsigned int buf_index = (id / 2) % TSOH_PER_PAGE;
 1277 
 1278                         header = (txq->tsoh_buffer[page_index].esm_base +
 1279                                   buf_index * TSOH_STD_SIZE);
 1280                         dma_addr = (txq->tsoh_buffer[page_index].esm_addr +
 1281                                     buf_index * TSOH_STD_SIZE);
 1282                         map = txq->tsoh_buffer[page_index].esm_map;
 1283 
 1284                         KASSERT(txq->stmp[id].flags == 0,
 1285                                 ("stmp flags are not 0"));
 1286                 } else {
 1287                         struct sfxge_tx_mapping *stmp = &txq->stmp[id];
 1288 
 1289                         /* We cannot use bus_dmamem_alloc() as that may sleep */
 1290                         header = malloc(tso->header_len, M_SFXGE, M_NOWAIT);
 1291                         if (__predict_false(!header))
 1292                                 return (ENOMEM);
 1293                         rc = bus_dmamap_load(txq->packet_dma_tag, stmp->map,
 1294                                              header, tso->header_len,
 1295                                              tso_map_long_header, &dma_addr,
 1296                                              BUS_DMA_NOWAIT);
 1297                         if (__predict_false(dma_addr == 0)) {
 1298                                 if (rc == 0) {
 1299                                         /* Succeeded but got >1 segment */
 1300                                         bus_dmamap_unload(txq->packet_dma_tag,
 1301                                                           stmp->map);
 1302                                         rc = EINVAL;
 1303                                 }
 1304                                 free(header, M_SFXGE);
 1305                                 return (rc);
 1306                         }
 1307                         map = stmp->map;
 1308 
 1309                         txq->tso_long_headers++;
 1310                         stmp->u.heap_buf = header;
 1311                         stmp->flags = TX_BUF_UNMAP;
 1312                 }
 1313 
 1314                 tsoh_th = (struct tcphdr *)(header + tso->tcph_off);
 1315 
 1316                 /* Copy and update the headers. */
 1317                 m_copydata(tso->mbuf, 0, tso->header_len, header);
 1318 
 1319                 tsoh_th->th_seq = htonl(tso->seqnum);
 1320                 tso->seqnum += tso->seg_size;
 1321                 if (tso->out_len > tso->seg_size) {
 1322                         /* This packet will not finish the TSO burst. */
 1323                         ip_length = tso->header_len - tso->nh_off + tso->seg_size;
 1324                         tsoh_th->th_flags &= ~(TH_FIN | TH_PUSH);
 1325                 } else {
 1326                         /* This packet will be the last in the TSO burst. */
 1327                         ip_length = tso->header_len - tso->nh_off + tso->out_len;
 1328                 }
 1329 
 1330                 if (tso->protocol == htons(ETHERTYPE_IP)) {
 1331                         struct ip *tsoh_iph = (struct ip *)(header + tso->nh_off);
 1332                         tsoh_iph->ip_len = htons(ip_length);
 1333                         /* XXX We should increment ip_id, but FreeBSD doesn't
 1334                          * currently allocate extra IDs for multiple segments.
 1335                          */
 1336                 } else {
 1337                         struct ip6_hdr *tsoh_iph =
 1338                                 (struct ip6_hdr *)(header + tso->nh_off);
 1339                         tsoh_iph->ip6_plen = htons(ip_length - sizeof(*tsoh_iph));
 1340                 }
 1341 
 1342                 /* Make the header visible to the hardware. */
 1343                 bus_dmamap_sync(txq->packet_dma_tag, map, BUS_DMASYNC_PREWRITE);
 1344 
 1345                 /* Form a descriptor for this header. */
 1346                 desc = &txq->pend_desc[txq->n_pend_desc++];
 1347                 efx_tx_qdesc_dma_create(txq->common,
 1348                                         dma_addr,
 1349                                         tso->header_len,
 1350                                         0,
 1351                                         desc);
 1352                 id = (id + 1) & txq->ptr_mask;
 1353 
 1354                 tso->segs_space = UINT_MAX;
 1355         }
 1356         tso->packet_space = tso->seg_size;
 1357         txq->tso_packets++;
 1358         *idp = id;
 1359 
 1360         return (0);
 1361 }
 1362 
 1363 static int
 1364 sfxge_tx_queue_tso(struct sfxge_txq *txq, struct mbuf *mbuf,
 1365                    const bus_dma_segment_t *dma_seg, int n_dma_seg,
 1366                    int n_extra_descs)
 1367 {
 1368         struct sfxge_tso_state tso;
 1369         unsigned int id;
 1370         unsigned skipped = 0;
 1371 
 1372         tso_start(txq, &tso, dma_seg, mbuf);
 1373 
 1374         while (dma_seg->ds_len + skipped <= tso.header_len) {
 1375                 skipped += dma_seg->ds_len;
 1376                 --n_dma_seg;
 1377                 KASSERT(n_dma_seg, ("no payload found in TSO packet"));
 1378                 ++dma_seg;
 1379         }
 1380         tso.in_len = dma_seg->ds_len - (tso.header_len - skipped);
 1381         tso.dma_addr = dma_seg->ds_addr + (tso.header_len - skipped);
 1382 
 1383         id = (txq->added + n_extra_descs) & txq->ptr_mask;
 1384         if (__predict_false(tso_start_new_packet(txq, &tso, &id)))
 1385                 return (-1);
 1386 
 1387         while (1) {
 1388                 tso_fill_packet_with_fragment(txq, &tso);
 1389                 /* Exactly one DMA descriptor is added */
 1390                 KASSERT(txq->stmp[id].flags == 0, ("stmp flags are not 0"));
 1391                 id = (id + 1) & txq->ptr_mask;
 1392 
 1393                 /* Move onto the next fragment? */
 1394                 if (tso.in_len == 0) {
 1395                         --n_dma_seg;
 1396                         if (n_dma_seg == 0)
 1397                                 break;
 1398                         ++dma_seg;
 1399                         tso.in_len = dma_seg->ds_len;
 1400                         tso.dma_addr = dma_seg->ds_addr;
 1401                 }
 1402 
 1403                 /* End of packet? */
 1404                 if ((tso.packet_space == 0) | (tso.segs_space == 0)) {
 1405                         unsigned int n_fatso_opt_desc =
 1406                             (tso.fw_assisted & SFXGE_FATSOV2) ?
 1407                             EFX_TX_FATSOV2_OPT_NDESCS :
 1408                             (tso.fw_assisted & SFXGE_FATSOV1) ? 1 : 0;
 1409 
 1410                         /* If the queue is now full due to tiny MSS,
 1411                          * or we can't create another header, discard
 1412                          * the remainder of the input mbuf but do not
 1413                          * roll back the work we have done.
 1414                          */
 1415                         if (txq->n_pend_desc + n_fatso_opt_desc +
 1416                             1 /* header */ + n_dma_seg > txq->max_pkt_desc) {
 1417                                 txq->tso_pdrop_too_many++;
 1418                                 break;
 1419                         }
 1420                         if (__predict_false(tso_start_new_packet(txq, &tso,
 1421                                                                  &id))) {
 1422                                 txq->tso_pdrop_no_rsrc++;
 1423                                 break;
 1424                         }
 1425                 }
 1426         }
 1427 
 1428         txq->tso_bursts++;
 1429         return (id);
 1430 }
 1431 
 1432 static void
 1433 sfxge_tx_qunblock(struct sfxge_txq *txq)
 1434 {
 1435         struct sfxge_softc *sc;
 1436         struct sfxge_evq *evq __diagused;
 1437 
 1438         sc = txq->sc;
 1439         evq = sc->evq[txq->evq_index];
 1440 
 1441         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
 1442 
 1443         if (__predict_false(txq->init_state != SFXGE_TXQ_STARTED))
 1444                 return;
 1445 
 1446         SFXGE_TXQ_LOCK(txq);
 1447 
 1448         if (txq->blocked) {
 1449                 unsigned int level;
 1450 
 1451                 level = txq->added - txq->completed;
 1452                 if (level <= SFXGE_TXQ_UNBLOCK_LEVEL(txq->entries)) {
 1453                         /* reaped must be in sync with blocked */
 1454                         sfxge_tx_qreap(txq);
 1455                         txq->blocked = 0;
 1456                 }
 1457         }
 1458 
 1459         sfxge_tx_qdpl_service(txq);
 1460         /* note: lock has been dropped */
 1461 }
 1462 
 1463 void
 1464 sfxge_tx_qflush_done(struct sfxge_txq *txq)
 1465 {
 1466 
 1467         txq->flush_state = SFXGE_FLUSH_DONE;
 1468 }
 1469 
 1470 static void
 1471 sfxge_tx_qstop(struct sfxge_softc *sc, unsigned int index)
 1472 {
 1473         struct sfxge_txq *txq;
 1474         struct sfxge_evq *evq;
 1475         unsigned int count;
 1476 
 1477         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
 1478 
 1479         txq = sc->txq[index];
 1480         evq = sc->evq[txq->evq_index];
 1481 
 1482         SFXGE_EVQ_LOCK(evq);
 1483         SFXGE_TXQ_LOCK(txq);
 1484 
 1485         KASSERT(txq->init_state == SFXGE_TXQ_STARTED,
 1486             ("txq->init_state != SFXGE_TXQ_STARTED"));
 1487 
 1488         txq->init_state = SFXGE_TXQ_INITIALIZED;
 1489 
 1490         if (txq->flush_state != SFXGE_FLUSH_DONE) {
 1491                 txq->flush_state = SFXGE_FLUSH_PENDING;
 1492 
 1493                 SFXGE_EVQ_UNLOCK(evq);
 1494                 SFXGE_TXQ_UNLOCK(txq);
 1495 
 1496                 /* Flush the transmit queue. */
 1497                 if (efx_tx_qflush(txq->common) != 0) {
 1498                         log(LOG_ERR, "%s: Flushing Tx queue %u failed\n",
 1499                             device_get_nameunit(sc->dev), index);
 1500                         txq->flush_state = SFXGE_FLUSH_DONE;
 1501                 } else {
 1502                         count = 0;
 1503                         do {
 1504                                 /* Spin for 100ms. */
 1505                                 DELAY(100000);
 1506                                 if (txq->flush_state != SFXGE_FLUSH_PENDING)
 1507                                         break;
 1508                         } while (++count < 20);
 1509                 }
 1510                 SFXGE_EVQ_LOCK(evq);
 1511                 SFXGE_TXQ_LOCK(txq);
 1512 
 1513                 KASSERT(txq->flush_state != SFXGE_FLUSH_FAILED,
 1514                     ("txq->flush_state == SFXGE_FLUSH_FAILED"));
 1515 
 1516                 if (txq->flush_state != SFXGE_FLUSH_DONE) {
 1517                         /* Flush timeout */
 1518                         log(LOG_ERR, "%s: Cannot flush Tx queue %u\n",
 1519                             device_get_nameunit(sc->dev), index);
 1520                         txq->flush_state = SFXGE_FLUSH_DONE;
 1521                 }
 1522         }
 1523 
 1524         txq->blocked = 0;
 1525         txq->pending = txq->added;
 1526 
 1527         sfxge_tx_qcomplete(txq, evq);
 1528         KASSERT(txq->completed == txq->added,
 1529             ("txq->completed != txq->added"));
 1530 
 1531         sfxge_tx_qreap(txq);
 1532         KASSERT(txq->reaped == txq->completed,
 1533             ("txq->reaped != txq->completed"));
 1534 
 1535         txq->added = 0;
 1536         txq->pending = 0;
 1537         txq->completed = 0;
 1538         txq->reaped = 0;
 1539 
 1540         /* Destroy the common code transmit queue. */
 1541         efx_tx_qdestroy(txq->common);
 1542         txq->common = NULL;
 1543 
 1544         efx_sram_buf_tbl_clear(sc->enp, txq->buf_base_id,
 1545             EFX_TXQ_NBUFS(sc->txq_entries));
 1546 
 1547         txq->hw_cksum_flags = 0;
 1548 
 1549         SFXGE_EVQ_UNLOCK(evq);
 1550         SFXGE_TXQ_UNLOCK(txq);
 1551 }
 1552 
 1553 /*
 1554  * Estimate maximum number of Tx descriptors required for TSO packet.
 1555  * With minimum MSS and maximum mbuf length we might need more (even
 1556  * than a ring-ful of descriptors), but this should not happen in
 1557  * practice except due to deliberate attack.  In that case we will
 1558  * truncate the output at a packet boundary.
 1559  */
 1560 static unsigned int
 1561 sfxge_tx_max_pkt_desc(const struct sfxge_softc *sc, enum sfxge_txq_type type,
 1562                       unsigned int tso_fw_assisted)
 1563 {
 1564         /* One descriptor for every input fragment */
 1565         unsigned int max_descs = SFXGE_TX_MAPPING_MAX_SEG;
 1566         unsigned int sw_tso_max_descs;
 1567         unsigned int fa_tso_v1_max_descs = 0;
 1568         unsigned int fa_tso_v2_max_descs = 0;
 1569 
 1570         /* Checksum offload Tx option descriptor may be required */
 1571         if (sc->txq_dynamic_cksum_toggle_supported)
 1572                 max_descs++;
 1573 
 1574         /* VLAN tagging Tx option descriptor may be required */
 1575         if (efx_nic_cfg_get(sc->enp)->enc_hw_tx_insert_vlan_enabled)
 1576                 max_descs++;
 1577 
 1578         if (type == SFXGE_TXQ_IP_TCP_UDP_CKSUM) {
 1579                 /*
 1580                  * Plus header and payload descriptor for each output segment.
 1581                  * Minus one since header fragment is already counted.
 1582                  * Even if FATSO is used, we should be ready to fallback
 1583                  * to do it in the driver.
 1584                  */
 1585                 sw_tso_max_descs = SFXGE_TSO_MAX_SEGS * 2 - 1;
 1586 
 1587                 /* FW assisted TSOv1 requires one more descriptor per segment
 1588                  * in comparison to SW TSO */
 1589                 if (tso_fw_assisted & SFXGE_FATSOV1)
 1590                         fa_tso_v1_max_descs =
 1591                             sw_tso_max_descs + SFXGE_TSO_MAX_SEGS;
 1592 
 1593                 /* FW assisted TSOv2 requires 3 (2 FATSO plus header) extra
 1594                  * descriptors per superframe limited by number of DMA fetches
 1595                  * per packet. The first packet header is already counted.
 1596                  */
 1597                 if (tso_fw_assisted & SFXGE_FATSOV2) {
 1598                         fa_tso_v2_max_descs =
 1599                             howmany(SFXGE_TX_MAPPING_MAX_SEG,
 1600                                     EFX_TX_FATSOV2_DMA_SEGS_PER_PKT_MAX - 1) *
 1601                             (EFX_TX_FATSOV2_OPT_NDESCS + 1) - 1;
 1602                 }
 1603 
 1604                 max_descs += MAX(sw_tso_max_descs,
 1605                                  MAX(fa_tso_v1_max_descs, fa_tso_v2_max_descs));
 1606         }
 1607 
 1608         return (max_descs);
 1609 }
 1610 
 1611 static int
 1612 sfxge_tx_qstart(struct sfxge_softc *sc, unsigned int index)
 1613 {
 1614         struct sfxge_txq *txq;
 1615         efsys_mem_t *esmp;
 1616         uint16_t flags;
 1617         unsigned int tso_fw_assisted;
 1618         unsigned int label;
 1619         struct sfxge_evq *evq;
 1620         unsigned int desc_index;
 1621         int rc;
 1622 
 1623         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
 1624 
 1625         txq = sc->txq[index];
 1626         esmp = &txq->mem;
 1627         evq = sc->evq[txq->evq_index];
 1628 
 1629         KASSERT(txq->init_state == SFXGE_TXQ_INITIALIZED,
 1630             ("txq->init_state != SFXGE_TXQ_INITIALIZED"));
 1631         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
 1632             ("evq->init_state != SFXGE_EVQ_STARTED"));
 1633 
 1634         /* Program the buffer table. */
 1635         if ((rc = efx_sram_buf_tbl_set(sc->enp, txq->buf_base_id, esmp,
 1636             EFX_TXQ_NBUFS(sc->txq_entries))) != 0)
 1637                 return (rc);
 1638 
 1639         /* Determine the kind of queue we are creating. */
 1640         tso_fw_assisted = 0;
 1641         switch (txq->type) {
 1642         case SFXGE_TXQ_NON_CKSUM:
 1643                 flags = 0;
 1644                 break;
 1645         case SFXGE_TXQ_IP_CKSUM:
 1646                 flags = EFX_TXQ_CKSUM_IPV4;
 1647                 break;
 1648         case SFXGE_TXQ_IP_TCP_UDP_CKSUM:
 1649                 flags = EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP;
 1650                 tso_fw_assisted = sc->tso_fw_assisted;
 1651                 if (tso_fw_assisted & SFXGE_FATSOV2)
 1652                         flags |= EFX_TXQ_FATSOV2;
 1653                 break;
 1654         default:
 1655                 KASSERT(0, ("Impossible TX queue"));
 1656                 flags = 0;
 1657                 break;
 1658         }
 1659 
 1660         label = (sc->txq_dynamic_cksum_toggle_supported) ? 0 : txq->type;
 1661 
 1662         /* Create the common code transmit queue. */
 1663         if ((rc = efx_tx_qcreate(sc->enp, index, label, esmp,
 1664             sc->txq_entries, txq->buf_base_id, flags, evq->common,
 1665             &txq->common, &desc_index)) != 0) {
 1666                 /* Retry if no FATSOv2 resources, otherwise fail */
 1667                 if ((rc != ENOSPC) || (~flags & EFX_TXQ_FATSOV2))
 1668                         goto fail;
 1669 
 1670                 /* Looks like all FATSOv2 contexts are used */
 1671                 flags &= ~EFX_TXQ_FATSOV2;
 1672                 tso_fw_assisted &= ~SFXGE_FATSOV2;
 1673                 if ((rc = efx_tx_qcreate(sc->enp, index, label, esmp,
 1674                     sc->txq_entries, txq->buf_base_id, flags, evq->common,
 1675                     &txq->common, &desc_index)) != 0)
 1676                         goto fail;
 1677         }
 1678 
 1679         /* Initialise queue descriptor indexes */
 1680         txq->added = txq->pending = txq->completed = txq->reaped = desc_index;
 1681 
 1682         SFXGE_TXQ_LOCK(txq);
 1683 
 1684         /* Enable the transmit queue. */
 1685         efx_tx_qenable(txq->common);
 1686 
 1687         txq->init_state = SFXGE_TXQ_STARTED;
 1688         txq->flush_state = SFXGE_FLUSH_REQUIRED;
 1689         txq->tso_fw_assisted = tso_fw_assisted;
 1690 
 1691         txq->max_pkt_desc = sfxge_tx_max_pkt_desc(sc, txq->type,
 1692                                                   tso_fw_assisted);
 1693 
 1694         txq->hw_vlan_tci = 0;
 1695 
 1696         txq->hw_cksum_flags = flags &
 1697                               (EFX_TXQ_CKSUM_IPV4 | EFX_TXQ_CKSUM_TCPUDP);
 1698 
 1699         SFXGE_TXQ_UNLOCK(txq);
 1700 
 1701         return (0);
 1702 
 1703 fail:
 1704         efx_sram_buf_tbl_clear(sc->enp, txq->buf_base_id,
 1705             EFX_TXQ_NBUFS(sc->txq_entries));
 1706         return (rc);
 1707 }
 1708 
 1709 void
 1710 sfxge_tx_stop(struct sfxge_softc *sc)
 1711 {
 1712         int index;
 1713 
 1714         index = sc->txq_count;
 1715         while (--index >= 0)
 1716                 sfxge_tx_qstop(sc, index);
 1717 
 1718         /* Tear down the transmit module */
 1719         efx_tx_fini(sc->enp);
 1720 }
 1721 
 1722 int
 1723 sfxge_tx_start(struct sfxge_softc *sc)
 1724 {
 1725         int index;
 1726         int rc;
 1727 
 1728         /* Initialize the common code transmit module. */
 1729         if ((rc = efx_tx_init(sc->enp)) != 0)
 1730                 return (rc);
 1731 
 1732         for (index = 0; index < sc->txq_count; index++) {
 1733                 if ((rc = sfxge_tx_qstart(sc, index)) != 0)
 1734                         goto fail;
 1735         }
 1736 
 1737         return (0);
 1738 
 1739 fail:
 1740         while (--index >= 0)
 1741                 sfxge_tx_qstop(sc, index);
 1742 
 1743         efx_tx_fini(sc->enp);
 1744 
 1745         return (rc);
 1746 }
 1747 
 1748 static int
 1749 sfxge_txq_stat_init(struct sfxge_txq *txq, struct sysctl_oid *txq_node)
 1750 {
 1751         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(txq->sc->dev);
 1752         struct sysctl_oid *stat_node;
 1753         unsigned int id;
 1754 
 1755         stat_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(txq_node), OID_AUTO,
 1756             "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Tx queue statistics");
 1757         if (stat_node == NULL)
 1758                 return (ENOMEM);
 1759 
 1760         for (id = 0; id < nitems(sfxge_tx_stats); id++) {
 1761                 SYSCTL_ADD_ULONG(
 1762                     ctx, SYSCTL_CHILDREN(stat_node), OID_AUTO,
 1763                     sfxge_tx_stats[id].name, CTLFLAG_RD | CTLFLAG_STATS,
 1764                     (unsigned long *)((caddr_t)txq + sfxge_tx_stats[id].offset),
 1765                     "");
 1766         }
 1767 
 1768         return (0);
 1769 }
 1770 
 1771 /**
 1772  * Destroy a transmit queue.
 1773  */
 1774 static void
 1775 sfxge_tx_qfini(struct sfxge_softc *sc, unsigned int index)
 1776 {
 1777         struct sfxge_txq *txq;
 1778         unsigned int nmaps;
 1779 
 1780         txq = sc->txq[index];
 1781 
 1782         KASSERT(txq->init_state == SFXGE_TXQ_INITIALIZED,
 1783             ("txq->init_state != SFXGE_TXQ_INITIALIZED"));
 1784 
 1785         if (txq->type == SFXGE_TXQ_IP_TCP_UDP_CKSUM)
 1786                 tso_fini(txq);
 1787 
 1788         /* Free the context arrays. */
 1789         free(txq->pend_desc, M_SFXGE);
 1790         nmaps = sc->txq_entries;
 1791         while (nmaps-- != 0)
 1792                 bus_dmamap_destroy(txq->packet_dma_tag, txq->stmp[nmaps].map);
 1793         free(txq->stmp, M_SFXGE);
 1794 
 1795         /* Release DMA memory mapping. */
 1796         sfxge_dma_free(&txq->mem);
 1797 
 1798         sc->txq[index] = NULL;
 1799 
 1800         SFXGE_TXQ_LOCK_DESTROY(txq);
 1801 
 1802         free(txq, M_SFXGE);
 1803 }
 1804 
 1805 static int
 1806 sfxge_tx_qinit(struct sfxge_softc *sc, unsigned int txq_index,
 1807                enum sfxge_txq_type type, unsigned int evq_index)
 1808 {
 1809         const efx_nic_cfg_t *encp = efx_nic_cfg_get(sc->enp);
 1810         char name[16];
 1811         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
 1812         struct sysctl_oid *txq_node;
 1813         struct sfxge_txq *txq;
 1814         struct sfxge_tx_dpl *stdp;
 1815         struct sysctl_oid *dpl_node;
 1816         efsys_mem_t *esmp;
 1817         unsigned int nmaps;
 1818         int rc;
 1819 
 1820         txq = malloc(sizeof(struct sfxge_txq), M_SFXGE, M_ZERO | M_WAITOK);
 1821         txq->sc = sc;
 1822         txq->entries = sc->txq_entries;
 1823         txq->ptr_mask = txq->entries - 1;
 1824 
 1825         sc->txq[txq_index] = txq;
 1826         esmp = &txq->mem;
 1827 
 1828         /* Allocate and zero DMA space for the descriptor ring. */
 1829         if ((rc = sfxge_dma_alloc(sc, EFX_TXQ_SIZE(sc->txq_entries), esmp)) != 0)
 1830                 return (rc);
 1831 
 1832         /* Allocate buffer table entries. */
 1833         sfxge_sram_buf_tbl_alloc(sc, EFX_TXQ_NBUFS(sc->txq_entries),
 1834                                  &txq->buf_base_id);
 1835 
 1836         /* Create a DMA tag for packet mappings. */
 1837         if (bus_dma_tag_create(sc->parent_dma_tag, 1,
 1838             encp->enc_tx_dma_desc_boundary,
 1839             MIN(0x3FFFFFFFFFFFUL, BUS_SPACE_MAXADDR), BUS_SPACE_MAXADDR, NULL,
 1840             NULL, 0x11000, SFXGE_TX_MAPPING_MAX_SEG,
 1841             encp->enc_tx_dma_desc_size_max, 0, NULL, NULL,
 1842             &txq->packet_dma_tag) != 0) {
 1843                 device_printf(sc->dev, "Couldn't allocate txq DMA tag\n");
 1844                 rc = ENOMEM;
 1845                 goto fail;
 1846         }
 1847 
 1848         /* Allocate pending descriptor array for batching writes. */
 1849         txq->pend_desc = malloc(sizeof(efx_desc_t) * sc->txq_entries,
 1850                                 M_SFXGE, M_ZERO | M_WAITOK);
 1851 
 1852         /* Allocate and initialise mbuf DMA mapping array. */
 1853         txq->stmp = malloc(sizeof(struct sfxge_tx_mapping) * sc->txq_entries,
 1854             M_SFXGE, M_ZERO | M_WAITOK);
 1855         for (nmaps = 0; nmaps < sc->txq_entries; nmaps++) {
 1856                 rc = bus_dmamap_create(txq->packet_dma_tag, 0,
 1857                                        &txq->stmp[nmaps].map);
 1858                 if (rc != 0)
 1859                         goto fail2;
 1860         }
 1861 
 1862         snprintf(name, sizeof(name), "%u", txq_index);
 1863         txq_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->txqs_node),
 1864             OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 1865         if (txq_node == NULL) {
 1866                 rc = ENOMEM;
 1867                 goto fail_txq_node;
 1868         }
 1869 
 1870         if (type == SFXGE_TXQ_IP_TCP_UDP_CKSUM &&
 1871             (rc = tso_init(txq)) != 0)
 1872                 goto fail3;
 1873 
 1874         /* Initialize the deferred packet list. */
 1875         stdp = &txq->dpl;
 1876         stdp->std_put_max = sfxge_tx_dpl_put_max;
 1877         stdp->std_get_max = sfxge_tx_dpl_get_max;
 1878         stdp->std_get_non_tcp_max = sfxge_tx_dpl_get_non_tcp_max;
 1879         stdp->std_getp = &stdp->std_get;
 1880 
 1881         SFXGE_TXQ_LOCK_INIT(txq, device_get_nameunit(sc->dev), txq_index);
 1882 
 1883         dpl_node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(txq_node), OID_AUTO,
 1884             "dpl", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
 1885             "Deferred packet list statistics");
 1886         if (dpl_node == NULL) {
 1887                 rc = ENOMEM;
 1888                 goto fail_dpl_node;
 1889         }
 1890 
 1891         SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
 1892                         "get_count", CTLFLAG_RD | CTLFLAG_STATS,
 1893                         &stdp->std_get_count, 0, "");
 1894         SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
 1895                         "get_non_tcp_count", CTLFLAG_RD | CTLFLAG_STATS,
 1896                         &stdp->std_get_non_tcp_count, 0, "");
 1897         SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
 1898                         "get_hiwat", CTLFLAG_RD | CTLFLAG_STATS,
 1899                         &stdp->std_get_hiwat, 0, "");
 1900         SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(dpl_node), OID_AUTO,
 1901                         "put_hiwat", CTLFLAG_RD | CTLFLAG_STATS,
 1902                         &stdp->std_put_hiwat, 0, "");
 1903 
 1904         rc = sfxge_txq_stat_init(txq, txq_node);
 1905         if (rc != 0)
 1906                 goto fail_txq_stat_init;
 1907 
 1908         txq->type = type;
 1909         txq->evq_index = evq_index;
 1910         txq->init_state = SFXGE_TXQ_INITIALIZED;
 1911 
 1912         return (0);
 1913 
 1914 fail_txq_stat_init:
 1915 fail_dpl_node:
 1916 fail3:
 1917 fail_txq_node:
 1918         free(txq->pend_desc, M_SFXGE);
 1919 fail2:
 1920         while (nmaps-- != 0)
 1921                 bus_dmamap_destroy(txq->packet_dma_tag, txq->stmp[nmaps].map);
 1922         free(txq->stmp, M_SFXGE);
 1923         bus_dma_tag_destroy(txq->packet_dma_tag);
 1924 
 1925 fail:
 1926         sfxge_dma_free(esmp);
 1927 
 1928         return (rc);
 1929 }
 1930 
 1931 static int
 1932 sfxge_tx_stat_handler(SYSCTL_HANDLER_ARGS)
 1933 {
 1934         struct sfxge_softc *sc = arg1;
 1935         unsigned int id = arg2;
 1936         unsigned long sum;
 1937         unsigned int index;
 1938 
 1939         /* Sum across all TX queues */
 1940         sum = 0;
 1941         for (index = 0; index < sc->txq_count; index++)
 1942                 sum += *(unsigned long *)((caddr_t)sc->txq[index] +
 1943                                           sfxge_tx_stats[id].offset);
 1944 
 1945         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
 1946 }
 1947 
 1948 static void
 1949 sfxge_tx_stat_init(struct sfxge_softc *sc)
 1950 {
 1951         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
 1952         struct sysctl_oid_list *stat_list;
 1953         unsigned int id;
 1954 
 1955         stat_list = SYSCTL_CHILDREN(sc->stats_node);
 1956 
 1957         for (id = 0; id < nitems(sfxge_tx_stats); id++) {
 1958                 SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
 1959                     sfxge_tx_stats[id].name,
 1960                     CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
 1961                     sc, id, sfxge_tx_stat_handler, "LU", "");
 1962         }
 1963 }
 1964 
 1965 uint64_t
 1966 sfxge_tx_get_drops(struct sfxge_softc *sc)
 1967 {
 1968         unsigned int index;
 1969         uint64_t drops = 0;
 1970         struct sfxge_txq *txq;
 1971 
 1972         /* Sum across all TX queues */
 1973         for (index = 0; index < sc->txq_count; index++) {
 1974                 txq = sc->txq[index];
 1975                 /*
 1976                  * In theory, txq->put_overflow and txq->netdown_drops
 1977                  * should use atomic operation and other should be
 1978                  * obtained under txq lock, but it is just statistics.
 1979                  */
 1980                 drops += txq->drops + txq->get_overflow +
 1981                          txq->get_non_tcp_overflow +
 1982                          txq->put_overflow + txq->netdown_drops +
 1983                          txq->tso_pdrop_too_many + txq->tso_pdrop_no_rsrc;
 1984         }
 1985         return (drops);
 1986 }
 1987 
 1988 void
 1989 sfxge_tx_fini(struct sfxge_softc *sc)
 1990 {
 1991         int index;
 1992 
 1993         index = sc->txq_count;
 1994         while (--index >= 0)
 1995                 sfxge_tx_qfini(sc, index);
 1996 
 1997         sc->txq_count = 0;
 1998 }
 1999 
 2000 int
 2001 sfxge_tx_init(struct sfxge_softc *sc)
 2002 {
 2003         const efx_nic_cfg_t *encp = efx_nic_cfg_get(sc->enp);
 2004         struct sfxge_intr *intr __diagused;
 2005         int index;
 2006         int rc;
 2007 
 2008         intr = &sc->intr;
 2009 
 2010         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
 2011             ("intr->state != SFXGE_INTR_INITIALIZED"));
 2012 
 2013         if (sfxge_tx_dpl_get_max <= 0) {
 2014                 log(LOG_ERR, "%s=%d must be greater than 0",
 2015                     SFXGE_PARAM_TX_DPL_GET_MAX, sfxge_tx_dpl_get_max);
 2016                 rc = EINVAL;
 2017                 goto fail_tx_dpl_get_max;
 2018         }
 2019         if (sfxge_tx_dpl_get_non_tcp_max <= 0) {
 2020                 log(LOG_ERR, "%s=%d must be greater than 0",
 2021                     SFXGE_PARAM_TX_DPL_GET_NON_TCP_MAX,
 2022                     sfxge_tx_dpl_get_non_tcp_max);
 2023                 rc = EINVAL;
 2024                 goto fail_tx_dpl_get_non_tcp_max;
 2025         }
 2026         if (sfxge_tx_dpl_put_max < 0) {
 2027                 log(LOG_ERR, "%s=%d must be greater or equal to 0",
 2028                     SFXGE_PARAM_TX_DPL_PUT_MAX, sfxge_tx_dpl_put_max);
 2029                 rc = EINVAL;
 2030                 goto fail_tx_dpl_put_max;
 2031         }
 2032 
 2033         sc->txq_count = SFXGE_EVQ0_N_TXQ(sc) - 1 + sc->intr.n_alloc;
 2034 
 2035         sc->tso_fw_assisted = sfxge_tso_fw_assisted;
 2036         if ((~encp->enc_features & EFX_FEATURE_FW_ASSISTED_TSO) ||
 2037             (!encp->enc_fw_assisted_tso_enabled))
 2038                 sc->tso_fw_assisted &= ~SFXGE_FATSOV1;
 2039         if ((~encp->enc_features & EFX_FEATURE_FW_ASSISTED_TSO_V2) ||
 2040             (!encp->enc_fw_assisted_tso_v2_enabled))
 2041                 sc->tso_fw_assisted &= ~SFXGE_FATSOV2;
 2042 
 2043         sc->txqs_node = SYSCTL_ADD_NODE(device_get_sysctl_ctx(sc->dev),
 2044             SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)), OID_AUTO,
 2045             "txq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Tx queues");
 2046         if (sc->txqs_node == NULL) {
 2047                 rc = ENOMEM;
 2048                 goto fail_txq_node;
 2049         }
 2050 
 2051         /* Initialize the transmit queues */
 2052         if (sc->txq_dynamic_cksum_toggle_supported == B_FALSE) {
 2053                 if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_NON_CKSUM,
 2054                     SFXGE_TXQ_NON_CKSUM, 0)) != 0)
 2055                         goto fail;
 2056 
 2057                 if ((rc = sfxge_tx_qinit(sc, SFXGE_TXQ_IP_CKSUM,
 2058                     SFXGE_TXQ_IP_CKSUM, 0)) != 0)
 2059                         goto fail2;
 2060         }
 2061 
 2062         for (index = 0;
 2063              index < sc->txq_count - SFXGE_EVQ0_N_TXQ(sc) + 1;
 2064              index++) {
 2065                 if ((rc = sfxge_tx_qinit(sc, SFXGE_EVQ0_N_TXQ(sc) - 1 + index,
 2066                     SFXGE_TXQ_IP_TCP_UDP_CKSUM, index)) != 0)
 2067                         goto fail3;
 2068         }
 2069 
 2070         sfxge_tx_stat_init(sc);
 2071 
 2072         return (0);
 2073 
 2074 fail3:
 2075         while (--index >= 0)
 2076                 sfxge_tx_qfini(sc, SFXGE_TXQ_IP_TCP_UDP_CKSUM + index);
 2077 
 2078         sfxge_tx_qfini(sc, SFXGE_TXQ_IP_CKSUM);
 2079 
 2080 fail2:
 2081         sfxge_tx_qfini(sc, SFXGE_TXQ_NON_CKSUM);
 2082 
 2083 fail:
 2084 fail_txq_node:
 2085         sc->txq_count = 0;
 2086 fail_tx_dpl_put_max:
 2087 fail_tx_dpl_get_non_tcp_max:
 2088 fail_tx_dpl_get_max:
 2089         return (rc);
 2090 }

Cache object: 871424f87c655d4c2c16002b15679f15


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.