FreeBSD/Linux Kernel Cross Reference
sys/dev/sfxge/sfxge_rx.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2010-2016 Solarflare Communications Inc.
    5  * All rights reserved.
    6  *
    7  * This software was developed in part by Philip Paeps under contract for
    8  * Solarflare Communications, Inc.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions are met:
   12  *
   13  * 1. Redistributions of source code must retain the above copyright notice,
   14  *    this list of conditions and the following disclaimer.
   15  * 2. Redistributions in binary form must reproduce the above copyright notice,
   16  *    this list of conditions and the following disclaimer in the documentation
   17  *    and/or other materials provided with the distribution.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   20  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
   21  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   22  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
   23  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   24  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   25  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
   26  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
   27  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
   28  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
   29  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   30  *
   31  * The views and conclusions contained in the software and documentation are
   32  * those of the authors and should not be interpreted as representing official
   33  * policies, either expressed or implied, of the FreeBSD Project.
   34  */
   35 
   36 #include <sys/cdefs.h>
   37 __FBSDID("$FreeBSD$");
   38 
   39 #include "opt_rss.h"
   40 
   41 #include <sys/param.h>
   42 #include <sys/malloc.h>
   43 #include <sys/mbuf.h>
   44 #include <sys/smp.h>
   45 #include <sys/socket.h>
   46 #include <sys/sysctl.h>
   47 #include <sys/syslog.h>
   48 #include <sys/limits.h>
   50 
   51 #include <net/ethernet.h>
   52 #include <net/if.h>
   53 #include <net/if_vlan_var.h>
   54 
   55 #include <netinet/in.h>
   56 #include <netinet/ip.h>
   57 #include <netinet/ip6.h>
   58 #include <netinet/tcp.h>
   59 
   60 #include <machine/in_cksum.h>
   61 
   62 #ifdef RSS
   63 #include <net/rss_config.h>
   64 #endif
   65 
   66 #include "common/efx.h"
   67 
   68 #include "sfxge.h"
   69 #include "sfxge_rx.h"
   70 
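      /* Refill threshold: top the queue up again once the fill level
       * (descriptors posted but not yet completed) drops below 90% of the
       * queue limit (checked in sfxge_rx_qcomplete()).
       */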
   71 #define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
   72 
   73 #ifdef SFXGE_LRO
   74 
   75 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
   76     "Large receive offload (LRO) parameters");
   77 
   78 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
   79 
   80 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
   81  * means we can accelerate a larger number of streams.
   82  */
   83 static unsigned lro_table_size = 128;
   84 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
   85 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
   86             &lro_table_size, 0,
   87             "Size of the LRO hash table (must be a power of 2)");
   88 
   89 /* Maximum length of a hash chain.  If chains get too long then the lookup
   90  * time increases and may exceed the benefit of LRO.
   91  */
   92 static unsigned lro_chain_max = 20;
   93 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
   94 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
   95             &lro_chain_max, 0,
   96             "The maximum length of a hash chain");
   97 
   98 /* Maximum time (in ticks) that a connection can be idle before its LRO
   99  * state is discarded.
  100  */
  101 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
  102 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
  103 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
  104             &lro_idle_ticks, 0,
  105             "The maximum time (in ticks) that a connection can be idle "
  106             "before its LRO state is discarded");
  107 
  108 /* Number of packets with payload that must arrive in-order before a
  109  * connection is eligible for LRO.  The idea is that we should avoid coalescing
  110  * segments when the sender is in slow-start because reducing the ACK rate
  111  * can damage performance.
  112  */
  113 static int lro_slow_start_packets = 2000;
  114 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
  115 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
  116             &lro_slow_start_packets, 0,
  117             "Number of packets with payload that must arrive in-order before "
  118             "a connection is eligible for LRO");
  119 
  120 /* Number of packets with payload that must arrive in-order following loss
  121  * before a connection is eligible for LRO.  The idea is that we should avoid
  122  * coalescing segments when the sender is recovering from loss, because
  123  * reducing the ACK rate can damage performance.
  124  */
  125 static int lro_loss_packets = 20;
  126 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
  127 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
  128             &lro_loss_packets, 0,
  129             "Number of packets with payload that must arrive in-order "
  130             "following loss before a connection is eligible for LRO");
  131 
  132 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
  133 #define SFXGE_LRO_L2_ID_VLAN 0x4000
  134 #define SFXGE_LRO_L2_ID_IPV6 0x8000
  135 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
  136 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
  137 
  138 /* Compare IPv6 addresses, avoiding conditional branches */
  139 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
  140                                    const struct in6_addr *right)
  141 {
  142 #if LONG_BIT == 64
  143         const uint64_t *left64 = (const uint64_t *)left;
  144         const uint64_t *right64 = (const uint64_t *)right;
  145         return (left64[0] - right64[0]) | (left64[1] - right64[1]);
  146 #else
  147         return (left->s6_addr32[0] - right->s6_addr32[0]) |
  148                (left->s6_addr32[1] - right->s6_addr32[1]) |
  149                (left->s6_addr32[2] - right->s6_addr32[2]) |
  150                (left->s6_addr32[3] - right->s6_addr32[3]);
  151 #endif
  152 }
  153 
  154 #endif  /* SFXGE_LRO */
  155 
  156 void
  157 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
  158 {
  159 
  160         rxq->flush_state = SFXGE_FLUSH_DONE;
  161 }
  162 
  163 void
  164 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
  165 {
  166 
  167         rxq->flush_state = SFXGE_FLUSH_FAILED;
  168 }
  169 
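      /* Toeplitz hash key used for receive-side scaling.  With the RSS
       * option the key is obtained from the kernel via rss_getkey() in
       * sfxge_rx_start(); otherwise this fixed default key is used.
       */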
  170 #ifdef RSS
  171 static uint8_t toep_key[RSS_KEYSIZE];
  172 #else
  173 static uint8_t toep_key[] = {
  174         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
  175         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
  176         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
  177         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
  178         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
  179 };
  180 #endif
  181 
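      /* Callout handler for a deferred refill: post a software event to
       * the RX queue's event queue so that the refill is performed from
       * the event processing path (see sfxge_rx_qrefill()).
       */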
  182 static void
  183 sfxge_rx_post_refill(void *arg)
  184 {
  185         struct sfxge_rxq *rxq = arg;
  186         struct sfxge_softc *sc;
  187         unsigned int index;
  188         struct sfxge_evq *evq;
  189         uint16_t magic;
  190 
  191         sc = rxq->sc;
  192         index = rxq->index;
  193         evq = sc->evq[index];
  194         magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
  195 
  196         /* This is guaranteed due to the start/stop order of rx and ev */
  197         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
  198             ("evq not started"));
  199         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
  200             ("rxq not started"));
  201         efx_ev_qpost(evq->common, magic);
  202 }
  203 
  204 static void
  205 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
  206 {
  207         /* Initially retry after 100 ms, but back off in case of
  208          * repeated failures as we probably have to wait for the
  209          * administrator to raise the pool limit. */
  210         if (retrying)
  211                 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
  212         else
  213                 rxq->refill_delay = hz / 10;
  214 
  215         callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
  216                              sfxge_rx_post_refill, rxq);
  217 }
  218 
  219 #define SFXGE_REFILL_BATCH  64
  220 
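      /* Fill the RX queue towards the given target: allocate mbuf
       * clusters, align and DMA-map them, and post their addresses to the
       * common code in batches of SFXGE_REFILL_BATCH.  If allocation fails,
       * or nothing could be pushed, a delayed refill is scheduled.
       */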
  221 static void
  222 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
  223 {
  224         struct sfxge_softc *sc;
  225         unsigned int index;
  226         struct sfxge_evq *evq __diagused;
  227         unsigned int batch;
  228         unsigned int rxfill;
  229         unsigned int mblksize;
  230         int ntodo;
  231         efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
  232 
  233         sc = rxq->sc;
  234         index = rxq->index;
  235         evq = sc->evq[index];
  236 
  237         prefetch_read_many(sc->enp);
  238         prefetch_read_many(rxq->common);
  239 
  240         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
  241 
  242         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
  243                 return;
  244 
  245         rxfill = rxq->added - rxq->completed;
  246         KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
  247             ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
  248         ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
  249         KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
  250             ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
  251 
  252         if (ntodo == 0)
  253                 return;
  254 
  255         batch = 0;
  256         mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
  257         while (ntodo-- > 0) {
  258                 unsigned int id;
  259                 struct sfxge_rx_sw_desc *rx_desc;
  260                 bus_dma_segment_t seg;
  261                 struct mbuf *m;
  262 
  263                 id = (rxq->added + batch) & rxq->ptr_mask;
  264                 rx_desc = &rxq->queue[id];
  265                 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
  266 
  267                 rx_desc->flags = EFX_DISCARD;
  268                 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
  269                     sc->rx_cluster_size);
  270                 if (m == NULL)
  271                         break;
  272 
  273                 /* m_len specifies length of area to be mapped for DMA */
  274                 m->m_len  = mblksize;
  275                 m->m_data = (caddr_t)EFX_P2ROUNDUP(uintptr_t, m->m_data,
  276                                                    CACHE_LINE_SIZE);
  277                 m->m_data += sc->rx_buffer_align;
  278 
  279                 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
  280                 addr[batch++] = seg.ds_addr;
  281 
  282                 if (batch == SFXGE_REFILL_BATCH) {
  283                         efx_rx_qpost(rxq->common, addr, mblksize, batch,
  284                             rxq->completed, rxq->added);
  285                         rxq->added += batch;
  286                         batch = 0;
  287                 }
  288         }
  289 
  290         if (ntodo != 0)
  291                 sfxge_rx_schedule_refill(rxq, retrying);
  292 
  293         if (batch != 0) {
  294                 efx_rx_qpost(rxq->common, addr, mblksize, batch,
  295                     rxq->completed, rxq->added);
  296                 rxq->added += batch;
  297         }
  298 
  299         /* Make the descriptors visible to the hardware */
  300         bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
  301                         BUS_DMASYNC_PREWRITE);
  302 
  303         efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
  304 
  305         /* The queue could still be empty if no descriptors were actually
  306          * pushed, in which case there will be no event to cause the next
  307          * refill, so we must schedule a refill ourselves.
  308          */
  309         if (rxq->pushed == rxq->completed) {
  310                 sfxge_rx_schedule_refill(rxq, retrying);
  311         }
  312 }
  313 
  314 void
  315 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
  316 {
  317 
  318         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
  319                 return;
  320 
  321         /* Make sure the queue is full */
  322         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
  323 }
  324 
  325 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
  326 {
  327         struct ifnet *ifp = sc->ifnet;
  328 
  329         m->m_pkthdr.rcvif = ifp;
  330         m->m_pkthdr.csum_data = 0xffff;
  331         ifp->if_input(ifp, m);
  332 }
  333 
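      /* Convert the hardware flags of a received packet into mbuf checksum
       * and RSS hash metadata, strip the RX prefix and pass the packet up
       * the stack.
       */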
  334 static void
  335 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
  336 {
  337         struct sfxge_softc *sc = rxq->sc;
  338         struct mbuf *m = rx_desc->mbuf;
  339         int flags = rx_desc->flags;
  340         int csum_flags;
  341 
  342         /* Convert checksum flags */
  343         csum_flags = (flags & EFX_CKSUM_IPV4) ?
  344                 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
  345         if (flags & EFX_CKSUM_TCPUDP)
  346                 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
  347 
  348         if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
  349                 m->m_pkthdr.flowid =
  350                         efx_pseudo_hdr_hash_get(rxq->common,
  351                                                 EFX_RX_HASHALG_TOEPLITZ,
  352                                                 mtod(m, uint8_t *));
  353                 /* The hash covers a 4-tuple for TCP only */
  354                 M_HASHTYPE_SET(m,
  355                     (flags & EFX_PKT_IPV4) ?
  356                         ((flags & EFX_PKT_TCP) ?
  357                             M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
  358                         ((flags & EFX_PKT_TCP) ?
  359                             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
  360         }
  361         m->m_data += sc->rx_prefix_size;
  362         m->m_len = rx_desc->size - sc->rx_prefix_size;
  363         m->m_pkthdr.len = m->m_len;
  364         m->m_pkthdr.csum_flags = csum_flags;
  365         __sfxge_rx_deliver(sc, rx_desc->mbuf);
  366 
  367         rx_desc->flags = EFX_DISCARD;
  368         rx_desc->mbuf = NULL;
  369 }
  370 
  371 #ifdef SFXGE_LRO
  372 
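      /* Deliver a coalesced packet: restore the IP length field (and the
       * IPv4 header checksum), copy the latest TCP window, ACK and options
       * from the last merged segment, then hand the mbuf chain to the
       * stack.
       */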
  373 static void
  374 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
  375 {
  376         struct sfxge_softc *sc = st->sc;
  377         struct mbuf *m = c->mbuf;
  378         struct tcphdr *c_th;
  379         int csum_flags;
  380 
  381         KASSERT(m, ("no mbuf to deliver"));
  382 
  383         ++st->n_bursts;
  384 
  385         /* Finish off packet munging and recalculate IP header checksum. */
  386         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
  387                 struct ip *iph = c->nh;
  388                 iph->ip_len = htons(iph->ip_len);
  389                 iph->ip_sum = 0;
  390                 iph->ip_sum = in_cksum_hdr(iph);
  391                 c_th = (struct tcphdr *)(iph + 1);
  392                 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
  393                               CSUM_IP_CHECKED | CSUM_IP_VALID);
  394         } else {
  395                 struct ip6_hdr *iph = c->nh;
  396                 iph->ip6_plen = htons(iph->ip6_plen);
  397                 c_th = (struct tcphdr *)(iph + 1);
  398                 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
  399         }
  400 
  401         c_th->th_win = c->th_last->th_win;
  402         c_th->th_ack = c->th_last->th_ack;
  403         if (c_th->th_off == c->th_last->th_off) {
  404                 /* Copy TCP options (take care to avoid going negative). */
  405                 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
  406                 memcpy(c_th + 1, c->th_last + 1, optlen);
  407         }
  408 
  409         m->m_pkthdr.flowid = c->conn_hash;
  410         M_HASHTYPE_SET(m,
  411             SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
  412                 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
  413 
  414         m->m_pkthdr.csum_flags = csum_flags;
  415         __sfxge_rx_deliver(sc, m);
  416 
  417         c->mbuf = NULL;
  418         c->delivered = 1;
  419 }
  420 
  421 /* Drop the given connection, and add it to the free list. */
  422 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
  423 {
  424         unsigned bucket;
  425 
  426         KASSERT(!c->mbuf, ("found orphaned mbuf"));
  427 
  428         if (c->next_buf.mbuf != NULL) {
  429                 sfxge_rx_deliver(rxq, &c->next_buf);
  430                 LIST_REMOVE(c, active_link);
  431         }
  432 
  433         bucket = c->conn_hash & rxq->lro.conns_mask;
  434         KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
  435         --rxq->lro.conns_n[bucket];
  436         TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
  437         TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
  438 }
  439 
  440 /* Stop tracking connections that have gone idle in order to keep hash
  441  * chains short.
  442  */
  443 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
  444 {
  445         struct sfxge_lro_conn *c;
  446         unsigned i;
  447 
  448         KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
  449                 ("found active connections"));
  450 
  451         rxq->lro.last_purge_ticks = now;
  452         for (i = 0; i <= rxq->lro.conns_mask; ++i) {
  453                 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
  454                         continue;
  455 
  456                 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
  457                 if (now - c->last_pkt_ticks > lro_idle_ticks) {
  458                         ++rxq->lro.n_drop_idle;
  459                         sfxge_lro_drop(rxq, c);
  460                 }
  461         }
  462 }
  463 
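      /* Append an in-order segment to the connection's mbuf chain and
       * update the coalesced packet's IP length and TCP flags.  Deliver
       * early if another segment could overflow the 16-bit IP length.
       */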
  464 static void
  465 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
  466                 struct mbuf *mbuf, struct tcphdr *th)
  467 {
  468         struct tcphdr *c_th;
  469 
  470         /* Tack the new mbuf onto the chain. */
  471         KASSERT(!mbuf->m_next, ("mbuf already chained"));
  472         c->mbuf_tail->m_next = mbuf;
  473         c->mbuf_tail = mbuf;
  474 
  475         /* Increase length appropriately */
  476         c->mbuf->m_pkthdr.len += mbuf->m_len;
  477 
  478         /* Update the connection state flags */
  479         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
  480                 struct ip *iph = c->nh;
  481                 iph->ip_len += mbuf->m_len;
  482                 c_th = (struct tcphdr *)(iph + 1);
  483         } else {
  484                 struct ip6_hdr *iph = c->nh;
  485                 iph->ip6_plen += mbuf->m_len;
  486                 c_th = (struct tcphdr *)(iph + 1);
  487         }
  488         c_th->th_flags |= (th->th_flags & TH_PUSH);
  489         c->th_last = th;
  490         ++st->n_merges;
  491 
  492         /* Pass packet up now if another segment could overflow the IP
  493          * length.
  494          */
  495         if (c->mbuf->m_pkthdr.len > 65536 - 9200)
  496                 sfxge_lro_deliver(st, c);
  497 }
  498 
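      /* Start a new coalesced packet for this connection, converting the
       * IP length field to host byte order so that merged segment lengths
       * can be accumulated directly.
       */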
  499 static void
  500 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
  501                 struct mbuf *mbuf, void *nh, struct tcphdr *th)
  502 {
  503         /* Start the chain */
  504         c->mbuf = mbuf;
  505         c->mbuf_tail = c->mbuf;
  506         c->nh = nh;
  507         c->th_last = th;
  508 
  509         mbuf->m_pkthdr.len = mbuf->m_len;
  510 
  511         /* Mangle header fields for later processing */
  512         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
  513                 struct ip *iph = nh;
  514                 iph->ip_len = ntohs(iph->ip_len);
  515         } else {
  516                 struct ip6_hdr *iph = nh;
  517                 iph->ip6_plen = ntohs(iph->ip6_plen);
  518         }
  519 }
  520 
  521 /* Try to merge or otherwise hold or deliver (as appropriate) the
  522  * packet buffered for this connection (c->next_buf).  Return a flag
  523  * indicating whether the connection is still active for LRO purposes.
  524  */
  525 static int
  526 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
  527 {
  528         struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
  529         char *eh = c->next_eh;
  530         int data_length, hdr_length, dont_merge;
  531         unsigned th_seq, pkt_length;
  532         struct tcphdr *th;
  533         unsigned now;
  534 
  535         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
  536                 struct ip *iph = c->next_nh;
  537                 th = (struct tcphdr *)(iph + 1);
  538                 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
  539         } else {
  540                 struct ip6_hdr *iph = c->next_nh;
  541                 th = (struct tcphdr *)(iph + 1);
  542                 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
  543         }
  544 
  545         hdr_length = (char *) th + th->th_off * 4 - eh;
  546         data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
  547                        hdr_length);
  548         th_seq = ntohl(th->th_seq);
  549         dont_merge = ((data_length <= 0)
  550                       | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
  551 
  552         /* Check for options other than aligned timestamp. */
  553         if (th->th_off != 5) {
  554                 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
  555                 if (th->th_off == 8 &&
  556                     opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
  557                                         (TCPOPT_NOP << 16) |
  558                                         (TCPOPT_TIMESTAMP << 8) |
  559                                         TCPOLEN_TIMESTAMP)) {
  560                         /* timestamp option -- okay */
  561                 } else {
  562                         dont_merge = 1;
  563                 }
  564         }
  565 
  566         if (__predict_false(th_seq != c->next_seq)) {
  567                 /* Out-of-order, so start counting again. */
  568                 if (c->mbuf != NULL)
  569                         sfxge_lro_deliver(&rxq->lro, c);
  570                 c->n_in_order_pkts -= lro_loss_packets;
  571                 c->next_seq = th_seq + data_length;
  572                 ++rxq->lro.n_misorder;
  573                 goto deliver_buf_out;
  574         }
  575         c->next_seq = th_seq + data_length;
  576 
  577         now = ticks;
  578         if (now - c->last_pkt_ticks > lro_idle_ticks) {
  579                 ++rxq->lro.n_drop_idle;
  580                 if (c->mbuf != NULL)
  581                         sfxge_lro_deliver(&rxq->lro, c);
  582                 sfxge_lro_drop(rxq, c);
  583                 return (0);
  584         }
  585         c->last_pkt_ticks = ticks;
  586 
  587         if (c->n_in_order_pkts < lro_slow_start_packets) {
  588                 /* May be in slow-start, so don't merge. */
  589                 ++rxq->lro.n_slow_start;
  590                 ++c->n_in_order_pkts;
  591                 goto deliver_buf_out;
  592         }
  593 
  594         if (__predict_false(dont_merge)) {
  595                 if (c->mbuf != NULL)
  596                         sfxge_lro_deliver(&rxq->lro, c);
  597                 if (th->th_flags & (TH_FIN | TH_RST)) {
  598                         ++rxq->lro.n_drop_closed;
  599                         sfxge_lro_drop(rxq, c);
  600                         return (0);
  601                 }
  602                 goto deliver_buf_out;
  603         }
  604 
  605         rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
  606 
  607         if (__predict_true(c->mbuf != NULL)) {
  608                 /* Remove headers and any padding */
  609                 rx_buf->mbuf->m_data += hdr_length;
  610                 rx_buf->mbuf->m_len = data_length;
  611 
  612                 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
  613         } else {
  614                 /* Remove any padding */
  615                 rx_buf->mbuf->m_len = pkt_length;
  616 
  617                 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
  618         }
  619 
  620         rx_buf->mbuf = NULL;
  621         return (1);
  622 
  623  deliver_buf_out:
  624         sfxge_rx_deliver(rxq, rx_buf);
  625         return (1);
  626 }
  627 
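      /* Start tracking a new connection, reusing an entry from the free
       * list when possible.  The connection is not added if its hash
       * bucket already holds lro_chain_max entries.
       */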
  628 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
  629                                uint16_t l2_id, void *nh, struct tcphdr *th)
  630 {
  631         unsigned bucket = conn_hash & st->conns_mask;
  632         struct sfxge_lro_conn *c;
  633 
  634         if (st->conns_n[bucket] >= lro_chain_max) {
  635                 ++st->n_too_many;
  636                 return;
  637         }
  638 
  639         if (!TAILQ_EMPTY(&st->free_conns)) {
  640                 c = TAILQ_FIRST(&st->free_conns);
  641                 TAILQ_REMOVE(&st->free_conns, c, link);
  642         } else {
  643                 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
  644                 if (c == NULL)
  645                         return;
  646                 c->mbuf = NULL;
  647                 c->next_buf.mbuf = NULL;
  648         }
  649 
  650         /* Create the connection tracking data */
  651         ++st->conns_n[bucket];
  652         TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
  653         c->l2_id = l2_id;
  654         c->conn_hash = conn_hash;
  655         c->source = th->th_sport;
  656         c->dest = th->th_dport;
  657         c->n_in_order_pkts = 0;
  658         c->last_pkt_ticks = *(volatile int *)&ticks;
  659         c->delivered = 0;
  660         ++st->n_new_stream;
  661         /* NB. We don't initialise c->next_seq, and it doesn't matter what
  662          * value it has.  Most likely the next packet received for this
  663          * connection will not match -- no harm done.
  664          */
  665 }
  666 
  667 /* Process mbuf and decide whether to dispatch it to the stack now or
  668  * later.
  669  */
  670 static void
  671 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
  672 {
  673         struct sfxge_softc *sc = rxq->sc;
  674         struct mbuf *m = rx_buf->mbuf;
  675         struct ether_header *eh;
  676         struct sfxge_lro_conn *c;
  677         uint16_t l2_id;
  678         uint16_t l3_proto;
  679         void *nh;
  680         struct tcphdr *th;
  681         uint32_t conn_hash;
  682         unsigned bucket;
  683 
  684         /* Get the hardware hash */
  685         conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
  686                                             EFX_RX_HASHALG_TOEPLITZ,
  687                                             mtod(m, uint8_t *));
  688 
  689         eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
  690         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
  691                 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
  692                 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
  693                         SFXGE_LRO_L2_ID_VLAN;
  694                 l3_proto = veh->evl_proto;
  695                 nh = veh + 1;
  696         } else {
  697                 l2_id = 0;
  698                 l3_proto = eh->ether_type;
  699                 nh = eh + 1;
  700         }
  701 
  702         /* Check whether this is a suitable packet (unfragmented
  703          * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
  704          * length, and compute a hash if necessary.  If not, return.
  705          */
  706         if (l3_proto == htons(ETHERTYPE_IP)) {
  707                 struct ip *iph = nh;
  708 
  709                 KASSERT(iph->ip_p == IPPROTO_TCP,
  710                     ("IPv4 protocol is not TCP, but packet marker is set"));
  711                 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
  712                     (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
  713                         goto deliver_now;
  714                 th = (struct tcphdr *)(iph + 1);
  715         } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
  716                 struct ip6_hdr *iph = nh;
  717 
  718                 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
  719                     ("IPv6 next header is not TCP, but packet marker is set"));
  720                 l2_id |= SFXGE_LRO_L2_ID_IPV6;
  721                 th = (struct tcphdr *)(iph + 1);
  722         } else {
  723                 goto deliver_now;
  724         }
  725 
  726         bucket = conn_hash & rxq->lro.conns_mask;
  727 
  728         TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
  729                 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
  730                         continue;
  731                 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
  732                         continue;
  733                 if (c->mbuf != NULL) {
  734                         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
  735                                 struct ip *c_iph, *iph = nh;
  736                                 c_iph = c->nh;
  737                                 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
  738                                     (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
  739                                         continue;
  740                         } else {
  741                                 struct ip6_hdr *c_iph, *iph = nh;
  742                                 c_iph = c->nh;
  743                                 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
  744                                     ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
  745                                         continue;
  746                         }
  747                 }
  748 
  749                 /* Re-insert at head of list to reduce lookup time. */
  750                 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
  751                 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
  752 
  753                 if (c->next_buf.mbuf != NULL) {
  754                         if (!sfxge_lro_try_merge(rxq, c))
  755                                 goto deliver_now;
  756                 } else {
  757                         LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
  758                             active_link);
  759                 }
  760                 c->next_buf = *rx_buf;
  761                 c->next_eh = eh;
  762                 c->next_nh = nh;
  763 
  764                 rx_buf->mbuf = NULL;
  765                 rx_buf->flags = EFX_DISCARD;
  766                 return;
  767         }
  768 
  769         sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
  770  deliver_now:
  771         sfxge_rx_deliver(rxq, rx_buf);
  772 }
  773 
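      /* At the end of an event burst, process the buffered packet of each
       * active connection, deliver anything still held, and purge idle
       * connections when the tick count has advanced since the last purge.
       */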
  774 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
  775 {
  776         struct sfxge_lro_state *st = &rxq->lro;
  777         struct sfxge_lro_conn *c;
  778         unsigned t;
  779 
  780         while (!LIST_EMPTY(&st->active_conns)) {
  781                 c = LIST_FIRST(&st->active_conns);
  782                 if (!c->delivered && c->mbuf != NULL)
  783                         sfxge_lro_deliver(st, c);
  784                 if (sfxge_lro_try_merge(rxq, c)) {
  785                         if (c->mbuf != NULL)
  786                                 sfxge_lro_deliver(st, c);
  787                         LIST_REMOVE(c, active_link);
  788                 }
  789                 c->delivered = 0;
  790         }
  791 
  792         t = *(volatile int *)&ticks;
  793         if (__predict_false(t != st->last_purge_ticks))
  794                 sfxge_lro_purge_idle(rxq, t);
  795 }
  796 
  797 #else   /* !SFXGE_LRO */
  798 
  799 static void
  800 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
  801 {
  802 }
  803 
  804 static void
  805 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
  806 {
  807 }
  808 
  809 #endif  /* SFXGE_LRO */
  810 
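      /* Process descriptors completed by the hardware: discard errored and
       * loopback packets, adjust checksum flags according to the interface
       * capabilities, and hand packets to LRO or directly to the stack.
       * Delivery runs one packet behind the loop so that the next packet's
       * data can be prefetched.  Finally, refill the queue if the fill
       * level has dropped below the refill threshold.
       */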
  811 void
  812 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
  813 {
  814         struct sfxge_softc *sc = rxq->sc;
  815         int if_capenable = sc->ifnet->if_capenable;
  816         int lro_enabled = if_capenable & IFCAP_LRO;
  817         unsigned int index;
  818         struct sfxge_evq *evq __diagused;
  819         unsigned int completed;
  820         unsigned int level;
  821         struct mbuf *m;
  822         struct sfxge_rx_sw_desc *prev = NULL;
  823 
  824         index = rxq->index;
  825         evq = sc->evq[index];
  826 
  827         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
  828 
  829         completed = rxq->completed;
  830         while (completed != rxq->pending) {
  831                 unsigned int id;
  832                 struct sfxge_rx_sw_desc *rx_desc;
  833 
  834                 id = completed++ & rxq->ptr_mask;
  835                 rx_desc = &rxq->queue[id];
  836                 m = rx_desc->mbuf;
  837 
  838                 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
  839                         goto discard;
  840 
  841                 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
  842                         goto discard;
  843 
  844                 /* Read the length from the pseudo header if required */
  845                 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
  846                         uint16_t tmp_size;
  847                         int rc __diagused;
  848 
  849                         rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
  850                                                            mtod(m, uint8_t *),
  851                                                            &tmp_size);
  852                         KASSERT(rc == 0, ("cannot get packet length: %d", rc));
  853                         rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
  854                 }
  855 
  856                 prefetch_read_many(mtod(m, caddr_t));
  857 
  858                 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
  859                 case EFX_PKT_IPV4:
  860                         if (~if_capenable & IFCAP_RXCSUM)
  861                                 rx_desc->flags &=
  862                                     ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
  863                         break;
  864                 case EFX_PKT_IPV6:
  865                         if (~if_capenable & IFCAP_RXCSUM_IPV6)
  866                                 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
  867                         break;
  868                 case 0:
  869                         /* Check for loopback packets */
  870                         {
  871                                 struct ether_header *etherhp;
  872 
  873                                 /*LINTED*/
  874                                 etherhp = mtod(m, struct ether_header *);
  875 
  876                                 if (etherhp->ether_type ==
  877                                     htons(SFXGE_ETHERTYPE_LOOPBACK)) {
  878                                         EFSYS_PROBE(loopback);
  879 
  880                                         rxq->loopback++;
  881                                         goto discard;
  882                                 }
  883                         }
  884                         break;
  885                 default:
  886                         KASSERT(B_FALSE,
  887                             ("Rx descriptor with both IPv4 and IPv6 flags"));
  888                         goto discard;
  889                 }
  890 
  891                 /* Pass packet up the stack or into LRO (pipelined) */
  892                 if (prev != NULL) {
  893                         if (lro_enabled &&
  894                             ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
  895                              (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
  896                                 sfxge_lro(rxq, prev);
  897                         else
  898                                 sfxge_rx_deliver(rxq, prev);
  899                 }
  900                 prev = rx_desc;
  901                 continue;
  902 
  903 discard:
  904                 /* Return the packet to the pool */
  905                 m_free(m);
  906                 rx_desc->mbuf = NULL;
  907         }
  908         rxq->completed = completed;
  909 
  910         level = rxq->added - rxq->completed;
  911 
  912         /* Pass last packet up the stack or into LRO */
  913         if (prev != NULL) {
  914                 if (lro_enabled &&
  915                     ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
  916                      (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
  917                         sfxge_lro(rxq, prev);
  918                 else
  919                         sfxge_rx_deliver(rxq, prev);
  920         }
  921 
  922         /*
  923          * If there are any pending flows and this is the end of the
  924          * poll then they must be completed.
  925          */
  926         if (eop)
  927                 sfxge_lro_end_of_burst(rxq);
  928 
  929         /* Top up the queue if necessary */
  930         if (level < rxq->refill_threshold)
  931                 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
  932 }
  933 
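      /* Stop an RX queue: flush it through the common code (retrying up to
       * three times), complete any outstanding descriptors, and destroy
       * the common-code queue and its buffer table entries.
       */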
  934 static void
  935 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
  936 {
  937         struct sfxge_rxq *rxq;
  938         struct sfxge_evq *evq;
  939         unsigned int count;
  940         unsigned int retry = 3;
  941 
  942         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
  943 
  944         rxq = sc->rxq[index];
  945         evq = sc->evq[index];
  946 
  947         SFXGE_EVQ_LOCK(evq);
  948 
  949         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
  950             ("rxq not started"));
  951 
  952         rxq->init_state = SFXGE_RXQ_INITIALIZED;
  953 
  954         callout_stop(&rxq->refill_callout);
  955 
  956         while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
  957                 rxq->flush_state = SFXGE_FLUSH_PENDING;
  958 
  959                 SFXGE_EVQ_UNLOCK(evq);
  960 
  961                 /* Flush the receive queue */
  962                 if (efx_rx_qflush(rxq->common) != 0) {
  963                         SFXGE_EVQ_LOCK(evq);
  964                         rxq->flush_state = SFXGE_FLUSH_FAILED;
  965                         break;
  966                 }
  967 
  968                 count = 0;
  969                 do {
  970                         /* Spin for 100 ms */
  971                         DELAY(100000);
  972 
  973                         if (rxq->flush_state != SFXGE_FLUSH_PENDING)
  974                                 break;
  975 
  976                 } while (++count < 20);
  977 
  978                 SFXGE_EVQ_LOCK(evq);
  979 
  980                 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
  981                         /* Flush timeout - neither done nor failed */
  982                         log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
  983                             device_get_nameunit(sc->dev), index);
  984                         rxq->flush_state = SFXGE_FLUSH_DONE;
  985                 }
  986                 retry--;
  987         }
  988         if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
  989                 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
  990                     device_get_nameunit(sc->dev), index);
  991                 rxq->flush_state = SFXGE_FLUSH_DONE;
  992         }
  993 
  994         rxq->pending = rxq->added;
  995         sfxge_rx_qcomplete(rxq, B_TRUE);
  996 
  997         KASSERT(rxq->completed == rxq->pending,
  998             ("rxq->completed != rxq->pending"));
  999 
 1000         rxq->added = 0;
 1001         rxq->pushed = 0;
 1002         rxq->pending = 0;
 1003         rxq->completed = 0;
 1004         rxq->loopback = 0;
 1005 
 1006         /* Destroy the common code receive queue. */
 1007         efx_rx_qdestroy(rxq->common);
 1008 
 1009         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
 1010             EFX_RXQ_NBUFS(sc->rxq_entries));
 1011 
 1012         SFXGE_EVQ_UNLOCK(evq);
 1013 }
 1014 
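      /* Start an RX queue: program the buffer table, create and enable the
       * common-code receive queue, and perform the initial fill.
       */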
 1015 static int
 1016 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
 1017 {
 1018         struct sfxge_rxq *rxq;
 1019         efsys_mem_t *esmp;
 1020         struct sfxge_evq *evq;
 1021         int rc;
 1022 
 1023         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
 1024 
 1025         rxq = sc->rxq[index];
 1026         esmp = &rxq->mem;
 1027         evq = sc->evq[index];
 1028 
 1029         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
 1030             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
 1031         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
 1032             ("evq->init_state != SFXGE_EVQ_STARTED"));
 1033 
 1034         /* Program the buffer table. */
 1035         if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
 1036             EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
 1037                 return (rc);
 1038 
 1039         /* Create the common code receive queue. */
 1040         if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
 1041             esmp, sc->rxq_entries, rxq->buf_base_id, EFX_RXQ_FLAG_NONE,
 1042             evq->common, &rxq->common)) != 0)
 1043                 goto fail;
 1044 
 1045         SFXGE_EVQ_LOCK(evq);
 1046 
 1047         /* Enable the receive queue. */
 1048         efx_rx_qenable(rxq->common);
 1049 
 1050         rxq->init_state = SFXGE_RXQ_STARTED;
 1051         rxq->flush_state = SFXGE_FLUSH_REQUIRED;
 1052 
 1053         /* Try to fill the queue from the pool. */
 1054         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
 1055 
 1056         SFXGE_EVQ_UNLOCK(evq);
 1057 
 1058         return (0);
 1059 
 1060 fail:
 1061         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
 1062             EFX_RXQ_NBUFS(sc->rxq_entries));
 1063         return (rc);
 1064 }
 1065 
 1066 void
 1067 sfxge_rx_stop(struct sfxge_softc *sc)
 1068 {
 1069         int index;
 1070 
 1071         efx_mac_filter_default_rxq_clear(sc->enp);
 1072 
 1073         /* Stop the receive queue(s) */
 1074         index = sc->rxq_count;
 1075         while (--index >= 0)
 1076                 sfxge_rx_qstop(sc, index);
 1077 
 1078         sc->rx_prefix_size = 0;
 1079         sc->rx_buffer_size = 0;
 1080 
 1081         efx_rx_fini(sc->enp);
 1082 }
 1083 
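      /* Start the receive path: initialize the common-code RX module,
       * compute buffer sizes and alignment, program the RSS indirection
       * table and hash key, start each RX queue and install queue 0 as the
       * default MAC filter destination.
       */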
 1084 int
 1085 sfxge_rx_start(struct sfxge_softc *sc)
 1086 {
 1087         const efx_nic_cfg_t *encp;
 1088         size_t hdrlen, align, reserved;
 1089         int index;
 1090         int rc;
 1091 
 1092         /* Initialize the common code receive module. */
 1093         if ((rc = efx_rx_init(sc->enp)) != 0)
 1094                 return (rc);
 1095 
 1096         encp = efx_nic_cfg_get(sc->enp);
 1097         sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
 1098 
 1099         /* Calculate the receive packet buffer size. */
 1100         sc->rx_prefix_size = encp->enc_rx_prefix_size;
 1101 
 1102         /* Ensure IP headers are 32bit aligned */
 1103         hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
 1104         sc->rx_buffer_align = EFX_P2ROUNDUP(size_t, hdrlen, 4) - hdrlen;
 1105 
 1106         sc->rx_buffer_size += sc->rx_buffer_align;
 1107 
 1108         /* Align end of packet buffer for RX DMA end padding */
 1109         align = MAX(1, encp->enc_rx_buf_align_end);
 1110         EFSYS_ASSERT(ISP2(align));
 1111         sc->rx_buffer_size = EFX_P2ROUNDUP(size_t, sc->rx_buffer_size, align);
 1112 
 1113         /*
 1114          * Standard mbuf zones only guarantee pointer-size alignment;
 1115          * we need extra space to align to the cache line
 1116          */
 1117         reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
 1118 
 1119         /* Select zone for packet buffers */
 1120         if (reserved <= MCLBYTES)
 1121                 sc->rx_cluster_size = MCLBYTES;
 1122         else if (reserved <= MJUMPAGESIZE)
 1123                 sc->rx_cluster_size = MJUMPAGESIZE;
 1124         else if (reserved <= MJUM9BYTES)
 1125                 sc->rx_cluster_size = MJUM9BYTES;
 1126         else
 1127                 sc->rx_cluster_size = MJUM16BYTES;
 1128 
 1129         /*
 1130          * Set up the scale table.  Enable all hash types and hash insertion.
 1131          */
 1132         for (index = 0; index < nitems(sc->rx_indir_table); index++)
 1133 #ifdef RSS
 1134                 sc->rx_indir_table[index] =
 1135                         rss_get_indirection_to_bucket(index) % sc->rxq_count;
 1136 #else
 1137                 sc->rx_indir_table[index] = index % sc->rxq_count;
 1138 #endif
 1139         if ((rc = efx_rx_scale_tbl_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
 1140                                        sc->rx_indir_table,
 1141                                        nitems(sc->rx_indir_table))) != 0)
 1142                 goto fail;
 1143         (void)efx_rx_scale_mode_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
 1144             EFX_RX_HASHALG_TOEPLITZ,
 1145             EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
 1146             EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
 1147 
 1148 #ifdef RSS
 1149         rss_getkey(toep_key);
 1150 #endif
 1151         if ((rc = efx_rx_scale_key_set(sc->enp, EFX_RSS_CONTEXT_DEFAULT,
 1152                                        toep_key,
 1153                                        sizeof(toep_key))) != 0)
 1154                 goto fail;
 1155 
 1156         /* Start the receive queue(s). */
 1157         for (index = 0; index < sc->rxq_count; index++) {
 1158                 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
 1159                         goto fail2;
 1160         }
 1161 
 1162         rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
 1163                                             sc->intr.n_alloc > 1);
 1164         if (rc != 0)
 1165                 goto fail3;
 1166 
 1167         return (0);
 1168 
 1169 fail3:
 1170 fail2:
 1171         while (--index >= 0)
 1172                 sfxge_rx_qstop(sc, index);
 1173 
 1174 fail:
 1175         efx_rx_fini(sc->enp);
 1176 
 1177         return (rc);
 1178 }
 1179 
 1180 #ifdef SFXGE_LRO
 1181 
 1182 static void sfxge_lro_init(struct sfxge_rxq *rxq)
 1183 {
 1184         struct sfxge_lro_state *st = &rxq->lro;
 1185         unsigned i;
 1186 
 1187         st->conns_mask = lro_table_size - 1;
 1188         KASSERT(!((st->conns_mask + 1) & st->conns_mask),
 1189                 ("lro_table_size must be a power of 2"));
 1190         st->sc = rxq->sc;
 1191         st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
 1192                            M_SFXGE, M_WAITOK);
 1193         st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
 1194                              M_SFXGE, M_WAITOK);
 1195         for (i = 0; i <= st->conns_mask; ++i) {
 1196                 TAILQ_INIT(&st->conns[i]);
 1197                 st->conns_n[i] = 0;
 1198         }
 1199         LIST_INIT(&st->active_conns);
 1200         TAILQ_INIT(&st->free_conns);
 1201 }
 1202 
 1203 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
 1204 {
 1205         struct sfxge_lro_state *st = &rxq->lro;
 1206         struct sfxge_lro_conn *c;
 1207         unsigned i;
 1208 
 1209         /* Return cleanly if sfxge_lro_init() has not been called. */
 1210         if (st->conns == NULL)
 1211                 return;
 1212 
 1213         KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
 1214 
 1215         for (i = 0; i <= st->conns_mask; ++i) {
 1216                 while (!TAILQ_EMPTY(&st->conns[i])) {
 1217                         c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
 1218                         sfxge_lro_drop(rxq, c);
 1219                 }
 1220         }
 1221 
 1222         while (!TAILQ_EMPTY(&st->free_conns)) {
 1223                 c = TAILQ_FIRST(&st->free_conns);
 1224                 TAILQ_REMOVE(&st->free_conns, c, link);
 1225                 KASSERT(!c->mbuf, ("found orphaned mbuf"));
 1226                 free(c, M_SFXGE);
 1227         }
 1228 
 1229         free(st->conns_n, M_SFXGE);
 1230         free(st->conns, M_SFXGE);
 1231         st->conns = NULL;
 1232 }
 1233 
 1234 #else
 1235 
 1236 static void
 1237 sfxge_lro_init(struct sfxge_rxq *rxq)
 1238 {
 1239 }
 1240 
 1241 static void
 1242 sfxge_lro_fini(struct sfxge_rxq *rxq)
 1243 {
 1244 }
 1245 
 1246 #endif  /* SFXGE_LRO */
 1247 
 1248 static void
 1249 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
 1250 {
 1251         struct sfxge_rxq *rxq;
 1252 
 1253         rxq = sc->rxq[index];
 1254 
 1255         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
 1256             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
 1257 
 1258         /* Free the context array and the flow table. */
 1259         free(rxq->queue, M_SFXGE);
 1260         sfxge_lro_fini(rxq);
 1261 
 1262         /* Release DMA memory. */
 1263         sfxge_dma_free(&rxq->mem);
 1264 
 1265         sc->rxq[index] = NULL;
 1266 
 1267         free(rxq, M_SFXGE);
 1268 }
 1269 
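      /* Allocate and initialize the software state for an RX queue:
       * descriptor DMA memory, buffer table entries, the software
       * descriptor array and the LRO state.
       */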
 1270 static int
 1271 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
 1272 {
 1273         struct sfxge_rxq *rxq;
 1274         efsys_mem_t *esmp;
 1275         int rc;
 1276 
 1277         KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
 1278 
 1279         rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
 1280         rxq->sc = sc;
 1281         rxq->index = index;
 1282         rxq->entries = sc->rxq_entries;
 1283         rxq->ptr_mask = rxq->entries - 1;
 1284         rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
 1285 
 1286         sc->rxq[index] = rxq;
 1287         esmp = &rxq->mem;
 1288 
 1289         /* Allocate and zero DMA space. */
 1290         if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
 1291                 return (rc);
 1292 
 1293         /* Allocate buffer table entries. */
 1294         sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
 1295                                  &rxq->buf_base_id);
 1296 
 1297         /* Allocate the context array and the flow table. */
 1298         rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
 1299             M_SFXGE, M_WAITOK | M_ZERO);
 1300         sfxge_lro_init(rxq);
 1301 
 1302         callout_init(&rxq->refill_callout, 1);
 1303 
 1304         rxq->init_state = SFXGE_RXQ_INITIALIZED;
 1305 
 1306         return (0);
 1307 }
 1308 
 1309 static const struct {
 1310         const char *name;
 1311         size_t offset;
 1312 } sfxge_rx_stats[] = {
 1313 #define SFXGE_RX_STAT(name, member) \
 1314         { #name, offsetof(struct sfxge_rxq, member) }
 1315 #ifdef SFXGE_LRO
 1316         SFXGE_RX_STAT(lro_merges, lro.n_merges),
 1317         SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
 1318         SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
 1319         SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
 1320         SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
 1321         SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
 1322         SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
 1323         SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
 1324 #endif
 1325 };
 1326 
 1327 static int
 1328 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
 1329 {
 1330         struct sfxge_softc *sc = arg1;
 1331         unsigned int id = arg2;
 1332         unsigned int sum, index;
 1333 
 1334         /* Sum across all RX queues */
 1335         sum = 0;
 1336         for (index = 0; index < sc->rxq_count; index++)
 1337                 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
 1338                                          sfxge_rx_stats[id].offset);
 1339 
 1340         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
 1341 }
 1342 
 1343 static void
 1344 sfxge_rx_stat_init(struct sfxge_softc *sc)
 1345 {
 1346         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
 1347         struct sysctl_oid_list *stat_list;
 1348         unsigned int id;
 1349 
 1350         stat_list = SYSCTL_CHILDREN(sc->stats_node);
 1351 
 1352         for (id = 0; id < nitems(sfxge_rx_stats); id++) {
 1353                 SYSCTL_ADD_PROC(ctx, stat_list, OID_AUTO,
 1354                     sfxge_rx_stats[id].name,
 1355                     CTLTYPE_UINT | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
 1356                     sc, id, sfxge_rx_stat_handler, "IU", "");
 1357         }
 1358 }
 1359 
 1360 void
 1361 sfxge_rx_fini(struct sfxge_softc *sc)
 1362 {
 1363         int index;
 1364 
 1365         index = sc->rxq_count;
 1366         while (--index >= 0)
 1367                 sfxge_rx_qfini(sc, index);
 1368 
 1369         sc->rxq_count = 0;
 1370 }
 1371 
 1372 int
 1373 sfxge_rx_init(struct sfxge_softc *sc)
 1374 {
 1375         struct sfxge_intr *intr;
 1376         int index;
 1377         int rc;
 1378 
 1379 #ifdef SFXGE_LRO
 1380         if (!ISP2(lro_table_size)) {
 1381                 log(LOG_ERR, "%s=%u must be a power of 2",
 1382                     SFXGE_LRO_PARAM(table_size), lro_table_size);
 1383                 rc = EINVAL;
 1384                 goto fail_lro_table_size;
 1385         }
 1386 
 1387         if (lro_idle_ticks == 0)
 1388                 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
 1389 #endif
 1390 
 1391         intr = &sc->intr;
 1392 
 1393         sc->rxq_count = intr->n_alloc;
 1394 
 1395         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
 1396             ("intr->state != SFXGE_INTR_INITIALIZED"));
 1397 
 1398         /* Initialize the receive queue(s) - one per interrupt. */
 1399         for (index = 0; index < sc->rxq_count; index++) {
 1400                 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
 1401                         goto fail;
 1402         }
 1403 
 1404         sfxge_rx_stat_init(sc);
 1405 
 1406         return (0);
 1407 
 1408 fail:
 1409         /* Tear down the receive queue(s). */
 1410         while (--index >= 0)
 1411                 sfxge_rx_qfini(sc, index);
 1412 
 1413         sc->rxq_count = 0;
 1414 
 1415 #ifdef SFXGE_LRO
 1416 fail_lro_table_size:
 1417 #endif
 1418         return (rc);
 1419 }
