FreeBSD/Linux Kernel Cross Reference
sys/dev/sfxge/sfxge_rx.c


    1 /*-
    2  * Copyright (c) 2010-2016 Solarflare Communications Inc.
    3  * All rights reserved.
    4  *
    5  * This software was developed in part by Philip Paeps under contract for
    6  * Solarflare Communications, Inc.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions are met:
   10  *
   11  * 1. Redistributions of source code must retain the above copyright notice,
   12  *    this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright notice,
   14  *    this list of conditions and the following disclaimer in the documentation
   15  *    and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
   18  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
   19  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   20  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
   21  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   22  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   23  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
   24  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
   25  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
   26  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
   27  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   28  *
   29  * The views and conclusions contained in the software and documentation are
   30  * those of the authors and should not be interpreted as representing official
   31  * policies, either expressed or implied, of the FreeBSD Project.
   32  */
   33 
   34 #include <sys/cdefs.h>
   35 __FBSDID("$FreeBSD: releng/11.2/sys/dev/sfxge/sfxge_rx.c 331722 2018-03-29 02:50:57Z eadler $");
   36 
   37 #include "opt_rss.h"
   38 
   39 #include <sys/param.h>
   40 #include <sys/malloc.h>
   41 #include <sys/mbuf.h>
   42 #include <sys/smp.h>
   43 #include <sys/socket.h>
   44 #include <sys/sysctl.h>
   45 #include <sys/syslog.h>
   46 #include <sys/limits.h>
   48 
   49 #include <net/ethernet.h>
   50 #include <net/if.h>
   51 #include <net/if_vlan_var.h>
   52 
   53 #include <netinet/in.h>
   54 #include <netinet/ip.h>
   55 #include <netinet/ip6.h>
   56 #include <netinet/tcp.h>
   57 
   58 #include <machine/in_cksum.h>
   59 
   60 #ifdef RSS
   61 #include <net/rss_config.h>
   62 #endif
   63 
   64 #include "common/efx.h"
   65 
   66 
   67 #include "sfxge.h"
   68 #include "sfxge_rx.h"
   69 
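      /* Refill threshold: the queue is topped up again once the number of
       * posted but not yet completed descriptors falls below 90% of the
       * queue limit (see the level check in sfxge_rx_qcomplete()).
       */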
   70 #define RX_REFILL_THRESHOLD(_entries)   (EFX_RXQ_LIMIT(_entries) * 9 / 10)
   71 
   72 #ifdef SFXGE_LRO
   73 
   74 SYSCTL_NODE(_hw_sfxge, OID_AUTO, lro, CTLFLAG_RD, NULL,
   75             "Large receive offload (LRO) parameters");
   76 
   77 #define SFXGE_LRO_PARAM(_param) SFXGE_PARAM(lro._param)
   78 
   79 /* Size of the LRO hash table.  Must be a power of 2.  A larger table
   80  * means we can accelerate a larger number of streams.
   81  */
   82 static unsigned lro_table_size = 128;
   83 TUNABLE_INT(SFXGE_LRO_PARAM(table_size), &lro_table_size);
   84 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, table_size, CTLFLAG_RDTUN,
   85             &lro_table_size, 0,
   86             "Size of the LRO hash table (must be a power of 2)");
   87 
   88 /* Maximum length of a hash chain.  If chains get too long then the lookup
   89  * cost increases and may outweigh the benefit of LRO.
   90  */
   91 static unsigned lro_chain_max = 20;
   92 TUNABLE_INT(SFXGE_LRO_PARAM(chain_max), &lro_chain_max);
   93 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, chain_max, CTLFLAG_RDTUN,
   94             &lro_chain_max, 0,
   95             "The maximum length of a hash chain");
   96 
   97 /* Maximum time (in ticks) that a connection can be idle before its LRO
   98  * state is discarded.
   99  */
  100 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
  101 TUNABLE_INT(SFXGE_LRO_PARAM(idle_ticks), &lro_idle_ticks);
  102 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, idle_ticks, CTLFLAG_RDTUN,
  103             &lro_idle_ticks, 0,
  104             "The maximum time (in ticks) that a connection can be idle "
  105             "before its LRO state is discarded");
  106 
  107 /* Number of packets with payload that must arrive in-order before a
  108  * connection is eligible for LRO.  The idea is we should avoid coalescing
  109  * segments when the sender is in slow-start because reducing the ACK rate
  110  * can damage performance.
  111  */
  112 static int lro_slow_start_packets = 2000;
  113 TUNABLE_INT(SFXGE_LRO_PARAM(slow_start_packets), &lro_slow_start_packets);
  114 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, slow_start_packets, CTLFLAG_RDTUN,
  115             &lro_slow_start_packets, 0,
  116             "Number of packets with payload that must arrive in-order before "
  117             "a connection is eligible for LRO");
  118 
  119 /* Number of packets with payload that must arrive in-order following loss
  120  * before a connection is eligible for LRO.  The idea is we should avoid
  121  * coalescing segments when the sender is recovering from loss, because
  122  * reducing the ACK rate can damage performance.
  123  */
  124 static int lro_loss_packets = 20;
  125 TUNABLE_INT(SFXGE_LRO_PARAM(loss_packets), &lro_loss_packets);
  126 SYSCTL_UINT(_hw_sfxge_lro, OID_AUTO, loss_packets, CTLFLAG_RDTUN,
  127             &lro_loss_packets, 0,
  128             "Number of packets with payload that must arrive in-order "
  129             "following loss before a connection is eligible for LRO");
  130 
  131 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
  132 #define SFXGE_LRO_L2_ID_VLAN 0x4000
  133 #define SFXGE_LRO_L2_ID_IPV6 0x8000
  134 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
  135 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
  136 
  137 /* Compare IPv6 addresses, avoiding conditional branches */
  138 static unsigned long ipv6_addr_cmp(const struct in6_addr *left,
  139                                    const struct in6_addr *right)
  140 {
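              /* The result is zero if and only if the two addresses are
               * equal; callers only test for non-zero, so the numeric value
               * of the difference is irrelevant.
               */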
  141 #if LONG_BIT == 64
  142         const uint64_t *left64 = (const uint64_t *)left;
  143         const uint64_t *right64 = (const uint64_t *)right;
  144         return (left64[0] - right64[0]) | (left64[1] - right64[1]);
  145 #else
  146         return (left->s6_addr32[0] - right->s6_addr32[0]) |
  147                (left->s6_addr32[1] - right->s6_addr32[1]) |
  148                (left->s6_addr32[2] - right->s6_addr32[2]) |
  149                (left->s6_addr32[3] - right->s6_addr32[3]);
  150 #endif
  151 }
  152 
  153 #endif  /* SFXGE_LRO */
  154 
  155 void
  156 sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
  157 {
  158 
  159         rxq->flush_state = SFXGE_FLUSH_DONE;
  160 }
  161 
  162 void
  163 sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
  164 {
  165 
  166         rxq->flush_state = SFXGE_FLUSH_FAILED;
  167 }
  168 
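      /* Toeplitz RSS hash key.  When the kernel is built with "options RSS"
       * the key is fetched from the kernel via rss_getkey() in
       * sfxge_rx_start(); otherwise the commonly used default Toeplitz key
       * below is used.
       */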
  169 #ifdef RSS
  170 static uint8_t toep_key[RSS_KEYSIZE];
  171 #else
  172 static uint8_t toep_key[] = {
  173         0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
  174         0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
  175         0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
  176         0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
  177         0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
  178 };
  179 #endif
  180 
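      /* Callout handler used to retry a queue refill.  Rather than refilling
       * directly, it posts a software event (SFXGE_SW_EV_RX_QREFILL), which
       * lets the refill itself run in event-queue context with the EVQ lock
       * held.
       */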
  181 static void
  182 sfxge_rx_post_refill(void *arg)
  183 {
  184         struct sfxge_rxq *rxq = arg;
  185         struct sfxge_softc *sc;
  186         unsigned int index;
  187         struct sfxge_evq *evq;
  188         uint16_t magic;
  189 
  190         sc = rxq->sc;
  191         index = rxq->index;
  192         evq = sc->evq[index];
  193         magic = sfxge_sw_ev_rxq_magic(SFXGE_SW_EV_RX_QREFILL, rxq);
  194 
  195         /* This is guaranteed due to the start/stop order of rx and ev */
  196         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
  197             ("evq not started"));
  198         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
  199             ("rxq not started"));
  200         efx_ev_qpost(evq->common, magic);
  201 }
  202 
  203 static void
  204 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
  205 {
  206         /* Initially retry after 100 ms, but back off in case of
  207          * repeated failures as we probably have to wait for the
  208          * administrator to raise the pool limit. */
  209         if (retrying)
  210                 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
  211         else
  212                 rxq->refill_delay = hz / 10;
  213 
  214         callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
  215                              sfxge_rx_post_refill, rxq);
  216 }
  217 
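      /* Number of buffer DMA addresses accumulated before each
       * efx_rx_qpost() call when refilling the receive queue.
       */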
  218 #define SFXGE_REFILL_BATCH  64
  219 
  220 static void
  221 sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
  222 {
  223         struct sfxge_softc *sc;
  224         unsigned int index;
  225         struct sfxge_evq *evq;
  226         unsigned int batch;
  227         unsigned int rxfill;
  228         unsigned int mblksize;
  229         int ntodo;
  230         efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];
  231 
  232         sc = rxq->sc;
  233         index = rxq->index;
  234         evq = sc->evq[index];
  235 
  236         prefetch_read_many(sc->enp);
  237         prefetch_read_many(rxq->common);
  238 
  239         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
  240 
  241         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
  242                 return;
  243 
  244         rxfill = rxq->added - rxq->completed;
  245         KASSERT(rxfill <= EFX_RXQ_LIMIT(rxq->entries),
  246             ("rxfill > EFX_RXQ_LIMIT(rxq->entries)"));
  247         ntodo = min(EFX_RXQ_LIMIT(rxq->entries) - rxfill, target);
  248         KASSERT(ntodo <= EFX_RXQ_LIMIT(rxq->entries),
  249             ("ntodo > EFX_RXQ_LIMIT(rxq->entries)"));
  250 
  251         if (ntodo == 0)
  252                 return;
  253 
  254         batch = 0;
  255         mblksize = sc->rx_buffer_size - sc->rx_buffer_align;
  256         while (ntodo-- > 0) {
  257                 unsigned int id;
  258                 struct sfxge_rx_sw_desc *rx_desc;
  259                 bus_dma_segment_t seg;
  260                 struct mbuf *m;
  261 
  262                 id = (rxq->added + batch) & rxq->ptr_mask;
  263                 rx_desc = &rxq->queue[id];
  264                 KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));
  265 
  266                 rx_desc->flags = EFX_DISCARD;
  267                 m = rx_desc->mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
  268                     sc->rx_cluster_size);
  269                 if (m == NULL)
  270                         break;
  271 
  272                 /* m_len specifies length of area to be mapped for DMA */
  273                 m->m_len  = mblksize;
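                      /* Align the start of the payload on a cache-line
                       * boundary, then advance by rx_buffer_align so that the
                       * IP header following the RX prefix and Ethernet header
                       * ends up 32-bit aligned (the offset is computed in
                       * sfxge_rx_start()).
                       */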
  274                 m->m_data = (caddr_t)P2ROUNDUP((uintptr_t)m->m_data, CACHE_LINE_SIZE);
  275                 m->m_data += sc->rx_buffer_align;
  276 
  277                 sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
  278                 addr[batch++] = seg.ds_addr;
  279 
  280                 if (batch == SFXGE_REFILL_BATCH) {
  281                         efx_rx_qpost(rxq->common, addr, mblksize, batch,
  282                             rxq->completed, rxq->added);
  283                         rxq->added += batch;
  284                         batch = 0;
  285                 }
  286         }
  287 
  288         if (ntodo != 0)
  289                 sfxge_rx_schedule_refill(rxq, retrying);
  290 
  291         if (batch != 0) {
  292                 efx_rx_qpost(rxq->common, addr, mblksize, batch,
  293                     rxq->completed, rxq->added);
  294                 rxq->added += batch;
  295         }
  296 
  297         /* Make the descriptors visible to the hardware */
  298         bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
  299                         BUS_DMASYNC_PREWRITE);
  300 
  301         efx_rx_qpush(rxq->common, rxq->added, &rxq->pushed);
  302 
  303         /* The queue could still be empty if no descriptors were actually
  304          * pushed, in which case there will be no event to cause the next
  305          * refill, so we must schedule a refill ourselves.
  306          */
  307         if (rxq->pushed == rxq->completed) {
  308                 sfxge_rx_schedule_refill(rxq, retrying);
  309         }
  310 }
  311 
  312 void
  313 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
  314 {
  315 
  316         if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
  317                 return;
  318 
  319         /* Make sure the queue is full */
  320         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_TRUE);
  321 }
  322 
  323 static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
  324 {
  325         struct ifnet *ifp = sc->ifnet;
  326 
  327         m->m_pkthdr.rcvif = ifp;
  328         m->m_pkthdr.csum_data = 0xffff;
  329         ifp->if_input(ifp, m);
  330 }
  331 
  332 static void
  333 sfxge_rx_deliver(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_desc)
  334 {
  335         struct sfxge_softc *sc = rxq->sc;
  336         struct mbuf *m = rx_desc->mbuf;
  337         int flags = rx_desc->flags;
  338         int csum_flags;
  339 
  340         /* Convert checksum flags */
  341         csum_flags = (flags & EFX_CKSUM_IPV4) ?
  342                 (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
  343         if (flags & EFX_CKSUM_TCPUDP)
  344                 csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
  345 
  346         if (flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
  347                 m->m_pkthdr.flowid =
  348                         efx_pseudo_hdr_hash_get(rxq->common,
  349                                                 EFX_RX_HASHALG_TOEPLITZ,
  350                                                 mtod(m, uint8_t *));
  351                 /* The hash covers a 4-tuple for TCP only */
  352                 M_HASHTYPE_SET(m,
  353                     (flags & EFX_PKT_IPV4) ?
  354                         ((flags & EFX_PKT_TCP) ?
  355                             M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_IPV4) :
  356                         ((flags & EFX_PKT_TCP) ?
  357                             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_IPV6));
  358         }
  359         m->m_data += sc->rx_prefix_size;
  360         m->m_len = rx_desc->size - sc->rx_prefix_size;
  361         m->m_pkthdr.len = m->m_len;
  362         m->m_pkthdr.csum_flags = csum_flags;
  363         __sfxge_rx_deliver(sc, rx_desc->mbuf);
  364 
  365         rx_desc->flags = EFX_DISCARD;
  366         rx_desc->mbuf = NULL;
  367 }
  368 
  369 #ifdef SFXGE_LRO
  370 
  371 static void
  372 sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
  373 {
  374         struct sfxge_softc *sc = st->sc;
  375         struct mbuf *m = c->mbuf;
  376         struct tcphdr *c_th;
  377         int csum_flags;
  378 
  379         KASSERT(m, ("no mbuf to deliver"));
  380 
  381         ++st->n_bursts;
  382 
  383         /* Finish off packet munging and recalculate IP header checksum. */
  384         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
  385                 struct ip *iph = c->nh;
  386                 iph->ip_len = htons(iph->ip_len);
  387                 iph->ip_sum = 0;
  388                 iph->ip_sum = in_cksum_hdr(iph);
  389                 c_th = (struct tcphdr *)(iph + 1);
  390                 csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
  391                               CSUM_IP_CHECKED | CSUM_IP_VALID);
  392         } else {
  393                 struct ip6_hdr *iph = c->nh;
  394                 iph->ip6_plen = htons(iph->ip6_plen);
  395                 c_th = (struct tcphdr *)(iph + 1);
  396                 csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
  397         }
  398 
  399         c_th->th_win = c->th_last->th_win;
  400         c_th->th_ack = c->th_last->th_ack;
  401         if (c_th->th_off == c->th_last->th_off) {
  402                 /* Copy TCP options (take care to avoid going negative). */
  403                 int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
  404                 memcpy(c_th + 1, c->th_last + 1, optlen);
  405         }
  406 
  407         m->m_pkthdr.flowid = c->conn_hash;
  408         M_HASHTYPE_SET(m,
  409             SFXGE_LRO_CONN_IS_TCPIPV4(c) ?
  410                 M_HASHTYPE_RSS_TCP_IPV4 : M_HASHTYPE_RSS_TCP_IPV6);
  411 
  412         m->m_pkthdr.csum_flags = csum_flags;
  413         __sfxge_rx_deliver(sc, m);
  414 
  415         c->mbuf = NULL;
  416         c->delivered = 1;
  417 }
  418 
  419 /* Drop the given connection, and add it to the free list. */
  420 static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
  421 {
  422         unsigned bucket;
  423 
  424         KASSERT(!c->mbuf, ("found orphaned mbuf"));
  425 
  426         if (c->next_buf.mbuf != NULL) {
  427                 sfxge_rx_deliver(rxq, &c->next_buf);
  428                 LIST_REMOVE(c, active_link);
  429         }
  430 
  431         bucket = c->conn_hash & rxq->lro.conns_mask;
  432         KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
  433         --rxq->lro.conns_n[bucket];
  434         TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
  435         TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
  436 }
  437 
  438 /* Stop tracking connections that have gone idle in order to keep hash
  439  * chains short.
  440  */
  441 static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
  442 {
  443         struct sfxge_lro_conn *c;
  444         unsigned i;
  445 
  446         KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
  447                 ("found active connections"));
  448 
  449         rxq->lro.last_purge_ticks = now;
  450         for (i = 0; i <= rxq->lro.conns_mask; ++i) {
  451                 if (TAILQ_EMPTY(&rxq->lro.conns[i]))
  452                         continue;
  453 
  454                 c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
  455                 if (now - c->last_pkt_ticks > lro_idle_ticks) {
  456                         ++rxq->lro.n_drop_idle;
  457                         sfxge_lro_drop(rxq, c);
  458                 }
  459         }
  460 }
  461 
  462 static void
  463 sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
  464                 struct mbuf *mbuf, struct tcphdr *th)
  465 {
  466         struct tcphdr *c_th;
  467 
  468         /* Tack the new mbuf onto the chain. */
  469         KASSERT(!mbuf->m_next, ("mbuf already chained"));
  470         c->mbuf_tail->m_next = mbuf;
  471         c->mbuf_tail = mbuf;
  472 
  473         /* Increase length appropriately */
  474         c->mbuf->m_pkthdr.len += mbuf->m_len;
  475 
  476         /* Update the connection state flags */
  477         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
  478                 struct ip *iph = c->nh;
  479                 iph->ip_len += mbuf->m_len;
  480                 c_th = (struct tcphdr *)(iph + 1);
  481         } else {
  482                 struct ip6_hdr *iph = c->nh;
  483                 iph->ip6_plen += mbuf->m_len;
  484                 c_th = (struct tcphdr *)(iph + 1);
  485         }
  486         c_th->th_flags |= (th->th_flags & TH_PUSH);
  487         c->th_last = th;
  488         ++st->n_merges;
  489 
  490         /* Pass packet up now if another segment could overflow the IP
  491          * length.
  492          */
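              /* The 9200-byte margin presumably leaves room for one more
               * jumbo-frame sized segment below the 16-bit IP total-length
               * limit.
               */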
  493         if (c->mbuf->m_pkthdr.len > 65536 - 9200)
  494                 sfxge_lro_deliver(st, c);
  495 }
  496 
  497 static void
  498 sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
  499                 struct mbuf *mbuf, void *nh, struct tcphdr *th)
  500 {
  501         /* Start the chain */
  502         c->mbuf = mbuf;
  503         c->mbuf_tail = c->mbuf;
  504         c->nh = nh;
  505         c->th_last = th;
  506 
  507         mbuf->m_pkthdr.len = mbuf->m_len;
  508 
  509         /* Mangle header fields for later processing */
  510         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
  511                 struct ip *iph = nh;
  512                 iph->ip_len = ntohs(iph->ip_len);
  513         } else {
  514                 struct ip6_hdr *iph = nh;
  515                 iph->ip6_plen = ntohs(iph->ip6_plen);
  516         }
  517 }
  518 
  519 /* Try to merge or otherwise hold or deliver (as appropriate) the
  520  * packet buffered for this connection (c->next_buf).  Return a flag
  521  * indicating whether the connection is still active for LRO purposes.
  522  */
  523 static int
  524 sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
  525 {
  526         struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
  527         char *eh = c->next_eh;
  528         int data_length, hdr_length, dont_merge;
  529         unsigned th_seq, pkt_length;
  530         struct tcphdr *th;
  531         unsigned now;
  532 
  533         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
  534                 struct ip *iph = c->next_nh;
  535                 th = (struct tcphdr *)(iph + 1);
  536                 pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
  537         } else {
  538                 struct ip6_hdr *iph = c->next_nh;
  539                 th = (struct tcphdr *)(iph + 1);
  540                 pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
  541         }
  542 
  543         hdr_length = (char *) th + th->th_off * 4 - eh;
  544         data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
  545                        hdr_length);
  546         th_seq = ntohl(th->th_seq);
  547         dont_merge = ((data_length <= 0)
  548                       | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));
  549 
  550         /* Check for options other than aligned timestamp. */
  551         if (th->th_off != 5) {
  552                 const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
  553                 if (th->th_off == 8 &&
  554                     opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
  555                                         (TCPOPT_NOP << 16) |
  556                                         (TCPOPT_TIMESTAMP << 8) |
  557                                         TCPOLEN_TIMESTAMP)) {
  558                         /* timestamp option -- okay */
  559                 } else {
  560                         dont_merge = 1;
  561                 }
  562         }
  563 
  564         if (__predict_false(th_seq != c->next_seq)) {
  565                 /* Out-of-order, so start counting again. */
  566                 if (c->mbuf != NULL)
  567                         sfxge_lro_deliver(&rxq->lro, c);
  568                 c->n_in_order_pkts -= lro_loss_packets;
  569                 c->next_seq = th_seq + data_length;
  570                 ++rxq->lro.n_misorder;
  571                 goto deliver_buf_out;
  572         }
  573         c->next_seq = th_seq + data_length;
  574 
  575         now = ticks;
  576         if (now - c->last_pkt_ticks > lro_idle_ticks) {
  577                 ++rxq->lro.n_drop_idle;
  578                 if (c->mbuf != NULL)
  579                         sfxge_lro_deliver(&rxq->lro, c);
  580                 sfxge_lro_drop(rxq, c);
  581                 return (0);
  582         }
  583         c->last_pkt_ticks = ticks;
  584 
  585         if (c->n_in_order_pkts < lro_slow_start_packets) {
  586                 /* May be in slow-start, so don't merge. */
  587                 ++rxq->lro.n_slow_start;
  588                 ++c->n_in_order_pkts;
  589                 goto deliver_buf_out;
  590         }
  591 
  592         if (__predict_false(dont_merge)) {
  593                 if (c->mbuf != NULL)
  594                         sfxge_lro_deliver(&rxq->lro, c);
  595                 if (th->th_flags & (TH_FIN | TH_RST)) {
  596                         ++rxq->lro.n_drop_closed;
  597                         sfxge_lro_drop(rxq, c);
  598                         return (0);
  599                 }
  600                 goto deliver_buf_out;
  601         }
  602 
  603         rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;
  604 
  605         if (__predict_true(c->mbuf != NULL)) {
  606                 /* Remove headers and any padding */
  607                 rx_buf->mbuf->m_data += hdr_length;
  608                 rx_buf->mbuf->m_len = data_length;
  609 
  610                 sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
  611         } else {
  612                 /* Remove any padding */
  613                 rx_buf->mbuf->m_len = pkt_length;
  614 
  615                 sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
  616         }
  617 
  618         rx_buf->mbuf = NULL;
  619         return (1);
  620 
  621  deliver_buf_out:
  622         sfxge_rx_deliver(rxq, rx_buf);
  623         return (1);
  624 }
  625 
  626 static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
  627                                uint16_t l2_id, void *nh, struct tcphdr *th)
  628 {
  629         unsigned bucket = conn_hash & st->conns_mask;
  630         struct sfxge_lro_conn *c;
  631 
  632         if (st->conns_n[bucket] >= lro_chain_max) {
  633                 ++st->n_too_many;
  634                 return;
  635         }
  636 
  637         if (!TAILQ_EMPTY(&st->free_conns)) {
  638                 c = TAILQ_FIRST(&st->free_conns);
  639                 TAILQ_REMOVE(&st->free_conns, c, link);
  640         } else {
  641                 c = malloc(sizeof(*c), M_SFXGE, M_NOWAIT);
  642                 if (c == NULL)
  643                         return;
  644                 c->mbuf = NULL;
  645                 c->next_buf.mbuf = NULL;
  646         }
  647 
  648         /* Create the connection tracking data */
  649         ++st->conns_n[bucket];
  650         TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
  651         c->l2_id = l2_id;
  652         c->conn_hash = conn_hash;
  653         c->source = th->th_sport;
  654         c->dest = th->th_dport;
  655         c->n_in_order_pkts = 0;
  656         c->last_pkt_ticks = *(volatile int *)&ticks;
  657         c->delivered = 0;
  658         ++st->n_new_stream;
  659         /* NB. We don't initialise c->next_seq, and it doesn't matter what
  660          * value it has.  Most likely the next packet received for this
  661          * connection will not match -- no harm done.
  662          */
  663 }
  664 
  665 /* Process mbuf and decide whether to dispatch it to the stack now or
  666  * later.
  667  */
  668 static void
  669 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
  670 {
  671         struct sfxge_softc *sc = rxq->sc;
  672         struct mbuf *m = rx_buf->mbuf;
  673         struct ether_header *eh;
  674         struct sfxge_lro_conn *c;
  675         uint16_t l2_id;
  676         uint16_t l3_proto;
  677         void *nh;
  678         struct tcphdr *th;
  679         uint32_t conn_hash;
  680         unsigned bucket;
  681 
  682         /* Get the hardware hash */
  683         conn_hash = efx_pseudo_hdr_hash_get(rxq->common,
  684                                             EFX_RX_HASHALG_TOEPLITZ,
  685                                             mtod(m, uint8_t *));
  686 
  687         eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
  688         if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
  689                 struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
  690                 l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
  691                         SFXGE_LRO_L2_ID_VLAN;
  692                 l3_proto = veh->evl_proto;
  693                 nh = veh + 1;
  694         } else {
  695                 l2_id = 0;
  696                 l3_proto = eh->ether_type;
  697                 nh = eh + 1;
  698         }
  699 
  700         /* Check whether this is a suitable packet (unfragmented
  701          * TCP/IPv4 or TCP/IPv6).  If so, find the TCP header and
  702          * length, and compute a hash if necessary.  If not, return.
  703          */
  704         if (l3_proto == htons(ETHERTYPE_IP)) {
  705                 struct ip *iph = nh;
  706 
  707                 KASSERT(iph->ip_p == IPPROTO_TCP,
  708                     ("IPv4 protocol is not TCP, but packet marker is set"));
  709                 if ((iph->ip_hl - (sizeof(*iph) >> 2u)) |
  710                     (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
  711                         goto deliver_now;
  712                 th = (struct tcphdr *)(iph + 1);
  713         } else if (l3_proto == htons(ETHERTYPE_IPV6)) {
  714                 struct ip6_hdr *iph = nh;
  715 
  716                 KASSERT(iph->ip6_nxt == IPPROTO_TCP,
  717                     ("IPv6 next header is not TCP, but packet marker is set"));
  718                 l2_id |= SFXGE_LRO_L2_ID_IPV6;
  719                 th = (struct tcphdr *)(iph + 1);
  720         } else {
  721                 goto deliver_now;
  722         }
  723 
  724         bucket = conn_hash & rxq->lro.conns_mask;
  725 
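              /* Look up the connection: match on L2 id, hardware hash and TCP
               * ports and, if coalescing is already in progress (c->mbuf is
               * set), on the IP addresses as well.  The subtract-and-OR
               * comparisons avoid conditional branches, as in ipv6_addr_cmp()
               * above.
               */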
  726         TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
  727                 if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
  728                         continue;
  729                 if ((c->source - th->th_sport) | (c->dest - th->th_dport))
  730                         continue;
  731                 if (c->mbuf != NULL) {
  732                         if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
  733                                 struct ip *c_iph, *iph = nh;
  734                                 c_iph = c->nh;
  735                                 if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
  736                                     (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
  737                                         continue;
  738                         } else {
  739                                 struct ip6_hdr *c_iph, *iph = nh;
  740                                 c_iph = c->nh;
  741                                 if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
  742                                     ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
  743                                         continue;
  744                         }
  745                 }
  746 
  747                 /* Re-insert at head of list to reduce lookup time. */
  748                 TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
  749                 TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);
  750 
  751                 if (c->next_buf.mbuf != NULL) {
  752                         if (!sfxge_lro_try_merge(rxq, c))
  753                                 goto deliver_now;
  754                 } else {
  755                         LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
  756                             active_link);
  757                 }
  758                 c->next_buf = *rx_buf;
  759                 c->next_eh = eh;
  760                 c->next_nh = nh;
  761 
  762                 rx_buf->mbuf = NULL;
  763                 rx_buf->flags = EFX_DISCARD;
  764                 return;
  765         }
  766 
  767         sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
  768  deliver_now:
  769         sfxge_rx_deliver(rxq, rx_buf);
  770 }
  771 
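      /* Called at the end of an event burst: deliver coalesced packets for
       * connections that were not passed up during the burst, try to merge
       * any held buffers, and periodically purge idle connections.
       */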
  772 static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
  773 {
  774         struct sfxge_lro_state *st = &rxq->lro;
  775         struct sfxge_lro_conn *c;
  776         unsigned t;
  777 
  778         while (!LIST_EMPTY(&st->active_conns)) {
  779                 c = LIST_FIRST(&st->active_conns);
  780                 if (!c->delivered && c->mbuf != NULL)
  781                         sfxge_lro_deliver(st, c);
  782                 if (sfxge_lro_try_merge(rxq, c)) {
  783                         if (c->mbuf != NULL)
  784                                 sfxge_lro_deliver(st, c);
  785                         LIST_REMOVE(c, active_link);
  786                 }
  787                 c->delivered = 0;
  788         }
  789 
  790         t = *(volatile int *)&ticks;
  791         if (__predict_false(t != st->last_purge_ticks))
  792                 sfxge_lro_purge_idle(rxq, t);
  793 }
  794 
  795 #else   /* !SFXGE_LRO */
  796 
  797 static void
  798 sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
  799 {
  800 }
  801 
  802 static void
  803 sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
  804 {
  805 }
  806 
  807 #endif  /* SFXGE_LRO */
  808 
  809 void
  810 sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
  811 {
  812         struct sfxge_softc *sc = rxq->sc;
  813         int if_capenable = sc->ifnet->if_capenable;
  814         int lro_enabled = if_capenable & IFCAP_LRO;
  815         unsigned int index;
  816         struct sfxge_evq *evq;
  817         unsigned int completed;
  818         unsigned int level;
  819         struct mbuf *m;
  820         struct sfxge_rx_sw_desc *prev = NULL;
  821 
  822         index = rxq->index;
  823         evq = sc->evq[index];
  824 
  825         SFXGE_EVQ_LOCK_ASSERT_OWNED(evq);
  826 
  827         completed = rxq->completed;
  828         while (completed != rxq->pending) {
  829                 unsigned int id;
  830                 struct sfxge_rx_sw_desc *rx_desc;
  831 
  832                 id = completed++ & rxq->ptr_mask;
  833                 rx_desc = &rxq->queue[id];
  834                 m = rx_desc->mbuf;
  835 
  836                 if (__predict_false(rxq->init_state != SFXGE_RXQ_STARTED))
  837                         goto discard;
  838 
  839                 if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
  840                         goto discard;
  841 
  842                 /* Read the length from the pseudo header if required */
  843                 if (rx_desc->flags & EFX_PKT_PREFIX_LEN) {
  844                         uint16_t tmp_size;
  845                         int rc;
  846                         rc = efx_pseudo_hdr_pkt_length_get(rxq->common,
  847                                                            mtod(m, uint8_t *),
  848                                                            &tmp_size);
  849                         KASSERT(rc == 0, ("cannot get packet length: %d", rc));
  850                         rx_desc->size = (int)tmp_size + sc->rx_prefix_size;
  851                 }
  852 
  853                 prefetch_read_many(mtod(m, caddr_t));
  854 
  855                 switch (rx_desc->flags & (EFX_PKT_IPV4 | EFX_PKT_IPV6)) {
  856                 case EFX_PKT_IPV4:
  857                         if (~if_capenable & IFCAP_RXCSUM)
  858                                 rx_desc->flags &=
  859                                     ~(EFX_CKSUM_IPV4 | EFX_CKSUM_TCPUDP);
  860                         break;
  861                 case EFX_PKT_IPV6:
  862                         if (~if_capenable & IFCAP_RXCSUM_IPV6)
  863                                 rx_desc->flags &= ~EFX_CKSUM_TCPUDP;
  864                         break;
  865                 case 0:
  866                         /* Check for loopback packets */
  867                         {
  868                                 struct ether_header *etherhp;
  869 
  870                                 /*LINTED*/
  871                                 etherhp = mtod(m, struct ether_header *);
  872 
  873                                 if (etherhp->ether_type ==
  874                                     htons(SFXGE_ETHERTYPE_LOOPBACK)) {
  875                                         EFSYS_PROBE(loopback);
  876 
  877                                         rxq->loopback++;
  878                                         goto discard;
  879                                 }
  880                         }
  881                         break;
  882                 default:
  883                         KASSERT(B_FALSE,
  884                             ("Rx descriptor with both IPv4 and IPv6 flags"));
  885                         goto discard;
  886                 }
  887 
  888                 /* Pass packet up the stack or into LRO (pipelined) */
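                      /* Delivery is deferred by one descriptor ("prev"),
                       * presumably so that the prefetch issued above can pull
                       * in the current packet's headers before that packet is
                       * processed on the next iteration.
                       */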
  889                 if (prev != NULL) {
  890                         if (lro_enabled &&
  891                             ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
  892                              (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
  893                                 sfxge_lro(rxq, prev);
  894                         else
  895                                 sfxge_rx_deliver(rxq, prev);
  896                 }
  897                 prev = rx_desc;
  898                 continue;
  899 
  900 discard:
  901                 /* Return the packet to the pool */
  902                 m_free(m);
  903                 rx_desc->mbuf = NULL;
  904         }
  905         rxq->completed = completed;
  906 
  907         level = rxq->added - rxq->completed;
  908 
  909         /* Pass last packet up the stack or into LRO */
  910         if (prev != NULL) {
  911                 if (lro_enabled &&
  912                     ((prev->flags & (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)) ==
  913                      (EFX_PKT_TCP | EFX_CKSUM_TCPUDP)))
  914                         sfxge_lro(rxq, prev);
  915                 else
  916                         sfxge_rx_deliver(rxq, prev);
  917         }
  918 
  919         /*
  920          * If there are any pending flows and this is the end of the
  921          * poll then they must be completed.
  922          */
  923         if (eop)
  924                 sfxge_lro_end_of_burst(rxq);
  925 
  926         /* Top up the queue if necessary */
  927         if (level < rxq->refill_threshold)
  928                 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(rxq->entries), B_FALSE);
  929 }
  930 
  931 static void
  932 sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
  933 {
  934         struct sfxge_rxq *rxq;
  935         struct sfxge_evq *evq;
  936         unsigned int count;
  937         unsigned int retry = 3;
  938 
  939         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
  940 
  941         rxq = sc->rxq[index];
  942         evq = sc->evq[index];
  943 
  944         SFXGE_EVQ_LOCK(evq);
  945 
  946         KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
  947             ("rxq not started"));
  948 
  949         rxq->init_state = SFXGE_RXQ_INITIALIZED;
  950 
  951         callout_stop(&rxq->refill_callout);
  952 
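              /* Flush the hardware queue: each attempt waits up to 2 seconds
               * (20 polls of 100 ms) for the flush-done or flush-failed
               * event, and the whole sequence is attempted at most three
               * times.
               */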
  953         while (rxq->flush_state != SFXGE_FLUSH_DONE && retry != 0) {
  954                 rxq->flush_state = SFXGE_FLUSH_PENDING;
  955 
  956                 SFXGE_EVQ_UNLOCK(evq);
  957 
  958                 /* Flush the receive queue */
  959                 if (efx_rx_qflush(rxq->common) != 0) {
  960                         SFXGE_EVQ_LOCK(evq);
  961                         rxq->flush_state = SFXGE_FLUSH_FAILED;
  962                         break;
  963                 }
  964 
  965                 count = 0;
  966                 do {
  967                         /* Spin for 100 ms */
  968                         DELAY(100000);
  969 
  970                         if (rxq->flush_state != SFXGE_FLUSH_PENDING)
  971                                 break;
  972 
  973                 } while (++count < 20);
  974 
  975                 SFXGE_EVQ_LOCK(evq);
  976 
  977                 if (rxq->flush_state == SFXGE_FLUSH_PENDING) {
  978                         /* Flush timeout - neither done nor failed */
  979                         log(LOG_ERR, "%s: Cannot flush Rx queue %u\n",
  980                             device_get_nameunit(sc->dev), index);
  981                         rxq->flush_state = SFXGE_FLUSH_DONE;
  982                 }
  983                 retry--;
  984         }
  985         if (rxq->flush_state == SFXGE_FLUSH_FAILED) {
  986                 log(LOG_ERR, "%s: Flushing Rx queue %u failed\n",
  987                     device_get_nameunit(sc->dev), index);
  988                 rxq->flush_state = SFXGE_FLUSH_DONE;
  989         }
  990 
  991         rxq->pending = rxq->added;
  992         sfxge_rx_qcomplete(rxq, B_TRUE);
  993 
  994         KASSERT(rxq->completed == rxq->pending,
  995             ("rxq->completed != rxq->pending"));
  996 
  997         rxq->added = 0;
  998         rxq->pushed = 0;
  999         rxq->pending = 0;
 1000         rxq->completed = 0;
 1001         rxq->loopback = 0;
 1002 
 1003         /* Destroy the common code receive queue. */
 1004         efx_rx_qdestroy(rxq->common);
 1005 
 1006         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
 1007             EFX_RXQ_NBUFS(sc->rxq_entries));
 1008 
 1009         SFXGE_EVQ_UNLOCK(evq);
 1010 }
 1011 
 1012 static int
 1013 sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
 1014 {
 1015         struct sfxge_rxq *rxq;
 1016         efsys_mem_t *esmp;
 1017         struct sfxge_evq *evq;
 1018         int rc;
 1019 
 1020         SFXGE_ADAPTER_LOCK_ASSERT_OWNED(sc);
 1021 
 1022         rxq = sc->rxq[index];
 1023         esmp = &rxq->mem;
 1024         evq = sc->evq[index];
 1025 
 1026         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
 1027             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
 1028         KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
 1029             ("evq->init_state != SFXGE_EVQ_STARTED"));
 1030 
 1031         /* Program the buffer table. */
 1032         if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
 1033             EFX_RXQ_NBUFS(sc->rxq_entries))) != 0)
 1034                 return (rc);
 1035 
 1036         /* Create the common code receive queue. */
 1037         if ((rc = efx_rx_qcreate(sc->enp, index, 0, EFX_RXQ_TYPE_DEFAULT,
 1038             esmp, sc->rxq_entries, rxq->buf_base_id, evq->common,
 1039             &rxq->common)) != 0)
 1040                 goto fail;
 1041 
 1042         SFXGE_EVQ_LOCK(evq);
 1043 
 1044         /* Enable the receive queue. */
 1045         efx_rx_qenable(rxq->common);
 1046 
 1047         rxq->init_state = SFXGE_RXQ_STARTED;
 1048         rxq->flush_state = SFXGE_FLUSH_REQUIRED;
 1049 
 1050         /* Try to fill the queue from the pool. */
 1051         sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(sc->rxq_entries), B_FALSE);
 1052 
 1053         SFXGE_EVQ_UNLOCK(evq);
 1054 
 1055         return (0);
 1056 
 1057 fail:
 1058         efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
 1059             EFX_RXQ_NBUFS(sc->rxq_entries));
 1060         return (rc);
 1061 }
 1062 
 1063 void
 1064 sfxge_rx_stop(struct sfxge_softc *sc)
 1065 {
 1066         int index;
 1067 
 1068         efx_mac_filter_default_rxq_clear(sc->enp);
 1069 
 1070         /* Stop the receive queue(s) */
 1071         index = sc->rxq_count;
 1072         while (--index >= 0)
 1073                 sfxge_rx_qstop(sc, index);
 1074 
 1075         sc->rx_prefix_size = 0;
 1076         sc->rx_buffer_size = 0;
 1077 
 1078         efx_rx_fini(sc->enp);
 1079 }
 1080 
 1081 int
 1082 sfxge_rx_start(struct sfxge_softc *sc)
 1083 {
 1084         struct sfxge_intr *intr;
 1085         const efx_nic_cfg_t *encp;
 1086         size_t hdrlen, align, reserved;
 1087         int index;
 1088         int rc;
 1089 
 1090         intr = &sc->intr;
 1091 
 1092         /* Initialize the common code receive module. */
 1093         if ((rc = efx_rx_init(sc->enp)) != 0)
 1094                 return (rc);
 1095 
 1096         encp = efx_nic_cfg_get(sc->enp);
 1097         sc->rx_buffer_size = EFX_MAC_PDU(sc->ifnet->if_mtu);
 1098 
 1099         /* Calculate the receive packet buffer size. */ 
 1100         sc->rx_prefix_size = encp->enc_rx_prefix_size;
 1101 
 1102         /* Ensure IP headers are 32-bit aligned */
 1103         hdrlen = sc->rx_prefix_size + sizeof (struct ether_header);
 1104         sc->rx_buffer_align = P2ROUNDUP(hdrlen, 4) - hdrlen;
 1105 
 1106         sc->rx_buffer_size += sc->rx_buffer_align;
 1107 
 1108         /* Align end of packet buffer for RX DMA end padding */
 1109         align = MAX(1, encp->enc_rx_buf_align_end);
 1110         EFSYS_ASSERT(ISP2(align));
 1111         sc->rx_buffer_size = P2ROUNDUP(sc->rx_buffer_size, align);
 1112 
 1113         /*
 1114          * Standard mbuf zones only guarantee pointer-size alignment;
 1115          * we need extra space to align to the cache line
 1116          */
 1117         reserved = sc->rx_buffer_size + CACHE_LINE_SIZE;
 1118 
 1119         /* Select zone for packet buffers */
 1120         if (reserved <= MCLBYTES)
 1121                 sc->rx_cluster_size = MCLBYTES;
 1122         else if (reserved <= MJUMPAGESIZE)
 1123                 sc->rx_cluster_size = MJUMPAGESIZE;
 1124         else if (reserved <= MJUM9BYTES)
 1125                 sc->rx_cluster_size = MJUM9BYTES;
 1126         else
 1127                 sc->rx_cluster_size = MJUM16BYTES;
 1128 
 1129         /*
 1130          * Set up the scale table.  Enable all hash types and hash insertion.
 1131          */
 1132         for (index = 0; index < nitems(sc->rx_indir_table); index++)
 1133 #ifdef RSS
 1134                 sc->rx_indir_table[index] =
 1135                         rss_get_indirection_to_bucket(index) % sc->rxq_count;
 1136 #else
 1137                 sc->rx_indir_table[index] = index % sc->rxq_count;
 1138 #endif
 1139         if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
 1140                                        nitems(sc->rx_indir_table))) != 0)
 1141                 goto fail;
 1142         (void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
 1143             EFX_RX_HASH_IPV4 | EFX_RX_HASH_TCPIPV4 |
 1144             EFX_RX_HASH_IPV6 | EFX_RX_HASH_TCPIPV6, B_TRUE);
 1145 
 1146 #ifdef RSS
 1147         rss_getkey(toep_key);
 1148 #endif
 1149         if ((rc = efx_rx_scale_key_set(sc->enp, toep_key,
 1150                                        sizeof(toep_key))) != 0)
 1151                 goto fail;
 1152 
 1153         /* Start the receive queue(s). */
 1154         for (index = 0; index < sc->rxq_count; index++) {
 1155                 if ((rc = sfxge_rx_qstart(sc, index)) != 0)
 1156                         goto fail2;
 1157         }
 1158 
 1159         rc = efx_mac_filter_default_rxq_set(sc->enp, sc->rxq[0]->common,
 1160                                             sc->intr.n_alloc > 1);
 1161         if (rc != 0)
 1162                 goto fail3;
 1163 
 1164         return (0);
 1165 
 1166 fail3:
 1167 fail2:
 1168         while (--index >= 0)
 1169                 sfxge_rx_qstop(sc, index);
 1170 
 1171 fail:
 1172         efx_rx_fini(sc->enp);
 1173 
 1174         return (rc);
 1175 }
 1176 
 1177 #ifdef SFXGE_LRO
 1178 
 1179 static void sfxge_lro_init(struct sfxge_rxq *rxq)
 1180 {
 1181         struct sfxge_lro_state *st = &rxq->lro;
 1182         unsigned i;
 1183 
 1184         st->conns_mask = lro_table_size - 1;
 1185         KASSERT(!((st->conns_mask + 1) & st->conns_mask),
 1186                 ("lro_table_size must be a power of 2"));
 1187         st->sc = rxq->sc;
 1188         st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
 1189                            M_SFXGE, M_WAITOK);
 1190         st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
 1191                              M_SFXGE, M_WAITOK);
 1192         for (i = 0; i <= st->conns_mask; ++i) {
 1193                 TAILQ_INIT(&st->conns[i]);
 1194                 st->conns_n[i] = 0;
 1195         }
 1196         LIST_INIT(&st->active_conns);
 1197         TAILQ_INIT(&st->free_conns);
 1198 }
 1199 
 1200 static void sfxge_lro_fini(struct sfxge_rxq *rxq)
 1201 {
 1202         struct sfxge_lro_state *st = &rxq->lro;
 1203         struct sfxge_lro_conn *c;
 1204         unsigned i;
 1205 
 1206         /* Return cleanly if sfxge_lro_init() has not been called. */
 1207         if (st->conns == NULL)
 1208                 return;
 1209 
 1210         KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));
 1211 
 1212         for (i = 0; i <= st->conns_mask; ++i) {
 1213                 while (!TAILQ_EMPTY(&st->conns[i])) {
 1214                         c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
 1215                         sfxge_lro_drop(rxq, c);
 1216                 }
 1217         }
 1218 
 1219         while (!TAILQ_EMPTY(&st->free_conns)) {
 1220                 c = TAILQ_FIRST(&st->free_conns);
 1221                 TAILQ_REMOVE(&st->free_conns, c, link);
 1222                 KASSERT(!c->mbuf, ("found orphaned mbuf"));
 1223                 free(c, M_SFXGE);
 1224         }
 1225 
 1226         free(st->conns_n, M_SFXGE);
 1227         free(st->conns, M_SFXGE);
 1228         st->conns = NULL;
 1229 }
 1230 
 1231 #else
 1232 
 1233 static void
 1234 sfxge_lro_init(struct sfxge_rxq *rxq)
 1235 {
 1236 }
 1237 
 1238 static void
 1239 sfxge_lro_fini(struct sfxge_rxq *rxq)
 1240 {
 1241 }
 1242 
 1243 #endif  /* SFXGE_LRO */
 1244 
 1245 static void
 1246 sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
 1247 {
 1248         struct sfxge_rxq *rxq;
 1249 
 1250         rxq = sc->rxq[index];
 1251 
 1252         KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
 1253             ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
 1254 
 1255         /* Free the context array and the flow table. */
 1256         free(rxq->queue, M_SFXGE);
 1257         sfxge_lro_fini(rxq);
 1258 
 1259         /* Release DMA memory. */
 1260         sfxge_dma_free(&rxq->mem);
 1261 
 1262         sc->rxq[index] = NULL;
 1263 
 1264         free(rxq, M_SFXGE);
 1265 }
 1266 
 1267 static int
 1268 sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
 1269 {
 1270         struct sfxge_rxq *rxq;
 1271         struct sfxge_evq *evq;
 1272         efsys_mem_t *esmp;
 1273         int rc;
 1274 
 1275         KASSERT(index < sc->rxq_count, ("index >= %d", sc->rxq_count));
 1276 
 1277         rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
 1278         rxq->sc = sc;
 1279         rxq->index = index;
 1280         rxq->entries = sc->rxq_entries;
 1281         rxq->ptr_mask = rxq->entries - 1;
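              /* rxq_entries is expected to be a power of two, so entries - 1
               * serves as a wrap-around index mask for the descriptor ring.
               */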
 1282         rxq->refill_threshold = RX_REFILL_THRESHOLD(rxq->entries);
 1283 
 1284         sc->rxq[index] = rxq;
 1285         esmp = &rxq->mem;
 1286 
 1287         evq = sc->evq[index];
 1288 
 1289         /* Allocate and zero DMA space. */
 1290         if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(sc->rxq_entries), esmp)) != 0)
 1291                 return (rc);
 1292 
 1293         /* Allocate buffer table entries. */
 1294         sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(sc->rxq_entries),
 1295                                  &rxq->buf_base_id);
 1296 
 1297         /* Allocate the context array and the flow table. */
 1298         rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * sc->rxq_entries,
 1299             M_SFXGE, M_WAITOK | M_ZERO);
 1300         sfxge_lro_init(rxq);
 1301 
 1302         callout_init(&rxq->refill_callout, 1);
 1303 
 1304         rxq->init_state = SFXGE_RXQ_INITIALIZED;
 1305 
 1306         return (0);
 1307 }
 1308 
 1309 static const struct {
 1310         const char *name;
 1311         size_t offset;
 1312 } sfxge_rx_stats[] = {
 1313 #define SFXGE_RX_STAT(name, member) \
 1314         { #name, offsetof(struct sfxge_rxq, member) }
 1315 #ifdef SFXGE_LRO
 1316         SFXGE_RX_STAT(lro_merges, lro.n_merges),
 1317         SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
 1318         SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
 1319         SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
 1320         SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
 1321         SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
 1322         SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
 1323         SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
 1324 #endif
 1325 };
 1326 
 1327 static int
 1328 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
 1329 {
 1330         struct sfxge_softc *sc = arg1;
 1331         unsigned int id = arg2;
 1332         unsigned int sum, index;
 1333 
 1334         /* Sum across all RX queues */
 1335         sum = 0;
 1336         for (index = 0; index < sc->rxq_count; index++)
 1337                 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
 1338                                          sfxge_rx_stats[id].offset);
 1339 
 1340         return (SYSCTL_OUT(req, &sum, sizeof(sum)));
 1341 }
 1342 
 1343 static void
 1344 sfxge_rx_stat_init(struct sfxge_softc *sc)
 1345 {
 1346         struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
 1347         struct sysctl_oid_list *stat_list;
 1348         unsigned int id;
 1349 
 1350         stat_list = SYSCTL_CHILDREN(sc->stats_node);
 1351 
 1352         for (id = 0; id < nitems(sfxge_rx_stats); id++) {
 1353                 SYSCTL_ADD_PROC(
 1354                         ctx, stat_list,
 1355                         OID_AUTO, sfxge_rx_stats[id].name,
 1356                         CTLTYPE_UINT|CTLFLAG_RD,
 1357                         sc, id, sfxge_rx_stat_handler, "IU",
 1358                         "");
 1359         }
 1360 }
 1361 
 1362 void
 1363 sfxge_rx_fini(struct sfxge_softc *sc)
 1364 {
 1365         int index;
 1366 
 1367         index = sc->rxq_count;
 1368         while (--index >= 0)
 1369                 sfxge_rx_qfini(sc, index);
 1370 
 1371         sc->rxq_count = 0;
 1372 }
 1373 
 1374 int
 1375 sfxge_rx_init(struct sfxge_softc *sc)
 1376 {
 1377         struct sfxge_intr *intr;
 1378         int index;
 1379         int rc;
 1380 
 1381 #ifdef SFXGE_LRO
 1382         if (!ISP2(lro_table_size)) {
 1383                 log(LOG_ERR, "%s=%u must be a power of 2",
 1384                     SFXGE_LRO_PARAM(table_size), lro_table_size);
 1385                 rc = EINVAL;
 1386                 goto fail_lro_table_size;
 1387         }
 1388 
 1389         if (lro_idle_ticks == 0)
 1390                 lro_idle_ticks = hz / 10 + 1; /* 100 ms */
 1391 #endif
 1392 
 1393         intr = &sc->intr;
 1394 
 1395         sc->rxq_count = intr->n_alloc;
 1396 
 1397         KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
 1398             ("intr->state != SFXGE_INTR_INITIALIZED"));
 1399 
 1400         /* Initialize the receive queue(s) - one per interrupt. */
 1401         for (index = 0; index < sc->rxq_count; index++) {
 1402                 if ((rc = sfxge_rx_qinit(sc, index)) != 0)
 1403                         goto fail;
 1404         }
 1405 
 1406         sfxge_rx_stat_init(sc);
 1407 
 1408         return (0);
 1409 
 1410 fail:
 1411         /* Tear down the receive queue(s). */
 1412         while (--index >= 0)
 1413                 sfxge_rx_qfini(sc, index);
 1414 
 1415         sc->rxq_count = 0;
 1416 
 1417 #ifdef SFXGE_LRO
 1418 fail_lro_table_size:
 1419 #endif
 1420         return (rc);
 1421 }
