| 
     1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (C) 2013-2016 Vincenzo Maffione
    5  * Copyright (C) 2013-2016 Luigi Rizzo
    6  * All rights reserved.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  *   1. Redistributions of source code must retain the above copyright
   12  *      notice, this list of conditions and the following disclaimer.
   13  *   2. Redistributions in binary form must reproduce the above copyright
   14  *      notice, this list of conditions and the following disclaimer in the
   15  *      documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  */
   29 
   30 /*
   31  * This module implements netmap support on top of standard,
   32  * unmodified device drivers.
   33  *
   34  * A NIOCREGIF request is handled here if the device does not
   35  * have native support. TX and RX rings are emulated as follows:
   36  *
   37  * NIOCREGIF
   38  *      We preallocate a block of TX mbufs (roughly as many as
   39  *      tx descriptors; the number is not critical) to speed up
   40  *      operation during transmissions. The refcount on most of
   41  *      these buffers is artificially bumped up so we can recycle
   42  *      them more easily. Also, the destructor is intercepted
   43  *      so we use it as an interrupt notification to wake up
   44  *      processes blocked on a poll().
   45  *
   46  *      For each receive ring we allocate one "struct mbq"
   47  *      (an mbuf tailq plus a spinlock). We intercept packets
   48  *      (through if_input)
   49  *      on the receive path and put them in the mbq from which
   50  *      netmap receive routines can grab them.
   51  *
   52  * TX:
   53  *      in the generic_txsync() routine, netmap buffers are copied
   54  *      (or linked, in the future) to the preallocated mbufs
   55  *      and pushed to the transmit queue. Some of these mbufs
   56  *      (those with NS_REPORT, or otherwise every half ring)
   57  *      have the refcount=1, others have refcount=2.
   58  *      When the destructor is invoked, we take that as
   59  *      a notification that all mbufs up to that one in
   60  *      the specific ring have been completed, and generate
   61  *      the equivalent of a transmit interrupt.
   62  *
   63  * RX:
   64  *
   65  */
   66 
   67 #ifdef __FreeBSD__
   68 
   69 #include <sys/cdefs.h> /* prerequisite */
   70 __FBSDID("$FreeBSD$");
   71 
   72 #include <sys/types.h>
   73 #include <sys/errno.h>
   74 #include <sys/malloc.h>
   75 #include <sys/lock.h>   /* PROT_EXEC */
   76 #include <sys/rwlock.h>
   77 #include <sys/socket.h> /* sockaddrs */
   78 #include <sys/selinfo.h>
   79 #include <net/if.h>
   80 #include <net/if_types.h>
   81 #include <net/if_var.h>
   82 #include <machine/bus.h>        /* bus_dmamap_* in netmap_kern.h */
   83 
   84 #include <net/netmap.h>
   85 #include <dev/netmap/netmap_kern.h>
   86 #include <dev/netmap/netmap_mem2.h>
   87 
   88 #define MBUF_RXQ(m)     ((m)->m_pkthdr.flowid)
   89 #define smp_mb()
   90 
   91 #elif defined _WIN32
   92 
   93 #include "win_glue.h"
   94 
   95 #define MBUF_TXQ(m)     0//((m)->m_pkthdr.flowid)
   96 #define MBUF_RXQ(m)         0//((m)->m_pkthdr.flowid)
   97 #define smp_mb()                //XXX: to be correctly defined
   98 
   99 #else /* linux */
  100 
  101 #include "bsd_glue.h"
  102 
  103 #include <linux/ethtool.h>      /* struct ethtool_ops, get_ringparam */
  104 #include <linux/hrtimer.h>
  105 
  106 static inline struct mbuf *
  107 nm_os_get_mbuf(struct ifnet *ifp, int len)
  108 {
  109         return alloc_skb(LL_RESERVED_SPACE(ifp) + len +
  110                          ifp->needed_tailroom, GFP_ATOMIC);
  111 }
  112 
  113 #endif /* linux */
  114 
  115 
  116 /* Common headers. */
  117 #include <net/netmap.h>
  118 #include <dev/netmap/netmap_kern.h>
  119 #include <dev/netmap/netmap_mem2.h>
  120 
  121 
  122 #define for_each_kring_n(_i, _k, _karr, _n) \
  123         for ((_k)=*(_karr), (_i) = 0; (_i) < (_n); (_i)++, (_k) = (_karr)[(_i)])
  124 
  125 #define for_each_tx_kring(_i, _k, _na) \
  126                 for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings)
  127 #define for_each_tx_kring_h(_i, _k, _na) \
  128                 for_each_kring_n(_i, _k, (_na)->tx_rings, (_na)->num_tx_rings + 1)
  129 
  130 #define for_each_rx_kring(_i, _k, _na) \
  131                 for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings)
  132 #define for_each_rx_kring_h(_i, _k, _na) \
  133                 for_each_kring_n(_i, _k, (_na)->rx_rings, (_na)->num_rx_rings + 1)
  134 
  135 
  136 /* ======================== PERFORMANCE STATISTICS =========================== */
  137 
  138 #ifdef RATE_GENERIC
  139 #define IFRATE(x) x
  140 struct rate_stats {
  141         unsigned long txpkt;
  142         unsigned long txsync;
  143         unsigned long txirq;
  144         unsigned long txrepl;
  145         unsigned long txdrop;
  146         unsigned long rxpkt;
  147         unsigned long rxirq;
  148         unsigned long rxsync;
  149 };
  150 
  151 struct rate_context {
  152         unsigned refcount;
  153         struct timer_list timer;
  154         struct rate_stats new;
  155         struct rate_stats old;
  156 };
  157 
  158 #define RATE_PRINTK(_NAME_) \
  159         printk( #_NAME_ " = %lu Hz\n", (cur._NAME_ - ctx->old._NAME_)/RATE_PERIOD);
  160 #define RATE_PERIOD  2
  161 static void rate_callback(unsigned long arg)
  162 {
  163         struct rate_context * ctx = (struct rate_context *)arg;
  164         struct rate_stats cur = ctx->new;
  165         int r;
  166 
  167         RATE_PRINTK(txpkt);
  168         RATE_PRINTK(txsync);
  169         RATE_PRINTK(txirq);
  170         RATE_PRINTK(txrepl);
  171         RATE_PRINTK(txdrop);
  172         RATE_PRINTK(rxpkt);
  173         RATE_PRINTK(rxsync);
  174         RATE_PRINTK(rxirq);
  175         printk("\n");
  176 
  177         ctx->old = cur;
  178         r = mod_timer(&ctx->timer, jiffies +
  179                         msecs_to_jiffies(RATE_PERIOD * 1000));
  180         if (unlikely(r))
  181                 nm_prerr("mod_timer() failed");
  182 }
  183 
  184 static struct rate_context rate_ctx;
  185 
  186 void generic_rate(int txp, int txs, int txi, int rxp, int rxs, int rxi)
  187 {
  188         if (txp) rate_ctx.new.txpkt++;
  189         if (txs) rate_ctx.new.txsync++;
  190         if (txi) rate_ctx.new.txirq++;
  191         if (rxp) rate_ctx.new.rxpkt++;
  192         if (rxs) rate_ctx.new.rxsync++;
  193         if (rxi) rate_ctx.new.rxirq++;
  194 }
  195 
  196 #else /* !RATE */
  197 #define IFRATE(x)
  198 #endif /* !RATE */
  199 
  200 
  201 /* ========== GENERIC (EMULATED) NETMAP ADAPTER SUPPORT ============= */
  202 
  203 /*
  204  * Wrapper used by the generic adapter layer to notify
  205  * the poller threads. Differently from netmap_rx_irq(), we check
  206  * only NAF_NETMAP_ON instead of NAF_NATIVE_ON to enable the irq.
  207  */
  208 void
  209 netmap_generic_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
  210 {
  211         if (unlikely(!nm_netmap_on(na)))
  212                 return;
  213 
  214         netmap_common_irq(na, q, work_done);
  215 #ifdef RATE_GENERIC
  216         if (work_done)
  217                 rate_ctx.new.rxirq++;
  218         else
  219                 rate_ctx.new.txirq++;
  220 #endif  /* RATE_GENERIC */
  221 }
  222 
  223 static int
  224 generic_netmap_unregister(struct netmap_adapter *na)
  225 {
  226         struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
  227         struct netmap_kring *kring = NULL;
  228         int i, r;
  229 
  230         if (na->active_fds == 0) {
  231                 na->na_flags &= ~NAF_NETMAP_ON;
  232 
  233                 /* Stop intercepting packets on the RX path. */
  234                 nm_os_catch_rx(gna, 0);
  235 
  236                 /* Release packet steering control. */
  237                 nm_os_catch_tx(gna, 0);
  238         }
  239 
  240         netmap_krings_mode_commit(na, /*onoff=*/0);
  241 
  242         for_each_rx_kring(r, kring, na) {
  243                 /* Free the mbufs still pending in the RX queues,
  244                  * that did not end up into the corresponding netmap
  245                  * RX rings. */
  246                 mbq_safe_purge(&kring->rx_queue);
  247                 nm_os_mitigation_cleanup(&gna->mit[r]);
  248         }
  249 
  250         /* Decrement reference counter for the mbufs in the
  251          * TX pools. These mbufs can be still pending in drivers,
  252          * (e.g. this happens with virtio-net driver, which
  253          * does lazy reclaiming of transmitted mbufs). */
  254         for_each_tx_kring(r, kring, na) {
  255                 /* We must remove the destructor on the TX event,
  256                  * because the destructor invokes netmap code, and
  257                  * the netmap module may disappear before the
  258                  * TX event is consumed. */
  259                 mtx_lock_spin(&kring->tx_event_lock);
  260                 if (kring->tx_event) {
  261                         SET_MBUF_DESTRUCTOR(kring->tx_event, NULL);
  262                 }
  263                 kring->tx_event = NULL;
  264                 mtx_unlock_spin(&kring->tx_event_lock);
  265         }
  266 
  267         if (na->active_fds == 0) {
  268                 nm_os_free(gna->mit);
  269 
  270                 for_each_rx_kring(r, kring, na) {
  271                         mbq_safe_fini(&kring->rx_queue);
  272                 }
  273 
  274                 for_each_tx_kring(r, kring, na) {
  275                         mtx_destroy(&kring->tx_event_lock);
  276                         if (kring->tx_pool == NULL) {
  277                                 continue;
  278                         }
  279 
  280                         for (i=0; i<na->num_tx_desc; i++) {
  281                                 if (kring->tx_pool[i]) {
  282                                         m_freem(kring->tx_pool[i]);
  283                                 }
  284                         }
  285                         nm_os_free(kring->tx_pool);
  286                         kring->tx_pool = NULL;
  287                 }
  288 
  289 #ifdef RATE_GENERIC
  290                 if (--rate_ctx.refcount == 0) {
  291                         nm_prinf("del_timer()");
  292                         del_timer(&rate_ctx.timer);
  293                 }
  294 #endif
  295                 nm_prinf("Emulated adapter for %s deactivated", na->name);
  296         }
  297 
  298         return 0;
  299 }
  300 
  301 /* Enable/disable netmap mode for a generic network interface. */
  302 static int
  303 generic_netmap_register(struct netmap_adapter *na, int enable)
  304 {
  305         struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
  306         struct netmap_kring *kring = NULL;
  307         int error;
  308         int i, r;
  309 
  310         if (!na) {
  311                 return EINVAL;
  312         }
  313 
  314         if (!enable) {
  315                 /* This is actually an unregif. */
  316                 return generic_netmap_unregister(na);
  317         }
  318 
  319         if (na->active_fds == 0) {
  320                 nm_prinf("Emulated adapter for %s activated", na->name);
  321                 /* Do all memory allocations when (na->active_fds == 0), to
  322                  * simplify error management. */
  323 
  324                 /* Allocate memory for mitigation support on all the rx queues. */
  325                 gna->mit = nm_os_malloc(na->num_rx_rings * sizeof(struct nm_generic_mit));
  326                 if (!gna->mit) {
  327                         nm_prerr("mitigation allocation failed");
  328                         error = ENOMEM;
  329                         goto out;
  330                 }
  331 
  332                 for_each_rx_kring(r, kring, na) {
  333                         /* Init mitigation support. */
  334                         nm_os_mitigation_init(&gna->mit[r], r, na);
  335 
  336                         /* Initialize the rx queue, as generic_rx_handler() can
  337                          * be called as soon as nm_os_catch_rx() returns.
  338                          */
  339                         mbq_safe_init(&kring->rx_queue);
  340                 }
  341 
  342                 /*
  343                  * Prepare mbuf pools (parallel to the tx rings), for packet
  344                  * transmission. Don't preallocate the mbufs here, it's simpler
  345                  * to leave this task to txsync.
  346                  */
  347                 for_each_tx_kring(r, kring, na) {
  348                         kring->tx_pool = NULL;
  349                 }
  350                 for_each_tx_kring(r, kring, na) {
  351                         kring->tx_pool =
  352                                 nm_os_malloc(na->num_tx_desc * sizeof(struct mbuf *));
  353                         if (!kring->tx_pool) {
  354                                 nm_prerr("tx_pool allocation failed");
  355                                 error = ENOMEM;
  356                                 goto free_tx_pools;
  357                         }
  358                         mtx_init(&kring->tx_event_lock, "tx_event_lock",
  359                                  NULL, MTX_SPIN);
  360                 }
  361         }
  362 
  363         netmap_krings_mode_commit(na, /*onoff=*/1);
  364 
  365         for_each_tx_kring(r, kring, na) {
  366                 /* Initialize tx_pool and tx_event. */
  367                 for (i=0; i<na->num_tx_desc; i++) {
  368                         kring->tx_pool[i] = NULL;
  369                 }
  370 
  371                 kring->tx_event = NULL;
  372         }
  373 
  374         if (na->active_fds == 0) {
  375                 /* Prepare to intercept incoming traffic. */
  376                 error = nm_os_catch_rx(gna, 1);
  377                 if (error) {
  378                         nm_prerr("nm_os_catch_rx(1) failed (%d)", error);
  379                         goto free_tx_pools;
  380                 }
  381 
  382                 /* Let netmap control the packet steering. */
  383                 error = nm_os_catch_tx(gna, 1);
  384                 if (error) {
  385                         nm_prerr("nm_os_catch_tx(1) failed (%d)", error);
  386                         goto catch_rx;
  387                 }
  388 
  389                 na->na_flags |= NAF_NETMAP_ON;
  390 
  391 #ifdef RATE_GENERIC
  392                 if (rate_ctx.refcount == 0) {
  393                         nm_prinf("setup_timer()");
  394                         memset(&rate_ctx, 0, sizeof(rate_ctx));
  395                         setup_timer(&rate_ctx.timer, &rate_callback, (unsigned long)&rate_ctx);
  396                         if (mod_timer(&rate_ctx.timer, jiffies + msecs_to_jiffies(1500))) {
  397                                 nm_prerr("Error: mod_timer()");
  398                         }
  399                 }
  400                 rate_ctx.refcount++;
  401 #endif /* RATE */
  402         }
  403 
  404         return 0;
  405 
  406         /* Here (na->active_fds == 0) holds. */
  407 catch_rx:
  408         nm_os_catch_rx(gna, 0);
  409 free_tx_pools:
  410         for_each_tx_kring(r, kring, na) {
  411                 mtx_destroy(&kring->tx_event_lock);
  412                 if (kring->tx_pool == NULL) {
  413                         continue;
  414                 }
  415                 nm_os_free(kring->tx_pool);
  416                 kring->tx_pool = NULL;
  417         }
  418         for_each_rx_kring(r, kring, na) {
  419                 mbq_safe_fini(&kring->rx_queue);
  420         }
  421         nm_os_free(gna->mit);
  422 out:
  423 
  424         return error;
  425 }
  426 
  427 /*
  428  * Callback invoked when the device driver frees an mbuf used
  429  * by netmap to transmit a packet. This usually happens when
  430  * the NIC notifies the driver that transmission is completed.
  431  */
  432 static void
  433 generic_mbuf_destructor(struct mbuf *m)
  434 {
  435         struct netmap_adapter *na = NA(GEN_TX_MBUF_IFP(m));
  436         struct netmap_kring *kring;
  437         unsigned int r = MBUF_TXQ(m);
  438         unsigned int r_orig = r;
  439 
  440         if (unlikely(!nm_netmap_on(na) || r >= na->num_tx_rings)) {
  441                 nm_prerr("Error: no netmap adapter on device %p",
  442                   GEN_TX_MBUF_IFP(m));
  443                 return;
  444         }
  445 
  446         /*
  447          * First, clear the event mbuf.
  448          * In principle, the event 'm' should match the one stored
  449          * on ring 'r'. However we check it explicitly to stay
  450          * safe against lower layers (qdisc, driver, etc.) changing
  451          * MBUF_TXQ(m) under our feet. If the match is not found
  452          * on 'r', we try to see if it belongs to some other ring.
  453          */
  454         for (;;) {
  455                 bool match = false;
  456 
  457                 kring = na->tx_rings[r];
  458                 mtx_lock_spin(&kring->tx_event_lock);
  459                 if (kring->tx_event == m) {
  460                         kring->tx_event = NULL;
  461                         match = true;
  462                 }
  463                 mtx_unlock_spin(&kring->tx_event_lock);
  464 
  465                 if (match) {
  466                         if (r != r_orig) {
  467                                 nm_prlim(1, "event %p migrated: ring %u --> %u",
  468                                       m, r_orig, r);
  469                         }
  470                         break;
  471                 }
  472 
  473                 if (++r == na->num_tx_rings) r = 0;
  474 
  475                 if (r == r_orig) {
  476                         nm_prlim(1, "Cannot match event %p", m);
  477                         return;
  478                 }
  479         }
  480 
  481         /* Second, wake up clients. They will reclaim the event through
  482          * txsync. */
  483         netmap_generic_irq(na, r, NULL);
  484 #ifdef __FreeBSD__
  485         void_mbuf_dtor(m);
  486 #endif
  487 }
  488 
  489 /* Record completed transmissions and update hwtail.
  490  *
  491  * The oldest tx buffer not yet completed is at nr_hwtail + 1,
  492  * nr_hwcur is the first unsent buffer.
  493  */
  494 static u_int
  495 generic_netmap_tx_clean(struct netmap_kring *kring, int txqdisc)
  496 {
  497         u_int const lim = kring->nkr_num_slots - 1;
  498         u_int nm_i = nm_next(kring->nr_hwtail, lim);
  499         u_int hwcur = kring->nr_hwcur;
  500         u_int n = 0;
  501         struct mbuf **tx_pool = kring->tx_pool;
  502 
  503         nm_prdis("hwcur = %d, hwtail = %d", kring->nr_hwcur, kring->nr_hwtail);
  504 
  505         while (nm_i != hwcur) { /* buffers not completed */
  506                 struct mbuf *m = tx_pool[nm_i];
  507 
  508                 if (txqdisc) {
  509                         if (m == NULL) {
  510                                 /* Nothing to do, this is going
  511                                  * to be replenished. */
  512                                 nm_prlim(3, "Is this happening?");
  513 
  514                         } else if (MBUF_QUEUED(m)) {
  515                                 break; /* Not dequeued yet. */
  516 
  517                         } else if (MBUF_REFCNT(m) != 1) {
  518                                 /* This mbuf has been dequeued but is still busy
  519                                  * (refcount is 2).
  520                                  * Leave it to the driver and replenish. */
  521                                 m_freem(m);
  522                                 tx_pool[nm_i] = NULL;
  523                         }
  524 
  525                 } else {
  526                         if (unlikely(m == NULL)) {
  527                                 int event_consumed;
  528 
  529                                 /* This slot was used to place an event. */
  530                                 mtx_lock_spin(&kring->tx_event_lock);
  531                                 event_consumed = (kring->tx_event == NULL);
  532                                 mtx_unlock_spin(&kring->tx_event_lock);
  533                                 if (!event_consumed) {
  534                                         /* The event has not been consumed yet,
  535                                          * still busy in the driver. */
  536                                         break;
  537                                 }
  538                                 /* The event has been consumed, we can go
  539                                  * ahead. */
  540 
  541                         } else if (MBUF_REFCNT(m) != 1) {
  542                                 /* This mbuf is still busy: its refcnt is 2. */
  543                                 break;
  544                         }
  545                 }
  546 
  547                 n++;
  548                 nm_i = nm_next(nm_i, lim);
  549         }
  550         kring->nr_hwtail = nm_prev(nm_i, lim);
  551         nm_prdis("tx completed [%d] -> hwtail %d", n, kring->nr_hwtail);
  552 
  553         return n;
  554 }
  555 
  556 /* Compute a slot index in the middle between inf and sup. */
  557 static inline u_int
  558 ring_middle(u_int inf, u_int sup, u_int lim)
  559 {
  560         u_int n = lim + 1;
  561         u_int e;
  562 
  563         if (sup >= inf) {
  564                 e = (sup + inf) / 2;
  565         } else { /* wrap around */
  566                 e = (sup + n + inf) / 2;
  567                 if (e >= n) {
  568                         e -= n;
  569                 }
  570         }
  571 
  572         if (unlikely(e >= n)) {
  573                 nm_prerr("This cannot happen");
  574                 e = 0;
  575         }
  576 
  577         return e;
  578 }
  579 
  580 static void
  581 generic_set_tx_event(struct netmap_kring *kring, u_int hwcur)
  582 {
  583         u_int lim = kring->nkr_num_slots - 1;
  584         struct mbuf *m;
  585         u_int e;
  586         u_int ntc = nm_next(kring->nr_hwtail, lim); /* next to clean */
  587 
  588         if (ntc == hwcur) {
  589                 return; /* all buffers are free */
  590         }
  591 
  592         /*
  593          * We have pending packets in the driver between hwtail+1
  594          * and hwcur, and we have to chose one of these slot to
  595          * generate a notification.
  596          * There is a race but this is only called within txsync which
  597          * does a double check.
  598          */
  599 #if 0
  600         /* Choose a slot in the middle, so that we don't risk ending
  601          * up in a situation where the client continuously wake up,
  602          * fills one or a few TX slots and go to sleep again. */
  603         e = ring_middle(ntc, hwcur, lim);
  604 #else
  605         /* Choose the first pending slot, to be safe against driver
  606          * reordering mbuf transmissions. */
  607         e = ntc;
  608 #endif
  609 
  610         m = kring->tx_pool[e];
  611         if (m == NULL) {
  612                 /* An event is already in place. */
  613                 return;
  614         }
  615 
  616         mtx_lock_spin(&kring->tx_event_lock);
  617         if (kring->tx_event) {
  618                 /* An event is already in place. */
  619                 mtx_unlock_spin(&kring->tx_event_lock);
  620                 return;
  621         }
  622 
  623         SET_MBUF_DESTRUCTOR(m, generic_mbuf_destructor);
  624         kring->tx_event = m;
  625         mtx_unlock_spin(&kring->tx_event_lock);
  626 
  627         kring->tx_pool[e] = NULL;
  628 
  629         nm_prdis("Request Event at %d mbuf %p refcnt %d", e, m, m ? MBUF_REFCNT(m) : -2 );
  630 
  631         /* Decrement the refcount. This will free it if we lose the race
  632          * with the driver. */
  633         m_freem(m);
  634         smp_mb();
  635 }
  636 
  637 
  638 /*
  639  * generic_netmap_txsync() transforms netmap buffers into mbufs
  640  * and passes them to the standard device driver
  641  * (ndo_start_xmit() or ifp->if_transmit() ).
  642  * On linux this is not done directly, but using dev_queue_xmit(),
  643  * since it implements the TX flow control (and takes some locks).
  644  */
  645 static int
  646 generic_netmap_txsync(struct netmap_kring *kring, int flags)
  647 {
  648         struct netmap_adapter *na = kring->na;
  649         struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
  650         struct ifnet *ifp = na->ifp;
  651         struct netmap_ring *ring = kring->ring;
  652         u_int nm_i;     /* index into the netmap ring */ // j
  653         u_int const lim = kring->nkr_num_slots - 1;
  654         u_int const head = kring->rhead;
  655         u_int ring_nr = kring->ring_id;
  656 
  657         IFRATE(rate_ctx.new.txsync++);
  658 
  659         rmb();
  660 
  661         /*
  662          * First part: process new packets to send.
  663          */
  664         nm_i = kring->nr_hwcur;
  665         if (nm_i != head) {     /* we have new packets to send */
  666                 struct nm_os_gen_arg a;
  667                 u_int event = -1;
  668 #ifdef __FreeBSD__
  669                 struct epoch_tracker et;
  670 
  671                 NET_EPOCH_ENTER(et);
  672 #endif
  673 
  674                 if (gna->txqdisc && nm_kr_txempty(kring)) {
  675                         /* In txqdisc mode, we ask for a delayed notification,
  676                          * but only when cur == hwtail, which means that the
  677                          * client is going to block. */
  678                         event = ring_middle(nm_i, head, lim);
  679                         nm_prdis("Place txqdisc event (hwcur=%u,event=%u,"
  680                               "head=%u,hwtail=%u)", nm_i, event, head,
  681                               kring->nr_hwtail);
  682                 }
  683 
  684                 a.ifp = ifp;
  685                 a.ring_nr = ring_nr;
  686                 a.head = a.tail = NULL;
  687 
  688                 while (nm_i != head) {
  689                         struct netmap_slot *slot = &ring->slot[nm_i];
  690                         u_int len = slot->len;
  691                         void *addr = NMB(na, slot);
  692                         /* device-specific */
  693                         struct mbuf *m;
  694                         int tx_ret;
  695 
  696                         NM_CHECK_ADDR_LEN(na, addr, len);
  697 
  698                         /* Take an mbuf from the tx pool (replenishing the pool
  699                          * entry if necessary) and copy in the user packet. */
  700                         m = kring->tx_pool[nm_i];
  701                         if (unlikely(m == NULL)) {
  702                                 kring->tx_pool[nm_i] = m =
  703                                         nm_os_get_mbuf(ifp, NETMAP_BUF_SIZE(na));
  704                                 if (m == NULL) {
  705                                         nm_prlim(2, "Failed to replenish mbuf");
  706                                         /* Here we could schedule a timer which
  707                                          * retries to replenish after a while,
  708                                          * and notifies the client when it
  709                                          * manages to replenish some slots. In
  710                                          * any case we break early to avoid
  711                                          * crashes. */
  712                                         break;
  713                                 }
  714                                 IFRATE(rate_ctx.new.txrepl++);
  715                         }
  716 
  717                         a.m = m;
  718                         a.addr = addr;
  719                         a.len = len;
  720                         a.qevent = (nm_i == event);
  721                         /* When not in txqdisc mode, we should ask
  722                          * notifications when NS_REPORT is set, or roughly
  723                          * every half ring. To optimize this, we set a
  724                          * notification event when the client runs out of
  725                          * TX ring space, or when transmission fails. In
  726                          * the latter case we also break early.
  727                          */
  728                         tx_ret = nm_os_generic_xmit_frame(&a);
  729                         if (unlikely(tx_ret)) {
  730                                 if (!gna->txqdisc) {
  731                                         /*
  732                                          * No room for this mbuf in the device driver.
  733                                          * Request a notification FOR A PREVIOUS MBUF,
  734                                          * then call generic_netmap_tx_clean(kring) to do the
  735                                          * double check and see if we can free more buffers.
  736                                          * If there is space continue, else break;
  737                                          * NOTE: the double check is necessary if the problem
  738                                          * occurs in the txsync call after selrecord().
  739                                          * Also, we need some way to tell the caller that not
  740                                          * all buffers were queued onto the device (this was
  741                                          * not a problem with native netmap driver where space
  742                                          * is preallocated). The bridge has a similar problem
  743                                          * and we solve it there by dropping the excess packets.
  744                                          */
  745                                         generic_set_tx_event(kring, nm_i);
  746                                         if (generic_netmap_tx_clean(kring, gna->txqdisc)) {
  747                                                 /* space now available */
  748                                                 continue;
  749                                         } else {
  750                                                 break;
  751                                         }
  752                                 }
  753 
  754                                 /* In txqdisc mode, the netmap-aware qdisc
  755                                  * queue has the same length as the number of
  756                                  * netmap slots (N). Since tail is advanced
  757                                  * only when packets are dequeued, qdisc
  758                                  * queue overrun cannot happen, so
  759                                  * nm_os_generic_xmit_frame() did not fail
  760                                  * because of that.
  761                                  * However, packets can be dropped because
  762                                  * carrier is off, or because our qdisc is
  763                                  * being deactivated, or possibly for other
  764                                  * reasons. In these cases, we just let the
  765                                  * packet to be dropped. */
  766                                 IFRATE(rate_ctx.new.txdrop++);
  767                         }
  768 
  769                         slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
  770                         nm_i = nm_next(nm_i, lim);
  771                         IFRATE(rate_ctx.new.txpkt++);
  772                 }
  773                 if (a.head != NULL) {
  774                         a.addr = NULL;
  775                         nm_os_generic_xmit_frame(&a);
  776                 }
  777                 /* Update hwcur to the next slot to transmit. Here nm_i
  778                  * is not necessarily head, we could break early. */
  779                 kring->nr_hwcur = nm_i;
  780 
  781 #ifdef __FreeBSD__
  782                 NET_EPOCH_EXIT(et);
  783 #endif
  784         }
  785 
  786         /*
  787          * Second, reclaim completed buffers
  788          */
  789         if (!gna->txqdisc && (flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring))) {
  790                 /* No more available slots? Set a notification event
  791                  * on a netmap slot that will be cleaned in the future.
  792                  * No doublecheck is performed, since txsync() will be
  793                  * called twice by netmap_poll().
  794                  */
  795                 generic_set_tx_event(kring, nm_i);
  796         }
  797 
  798         generic_netmap_tx_clean(kring, gna->txqdisc);
  799 
  800         return 0;
  801 }
  802 
  803 
  804 /*
  805  * This handler is registered (through nm_os_catch_rx())
  806  * within the attached network interface
  807  * in the RX subsystem, so that every mbuf passed up by
  808  * the driver can be stolen to the network stack.
  809  * Stolen packets are put in a queue where the
  810  * generic_netmap_rxsync() callback can extract them.
  811  * Returns 1 if the packet was stolen, 0 otherwise.
  812  */
  813 int
  814 generic_rx_handler(struct ifnet *ifp, struct mbuf *m)
  815 {
  816         struct netmap_adapter *na = NA(ifp);
  817         struct netmap_generic_adapter *gna = (struct netmap_generic_adapter *)na;
  818         struct netmap_kring *kring;
  819         u_int work_done;
  820         u_int r = MBUF_RXQ(m); /* receive ring number */
  821 
  822         if (r >= na->num_rx_rings) {
  823                 r = r % na->num_rx_rings;
  824         }
  825 
  826         kring = na->rx_rings[r];
  827 
  828         if (kring->nr_mode == NKR_NETMAP_OFF) {
  829                 /* We must not intercept this mbuf. */
  830                 return 0;
  831         }
  832 
  833         /* limit the size of the queue */
  834         if (unlikely(!gna->rxsg && MBUF_LEN(m) > NETMAP_BUF_SIZE(na))) {
  835                 /* This may happen when GRO/LRO features are enabled for
  836                  * the NIC driver when the generic adapter does not
  837                  * support RX scatter-gather. */
  838                 nm_prlim(2, "Warning: driver pushed up big packet "
  839                                 "(size=%d)", (int)MBUF_LEN(m));
  840                 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
  841                 m_freem(m);
  842         } else if (unlikely(mbq_len(&kring->rx_queue) > na->num_rx_desc)) {
  843                 if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
  844                 m_freem(m);
  845         } else {
  846                 mbq_safe_enqueue(&kring->rx_queue, m);
  847         }
  848 
  849         if (netmap_generic_mit < 32768) {
  850                 /* no rx mitigation, pass notification up */
  851                 netmap_generic_irq(na, r, &work_done);
  852         } else {
  853                 /* same as send combining, filter notification if there is a
  854                  * pending timer, otherwise pass it up and start a timer.
  855                  */
  856                 if (likely(nm_os_mitigation_active(&gna->mit[r]))) {
  857                         /* Record that there is some pending work. */
  858                         gna->mit[r].mit_pending = 1;
  859                 } else {
  860                         netmap_generic_irq(na, r, &work_done);
  861                         nm_os_mitigation_start(&gna->mit[r]);
  862                 }
  863         }
  864 
  865         /* We have intercepted the mbuf. */
  866         return 1;
  867 }
  868 
  869 /*
  870  * generic_netmap_rxsync() extracts mbufs from the queue filled by
  871  * generic_netmap_rx_handler() and puts their content in the netmap
  872  * receive ring.
  873  * Access must be protected because the rx handler is asynchronous,
  874  */
  875 static int
  876 generic_netmap_rxsync(struct netmap_kring *kring, int flags)
  877 {
  878         struct netmap_ring *ring = kring->ring;
  879         struct netmap_adapter *na = kring->na;
  880         u_int nm_i;     /* index into the netmap ring */ //j,
  881         u_int n;
  882         u_int const lim = kring->nkr_num_slots - 1;
  883         u_int const head = kring->rhead;
  884         int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
  885 
  886         /* Adapter-specific variables. */
  887         u_int nm_buf_len = NETMAP_BUF_SIZE(na);
  888         struct mbq tmpq;
  889         struct mbuf *m;
  890         int avail; /* in bytes */
  891         int mlen;
  892         int copy;
  893 
  894         if (head > lim)
  895                 return netmap_ring_reinit(kring);
  896 
  897         IFRATE(rate_ctx.new.rxsync++);
  898 
  899         /*
  900          * First part: skip past packets that userspace has released.
  901          * This can possibly make room for the second part.
  902          */
  903         nm_i = kring->nr_hwcur;
  904         if (nm_i != head) {
  905                 /* Userspace has released some packets. */
  906                 for (n = 0; nm_i != head; n++) {
  907                         struct netmap_slot *slot = &ring->slot[nm_i];
  908 
  909                         slot->flags &= ~NS_BUF_CHANGED;
  910                         nm_i = nm_next(nm_i, lim);
  911                 }
  912                 kring->nr_hwcur = head;
  913         }
  914 
  915         /*
  916          * Second part: import newly received packets.
  917          */
  918         if (!netmap_no_pendintr && !force_update) {
  919                 return 0;
  920         }
  921 
  922         nm_i = kring->nr_hwtail; /* First empty slot in the receive ring. */
  923 
  924         /* Compute the available space (in bytes) in this netmap ring.
  925          * The first slot that is not considered in is the one before
  926          * nr_hwcur. */
  927 
  928         avail = nm_prev(kring->nr_hwcur, lim) - nm_i;
  929         if (avail < 0)
  930                 avail += lim + 1;
  931         avail *= nm_buf_len;
  932 
  933         /* First pass: While holding the lock on the RX mbuf queue,
  934          * extract as many mbufs as they fit the available space,
  935          * and put them in a temporary queue.
  936          * To avoid performing a per-mbuf division (mlen / nm_buf_len) to
  937          * to update avail, we do the update in a while loop that we
  938          * also use to set the RX slots, but without performing the copy. */
  939         mbq_init(&tmpq);
  940         mbq_lock(&kring->rx_queue);
  941         for (n = 0;; n++) {
  942                 m = mbq_peek(&kring->rx_queue);
  943                 if (!m) {
  944                         /* No more packets from the driver. */
  945                         break;
  946                 }
  947 
  948                 mlen = MBUF_LEN(m);
  949                 if (mlen > avail) {
  950                         /* No more space in the ring. */
  951                         break;
  952                 }
  953 
  954                 mbq_dequeue(&kring->rx_queue);
  955 
  956                 while (mlen) {
  957                         copy = nm_buf_len;
  958                         if (mlen < copy) {
  959                                 copy = mlen;
  960                         }
  961                         mlen -= copy;
  962                         avail -= nm_buf_len;
  963 
  964                         ring->slot[nm_i].len = copy;
  965                         ring->slot[nm_i].flags = (mlen ? NS_MOREFRAG : 0);
  966                         nm_i = nm_next(nm_i, lim);
  967                 }
  968 
  969                 mbq_enqueue(&tmpq, m);
  970         }
  971         mbq_unlock(&kring->rx_queue);
  972 
  973         /* Second pass: Drain the temporary queue, going over the used RX slots,
  974          * and perform the copy out of the RX queue lock. */
  975         nm_i = kring->nr_hwtail;
  976 
  977         for (;;) {
  978                 void *nmaddr;
  979                 int ofs = 0;
  980                 int morefrag;
  981 
  982                 m = mbq_dequeue(&tmpq);
  983                 if (!m) {
  984                         break;
  985                 }
  986 
  987                 do {
  988                         nmaddr = NMB(na, &ring->slot[nm_i]);
  989                         /* We only check the address here on generic rx rings. */
  990                         if (nmaddr == NETMAP_BUF_BASE(na)) { /* Bad buffer */
  991                                 m_freem(m);
  992                                 mbq_purge(&tmpq);
  993                                 mbq_fini(&tmpq);
  994                                 return netmap_ring_reinit(kring);
  995                         }
  996 
  997                         copy = ring->slot[nm_i].len;
  998                         m_copydata(m, ofs, copy, nmaddr);
  999                         ofs += copy;
 1000                         morefrag = ring->slot[nm_i].flags & NS_MOREFRAG;
 1001                         nm_i = nm_next(nm_i, lim);
 1002                 } while (morefrag);
 1003 
 1004                 m_freem(m);
 1005         }
 1006 
 1007         mbq_fini(&tmpq);
 1008 
 1009         if (n) {
 1010                 kring->nr_hwtail = nm_i;
 1011                 IFRATE(rate_ctx.new.rxpkt += n);
 1012         }
 1013         kring->nr_kflags &= ~NKR_PENDINTR;
 1014 
 1015         return 0;
 1016 }
 1017 
 1018 static void
 1019 generic_netmap_dtor(struct netmap_adapter *na)
 1020 {
 1021         struct netmap_generic_adapter *gna = (struct netmap_generic_adapter*)na;
 1022         struct ifnet *ifp = netmap_generic_getifp(gna);
 1023         struct netmap_adapter *prev_na = gna->prev;
 1024 
 1025         if (prev_na != NULL) {
 1026                 netmap_adapter_put(prev_na);
 1027                 if (nm_iszombie(na)) {
 1028                         /*
 1029                          * The driver has been removed without releasing
 1030                          * the reference so we need to do it here.
 1031                          */
 1032                         netmap_adapter_put(prev_na);
 1033                 }
 1034                 nm_prinf("Native netmap adapter for %s restored", prev_na->name);
 1035         }
 1036         NM_RESTORE_NA(ifp, prev_na);
 1037         /*
 1038          * netmap_detach_common(), that it's called after this function,
 1039          * overrides WNA(ifp) if na->ifp is not NULL.
 1040          */
 1041         na->ifp = NULL;
 1042         nm_prinf("Emulated netmap adapter for %s destroyed", na->name);
 1043 }
 1044 
 1045 int
 1046 na_is_generic(struct netmap_adapter *na)
 1047 {
 1048         return na->nm_register == generic_netmap_register;
 1049 }
 1050 
 1051 /*
 1052  * generic_netmap_attach() makes it possible to use netmap on
 1053  * a device without native netmap support.
 1054  * This is less performant than native support but potentially
 1055  * faster than raw sockets or similar schemes.
 1056  *
 1057  * In this "emulated" mode, netmap rings do not necessarily
 1058  * have the same size as those in the NIC. We use a default
 1059  * value and possibly override it if the OS has ways to fetch the
 1060  * actual configuration.
 1061  */
 1062 int
 1063 generic_netmap_attach(struct ifnet *ifp)
 1064 {
 1065         struct netmap_adapter *na;
 1066         struct netmap_generic_adapter *gna;
 1067         int retval;
 1068         u_int num_tx_desc, num_rx_desc;
 1069 
 1070 #ifdef __FreeBSD__
 1071         if (ifp->if_type == IFT_LOOP) {
 1072                 nm_prerr("if_loop is not supported by %s", __func__);
 1073                 return EINVAL;
 1074         }
 1075 #endif
 1076 
 1077         if (NM_NA_CLASH(ifp)) {
 1078                 /* If NA(ifp) is not null but there is no valid netmap
 1079                  * adapter it means that someone else is using the same
 1080                  * pointer (e.g. ax25_ptr on linux). This happens for
 1081                  * instance when also PF_RING is in use. */
 1082                 nm_prerr("Error: netmap adapter hook is busy");
 1083                 return EBUSY;
 1084         }
 1085 
 1086         num_tx_desc = num_rx_desc = netmap_generic_ringsize; /* starting point */
 1087 
 1088         nm_os_generic_find_num_desc(ifp, &num_tx_desc, &num_rx_desc); /* ignore errors */
 1089         if (num_tx_desc == 0 || num_rx_desc == 0) {
 1090                 nm_prerr("Device has no hw slots (tx %u, rx %u)", num_tx_desc, num_rx_desc);
 1091                 return EINVAL;
 1092         }
 1093 
 1094         gna = nm_os_malloc(sizeof(*gna));
 1095         if (gna == NULL) {
 1096                 nm_prerr("no memory on attach, give up");
 1097                 return ENOMEM;
 1098         }
 1099         na = (struct netmap_adapter *)gna;
 1100         strlcpy(na->name, ifp->if_xname, sizeof(na->name));
 1101         na->ifp = ifp;
 1102         na->num_tx_desc = num_tx_desc;
 1103         na->num_rx_desc = num_rx_desc;
 1104         na->rx_buf_maxsize = 32768;
 1105         na->nm_register = &generic_netmap_register;
 1106         na->nm_txsync = &generic_netmap_txsync;
 1107         na->nm_rxsync = &generic_netmap_rxsync;
 1108         na->nm_dtor = &generic_netmap_dtor;
 1109         /* when using generic, NAF_NETMAP_ON is set so we force
 1110          * NAF_SKIP_INTR to use the regular interrupt handler
 1111          */
 1112         na->na_flags = NAF_SKIP_INTR | NAF_HOST_RINGS;
 1113 
 1114         nm_prdis("[GNA] num_tx_queues(%d), real_num_tx_queues(%d), len(%lu)",
 1115                         ifp->num_tx_queues, ifp->real_num_tx_queues,
 1116                         ifp->tx_queue_len);
 1117         nm_prdis("[GNA] num_rx_queues(%d), real_num_rx_queues(%d)",
 1118                         ifp->num_rx_queues, ifp->real_num_rx_queues);
 1119 
 1120         nm_os_generic_find_num_queues(ifp, &na->num_tx_rings, &na->num_rx_rings);
 1121 
 1122         retval = netmap_attach_common(na);
 1123         if (retval) {
 1124                 nm_os_free(gna);
 1125                 return retval;
 1126         }
 1127 
 1128         if (NM_NA_VALID(ifp)) {
 1129                 gna->prev = NA(ifp); /* save old na */
 1130                 netmap_adapter_get(gna->prev);
 1131         }
 1132         NM_ATTACH_NA(ifp, na);
 1133 
 1134         nm_os_generic_set_features(gna);
 1135 
 1136         nm_prinf("Emulated adapter for %s created (prev was %s)", na->name,
 1137             gna->prev ? gna->prev->name : "NULL");
 1138 
 1139         return retval;
 1140 }
Cache object: a39cc254d137940e3aec77ecb42e8e54 
 
 |