| 
     1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (C) 2011-2014 Matteo Landi
    5  * Copyright (C) 2011-2016 Luigi Rizzo
    6  * Copyright (C) 2011-2016 Giuseppe Lettieri
    7  * Copyright (C) 2011-2016 Vincenzo Maffione
    8  * All rights reserved.
    9  *
   10  * Redistribution and use in source and binary forms, with or without
   11  * modification, are permitted provided that the following conditions
   12  * are met:
   13  *   1. Redistributions of source code must retain the above copyright
   14  *      notice, this list of conditions and the following disclaimer.
   15  *   2. Redistributions in binary form must reproduce the above copyright
   16  *      notice, this list of conditions and the following disclaimer in the
   17  *      documentation and/or other materials provided with the distribution.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   29  * SUCH DAMAGE.
   30  */
   31 
   32 
   33 /*
   34  * $FreeBSD$
   35  *
   36  * This module supports memory mapped access to network devices,
   37  * see netmap(4).
   38  *
   39  * The module uses a large memory pool allocated by the kernel
   40  * and accessible as mmapped memory by multiple userspace threads/processes.
   41  * The memory pool contains packet buffers and "netmap rings",
   42  * i.e. user-accessible copies of the interface's queues.
   43  *
   44  * Access to the network card works like this:
   45  * 1. a process/thread issues one or more open() on /dev/netmap, to create
   46  *    select()able file descriptor on which events are reported.
   47  * 2. on each descriptor, the process issues an ioctl() to identify
   48  *    the interface that should report events to the file descriptor.
   49  * 3. on each descriptor, the process issues an mmap() request to
   50  *    map the shared memory region within the process' address space.
   51  *    The list of interesting queues is indicated by a location in
   52  *    the shared memory region.
   53  * 4. using the functions in the netmap(4) userspace API, a process
   54  *    can look up the occupation state of a queue, access memory buffers,
   55  *    and retrieve received packets or enqueue packets to transmit.
   56  * 5. using some ioctl()s the process can synchronize the userspace view
   57  *    of the queue with the actual status in the kernel. This includes both
   58  *    receiving the notification of new packets, and transmitting new
   59  *    packets on the output interface.
   60  * 6. select() or poll() can be used to wait for events on individual
   61  *    transmit or receive queues (or all queues for a given interface).
   62  *
   63 
   64                 SYNCHRONIZATION (USER)
   65 
   66 The netmap rings and data structures may be shared among multiple
   67 user threads or even independent processes.
   68 Any synchronization among those threads/processes is delegated
   69 to the threads themselves. Only one thread at a time can be in
   70 a system call on the same netmap ring. The OS does not enforce
   71 this and only guarantees against system crashes in case of
   72 invalid usage.
   73 
   74                 LOCKING (INTERNAL)
   75 
   76 Within the kernel, access to the netmap rings is protected as follows:
   77 
   78 - a spinlock on each ring, to handle producer/consumer races on
   79   RX rings attached to the host stack (against multiple host
   80   threads writing from the host stack to the same ring),
   81   and on 'destination' rings attached to a VALE switch
   82   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
   83   protecting against multiple active senders for the same destination
   84 
   85 - an atomic variable to guarantee that there is at most one
   86   instance of *_*xsync() on the ring at any time.
   87   For rings connected to user file
   88   descriptors, an atomic_test_and_set() protects this, and the
   89   lock on the ring is not actually used.
   90   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
   91   is also used to prevent multiple executions (the driver might indeed
   92   already guarantee this).
   93   For NIC TX rings connected to a VALE switch, the lock arbitrates
   94   access to the queue (both when allocating buffers and when pushing
   95   them out).
   96 
   97 - *xsync() should be protected against initializations of the card.
   98   On FreeBSD most devices have the reset routine protected by
   99   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
  100   the RING protection on rx_reset(), this should be added.
  101 
  102   On linux there is an external lock on the tx path, which probably
  103   also arbitrates access to the reset routine. XXX to be revised
  104 
  105 - a per-interface core_lock protecting access from the host stack
  106   while interfaces may be detached from netmap mode.
  107   XXX there should be no need for this lock if we detach the interfaces
  108   only while they are down.
  109 
  110 
  111 --- VALE SWITCH ---
  112 
  113 NMG_LOCK() serializes all modifications to switches and ports.
  114 A switch cannot be deleted until all ports are gone.
  115 
  116 For each switch, an SX lock (RWlock on linux) protects
  117 deletion of ports. When configuring or deleting a new port, the
  118 lock is acquired in exclusive mode (after holding NMG_LOCK).
  119 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
  120 The lock is held throughout the entire forwarding cycle,
  121 during which the thread may incur a page fault.
  122 Hence it is important that sleepable shared locks are used.
  123 
  124 On the rx ring, the per-port lock is grabbed initially to reserve
  125 a number of slots in the ring, then the lock is released,
  126 packets are copied from source to destination, and then
  127 the lock is acquired again and the receive ring is updated.
  128 (A similar thing is done on the tx ring for NIC and host stack
  129 ports attached to the switch)
  130 
  131  */
  132 
  133 
  134 /* --- internals ----
  135  *
  136  * Roadmap to the code that implements the above.
  137  *
  138  * > 1. a process/thread issues one or more open() on /dev/netmap, to create
  139  * >    select()able file descriptor on which events are reported.
  140  *
  141  *      Internally, we allocate a netmap_priv_d structure, that will be
  142  *      initialized on ioctl(NIOCREGIF). There is one netmap_priv_d
  143  *      structure for each open().
  144  *
  145  *      os-specific:
  146  *          FreeBSD: see netmap_open() (netmap_freebsd.c)
  147  *          linux:   see linux_netmap_open() (netmap_linux.c)
  148  *
  149  * > 2. on each descriptor, the process issues an ioctl() to identify
  150  * >    the interface that should report events to the file descriptor.
  151  *
  152  *      Implemented by netmap_ioctl(), NIOCREGIF case, with nmr->nr_cmd==0.
  153  *      Most important things happen in netmap_get_na() and
  154  *      netmap_do_regif(), called from there. Additional details can be
  155  *      found in the comments above those functions.
  156  *
  157  *      In all cases, this action creates/takes-a-reference-to a
  158  *      netmap_*_adapter describing the port, and allocates a netmap_if
  159  *      and all necessary netmap rings, filling them with netmap buffers.
  160  *
  161  *      In this phase, the sync callbacks for each ring are set (these are used
  162  *      in steps 5 and 6 below).  The callbacks depend on the type of adapter.
  163  *      The adapter creation/initialization code puts them in the
  164  *      netmap_adapter (fields na->nm_txsync and na->nm_rxsync).  Then, they
  165  *      are copied from there to the netmap_kring's during netmap_do_regif(), by
  166  *      the nm_krings_create() callback.  All the nm_krings_create callbacks
  167  *      actually call netmap_krings_create() to perform this and the other
  168  *      common stuff. netmap_krings_create() also takes care of the host rings,
  169  *      if needed, by setting their sync callbacks appropriately.
  170  *
  171  *      Additional actions depend on the kind of netmap_adapter that has been
  172  *      registered:
  173  *
  174  *      - netmap_hw_adapter:         [netmap.c]
  175  *           This is a system netdev/ifp with native netmap support.
  176  *           The ifp is detached from the host stack by redirecting:
  177  *             - transmissions (from the network stack) to netmap_transmit()
  178  *             - receive notifications to the nm_notify() callback for
  179  *               this adapter. The callback is normally netmap_notify(), unless
  180  *               the ifp is attached to a bridge using bwrap, in which case it
  181  *               is netmap_bwrap_intr_notify().
  182  *
  183  *      - netmap_generic_adapter:      [netmap_generic.c]
  184  *            A system netdev/ifp without native netmap support.
  185  *
  186  *      (the decision about native/non native support is taken in
  187  *       netmap_get_hw_na(), called by netmap_get_na())
  188  *
  189  *      - netmap_vp_adapter             [netmap_vale.c]
  190  *            Returned by netmap_get_bdg_na().
  191  *            This is a persistent or ephemeral VALE port. Ephemeral ports
  192  *            are created on the fly if they don't already exist, and are
  193  *            always attached to a bridge.
  194  *            Persistent VALE ports must be created separately, and
  195  *            then attached like normal NICs. The NIOCREGIF we are examining
  196  *            will find them only if they had previously been created and
  197  *            attached (see VALE_CTL below).
  198  *
  199  *      - netmap_pipe_adapter         [netmap_pipe.c]
  200  *            Returned by netmap_get_pipe_na().
  201  *            Both pipe ends are created, if they didn't already exist.
  202  *
  203  *      - netmap_monitor_adapter      [netmap_monitor.c]
  204  *            Returned by netmap_get_monitor_na().
  205  *            If successful, the nm_sync callbacks of the monitored adapter
  206  *            will be intercepted by the returned monitor.
  207  *
  208  *      - netmap_bwrap_adapter        [netmap_vale.c]
  209  *            Cannot be obtained in this way, see VALE_CTL below
  210  *
  211  *
  212  *      os-specific:
  213  *          linux: we first go through linux_netmap_ioctl() to
  214  *                 adapt the FreeBSD interface to the linux one.
  215  *
  216  *
  217  * > 3. on each descriptor, the process issues an mmap() request to
  218  * >    map the shared memory region within the process' address space.
  219  * >    The list of interesting queues is indicated by a location in
  220  * >    the shared memory region.
  221  *
  222  *      os-specific:
  223  *          FreeBSD: netmap_mmap_single (netmap_freebsd.c).
  224  *          linux:   linux_netmap_mmap (netmap_linux.c).
  225  *
  226  * > 4. using the functions in the netmap(4) userspace API, a process
  227  * >    can look up the occupation state of a queue, access memory buffers,
  228  * >    and retrieve received packets or enqueue packets to transmit.
  229  *
  230  *      these actions do not involve the kernel.
  231  *
  232  * > 5. using some ioctl()s the process can synchronize the userspace view
  233  * >    of the queue with the actual status in the kernel. This includes both
  234  * >    receiving the notification of new packets, and transmitting new
  235  * >    packets on the output interface.
  236  *
  237  *      These are implemented in netmap_ioctl(), NIOCTXSYNC and NIOCRXSYNC
  238  *      cases. They invoke the nm_sync callbacks on the netmap_kring
  239  *      structures, as initialized in step 2 and maybe later modified
  240  *      by a monitor. Monitors, however, will always call the original
  241  *      callback before doing anything else.
  242  *
  243  *
  244  * > 6. select() or poll() can be used to wait for events on individual
  245  * >    transmit or receive queues (or all queues for a given interface).
  246  *
  247  *      Implemented in netmap_poll(). This will call the same nm_sync()
  248  *      callbacks as in step 5 above.
  249  *
  250  *      os-specific:
  251  *              linux: we first go through linux_netmap_poll() to adapt
  252  *                     the FreeBSD interface to the linux one.
  253  *
  254  *
  255  *  ----  VALE_CTL -----
  256  *
  257  *  VALE switches are controlled by issuing a NIOCREGIF with a non-null
  258  *  nr_cmd in the nmreq structure. These subcommands are handled by
  259  *  netmap_bdg_ctl() in netmap_vale.c. Persistent VALE ports are created
  260  *  and destroyed by issuing the NETMAP_BDG_NEWIF and NETMAP_BDG_DELIF
  261  *  subcommands, respectively.
  262  *
  263  *  Any network interface known to the system (including a persistent VALE
  264  *  port) can be attached to a VALE switch by issuing the
  265  *  NETMAP_REQ_VALE_ATTACH command. After the attachment, persistent VALE ports
  266  *  look exactly like ephemeral VALE ports (as created in step 2 above).  The
  267  *  attachment of other interfaces, instead, requires the creation of a
  268  *  netmap_bwrap_adapter.  Moreover, the attached interface must be put in
  269  *  netmap mode. This may require the creation of a netmap_generic_adapter if
  270  *  we have no native support for the interface, or if generic adapters have
  271  *  been forced by sysctl.
  272  *
  273  *  Both persistent VALE ports and bwraps are handled by netmap_get_bdg_na(),
  274  *  called by nm_bdg_ctl_attach(), and discriminated by the nm_bdg_attach()
  275  *  callback.  In the case of the bwrap, the callback creates the
  276  *  netmap_bwrap_adapter.  The initialization of the bwrap is then
  277  *  completed by calling netmap_do_regif() on it, in the nm_bdg_ctl()
  278  *  callback (netmap_bwrap_bdg_ctl in netmap_vale.c).
  279  *  A generic adapter for the wrapped ifp will be created if needed, when
  280  *  netmap_get_bdg_na() calls netmap_get_hw_na().
  281  *
  282  *
  283  *  ---- DATAPATHS -----
  284  *
  285  *              -= SYSTEM DEVICE WITH NATIVE SUPPORT =-
  286  *
  287  *    na == NA(ifp) == netmap_hw_adapter created in DEVICE_netmap_attach()
  288  *
  289  *    - tx from netmap userspace:
  290  *       concurrently:
  291  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
  292  *                kring->nm_sync() == DEVICE_netmap_txsync()
  293  *           2) device interrupt handler
  294  *                na->nm_notify()  == netmap_notify()
  295  *    - rx from netmap userspace:
  296  *       concurrently:
  297  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
  298  *                kring->nm_sync() == DEVICE_netmap_rxsync()
  299  *           2) device interrupt handler
  300  *                na->nm_notify()  == netmap_notify()
  301  *    - rx from host stack
  302  *       concurrently:
  303  *           1) host stack
  304  *                netmap_transmit()
  305  *                  na->nm_notify  == netmap_notify()
  306  *           2) ioctl(NIOCRXSYNC)/netmap_poll() in process context
  307  *                kring->nm_sync() == netmap_rxsync_from_host
  308  *                  netmap_rxsync_from_host(na, NULL, NULL)
  309  *    - tx to host stack
  310  *           ioctl(NIOCTXSYNC)/netmap_poll() in process context
  311  *             kring->nm_sync() == netmap_txsync_to_host
  312  *               netmap_txsync_to_host(na)
  313  *                 nm_os_send_up()
  314  *                   FreeBSD: na->if_input() == ether_input()
  315  *                   linux: netif_rx() with NM_MAGIC_PRIORITY_RX
  316  *
  317  *
  318  *               -= SYSTEM DEVICE WITH GENERIC SUPPORT =-
  319  *
  320  *    na == NA(ifp) == generic_netmap_adapter created in generic_netmap_attach()
  321  *
  322  *    - tx from netmap userspace:
  323  *       concurrently:
  324  *           1) ioctl(NIOCTXSYNC)/netmap_poll() in process context
  325  *               kring->nm_sync() == generic_netmap_txsync()
  326  *                   nm_os_generic_xmit_frame()
  327  *                       linux:   dev_queue_xmit() with NM_MAGIC_PRIORITY_TX
  328  *                           ifp->ndo_start_xmit == generic_ndo_start_xmit()
  329  *                               gna->save_start_xmit == orig. dev. start_xmit
  330  *                       FreeBSD: na->if_transmit() == orig. dev if_transmit
  331  *           2) generic_mbuf_destructor()
  332  *                   na->nm_notify() == netmap_notify()
  333  *    - rx from netmap userspace:
  334  *           1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
  335  *               kring->nm_sync() == generic_netmap_rxsync()
  336  *                   mbq_safe_dequeue()
  337  *           2) device driver
  338  *               generic_rx_handler()
  339  *                   mbq_safe_enqueue()
  340  *                   na->nm_notify() == netmap_notify()
  341  *    - rx from host stack
  342  *        FreeBSD: same as native
  343  *        Linux: same as native except:
  344  *           1) host stack
  345  *               dev_queue_xmit() without NM_MAGIC_PRIORITY_TX
  346  *                   ifp->ndo_start_xmit == generic_ndo_start_xmit()
  347  *                       netmap_transmit()
  348  *                           na->nm_notify() == netmap_notify()
  349  *    - tx to host stack (same as native):
  350  *
  351  *
  352  *                           -= VALE =-
  353  *
  354  *   INCOMING:
  355  *
  356  *      - VALE ports:
  357  *          ioctl(NIOCTXSYNC)/netmap_poll() in process context
  358  *              kring->nm_sync() == netmap_vp_txsync()
  359  *
  360  *      - system device with native support:
  361  *         from cable:
  362  *             interrupt
  363  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
  364  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
  365  *                     netmap_vp_txsync()
  366  *                     kring->nm_sync() == DEVICE_netmap_rxsync()
  367  *         from host stack:
  368  *             netmap_transmit()
  369  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
  370  *                     kring->nm_sync() == netmap_rxsync_from_host()
  371  *                     netmap_vp_txsync()
  372  *
  373  *      - system device with generic support:
  374  *         from device driver:
  375  *            generic_rx_handler()
  376  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr != host ring)
  377  *                     kring->nm_sync() == generic_netmap_rxsync()
  378  *                     netmap_vp_txsync()
  379  *                     kring->nm_sync() == generic_netmap_rxsync()
  380  *         from host stack:
  381  *            netmap_transmit()
  382  *                na->nm_notify() == netmap_bwrap_intr_notify(ring_nr == host ring)
  383  *                     kring->nm_sync() == netmap_rxsync_from_host()
  384  *                     netmap_vp_txsync()
  385  *
  386  *   (all cases) --> nm_bdg_flush()
  387  *                      dest_na->nm_notify() == (see below)
  388  *
  389  *   OUTGOING:
  390  *
  391  *      - VALE ports:
  392  *         concurrently:
  393  *             1) ioctl(NIOCRXSYNC)/netmap_poll() in process context
  394  *                    kring->nm_sync() == netmap_vp_rxsync()
  395  *             2) from nm_bdg_flush()
  396  *                    na->nm_notify() == netmap_notify()
  397  *
  398  *      - system device with native support:
  399  *          to cable:
  400  *             na->nm_notify() == netmap_bwrap_notify()
  401  *                 netmap_vp_rxsync()
  402  *                 kring->nm_sync() == DEVICE_netmap_txsync()
  403  *                 netmap_vp_rxsync()
  404  *          to host stack:
  405  *                 netmap_vp_rxsync()
  406  *                 kring->nm_sync() == netmap_txsync_to_host
  407  *                 netmap_vp_rxsync_locked()
  408  *
  409  *      - system device with generic adapter:
  410  *          to device driver:
  411  *             na->nm_notify() == netmap_bwrap_notify()
  412  *                 netmap_vp_rxsync()
  413  *                 kring->nm_sync() == generic_netmap_txsync()
  414  *                 netmap_vp_rxsync()
  415  *          to host stack:
  416  *                 netmap_vp_rxsync()
  417  *                 kring->nm_sync() == netmap_txsync_to_host
  418  *                 netmap_vp_rxsync()
  419  *
  420  */
  421 
  422 /*
  423  * OS-specific code that is used only within this file.
  424  * Other OS-specific code that must be accessed by drivers
  425  * is present in netmap_kern.h
  426  */
  427 
  428 #if defined(__FreeBSD__)
  429 #include <sys/cdefs.h> /* prerequisite */
  430 #include <sys/types.h>
  431 #include <sys/errno.h>
  432 #include <sys/param.h>  /* defines used in kernel.h */
  433 #include <sys/kernel.h> /* types used in module initialization */
  434 #include <sys/conf.h>   /* cdevsw struct, UID, GID */
  435 #include <sys/filio.h>  /* FIONBIO */
  436 #include <sys/sockio.h>
  437 #include <sys/socketvar.h>      /* struct socket */
  438 #include <sys/malloc.h>
  439 #include <sys/poll.h>
  440 #include <sys/proc.h>
  441 #include <sys/rwlock.h>
  442 #include <sys/socket.h> /* sockaddrs */
  443 #include <sys/selinfo.h>
  444 #include <sys/sysctl.h>
  445 #include <sys/jail.h>
  446 #include <sys/epoch.h>
  447 #include <net/vnet.h>
  448 #include <net/if.h>
  449 #include <net/if_var.h>
  450 #include <net/bpf.h>            /* BIOCIMMEDIATE */
  451 #include <machine/bus.h>        /* bus_dmamap_* */
  452 #include <sys/endian.h>
  453 #include <sys/refcount.h>
  454 #include <net/ethernet.h>       /* ETHER_BPF_MTAP */
  455 
  456 
  457 #elif defined(linux)
  458 
  459 #include "bsd_glue.h"
  460 
  461 #elif defined(__APPLE__)
  462 
  463 #warning OSX support is only partial
  464 #include "osx_glue.h"
  465 
  466 #elif defined (_WIN32)
  467 
  468 #include "win_glue.h"
  469 
  470 #else
  471 
  472 #error  Unsupported platform
  473 
  474 #endif /* unsupported */
  475 
  476 /*
  477  * common headers
  478  */
  479 #include <net/netmap.h>
  480 #include <dev/netmap/netmap_kern.h>
  481 #include <dev/netmap/netmap_mem2.h>
  482 
  483 
  484 /* user-controlled variables */
  485 int netmap_verbose;
  486 #ifdef CONFIG_NETMAP_DEBUG
  487 int netmap_debug;
  488 #endif /* CONFIG_NETMAP_DEBUG */
  489 
  490 static int netmap_no_timestamp; /* don't timestamp on rxsync */
  491 int netmap_no_pendintr = 1;
  492 int netmap_txsync_retry = 2;
  493 static int netmap_fwd = 0;      /* force transparent forwarding */
  494 
  495 /*
  496  * netmap_admode selects the netmap mode to use.
  497  * Invalid values are reset to NETMAP_ADMODE_BEST
  498  */
  499 enum {  NETMAP_ADMODE_BEST = 0, /* use native, fallback to generic */
  500         NETMAP_ADMODE_NATIVE,   /* either native or none */
  501         NETMAP_ADMODE_GENERIC,  /* force generic */
  502         NETMAP_ADMODE_LAST };
  503 static int netmap_admode = NETMAP_ADMODE_BEST;
  504 
  505 /* netmap_generic_mit controls mitigation of RX notifications for
  506  * the generic netmap adapter. The value is a time interval in
  507  * nanoseconds. */
  508 int netmap_generic_mit = 100*1000;
  509 
  510 /* We use by default netmap-aware qdiscs with generic netmap adapters,
  511  * even if there can be a little performance hit with hardware NICs.
  512  * However, using the qdisc is the safer approach, for two reasons:
  513  * 1) it prevents non-fifo qdiscs to break the TX notification
  514  *    scheme, which is based on mbuf destructors when txqdisc is
  515  *    not used.
  516  * 2) it makes it possible to transmit over software devices that
  517  *    change skb->dev, like bridge, veth, ...
  518  *
  519  * Anyway users looking for the best performance should
  520  * use native adapters.
  521  */
  522 #ifdef linux
  523 int netmap_generic_txqdisc = 1;
  524 #endif
  525 
  526 /* Default number of slots and queues for generic adapters. */
  527 int netmap_generic_ringsize = 1024;
  528 int netmap_generic_rings = 1;
  529 
  530 /* Non-zero to enable checksum offloading in NIC drivers */
  531 int netmap_generic_hwcsum = 0;
  532 
  533 /* Non-zero if ptnet devices are allowed to use virtio-net headers. */
  534 int ptnet_vnet_hdr = 1;
  535 
  536 /*
  537  * SYSCTL calls are grouped between SYSBEGIN and SYSEND to be emulated
  538  * in some other operating systems
  539  */
  540 SYSBEGIN(main_init);
  541 
  542 SYSCTL_DECL(_dev_netmap);
  543 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  544     "Netmap args");
  545 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
  546                 CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
  547 #ifdef CONFIG_NETMAP_DEBUG
  548 SYSCTL_INT(_dev_netmap, OID_AUTO, debug,
  549                 CTLFLAG_RW, &netmap_debug, 0, "Debug messages");
  550 #endif /* CONFIG_NETMAP_DEBUG */
  551 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
  552                 CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
  553 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr, CTLFLAG_RW, &netmap_no_pendintr,
  554                 0, "Always look for new received packets.");
  555 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
  556                 &netmap_txsync_retry, 0, "Number of txsync loops in bridge's flush.");
  557 
  558 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0,
  559                 "Force NR_FORWARD mode");
  560 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0,
  561                 "Adapter mode. 0 selects the best option available,"
  562                 "1 forces native adapter, 2 forces emulated adapter");
  563 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_hwcsum, CTLFLAG_RW, &netmap_generic_hwcsum,
  564                 0, "Hardware checksums. 0 to disable checksum generation by the NIC (default),"
  565                 "1 to enable checksum generation by the NIC");
  566 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit,
  567                 0, "RX notification interval in nanoseconds");
  568 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW,
  569                 &netmap_generic_ringsize, 0,
  570                 "Number of per-ring slots for emulated netmap mode");
  571 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_rings, CTLFLAG_RW,
  572                 &netmap_generic_rings, 0,
  573                 "Number of TX/RX queues for emulated netmap adapters");
  574 #ifdef linux
  575 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_txqdisc, CTLFLAG_RW,
  576                 &netmap_generic_txqdisc, 0, "Use qdisc for generic adapters");
  577 #endif
  578 SYSCTL_INT(_dev_netmap, OID_AUTO, ptnet_vnet_hdr, CTLFLAG_RW, &ptnet_vnet_hdr,
  579                 0, "Allow ptnet devices to use virtio-net headers");
  580 
  581 SYSEND;
  582 
  583 NMG_LOCK_T      netmap_global_lock;
  584 
  585 /*
  586  * mark the ring as stopped, and run through the locks
  587  * to make sure other users get to see it.
  588  * stopped must be either NR_KR_STOPPED (for unbounded stop)
  589  * of NR_KR_LOCKED (brief stop for mutual exclusion purposes)
  590  */
  591 static void
  592 netmap_disable_ring(struct netmap_kring *kr, int stopped)
  593 {
  594         nm_kr_stop(kr, stopped);
  595         // XXX check if nm_kr_stop is sufficient
  596         mtx_lock(&kr->q_lock);
  597         mtx_unlock(&kr->q_lock);
  598         nm_kr_put(kr);
  599 }
  600 
  601 /* stop or enable a single ring */
  602 void
  603 netmap_set_ring(struct netmap_adapter *na, u_int ring_id, enum txrx t, int stopped)
  604 {
  605         if (stopped)
  606                 netmap_disable_ring(NMR(na, t)[ring_id], stopped);
  607         else
  608                 NMR(na, t)[ring_id]->nkr_stopped = 0;
  609 }
  610 
  611 
  612 /* stop or enable all the rings of na */
  613 void
  614 netmap_set_all_rings(struct netmap_adapter *na, int stopped)
  615 {
  616         int i;
  617         enum txrx t;
  618 
  619         if (!nm_netmap_on(na))
  620                 return;
  621 
  622         if (netmap_verbose) {
  623                 nm_prinf("%s: %sable all rings", na->name,
  624                     (stopped ? "dis" : "en"));
  625         }
  626         for_rx_tx(t) {
  627                 for (i = 0; i < netmap_real_rings(na, t); i++) {
  628                         netmap_set_ring(na, i, t, stopped);
  629                 }
  630         }
  631 }
  632 
  633 /*
  634  * Convenience function used in drivers.  Waits for current txsync()s/rxsync()s
  635  * to finish and prevents any new one from starting.  Call this before turning
  636  * netmap mode off, or before removing the hardware rings (e.g., on module
  637  * onload).
  638  */
  639 void
  640 netmap_disable_all_rings(struct ifnet *ifp)
  641 {
  642         if (NM_NA_VALID(ifp)) {
  643                 netmap_set_all_rings(NA(ifp), NM_KR_LOCKED);
  644         }
  645 }
  646 
  647 /*
  648  * Convenience function used in drivers.  Re-enables rxsync and txsync on the
  649  * adapter's rings In linux drivers, this should be placed near each
  650  * napi_enable().
  651  */
  652 void
  653 netmap_enable_all_rings(struct ifnet *ifp)
  654 {
  655         if (NM_NA_VALID(ifp)) {
  656                 netmap_set_all_rings(NA(ifp), 0 /* enabled */);
  657         }
  658 }
  659 
  660 void
  661 netmap_make_zombie(struct ifnet *ifp)
  662 {
  663         if (NM_NA_VALID(ifp)) {
  664                 struct netmap_adapter *na = NA(ifp);
  665                 netmap_set_all_rings(na, NM_KR_LOCKED);
  666                 na->na_flags |= NAF_ZOMBIE;
  667                 netmap_set_all_rings(na, 0);
  668         }
  669 }
  670 
  671 void
  672 netmap_undo_zombie(struct ifnet *ifp)
  673 {
  674         if (NM_NA_VALID(ifp)) {
  675                 struct netmap_adapter *na = NA(ifp);
  676                 if (na->na_flags & NAF_ZOMBIE) {
  677                         netmap_set_all_rings(na, NM_KR_LOCKED);
  678                         na->na_flags &= ~NAF_ZOMBIE;
  679                         netmap_set_all_rings(na, 0);
  680                 }
  681         }
  682 }
  683 
  684 /*
  685  * generic bound_checking function
  686  */
  687 u_int
  688 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
  689 {
  690         u_int oldv = *v;
  691         const char *op = NULL;
  692 
  693         if (dflt < lo)
  694                 dflt = lo;
  695         if (dflt > hi)
  696                 dflt = hi;
  697         if (oldv < lo) {
  698                 *v = dflt;
  699                 op = "Bump";
  700         } else if (oldv > hi) {
  701                 *v = hi;
  702                 op = "Clamp";
  703         }
  704         if (op && msg)
  705                 nm_prinf("%s %s to %d (was %d)", op, msg, *v, oldv);
  706         return *v;
  707 }
  708 
  709 
  710 /*
  711  * packet-dump function, user-supplied or static buffer.
  712  * The destination buffer must be at least 30+4*len
  713  */
  714 const char *
  715 nm_dump_buf(char *p, int len, int lim, char *dst)
  716 {
  717         static char _dst[8192];
  718         int i, j, i0;
  719         static char hex[] ="0123456789abcdef";
  720         char *o;        /* output position */
  721 
  722 #define P_HI(x) hex[((x) & 0xf0)>>4]
  723 #define P_LO(x) hex[((x) & 0xf)]
  724 #define P_C(x)  ((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
  725         if (!dst)
  726                 dst = _dst;
  727         if (lim <= 0 || lim > len)
  728                 lim = len;
  729         o = dst;
  730         sprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
  731         o += strlen(o);
  732         /* hexdump routine */
  733         for (i = 0; i < lim; ) {
  734                 sprintf(o, "%5d: ", i);
  735                 o += strlen(o);
  736                 memset(o, ' ', 48);
  737                 i0 = i;
  738                 for (j=0; j < 16 && i < lim; i++, j++) {
  739                         o[j*3] = P_HI(p[i]);
  740                         o[j*3+1] = P_LO(p[i]);
  741                 }
  742                 i = i0;
  743                 for (j=0; j < 16 && i < lim; i++, j++)
  744                         o[j + 48] = P_C(p[i]);
  745                 o[j+48] = '\n';
  746                 o += j+49;
  747         }
  748         *o = '\0';
  749 #undef P_HI
  750 #undef P_LO
  751 #undef P_C
  752         return dst;
  753 }
  754 
  755 
  756 /*
  757  * Fetch configuration from the device, to cope with dynamic
  758  * reconfigurations after loading the module.
  759  */
  760 /* call with NMG_LOCK held */
  761 int
  762 netmap_update_config(struct netmap_adapter *na)
  763 {
  764         struct nm_config_info info;
  765 
  766         if (na->ifp && !nm_is_bwrap(na)) {
  767                 strlcpy(na->name, na->ifp->if_xname, sizeof(na->name));
  768         }
  769 
  770         bzero(&info, sizeof(info));
  771         if (na->nm_config == NULL ||
  772             na->nm_config(na, &info)) {
  773                 /* take whatever we had at init time */
  774                 info.num_tx_rings = na->num_tx_rings;
  775                 info.num_tx_descs = na->num_tx_desc;
  776                 info.num_rx_rings = na->num_rx_rings;
  777                 info.num_rx_descs = na->num_rx_desc;
  778                 info.rx_buf_maxsize = na->rx_buf_maxsize;
  779         }
  780 
  781         if (na->num_tx_rings == info.num_tx_rings &&
  782             na->num_tx_desc == info.num_tx_descs &&
  783             na->num_rx_rings == info.num_rx_rings &&
  784             na->num_rx_desc == info.num_rx_descs &&
  785             na->rx_buf_maxsize == info.rx_buf_maxsize)
  786                 return 0; /* nothing changed */
  787         if (na->active_fds == 0) {
  788                 na->num_tx_rings = info.num_tx_rings;
  789                 na->num_tx_desc = info.num_tx_descs;
  790                 na->num_rx_rings = info.num_rx_rings;
  791                 na->num_rx_desc = info.num_rx_descs;
  792                 na->rx_buf_maxsize = info.rx_buf_maxsize;
  793                 if (netmap_verbose)
  794                         nm_prinf("configuration changed for %s: txring %d x %d, "
  795                                 "rxring %d x %d, rxbufsz %d",
  796                                 na->name, na->num_tx_rings, na->num_tx_desc,
  797                                 na->num_rx_rings, na->num_rx_desc, na->rx_buf_maxsize);
  798                 return 0;
  799         }
  800         nm_prerr("WARNING: configuration changed for %s while active: "
  801                 "txring %d x %d, rxring %d x %d, rxbufsz %d",
  802                 na->name, info.num_tx_rings, info.num_tx_descs,
  803                 info.num_rx_rings, info.num_rx_descs,
  804                 info.rx_buf_maxsize);
  805         return 1;
  806 }
  807 
/*
 * nm_sync callbacks for the host rings. They are declared here because
 * netmap_krings_create() installs them before their definitions appear.
 */
static int netmap_txsync_to_host(struct netmap_kring *kring, int flags);
static int netmap_rxsync_from_host(struct netmap_kring *kring, int flags);
  811 
  812 static int
  813 netmap_default_bufcfg(struct netmap_kring *kring, uint64_t target)
  814 {
  815         kring->hwbuf_len = target;
  816         kring->buf_align = 0; /* no alignment */
  817         return 0;
  818 }
  819 
/* create the krings array and initialize the fields common to all adapters.
 * The array layout is this:
 *
 *                    +----------+
 * na->tx_rings ----->|          | \
 *                    |          |  } na->num_tx_rings
 *                    |          | /
 *                    +----------+
 *                    |          |    host tx kring
 * na->rx_rings ----> +----------+
 *                    |          | \
 *                    |          |  } na->num_rx_rings
 *                    |          | /
 *                    +----------+
 *                    |          |    host rx kring
 *                    +----------+
 * na->tailroom ----->|          | \
 *                    |          |  } tailroom bytes
 *                    |          | /
 *                    +----------+
 *
 * Each entry of the array is a pointer to a struct netmap_kring; the kring
 * structures themselves live in the same allocation, right after the
 * tailroom.
 *
 * Note: for compatibility, host krings are created even when not needed.
 * The tailroom space is currently used by vale ports for allocating leases.
 *
 * Returns 0 on success (also when the krings already exist), ENOMEM if the
 * allocation fails, or the error returned by nm_os_selinfo_init(); in the
 * latter case everything is released through netmap_krings_delete().
 */
/* call with NMG_LOCK held */
int
netmap_krings_create(struct netmap_adapter *na, u_int tailroom)
{
        u_int i, len, ndesc;
        struct netmap_kring *kring;
        u_int n[NR_TXRX];       /* number of krings, host ones included */
        enum txrx t;
        int err = 0;

        if (na->tx_rings != NULL) {
                if (netmap_debug & NM_DEBUG_ON)
                        nm_prerr("warning: krings were already created");
                return 0;
        }

        /* account for the (possibly fake) host rings */
        n[NR_TX] = netmap_all_rings(na, NR_TX);
        n[NR_RX] = netmap_all_rings(na, NR_RX);

        /* one pointer and one kring structure per ring, plus the tailroom */
        len = (n[NR_TX] + n[NR_RX]) *
                (sizeof(struct netmap_kring) + sizeof(struct netmap_kring *))
                + tailroom;

        na->tx_rings = nm_os_malloc((size_t)len);
        if (na->tx_rings == NULL) {
                nm_prerr("Cannot allocate krings");
                return ENOMEM;
        }
        na->rx_rings = na->tx_rings + n[NR_TX];
        na->tailroom = na->rx_rings + n[NR_RX];

        /* link the krings in the krings array */
        kring = (struct netmap_kring *)((char *)na->tailroom + tailroom);
        for (i = 0; i < n[NR_TX] + n[NR_RX]; i++) {
                na->tx_rings[i] = kring;
                kring++;
        }

        /*
         * All fields in krings are 0 except the one initialized below.
         * but better be explicit on important kring fields.
         */
        for_rx_tx(t) {
                ndesc = nma_get_ndesc(na, t);
                for (i = 0; i < n[t]; i++) {
                        kring = NMR(na, t)[i];
                        bzero(kring, sizeof(*kring));
                        kring->notify_na = na;
                        kring->ring_id = i;
                        kring->tx = t;
                        kring->nkr_num_slots = ndesc;
                        kring->nr_mode = NKR_NETMAP_OFF;
                        kring->nr_pending_mode = NKR_NETMAP_OFF;
                        if (i < nma_get_nrings(na, t)) {
                                /* hw ring: use the callbacks of the adapter */
                                kring->nm_sync = (t == NR_TX ? na->nm_txsync : na->nm_rxsync);
                                kring->nm_bufcfg = na->nm_bufcfg;
                                if (kring->nm_bufcfg == NULL)
                                        kring->nm_bufcfg = netmap_default_bufcfg;
                        } else {
                                /* host ring, fake if the adapter has no host rings */
                                if (!(na->na_flags & NAF_HOST_RINGS))
                                        kring->nr_kflags |= NKR_FAKERING;
                                kring->nm_sync = (t == NR_TX ?
                                                netmap_txsync_to_host:
                                                netmap_rxsync_from_host);
                                kring->nm_bufcfg = netmap_default_bufcfg;
                        }
                        kring->nm_notify = na->nm_notify;
                        kring->rhead = kring->rcur = kring->nr_hwcur = 0;
                        /*
                         * IMPORTANT: Always keep one slot empty.
                         */
                        kring->rtail = kring->nr_hwtail = (t == NR_TX ? ndesc - 1 : 0);
                        snprintf(kring->name, sizeof(kring->name) - 1, "%s %s%d", na->name,
                                        nm_txrx2str(t), i);
                        nm_prdis("ktx %s h %d c %d t %d",
                                kring->name, kring->rhead, kring->rcur, kring->rtail);
                        err = nm_os_selinfo_init(&kring->si, kring->name);
                        if (err) {
                                /* kring->na is still NULL here, so the delete
                                 * routine will not destroy this kring's mutex */
                                netmap_krings_delete(na);
                                return err;
                        }
                        mtx_init(&kring->q_lock, (t == NR_TX ? "nm_txq_lock" : "nm_rxq_lock"), NULL, MTX_DEF);
                        kring->na = na; /* setting this field marks the mutex as initialized */
                }
                /* per-direction selinfo of the adapter */
                err = nm_os_selinfo_init(&na->si[t], na->name);
                if (err) {
                        netmap_krings_delete(na);
                        return err;
                }
        }

        return 0;
}
  938 
  939 
  940 /* undo the actions performed by netmap_krings_create */
  941 /* call with NMG_LOCK held */
  942 void
  943 netmap_krings_delete(struct netmap_adapter *na)
  944 {
  945         struct netmap_kring **kring = na->tx_rings;
  946         enum txrx t;
  947 
  948         if (na->tx_rings == NULL) {
  949                 if (netmap_debug & NM_DEBUG_ON)
  950                         nm_prerr("warning: krings were already deleted");
  951                 return;
  952         }
  953 
  954         for_rx_tx(t)
  955                 nm_os_selinfo_uninit(&na->si[t]);
  956 
  957         /* we rely on the krings layout described above */
  958         for ( ; kring != na->tailroom; kring++) {
  959                 if ((*kring)->na != NULL)
  960                         mtx_destroy(&(*kring)->q_lock);
  961                 nm_os_selinfo_uninit(&(*kring)->si);
  962         }
  963         nm_os_free(na->tx_rings);
  964         na->tx_rings = na->rx_rings = na->tailroom = NULL;
  965 }
  966 
  967 
  968 /*
  969  * Destructor for NIC ports. They also have an mbuf queue
  970  * on the rings connected to the host so we need to purge
  971  * them first.
  972  */
  973 /* call with NMG_LOCK held */
  974 void
  975 netmap_hw_krings_delete(struct netmap_adapter *na)
  976 {
  977         u_int lim = netmap_real_rings(na, NR_RX), i;
  978 
  979         for (i = nma_get_nrings(na, NR_RX); i < lim; i++) {
  980                 struct mbq *q = &NMR(na, NR_RX)[i]->rx_queue;
  981                 nm_prdis("destroy sw mbq with len %d", mbq_len(q));
  982                 mbq_purge(q);
  983                 mbq_safe_fini(q);
  984         }
  985         netmap_krings_delete(na);
  986 }
  987 
  988 void
  989 netmap_mem_restore(struct netmap_adapter *na)
  990 {
  991         if (na->nm_mem_prev) {
  992                 netmap_mem_put(na->nm_mem);
  993                 na->nm_mem = na->nm_mem_prev;
  994                 na->nm_mem_prev = NULL;
  995         }
  996 }
  997 
  998 static void
  999 netmap_mem_drop(struct netmap_adapter *na)
 1000 {
 1001         netmap_mem_deref(na->nm_mem, na);
 1002 
 1003         if (na->active_fds <= 0) {
 1004                 /* if the native allocator had been overridden on regif,
 1005                  * restore it now and drop the temporary one
 1006                  */
 1007                 netmap_mem_restore(na);
 1008         }
 1009 }
 1010 
 1011 static void
 1012 netmap_update_hostrings_mode(struct netmap_adapter *na)
 1013 {
 1014         enum txrx t;
 1015         struct netmap_kring *kring;
 1016         int i;
 1017 
 1018         for_rx_tx(t) {
 1019                 for (i = nma_get_nrings(na, t);
 1020                      i < netmap_real_rings(na, t); i++) {
 1021                         kring = NMR(na, t)[i];
 1022                         kring->nr_mode = kring->nr_pending_mode;
 1023                 }
 1024         }
 1025 }
 1026 
/*
 * Undo everything that was done in netmap_do_regif(). In particular,
 * call nm_register(na, 0) to stop netmap mode on the interface and
 * revert to normal operation.
 *
 * The krings and the default number of host rings are only torn down when
 * the last user goes away (active_fds drops to 0 or below). On return
 * priv is marked as unregistered (np_na and np_nifp are NULL).
 */
/* call with NMG_LOCK held */
static void netmap_unset_ringid(struct netmap_priv_d *);
static void netmap_krings_put(struct netmap_priv_d *);
void
netmap_do_unregif(struct netmap_priv_d *priv)
{
        struct netmap_adapter *na = priv->np_na;

        NMG_LOCK_ASSERT();
        /* from now on "active_fds <= 0" means that priv was the last user */
        na->active_fds--;
        /* unset nr_pending_mode and possibly release exclusive mode */
        netmap_krings_put(priv);

#ifdef  WITH_MONITOR
        /* XXX check whether we have to do something with monitor
         * when rings change nr_mode. */
        if (na->active_fds <= 0) {
                /* walk through all the rings and tell any monitor
                 * that the port is going to exit netmap mode
                 */
                netmap_monitor_stop(na);
        }
#endif

        /* call nm_register(na, 0) with all the rings locked */
        if (na->active_fds <= 0 || nm_kring_pending(priv)) {
                netmap_set_all_rings(na, NM_KR_LOCKED);
                na->nm_register(na, 0);
                netmap_set_all_rings(na, 0);
        }

        /* delete rings and buffers that are no longer needed */
        netmap_mem_rings_delete(na);

        if (na->active_fds <= 0) {      /* last instance */
                /*
                 * (TO CHECK) We enter here
                 * when the last reference to this file descriptor goes
                 * away. This means we cannot have any pending poll()
                 * or interrupt routine operating on the structure.
                 * XXX The file may be closed in a thread while
                 * another thread is using it.
                 * Linux keeps the file opened until the last reference
                 * by any outstanding ioctl/poll or mmap is gone.
                 * FreeBSD does not track mmap()s (but we do) and
                 * wakes up any sleeping poll(). Need to check what
                 * happens if the close() occurs while a concurrent
                 * syscall is running.
                 */
                if (netmap_debug & NM_DEBUG_ON)
                        nm_prinf("deleting last instance for %s", na->name);

                /* only reported: the krings are deleted anyway */
                if (nm_netmap_on(na)) {
                        nm_prerr("BUG: netmap on while going to delete the krings");
                }

                na->nm_krings_delete(na);

                /* restore the default number of host tx and rx rings */
                if (na->na_flags & NAF_HOST_RINGS) {
                        na->num_host_tx_rings = 1;
                        na->num_host_rx_rings = 1;
                } else {
                        na->num_host_tx_rings = 0;
                        na->num_host_rx_rings = 0;
                }
        }

        /* possibly decrement counter of tx_si/rx_si users */
        netmap_unset_ringid(priv);
        /* delete the nifp */
        netmap_mem_if_delete(na, priv->np_nifp);
        /* drop the allocator */
        netmap_mem_drop(na);
        /* mark the priv as unregistered */
        priv->np_na = NULL;
        priv->np_nifp = NULL;
}
 1109 
 1110 struct netmap_priv_d*
 1111 netmap_priv_new(void)
 1112 {
 1113         struct netmap_priv_d *priv;
 1114 
 1115         priv = nm_os_malloc(sizeof(struct netmap_priv_d));
 1116         if (priv == NULL)
 1117                 return NULL;
 1118         priv->np_refs = 1;
 1119         nm_os_get_module();
 1120         return priv;
 1121 }
 1122 
 1123 /*
 1124  * Destructor of the netmap_priv_d, called when the fd is closed
 1125  * Action: undo all the things done by NIOCREGIF,
 1126  * On FreeBSD we need to track whether there are active mmap()s,
 1127  * and we use np_active_mmaps for that. On linux, the field is always 0.
 1128  * Return: 1 if we can free priv, 0 otherwise.
 1129  *
 1130  */
 1131 /* call with NMG_LOCK held */
 1132 void
 1133 netmap_priv_delete(struct netmap_priv_d *priv)
 1134 {
 1135         struct netmap_adapter *na = priv->np_na;
 1136 
 1137         /* number of active references to this fd */
 1138         if (--priv->np_refs > 0) {
 1139                 return;
 1140         }
 1141         nm_os_put_module();
 1142         if (na) {
 1143                 netmap_do_unregif(priv);
 1144         }
 1145         netmap_unget_na(na, priv->np_ifp);
 1146         bzero(priv, sizeof(*priv));     /* for safety */
 1147         nm_os_free(priv);
 1148 }
 1149 
 1150 
 1151 /* call with NMG_LOCK *not* held */
 1152 void
 1153 netmap_dtor(void *data)
 1154 {
 1155         struct netmap_priv_d *priv = data;
 1156 
 1157         NMG_LOCK();
 1158         netmap_priv_delete(priv);
 1159         NMG_UNLOCK();
 1160 }
 1161 
 1162 
 1163 /*
 1164  * Handlers for synchronization of the rings from/to the host stack.
 1165  * These are associated to a network interface and are just another
 1166  * ring pair managed by userspace.
 1167  *
 1168  * Netmap also supports transparent forwarding (NS_FORWARD and NR_FORWARD
 1169  * flags):
 1170  *
 1171  * - Before releasing buffers on hw RX rings, the application can mark
 1172  *   them with the NS_FORWARD flag. During the next RXSYNC or poll(), they
 *   will be forwarded to the host stack, as if the application had
 *   moved them to the host TX ring.
 1175  *
 1176  * - Before releasing buffers on the host RX ring, the application can
 1177  *   mark them with the NS_FORWARD flag. During the next RXSYNC or poll(),
 1178  *   they will be forwarded to the hw TX rings, saving the application
 1179  *   from doing the same task in user-space.
 1180  *
 1181  * Transparent forwarding can be enabled per-ring, by setting the NR_FORWARD
 1182  * flag, or globally with the netmap_fwd sysctl.
 1183  *
 1184  * The transfer NIC --> host is relatively easy, just encapsulate
 1185  * into mbufs and we are done. The host --> NIC side is slightly
 1186  * harder because there might not be room in the tx ring so it
 1187  * might take a while before releasing the buffer.
 1188  */
 1189 
 1190 
 1191 /*
 1192  * Pass a whole queue of mbufs to the host stack as coming from 'dst'
 1193  * We do not need to lock because the queue is private.
 1194  * After this call the queue is empty.
 1195  */
 1196 static void
 1197 netmap_send_up(struct ifnet *dst, struct mbq *q)
 1198 {
 1199         struct mbuf *m;
 1200         struct mbuf *head = NULL, *prev = NULL;
 1201 #ifdef __FreeBSD__
 1202         struct epoch_tracker et;
 1203 
 1204         NET_EPOCH_ENTER(et);
 1205 #endif /* __FreeBSD__ */
 1206         /* Send packets up, outside the lock; head/prev machinery
 1207          * is only useful for Windows. */
 1208         while ((m = mbq_dequeue(q)) != NULL) {
 1209                 if (netmap_debug & NM_DEBUG_HOST)
 1210                         nm_prinf("sending up pkt %p size %d", m, MBUF_LEN(m));
 1211                 prev = nm_os_send_up(dst, m, prev);
 1212                 if (head == NULL)
 1213                         head = prev;
 1214         }
 1215         if (head)
 1216                 nm_os_send_up(dst, NULL, head);
 1217 #ifdef __FreeBSD__
 1218         NET_EPOCH_EXIT(et);
 1219 #endif /* __FreeBSD__ */
 1220         mbq_fini(q);
 1221 }
 1222 
 1223 
 1224 /*
 1225  * Scan the buffers from hwcur to ring->head, and put a copy of those
 1226  * marked NS_FORWARD (or all of them if forced) into a queue of mbufs.
 1227  * Drop remaining packets in the unlikely event
 1228  * of an mbuf shortage.
 1229  */
 1230 static void
 1231 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
 1232 {
 1233         u_int const lim = kring->nkr_num_slots - 1;
 1234         u_int const head = kring->rhead;
 1235         u_int n;
 1236         struct netmap_adapter *na = kring->na;
 1237 
 1238         for (n = kring->nr_hwcur; n != head; n = nm_next(n, lim)) {
 1239                 struct mbuf *m;
 1240                 struct netmap_slot *slot = &kring->ring->slot[n];
 1241 
 1242                 if ((slot->flags & NS_FORWARD) == 0 && !force)
 1243                         continue;
 1244                 if (slot->len < 14 || slot->len > NETMAP_BUF_SIZE(na)) {
 1245                         nm_prlim(5, "bad pkt at %d len %d", n, slot->len);
 1246                         continue;
 1247                 }
 1248                 slot->flags &= ~NS_FORWARD; // XXX needed ?
 1249                 /* XXX TODO: adapt to the case of a multisegment packet */
 1250                 m = m_devget(NMB(na, slot), slot->len, 0, na->ifp, NULL);
 1251 
 1252                 if (m == NULL)
 1253                         break;
 1254                 mbq_enqueue(q, m);
 1255         }
 1256 }
 1257 
 1258 static inline int
 1259 _nm_may_forward(struct netmap_kring *kring)
 1260 {
 1261         return  ((netmap_fwd || kring->ring->flags & NR_FORWARD) &&
 1262                  kring->na->na_flags & NAF_HOST_RINGS &&
 1263                  kring->tx == NR_RX);
 1264 }
 1265 
 1266 static inline int
 1267 nm_may_forward_up(struct netmap_kring *kring)
 1268 {
 1269         return  _nm_may_forward(kring) &&
 1270                  kring->ring_id != kring->na->num_rx_rings;
 1271 }
 1272 
 1273 static inline int
 1274 nm_may_forward_down(struct netmap_kring *kring, int sync_flags)
 1275 {
 1276         return  _nm_may_forward(kring) &&
 1277                  (sync_flags & NAF_CAN_FORWARD_DOWN) &&
 1278                  kring->ring_id == kring->na->num_rx_rings;
 1279 }
 1280 
/*
 * Send to the NIC rings packets marked NS_FORWARD between
 * kring->nr_hwcur and kring->rhead.
 * Called under kring->rx_queue.lock on the sw rx ring.
 *
 * It can only be called if the user opened all the TX hw rings,
 * see NAF_CAN_FORWARD_DOWN flag.
 * We can touch the TX netmap rings (slots, head and cur) since
 * we are in poll/ioctl system call context, and the application
 * is not supposed to touch the ring (using a different thread)
 * during the execution of the system call.
 *
 * Packets are moved without copying, by swapping the buffer indexes of
 * the source and destination slots.
 * Returns the number of slots moved to the TX rings.
 */
static u_int
netmap_sw_to_nic(struct netmap_adapter *na)
{
        /* the kring right after the hw rx ones is the source */
        struct netmap_kring *kring = na->rx_rings[na->num_rx_rings];
        struct netmap_slot *rxslot = kring->ring->slot;
        u_int i, rxcur = kring->nr_hwcur;
        u_int const head = kring->rhead;
        u_int const src_lim = kring->nkr_num_slots - 1;
        u_int sent = 0;

        /* scan rings to find space, then fill as much as possible */
        for (i = 0; i < na->num_tx_rings; i++) {
                struct netmap_kring *kdst = na->tx_rings[i];
                struct netmap_ring *rdst = kdst->ring;
                u_int const dst_lim = kdst->nkr_num_slots - 1;

                /* rxcur is not reset here: each TX ring resumes from the
                 * slot where the previous one stopped */
                /* XXX do we trust ring or kring->rcur,rtail ? */
                for (; rxcur != head && !nm_ring_empty(rdst);
                     rxcur = nm_next(rxcur, src_lim) ) {
                        struct netmap_slot *src, *dst, tmp;
                        u_int dst_head = rdst->head;

                        src = &rxslot[rxcur];
                        /* skip the slots not marked, unless forwarding is global */
                        if ((src->flags & NS_FORWARD) == 0 && !netmap_fwd)
                                continue;

                        sent++;

                        dst = &rdst->slot[dst_head];

                        tmp = *src;

                        /* the source slot takes the buffer of the destination */
                        src->buf_idx = dst->buf_idx;
                        src->flags = NS_BUF_CHANGED;

                        /* the destination slot takes buffer and length of the source */
                        dst->buf_idx = tmp.buf_idx;
                        dst->len = tmp.len;
                        dst->flags = NS_BUF_CHANGED;

                        /* advance head and cur of the TX ring past the new packet */
                        rdst->head = rdst->cur = nm_next(dst_head, dst_lim);
                }
                /* if (sent) XXX txsync ? it would be just an optimization */
        }
        return sent;
}
 1338 
 1339 
 1340 /*
 1341  * netmap_txsync_to_host() passes packets up. We are called from a
 1342  * system call in user process context, and the only contention
 1343  * can be among multiple user threads erroneously calling
 1344  * this routine concurrently.
 1345  */
 1346 static int
 1347 netmap_txsync_to_host(struct netmap_kring *kring, int flags)
 1348 {
 1349         struct netmap_adapter *na = kring->na;
 1350         u_int const lim = kring->nkr_num_slots - 1;
 1351         u_int const head = kring->rhead;
 1352         struct mbq q;
 1353 
 1354         /* Take packets from hwcur to head and pass them up.
 1355          * Force hwcur = head since netmap_grab_packets() stops at head
 1356          */
 1357         mbq_init(&q);
 1358         netmap_grab_packets(kring, &q, 1 /* force */);
 1359         nm_prdis("have %d pkts in queue", mbq_len(&q));
 1360         kring->nr_hwcur = head;
 1361         kring->nr_hwtail = head + lim;
 1362         if (kring->nr_hwtail > lim)
 1363                 kring->nr_hwtail -= lim + 1;
 1364 
 1365         netmap_send_up(na->ifp, &q);
 1366         return 0;
 1367 }
 1368 
 1369 
/*
 * rxsync backend for packets coming from the host stack.
 * They have been put in kring->rx_queue by netmap_transmit().
 * We protect access to the kring using kring->rx_queue.lock
 *
 * also moves to the nic hw rings any packet the user has marked
 * for transparent-mode forwarding, then sets the NR_FORWARD
 * flag in the kring to let the caller push them out
 *
 * Returns 0: the count returned by netmap_sw_to_nic() is only used to
 * decide whether to set NR_FORWARD, and is not passed to the caller.
 */
static int
netmap_rxsync_from_host(struct netmap_kring *kring, int flags)
{
        struct netmap_adapter *na = kring->na;
        struct netmap_ring *ring = kring->ring;
        u_int nm_i, n;
        u_int const lim = kring->nkr_num_slots - 1;
        u_int const head = kring->rhead;
        int ret = 0;
        struct mbq *q = &kring->rx_queue, fq;

        mbq_init(&fq); /* fq holds packets to be freed */

        mbq_lock(q);

        /* First part: import newly received packets */
        n = mbq_len(q);
        if (n) { /* grab packets from the queue */
                struct mbuf *m;
                uint32_t stop_i;

                nm_i = kring->nr_hwtail;
                /* stop one slot before hwcur */
                stop_i = nm_prev(kring->nr_hwcur, lim);
                while ( nm_i != stop_i && (m = mbq_dequeue(q)) != NULL ) {
                        int len = MBUF_LEN(m);
                        struct netmap_slot *slot = &ring->slot[nm_i];

                        /* copy the packet into the netmap buffer of the slot */
                        m_copydata(m, 0, len, NMB(na, slot));
                        nm_prdis("nm %d len %d", nm_i, len);
                        if (netmap_debug & NM_DEBUG_HOST)
                                nm_prinf("%s", nm_dump_buf(NMB(na, slot),len, 128, NULL));

                        slot->len = len;
                        slot->flags = 0;
                        nm_i = nm_next(nm_i, lim);
                        /* the mbuf has been copied: park it in fq */
                        mbq_enqueue(&fq, m);
                }
                kring->nr_hwtail = nm_i;
        }

        /*
         * Second part: skip past packets that userspace has released.
         */
        nm_i = kring->nr_hwcur;
        if (nm_i != head) { /* something was released */
                if (nm_may_forward_down(kring, flags)) {
                        ret = netmap_sw_to_nic(na);
                        if (ret > 0) {
                                kring->nr_kflags |= NR_FORWARD;
                                ret = 0;
                        }
                }
                kring->nr_hwcur = head;
        }

        mbq_unlock(q);

        /* fq is emptied only after the queue lock has been dropped */
        mbq_purge(&fq);
        mbq_fini(&fq);

        return ret;
}
 1441 
 1442 
 1443 /* Get a netmap adapter for the port.
 1444  *
 1445  * If it is possible to satisfy the request, return 0
 1446  * with *na containing the netmap adapter found.
 1447  * Otherwise return an error code, with *na containing NULL.
 1448  *
 1449  * When the port is attached to a bridge, we always return
 1450  * EBUSY.
 1451  * Otherwise, if the port is already bound to a file descriptor,
 1452  * then we unconditionally return the existing adapter into *na.
 1453  * In all the other cases, we return (into *na) either native,
 1454  * generic or NULL, according to the following table:
 1455  *
 1456  *                                      native_support
 1457  * active_fds   dev.netmap.admode         YES     NO
 1458  * -------------------------------------------------------
 1459  *    >0              *                 NA(ifp) NA(ifp)
 1460  *
 1461  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
 1462  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
 1463  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
 1464  *
 1465  */
 1466 static void netmap_hw_dtor(struct netmap_adapter *); /* needed by NM_IS_NATIVE() */
 1467 int
 1468 netmap_get_hw_na(struct ifnet *ifp, struct netmap_mem_d *nmd, struct netmap_adapter **na)
 1469 {
 1470         /* generic support */
 1471         int i = netmap_admode;  /* Take a snapshot. */
 1472         struct netmap_adapter *prev_na;
 1473         int error = 0;
 1474 
 1475         *na = NULL; /* default */
 1476 
 1477         /* reset in case of invalid value */
 1478         if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
 1479                 i = netmap_admode = NETMAP_ADMODE_BEST;
 1480 
 1481         if (NM_NA_VALID(ifp)) {
 1482                 prev_na = NA(ifp);
 1483                 /* If an adapter already exists, return it if
 1484                  * there are active file descriptors or if
 1485                  * netmap is not forced to use generic
 1486                  * adapters.
 1487                  */
 1488                 if (NETMAP_OWNED_BY_ANY(prev_na)
 1489                         || i != NETMAP_ADMODE_GENERIC
 1490                         || prev_na->na_flags & NAF_FORCE_NATIVE
 1491 #ifdef WITH_PIPES
 1492                         /* ugly, but we cannot allow an adapter switch
 1493                          * if some pipe is referring to this one
 1494                          */
 1495                         || prev_na->na_next_pipe > 0
 1496 #endif
 1497                 ) {
 1498                         *na = prev_na;
 1499                         goto assign_mem;
 1500                 }
 1501         }
 1502 
 1503         /* If there isn't native support and netmap is not allowed
 1504          * to use generic adapters, we cannot satisfy the request.
 1505          */
 1506         if (!NM_IS_NATIVE(ifp) && i == NETMAP_ADMODE_NATIVE)
 1507                 return EOPNOTSUPP;
 1508 
 1509         /* Otherwise, create a generic adapter and return it,
 1510          * saving the previously used netmap adapter, if any.
 1511          *
 1512          * Note that here 'prev_na', if not NULL, MUST be a
 1513          * native adapter, and CANNOT be a generic one. This is
 1514          * true because generic adapters are created on demand, and
 1515          * destroyed when not used anymore. Therefore, if the adapter
 1516          * currently attached to an interface 'ifp' is generic, it
 1517          * must be that
 1518          * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
 1519          * Consequently, if NA(ifp) is generic, we will enter one of
 1520          * the branches above. This ensures that we never override
 1521          * a generic adapter with another generic adapter.
 1522          */
 1523         error = generic_netmap_attach(ifp);
 1524         if (error)
 1525                 return error;
 1526 
 1527         *na = NA(ifp);
 1528 
 1529 assign_mem:
 1530         if (nmd != NULL && !((*na)->na_flags & NAF_MEM_OWNER) &&
 1531             (*na)->active_fds == 0 && ((*na)->nm_mem != nmd)) {
 1532                 (*na)->nm_mem_prev = (*na)->nm_mem;
 1533                 (*na)->nm_mem = netmap_mem_get(nmd);
 1534         }
 1535 
 1536         return 0;
 1537 }
 1538 
 1539 /*
 1540  * MUST BE CALLED UNDER NMG_LOCK()
 1541  *
 1542  * Get a refcounted reference to a netmap adapter attached
 1543  * to the interface specified by req.
 1544  * This is always called in the execution of an ioctl().
 1545  *
 1546  * Return ENXIO if the interface specified by the request does
 1547  * not exist, EOPNOTSUPP if netmap is not supported by the interface,
 1548  * EBUSY if the interface is already attached to a bridge,
 1549  * EINVAL if parameters are invalid, ENOMEM if needed resources
 1550  * could not be allocated.
 1551  * If successful, hold a reference to the netmap adapter.
 1552  *
 1553  * If the interface specified by req is a system one, also keep
 1554  * a reference to it and return a valid *ifp.
       *
       * Both *na and *ifp are set to NULL on entry. Only requests of type
       * NETMAP_REQ_REGISTER are accepted, and the deprecated pipe
       * register modes are rejected with EINVAL. When nmd is NULL and
       * the request carries a non-zero nr_mem_id, the memory region is
       * looked up with netmap_mem_find(); the reference obtained that
       * way is always dropped again before returning. On failure, an
       * ifnet reference taken here is released and *ifp is reset to NULL.
 1555  */
 1556 int
 1557 netmap_get_na(struct nmreq_header *hdr,
 1558               struct netmap_adapter **na, struct ifnet **ifp,
 1559               struct netmap_mem_d *nmd, int create)
 1560 {
 1561         struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
 1562         int error = 0;
 1563         struct netmap_adapter *ret = NULL;
 1564         int nmd_ref = 0; /* set when nmd comes from netmap_mem_find() below */
 1565 
 1566         *na = NULL;     /* default return value */
 1567         *ifp = NULL;
 1568 
 1569         if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
 1570                 return EINVAL;
 1571         }
 1572 
 1573         if (req->nr_mode == NR_REG_PIPE_MASTER ||
 1574                         req->nr_mode == NR_REG_PIPE_SLAVE) {
 1575                 /* Do not accept deprecated pipe modes. */
 1576                 nm_prerr("Deprecated pipe nr_mode, use xx{yy or xx}yy syntax");
 1577                 return EINVAL;
 1578         }
 1579 
 1580         NMG_LOCK_ASSERT();
 1581 
 1582         /* if the request contains a memid, try to find the
 1583          * corresponding memory region
 1584          */
 1585         if (nmd == NULL && req->nr_mem_id) {
 1586                 nmd = netmap_mem_find(req->nr_mem_id);
 1587                 if (nmd == NULL)
 1588                         return EINVAL;
 1589                 /* keep the reference; it is dropped at 'out' */
 1590                 nmd_ref = 1;
 1591         }
 1592 
 1593         /* We cascade through all possible types of netmap adapter.
 1594          * All netmap_get_*_na() functions return an error and an na,
 1595          * with the following combinations:
 1596          *
 1597          * error    na
 1598          *   0     NULL         type doesn't match
 1599          *  !0     NULL         type matches, but na creation/lookup failed
 1600          *   0    !NULL         type matches and na created/found
 1601          *  !0    !NULL         impossible
 1602          */
 1603         error = netmap_get_null_na(hdr, na, nmd, create);
 1604         if (error || *na != NULL)
 1605                 goto out;
 1606 
 1607         /* try to see if this is a monitor port */
 1608         error = netmap_get_monitor_na(hdr, na, nmd, create);
 1609         if (error || *na != NULL)
 1610                 goto out;
 1611 
 1612         /* try to see if this is a pipe port */
 1613         error = netmap_get_pipe_na(hdr, na, nmd, create);
 1614         if (error || *na != NULL)
 1615                 goto out;
 1616 
 1617         /* try to see if this is a vale port */
 1618         error = netmap_get_vale_na(hdr, na, nmd, create);
 1619         if (error)
 1620                 goto out;
 1621 
 1622         if (*na != NULL) /* valid match in netmap_get_vale_na() */
 1623                 goto out;
 1624 
 1625         /*
 1626          * This must be a hardware na, lookup the name in the system.
 1627          * Note that by hardware we actually mean "it shows up in ifconfig".
 1628          * This may still be a tap, a veth/epair, or even a
 1629          * persistent VALE port.
 1630          */
 1631         *ifp = ifunit_ref(hdr->nr_name);
 1632         if (*ifp == NULL) {
 1633                 error = ENXIO;
 1634                 goto out;
 1635         }
 1636 
 1637         error = netmap_get_hw_na(*ifp, nmd, &ret);
 1638         if (error)
 1639                 goto out;
 1640 
              /* Hardware path succeeded: publish the adapter and take the
               * reference that is handed to the caller. */
 1641         *na = ret;
 1642         netmap_adapter_get(ret);
 1643 
 1644         /*
 1645          * if the adapter supports the host rings and it is not already open,
 1646          * try to set the number of host rings as requested by the user
 1647          */
 1648         if (((*na)->na_flags & NAF_HOST_RINGS) && (*na)->active_fds == 0) {
 1649                 if (req->nr_host_tx_rings)
 1650                         (*na)->num_host_tx_rings = req->nr_host_tx_rings;
 1651                 if (req->nr_host_rx_rings)
 1652                         (*na)->num_host_rx_rings = req->nr_host_rx_rings;
 1653         }
 1654         nm_prdis("%s: host tx %d rx %u", (*na)->name, (*na)->num_host_tx_rings,
 1655                         (*na)->num_host_rx_rings);
 1656 
 1657 out:
              /* Common exit for every path after the memory-region lookup.
               * NOTE(review): within this function nothing sets 'error'
               * after 'ret' becomes non-NULL, so the netmap_adapter_put()
               * below looks purely defensive - confirm before relying on it. */
 1658         if (error) {
 1659                 if (ret)
 1660                         netmap_adapter_put(ret);
 1661                 if (*ifp) {
 1662                         if_rele(*ifp);
 1663                         *ifp = NULL;
 1664                 }
 1665         }
              /* Drop the lookup reference taken above, on success and on error. */
 1666         if (nmd_ref)
 1667                 netmap_mem_put(nmd);
 1668 
 1669         return error;
 1670 }
 1671 
 1672 /* undo netmap_get_na() */
 1673 void
 1674 netmap_unget_na(struct netmap_adapter *na, struct ifnet *ifp)
 1675 {
 1676         if (ifp)
 1677                 if_rele(ifp);
 1678         if (na)
 1679                 netmap_adapter_put(na);
 1680 }
 1681 
 1682 
      /*
       * NM_FAIL_ON(t): validation helper for the sync prologue functions.
       *
       * When condition 't' is true, log the stringified condition together
       * with the ring pointers through nm_prlim() and make the ENCLOSING
       * function return kring->nkr_num_slots.
       *
       * The macro is not self-contained: the expansion site must have
       * 'kring', 'ring', 'head' and 'cur' in scope. Because '#t' places the
       * condition text into the log message, rewriting a condition passed
       * to this macro also changes what gets logged.
       */
 1683 #define NM_FAIL_ON(t) do {                                              \
 1684         if (unlikely(t)) {                                              \
 1685                 nm_prlim(5, "%s: fail '" #t "' "                                \
 1686                         "h %d c %d t %d "                               \
 1687                         "rh %d rc %d rt %d "                            \
 1688                         "hc %d ht %d",                                  \
 1689                         kring->name,                                    \
 1690                         head, cur, ring->tail,                          \
 1691                         kring->rhead, kring->rcur, kring->rtail,        \
 1692                         kring->nr_hwcur, kring->nr_hwtail);             \
 1693                 return kring->nkr_num_slots;                            \
 1694         }                                                               \
 1695 } while (0)
 1696 
 1697 /*
 1698  * validate parameters on entry for *_txsync()
 1699  * Returns ring->head if ok, or something >= kring->nkr_num_slots
 1700  * in case of error.
 1701  *
 1702  * rhead, rcur and rtail=hwtail are stored from previous round.
 1703  * hwcur is the next packet to send to the ring.
 1704  *
 1705  * We want
 1706  *    hwcur <= *rhead <= head <= cur <= tail = *rtail <= hwtail
 1707  *
 1708  * hwcur, rhead, rtail and hwtail are reliable
       *
       * Every NM_FAIL_ON() below returns kring->nkr_num_slots from this
       * function as soon as its condition holds. kring->rhead and
       * kring->rcur are updated only after all checks have passed.
       * A ring->tail that differs from kring->rtail is not an error: it
       * is logged and overwritten with kring->rtail.
 1709  */
 1710 u_int
 1711 nm_txsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
 1712 {
              /* head and cur are sampled once each; all checks and the
               * values stored at the end use these local copies. */
 1713         u_int head = NM_ACCESS_ONCE(ring->head);
 1714         u_int cur = NM_ACCESS_ONCE(ring->cur);
 1715         u_int n = kring->nkr_num_slots;
 1716 
 1717         nm_prdis(5, "%s kcur %d ktail %d head %d cur %d tail %d",
 1718                 kring->name,
 1719                 kring->nr_hwcur, kring->nr_hwtail,
 1720                 ring->head, ring->cur, ring->tail);
 1721 #if 1 /* kernel sanity checks; but we can trust the kring. */
 1722         NM_FAIL_ON(kring->nr_hwcur >= n || kring->rhead >= n ||
 1723             kring->rtail >= n ||  kring->nr_hwtail >= n);
 1724 #endif /* kernel sanity checks */
 1725         /*
 1726          * user sanity checks. We only use head,
 1727          * A, B, ... are possible positions for head:
 1728          *
 1729          *  0    A  rhead   B  rtail   C  n-1
 1730          *  0    D  rtail   E  rhead   F  n-1
 1731          *
 1732          * B, F, D are valid. A, C, E are wrong
 1733          */
 1734         if (kring->rtail >= kring->rhead) {
 1735                 /* want rhead <= head <= rtail */
 1736                 NM_FAIL_ON(head < kring->rhead || head > kring->rtail);
 1737                 /* and also head <= cur <= rtail */
 1738                 NM_FAIL_ON(cur < head || cur > kring->rtail);
 1739         } else { /* here rtail < rhead */
 1740                 /* we need head outside rtail .. rhead */
 1741                 NM_FAIL_ON(head > kring->rtail && head < kring->rhead);
 1742 
 1743                 /* two cases now: head <= rtail or head >= rhead  */
 1744                 if (head <= kring->rtail) {
 1745                         /* want head <= cur <= rtail */
 1746                         NM_FAIL_ON(cur < head || cur > kring->rtail);
 1747                 } else { /* head >= rhead */
 1748                         /* cur must be outside rtail..head */
 1749                         NM_FAIL_ON(cur > kring->rtail && cur < head);
                              /* NOTE(review): this branch puts no upper
                               * bound on head, so a head >= n can reach the
                               * final return; that still satisfies the
                               * ">= nkr_num_slots means error" contract. */
 1750                 }
 1751         }
 1752         if (ring->tail != kring->rtail) {
 1753                 nm_prlim(5, "%s tail overwritten was %d need %d", kring->name,
 1754                         ring->tail, kring->rtail);
 1755                 ring->tail = kring->rtail;
 1756         }
              /* record the values just accepted for the next round */
 1757         kring->rhead = head;
 1758         kring->rcur = cur;
 1759         return head;
 1760 }
 1761 
 1762 
 1763 /*
 1764  * validate parameters on entry for *_rxsync()
 1765  * Returns ring->head if ok, kring->nkr_num_slots on error.
 1766  *
 1767  * For a valid configuration,
 1768  * hwcur <= head <= cur <= tail <= hwtail
 1769  *
 1770  * We only consider head and cur.
 1771  * hwcur and hwtail are reliable.
 1772  *
       * kring->rcur and kring->rhead are overwritten with the values read
       * from the ring BEFORE the checks run, so they hold the user-supplied
       * values even when a check fails and NM_FAIL_ON() returns
       * kring->nkr_num_slots. A ring->tail that differs from kring->rtail
       * is logged and overwritten, not treated as an error.
 1773  */
 1774 u_int
 1775 nm_rxsync_prologue(struct netmap_kring *kring, struct netmap_ring *ring)
 1776 {
 1777         uint32_t const n = kring->nkr_num_slots;
 1778         uint32_t head, cur;
 1779 
 1780         nm_prdis(5,"%s kc %d kt %d h %d c %d t %d",
 1781                 kring->name,
 1782                 kring->nr_hwcur, kring->nr_hwtail,
 1783                 ring->head, ring->cur, ring->tail);
 1784         /*
 1785          * Before storing the new values, we should check they do not
 1786          * move backwards. However:
 1787          * - head is not an issue because the previous value is hwcur;
 1788          * - cur could in principle go back, however it does not matter
 1789          *   because we are processing a brand new rxsync()
 1790          */
 1791         cur = kring->rcur = NM_ACCESS_ONCE(ring->cur);
 1792         head = kring->rhead = NM_ACCESS_ONCE(ring->head);
 1793 #if 1 /* kernel sanity checks */
 1794         NM_FAIL_ON(kring->nr_hwcur >= n || kring->nr_hwtail >= n);
 1795 #endif /* kernel sanity checks */
 1796         /* user sanity checks */
 1797         if (kring->nr_hwtail >= kring->nr_hwcur) {
 1798                 /* want hwcur <= rhead <= hwtail */
 1799                 NM_FAIL_ON(head < kring->nr_hwcur || head > kring->nr_hwtail);
 1800                 /* and also rhead <= rcur <= hwtail */
 1801                 NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
 1802         } else {
                      /* wrapped case: hwtail < hwcur */
 1803                 /* we need rhead outside hwtail..hwcur */
 1804                 NM_FAIL_ON(head < kring->nr_hwcur && head > kring->nr_hwtail);
 1805                 /* two cases now: head <= hwtail or head >= hwcur  */
 1806                 if (head <= kring->nr_hwtail) {
 1807                         /* want head <= cur <= hwtail */
 1808                         NM_FAIL_ON(cur < head || cur > kring->nr_hwtail);
 1809                 } else {
 1810                         /* cur must be outside hwtail..head */
 1811                         NM_FAIL_ON(cur < head && cur > kring->nr_hwtail);
 1812                 }
 1813         }
 1814         if (ring->tail != kring->rtail) {
 1815                 nm_prlim(5, "%s tail overwritten was %d need %d",
 1816                         kring->name,
 1817                         ring->tail, kring->rtail);
 1818                 ring->tail = kring->rtail;
 1819         }
 1820         return head;
 1821 }
 1822 
 1823 
 1824 /*
 1825  * Error routine called when txsync/rxsync detects an error.
 1826  * Can't do much more than resetting head = cur = hwcur, tail = hwtail
 1827  * Return 1 on reinit.
 1828  *
 1829  * This routine is only called by the upper half of the kernel.
 1830  * It only reads hwcur (which is changed only by the upper half, too)
 1831  * and hwtail (which may be changed by the lower half, but only on
 1832  * a tx ring and only to increase it, so any error will be recovered
 1833  * on the next call). For the above, we don't strictly need to call
 1834  * it under lock.
 1835  */
 1836 int
 1837 netmap_ring_reinit(struct netmap_kring *kring)
 1838 {
 1839         struct netmap_ring *ring = kring->ring;
 1840         u_int i, lim = kring->nkr_num_slots - 1;
 1841         int errors = 0;
 1842 
 1843         // XXX KASSERT nm_kr_tryget
 1844         nm_prlim(10, "called for %s", kring->name);
 1845         // XXX probably wrong to trust userspace
 1846         kring->rhead = ring->head;
 1847         kring->rcur  = ring->cur;
 1848         kring->rtail = ring->tail;
 1849 
 1850         if (ring->cur > lim)
 1851                 errors++;
 1852         if (ring->head > lim)
 1853                 errors++;
 1854         if (ring->tail > lim)
 1855                 errors++;
 1856         for (i = 0; i <= lim; i++) {
 1857                 u_int idx = ring->slot[i].buf_idx;
 1858                 u_int len = ring->slot[i].len;
 1859                 if (idx < 2 || idx >= kring->na->na_lut.objtotal) {
 1860                         nm_prlim(5, "bad index at slot %d idx %d len %d ", i, idx, len);
 1861                         ring->slot[i].buf_idx = 0;
 1862                         ring->slot[i].len = 0;
 1863                 } else if (len > NETMAP_BUF_SIZE(kring->na)) {
 1864                         ring->slot[i].len = 0;
 1865                         nm_prlim(5, "bad len at slot %d idx %d len %d", i, idx, len);
 1866                 }
 1867         }
 1868         if (errors) {
 1869                 nm_prlim(10, "total %d errors", errors);
 1870                 nm_prlim(10, "%s reinit, cur %d -> %d tail %d -> %d",
 1871                         kring->name,
 1872                         ring->cur, kring->nr_hwcur,
 1873                         ring->tail, kring->nr_hwtail);
 1874                 ring->head = kring->rhead = kring->nr_hwcur;
 1875                 ring->cur  = kring->rcur  = kring->nr_hwcur;
 1876                 ring->tail = kring->rtail = kring->nr_hwtail;
 1877         }
 1878         return (errors ? 1 : 0);
 1879 }
 1880 
 1881 /* interpret the ringid and flags fields of an nmreq, by translating them
 1882  * into a pair of intervals of ring indices:
 1883  *
 1884  * [priv->np_qfirst[NR_TX], priv->np_qlast[NR_TX]) and
 1885  * [priv->np_qfirst[NR_RX], priv->np_qlast[NR_RX])
 1886  *
       * Returns 0 on success, EINVAL when the register mode is unknown,
       * when host rings are requested but the adapter lacks
       * NAF_HOST_RINGS, or when nr_ringid is out of range for both
       * directions.
       *
       * On success priv->np_flags is set to the request's nr_flags, and
       * NAF_CAN_FORWARD_DOWN is OR-ed into priv->np_sync_flags when the
       * TX interval starts at 0 and reaches at least na->num_tx_rings.
       *
       * The two directions are processed one after the other and the
       * error checks sit inside that loop, so an EINVAL return can leave
       * np_qfirst/np_qlast already written for the direction handled
       * first; np_flags is not written in that case.
 1887  */
 1888 int
 1889 netmap_interp_ringid(struct netmap_priv_d *priv, struct nmreq_header *hdr)
 1890 {
 1891         struct netmap_adapter *na = priv->np_na;
 1892         struct nmreq_register *reg = (struct nmreq_register *)hdr->nr_body;
              /* Indexed by direction: entry [t] is the flag that, when set,
               * leaves direction t with an empty interval. NOTE(review):
               * this relies on the numeric values of enum txrx, which are
               * declared elsewhere - confirm the ordering. */
 1893         int excluded_direction[] = { NR_TX_RINGS_ONLY, NR_RX_RINGS_ONLY };
 1894         enum txrx t;
 1895         u_int j;
 1896         u_int nr_flags = reg->nr_flags, nr_mode = reg->nr_mode,
 1897               nr_ringid = reg->nr_ringid;
 1898 
 1899         for_rx_tx(t) {
 1900                 if (nr_flags & excluded_direction[t]) {
 1901                         priv->np_qfirst[t] = priv->np_qlast[t] = 0;
 1902                         continue;
 1903                 }
 1904                 switch (nr_mode) {
 1905                 case NR_REG_ALL_NIC:
 1906                 case NR_REG_NULL:
                              /* every ring counted by nma_get_nrings() */
 1907                         priv->np_qfirst[t] = 0;
 1908                         priv->np_qlast[t] = nma_get_nrings(na, t);
 1909                         nm_prdis("ALL/PIPE: %s %d %d", nm_txrx2str(t),
 1910                                 priv->np_qfirst[t], priv->np_qlast[t]);
 1911                         break;
 1912                 case NR_REG_SW:
 1913                 case NR_REG_NIC_SW:
 1914                         if (!(na->na_flags & NAF_HOST_RINGS)) {
 1915                                 nm_prerr("host rings not supported");
 1916                                 return EINVAL;
 1917                         }
                              /* SW: start right after the nma_get_nrings()
                               * rings; NIC_SW: start from ring 0. Both end
                               * at netmap_all_rings(). */
 1918                         priv->np_qfirst[t] = (nr_mode == NR_REG_SW ?
 1919                                 nma_get_nrings(na, t) : 0);
 1920                         priv->np_qlast[t] = netmap_all_rings(na, t);
 1921                         nm_prdis("%s: %s %d %d", nr_mode == NR_REG_SW ? "SW" : "NIC+SW",
 1922                                 nm_txrx2str(t),
 1923                                 priv->np_qfirst[t], priv->np_qlast[t]);
 1924                         break;
 1925                 case NR_REG_ONE_NIC:
                              /* rejected only if the id is invalid for BOTH
                               * directions */
 1926                         if (nr_ringid >= na->num_tx_rings &&
 1927                                         nr_ringid >= na->num_rx_rings) {
 1928                                 nm_prerr("invalid ring id %d", nr_ringid);
 1929                                 return EINVAL;
 1930                         }
 1931                         /* if not enough rings, use the first one */
 1932                         j = nr_ringid;
 1933                         if (j >= nma_get_nrings(na, t))
 1934                                 j = 0;
 1935                         priv->np_qfirst[t] = j;
 1936                         priv->np_qlast[t] = j + 1;
 1937                         nm_prdis("ONE_NIC: %s %d %d", nm_txrx2str(t),
 1938                                 priv->np_qfirst[t], priv->np_qlast[t]);
 1939                         break;
 1940                 case NR_REG_ONE_SW:
 1941                         if (!(na->na_flags & NAF_HOST_RINGS)) {
 1942                                 nm_prerr("host rings not supported");
 1943                                 return EINVAL;
 1944                         }
 1945                         if (nr_ringid >= na->num_host_tx_rings &&
 1946                                         nr_ringid >= na->num_host_rx_rings) {
 1947                                 nm_prerr("invalid ring id %d", nr_ringid);
 1948                                 return EINVAL;
 1949                         }
 1950                         /* if not enough rings, use the first one */
 1951                         j = nr_ringid;
 1952                         if (j >= nma_get_host_nrings(na, t))
 1953                                 j = 0;
                              /* host ring j is addressed as an offset past
                               * the nma_get_nrings() rings */
 1954                         priv->np_qfirst[t] = nma_get_nrings(na, t) + j;
 1955                         priv->np_qlast[t] = nma_get_nrings(na, t) + j + 1;
 1956                         nm_prdis("ONE_SW: %s %d %d", nm_txrx2str(t),
 1957                                 priv->np_qfirst[t], priv->np_qlast[t]);
 1958                         break;
 1959                 default:
 1960                         nm_prerr("invalid regif type %d", nr_mode);
 1961                         return EINVAL;
 1962                 }
 1963         }
 1964         priv->np_flags = nr_flags;
 1965 
 1966         /* Allow transparent forwarding mode in the host --> nic
 1967          * direction only if all the TX hw rings have been opened. */
 1968         if (priv->np_qfirst[NR_TX] == 0 &&
 1969                         priv->np_qlast[NR_TX] >= na->num_tx_rings) {
 1970                 priv->np_sync_flags |= NAF_CAN_FORWARD_DOWN;
 1971         }
 1972 
 1973         if (netmap_verbose) {
 1974                 nm_prinf("%s: tx [%d,%d) rx [%d,%d) id %d",
 1975                         na->name,
 1976                         priv->np_qfirst[NR_TX],
 1977                         priv->np_qlast[NR_TX],
 1978                         priv->np_qfirst[NR_RX],
 1979                         priv->np_qlast[NR_RX],
 1980                         nr_ringid);
 1981         }
 1982         return 0;
 1983 }
 1984 
 1985 
 1986 /*
 1987  * Set the ring ID. For devices with a single queue, a request
 1988  * for all rings is the same as a single ring.
 1989  */
 1990 static int
 1991 netmap_set_ringid(struct netmap_priv_d *priv, struct nmreq_header *hdr)
 1992 {
 1993         struct netmap_adapter *na = priv->np_na;
 1994         struct nmreq_register *reg = (struct nmreq_register *)hdr->nr_body;
 1995         int error;
 1996         enum txrx t;
 1997 
 1998         error = netmap_interp_ringid(priv, hdr);
 1999         if (error) {
 2000                 return error;
 2001         }
 2002 
 2003         priv->np_txpoll = (reg->nr_flags & NR_NO_TX_POLL) ? 0 : 1;
 2004 
 2005         /* optimization: count the users registered for more than
 2006          * one ring, which are the ones sleeping on the global queue.
 2007          * The default netmap_notify() callback will then
 2008          * avoid signaling the global queue if nobody is using it
 2009          */
 2010         for_rx_tx(t) {
 2011                 if (nm_si_user(priv, t))
 2012                         na->si_users[t]++;
 2013         }
 2014         return 0;
 2015 }
 2016 
 2017 static void
 2018 netmap_unset_ringid(struct netmap_priv_d *priv)
 2019 {
 2020         struct netmap_adapter *na = priv->np_na;
 2021         enum txrx t;
 2022 
 2023         for_rx_tx(t) {
 2024                 if (nm_si_user(priv, t))
 2025                         na->si_users[t]--;
 2026                 priv->np_qfirst[t] = priv->np_qlast[t] = 0;
 2027         }
 2028         priv->np_flags = 0;
 2029         priv->np_txpoll = 0;
 2030         priv->np_kloop_state = 0;
 2031 }
 2032 
 /*
  * Helpers to walk the krings selected by a netmap_priv_d.
  *
  * within_sel(p_, t_, i_)   true when index i_ is below the end of the
  *                          interval selected for direction t_.
  * nonempty_sel(p_, t_)     true when the interval for direction t_
  *                          contains at least one ring.
  * foreach_selected_ring(p_, t_, i_, kring_)
  *                          loop header: visits the selected RX rings
  *                          first (when there are any) and then the
  *                          selected TX rings, setting t_, i_ and kring_
  *                          for each iteration. The loop also stops at
  *                          the first NULL kring pointer.
  *
  * t_, i_ and kring_ must be modifiable lvalues supplied by the caller.
  * The loop condition now refers to the direction only through the t_
  * parameter (previously one comparison named a variable 't' directly,
  * so the macro silently depended on the caller's variable name).
  */
 #define within_sel(p_, t_, i_)                                            \
         ((i_) < (p_)->np_qlast[(t_)])
 #define nonempty_sel(p_, t_)                                              \
         (within_sel((p_), (t_), (p_)->np_qfirst[(t_)]))
 #define foreach_selected_ring(p_, t_, i_, kring_)                         \
         for ((t_) = nonempty_sel((p_), NR_RX) ? NR_RX : NR_TX,            \
              (i_) = (p_)->np_qfirst[(t_)];                                \
              ((t_) == NR_RX ||                                            \
               ((t_) == NR_TX && within_sel((p_), (t_), (i_)))) &&         \
               ((kring_) = NMR((p_)->np_na, (t_))[(i_)]);                  \
              (i_) = within_sel((p_), (t_), (i_) + 1) ? (i_) + 1 :         \
                 (++(t_) < NR_TXRX ? (p_)->np_qfirst[(t_)] : (i_)))
 2046 
 2047 /* Set the nr_pending_mode for the requested rings.
 2048  * If requested, also try to get exclusive access to the rings, provided
 2049  * the rings we want to bind are not exclusively owned by a previous bind.
 2050  */
 2051 static int
 2052 netmap_krings_get(struct netmap_priv_d *priv)
 2053 {
 2054         struct netmap_adapter *na = priv->np_na;
 2055         u_int i;
 2056         struct netmap_kring *kring;
 2057         int excl = (priv->np_flags & NR_EXCLUSIVE);
 2058         enum txrx t;
 2059 
 2060         if (netmap_debug & NM_DEBUG_ON)
 2061                 nm_prinf("%s: grabbing tx [%d, %d) rx [%d, %d)",
 2062                         na->name,
 2063                         priv->np_qfirst[NR_TX],
 2064                         priv->np_qlast[NR_TX],
 2065                         priv->np_qfirst[NR_RX],
 2066                         priv->np_qlast[NR_RX]);
 2067 
 2068         /* first round: check that all the requested rings
 2069          * are neither already exclusively owned, nor we
 2070          * want exclusive ownership when they are already in use
 2071          */
 2072         foreach_selected_ring(priv, t, i, kring) {
 2073                 if ((kring->nr_kflags & NKR_EXCLUSIVE) ||
 2074                     (kring->users && excl))
 2075                 {
 2076                         nm_prdis("ring %s busy", kring->name);
 2077                         return EBUSY;
 2078                 }
 2079         }
 2080 
 2081         /* second round: increment usage count (possibly marking them
 2082          * as exclusive) and set the nr_pending_mode
 2083          */
 2084         foreach_selected_ring(priv, t, i, kring) {
 2085                 kring->users++;
 2086                 if (excl)
 2087                         kring->nr_kflags |= NKR_EXCLUSIVE;
 2088                 kring->nr_pending_mode = NKR_NETMAP_ON;
 2089         }
 2090 
 2091         return 0;
 2092 
 2093 }
 2094 
 2095 /* Undo netmap_krings_get(). This is done by clearing the exclusive mode
 2096  * if was asked on regif, and unset the nr_pending_mode if we are the
 2097  * last users of the involved rings. */
 2098 static void
 2099 netmap_krings_put(struct netmap_priv_d *priv)
 2100 {
 2101         u_int i;
 2102         struct netmap_kring *kring;
 2103         int excl = (priv->np_flags & NR_EXCLUSIVE);
 2104         enum txrx t;
 2105 
 2106         nm_prdis("%s: releasing tx [%d, %d) rx [%d, %d)",
 2107                         na->name,
 2108                         priv->np_qfirst[NR_TX],
 2109                         priv->np_qlast[NR_TX],
 2110                         priv->np_qfirst[NR_RX],
 2111                         priv->np_qlast[MR_RX]);
 2112 
 2113         foreach_selected_ring(priv, t, i, kring) {
 2114                 if (excl)
 2115                         kring->nr_kflags &= ~NKR_EXCLUSIVE;
 2116                 kring->users--;
 2117                 if (kring->users == 0)
 2118                         kring->nr_pending_mode = NKR_NETMAP_OFF;
 2119         }
 2120 }
 2121 
 2122 static int
 2123 nm_priv_rx_enabled(struct netmap_priv_d *priv)
 2124 {
 2125         return (priv->np_qfirst[NR_RX] != priv->np_qlast[NR_RX]);
 2126 }
 2127 
 2128 /* Validate the CSB entries for both directions (atok and ktoa).
 2129  * To be called under NMG_LOCK().
       *
       * csbo carries the two user-space base addresses (csb_atok and
       * csb_ktoa). Each area is expected to hold one entry per ring bound
       * by priv, TX rings first and RX rings after them (see the
       * initialization loop at the end).
       *
       * Returns:
       *   EBUSY   the sync kloop is running for priv;
       *   0       no rings are bound (priv is left untouched), or success;
       *   EINVAL  priv was not registered with NR_EXCLUSIVE, or a base
       *           address is not aligned to its entry size;
       *   ENOMEM  the temporary probe buffer could not be allocated;
       *   the copyin()/copyout() error when an area is not accessible.
       *
       * On success the base pointers are stored in priv and every entry
       * is initialized from the current kring state. Note that the ktoa
       * area has already been overwritten with zeros by the probe by the
       * time the function can fail or succeed on later steps.
       */
 2130 static int
 2131 netmap_csb_validate(struct netmap_priv_d *priv, struct nmreq_opt_csb *csbo)
 2132 {
 2133         struct nm_csb_atok *csb_atok_base =
 2134                 (struct nm_csb_atok *)(uintptr_t)csbo->csb_atok;
 2135         struct nm_csb_ktoa *csb_ktoa_base =
 2136                 (struct nm_csb_ktoa *)(uintptr_t)csbo->csb_ktoa;
 2137         enum txrx t;
 2138         int num_rings[NR_TXRX], tot_rings;
              /* index 0 describes the atok area, index 1 the ktoa area */
 2139         size_t entry_size[2];
 2140         void *csb_start[2];
 2141         int i;
 2142 
 2143         if (priv->np_kloop_state & NM_SYNC_KLOOP_RUNNING) {
 2144                 nm_prerr("Cannot update CSB while kloop is running");
 2145                 return EBUSY;
 2146         }
 2147 
              /* count the rings bound by priv in each direction */
 2148         tot_rings = 0;
 2149         for_rx_tx(t) {
 2150                 num_rings[t] = priv->np_qlast[t] - priv->np_qfirst[t];
 2151                 tot_rings += num_rings[t];
 2152         }
 2153         if (tot_rings <= 0)
 2154                 return 0;
 2155 
 2156         if (!(priv->np_flags & NR_EXCLUSIVE)) {
 2157                 nm_prerr("CSB mode requires NR_EXCLUSIVE");
 2158                 return EINVAL;
 2159         }
 2160 
 2161         entry_size[0] = sizeof(*csb_atok_base);
 2162         entry_size[1] = sizeof(*csb_ktoa_base);
 2163         csb_start[0] = (void *)csb_atok_base;
 2164         csb_start[1] = (void *)csb_ktoa_base;
 2165 
 2166         for (i = 0; i < 2; i++) {
 2167                 /* On Linux we could use access_ok() to simplify
 2168                  * the validation. However, the advantage of
 2169                  * this approach is that it works also on
 2170                  * FreeBSD. */
 2171                 size_t csb_size = tot_rings * entry_size[i];
 2172                 void *tmp;
 2173                 int err;
 2174 
                      /* NOTE(review): masking with (entry_size - 1) is an
                       * alignment test only if the entry size is a power
                       * of two - presumably guaranteed by the struct
                       * definitions; confirm. */
 2175                 if ((uintptr_t)csb_start[i] & (entry_size[i]-1)) {
 2176                         nm_prerr("Unaligned CSB address");
 2177                         return EINVAL;
 2178                 }
 2179 
                      /* Probe the whole area through a scratch buffer:
                       * read it for atok, write zeros to it for ktoa.
                       * The buffer contents are discarded afterwards. */
 2180                 tmp = nm_os_malloc(csb_size);
 2181                 if (!tmp)
 2182                         return ENOMEM;
 2183                 if (i == 0) {
 2184                         /* Application --> kernel direction. */
 2185                         err = copyin(csb_start[i], tmp, csb_size);
 2186                 } else {
 2187                         /* Kernel --> application direction. */
 2188                         memset(tmp, 0, csb_size);
 2189                         err = copyout(tmp, csb_start[i], csb_size);
 2190                 }
 2191                 nm_os_free(tmp);
 2192                 if (err) {
 2193                         nm_prerr("Invalid CSB address");
 2194                         return err;
 2195                 }
 2196         }
 2197 
              /* both areas passed the probe: remember them in priv */
 2198         priv->np_csb_atok_base = csb_atok_base;
 2199         priv->np_csb_ktoa_base = csb_ktoa_base;
 2200 
 2201         /* Initialize the CSB. */
 2202         for_rx_tx(t) {
 2203                 for (i = 0; i < num_rings[t]; i++) {
 2204                         struct netmap_kring *kring =
 2205                                 NMR(priv->np_na, t)[i + priv->np_qfirst[t]];
 2206                         struct nm_csb_atok *csb_atok = csb_atok_base + i;
 2207                         struct nm_csb_ktoa *csb_ktoa = csb_ktoa_base + i;
 2208 
                              /* RX entries are laid out after all the TX
                               * entries in both areas */
 2209                         if (t == NR_RX) {
 2210                                 csb_atok += num_rings[NR_TX];
 2211                                 csb_ktoa += num_rings[NR_TX];
 2212                         }
 2213 
                              /* seed the entry with the current kring
                               * pointers and request kicks both ways */
 2214                         CSB_WRITE(csb_atok, head, kring->rhead);
 2215                         CSB_WRITE(csb_atok, cur, kring->rcur);
 2216                         CSB_WRITE(csb_atok, appl_need_kick, 1);
 2217                         CSB_WRITE(csb_atok, sync_flags, 1);
 2218                         CSB_WRITE(csb_ktoa, hwcur, kring->nr_hwcur);
 2219                         CSB_WRITE(csb_ktoa, hwtail, kring->nr_hwtail);
 2220                         CSB_WRITE(csb_ktoa, kern_need_kick, 1);
 2221 
 2222                         nm_prinf("csb_init for kring %s: head %u, cur %u, "
 2223                                 "hwcur %u, hwtail %u", kring->name,
 2224                                 kring->rhead, kring->rcur, kring->nr_hwcur,
 2225                                 kring->nr_hwtail);
 2226                 }
 2227         }
 2228 
 2229         return 0;
 2230 }
 2231 
 2232 /* Ensure that the netmap adapter can support the given MTU.
 2233  * @return EINVAL if the na cannot be set to mtu, 0 otherwise.
 2234  */
 2235 int
 2236 netmap_buf_size_validate(const struct netmap_adapter *na, unsigned mtu) {
 2237         unsigned nbs = NETMAP_BUF_SIZE(na);
 2238 
 2239         if (mtu <= na->rx_buf_maxsize) {
 2240                 /* The MTU fits a single NIC slot. We only
 2241                  * Need to check that netmap buffers are
 2242                  * large enough to hold an MTU. NS_MOREFRAG
 2243                  * cannot be used in this case. */
 2244                 if (nbs < mtu) {
 2245                         nm_prerr("error: netmap buf size (%u) "
 2246                                  "< device MTU (%u)", nbs, mtu);
 2247                         return EINVAL;
 2248                 }
 2249         } else {
 2250                 /* More NIC slots may be needed to receive
 2251                  * or transmit a single packet. Check that
 2252                  * the adapter supports NS_MOREFRAG and that
 2253                  * netmap buffers are large enough to hold
 2254                  * the maximum per-slot size. */
 2255                 if (!(na->na_flags & NAF_MOREFRAG)) {
 2256                         nm_prerr("error: large MTU (%d) needed "
 2257                                  "but %s does not support "
 2258                                  "NS_MOREFRAG", mtu,
 2259                                  na->ifp->if_xname);
 2260                         return EINVAL;
 2261                 } else if (nbs < na->rx_buf_maxsize) {
 2262                         nm_prerr("error: using NS_MOREFRAG on "
 2263                                  "%s requires netmap buf size "
 2264                                  ">= %u", na->ifp->if_xname,
 2265                                  na->rx_buf_maxsize);
 2266                         return EINVAL;
 2267                 } else {
 2268                         nm_prinf("info: netmap application on "
 2269                                  "%s needs to support "
 2270                                  "NS_MOREFRAG "
 2271                                  "(MTU=%u,netmap_buf_size=%u)",
 2272                                  na->ifp->if_xname, mtu, nbs);
 2273                 }
 2274         }
 2275         return 0;
 2276 }
 2277 
 2278 /* Handle the offset option, if present in the hdr.
 2279  * Returns 0 on success, or an error.
 2280  */
 2281 static int
 2282 netmap_offsets_init(struct netmap_priv_d *priv, struct nmreq_header *hdr)
 2283 {
 2284         struct nmreq_opt_offsets *opt;
 2285         struct netmap_adapter *na = priv->np_na;
 2286         struct netmap_kring *kring;
 2287         uint64_t mask = 0, bits = 0, maxbits = sizeof(uint64_t) * 8,
 2288                  max_offset = 0, initial_offset = 0, min_gap = 0;
 2289         u_int i;
 2290         enum txrx t;
 2291         int error = 0;
 2292 
 2293         opt = (struct nmreq_opt_offsets *)
 2294                 nmreq_getoption(hdr, NETMAP_REQ_OPT_OFFSETS);
 2295         if (opt == NULL)
 2296                 return 0;
 2297 
 2298         if (!(na->na_flags & NAF_OFFSETS)) {
 2299                 if (netmap_verbose)
 2300                         nm_prerr("%s does not support offsets",
 2301                                 na->name);
 2302                 error = EOPNOTSUPP;
 2303                 goto out;
 2304         }
 2305 
 2306         /* check sanity of the opt values */
 2307         max_offset = opt->nro_max_offset;
 2308         min_gap = opt->nro_min_gap;
 2309         initial_offset = opt->nro_initial_offset;
 2310         bits = opt->nro_offset_bits;
 2311 
 2312         if (bits > maxbits) {
 2313                 if (netmap_verbose)
 2314                         nm_prerr("bits: %llu too large (max %llu)",
 2315                                 (unsigned long long)bits,
 2316                                 (unsigned long long)maxbits);
 2317                 error = EINVAL;
 2318                 goto out;
 2319         }
 2320         /* we take bits == 0 as a request to use the entire field */
 2321         if (bits == 0 || bits == maxbits) {
 2322                 /* shifting a type by sizeof(type) is undefined */
 2323                 bits = maxbits;
 2324                 mask = 0xffffffffffffffff;
 2325         } else {
 2326                 mask = (1ULL << bits) - 1;
 2327         }
 2328         if (max_offset > NETMAP_BUF_SIZE(na)) {
 2329                 if (netmap_verbose)
 2330                         nm_prerr("max offset %llu > buf size %u",
 2331                                 (unsigned long long)max_offset, NETMAP_BUF_SIZE(na));
 2332                 error = EINVAL;
 2333                 goto out;
 2334         }
 2335         if ((max_offset & mask) != max_offset) {
 2336                 if (netmap_verbose)
 2337                         nm_prerr("max offset %llu to large for %llu bits",
 2338                                 (unsigned long long)max_offset,
 2339                                 (unsigned long long)bits);
 2340                 error = EINVAL;
 2341                 goto out;
 2342         }
 2343         if (initial_offset > max_offset) {
 2344                 if (netmap_verbose)
 2345                         nm_prerr("initial offset %llu > max offset %llu",
 2346                                 (unsigned long long)initial_offset,
 2347                                 (unsigned long long)max_offset);
 2348                 error = EINVAL;
 2349                 goto out;
 2350         }
 2351 
 2352         /* initialize the kring and ring fields. */
 2353         foreach_selected_ring(priv, t, i, kring) {
 2354                 struct netmap_kring *kring = NMR(na, t)[i];
 2355                 struct netmap_ring *ring = kring->ring;
 2356                 u_int j;
 2357 
 2358                 /* it the ring is already in use we check that the
 2359                  * new request is compatible with the existing one
 2360                  */
 2361                 if (kring->offset_mask) {
 2362                         if ((kring->offset_mask & mask) != mask ||
 2363                              kring->offset_max < max_offset) {
 2364                                 if (netmap_verbose)
 2365                                         nm_prinf("%s: cannot increase"
 2366                                                  "offset mask and/or max"
 2367                                                  "(current: mask=%llx,max=%llu",
 2368                                                         kring->name,
 2369                                                         (unsigned long long)kring->offset_mask,
 2370                                                         (unsigned long long)kring->offset_max);
 2371                                 error = EBUSY;
 2372                                 goto out;
 2373                         }
 2374                         mask = kring->offset_mask;
 2375                         max_offset = kring->offset_max;
 2376                 } else {
 2377                         kring->offset_mask = mask;
 2378                         *(uint64_t *)(uintptr_t)&ring->offset_mask = mask;
 2379                         kring->offset_max = max_offset;
 2380                         kring->offset_gap = min_gap;
 2381                 }
 2382 
 2383                 /* if there is an initial offset, put it into
 2384                  * all the slots
 2385                  *
 2386                  * Note: we cannot change the offsets if the
 2387                  * ring is already in use.
 2388                  */
 2389                 if (!initial_offset || kring->users > 1)
 2390                         continue;
 2391 
 2392                 for (j = 0; j < kring->nkr_num_slots; j++) {
 2393                         struct netmap_slot *slot = ring->slot + j;
 2394 
 2395                         nm_write_offset(kring, slot, initial_offset);
 2396                 }
 2397         }
 2398 
 2399 out:
 2400         opt->nro_opt.nro_status = error;
 2401         if (!error) {
 2402                 opt->nro_max_offset = max_offset;
 2403         }
 2404         return error;
 2405 
 2406 }
 2407 
 2408 
 2409 /* set the hardware buffer length in each one of the newly opened rings
 2410  * (hwbuf_len field in the kring struct). The purpose it to select
 2411  * the maximum supported input buffer lenght that will not cause writes
 2412  * outside of the available space, even when offsets are in use.
 2413  */
 2414 static int
 2415 netmap_compute_buf_len(struct netmap_priv_d *priv)
 2416 {
 2417         enum txrx t;
 2418         u_int i;
 2419         struct netmap_kring *kring;
 2420         int error = 0;
 2421         unsigned mtu = 0;
 2422         struct netmap_adapter *na = priv->np_na;
 2423         uint64_t target;
 2424 
 2425         foreach_selected_ring(priv, t, i, kring) {
 2426                 /* rings that are already active have their hwbuf_len
 2427                  * already set and we cannot change it.
 2428                  */
 2429                 if (kring->users > 1)
 2430                         continue;
 2431 
 2432                 /* For netmap buffers which are not shared among several ring
 2433                  * slots (the normal case), the available space is the buf size
 2434                  * minus the max offset declared by the user at open time.  If
 2435                  * the user plans to have several slots pointing to different
 2436                  * offsets into the same large buffer, she must also declare a
 2437                  * "minimum gap" between two such consecutive offsets. In this
 2438                  * case the user-declared 'offset_gap' is taken as the
 2439                  * available space and offset_max is ignored.
 2440                  */
 2441 
 2442                 /* start with the normal case (unshared buffers) */
 2443                 target = NETMAP_BUF_SIZE(kring->na) -
 2444                         kring->offset_max;
 2445                 /* if offset_gap is zero, the user does not intend to use
 2446                  * shared buffers. In this case the minimum gap between
 2447                  * two consective offsets into the same buffer can be
 2448                  * assumed to be equal to the buffer size. In this way
 2449                  * offset_gap always contains the available space ignoring
 2450                  * offset_max. This may be used by drivers of NICs that
 2451                  * are guaranteed to never write more than MTU bytes, even
 2452                  * if the input buffer is larger: if the MTU is less
 2453                  * than the target they can set hwbuf_len to offset_gap.
 2454                  */
 2455                 if (!kring->offset_gap)
 2456                         kring->offset_gap =
 2457                                 NETMAP_BUF_SIZE(kring->na);
 2458 
 2459                 if (kring->offset_gap < target)
 2460                         target = kring->offset_gap;
 2461                 error = kring->nm_bufcfg(kring, target);
 2462                 if (error)
 2463                         goto out;
 2464 
 2465                 *(uint64_t *)(uintptr_t)&kring->ring->buf_align = kring->buf_align;
 2466 
 2467                 if (mtu && t == NR_RX && kring->hwbuf_len < mtu) {
 2468                         if (!(na->na_flags & NAF_MOREFRAG)) {
 2469                                 nm_prerr("error: large MTU (%d) needed "
 2470                                          "but %s does not support "
 2471                                          "NS_MOREFRAG", mtu,
 2472                                          na->name);
 2473                                 error = EINVAL;
 2474                                 goto out;
 2475                         } else {
 2476                                 nm_prinf("info: netmap application on "
 2477                                          "%s needs to support "
 2478                                          "NS_MOREFRAG "
 2479                                          "(MTU=%u,buf_size=%llu)",
 2480                                          kring->name, mtu,
 2481                                          (unsigned long long)kring->hwbuf_len);
 2482                         }
 2483                 }
 2484         }
 2485 out:
 2486         return error;
 2487 }
 2488 
 2489 /*
 2490  * possibly move the interface to netmap-mode.
 2491  * If success it returns a pointer to netmap_if, otherwise NULL.
 2492  * This must be called with NMG_LOCK held.
 2493  *
 2494  * The following na callbacks are called in the process:
 2495  *
 2496  * na->nm_config()                      [by netmap_update_config]
 2497  * (get current number and size of rings)
 2498  *
 2499  *      We have a generic one for linux (netmap_linux_config).
 2500  *      The bwrap has to override this, since it has to forward
 2501  *      the request to the wrapped adapter (netmap_bwrap_config).
 2502  *
 2503  *
 2504  * na->nm_krings_create()
 2505  * (create and init the krings array)
 2506  *
 2507  *      One of the following:
 2508  *
 2509  *      * netmap_hw_krings_create,                      (hw ports)
 2510  *              creates the standard layout for the krings
 2511  *              and adds the mbq (used for the host rings).
 2512  *
 2513  *      * netmap_vp_krings_create                       (VALE ports)
 2514  *              add leases and scratchpads
 2515  *
 2516  *      * netmap_pipe_krings_create                     (pipes)
 2517  *              create the krings and rings of both ends and
 2518  *              cross-link them
 2519  *
 2520  *      * netmap_monitor_krings_create                  (monitors)
 2521  *              avoid allocating the mbq
 2522  *
 2523  *      * netmap_bwrap_krings_create                    (bwraps)
 2524  *              create both the brap krings array,
 2525  *              the krings array of the wrapped adapter, and
 2526  *              (if needed) the fake array for the host adapter
 2527  *
 2528  * na->nm_register(, 1)
 2529  * (put the adapter in netmap mode)
 2530  *
 2531  *      This may be one of the following:
 2532  *
 2533  *      * netmap_hw_reg                                 (hw ports)
 2534  *              checks that the ifp is still there, then calls
 2535  *              the hardware specific callback;
 2536  *
 2537  *      * netmap_vp_reg                                 (VALE ports)
 2538  *              If the port is connected to a bridge,
 2539  *              set the NAF_NETMAP_ON flag under the
 2540  *              bridge write lock.
 2541  *
 2542  *      * netmap_pipe_reg                               (pipes)
 2543  *              inform the other pipe end that it is no
 2544  *              longer responsible for the lifetime of this
 2545  *              pipe end
 2546  *
 2547  *      * netmap_monitor_reg                            (monitors)
 2548  *              intercept the sync callbacks of the monitored
 2549  *              rings
 2550  *
 2551  *      * netmap_bwrap_reg                              (bwraps)
 2552  *              cross-link the bwrap and hwna rings,
 2553  *              forward the request to the hwna, override
 2554  *              the hwna notify callback (to get the frames
 2555  *              coming from outside go through the bridge).
 2556  *
 2557  *
 2558  */
 2559 int
 2560 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
 2561         struct nmreq_header *hdr)
 2562 {
 2563         struct netmap_if *nifp = NULL;
 2564         int error;
 2565 
 2566         NMG_LOCK_ASSERT();
 2567         priv->np_na = na;     /* store the reference */
 2568         error = netmap_mem_finalize(na->nm_mem, na);
 2569         if (error)
 2570                 goto err;
 2571 
 2572         if (na->active_fds == 0) {
 2573 
 2574                 /* cache the allocator info in the na */
 2575                 error = netmap_mem_get_lut(na->nm_mem, &na->na_lut);
 2576                 if (error)
 2577                         goto err_drop_mem;
 2578                 nm_prdis("lut %p bufs %u size %u", na->na_lut.lut, na->na_lut.objtotal,
 2579                                             na->na_lut.objsize);
 2580 
 2581                 /* ring configuration may have changed, fetch from the card */
 2582                 netmap_update_config(na);
 2583         }
 2584 
 2585         /* compute the range of tx and rx rings to monitor */
 2586         error = netmap_set_ringid(priv, hdr);
 2587         if (error)
 2588                 goto err_put_lut;
 2589 
 2590         if (na->active_fds == 0) {
 2591                 /*
 2592                  * If this is the first registration of the adapter,
 2593                  * perform sanity checks and create the in-kernel view
 2594                  * of the netmap rings (the netmap krings).
 2595                  */
 2596                 if (na->ifp && nm_priv_rx_enabled(priv)) {
 2597                         /* This netmap adapter is attached to an ifnet. */
 2598                         unsigned mtu = nm_os_ifnet_mtu(na->ifp);
 2599 
 2600                         nm_prdis("%s: mtu %d rx_buf_maxsize %d netmap_buf_size %d",
 2601                                 na->name, mtu, na->rx_buf_maxsize, NETMAP_BUF_SIZE(na));
 2602 
 2603                         if (na->rx_buf_maxsize == 0) {
 2604                                 nm_prerr("%s: error: rx_buf_maxsize == 0", na->name);
 2605                                 error = EIO;
 2606                                 goto err_drop_mem;
 2607                         }
 2608 
 2609                         error = netmap_buf_size_validate(na, mtu);
 2610                         if (error)
 2611                                 goto err_drop_mem;
 2612                 }
 2613 
 2614                 /*
 2615                  * Depending on the adapter, this may also create
 2616                  * the netmap rings themselves
 2617                  */
 2618                 error = na->nm_krings_create(na);
 2619                 if (error)
 2620                         goto err_put_lut;
 2621 
 2622         }
 2623 
 2624         /* now the krings must exist and we can check whether some
 2625          * previous bind has exclusive ownership on them, and set
 2626          * nr_pending_mode
 2627          */
 2628         error = netmap_krings_get(priv);
 2629         if (error)
 2630                 goto err_del_krings;
 2631 
 2632         /* create all needed missing netmap rings */
 2633         error = netmap_mem_rings_create(na);
 2634         if (error)
 2635                 goto err_rel_excl;
 2636 
 2637         /* initialize offsets if requested */
 2638         error = netmap_offsets_init(priv, hdr);
 2639         if (error)
 2640                 goto err_rel_excl;
 2641 
 2642         /* compute and validate the buf lengths */
 2643         error = netmap_compute_buf_len(priv);
 2644         if (error)
 2645                 goto err_rel_excl;
 2646 
 2647         /* in all cases, create a new netmap if */
 2648         nifp = netmap_mem_if_new(na, priv);
 2649         if (nifp == NULL) {
 2650                 error = ENOMEM;
 2651                 goto err_rel_excl;
 2652         }
 2653 
 2654         if (nm_kring_pending(priv)) {
 2655                 /* Some kring is switching mode, tell the adapter to
 2656                  * react on this. */
 2657                 netmap_set_all_rings(na, NM_KR_LOCKED);
 2658                 error = na->nm_register(na, 1);
 2659                 netmap_set_all_rings(na, 0);
 2660                 if (error)
 2661                         goto err_del_if;
 2662         }
 2663 
 2664         /* Commit the reference. */
 2665         na->active_fds++;
 2666 
 2667         /*
 2668          * advertise that the interface is ready by setting np_nifp.
 2669          * The barrier is needed because readers (poll, *SYNC and mmap)
 2670          * check for priv->np_nifp != NULL without locking
 2671          */
 2672         mb(); /* make sure previous writes are visible to all CPUs */
 2673         priv->np_nifp = nifp;
 2674 
 2675         return 0;
 2676 
 2677 err_del_if:
 2678         netmap_mem_if_delete(na, nifp);
 2679 err_rel_excl:
 2680         netmap_krings_put(priv);
 2681         netmap_mem_rings_delete(na);
 2682 err_del_krings:
 2683         if (na->active_fds == 0)
 2684                 na->nm_krings_delete(na);
 2685 err_put_lut:
 2686         if (na->active_fds == 0)
 2687                 memset(&na->na_lut, 0, sizeof(na->na_lut));
 2688 err_drop_mem:
 2689         netmap_mem_drop(na);
 2690 err:
 2691         priv->np_na = NULL;
 2692         return error;
 2693 }
 2694 
 2695 
 2696 /*
 2697  * update kring and ring at the end of rxsync/txsync.
 2698  */
 2699 static inline void
 2700 nm_sync_finalize(struct netmap_kring *kring)
 2701 {
 2702         /*
 2703          * Update ring tail to what the kernel knows
 2704          * After txsync: head/rhead/hwcur might be behind cur/rcur
 2705          * if no carrier.
 2706          */
 2707         kring->ring->tail = kring->rtail = kring->nr_hwtail;
 2708 
 2709         nm_prdis(5, "%s now hwcur %d hwtail %d head %d cur %d tail %d",
 2710                 kring->name, kring->nr_hwcur, kring->nr_hwtail,
 2711                 kring->rhead, kring->rcur, kring->rtail);
 2712 }
 2713 
 2714 /* set ring timestamp */
 2715 static inline void
 2716 ring_timestamp_set(struct netmap_ring *ring)
 2717 {
 2718         if (netmap_no_timestamp == 0 || ring->flags & NR_TIMESTAMP) {
 2719                 microtime(&ring->ts);
 2720         }
 2721 }
 2722 
/*
 * Forward declarations of the nmreq helpers: nmreq_copyin() and
 * nmreq_checkoptions() are called by netmap_ioctl() below.
 * NOTE(review): the definitions are presumably later in this file - confirm.
 */
static int nmreq_copyin(struct nmreq_header *, int);
static int nmreq_copyout(struct nmreq_header *, int);
static int nmreq_checkoptions(struct nmreq_header *);
 2726 
 2727 /*
 2728  * ioctl(2) support for the "netmap" device.
 2729  *
 * The following commands are accepted:
 2731  * - NIOCCTRL           device control API
 2732  * - NIOCTXSYNC         sync TX rings
 2733  * - NIOCRXSYNC         sync RX rings
 2734  * - SIOCGIFADDR        just for convenience
 2735  * - NIOCGINFO          deprecated (legacy API)
 2736  * - NIOCREGIF          deprecated (legacy API)
 2737  *
 2738  * Return 0 on success, errno otherwise.
 2739  */
 2740 int
 2741 netmap_ioctl(struct netmap_priv_d *priv, u_long cmd, caddr_t data,
 2742                 struct thread *td, int nr_body_is_user)
 2743 {
 2744         struct mbq q;   /* packets from RX hw queues to host stack */
 2745         struct netmap_adapter *na = NULL;
 2746         struct netmap_mem_d *nmd = NULL;
 2747         struct ifnet *ifp = NULL;
 2748         int error = 0;
 2749         u_int i, qfirst, qlast;
 2750         struct netmap_kring **krings;
 2751         int sync_flags;
 2752         enum txrx t;
 2753 
 2754         switch (cmd) {
 2755         case NIOCCTRL: {
 2756                 struct nmreq_header *hdr = (struct nmreq_header *)data;
 2757 
 2758                 if (hdr->nr_version < NETMAP_MIN_API ||
 2759                     hdr->nr_version > NETMAP_MAX_API) {
 2760                         nm_prerr("API mismatch: got %d need %d",
 2761                                 hdr->nr_version, NETMAP_API);
 2762                         return EINVAL;
 2763                 }
 2764 
 2765                 /* Make a kernel-space copy of the user-space nr_body.
 2766                  * For convenience, the nr_body pointer and the pointers
 2767                  * in the options list will be replaced with their
 2768                  * kernel-space counterparts. The original pointers are
 2769                  * saved internally and later restored by nmreq_copyout
 2770                  */
 2771                 error = nmreq_copyin(hdr, nr_body_is_user);
 2772                 if (error) {
 2773                         return error;
 2774                 }
 2775 
 2776                 /* Sanitize hdr->nr_name. */
 2777                 hdr->nr_name[sizeof(hdr->nr_name) - 1] = '\0';
 2778 
 2779                 switch (hdr->nr_reqtype) {
 2780                 case NETMAP_REQ_REGISTER: {
 2781                         struct nmreq_register *req =
 2782                                 (struct nmreq_register *)(uintptr_t)hdr->nr_body;
 2783                         struct netmap_if *nifp;
 2784 
 2785                         /* Protect access to priv from concurrent requests. */
 2786                         NMG_LOCK();
 2787                         do {
 2788                                 struct nmreq_option *opt;
 2789                                 u_int memflags;
 2790 
 2791                                 if (priv->np_nifp != NULL) {    /* thread already registered */
 2792                                         error = EBUSY;
 2793                                         break;
 2794                                 }
 2795 
 2796 #ifdef WITH_EXTMEM
 2797                                 opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_EXTMEM);
 2798                                 if (opt != NULL) {
 2799                                         struct nmreq_opt_extmem *e =
 2800                                                 (struct nmreq_opt_extmem *)opt;
 2801 
 2802                                         nmd = netmap_mem_ext_create(e->nro_usrptr,
 2803                                                         &e->nro_info, &error);
 2804                                         opt->nro_status = error;
 2805                                         if (nmd == NULL)
 2806                                                 break;
 2807                                 }
 2808 #endif /* WITH_EXTMEM */
 2809 
 2810                                 if (nmd == NULL && req->nr_mem_id) {
 2811                                         /* find the allocator and get a reference */
 2812                                         nmd = netmap_mem_find(req->nr_mem_id);
 2813                                         if (nmd == NULL) {
 2814                                                 if (netmap_verbose) {
 2815                                                         nm_prerr("%s: failed to find mem_id %u",
 2816                                                                         hdr->nr_name, req->nr_mem_id);
 2817                                                 }
 2818                                                 error = EINVAL;
 2819                                                 break;
 2820                                         }
 2821                                 }
 2822                                 /* find the interface and a reference */
 2823                                 error = netmap_get_na(hdr, &na, &ifp, nmd,
 2824                                                       1 /* create */); /* keep reference */
 2825                                 if (error)
 2826                                         break;
 2827                                 if (NETMAP_OWNED_BY_KERN(na)) {
 2828                                         error = EBUSY;
 2829                                         break;
 2830                                 }
 2831 
 2832                                 if (na->virt_hdr_len && !(req->nr_flags & NR_ACCEPT_VNET_HDR)) {
 2833                                         nm_prerr("virt_hdr_len=%d, but application does "
 2834                                                 "not accept it", na->virt_hdr_len);
 2835                                         error = EIO;
 2836                                         break;
 2837                                 }
 2838 
 2839                                 error = netmap_do_regif(priv, na, hdr);
 2840                                 if (error) {    /* reg. failed, release priv and ref */
 2841                                         break;
 2842                                 }
 2843 
 2844                                 opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
 2845                                 if (opt != NULL) {
 2846                                         struct nmreq_opt_csb *csbo =
 2847                                                 (struct nmreq_opt_csb *)opt;
 2848                                         error = netmap_csb_validate(priv, csbo);
 2849                                         opt->nro_status = error;
 2850                                         if (error) {
 2851                                                 netmap_do_unregif(priv);
 2852                                                 break;
 2853                                         }
 2854                                 }
 2855 
 2856                                 nifp = priv->np_nifp;
 2857 
 2858                                 /* return the offset of the netmap_if object */
 2859                                 req->nr_rx_rings = na->num_rx_rings;
 2860                                 req->nr_tx_rings = na->num_tx_rings;
 2861                                 req->nr_rx_slots = na->num_rx_desc;
 2862                                 req->nr_tx_slots = na->num_tx_desc;
 2863                                 req->nr_host_tx_rings = na->num_host_tx_rings;
 2864                                 req->nr_host_rx_rings = na->num_host_rx_rings;
 2865                                 error = netmap_mem_get_info(na->nm_mem, &req->nr_memsize, &memflags,
 2866                                         &req->nr_mem_id);
 2867                                 if (error) {
 2868                                         netmap_do_unregif(priv);
 2869                                         break;
 2870                                 }
 2871                                 if (memflags & NETMAP_MEM_PRIVATE) {
 2872                                         *(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
 2873                                 }
 2874                                 for_rx_tx(t) {
 2875                                         priv->np_si[t] = nm_si_user(priv, t) ?
 2876                                                 &na->si[t] : &NMR(na, t)[priv->np_qfirst[t]]->si;
 2877                                 }
 2878 
 2879                                 if (req->nr_extra_bufs) {
 2880                                         if (netmap_verbose)
 2881                                                 nm_prinf("requested %d extra buffers",
 2882                                                         req->nr_extra_bufs);
 2883                                         req->nr_extra_bufs = netmap_extra_alloc(na,
 2884                                                 &nifp->ni_bufs_head, req->nr_extra_bufs);
 2885                                         if (netmap_verbose)
 2886                                                 nm_prinf("got %d extra buffers", req->nr_extra_bufs);
 2887                                 } else {
 2888                                         nifp->ni_bufs_head = 0;
 2889                                 }
 2890                                 req->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
 2891 
 2892                                 error = nmreq_checkoptions(hdr);
 2893                                 if (error) {
 2894                                         netmap_do_unregif(priv);
 2895                                         break;
 2896                                 }
 2897 
 2898                                 /* store ifp reference so that priv destructor may release it */
 2899                                 priv->np_ifp = ifp;
 2900                         } while (0);
 2901                         if (error) {
 2902                                 netmap_unget_na(na, ifp);
 2903                         }
 2904                         /* release the reference from netmap_mem_find() or
 2905                          * netmap_mem_ext_create()
 2906                          */
 2907                         if (nmd)
 2908                                 netmap_mem_put(nmd);
 2909                         NMG_UNLOCK();
 2910                         break;
 2911                 }
 2912 
 2913                 case NETMAP_REQ_PORT_INFO_GET: {
 2914                         struct nmreq_port_info_get *req =
 2915                                 (struct nmreq_port_info_get *)(uintptr_t)hdr->nr_body;
 2916                         int nmd_ref = 0;
 2917 
 2918                         NMG_LOCK();
 2919                         do {
 2920                                 u_int memflags;
 2921 
 2922                                 if (hdr->nr_name[0] != '\0') {
 2923                                         /* Build a nmreq_register out of the nmreq_port_info_get,
 2924                                          * so that we can call netmap_get_na(). */
 2925                                         struct nmreq_register regreq;
 2926                                         bzero(®req, sizeof(regreq));
 2927                                         regreq.nr_mode = NR_REG_ALL_NIC;
 2928                                         regreq.nr_tx_slots = req->nr_tx_slots;
 2929                                         regreq.nr_rx_slots = req->nr_rx_slots;
 2930                                         regreq.nr_tx_rings = req->nr_tx_rings;
 2931                                         regreq.nr_rx_rings = req->nr_rx_rings;
 2932                                         regreq.nr_host_tx_rings = req->nr_host_tx_rings;
 2933                                         regreq.nr_host_rx_rings = req->nr_host_rx_rings;
 2934                                         regreq.nr_mem_id = req->nr_mem_id;
 2935 
 2936                                         /* get a refcount */
 2937                                         hdr->nr_reqtype = NETMAP_REQ_REGISTER;
 2938                                         hdr->nr_body = (uintptr_t)®req;
 2939                                         error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
 2940                                         hdr->nr_reqtype = NETMAP_REQ_PORT_INFO_GET; /* reset type */
 2941                                         hdr->nr_body = (uintptr_t)req; /* reset nr_body */
 2942                                         if (error) {
 2943                                                 na = NULL;
 2944                                                 ifp = NULL;
 2945                                                 break;
 2946                                         }
 2947                                         nmd = na->nm_mem; /* get memory allocator */
 2948                                 } else {
 2949                                         nmd = netmap_mem_find(req->nr_mem_id ? req->nr_mem_id : 1);
 2950                                         if (nmd == NULL) {
 2951                                                 if (netmap_verbose)
 2952                                                         nm_prerr("%s: failed to find mem_id %u",
 2953                                                                         hdr->nr_name,
 2954                                                                         req->nr_mem_id ? req->nr_mem_id : 1);
 2955                                                 error = EINVAL;
 2956                                                 break;
 2957                                         }
 2958                                         nmd_ref = 1;
 2959                                 }
 2960 
 2961                                 error = netmap_mem_get_info(nmd, &req->nr_memsize, &memflags,
 2962                                         &req->nr_mem_id);
 2963                                 if (error)
 2964                                         break;
 2965                                 if (na == NULL) /* only memory info */
 2966                                         break;
 2967                                 netmap_update_config(na);
 2968                                 req->nr_rx_rings = na->num_rx_rings;
 2969                                 req->nr_tx_rings = na->num_tx_rings;
 2970                                 req->nr_rx_slots = na->num_rx_desc;
 2971                                 req->nr_tx_slots = na->num_tx_desc;
 2972                                 req->nr_host_tx_rings = na->num_host_tx_rings;
 2973                                 req->nr_host_rx_rings = na->num_host_rx_rings;
 2974                         } while (0);
 2975                         netmap_unget_na(na, ifp);
 2976                         if (nmd_ref)
 2977                                 netmap_mem_put(nmd);
 2978                         NMG_UNLOCK();
 2979                         break;
 2980                 }
 2981 #ifdef WITH_VALE
 2982                 case NETMAP_REQ_VALE_ATTACH: {
 2983                         error = netmap_bdg_attach(hdr, NULL /* userspace request */);
 2984                         break;
 2985                 }
 2986 
 2987                 case NETMAP_REQ_VALE_DETACH: {
 2988                         error = netmap_bdg_detach(hdr, NULL /* userspace request */);
 2989                         break;
 2990                 }
 2991 
 2992                 case NETMAP_REQ_PORT_HDR_SET: {
 2993                         struct nmreq_port_hdr *req =
 2994                                 (struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
 2995                         /* Build a nmreq_register out of the nmreq_port_hdr,
 2996                          * so that we can call netmap_get_bdg_na(). */
 2997                         struct nmreq_register regreq;
 2998                         bzero(®req, sizeof(regreq));
 2999                         regreq.nr_mode = NR_REG_ALL_NIC;
 3000 
 3001                         /* For now we only support virtio-net headers, and only for
 3002                          * VALE ports, but this may change in future. Valid lengths
 3003                          * for the virtio-net header are 0 (no header), 10 and 12. */
 3004                         if (req->nr_hdr_len != 0 &&
 3005                                 req->nr_hdr_len != sizeof(struct nm_vnet_hdr) &&
 3006                                         req->nr_hdr_len != 12) {
 3007                                 if (netmap_verbose)
 3008                                         nm_prerr("invalid hdr_len %u", req->nr_hdr_len);
 3009                                 error = EINVAL;
 3010                                 break;
 3011                         }
 3012                         NMG_LOCK();
 3013                         hdr->nr_reqtype = NETMAP_REQ_REGISTER;
 3014                         hdr->nr_body = (uintptr_t)®req;
 3015                         error = netmap_get_vale_na(hdr, &na, NULL, 0);
 3016                         hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
 3017                         hdr->nr_body = (uintptr_t)req;
 3018                         if (na && !error) {
 3019                                 struct netmap_vp_adapter *vpna =
 3020                                         (struct netmap_vp_adapter *)na;
 3021                                 na->virt_hdr_len = req->nr_hdr_len;
 3022                                 if (na->virt_hdr_len) {
 3023                                         vpna->mfs = NETMAP_BUF_SIZE(na);
 3024                                 }
 3025                                 if (netmap_verbose)
 3026                                         nm_prinf("Using vnet_hdr_len %d for %p", na->virt_hdr_len, na);
 3027                                 netmap_adapter_put(na);
 3028                         } else if (!na) {
 3029                                 error = ENXIO;
 3030                         }
 3031                         NMG_UNLOCK();
 3032                         break;
 3033                 }
 3034 
 3035                 case NETMAP_REQ_PORT_HDR_GET: {
 3036                         /* Get vnet-header length for this netmap port */
 3037                         struct nmreq_port_hdr *req =
 3038                                 (struct nmreq_port_hdr *)(uintptr_t)hdr->nr_body;
 3039                         /* Build a nmreq_register out of the nmreq_port_hdr,
 3040                          * so that we can call netmap_get_bdg_na(). */
 3041                         struct nmreq_register regreq;
 3042                         struct ifnet *ifp;
 3043 
 3044                         bzero(®req, sizeof(regreq));
 3045                         regreq.nr_mode = NR_REG_ALL_NIC;
 3046                         NMG_LOCK();
 3047                         hdr->nr_reqtype = NETMAP_REQ_REGISTER;
 3048                         hdr->nr_body = (uintptr_t)®req;
 3049                         error = netmap_get_na(hdr, &na, &ifp, NULL, 0);
 3050                         hdr->nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
 3051                         hdr->nr_body = (uintptr_t)req;
 3052                         if (na && !error) {
 3053                                 req->nr_hdr_len = na->virt_hdr_len;
 3054                         }
 3055                         netmap_unget_na(na, ifp);
 3056                         NMG_UNLOCK();
 3057                         break;
 3058                 }
 3059 
 3060                 case NETMAP_REQ_VALE_LIST: {
 3061                         error = netmap_vale_list(hdr);
 3062                         break;
 3063                 }
 3064 
 3065                 case NETMAP_REQ_VALE_NEWIF: {
 3066                         error = nm_vi_create(hdr);
 3067                         break;
 3068                 }
 3069 
 3070                 case NETMAP_REQ_VALE_DELIF: {
 3071                         error = nm_vi_destroy(hdr->nr_name);
 3072                         break;
 3073                 }
 3074 #endif  /* WITH_VALE */
 3075 
 3076                 case NETMAP_REQ_VALE_POLLING_ENABLE:
 3077                 case NETMAP_REQ_VALE_POLLING_DISABLE: {
 3078                         error = nm_bdg_polling(hdr);
 3079                         break;
 3080                 }
 3081                 case NETMAP_REQ_POOLS_INFO_GET: {
 3082                         /* Get information from the memory allocator used for
 3083                          * hdr->nr_name. */
 3084                         struct nmreq_pools_info *req =
 3085                                 (struct nmreq_pools_info *)(uintptr_t)hdr->nr_body;
 3086                         NMG_LOCK();
 3087                         do {
 3088                                 /* Build a nmreq_register out of the nmreq_pools_info,
 3089                                  * so that we can call netmap_get_na(). */
 3090                                 struct nmreq_register regreq;
 3091                                 bzero(®req, sizeof(regreq));
 3092                                 regreq.nr_mem_id = req->nr_mem_id;
 3093                                 regreq.nr_mode = NR_REG_ALL_NIC;
 3094 
 3095                                 hdr->nr_reqtype = NETMAP_REQ_REGISTER;
 3096                                 hdr->nr_body = (uintptr_t)®req;
 3097                                 error = netmap_get_na(hdr, &na, &ifp, NULL, 1 /* create */);
 3098                                 hdr->nr_reqtype = NETMAP_REQ_POOLS_INFO_GET; /* reset type */
 3099                                 hdr->nr_body = (uintptr_t)req; /* reset nr_body */
 3100                                 if (error) {
 3101                                         na = NULL;
 3102                                         ifp = NULL;
 3103                                         break;
 3104                                 }
 3105                                 nmd = na->nm_mem; /* grab the memory allocator */
 3106                                 if (nmd == NULL) {
 3107                                         error = EINVAL;
 3108                                         break;
 3109                                 }
 3110 
 3111                                 /* Finalize the memory allocator, get the pools
 3112                                  * information and release the allocator. */
 3113                                 error = netmap_mem_finalize(nmd, na);
 3114                                 if (error) {
 3115                                         break;
 3116                                 }
 3117                                 error = netmap_mem_pools_info_get(req, nmd);
 3118                                 netmap_mem_drop(na);
 3119                         } while (0);
 3120                         netmap_unget_na(na, ifp);
 3121                         NMG_UNLOCK();
 3122                         break;
 3123                 }
 3124 
 3125                 case NETMAP_REQ_CSB_ENABLE: {
 3126                         struct nmreq_option *opt;
 3127 
 3128                         opt = nmreq_getoption(hdr, NETMAP_REQ_OPT_CSB);
 3129                         if (opt == NULL) {
 3130                                 error = EINVAL;
 3131                         } else {
 3132                                 struct nmreq_opt_csb *csbo =
 3133                                         (struct nmreq_opt_csb *)opt;
 3134                                 NMG_LOCK();
 3135                                 error = netmap_csb_validate(priv, csbo);
 3136                                 NMG_UNLOCK();
 3137                                 opt->nro_status = error;
 3138                         }
 3139                         break;
 3140                 }
 3141 
 3142                 case NETMAP_REQ_SYNC_KLOOP_START: {
 3143                         error = netmap_sync_kloop(priv, hdr);
 3144                         break;
 3145                 }
 3146 
 3147                 case NETMAP_REQ_SYNC_KLOOP_STOP: {
 3148                         error = netmap_sync_kloop_stop(priv);
 3149                         break;
 3150                 }
 3151 
 3152                 default: {
 3153                         error = EINVAL;
 3154                         break;
 3155                 }
 3156                 }
 3157                 /* Write back request body to userspace and reset the
 3158                  * user-space pointer. */
 3159                 error = nmreq_copyout(hdr, error);
 3160                 break;
 3161         }
 3162 
 3163         case NIOCTXSYNC:
 3164         case NIOCRXSYNC: {
 3165                 if (unlikely(priv->np_nifp == NULL)) {
 3166                         error = ENXIO;
 3167                         break;
 3168                 }
 3169                 mb(); /* make sure following reads are not from cache */
 3170 
 3171                 if (unlikely(priv->np_csb_atok_base)) {
 3172                         nm_prerr("Invalid sync in CSB mode");
 3173                         error = EBUSY;
 3174                         break;
 3175                 }
 3176 
 3177                 na = priv->np_na;      /* we have a reference */
 3178 
 3179                 mbq_init(&q);
 3180                 t = (cmd == NIOCTXSYNC ? NR_TX : NR_RX);
 3181                 krings = NMR(na, t);
 3182                 qfirst = priv->np_qfirst[t];
 3183                 qlast = priv->np_qlast[t];
 3184                 sync_flags = priv->np_sync_flags;
 3185 
 3186                 for (i = qfirst; i < qlast; i++) {
 3187                         struct netmap_kring *kring = krings[i];
 3188                         struct netmap_ring *ring = kring->ring;
 3189 
 3190                         if (unlikely(nm_kr_tryget(kring, 1, &error))) {
 3191                                 error = (error ? EIO : 0);
 3192                                 continue;
 3193                         }
 3194 
 3195                         if (cmd == NIOCTXSYNC) {
 3196                                 if (netmap_debug & NM_DEBUG_TXSYNC)
 3197                                         nm_prinf("pre txsync ring %d cur %d hwcur %d",
 3198                                             i, ring->cur,
 3199                                             kring->nr_hwcur);
 3200                                 if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
 3201                                         netmap_ring_reinit(kring);
 3202                                 } else if (kring->nm_sync(kring, sync_flags | NAF_FORCE_RECLAIM) == 0) {
 3203                                         nm_sync_finalize(kring);
 3204                                 }
 3205                                 if (netmap_debug & NM_DEBUG_TXSYNC)
 3206                                         nm_prinf("post txsync ring %d cur %d hwcur %d",
 3207                                             i, ring->cur,
 3208                                             kring->nr_hwcur);
 3209                         } else {
 3210                                 if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
 3211                                         netmap_ring_reinit(kring);
 3212                                 }
 3213                                 if (nm_may_forward_up(kring)) {
 3214                                         /* transparent forwarding, see netmap_poll() */
 3215                                         netmap_grab_packets(kring, &q, netmap_fwd);
 3216                                 }
 3217                                 if (kring->nm_sync(kring, sync_flags | NAF_FORCE_READ) == 0) {
 3218                                         nm_sync_finalize(kring);
 3219                                 }
 3220                                 ring_timestamp_set(ring);
 3221                         }
 3222                         nm_kr_put(kring);
 3223                 }
 3224 
 3225                 if (mbq_peek(&q)) {
 3226                         netmap_send_up(na->ifp, &q);
 3227                 }
 3228 
 3229                 break;
 3230         }
 3231 
 3232         default: {
 3233                 return netmap_ioctl_legacy(priv, cmd, data, td);
 3234                 break;
 3235         }
 3236         }
 3237 
 3238         return (error);
 3239 }
 3240 
 3241 size_t
 3242 nmreq_size_by_type(uint16_t nr_reqtype)
 3243 {
 3244         switch (nr_reqtype) {
 3245         case NETMAP_REQ_REGISTER:
 3246                 return sizeof(struct nmreq_register);
 3247         case NETMAP_REQ_PORT_INFO_GET:
 3248                 return sizeof(struct nmreq_port_info_get);
 3249         case NETMAP_REQ_VALE_ATTACH:
 3250                 return sizeof(struct nmreq_vale_attach);
 3251         case NETMAP_REQ_VALE_DETACH:
 3252                 return sizeof(struct nmreq_vale_detach);
 3253         case NETMAP_REQ_VALE_LIST:
 3254                 return sizeof(struct nmreq_vale_list);
 3255         case NETMAP_REQ_PORT_HDR_SET:
 3256         case NETMAP_REQ_PORT_HDR_GET:
 3257                 return sizeof(struct nmreq_port_hdr);
 3258         case NETMAP_REQ_VALE_NEWIF:
 3259                 return sizeof(struct nmreq_vale_newif);
 3260         case NETMAP_REQ_VALE_DELIF:
 3261         case NETMAP_REQ_SYNC_KLOOP_STOP:
 3262         case NETMAP_REQ_CSB_ENABLE:
 3263                 return 0;
 3264         case NETMAP_REQ_VALE_POLLING_ENABLE:
 3265         case NETMAP_REQ_VALE_POLLING_DISABLE:
 3266                 return sizeof(struct nmreq_vale_polling);
 3267         case NETMAP_REQ_POOLS_INFO_GET:
 3268                 return sizeof(struct nmreq_pools_info);
 3269         case NETMAP_REQ_SYNC_KLOOP_START:
 3270                 return sizeof(struct nmreq_sync_kloop_start);
 3271         }
 3272         return 0;
 3273 }
 3274 
 3275 static size_t
 3276 nmreq_opt_size_by_type(uint32_t nro_reqtype, uint64_t nro_size)
 3277 {
 3278         size_t rv = sizeof(struct nmreq_option);
 3279 #ifdef NETMAP_REQ_OPT_DEBUG
 3280         if (nro_reqtype & NETMAP_REQ_OPT_DEBUG)
 3281                 return (nro_reqtype & ~NETMAP_REQ_OPT_DEBUG);
 3282 #endif /* NETMAP_REQ_OPT_DEBUG */
 3283         switch (nro_reqtype) {
 3284 #ifdef WITH_EXTMEM
 3285         case NETMAP_REQ_OPT_EXTMEM:
 3286                 rv = sizeof(struct nmreq_opt_extmem);
 3287                 break;
 3288 #endif /* WITH_EXTMEM */
 3289         case NETMAP_REQ_OPT_SYNC_KLOOP_EVENTFDS:
 3290                 if (nro_size >= rv)
 3291                         rv = nro_size;
 3292                 break;
 3293         case NETMAP_REQ_OPT_CSB:
 3294                 rv = sizeof(struct nmreq_opt_csb);
 3295                 break;
 3296         case NETMAP_REQ_OPT_SYNC_KLOOP_MODE:
 3297                 rv = sizeof(struct nmreq_opt_sync_kloop_mode);
 3298                 break;
 3299         case NETMAP_REQ_OPT_OFFSETS:
 3300                 rv = sizeof(struct nmreq_opt_offsets);
 3301                 break;
 3302         }
 3303         /* subtract the common header */
 3304         return rv - sizeof(struct nmreq_option);
 3305 }
 3306 
 3307 /*
 3308  * nmreq_copyin: create an in-kernel version of the request.
 3309  *
 3310  * We build the following data structure:
 3311  *
 3312  * hdr -> +-------+                buf
 3313  *        |       |          +---------------+
 3314  *        +-------+          |usr body ptr   |
 3315  *        |options|-.        +---------------+
 3316  *        +-------+ |        |usr options ptr|
 3317  *        |body   |--------->+---------------+
 3318  *        +-------+ |        |               |
 3319  *                  |        |  copy of body |
 3320  *                  |        |               |
 3321  *                  |        +---------------+
 3322  *                  |        |    NULL       |
 3323  *                  |        +---------------+
 3324  *                  |    .---|               |\
 3325  *                  |    |   +---------------+ |
 3326  *                  | .------|               | |
 3327  *                  | |  |   +---------------+  \ option table
 3328  *                  | |  |   |      ...      |  / indexed by option
 3329  *                  | |  |   +---------------+ |  type
 3330  *                  | |  |   |               | |
 3331  *                  | |  |   +---------------+/
 3332  *                  | |  |   |usr next ptr 1 |
 3333  *                  `-|----->+---------------+
 3334  *                    |  |   | copy of opt 1 |
 3335  *                    |  |   |               |
 3336  *                    |  | .-| nro_next      |
 3337  *                    |  | | +---------------+
 3338  *                    |  | | |usr next ptr 2 |
 3339  *                    |  `-`>+---------------+
 3340  *                    |      | copy of opt 2 |
 3341  *                    |      |               |
 3342  *                    |    .-| nro_next      |
 3343  *                    |    | +---------------+
 3344  *                    |    | |               |
 3345  *                    ~    ~ ~      ...      ~
 3346  *                    |    .-|               |
 3347  *                    `----->+---------------+
 3348  *                         | |usr next ptr n |
 3349  *                         `>+---------------+
 3350  *                           | copy of opt n |
 3351  *                           |               |
 3352  *                           | nro_next(NULL)|
 3353  *                           +---------------+
 3354  *
 3355  * The options and body fields of the hdr structure are overwritten
 3356  * with in-kernel valid pointers inside the buf. The original user
 3357  * pointers are saved in the buf and restored on copyout.
 3358  * The list of options is copied and the pointers adjusted. The
 3359  * original pointers are saved before the option they belonged.
 3360  *
 3361  * The option table has an entry for every available option.  Entries
 3362  * for options that have not been passed contain NULL.
 3363  *
 3364  */
 3365 
 3366 int
 3367 nmreq_copyin(struct nmreq_header *hdr, int nr_body_is_user)
 3368 {
 3369         size_t rqsz, optsz, bufsz;
 3370         int error = 0;
 3371         char *ker = NULL, *p;
 3372         struct nmreq_option **next, *src, **opt_tab;
 3373         uint64_t *ptrs;
 3374 
 3375         if (hdr->nr_reserved) {
 3376                 if (netmap_verbose)
 3377                         nm_prerr("nr_reserved must be zero");
 3378                 return EINVAL;
 3379         }
 3380 
 3381         if (!nr_body_is_user)
 3382                 return 0;
 3383 
 3384         hdr->nr_reserved = nr_body_is_user;
 3385 
 3386         /* compute the total size of the buffer */
 3387         rqsz = nmreq_size_by_type(hdr->nr_reqtype);
 3388         if (rqsz > NETMAP_REQ_MAXSIZE) {
 3389                 error = EMSGSIZE;
 3390                 goto out_err;
 3391         }
 3392         if ((rqsz && hdr->nr_body == (uintptr_t)NULL) ||
 3393                 (!rqsz && hdr->nr_body != (uintptr_t)NULL)) {
 3394                 /* Request body expected, but not found; or
 3395                  * request body found but unexpected. */
 3396                 if (netmap_verbose)
 3397                         nm_prerr("nr_body expected but not found, or vice versa");
 3398                 error = EINVAL;
 3399                 goto out_err;
 3400         }
 3401 
 3402         /*
 3403          * The buffer size must be large enough to store the request body,
 3404          * all the possible options and the additional user pointers
 3405          * (2+NETMAP_REQ_OPT_MAX). Note that the maximum size of body plus
 3406          * options can not exceed NETMAP_REQ_MAXSIZE;
 3407          */
 3408         bufsz = (2 + NETMAP_REQ_OPT_MAX) * sizeof(void *) + NETMAP_REQ_MAXSIZE +
 3409                 NETMAP_REQ_OPT_MAX * sizeof(opt_tab);
 3410 
 3411         ker = nm_os_malloc(bufsz);
 3412         if (ker == NULL) {
 3413                 error = ENOMEM;
 3414                 goto out_err;
 3415         }
 3416         p = ker;        /* write pointer into the buffer */
 3417 
 3418         /* make a copy of the user pointers */
 3419         ptrs = (uint64_t*)p;
 3420         *ptrs++ = hdr->nr_body;
 3421         *ptrs++ = hdr->nr_options;
 3422         p = (char *)ptrs;
 3423 
 3424         /* copy the body */
 3425         error = copyin((void *)(uintptr_t)hdr->nr_body, p, rqsz);
 3426         if (error)
 3427                 goto out_restore;
 3428         /* overwrite the user pointer with the in-kernel one */
 3429         hdr->nr_body = (uintptr_t)p;
 3430         p += rqsz;
 3431         /* start of the options table */
 3432         opt_tab = (struct nmreq_option **)p;
 3433         p += sizeof(opt_tab) * NETMAP_REQ_OPT_MAX;
 3434 
 3435         /* copy the options */
 3436         next = (struct nmreq_option **)&hdr->nr_options;
 3437         src = *next;
 3438         while (src) {
 3439                 struct nmreq_option *opt;
 3440 
 3441                 /* copy the option header */
 3442                 ptrs = (uint64_t *)p;
 3443                 opt = (struct nmreq_option *)(ptrs + 1);
 3444                 error = copyin(src, opt, sizeof(*src));
 3445                 if (error)
 3446                         goto out_restore;
 3447                 rqsz += sizeof(*src);
 3448                 /* make a copy of the user next pointer */
 3449                 *ptrs = opt->nro_next;
 3450                 /* overwrite the user pointer with the in-kernel one */
 3451                 *next = opt;
 3452 
 3453                 /* initialize the option as not supported.
 3454                  * Recognized options will update this field.
 3455                  */
 3456                 opt->nro_status = EOPNOTSUPP;
 3457 
 3458                 /* check for invalid types */
 3459                 if (opt->nro_reqtype < 1) {
 3460                         if (netmap_verbose)
 3461                                 nm_prinf("invalid option type: %u", opt->nro_reqtype);
 3462                         opt->nro_status = EINVAL;
 3463                         error = EINVAL;
 3464                         goto next;
 3465                 }
 3466 
 3467                 if (opt->nro_reqtype >= NETMAP_REQ_OPT_MAX) {
 3468                         /* opt->nro_status is already EOPNOTSUPP */
 3469                         error = EOPNOTSUPP;
 3470                         goto next;
 3471                 }
 3472 
 3473                 /* if the type is valid, index the option in the table
 3474                  * unless it is a duplicate.
 3475                  */
 3476                 if (opt_tab[opt->nro_reqtype] != NULL) {
 3477                         if (netmap_verbose)
 3478                                 nm_prinf("duplicate option: %u", opt->nro_reqtype);
 3479                         opt->nro_status = EINVAL;
 3480                         opt_tab[opt->nro_reqtype]->nro_status = EINVAL;
 3481                         error = EINVAL;
 3482                         goto next;
 3483                 }
 3484                 opt_tab[opt->nro_reqtype] = opt;
 3485 
 3486                 p = (char *)(opt + 1);
 3487 
 3488                 /* copy the option body */
 3489                 optsz = nmreq_opt_size_by_type(opt->nro_reqtype,
 3490                                                 opt->nro_size);
 3491                 /* check optsz and nro_size to avoid for possible integer overflows of rqsz */
 3492                 if ((optsz > NETMAP_REQ_MAXSIZE) || (opt->nro_size > NETMAP_REQ_MAXSIZE)
 3493                                 || (rqsz + optsz > NETMAP_REQ_MAXSIZE)
 3494                                 || (optsz > 0 && rqsz + optsz <= rqsz)) {
 3495                         error = EMSGSIZE;
 3496                         goto out_restore;
 3497                 }
 3498                 rqsz += optsz;
 3499                 if (optsz) {
 3500                         /* the option body follows the option header */
 3501                         error = copyin(src + 1, p, optsz);
 3502                         if (error)
 3503                                 goto out_restore;
 3504                         p += optsz;
 3505                 }
 3506 
 3507         next:
 3508                 /* move to next option */
 3509                 next = (struct nmreq_option **)&opt->nro_next;
 3510                 src = *next;
 3511         }
 3512         if (error)
 3513                 nmreq_copyout(hdr, error);
 3514         return error;
 3515 
 3516 out_restore:
 3517         ptrs = (uint64_t *)ker;
 3518         hdr->nr_body = *ptrs++;
 3519         hdr->nr_options = *ptrs++;
 3520         hdr->nr_reserved = 0;
 3521         nm_os_free(ker);
 3522 out_err:
 3523         return error;
 3524 }
 3525 
 3526 static int
 3527 nmreq_copyout(struct nmreq_header *hdr, int rerror)
 3528 {
 3529         struct nmreq_option *src, *dst;
 3530         void *ker = (void *)(uintptr_t)hdr->nr_body, *bufstart;
 3531         uint64_t *ptrs;
 3532         size_t bodysz;
 3533         int error;
 3534 
 3535         if (!hdr->nr_reserved)
 3536                 return rerror;
 3537 
 3538         /* restore the user pointers in the header */
 3539         ptrs = (uint64_t *)ker - 2;
 3540         bufstart = ptrs;
 3541         hdr->nr_body = *ptrs++;
 3542         src = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
 3543         hdr->nr_options = *ptrs;
 3544 
 3545         if (!rerror) {
 3546                 /* copy the body */
 3547                 bodysz = nmreq_size_by_type(hdr->nr_reqtype);
 3548                 error = copyout(ker, (void *)(uintptr_t)hdr->nr_body, bodysz);
 3549                 if (error) {
 3550                         rerror = error;
 3551                         goto out;
 3552                 }
 3553         }
 3554 
 3555         /* copy the options */
 3556         dst = (struct nmreq_option *)(uintptr_t)hdr->nr_options;
 3557         while (src) {
 3558                 size_t optsz;
 3559                 uint64_t next;
 3560 
 3561                 /* restore the user pointer */
 3562                 next = src->nro_next;
 3563                 ptrs = (uint64_t *)src - 1;
 3564                 src->nro_next = *ptrs;
 3565 
 3566                 /* always copy the option header */
 3567                 error = copyout(src, dst, sizeof(*src));
 3568                 if (error) {
 3569                         rerror = error;
 3570                         goto out;
 3571                 }
 3572 
 3573                 /* copy the option body only if there was no error */
 3574                 if (!rerror && !src->nro_status) {
 3575                         optsz = nmreq_opt_size_by_type(src->nro_reqtype,
 3576                                                         src->nro_size);
 3577                         if (optsz) {
 3578                                 error = copyout(src + 1, dst + 1, optsz);
 3579                                 if (error) {
 3580                                         rerror = error;
 3581                                         goto out;
 3582                                 }
 3583                         }
 3584                 }
 3585                 src = (struct nmreq_option *)(uintptr_t)next;
 3586                 dst = (struct nmreq_option *)(uintptr_t)*ptrs;
 3587         }
 3588 
 3589 
 3590 out:
 3591         hdr->nr_reserved = 0;
 3592         nm_os_free(bufstart);
 3593         return rerror;
 3594 }
 3595 
 3596 struct nmreq_option *
 3597 nmreq_getoption(struct nmreq_header *hdr, uint16_t reqtype)
 3598 {
 3599         struct nmreq_option **opt_tab;
 3600 
 3601         if (!hdr->nr_options)
 3602                 return NULL;
 3603 
 3604         opt_tab = (struct nmreq_option **)((uintptr_t)hdr->nr_options) -
 3605             (NETMAP_REQ_OPT_MAX + 1);
 3606         return opt_tab[reqtype];
 3607 }
 3608 
 3609 static int
 3610 nmreq_checkoptions(struct nmreq_header *hdr)
 3611 {
 3612         struct nmreq_option *opt;
 3613         /* return error if there is still any option
 3614          * marked as not supported
 3615          */
 3616 
 3617         for (opt = (struct nmreq_option *)(uintptr_t)hdr->nr_options; opt;
 3618              opt = (struct nmreq_option *)(uintptr_t)opt->nro_next)
 3619                 if (opt->nro_status == EOPNOTSUPP)
 3620                         return EOPNOTSUPP;
 3621 
 3622         return 0;
 3623 }
 3624 
 3625 /*
 3626  * select(2) and poll(2) handlers for the "netmap" device.
 3627  *
 3628  * Can be called for one or more queues.
 3629  * Return true the event mask corresponding to ready events.
 3630  * If there are no ready events (and 'sr' is not NULL), do a
 3631  * selrecord on either individual selinfo or on the global one.
 3632  * Device-dependent parts (locking and sync of tx/rx rings)
 3633  * are done through callbacks.
 3634  *
 3635  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
 3636  * The first one is remapped to pwait as selrecord() uses the name as an
 3637  * hidden argument.
 3638  */
 3639 int
 3640 netmap_poll(struct netmap_priv_d *priv, int events, NM_SELRECORD_T *sr)
 3641 {
 3642         struct netmap_adapter *na;
 3643         struct netmap_kring *kring;
 3644         struct netmap_ring *ring;
 3645         u_int i, want[NR_TXRX], revents = 0;
 3646         NM_SELINFO_T *si[NR_TXRX];
 3647 #define want_tx want[NR_TX]
 3648 #define want_rx want[NR_RX]
 3649         struct mbq q;   /* packets from RX hw queues to host stack */
 3650 
 3651         /*
 3652          * In order to avoid nested locks, we need to "double check"
 3653          * txsync and rxsync if we decide to do a selrecord().
 3654          * retry_tx (and retry_rx, later) prevent looping forever.
 3655          */
 3656         int retry_tx = 1, retry_rx = 1;
 3657 
 3658         /* Transparent mode: send_down is 1 if we have found some
 3659          * packets to forward (host RX ring --> NIC) during the rx
 3660          * scan and we have not sent them down to the NIC yet.
 3661          * Transparent mode requires to bind all rings to a single
 3662          * file descriptor.
 3663          */
 3664         int send_down = 0;
 3665         int sync_flags = priv->np_sync_flags;
 3666 
 3667         mbq_init(&q);
 3668 
 3669         if (unlikely(priv->np_nifp == NULL)) {
 3670                 return POLLERR;
 3671         }
 3672         mb(); /* make sure following reads are not from cache */
 3673 
 3674         na = priv->np_na;
 3675 
 3676         if (unlikely(!nm_netmap_on(na)))
 3677                 return POLLERR;
 3678 
 3679         if (unlikely(priv->np_csb_atok_base)) {
 3680                 nm_prerr("Invalid poll in CSB mode");
 3681                 return POLLERR;
 3682         }
 3683 
 3684         if (netmap_debug & NM_DEBUG_ON)
 3685                 nm_prinf("device %s events 0x%x", na->name, events);
 3686         want_tx = events & (POLLOUT | POLLWRNORM);
 3687         want_rx = events & (POLLIN | POLLRDNORM);
 3688 
 3689         /*
 3690          * If the card has more than one queue AND the file descriptor is
 3691          * bound to all of them, we sleep on the "global" selinfo, otherwise
 3692          * we sleep on individual selinfo (FreeBSD only allows two selinfo's
 3693          * per file descriptor).
 3694          * The interrupt routine in the driver wake one or the other
 3695          * (or both) depending on which clients are active.
 3696          *
 3697          * rxsync() is only called if we run out of buffers on a POLLIN.
 3698          * txsync() is called if we run out of buffers on POLLOUT, or
 3699          * there are pending packets to send. The latter can be disabled
 3700          * passing NETMAP_NO_TX_POLL in the NIOCREG call.
 3701          */
 3702         si[NR_RX] = priv->np_si[NR_RX];
 3703         si[NR_TX] = priv->np_si[NR_TX];
 3704 
 3705 #ifdef __FreeBSD__
 3706         /*
 3707          * We start with a lock free round which is cheap if we have
 3708          * slots available. If this fails, then lock and call the sync
 3709          * routines. We can't do this on Linux, as the contract says
 3710          * that we must call nm_os_selrecord() unconditionally.
 3711          */
 3712         if (want_tx) {
 3713                 const enum txrx t = NR_TX;
 3714                 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
 3715                         kring = NMR(na, t)[i];
 3716                         if (kring->ring->cur != kring->ring->tail) {
 3717                                 /* Some unseen TX space is available, so what
 3718                                  * we don't need to run txsync. */
 3719                                 revents |= want[t];
 3720                                 want[t] = 0;
 3721                                 break;
 3722                         }
 3723                 }
 3724         }
 3725         if (want_rx) {
 3726                 const enum txrx t = NR_RX;
 3727                 int rxsync_needed = 0;
 3728 
 3729                 for (i = priv->np_qfirst[t]; i < priv->np_qlast[t]; i++) {
 3730                         kring = NMR(na, t)[i];
 3731                         if (kring->ring->cur == kring->ring->tail
 3732                                 || kring->rhead != kring->ring->head) {
 3733                                 /* There are no unseen packets on this ring,
 3734                                  * or there are some buffers to be returned
 3735                                  * to the netmap port. We therefore go ahead
 3736                                  * and run rxsync. */
 3737                                 rxsync_needed = 1;
 3738                                 break;
 3739                         }
 3740                 }
 3741                 if (!rxsync_needed) {
 3742                         revents |= want_rx;
 3743                         want_rx = 0;
 3744                 }
 3745         }
 3746 #endif
 3747 
 3748 #ifdef linux
 3749         /* The selrecord must be unconditional on linux. */
 3750         nm_os_selrecord(sr, si[NR_RX]);
 3751         nm_os_selrecord(sr, si[NR_TX]);
 3752 #endif /* linux */
 3753 
 3754         /*
 3755          * If we want to push packets out (priv->np_txpoll) or
 3756          * want_tx is still set, we must issue txsync calls
 3757          * (on all rings, to avoid that the tx rings stall).
 3758          * Fortunately, normal tx mode has np_txpoll set.
 3759          */
 3760         if (priv->np_txpoll || want_tx) {
 3761                 /*
 3762                  * The first round checks if anyone is ready, if not
 3763                  * do a selrecord and another round to handle races.
 3764                  * want_tx goes to 0 if any space is found, and is
 3765                  * used to skip rings with no pending transmissions.
 3766                  */
 3767 flush_tx:
 3768                 for (i = priv->np_qfirst[NR_TX]; i < priv->np_qlast[NR_TX]; i++) {
 3769                         int found = 0;
 3770 
 3771                         kring = na->tx_rings[i];
 3772                         ring = kring->ring;
 3773 
 3774                         /*
 3775                          * Don't try to txsync this TX ring if we already found some
 3776                          * space in some of the TX rings (want_tx == 0) and there are no
 3777                          * TX slots in this ring that need to be flushed to the NIC
 3778                          * (head == hwcur).
 3779                          */
 3780                         if (!send_down && !want_tx && ring->head == kring->nr_hwcur)
 3781                                 continue;
 3782 
 3783                         if (nm_kr_tryget(kring, 1, &revents))
 3784                                 continue;
 3785 
 3786                         if (nm_txsync_prologue(kring, ring) >= kring->nkr_num_slots) {
 3787                                 netmap_ring_reinit(kring);
 3788                                 revents |= POLLERR;
 3789                         } else {
 3790                                 if (kring->nm_sync(kring, sync_flags))
 3791                                         revents |= POLLERR;
 3792                                 else
 3793                                         nm_sync_finalize(kring);
 3794                         }
 3795 
 3796                         /*
 3797                          * If we found new slots, notify potential
 3798                          * listeners on the same ring.
 3799                          * Since we just did a txsync, look at the copies
 3800                          * of cur,tail in the kring.
 3801                          */
 3802                         found = kring->rcur != kring->rtail;
 3803                         nm_kr_put(kring);
 3804                         if (found) { /* notify other listeners */
 3805                                 revents |= want_tx;
 3806                                 want_tx = 0;
 3807 #ifndef linux
 3808                                 kring->nm_notify(kring, 0);
 3809 #endif /* linux */
 3810                         }
 3811                 }
 3812                 /* if there were any packet to forward we must have handled them by now */
 3813                 send_down = 0;
 3814                 if (want_tx && retry_tx && sr) {
 3815 #ifndef linux
 3816                         nm_os_selrecord(sr, si[NR_TX]);
 3817 #endif /* !linux */
 3818                         retry_tx = 0;
 3819                         goto flush_tx;
 3820                 }
 3821         }
 3822 
 3823         /*
 3824          * If want_rx is still set scan receive rings.
 3825          * Do it on all rings because otherwise we starve.
 3826          */
 3827         if (want_rx) {
 3828                 /* two rounds here for race avoidance */
 3829 do_retry_rx:
 3830                 for (i = priv->np_qfirst[NR_RX]; i < priv->np_qlast[NR_RX]; i++) {
 3831                         int found = 0;
 3832 
 3833                         kring = na->rx_rings[i];
 3834                         ring = kring->ring;
 3835 
 3836                         if (unlikely(nm_kr_tryget(kring, 1, &revents)))
 3837                                 continue;
 3838 
 3839                         if (nm_rxsync_prologue(kring, ring) >= kring->nkr_num_slots) {
 3840                                 netmap_ring_reinit(kring);
 3841                                 revents |= POLLERR;
 3842                         }
 3843                         /* now we can use kring->rcur, rtail */
 3844 
 3845                         /*
 3846                          * transparent mode support: collect packets from
 3847                          * hw rxring(s) that have been released by the user
 3848                          */
 3849                         if (nm_may_forward_up(kring)) {
 3850                                 netmap_grab_packets(kring, &q, netmap_fwd);
 3851                         }
 3852 
 3853                         /* Clear the NR_FORWARD flag anyway, it may be set by
 3854                          * the nm_sync() below only on for the host RX ring (see
 3855                          * netmap_rxsync_from_host()). */
 3856                         kring->nr_kflags &= ~NR_FORWARD;
 3857                         if (kring->nm_sync(kring, sync_flags))
 3858                                 revents |= POLLERR;
 3859                         else
 3860                                 nm_sync_finalize(kring);
 3861                         send_down |= (kring->nr_kflags & NR_FORWARD);
 3862                         ring_timestamp_set(ring);
 3863                         found = kring->rcur != kring->rtail;
 3864                         nm_kr_put(kring);
 3865                         if (found) {
 3866                                 revents |= want_rx;
 3867                                 retry_rx = 0;
 3868 #ifndef linux
 3869                                 kring->nm_notify(kring, 0);
 3870 #endif /* linux */
 3871                         }
 3872                 }
 3873 
 3874 #ifndef linux
 3875                 if (retry_rx && sr) {
 3876                         nm_os_selrecord(sr, si[NR_RX]);
 3877                 }
 3878 #endif /* !linux */
 3879                 if (send_down || retry_rx) {
 3880                         retry_rx = 0;
 3881                         if (send_down)
 3882                                 goto flush_tx; /* and retry_rx */
 3883                         else
 3884                                 goto do_retry_rx;
 3885                 }
 3886         }
 3887 
 3888         /*
 3889          * Transparent mode: released bufs (i.e. between kring->nr_hwcur and
 3890          * ring->head) marked with NS_FORWARD on hw rx rings are passed up
 3891          * to the host stack.
 3892          */
 3893 
 3894         if (mbq_peek(&q)) {
 3895                 netmap_send_up(na->ifp, &q);
 3896         }
 3897 
 3898         return (revents);
 3899 #undef want_tx
 3900 #undef want_rx
 3901 }
 3902 
 3903 int
 3904 nma_intr_enable(struct netmap_adapter *na, int onoff)
 3905 {
 3906         bool changed = false;
 3907         enum txrx t;
 3908         int i;
 3909 
 3910         for_rx_tx(t) {
 3911                 for (i = 0; i < nma_get_nrings(na, t); i++) {
 3912                         struct netmap_kring *kring = NMR(na, t)[i];
 3913                         int on = !(kring->nr_kflags & NKR_NOINTR);
 3914 
 3915                         if (!!onoff != !!on) {
 3916                                 changed = true;
 3917                         }
 3918                         if (onoff) {
 3919                                 kring->nr_kflags &= ~NKR_NOINTR;
 3920                         } else {
 3921                                 kring->nr_kflags |= NKR_NOINTR;
 3922                         }
 3923                 }
 3924         }
 3925 
 3926         if (!changed) {
 3927                 return 0; /* nothing to do */
 3928         }
 3929 
 3930         if (!na->nm_intr) {
 3931                 nm_prerr("Cannot %s interrupts for %s", onoff ? "enable" : "disable",
 3932                   na->name);
 3933                 return -1;
 3934         }
 3935 
 3936         na->nm_intr(na, onoff);
 3937 
 3938         return 0;
 3939 }
 3940 
 3941 
 3942 /*-------------------- driver support routines -------------------*/
 3943 
 3944 /* default notify callback */
 3945 static int
 3946 netmap_notify(struct netmap_kring *kring, int flags)
 3947 {
 3948         struct netmap_adapter *na = kring->notify_na;
 3949         enum txrx t = kring->tx;
 3950 
 3951         nm_os_selwakeup(&kring->si);
 3952         /* optimization: avoid a wake up on the global
 3953          * queue if nobody has registered for more
 3954          * than one ring
 3955          */
 3956         if (na->si_users[t] > 0)
 3957                 nm_os_selwakeup(&na->si[t]);
 3958 
 3959         return NM_IRQ_COMPLETED;
 3960 }
 3961 
 3962 /* called by all routines that create netmap_adapters.
 3963  * provide some defaults and get a reference to the
 3964  * memory allocator
 3965  */
 3966 int
 3967 netmap_attach_common(struct netmap_adapter *na)
 3968 {
 3969         if (!na->rx_buf_maxsize) {
 3970                 /* Set a conservative default (larger is safer). */
 3971                 na->rx_buf_maxsize = PAGE_SIZE;
 3972         }
 3973 
 3974 #ifdef __FreeBSD__
 3975         if (na->na_flags & NAF_HOST_RINGS && na->ifp) {
 3976                 na->if_input = na->ifp->if_input; /* for netmap_send_up */
 3977         }
 3978         na->pdev = na; /* make sure netmap_mem_map() is called */
 3979 #endif /* __FreeBSD__ */
 3980         if (na->na_flags & NAF_HOST_RINGS) {
 3981                 if (na->num_host_rx_rings == 0)
 3982                         na->num_host_rx_rings = 1;
 3983                 if (na->num_host_tx_rings == 0)
 3984                         na->num_host_tx_rings = 1;
 3985         }
 3986         if (na->nm_krings_create == NULL) {
 3987                 /* we assume that we have been called by a driver,
 3988                  * since other port types all provide their own
 3989                  * nm_krings_create
 3990                  */
 3991                 na->nm_krings_create = netmap_hw_krings_create;
 3992                 na->nm_krings_delete = netmap_hw_krings_delete;
 3993         }
 3994         if (na->nm_notify == NULL)
 3995                 na->nm_notify = netmap_notify;
 3996         na->active_fds = 0;
 3997 
 3998         if (na->nm_mem == NULL) {
 3999                 /* use iommu or global allocator */
 4000                 na->nm_mem = netmap_mem_get_iommu(na);
 4001         }
 4002         if (na->nm_bdg_attach == NULL)
 4003                 /* no special nm_bdg_attach callback. On VALE
 4004                  * attach, we need to interpose a bwrap
 4005                  */
 4006                 na->nm_bdg_attach = netmap_default_bdg_attach;
 4007 
 4008         return 0;
 4009 }
 4010 
 4011 /* Wrapper for the register callback provided netmap-enabled
 4012  * hardware drivers.
 4013  * nm_iszombie(na) means that the driver module has been
 4014  * unloaded, so we cannot call into it.
 4015  * nm_os_ifnet_lock() must guarantee mutual exclusion with
 4016  * module unloading.
 4017  */
 4018 static int
 4019 netmap_hw_reg(struct netmap_adapter *na, int onoff)
 4020 {
 4021         struct netmap_hw_adapter *hwna =
 4022                 (struct netmap_hw_adapter*)na;
 4023         int error = 0;
 4024 
 4025         nm_os_ifnet_lock();
 4026 
 4027         if (nm_iszombie(na)) {
 4028                 if (onoff) {
 4029                         error = ENXIO;
 4030                 } else if (na != NULL) {
 4031                         na->na_flags &= ~NAF_NETMAP_ON;
 4032                 }
 4033                 goto out;
 4034         }
 4035 
 4036         error = hwna->nm_hw_register(na, onoff);
 4037 
 4038 out:
 4039         nm_os_ifnet_unlock();
 4040 
 4041         return error;
 4042 }
 4043 
 4044 static void
 4045 netmap_hw_dtor(struct netmap_adapter *na)
 4046 {
 4047         if (na->ifp == NULL)
 4048                 return;
 4049 
 4050         NM_DETACH_NA(na->ifp);
 4051 }
 4052 
 4053 
 4054 /*
 4055  * Allocate a netmap_adapter object, and initialize it from the
 4056  * 'arg' passed by the driver on attach.
 4057  * We allocate a block of memory of 'size' bytes, which has room
 4058  * for struct netmap_adapter plus additional room private to
 4059  * the caller.
 4060  * Return 0 on success, ENOMEM otherwise.
 4061  */
 4062 int
 4063 netmap_attach_ext(struct netmap_adapter *arg, size_t size, int override_reg)
 4064 {
 4065         struct netmap_hw_adapter *hwna = NULL;
 4066         struct ifnet *ifp = NULL;
 4067 
 4068         if (size < sizeof(struct netmap_hw_adapter)) {
 4069                 if (netmap_debug & NM_DEBUG_ON)
 4070                         nm_prerr("Invalid netmap adapter size %d", (int)size);
 4071                 return EINVAL;
 4072         }
 4073 
 4074         if (arg == NULL || arg->ifp == NULL) {
 4075                 if (netmap_debug & NM_DEBUG_ON)
 4076                         nm_prerr("either arg or arg->ifp is NULL");
 4077                 return EINVAL;
 4078         }
 4079 
 4080         if (arg->num_tx_rings == 0 || arg->num_rx_rings == 0) {
 4081                 if (netmap_debug & NM_DEBUG_ON)
 4082                         nm_prerr("%s: invalid rings tx %d rx %d",
 4083                                 arg->name, arg->num_tx_rings, arg->num_rx_rings);
 4084                 return EINVAL;
 4085         }
 4086 
 4087         ifp = arg->ifp;
 4088         if (NM_NA_CLASH(ifp)) {
 4089                 /* If NA(ifp) is not null but there is no valid netmap
 4090                  * adapter it means that someone else is using the same
 4091                  * pointer (e.g. ax25_ptr on linux). This happens for
 4092                  * instance when also PF_RING is in use. */
 4093                 nm_prerr("Error: netmap adapter hook is busy");
 4094                 return EBUSY;
 4095         }
 4096 
 4097         hwna = nm_os_malloc(size);
 4098         if (hwna == NULL)
 4099                 goto fail;
 4100         hwna->up = *arg;
 4101         hwna->up.na_flags |= NAF_HOST_RINGS | NAF_NATIVE;
 4102         strlcpy(hwna->up.name, ifp->if_xname, sizeof(hwna->up.name));
 4103         if (override_reg) {
 4104                 hwna->nm_hw_register = hwna->up.nm_register;
 4105                 hwna->up.nm_register = netmap_hw_reg;
 4106         }
 4107         if (netmap_attach_common(&hwna->up)) {
 4108                 nm_os_free(hwna);
 4109                 goto fail;
 4110         }
 4111         netmap_adapter_get(&hwna->up);
 4112 
 4113         NM_ATTACH_NA(ifp, &hwna->up);
 4114 
 4115         nm_os_onattach(ifp);
 4116 
 4117         if (arg->nm_dtor == NULL) {
 4118                 hwna->up.nm_dtor = netmap_hw_dtor;
 4119         }
 4120 
 4121         if_printf(ifp, "netmap queues/slots: TX %d/%d, RX %d/%d\n",
 4122             hwna->up.num_tx_rings, hwna->up.num_tx_desc,
 4123             hwna->up.num_rx_rings, hwna->up.num_rx_desc);
 4124         return 0;
 4125 
 4126 fail:
 4127         nm_prerr("fail, arg %p ifp %p na %p", arg, ifp, hwna);
 4128         return (hwna ? EINVAL : ENOMEM);
 4129 }
 4130 
 4131 
 4132 int
 4133 netmap_attach(struct netmap_adapter *arg)
 4134 {
 4135         return netmap_attach_ext(arg, sizeof(struct netmap_hw_adapter),
 4136                         1 /* override nm_reg */);
 4137 }
 4138 
 4139 
 4140 void
 4141 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
 4142 {
 4143         if (!na) {
 4144                 return;
 4145         }
 4146 
 4147         refcount_acquire(&na->na_refcount);
 4148 }
 4149 
 4150 
 4151 /* returns 1 iff the netmap_adapter is destroyed */
 4152 int
 4153 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
 4154 {
 4155         if (!na)
 4156                 return 1;
 4157 
 4158         if (!refcount_release(&na->na_refcount))
 4159                 return 0;
 4160 
 4161         if (na->nm_dtor)
 4162                 na->nm_dtor(na);
 4163 
 4164         if (na->tx_rings) { /* XXX should not happen */
 4165                 if (netmap_debug & NM_DEBUG_ON)
 4166                         nm_prerr("freeing leftover tx_rings");
 4167                 na->nm_krings_delete(na);
 4168         }
 4169         netmap_pipe_dealloc(na);
 4170         if (na->nm_mem)
 4171                 netmap_mem_put(na->nm_mem);
 4172         bzero(na, sizeof(*na));
 4173         nm_os_free(na);
 4174 
 4175         return 1;
 4176 }
 4177 
 4178 /* nm_krings_create callback for all hardware native adapters */
 4179 int
 4180 netmap_hw_krings_create(struct netmap_adapter *na)
 4181 {
 4182         int ret = netmap_krings_create(na, 0);
 4183         if (ret == 0) {
 4184                 /* initialize the mbq for the sw rx ring */
 4185                 u_int lim = netmap_real_rings(na, NR_RX), i;
 4186                 for (i = na->num_rx_rings; i < lim; i++) {
 4187                         mbq_safe_init(&NMR(na, NR_RX)[i]->rx_queue);
 4188                 }
 4189                 nm_prdis("initialized sw rx queue %d", na->num_rx_rings);
 4190         }
 4191         return ret;
 4192 }
 4193 
 4194 
 4195 
 4196 /*
 4197  * Called on module unload by the netmap-enabled drivers
 4198  */
 4199 void
 4200 netmap_detach(struct ifnet *ifp)
 4201 {
 4202         struct netmap_adapter *na;
 4203 
 4204         NMG_LOCK();
 4205 
 4206         if (!NM_NA_VALID(ifp)) {
 4207                 NMG_UNLOCK();
 4208                 return;
 4209         }
 4210 
 4211         na = NA(ifp);
 4212         netmap_set_all_rings(na, NM_KR_LOCKED);
 4213         /*
 4214          * if the netmap adapter is not native, somebody
 4215          * changed it, so we can not release it here.
 4216          * The NAF_ZOMBIE flag will notify the new owner that
 4217          * the driver is gone.
 4218          */
 4219         if (!(na->na_flags & NAF_NATIVE) || !netmap_adapter_put(na)) {
 4220                 na->na_flags |= NAF_ZOMBIE;
 4221         }
 4222         /* give active users a chance to notice that NAF_ZOMBIE has been
 4223          * turned on, so that they can stop and return an error to userspace.
 4224          * Note that this becomes a NOP if there are no active users and,
 4225          * therefore, the put() above has deleted the na, since now NA(ifp) is
 4226          * NULL.
 4227          */
 4228         netmap_enable_all_rings(ifp);
 4229         NMG_UNLOCK();
 4230 }
 4231 
 4232 
 4233 /*
 4234  * Intercept packets from the network stack and pass them
 4235  * to netmap as incoming packets on the 'software' ring.
 4236  *
 4237  * We only store packets in a bounded mbq and then copy them
 4238  * in the relevant rxsync routine.
 4239  *
 4240  * We rely on the OS to make sure that the ifp and na do not go
 4241  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
 4242  * In nm_register() or whenever there is a reinitialization,
 4243  * we make sure to make the mode change visible here.
 4244  */
 4245 int
 4246 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
 4247 {
 4248         struct netmap_adapter *na = NA(ifp);
 4249         struct netmap_kring *kring, *tx_kring;
 4250         u_int len = MBUF_LEN(m);
 4251         u_int error = ENOBUFS;
 4252         unsigned int txr;
 4253         struct mbq *q;
 4254         int busy;
 4255         u_int i;
 4256 
 4257         i = MBUF_TXQ(m);
 4258         if (i >= na->num_host_rx_rings) {
 4259                 i = i % na->num_host_rx_rings;
 4260         }
 4261         kring = NMR(na, NR_RX)[nma_get_nrings(na, NR_RX) + i];
 4262 
 4263         // XXX [Linux] we do not need this lock
 4264         // if we follow the down/configure/up protocol -gl
 4265         // mtx_lock(&na->core_lock);
 4266 
 4267         if (!nm_netmap_on(na)) {
 4268                 nm_prerr("%s not in netmap mode anymore", na->name);
 4269                 error = ENXIO;
 4270                 goto done;
 4271         }
 4272 
 4273         txr = MBUF_TXQ(m);
 4274         if (txr >= na->num_tx_rings) {
 4275                 txr %= na->num_tx_rings;
 4276         }
 4277         tx_kring = NMR(na, NR_TX)[txr];
 4278 
 4279         if (tx_kring->nr_mode == NKR_NETMAP_OFF) {
 4280                 return MBUF_TRANSMIT(na, ifp, m);
 4281         }
 4282 
 4283         q = &kring->rx_queue;
 4284 
 4285         // XXX reconsider long packets if we handle fragments
 4286         if (len > NETMAP_BUF_SIZE(na)) { /* too long for us */
 4287                 nm_prerr("%s from_host, drop packet size %d > %d", na->name,
 4288                         len, NETMAP_BUF_SIZE(na));
 4289                 goto done;
 4290         }
 4291 
 4292         if (!netmap_generic_hwcsum) {
 4293                 if (nm_os_mbuf_has_csum_offld(m)) {
 4294                         nm_prlim(1, "%s drop mbuf that needs checksum offload", na->name);
 4295                         goto done;
 4296                 }
 4297         }
 4298 
 4299         if (nm_os_mbuf_has_seg_offld(m)) {
 4300                 nm_prlim(1, "%s drop mbuf that needs generic segmentation offload", na->name);
 4301                 goto done;
 4302         }
 4303 
 4304 #ifdef __FreeBSD__
 4305         ETHER_BPF_MTAP(ifp, m);
 4306 #endif /* __FreeBSD__ */
 4307 
 4308         /* protect against netmap_rxsync_from_host(), netmap_sw_to_nic()
 4309          * and maybe other instances of netmap_transmit (the latter
 4310          * not possible on Linux).
 4311          * We enqueue the mbuf only if we are sure there is going to be
 4312          * enough room in the host RX ring, otherwise we drop it.
 4313          */
 4314         mbq_lock(q);
 4315 
 4316         busy = kring->nr_hwtail - kring->nr_hwcur;
 4317         if (busy < 0)
 4318                 busy += kring->nkr_num_slots;
 4319         if (busy + mbq_len(q) >= kring->nkr_num_slots - 1) {
 4320                 nm_prlim(2, "%s full hwcur %d hwtail %d qlen %d", na->name,
 4321                         kring->nr_hwcur, kring->nr_hwtail, mbq_len(q));
 4322         } else {
 4323                 mbq_enqueue(q, m);
 4324                 nm_prdis(2, "%s %d bufs in queue", na->name, mbq_len(q));
 4325                 /* notify outside the lock */
 4326                 m = NULL;
 4327                 error = 0;
 4328         }
 4329         mbq_unlock(q);
 4330 
 4331 done:
 4332         if (m) {
 4333                 if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 4334                 m_freem(m);
 4335         }
 4336         /* unconditionally wake up listeners */
 4337         kring->nm_notify(kring, 0);
 4338         /* this is normally netmap_notify(), but for nics
 4339          * connected to a bridge it is netmap_bwrap_intr_notify(),
 4340          * that possibly forwards the frames through the switch
 4341          */
 4342 
 4343         return (error);
 4344 }
 4345 
 4346 
 4347 /*
 4348  * Reset function to be called by the driver routines when reinitializing
 4349  * a hardware ring. The driver is in charge of locking to protect the kring
 4350  * while this operation is being performed. This is normally achieved by
 4351  * calling netmap_disable_all_rings() before triggering a reset.
 4352  * If the kring is not in netmap mode, return NULL to inform the caller
 4353  * that this is the case.
 4354  * If the kring is in netmap mode, set hwofs so that the netmap indices
 4355  * seen by userspace (head/cut/tail) do not change, although the internal
 4356  * NIC indices have been reset to 0.
 4357  * In any case, adjust kring->nr_mode.
 4358  */
 4359 struct netmap_slot *
 4360 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
 4361         u_int new_cur)
 4362 {
 4363         struct netmap_kring *kring;
 4364         u_int new_hwtail, new_hwofs;
 4365 
 4366         if (!nm_native_on(na)) {
 4367                 nm_prdis("interface not in native netmap mode");
 4368                 return NULL;    /* nothing to reinitialize */
 4369         }
 4370 
 4371         if (tx == NR_TX) {
 4372                 if (n >= na->num_tx_rings)
 4373                         return NULL;
 4374                 kring = na->tx_rings[n];
 4375                 /*
 4376                  * Set hwofs to rhead, so that slots[rhead] is mapped to
 4377                  * the NIC internal slot 0, and thus the netmap buffer
 4378                  * at rhead is the next to be transmitted. Transmissions
 4379                  * that were pending before the reset are considered as
 4380                  * sent, so that we can have hwcur = rhead. All the slots
 4381                  * are now owned by the user, so we can also reinit hwtail.
 4382                  */
 4383                 new_hwofs = kring->rhead;
 4384                 new_hwtail = nm_prev(kring->rhead, kring->nkr_num_slots - 1);
 4385         } else {
 4386                 if (n >= na->num_rx_rings)
 4387                         return NULL;
 4388                 kring = na->rx_rings[n];
 4389                 /*
 4390                  * Set hwofs to hwtail, so that slots[hwtail] is mapped to
 4391                  * the NIC internal slot 0, and thus the netmap buffer
 4392                  * at hwtail is the next to be given to the NIC.
 4393                  * Unread slots (the ones in [rhead,hwtail[) are owned by
 4394                  * the user, and thus the caller cannot give them
 4395                  * to the NIC right now.
 4396                  */
 4397                 new_hwofs = kring->nr_hwtail;
 4398                 new_hwtail = kring->nr_hwtail;
 4399         }
 4400         if (kring->nr_pending_mode == NKR_NETMAP_OFF) {
 4401                 kring->nr_mode = NKR_NETMAP_OFF;
 4402                 return NULL;
 4403         }
 4404         if (netmap_verbose) {
 4405             nm_prinf("%s, hc %u->%u, ht %u->%u, ho %u->%u", kring->name,
 4406                 kring->nr_hwcur, kring->rhead,
 4407                 kring->nr_hwtail, new_hwtail,
 4408                 kring->nkr_hwofs, new_hwofs);
 4409         }
 4410         kring->nr_hwcur = kring->rhead;
 4411         kring->nr_hwtail = new_hwtail;
 4412         kring->nkr_hwofs = new_hwofs;
 4413 
 4414         /*
 4415          * Wakeup on the individual and global selwait
 4416          * We do the wakeup here, but the ring is not yet reconfigured.
 4417          * However, we are under lock so there are no races.
 4418          */
 4419         kring->nr_mode = NKR_NETMAP_ON;
 4420         kring->nm_notify(kring, 0);
 4421         return kring->ring->slot;
 4422 }
 4423 
 4424 
 4425 /*
 4426  * Dispatch rx/tx interrupts to the netmap rings.
 4427  *
 4428  * "work_done" is non-null on the RX path, NULL for the TX path.
 4429  * We rely on the OS to make sure that there is only one active
 4430  * instance per queue, and that there is appropriate locking.
 4431  *
 4432  * The 'notify' routine depends on what the ring is attached to.
 4433  * - for a netmap file descriptor, do a selwakeup on the individual
 4434  *   waitqueue, plus one on the global one if needed
 4435  *   (see netmap_notify)
 4436  * - for a nic connected to a switch, call the proper forwarding routine
 4437  *   (see netmap_bwrap_intr_notify)
 4438  */
 4439 int
 4440 netmap_common_irq(struct netmap_adapter *na, u_int q, u_int *work_done)
 4441 {
 4442         struct netmap_kring *kring;
 4443         enum txrx t = (work_done ? NR_RX : NR_TX);
 4444 
 4445         q &= NETMAP_RING_MASK;
 4446 
 4447         if (netmap_debug & (NM_DEBUG_RXINTR|NM_DEBUG_TXINTR)) {
 4448                 nm_prlim(5, "received %s queue %d", work_done ? "RX" : "TX" , q);
 4449         }
 4450 
 4451         if (q >= nma_get_nrings(na, t))
 4452                 return NM_IRQ_PASS; // not a physical queue
 4453 
 4454         kring = NMR(na, t)[q];
 4455 
 4456         if (kring->nr_mode == NKR_NETMAP_OFF) {
 4457                 return NM_IRQ_PASS;
 4458         }
 4459 
 4460         if (t == NR_RX) {
 4461                 kring->nr_kflags |= NKR_PENDINTR;       // XXX atomic ?
 4462                 *work_done = 1; /* do not fire napi again */
 4463         }
 4464 
 4465         return kring->nm_notify(kring, 0);
 4466 }
 4467 
 4468 
 4469 /*
 4470  * Default functions to handle rx/tx interrupts from a physical device.
 4471  * "work_done" is non-null on the RX path, NULL for the TX path.
 4472  *
 4473  * If the card is not in netmap mode, simply return NM_IRQ_PASS,
 4474  * so that the caller proceeds with regular processing.
 4475  * Otherwise call netmap_common_irq().
 4476  *
 4477  * If the card is connected to a netmap file descriptor,
 4478  * do a selwakeup on the individual queue, plus one on the global one
 4479  * if needed (multiqueue card _and_ there are multiqueue listeners),
 4480  * and return NR_IRQ_COMPLETED.
 4481  *
 4482  * Finally, if called on rx from an interface connected to a switch,
 4483  * calls the proper forwarding routine.
 4484  */
 4485 int
 4486 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
 4487 {
 4488         struct netmap_adapter *na = NA(ifp);
 4489 
 4490         /*
 4491          * XXX emulated netmap mode sets NAF_SKIP_INTR so
 4492          * we still use the regular driver even though the previous
 4493          * check fails. It is unclear whether we should use
 4494          * nm_native_on() here.
 4495          */
 4496         if (!nm_netmap_on(na))
 4497                 return NM_IRQ_PASS;
 4498 
 4499         if (na->na_flags & NAF_SKIP_INTR) {
 4500                 nm_prdis("use regular interrupt");
 4501                 return NM_IRQ_PASS;
 4502         }
 4503 
 4504         return netmap_common_irq(na, q, work_done);
 4505 }
 4506 
 4507 /* set/clear native flags and if_transmit/netdev_ops */
 4508 void
 4509 nm_set_native_flags(struct netmap_adapter *na)
 4510 {
 4511         struct ifnet *ifp = na->ifp;
 4512 
 4513         /* We do the setup for intercepting packets only if we are the
 4514          * first user of this adapter. */
 4515         if (na->active_fds > 0) {
 4516                 return;
 4517         }
 4518 
 4519         na->na_flags |= NAF_NETMAP_ON;
 4520         nm_os_onenter(ifp);
 4521         netmap_update_hostrings_mode(na);
 4522 }
 4523 
 4524 void
 4525 nm_clear_native_flags(struct netmap_adapter *na)
 4526 {
 4527         struct ifnet *ifp = na->ifp;
 4528 
 4529         /* We undo the setup for intercepting packets only if we are the
 4530          * last user of this adapter. */
 4531         if (na->active_fds > 0) {
 4532                 return;
 4533         }
 4534 
 4535         netmap_update_hostrings_mode(na);
 4536         nm_os_onexit(ifp);
 4537 
 4538         na->na_flags &= ~NAF_NETMAP_ON;
 4539 }
 4540 
 4541 void
 4542 netmap_krings_mode_commit(struct netmap_adapter *na, int onoff)
 4543 {
 4544         enum txrx t;
 4545 
 4546         for_rx_tx(t) {
 4547                 int i;
 4548 
 4549                 for (i = 0; i < netmap_real_rings(na, t); i++) {
 4550                         struct netmap_kring *kring = NMR(na, t)[i];
 4551 
 4552                         if (onoff && nm_kring_pending_on(kring))
 4553                                 kring->nr_mode = NKR_NETMAP_ON;
 4554                         else if (!onoff && nm_kring_pending_off(kring))
 4555                                 kring->nr_mode = NKR_NETMAP_OFF;
 4556                 }
 4557         }
 4558 }
 4559 
/*
 * Module loader and unloader
 *
 * netmap_init() creates the /dev/netmap device and initializes
 * all global variables. Returns 0 on success, an errno value on
 * failure.
 *
 * netmap_fini() destroys everything.
 */

/* Set by netmap_init() via make_dev_credf(); released by netmap_fini()
 * with destroy_dev(). Stays NULL if the device was never created. */
static struct cdev *netmap_dev; /* /dev/netmap character device. */
/* cdevsw handed to make_dev_credf() when creating the device; it is
 * defined outside this part of the file. */
extern struct cdevsw netmap_cdevsw;
 4572 
 4573 
 4574 void
 4575 netmap_fini(void)
 4576 {
 4577         if (netmap_dev)
 4578                 destroy_dev(netmap_dev);
 4579         /* we assume that there are no longer netmap users */
 4580         nm_os_ifnet_fini();
 4581         netmap_uninit_bridges();
 4582         netmap_mem_fini();
 4583         NMG_LOCK_DESTROY();
 4584         nm_prinf("netmap: unloaded module.");
 4585 }
 4586 
 4587 
 4588 int
 4589 netmap_init(void)
 4590 {
 4591         int error;
 4592 
 4593         NMG_LOCK_INIT();
 4594 
 4595         error = netmap_mem_init();
 4596         if (error != 0)
 4597                 goto fail;
 4598         /*
 4599          * MAKEDEV_ETERNAL_KLD avoids an expensive check on syscalls
 4600          * when the module is compiled in.
 4601          * XXX could use make_dev_credv() to get error number
 4602          */
 4603         netmap_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
 4604                 &netmap_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600,
 4605                               "netmap");
 4606         if (!netmap_dev)
 4607                 goto fail;
 4608 
 4609         error = netmap_init_bridges();
 4610         if (error)
 4611                 goto fail;
 4612 
 4613 #ifdef __FreeBSD__
 4614         nm_os_vi_init_index();
 4615 #endif
 4616 
 4617         error = nm_os_ifnet_init();
 4618         if (error)
 4619                 goto fail;
 4620 
 4621 #if !defined(__FreeBSD__) || defined(KLD_MODULE)
 4622         nm_prinf("netmap: loaded module");
 4623 #endif
 4624         return (0);
 4625 fail:
 4626         netmap_fini();
 4627         return (EINVAL); /* may be incorrect */
 4628 }
Cache object: 83b513b259c8271a9786219eb48ce807 
 
 |