FreeBSD/Linux Kernel Cross Reference
sys/dev/netmap/netmap_vale.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (C) 2013-2016 Università di Pisa
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  *   1. Redistributions of source code must retain the above copyright
   11  *      notice, this list of conditions and the following disclaimer.
   12  *   2. Redistributions in binary form must reproduce the above copyright
   13  *      notice, this list of conditions and the following disclaimer in the
   14  *      documentation and/or other materials provided with the distribution.
   15  *
   16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   26  * SUCH DAMAGE.
   27  */
   28 
   29 
   30 #if defined(__FreeBSD__)
   31 #include <sys/cdefs.h> /* prerequisite */
   32 __FBSDID("$FreeBSD$");
   33 
   34 #include <sys/types.h>
   35 #include <sys/errno.h>
   36 #include <sys/param.h>  /* defines used in kernel.h */
   37 #include <sys/kernel.h> /* types used in module initialization */
   38 #include <sys/conf.h>   /* cdevsw struct, UID, GID */
   39 #include <sys/sockio.h>
   40 #include <sys/socketvar.h>      /* struct socket */
   41 #include <sys/malloc.h>
   42 #include <sys/poll.h>
   43 #include <sys/rwlock.h>
   44 #include <sys/socket.h> /* sockaddrs */
   45 #include <sys/selinfo.h>
   46 #include <sys/sysctl.h>
   47 #include <net/if.h>
   48 #include <net/if_var.h>
   49 #include <net/bpf.h>            /* BIOCIMMEDIATE */
   50 #include <machine/bus.h>        /* bus_dmamap_* */
   51 #include <sys/endian.h>
   52 #include <sys/refcount.h>
   53 #include <sys/smp.h>
   54 
   55 
   56 #elif defined(linux)
   57 
   58 #include "bsd_glue.h"
   59 
   60 #elif defined(__APPLE__)
   61 
   62 #warning OSX support is only partial
   63 #include "osx_glue.h"
   64 
   65 #elif defined(_WIN32)
   66 #include "win_glue.h"
   67 
   68 #else
   69 
   70 #error  Unsupported platform
   71 
   72 #endif /* unsupported */
   73 
   74 /*
   75  * common headers
   76  */
   77 
   78 #include <net/netmap.h>
   79 #include <dev/netmap/netmap_kern.h>
   80 #include <dev/netmap/netmap_mem2.h>
   81 #include <dev/netmap/netmap_bdg.h>
   82 
   83 #ifdef WITH_VALE
   84 
   85 /*
   86  * system parameters (most of them in netmap_kern.h)
   87  * NM_BDG_NAME          prefix for switch port names, default "vale"
   88  * NM_BDG_MAXPORTS      number of ports
   89  * NM_BRIDGES           max number of switches in the system.
   90  *
   91  * Switch ports are named valeX:Y where X is the switch name and Y
   92  * is the port. If Y matches a physical interface name, the port is
   93  * connected to a physical device.
   94  *
   95  * Unlike physical interfaces, switch ports use their own memory region
   96  * for rings and buffers.
   97  * The virtual interfaces use a per-queue lock instead of the core lock.
   98  * In the tx loop, we aggregate traffic in batches to make all operations
   99  * faster. The batch size is bridge_batch.
  100  */
  101 #define NM_BDG_MAXRINGS         16      /* XXX unclear how many (must be a pow of 2). */
  102 #define NM_BDG_MAXSLOTS         4096    /* XXX same as above */
  103 #define NM_BRIDGE_RINGSIZE      1024    /* in the device */
  104 #define NM_BDG_BATCH            1024    /* entries in the forwarding buffer */
  105 /* actual size of the tables */
  106 #define NM_BDG_BATCH_MAX        (NM_BDG_BATCH + NETMAP_MAX_FRAGS)
  107 /* NM_FT_NULL terminates a list of slots in the ft */
  108 #define NM_FT_NULL              NM_BDG_BATCH_MAX
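
/*
 * Illustrative userspace sketch (not compiled here; the switch and
 * port names "vale0" and "p0" are assumptions): a process attaches
 * to a VALE port by opening it with the libnetmap API, following the
 * valeX:Y naming convention described above.
 *
 *	#include <libnetmap.h>
 *
 *	struct nmport_d *d = nmport_open("vale0:p0");
 *	if (d != NULL) {
 *		... tx/rx on the rings described by d->reg ...
 *		nmport_close(d);
 *	}
 */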
  109 
  110 
  111 /*
  112  * bridge_batch is set via sysctl to the max batch size to be
  113  * used in the bridge. The actual value may be larger as the
  114  * last packet in the block may overflow the size.
  115  */
  116 static int bridge_batch = NM_BDG_BATCH; /* bridge batch size */
  117 
  118 /* Max number of vale bridges (loader tunable). */
  119 unsigned int vale_max_bridges = NM_BRIDGES;
  120 
  121 SYSBEGIN(vars_vale);
  122 SYSCTL_DECL(_dev_netmap);
  123 SYSCTL_INT(_dev_netmap, OID_AUTO, bridge_batch, CTLFLAG_RW, &bridge_batch, 0,
  124                 "Max batch size to be used in the bridge");
  125 SYSCTL_UINT(_dev_netmap, OID_AUTO, max_bridges, CTLFLAG_RDTUN, &vale_max_bridges, 0,
  126                 "Max number of vale bridges");
  127 SYSEND;
  128 
  129 static int netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *,
  130                 struct netmap_mem_d *nmd, struct netmap_vp_adapter **);
  131 static int netmap_vale_vp_bdg_attach(const char *, struct netmap_adapter *,
  132                 struct nm_bridge *);
  133 static int netmap_vale_bwrap_attach(const char *, struct netmap_adapter *);
  134 
  135 /*
  136  * For each output interface, nm_vale_q is used to construct a list.
  137  * bq_len is the number of output buffers (we can have coalescing
  138  * during the copy).
  139  */
  140 struct nm_vale_q {
  141         uint16_t bq_head;
  142         uint16_t bq_tail;
  143         uint32_t bq_len;        /* number of buffers */
  144 };
  145 
  146 /* Holds the default callbacks */
  147 struct netmap_bdg_ops vale_bdg_ops = {
  148         .lookup = netmap_vale_learning,
  149         .config = NULL,
  150         .dtor = NULL,
  151         .vp_create = netmap_vale_vp_create,
  152         .bwrap_attach = netmap_vale_bwrap_attach,
  153         .name = NM_BDG_NAME,
  154 };
  155 
  156 /*
  157  * This is a slightly optimized copy routine which rounds the
  158  * length up to a multiple of 64 bytes and is often faster than
  159  * dealing with odd sizes. We assume there is enough room in both
  160  * the source and destination buffers.
  161  *
  162  * XXX only for multiples of NM_BUF_ALIGN bytes, non-overlapping.
  163  */
  164 
  165 static inline void
  166 pkt_copy(void *_src, void *_dst, int l)
  167 {
  168         uint64_t *src = _src;
  169         uint64_t *dst = _dst;
  170         if (unlikely(l >= 1024)) {
  171                 memcpy(dst, src, l);
  172                 return;
  173         }
  174         for (; likely(l > 0); l -= NM_BUF_ALIGN) {
  175                 /* XXX the loop body must contain NM_BUF_ALIGN/sizeof(uint64_t) copy statements */
  176                 *dst++ = *src++;
  177                 *dst++ = *src++;
  178                 *dst++ = *src++;
  179                 *dst++ = *src++;
  180                 *dst++ = *src++;
  181                 *dst++ = *src++;
  182                 *dst++ = *src++;
  183                 *dst++ = *src++;
  184         }
  185 }
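
#if 0
/* Reference sketch (not compiled): an unoptimized equivalent of
 * pkt_copy() above. For short packets the loop copies NM_BUF_ALIGN
 * bytes per iteration, i.e. the length is effectively rounded up to
 * the next multiple of NM_BUF_ALIGN; roundup() is the macro from
 * <sys/param.h>.
 */
static inline void
pkt_copy_ref(const void *src, void *dst, int l)
{
	if (l >= 1024) {
		memcpy(dst, src, l);	/* large packets: plain memcpy */
		return;
	}
	memcpy(dst, src, roundup(l, NM_BUF_ALIGN));
}
#endif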
  186 
  187 
  188 /*
  189  * Free the forwarding tables for rings attached to switch ports.
  190  */
  191 static void
  192 nm_free_bdgfwd(struct netmap_adapter *na)
  193 {
  194         int nrings, i;
  195         struct netmap_kring **kring;
  196 
  197         NMG_LOCK_ASSERT();
  198         nrings = na->num_tx_rings;
  199         kring = na->tx_rings;
  200         for (i = 0; i < nrings; i++) {
  201                 if (kring[i]->nkr_ft) {
  202                         nm_os_free(kring[i]->nkr_ft);
  203                         kring[i]->nkr_ft = NULL; /* protect from freeing twice */
  204                 }
  205         }
  206 }
  207 
  208 
  209 /*
  210  * Allocate the forwarding tables for the rings attached to the bridge ports.
  211  */
  212 static int
  213 nm_alloc_bdgfwd(struct netmap_adapter *na)
  214 {
  215         int nrings, l, i, num_dstq;
  216         struct netmap_kring **kring;
  217 
  218         NMG_LOCK_ASSERT();
  219         /* all port:rings + broadcast */
  220         num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
  221         l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH_MAX;
  222         l += sizeof(struct nm_vale_q) * num_dstq;
  223         l += sizeof(uint16_t) * NM_BDG_BATCH_MAX;
  224 
  225         nrings = netmap_real_rings(na, NR_TX);
  226         kring = na->tx_rings;
  227         for (i = 0; i < nrings; i++) {
  228                 struct nm_bdg_fwd *ft;
  229                 struct nm_vale_q *dstq;
  230                 int j;
  231 
  232                 ft = nm_os_malloc(l);
  233                 if (!ft) {
  234                         nm_free_bdgfwd(na);
  235                         return ENOMEM;
  236                 }
  237                 dstq = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
  238                 for (j = 0; j < num_dstq; j++) {
  239                         dstq[j].bq_head = dstq[j].bq_tail = NM_FT_NULL;
  240                         dstq[j].bq_len = 0;
  241                 }
  242                 kring[i]->nkr_ft = ft;
  243         }
  244         return 0;
  245 }
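
/*
 * Layout of each per-ring allocation made above (sketch, matching the
 * pointer arithmetic in nm_vale_flush() below):
 *
 *	ft[0 .. NM_BDG_BATCH_MAX-1]	struct nm_bdg_fwd, work area
 *	dstq[0 .. num_dstq-1]		struct nm_vale_q, one queue per
 *					port:ring pair, plus broadcast
 *	dsts[0 .. NM_BDG_BATCH_MAX-1]	uint16_t destination indexes
 */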
  246 
  247 /* Allows external modules to create bridges in exclusive mode and
  248  * returns an authentication token that the external module will need
  249  * to provide during nm_bdg_ctl_{attach, detach}(), netmap_bdg_regops(),
  250  * and nm_bdg_update_private_data() operations.
  251  * The call has succeeded iff the return value != NULL and *return_status == 0.
  252  */
  253 void *
  254 netmap_vale_create(const char *bdg_name, int *return_status)
  255 {
  256         struct nm_bridge *b = NULL;
  257         void *ret = NULL;
  258 
  259         NMG_LOCK();
  260         b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
  261         if (b) {
  262                 *return_status = EEXIST;
  263                 goto unlock_bdg_create;
  264         }
  265 
  266         b = nm_find_bridge(bdg_name, 1 /* create */, &vale_bdg_ops);
  267         if (!b) {
  268                 *return_status = ENOMEM;
  269                 goto unlock_bdg_create;
  270         }
  271 
  272         b->bdg_flags |= NM_BDG_ACTIVE | NM_BDG_EXCLUSIVE;
  273         ret = nm_bdg_get_auth_token(b);
  274         *return_status = 0;
  275 
  276 unlock_bdg_create:
  277         NMG_UNLOCK();
  278         return ret;
  279 }
  280 
  281 /* Allows external modules to destroy a bridge created through
  282  * netmap_vale_create(); the bridge must be empty.
  283  */
  284 int
  285 netmap_vale_destroy(const char *bdg_name, void *auth_token)
  286 {
  287         struct nm_bridge *b = NULL;
  288         int ret = 0;
  289 
  290         NMG_LOCK();
  291         b = nm_find_bridge(bdg_name, 0 /* don't create */, NULL);
  292         if (!b) {
  293                 ret = ENXIO;
  294                 goto unlock_bdg_free;
  295         }
  296 
  297         if (!nm_bdg_valid_auth_token(b, auth_token)) {
  298                 ret = EACCES;
  299                 goto unlock_bdg_free;
  300         }
  301         if (!(b->bdg_flags & NM_BDG_EXCLUSIVE)) {
  302                 ret = EINVAL;
  303                 goto unlock_bdg_free;
  304         }
  305 
  306         b->bdg_flags &= ~(NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE);
  307         ret = netmap_bdg_free(b);
  308         if (ret) {
  309                 b->bdg_flags |= NM_BDG_EXCLUSIVE | NM_BDG_ACTIVE;
  310         }
  311 
  312 unlock_bdg_free:
  313         NMG_UNLOCK();
  314         return ret;
  315 }
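
#if 0
/* Usage sketch for an external module (illustration only; the bridge
 * name "valex:" is an assumption): create an exclusive bridge, keep
 * the token for later control operations, destroy the bridge when
 * done.
 */
int status;
void *token;

token = netmap_vale_create("valex:", &status);
if (token != NULL && status == 0) {
	/* ... attach/detach ports passing 'token' ... */
	(void)netmap_vale_destroy("valex:", token);
}
#endif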
  316 
  317 /* Process NETMAP_REQ_VALE_LIST. */
  318 int
  319 netmap_vale_list(struct nmreq_header *hdr)
  320 {
  321         struct nmreq_vale_list *req =
  322                 (struct nmreq_vale_list *)(uintptr_t)hdr->nr_body;
  323         int namelen = strlen(hdr->nr_name);
  324         struct nm_bridge *b, *bridges;
  325         struct netmap_vp_adapter *vpna;
  326         int error = 0, i, j;
  327         u_int num_bridges;
  328 
  329         netmap_bns_getbridges(&bridges, &num_bridges);
  330 
  331         /* this is used to enumerate bridges and ports */
  332         if (namelen) { /* look up indexes of bridge and port */
  333                 if (strncmp(hdr->nr_name, NM_BDG_NAME,
  334                                         strlen(NM_BDG_NAME))) {
  335                         return EINVAL;
  336                 }
  337                 NMG_LOCK();
  338                 b = nm_find_bridge(hdr->nr_name, 0 /* don't create */, NULL);
  339                 if (!b) {
  340                         NMG_UNLOCK();
  341                         return ENOENT;
  342                 }
  343 
  344                 req->nr_bridge_idx = b - bridges; /* bridge index */
  345                 req->nr_port_idx = NM_BDG_NOPORT;
  346                 for (j = 0; j < b->bdg_active_ports; j++) {
  347                         i = b->bdg_port_index[j];
  348                         vpna = b->bdg_ports[i];
  349                         if (vpna == NULL) {
  350                                 nm_prerr("This should not happen");
  351                                 continue;
  352                         }
  353                         /* the former and the latter identify a
  354                          * virtual port and a NIC, respectively
  355                          */
  356                         if (!strcmp(vpna->up.name, hdr->nr_name)) {
  357                                 req->nr_port_idx = i; /* port index */
  358                                 break;
  359                         }
  360                 }
  361                 NMG_UNLOCK();
  362         } else {
  363                 /* return the first non-empty entry starting from
  364                  * bridge nr_bridge_idx and port nr_port_idx.
  365                  *
  366                  * Users can detect the end of the same bridge by
  367                  * comparing the new and old value of nr_bridge_idx, and can
  368                  * detect the end of all the bridges by error != 0
  369                  */
  370                 i = req->nr_bridge_idx;
  371                 j = req->nr_port_idx;
  372 
  373                 NMG_LOCK();
  374                 for (error = ENOENT; i < vale_max_bridges; i++) {
  375                         b = bridges + i;
  376                         for ( ; j < NM_BDG_MAXPORTS; j++) {
  377                                 if (b->bdg_ports[j] == NULL)
  378                                         continue;
  379                                 vpna = b->bdg_ports[j];
  380                                 /* write back the VALE switch name */
  381                                 strlcpy(hdr->nr_name, vpna->up.name,
  382                                         sizeof(hdr->nr_name));
  383                                 error = 0;
  384                                 goto out;
  385                         }
  386                         j = 0; /* following bridges scan from 0 */
  387                 }
  388         out:
  389                 req->nr_bridge_idx = i;
  390                 req->nr_port_idx = j;
  391                 NMG_UNLOCK();
  392         }
  393 
  394         return error;
  395 }
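
#if 0
/* Userspace enumeration sketch (illustration only, assuming an open
 * /dev/netmap descriptor 'fd'): start with an empty name and advance
 * the indexes to walk all bridges and ports, as described in the
 * comment above; the loop ends when the ioctl fails (ENOENT).
 */
struct nmreq_header hdr;
struct nmreq_vale_list req;

memset(&hdr, 0, sizeof(hdr));
memset(&req, 0, sizeof(req));
hdr.nr_version = NETMAP_API;
hdr.nr_reqtype = NETMAP_REQ_VALE_LIST;
hdr.nr_body = (uintptr_t)&req;
while (ioctl(fd, NIOCCTRL, &hdr) == 0) {
	printf("%s\n", hdr.nr_name);	/* port name written back */
	req.nr_port_idx++;		/* continue after this port */
	hdr.nr_name[0] = '\0';		/* keep enumerating */
}
#endif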
  396 
  397 
  398 /* nm_dtor callback for ephemeral VALE ports */
  399 static void
  400 netmap_vale_vp_dtor(struct netmap_adapter *na)
  401 {
  402         struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter*)na;
  403         struct nm_bridge *b = vpna->na_bdg;
  404 
  405         nm_prdis("%s has %d references", na->name, na->na_refcount);
  406 
  407         if (b) {
  408                 netmap_bdg_detach_common(b, vpna->bdg_port, -1);
  409         }
  410 
  411         if (na->ifp != NULL && !nm_iszombie(na)) {
  412                 NM_DETACH_NA(na->ifp);
  413                 if (vpna->autodelete) {
  414                         nm_prdis("releasing %s", na->ifp->if_xname);
  415                         NMG_UNLOCK();
  416                         nm_os_vi_detach(na->ifp);
  417                         NMG_LOCK();
  418                 }
  419         }
  420 }
  421 
  422 
  423 
  424 /* nm_krings_create callback for VALE ports.
  425  * Calls the standard netmap_krings_create, then adds leases on rx
  426  * rings and bdgfwd on tx rings.
  427  */
  428 static int
  429 netmap_vale_vp_krings_create(struct netmap_adapter *na)
  430 {
  431         u_int tailroom;
  432         int error, i;
  433         uint32_t *leases;
  434         u_int nrx = netmap_real_rings(na, NR_RX);
  435 
  436         /*
  437          * Leases are attached to RX rings on vale ports
  438          */
  439         tailroom = sizeof(uint32_t) * na->num_rx_desc * nrx;
  440 
  441         error = netmap_krings_create(na, tailroom);
  442         if (error)
  443                 return error;
  444 
  445         leases = na->tailroom;
  446 
  447         for (i = 0; i < nrx; i++) { /* Receive rings */
  448                 na->rx_rings[i]->nkr_leases = leases;
  449                 leases += na->num_rx_desc;
  450         }
  451 
  452         error = nm_alloc_bdgfwd(na);
  453         if (error) {
  454                 netmap_krings_delete(na);
  455                 return error;
  456         }
  457 
  458         return 0;
  459 }
  460 
  461 
  462 /* nm_krings_delete callback for VALE ports. */
  463 static void
  464 netmap_vale_vp_krings_delete(struct netmap_adapter *na)
  465 {
  466         nm_free_bdgfwd(na);
  467         netmap_krings_delete(na);
  468 }
  469 
  470 
  471 static int
  472 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n,
  473         struct netmap_vp_adapter *na, u_int ring_nr);
  474 
  475 
  476 /*
  477  * Main dispatch routine for the bridge.
  478  * Grabs packets from a kring and moves them into the ft structure
  479  * associated with the tx (input) port. At most one instance per
  480  * port, invoked on input (ioctl, poll or XXX).
  481  * Returns the next position in the ring.
  482  */
  483 static int
  484 nm_vale_preflush(struct netmap_kring *kring, u_int end)
  485 {
  486         struct netmap_vp_adapter *na =
  487                 (struct netmap_vp_adapter*)kring->na;
  488         struct netmap_ring *ring = kring->ring;
  489         struct nm_bdg_fwd *ft;
  490         u_int ring_nr = kring->ring_id;
  491         u_int j = kring->nr_hwcur, lim = kring->nkr_num_slots - 1;
  492         u_int ft_i = 0; /* start from 0 */
  493         u_int frags = 1; /* how many frags ? */
  494         struct nm_bridge *b = na->na_bdg;
  495 
  496         /* To protect against modifications to the bridge we acquire a
  497          * shared lock, waiting if we can sleep (if the source port is
  498          * attached to a user process) or with a trylock otherwise (NICs).
  499          */
  500         nm_prdis("wait rlock for %d packets", ((j > end ? lim+1 : 0) + end) - j);
  501         if (na->up.na_flags & NAF_BDG_MAYSLEEP)
  502                 BDG_RLOCK(b);
  503         else if (!BDG_RTRYLOCK(b))
  504                 return j;
  505         nm_prdis(5, "rlock acquired for %d packets", ((j > end ? lim+1 : 0) + end) - j);
  506         ft = kring->nkr_ft;
  507 
  508         for (; likely(j != end); j = nm_next(j, lim)) {
  509                 struct netmap_slot *slot = &ring->slot[j];
  510                 char *buf;
  511 
  512                 ft[ft_i].ft_len = slot->len;
  513                 ft[ft_i].ft_flags = slot->flags;
  514                 ft[ft_i].ft_offset = 0;
  515 
  516                 nm_prdis("flags is 0x%x", slot->flags);
  517                 /* we do not use the buf changed flag, but we still need to reset it */
  518                 slot->flags &= ~NS_BUF_CHANGED;
  519 
  520                 /* this slot goes into a list so initialize the link field */
  521                 ft[ft_i].ft_next = NM_FT_NULL;
  522                 buf = ft[ft_i].ft_buf = (slot->flags & NS_INDIRECT) ?
  523                         (void *)(uintptr_t)slot->ptr : NMB_O(kring, slot);
  524                 if (unlikely(buf == NULL ||
  525                      slot->len > NETMAP_BUF_SIZE(&na->up) - nm_get_offset(kring, slot))) {
  526                         nm_prlim(5, "NULL %s buffer pointer from %s slot %d len %d",
  527                                 (slot->flags & NS_INDIRECT) ? "INDIRECT" : "DIRECT",
  528                                 kring->name, j, ft[ft_i].ft_len);
  529                         buf = ft[ft_i].ft_buf = NETMAP_BUF_BASE(&na->up);
  530                         ft[ft_i].ft_len = 0;
  531                         ft[ft_i].ft_flags = 0;
  532                 }
  533                 __builtin_prefetch(buf);
  534                 ++ft_i;
  535                 if (slot->flags & NS_MOREFRAG) {
  536                         frags++;
  537                         continue;
  538                 }
  539                 if (unlikely(netmap_verbose && frags > 1))
  540                         nm_prlim(5, "%d frags at %d", frags, ft_i - frags);
  541                 ft[ft_i - frags].ft_frags = frags;
  542                 frags = 1;
  543                 if (unlikely((int)ft_i >= bridge_batch))
  544                         ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
  545         }
  546         if (frags > 1) {
  547                 /* Here ft_i > 0, ft[ft_i-1].flags has NS_MOREFRAG, and we
  548                  * have to fix the frags count. */
  549                 frags--;
  550                 ft[ft_i - 1].ft_flags &= ~NS_MOREFRAG;
  551                 ft[ft_i - frags].ft_frags = frags;
  552                 nm_prlim(5, "Truncate incomplete fragment at %d (%d frags)", ft_i, frags);
  553         }
  554         if (ft_i)
  555                 ft_i = nm_vale_flush(ft, ft_i, na, ring_nr);
  556         BDG_RUNLOCK(b);
  557         return j;
  558 }
  559 
  560 
  561 /* ----- FreeBSD if_bridge hash function ------- */
  562 
  563 /*
  564  * The following hash function is adapted from "Hash Functions" by Bob Jenkins
  565  * ("Algorithm Alley", Dr. Dobb's Journal, September 1997).
  566  *
  567  * http://www.burtleburtle.net/bob/hash/spooky.html
  568  */
  569 #define mix(a, b, c)                                                    \
  570 do {                                                                    \
  571         a -= b; a -= c; a ^= (c >> 13);                                 \
  572         b -= c; b -= a; b ^= (a << 8);                                  \
  573         c -= a; c -= b; c ^= (b >> 13);                                 \
  574         a -= b; a -= c; a ^= (c >> 12);                                 \
  575         b -= c; b -= a; b ^= (a << 16);                                 \
  576         c -= a; c -= b; c ^= (b >> 5);                                  \
  577         a -= b; a -= c; a ^= (c >> 3);                                  \
  578         b -= c; b -= a; b ^= (a << 10);                                 \
  579         c -= a; c -= b; c ^= (b >> 15);                                 \
  580 } while (/*CONSTCOND*/0)
  581 
  582 
  583 static __inline uint32_t
  584 nm_vale_rthash(const uint8_t *addr)
  585 {
  586         uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = 0; // hash key
  587 
  588         b += addr[5] << 8;
  589         b += addr[4];
  590         a += addr[3] << 24;
  591         a += addr[2] << 16;
  592         a += addr[1] << 8;
  593         a += addr[0];
  594 
  595         mix(a, b, c);
  596 #define BRIDGE_RTHASH_MASK      (NM_BDG_HASH-1)
  597         return (c & BRIDGE_RTHASH_MASK);
  598 }
  599 
  600 #undef mix
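
#if 0
/* Usage sketch (illustration only): netmap_vale_learning() below
 * hashes the 6-byte destination MAC at the start of the Ethernet
 * frame, and the 6-byte source MAC right after it.
 */
uint32_t dh = nm_vale_rthash(frame);		/* frame[0..5]: dst MAC */
uint32_t sh = nm_vale_rthash(frame + 6);	/* frame[6..11]: src MAC */
#endif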
  601 
  602 
  603 /*
  604  * Lookup function for a learning bridge.
  605  * Updates the hash table with the source address,
  606  * then returns the destination port index and the
  607  * ring in *dst_ring (at the moment, ring 0 is always used).
  608  */
  609 uint32_t
  610 netmap_vale_learning(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
  611                 struct netmap_vp_adapter *na, void *private_data)
  612 {
  613         uint8_t *buf = ((uint8_t *)ft->ft_buf) + ft->ft_offset;
  614         u_int buf_len = ft->ft_len - ft->ft_offset;
  615         struct nm_hash_ent *ht = private_data;
  616         uint32_t sh, dh;
  617         u_int dst, mysrc = na->bdg_port;
  618         uint64_t smac, dmac;
  619         uint8_t indbuf[12];
  620 
  621         if (buf_len < 14) {
  622                 return NM_BDG_NOPORT;
  623         }
  624 
  625         if (ft->ft_flags & NS_INDIRECT) {
  626                 if (copyin(buf, indbuf, sizeof(indbuf))) {
  627                         return NM_BDG_NOPORT;
  628                 }
  629                 buf = indbuf;
  630         }
  631 
  632         dmac = le64toh(*(uint64_t *)(buf)) & 0xffffffffffff;
  633         smac = le64toh(*(uint64_t *)(buf + 4));
  634         smac >>= 16;
  635 
  636         /*
  637          * The hash is somewhat expensive; there might be some
  638          * worthwhile optimizations here.
  639          */
  640         if (((buf[6] & 1) == 0) && (na->last_smac != smac)) { /* valid src */
  641                 uint8_t *s = buf+6;
  642                 sh = nm_vale_rthash(s); /* hash of source */
  643                 /* update source port forwarding entry */
  644                 na->last_smac = ht[sh].mac = smac;      /* XXX expire ? */
  645                 ht[sh].ports = mysrc;
  646                 if (netmap_debug & NM_DEBUG_VALE)
  647                     nm_prinf("src %02x:%02x:%02x:%02x:%02x:%02x on port %d",
  648                         s[0], s[1], s[2], s[3], s[4], s[5], mysrc);
  649         }
  650         dst = NM_BDG_BROADCAST;
  651         if ((buf[0] & 1) == 0) { /* unicast */
  652                 dh = nm_vale_rthash(buf); /* hash of dst */
  653                 if (ht[dh].mac == dmac) {       /* found dst */
  654                         dst = ht[dh].ports;
  655                 }
  656         }
  657         return dst;
  658 }
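
#if 0
/* Sketch of an alternative lookup function with the same prototype
 * (illustration only; the destination port number is an assumption):
 * forward everything to a fixed port. A function like this can be
 * installed in place of netmap_vale_learning through
 * netmap_bdg_regops().
 */
static uint32_t
example_fixed_lookup(struct nm_bdg_fwd *ft, uint8_t *dst_ring,
	struct netmap_vp_adapter *na, void *private_data)
{
	*dst_ring = 0;		/* always deliver to ring 0 */
	return 1;		/* fixed destination port */
}
#endif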
  659 
  660 
  661 /*
  662  * Available space in the ring. Only used in VALE code
  663  * and only with is_rx = 1
  664  */
  665 static inline uint32_t
  666 nm_kr_space(struct netmap_kring *k, int is_rx)
  667 {
  668         int space;
  669 
  670         if (is_rx) {
  671                 int busy = k->nkr_hwlease - k->nr_hwcur;
  672                 if (busy < 0)
  673                         busy += k->nkr_num_slots;
  674                 space = k->nkr_num_slots - 1 - busy;
  675         } else {
  676                 /* XXX never used in this branch */
  677                 space = k->nr_hwtail - k->nkr_hwlease;
  678                 if (space < 0)
  679                         space += k->nkr_num_slots;
  680         }
  681 #if 0
  682         // sanity check
  683         if (k->nkr_hwlease >= k->nkr_num_slots ||
  684                 k->nr_hwcur >= k->nkr_num_slots ||
  685                 k->nr_tail >= k->nkr_num_slots ||
  686                 busy < 0 ||
  687                 busy >= k->nkr_num_slots) {
  688                 nm_prerr("invalid kring, cur %d tail %d lease %d lease_idx %d lim %d",
  689                     k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
  690                     k->nkr_lease_idx, k->nkr_num_slots);
  691         }
  692 #endif
  693         return space;
  694 }
  695 
  696 
  697 
  698 
  699 /* make a lease on the kring for n positions; return the
  700  * lease index.
  701  * XXX only used in VALE code and with is_rx = 1
  702  */
  703 static inline uint32_t
  704 nm_kr_lease(struct netmap_kring *k, u_int n, int is_rx)
  705 {
  706         uint32_t lim = k->nkr_num_slots - 1;
  707         uint32_t lease_idx = k->nkr_lease_idx;
  708 
  709         k->nkr_leases[lease_idx] = NR_NOSLOT;
  710         k->nkr_lease_idx = nm_next(lease_idx, lim);
  711 
  712 #ifdef CONFIG_NETMAP_DEBUG
  713         if (n > nm_kr_space(k, is_rx)) {
  714                 nm_prerr("invalid request for %d slots", n);
  715                 panic("x");
  716         }
  717 #endif /* CONFIG NETMAP_DEBUG */
  718         /* XXX verify that there are n slots */
  719         k->nkr_hwlease += n;
  720         if (k->nkr_hwlease > lim)
  721                 k->nkr_hwlease -= lim + 1;
  722 
  723 #ifdef CONFIG_NETMAP_DEBUG
  724         if (k->nkr_hwlease >= k->nkr_num_slots ||
  725                 k->nr_hwcur >= k->nkr_num_slots ||
  726                 k->nr_hwtail >= k->nkr_num_slots ||
  727                 k->nkr_lease_idx >= k->nkr_num_slots) {
  728                 nm_prerr("invalid kring %s, cur %d tail %d lease %d lease_idx %d lim %d",
  729                         k->na->name,
  730                         k->nr_hwcur, k->nr_hwtail, k->nkr_hwlease,
  731                         k->nkr_lease_idx, k->nkr_num_slots);
  732         }
  733 #endif /* CONFIG_NETMAP_DEBUG */
  734         return lease_idx;
  735 }
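
#if 0
/* Lease lifecycle sketch (illustration only), mirroring what
 * nm_vale_flush() does below: reserve slots under the queue lock,
 * copy without holding the lock, then report completion through the
 * lease slot.
 */
mtx_lock(&kring->q_lock);
my_start = kring->nkr_hwlease;		/* first reserved slot */
howmany = nm_kr_space(kring, 1);	/* is_rx = 1 */
lease_idx = nm_kr_lease(kring, howmany, 1);
mtx_unlock(&kring->q_lock);
/* ... fill slots my_start .. my_start + howmany - 1 ... */
kring->nkr_leases[lease_idx] = next_j;	/* done up to next_j */
#endif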
  736 
  737 /*
  738  * This flush routine supports only unicast and broadcast traffic,
  739  * but a large number of ports, and lets us replace the learn and
  740  * dispatch functions.
  741  */
  742 int
  743 nm_vale_flush(struct nm_bdg_fwd *ft, u_int n, struct netmap_vp_adapter *na,
  744                 u_int ring_nr)
  745 {
  746         struct nm_vale_q *dst_ents, *brddst;
  747         uint16_t num_dsts = 0, *dsts;
  748         struct nm_bridge *b = na->na_bdg;
  749         u_int i, me = na->bdg_port;
  750 
  751         /*
  752          * The work area (pointed to by ft) is followed by an array of
  753          * destination queues, dst_ents; there are NM_BDG_MAXRINGS
  754          * queues per port plus one for the broadcast traffic.
  755          * Then we have an array of destination indexes.
  756          */
  757         dst_ents = (struct nm_vale_q *)(ft + NM_BDG_BATCH_MAX);
  758         dsts = (uint16_t *)(dst_ents + NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1);
  759 
  760         /* first pass: find a destination for each packet in the batch */
  761         for (i = 0; likely(i < n); i += ft[i].ft_frags) {
  762                 uint8_t dst_ring = ring_nr; /* default, same ring as origin */
  763                 uint16_t dst_port, d_i;
  764                 struct nm_vale_q *d;
  765                 struct nm_bdg_fwd *start_ft = NULL;
  766 
  767                 nm_prdis("slot %d frags %d", i, ft[i].ft_frags);
  768 
  769                 if (na->up.virt_hdr_len < ft[i].ft_len) {
  770                         ft[i].ft_offset = na->up.virt_hdr_len;
  771                         start_ft = &ft[i];
  772                 } else if (na->up.virt_hdr_len == ft[i].ft_len && ft[i].ft_flags & NS_MOREFRAG) {
  773                         ft[i].ft_offset = ft[i].ft_len;
  774                         start_ft = &ft[i+1];
  775                 } else {
  776                         /* Drop the packet if the virtio-net header is neither contained
  777                          * in the first fragment nor at the very beginning of the second.
  778                          */
  779                         continue;
  780                 }
  781                 dst_port = b->bdg_ops.lookup(start_ft, &dst_ring, na, b->private_data);
  782                 if (netmap_verbose > 255)
  783                         nm_prlim(5, "slot %d port %d -> %d", i, me, dst_port);
  784                 if (dst_port >= NM_BDG_NOPORT)
  785                         continue; /* this packet is to be dropped */
  786                 else if (dst_port == NM_BDG_BROADCAST)
  787                         dst_ring = 0; /* broadcasts always go to ring 0 */
  788                 else if (unlikely(dst_port == me ||
  789                     !b->bdg_ports[dst_port]))
  790                         continue;
  791 
  792                 /* get a position in the scratch pad */
  793                 d_i = dst_port * NM_BDG_MAXRINGS + dst_ring;
  794                 d = dst_ents + d_i;
  795 
  796                 /* append the first fragment to the list */
  797                 if (d->bq_head == NM_FT_NULL) { /* new destination */
  798                         d->bq_head = d->bq_tail = i;
  799                         /* remember this position to be scanned later */
  800                         if (dst_port != NM_BDG_BROADCAST)
  801                                 dsts[num_dsts++] = d_i;
  802                 } else {
  803                         ft[d->bq_tail].ft_next = i;
  804                         d->bq_tail = i;
  805                 }
  806                 d->bq_len += ft[i].ft_frags;
  807         }
  808 
  809         /*
  810          * Broadcast traffic goes to ring 0 on all destinations.
  811          * So we need to add these rings to the list of ports to scan.
  812          */
  813         brddst = dst_ents + NM_BDG_BROADCAST * NM_BDG_MAXRINGS;
  814         if (brddst->bq_head != NM_FT_NULL) {
  815                 u_int j;
  816                 for (j = 0; likely(j < b->bdg_active_ports); j++) {
  817                         uint16_t d_i;
  818                         i = b->bdg_port_index[j];
  819                         if (unlikely(i == me))
  820                                 continue;
  821                         d_i = i * NM_BDG_MAXRINGS;
  822                         if (dst_ents[d_i].bq_head == NM_FT_NULL)
  823                                 dsts[num_dsts++] = d_i;
  824                 }
  825         }
  826 
  827         nm_prdis(5, "pass 1 done %d pkts %d dsts", n, num_dsts);
  828         /* second pass: scan destinations */
  829         for (i = 0; i < num_dsts; i++) {
  830                 struct netmap_vp_adapter *dst_na;
  831                 struct netmap_kring *kring;
  832                 struct netmap_ring *ring;
  833                 u_int dst_nr, lim, j, d_i, next, brd_next;
  834                 u_int needed, howmany;
  835                 int retry = netmap_txsync_retry;
  836                 struct nm_vale_q *d;
  837                 uint32_t my_start = 0, lease_idx = 0;
  838                 int nrings;
  839                 int virt_hdr_mismatch = 0;
  840 
  841                 d_i = dsts[i];
  842                 nm_prdis("second pass %d port %d", i, d_i);
  843                 d = dst_ents + d_i;
  844                 // XXX fix the division
  845                 dst_na = b->bdg_ports[d_i/NM_BDG_MAXRINGS];
  846                 /* protect from the lookup function returning an inactive
  847                  * destination port
  848                  */
  849                 if (unlikely(dst_na == NULL))
  850                         goto cleanup;
  851                 if (dst_na->up.na_flags & NAF_SW_ONLY)
  852                         goto cleanup;
  853                 /*
  854                  * The interface may be in !netmap mode in two cases:
  855                  * - when na is attached but not activated yet;
  856                  * - when na is being deactivated but is still attached.
  857                  */
  858                 if (unlikely(!nm_netmap_on(&dst_na->up))) {
  859                         nm_prdis("not in netmap mode!");
  860                         goto cleanup;
  861                 }
  862 
  863                 /* there is at least one packet, either unicast or broadcast */
  864                 brd_next = brddst->bq_head;
  865                 next = d->bq_head;
  866                 /* we need to reserve this many slots. If fewer are
  867                  * available, some packets will be dropped.
  868                  * Packets may have multiple fragments, so
  869                  * there is a chance that we may not use all of the slots
  870                  * we have claimed; we will need to handle the leftover
  871                  * ones when we regain the lock.
  872                  */
  873                 needed = d->bq_len + brddst->bq_len;
  874 
  875                 if (unlikely(dst_na->up.virt_hdr_len != na->up.virt_hdr_len)) {
  876                         if (netmap_verbose) {
  877                                 nm_prlim(3, "virt_hdr_mismatch, src %d dst %d", na->up.virt_hdr_len,
  878                                                 dst_na->up.virt_hdr_len);
  879                         }
  880                         /* There is a virtio-net header/offloadings mismatch between
  881                          * source and destination. The slower mismatch datapath will
  882                          * be used to cope with all the mismatches.
  883                          */
  884                         virt_hdr_mismatch = 1;
  885                         if (dst_na->mfs < na->mfs) {
  886                                 /* We may need to do segmentation offloadings, and so
  887                                  * we may need a number of destination slots greater
  888                                  * than the number of input slots ('needed').
  889                                  * We look for the smallest integer 'x' which satisfies:
  890                                  *      needed * na->mfs + x * H <= x * dst_na->mfs
  891                                  * where 'H' is the length of the longest header that may
  892                                  * be replicated in the segmentation process (e.g. for
  893                                  * TCPv4 we must account for ethernet header, IP header
  894                                  * and TCPv4 header).
  895                                  */
  896                                 KASSERT(dst_na->mfs > 0, ("vpna->mfs is 0"));
  897                                 needed = (needed * na->mfs) /
  898                                                 (dst_na->mfs - WORST_CASE_GSO_HEADER) + 1;
  899                                 nm_prdis(3, "srcmtu=%u, dstmtu=%u, x=%u", na->mfs, dst_na->mfs, needed);
  900                         }
  901                 }
  902 
  903                 nm_prdis(5, "pass 2 dst %d is %x %s",
  904                         i, d_i, nm_is_bwrap(&dst_na->up) ? "nic/host" : "virtual");
  905                 dst_nr = d_i & (NM_BDG_MAXRINGS-1);
  906                 nrings = dst_na->up.num_rx_rings;
  907                 if (dst_nr >= nrings)
  908                         dst_nr = dst_nr % nrings;
  909                 kring = dst_na->up.rx_rings[dst_nr];
  910                 ring = kring->ring;
  911                 /* the destination ring may not have been opened for RX */
  912                 if (unlikely(ring == NULL || kring->nr_mode != NKR_NETMAP_ON))
  913                         goto cleanup;
  914                 lim = kring->nkr_num_slots - 1;
  915 
  916 retry:
  917 
  918                 if (dst_na->retry && retry) {
  919                         /* try to get some free slot from the previous run */
  920                         kring->nm_notify(kring, NAF_FORCE_RECLAIM);
  921                         /* actually useful only for bwraps, since there
  922                          * the notify will trigger a txsync on the hwna. VALE ports
  923                          * have dst_na->retry == 0
  924                          */
  925                 }
  926                 /* reserve the buffers in the queue and an entry
  927                  * to report completion, and drop the lock.
  928                  * XXX this might become a helper function.
  929                  */
  930                 mtx_lock(&kring->q_lock);
  931                 if (kring->nkr_stopped) {
  932                         mtx_unlock(&kring->q_lock);
  933                         goto cleanup;
  934                 }
  935                 my_start = j = kring->nkr_hwlease;
  936                 howmany = nm_kr_space(kring, 1);
  937                 if (needed < howmany)
  938                         howmany = needed;
  939                 lease_idx = nm_kr_lease(kring, howmany, 1);
  940                 mtx_unlock(&kring->q_lock);
  941 
  942                 /* only retry if we need more than available slots */
  943                 if (retry && needed <= howmany)
  944                         retry = 0;
  945 
  946                 /* copy to the destination queue */
  947                 while (howmany > 0) {
  948                         struct netmap_slot *slot;
  949                         struct nm_bdg_fwd *ft_p, *ft_end;
  950                         u_int cnt;
  951 
  952                         /* find the queue from which we pick the next packet.
  953                          * NM_FT_NULL is always higher than valid indexes
  954                          * so we never dereference it if the other list
  955                          * has packets (and if both are empty we never
  956                          * get here).
  957                          */
  958                         if (next < brd_next) {
  959                                 ft_p = ft + next;
  960                                 next = ft_p->ft_next;
  961                         } else { /* insert broadcast */
  962                                 ft_p = ft + brd_next;
  963                                 brd_next = ft_p->ft_next;
  964                         }
  965                         cnt = ft_p->ft_frags; // cnt > 0
  966                         if (unlikely(cnt > howmany))
  967                             break; /* no more space */
  968                         if (netmap_verbose && cnt > 1)
  969                                 nm_prlim(5, "rx %d frags to %d", cnt, j);
  970                         ft_end = ft_p + cnt;
  971                         if (unlikely(virt_hdr_mismatch)) {
  972                                 bdg_mismatch_datapath(na, dst_na, ft_p, ring, &j, lim, &howmany);
  973                         } else {
  974                                 howmany -= cnt;
  975                                 do {
  976                                         char *dst, *src = ft_p->ft_buf;
  977                                         size_t copy_len = ft_p->ft_len, dst_len = copy_len;
  978                                         uintptr_t src_cb;
  979                                         uint64_t dstoff, dstoff_cb;
  980                                         int src_co, dst_co;
  981                                         const uintptr_t mask = NM_BUF_ALIGN - 1;
  982 
  983                                         slot = &ring->slot[j];
  984                                         dst = NMB(&dst_na->up, slot);
  985                                         dstoff = nm_get_offset(kring, slot);
  986                                         dstoff_cb = dstoff & ~mask;
  987                                         src_cb = ((uintptr_t)src) & ~mask;
  988                                         src_co = ((uintptr_t)src) & mask;
  989                                         dst_co = ((uintptr_t)(dst + dstoff)) & mask;
  990                                         if (dst_co < src_co) {
  991                                                 dstoff_cb += NM_BUF_ALIGN;
  992                                         }
  993                                         dstoff = dstoff_cb + src_co;
  994                                         copy_len += src_co;
  995 
  996                                         nm_prdis("send [%d] %d(%d) bytes at %s:%d",
  997                                                         i, (int)copy_len, (int)dst_len,
  998                                                         NM_IFPNAME(dst_ifp), j);
  999 
 1000                                         if (unlikely(dstoff > NETMAP_BUF_SIZE(&dst_na->up) ||
 1001                                                      dst_len > NETMAP_BUF_SIZE(&dst_na->up) - dstoff)) {
 1002                                                 nm_prlim(5, "dropping packet/fragment of len %zu, dest offset %llu",
 1003                                                                 dst_len, (unsigned long long)dstoff);
 1004                                                 copy_len = dst_len = 0;
 1005                                                 dstoff = nm_get_offset(kring, slot);
 1006                                         }
 1007 
 1008                                         if (ft_p->ft_flags & NS_INDIRECT) {
 1009                                                 if (copyin(src, dst, copy_len)) {
 1010                                                         // invalid user pointer, pretend len is 0
 1011                                                         dst_len = 0;
 1012                                                 }
 1013                                         } else {
 1014                                                 //memcpy(dst, src, copy_len);
 1015                                                 pkt_copy((char *)src_cb, dst + dstoff_cb, (int)copy_len);
 1016                                         }
 1017                                         slot->len = dst_len;
 1018                                         slot->flags = (cnt << 8) | NS_MOREFRAG;
 1019                                         nm_write_offset(kring, slot, dstoff);
 1020                                         j = nm_next(j, lim);
 1021                                         needed--;
 1022                                         ft_p++;
 1023                                 } while (ft_p != ft_end);
 1024                                 slot->flags = (cnt << 8); /* clear NS_MOREFRAG on the last entry */
 1025                         }
 1026                         /* are we done ? */
 1027                         if (next == NM_FT_NULL && brd_next == NM_FT_NULL)
 1028                                 break;
 1029                 }
 1030                 {
 1031                     /* current position */
 1032                     uint32_t *p = kring->nkr_leases; /* shorthand */
 1033                     uint32_t update_pos;
 1034                     int still_locked = 1;
 1035 
 1036                     mtx_lock(&kring->q_lock);
 1037                     if (unlikely(howmany > 0)) {
 1038                         /* we have not used all the buffers. If I am the
 1039                          * last one I can recover the slots, otherwise I must
 1040                          * fill them with len 0 to mark empty packets.
 1041                          */
 1042                         nm_prdis("leftover %d bufs", howmany);
 1043                         if (nm_next(lease_idx, lim) == kring->nkr_lease_idx) {
 1044                             /* yes i am the last one */
 1045                             nm_prdis("roll back nkr_hwlease to %d", j);
 1046                             kring->nkr_hwlease = j;
 1047                         } else {
 1048                             while (howmany-- > 0) {
 1049                                 ring->slot[j].len = 0;
 1050                                 ring->slot[j].flags = 0;
 1051                                 j = nm_next(j, lim);
 1052                             }
 1053                         }
 1054                     }
 1055                     p[lease_idx] = j; /* report I am done */
 1056 
 1057                     update_pos = kring->nr_hwtail;
 1058 
 1059                     if (my_start == update_pos) {
 1060                         /* all slots before my_start have been reported,
 1061                          * so scan subsequent leases to see if other ranges
 1062                          * have been completed, and do a selwakeup or txsync.
 1063                          */
 1064                         while (lease_idx != kring->nkr_lease_idx &&
 1065                                 p[lease_idx] != NR_NOSLOT) {
 1066                             j = p[lease_idx];
 1067                             p[lease_idx] = NR_NOSLOT;
 1068                             lease_idx = nm_next(lease_idx, lim);
 1069                         }
 1070                         /* j is the new 'write' position. j != my_start
 1071                          * means there are new buffers to report
 1072                          */
 1073                         if (likely(j != my_start)) {
 1074                                 kring->nr_hwtail = j;
 1075                                 still_locked = 0;
 1076                                 mtx_unlock(&kring->q_lock);
 1077                                 kring->nm_notify(kring, 0);
 1078                                 /* this is netmap_notify for VALE ports and
 1079                                  * netmap_bwrap_notify for bwrap. The latter will
 1080                                  * trigger a txsync on the underlying hwna
 1081                                  */
 1082                                 if (dst_na->retry && retry--) {
 1083                                         /* XXX this is going to call nm_notify again.
 1084                                          * Only useful for bwrap in virtual machines
 1085                                          */
 1086                                         goto retry;
 1087                                 }
 1088                         }
 1089                     }
 1090                     if (still_locked)
 1091                         mtx_unlock(&kring->q_lock);
 1092                 }
 1093 cleanup:
 1094                 d->bq_head = d->bq_tail = NM_FT_NULL; /* cleanup */
 1095                 d->bq_len = 0;
 1096         }
 1097         brddst->bq_head = brddst->bq_tail = NM_FT_NULL; /* cleanup */
 1098         brddst->bq_len = 0;
 1099         return 0;
 1100 }
 1101 
 1102 /* nm_txsync callback for VALE ports */
 1103 static int
 1104 netmap_vale_vp_txsync(struct netmap_kring *kring, int flags)
 1105 {
 1106         struct netmap_vp_adapter *na =
 1107                 (struct netmap_vp_adapter *)kring->na;
 1108         u_int done;
 1109         u_int const lim = kring->nkr_num_slots - 1;
 1110         u_int const head = kring->rhead;
 1111 
 1112         if (bridge_batch <= 0) { /* testing only */
 1113                 done = head; // used all
 1114                 goto done;
 1115         }
 1116         if (!na->na_bdg) {
 1117                 done = head;
 1118                 goto done;
 1119         }
 1120         if (bridge_batch > NM_BDG_BATCH)
 1121                 bridge_batch = NM_BDG_BATCH;
 1122 
 1123         done = nm_vale_preflush(kring, head);
 1124 done:
 1125         if (done != head)
 1126                 nm_prerr("early break at %d/ %d, tail %d", done, head, kring->nr_hwtail);
 1127         /*
 1128          * packets between 'done' and 'cur' are left unsent.
 1129          */
 1130         kring->nr_hwcur = done;
 1131         kring->nr_hwtail = nm_prev(done, lim);
 1132         if (netmap_debug & NM_DEBUG_TXSYNC)
 1133                 nm_prinf("%s ring %d flags %d", na->up.name, kring->ring_id, flags);
 1134         return 0;
 1135 }
 1136 
 1137 
 1138 /* create a netmap_vp_adapter that describes a VALE port.
 1139  * Only persistent VALE ports have a non-null ifp.
 1140  */
 1141 static int
 1142 netmap_vale_vp_create(struct nmreq_header *hdr, struct ifnet *ifp,
 1143                 struct netmap_mem_d *nmd, struct netmap_vp_adapter **ret)
 1144 {
 1145         struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
 1146         struct netmap_vp_adapter *vpna;
 1147         struct netmap_adapter *na;
 1148         int error = 0;
 1149         u_int npipes = 0;
 1150         u_int extrabufs = 0;
 1151 
 1152         if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
 1153                 return EINVAL;
 1154         }
 1155 
 1156         vpna = nm_os_malloc(sizeof(*vpna));
 1157         if (vpna == NULL)
 1158                 return ENOMEM;
 1159 
 1160         na = &vpna->up;
 1161 
 1162         na->ifp = ifp;
 1163         strlcpy(na->name, hdr->nr_name, sizeof(na->name));
 1164 
 1165         /* bound checking */
 1166         na->num_tx_rings = req->nr_tx_rings;
 1167         nm_bound_var(&na->num_tx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
 1168         req->nr_tx_rings = na->num_tx_rings; /* write back */
 1169         na->num_rx_rings = req->nr_rx_rings;
 1170         nm_bound_var(&na->num_rx_rings, 1, 1, NM_BDG_MAXRINGS, NULL);
 1171         req->nr_rx_rings = na->num_rx_rings; /* write back */
 1172         nm_bound_var(&req->nr_tx_slots, NM_BRIDGE_RINGSIZE,
 1173                         1, NM_BDG_MAXSLOTS, NULL);
 1174         na->num_tx_desc = req->nr_tx_slots;
 1175         nm_bound_var(&req->nr_rx_slots, NM_BRIDGE_RINGSIZE,
 1176                         1, NM_BDG_MAXSLOTS, NULL);
 1177         /* validate number of pipes. We want at least 1,
 1178          * but can probably do with some more.
 1179          * So let's use 2 as the default (when 0 is supplied)
 1180          */
 1181         nm_bound_var(&npipes, 2, 1, NM_MAXPIPES, NULL);
 1182         /* validate extra bufs */
 1183         extrabufs = req->nr_extra_bufs;
 1184         nm_bound_var(&extrabufs, 0, 0,
 1185                         128*NM_BDG_MAXSLOTS, NULL);
 1186         req->nr_extra_bufs = extrabufs; /* write back */
 1187         na->num_rx_desc = req->nr_rx_slots;
 1188         /* Set the mfs to a default value, as it is needed on the VALE
 1189          * mismatch datapath. XXX We should set it according to the MTU
 1190          * known to the kernel. */
 1191         vpna->mfs = NM_BDG_MFS_DEFAULT;
 1192         vpna->last_smac = ~0llu;
 1193         /*if (vpna->mfs > netmap_buf_size)  TODO netmap_buf_size is zero??
 1194                 vpna->mfs = netmap_buf_size; */
 1195         if (netmap_verbose)
 1196                 nm_prinf("max frame size %u", vpna->mfs);
 1197 
 1198         na->na_flags |= (NAF_BDG_MAYSLEEP | NAF_OFFSETS);
 1199         /* persistent VALE ports look like hw devices
 1200          * with a native netmap adapter
 1201          */
 1202         if (ifp)
 1203                 na->na_flags |= NAF_NATIVE;
 1204         na->nm_txsync = netmap_vale_vp_txsync;
 1205         na->nm_rxsync = netmap_vp_rxsync; /* use the one provided by bdg */
 1206         na->nm_register = netmap_vp_reg;  /* use the one provided by bdg */
 1207         na->nm_krings_create = netmap_vale_vp_krings_create;
 1208         na->nm_krings_delete = netmap_vale_vp_krings_delete;
 1209         na->nm_dtor = netmap_vale_vp_dtor;
 1210         nm_prdis("nr_mem_id %d", req->nr_mem_id);
 1211         na->nm_mem = nmd ?
 1212                 netmap_mem_get(nmd):
 1213                 netmap_mem_private_new(
 1214                         na->num_tx_rings, na->num_tx_desc,
 1215                         na->num_rx_rings, na->num_rx_desc,
 1216                         req->nr_extra_bufs, npipes, &error);
 1217         if (na->nm_mem == NULL)
 1218                 goto err;
 1219         na->nm_bdg_attach = netmap_vale_vp_bdg_attach;
 1220         /* other nmd fields are set in the common routine */
 1221         error = netmap_attach_common(na);
 1222         if (error)
 1223                 goto err;
 1224         *ret = vpna;
 1225         return 0;
 1226 
 1227 err:
 1228         if (na->nm_mem != NULL)
 1229                 netmap_mem_put(na->nm_mem);
 1230         nm_os_free(vpna);
 1231         return error;
 1232 }
 1233 
 1234 /* nm_bdg_attach callback for VALE ports
 1235  * The na_vp port is this same netmap_adapter. There is no host port.
 1236  */
 1237 static int
 1238 netmap_vale_vp_bdg_attach(const char *name, struct netmap_adapter *na,
 1239                 struct nm_bridge *b)
 1240 {
 1241         struct netmap_vp_adapter *vpna = (struct netmap_vp_adapter *)na;
 1242 
 1243         if ((b->bdg_flags & NM_BDG_NEED_BWRAP) || vpna->na_bdg) {
 1244                 return NM_NEED_BWRAP;
 1245         }
 1246         na->na_vp = vpna;
 1247         strlcpy(na->name, name, sizeof(na->name));
 1248         na->na_hostvp = NULL;
 1249         return 0;
 1250 }
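
/*
 * Editorial note on the function above: a bwrap is requested either when
 * the bridge itself insists on wrappers (NM_BDG_NEED_BWRAP) or when this
 * port is already attached to some bridge (vpna->na_bdg != NULL),
 * presumably because a vp adapter can be a direct member of at most one
 * switch at a time.
 */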
 1251 
 1252 static int
 1253 netmap_vale_bwrap_krings_create(struct netmap_adapter *na)
 1254 {
 1255         int error;
 1256 
 1257         /* impersonate a netmap_vp_adapter */
 1258         error = netmap_vale_vp_krings_create(na);
 1259         if (error)
 1260                 return error;
 1261         error = netmap_bwrap_krings_create_common(na);
 1262         if (error) {
 1263                 netmap_vale_vp_krings_delete(na);
 1264         }
 1265         return error;
 1266 }
 1267 
 1268 static void
 1269 netmap_vale_bwrap_krings_delete(struct netmap_adapter *na)
 1270 {
 1271         netmap_bwrap_krings_delete_common(na);
 1272         netmap_vale_vp_krings_delete(na);
 1273 }
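
/*
 * Editorial note: deletion mirrors creation in reverse order: the
 * bwrap-specific krings are torn down first, then the vp-level krings
 * that netmap_vale_vp_krings_create() set up.
 */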
 1274 
 1275 static int
 1276 netmap_vale_bwrap_attach(const char *nr_name, struct netmap_adapter *hwna)
 1277 {
 1278         struct netmap_bwrap_adapter *bna;
 1279         struct netmap_adapter *na = NULL;
 1280         struct netmap_adapter *hostna = NULL;
 1281         int error;
 1282 
 1283         bna = nm_os_malloc(sizeof(*bna));
 1284         if (bna == NULL) {
 1285                 return ENOMEM;
 1286         }
 1287         na = &bna->up.up;
 1288         strlcpy(na->name, nr_name, sizeof(na->name));
 1289         na->nm_register = netmap_bwrap_reg;
 1290         na->nm_txsync = netmap_vale_vp_txsync;
 1291         /* na->nm_rxsync = netmap_bwrap_rxsync; */
 1292         na->nm_krings_create = netmap_vale_bwrap_krings_create;
 1293         na->nm_krings_delete = netmap_vale_bwrap_krings_delete;
 1294         na->nm_notify = netmap_bwrap_notify;
 1295         bna->nm_intr_notify = netmap_bwrap_intr_notify;
 1296         bna->up.retry = 1; /* XXX maybe this should depend on the hwna */
 1297         /* Set the mfs, needed on the VALE mismatch datapath. */
 1298         bna->up.mfs = NM_BDG_MFS_DEFAULT;
 1299 
 1300         if (hwna->na_flags & NAF_HOST_RINGS) {
 1301                 hostna = &bna->host.up;
 1302                 hostna->nm_notify = netmap_bwrap_notify;
 1303                 bna->host.mfs = NM_BDG_MFS_DEFAULT;
 1304         }
 1305 
 1306         error = netmap_bwrap_attach_common(na, hwna);
 1307         if (error) {
 1308                 nm_os_free(bna);
 1309         }
 1310         return error;
 1311 }
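
/*
 * Illustrative sketch (editorial addition, not part of this file): a
 * hardware NIC reaches netmap_vale_bwrap_attach() when it is attached to
 * a VALE switch. Assuming the nmreq API from net/netmap.h (the struct
 * names and constants below come from that header; "vale0" and "em0" are
 * placeholder names), userspace could trigger this roughly as follows:
 *
 *	int fd = open("/dev/netmap", O_RDWR);
 *	struct nmreq_header hdr;
 *	struct nmreq_vale_attach vreq;
 *
 *	memset(&hdr, 0, sizeof(hdr));
 *	memset(&vreq, 0, sizeof(vreq));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_ATTACH;
 *	strlcpy(hdr.nr_name, "vale0:em0", sizeof(hdr.nr_name));
 *	vreq.reg.nr_mode = NR_REG_ALL_NIC;
 *	hdr.nr_body = (uintptr_t)&vreq;
 *	ioctl(fd, NIOCCTRL, &hdr);
 */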
 1312 
 1313 int
 1314 netmap_get_vale_na(struct nmreq_header *hdr, struct netmap_adapter **na,
 1315                 struct netmap_mem_d *nmd, int create)
 1316 {
 1317         return netmap_get_bdg_na(hdr, na, nmd, create, &vale_bdg_ops);
 1318 }
 1319 
 1320 
 1321 /* creates a persistent VALE port */
 1322 int
 1323 nm_vi_create(struct nmreq_header *hdr)
 1324 {
 1325         struct nmreq_vale_newif *req =
 1326                 (struct nmreq_vale_newif *)(uintptr_t)hdr->nr_body;
 1327         int error = 0;
 1328         /* Build a nmreq_register out of the nmreq_vale_newif,
 1329          * so that we can call netmap_vi_create(). */
 1330         struct nmreq_register regreq;
 1331         bzero(&regreq, sizeof(regreq));
 1332         regreq.nr_tx_slots = req->nr_tx_slots;
 1333         regreq.nr_rx_slots = req->nr_rx_slots;
 1334         regreq.nr_tx_rings = req->nr_tx_rings;
 1335         regreq.nr_rx_rings = req->nr_rx_rings;
 1336         regreq.nr_mem_id = req->nr_mem_id;
 1337         hdr->nr_reqtype = NETMAP_REQ_REGISTER;
 1338         hdr->nr_body = (uintptr_t)&regreq;
 1339         error = netmap_vi_create(hdr, 0 /* no autodelete */);
 1340         hdr->nr_reqtype = NETMAP_REQ_VALE_NEWIF;
 1341         hdr->nr_body = (uintptr_t)req;
 1342         /* Write back to the original struct. */
 1343         req->nr_tx_slots = regreq.nr_tx_slots;
 1344         req->nr_rx_slots = regreq.nr_rx_slots;
 1345         req->nr_tx_rings = regreq.nr_tx_rings;
 1346         req->nr_rx_rings = regreq.nr_rx_rings;
 1347         req->nr_mem_id = regreq.nr_mem_id;
 1348         return error;
 1349 }
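
/*
 * Illustrative sketch (editorial addition, not part of this file):
 * creating a persistent VALE port from userspace with the request served
 * by nm_vi_create() above. This assumes the nmreq API from net/netmap.h;
 * "myport" is a placeholder name and, per the check in netmap_vi_create(),
 * must not carry the "vale" prefix. Zeroed fields select the defaults
 * bounded in netmap_vale_vp_create():
 *
 *	int fd = open("/dev/netmap", O_RDWR);
 *	struct nmreq_header hdr;
 *	struct nmreq_vale_newif nif;
 *
 *	memset(&hdr, 0, sizeof(hdr));
 *	memset(&nif, 0, sizeof(nif));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_NEWIF;
 *	strlcpy(hdr.nr_name, "myport", sizeof(hdr.nr_name));
 *	hdr.nr_body = (uintptr_t)&nif;
 *	ioctl(fd, NIOCCTRL, &hdr);
 *
 * On success the port can later be attached to a switch as "vale0:myport".
 */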
 1350 
 1351 /* remove a persistent VALE port from the system */
 1352 int
 1353 nm_vi_destroy(const char *name)
 1354 {
 1355         struct ifnet *ifp;
 1356         struct netmap_vp_adapter *vpna;
 1357         int error;
 1358 
 1359         ifp = ifunit_ref(name);
 1360         if (!ifp)
 1361                 return ENXIO;
 1362         NMG_LOCK();
 1363         /* make sure this is actually a VALE port */
 1364         if (!NM_NA_VALID(ifp) || NA(ifp)->nm_register != netmap_vp_reg) {
 1365                 error = EINVAL;
 1366                 goto err;
 1367         }
 1368 
 1369         vpna = (struct netmap_vp_adapter *)NA(ifp);
 1370 
 1371         /* we can only destroy ports created via NETMAP_REQ_VALE_NEWIF */
 1372         if (vpna->autodelete) {
 1373                 error = EINVAL;
 1374                 goto err;
 1375         }
 1376 
 1377         /* also make sure that nobody is using the interface */
 1378         if (NETMAP_OWNED_BY_ANY(&vpna->up) ||
 1379             vpna->up.na_refcount > 1 /* any ref besides the one in nm_vi_create()? */) {
 1380                 error = EBUSY;
 1381                 goto err;
 1382         }
 1383 
 1384         NMG_UNLOCK();
 1385 
 1386         if (netmap_verbose)
 1387                 nm_prinf("destroying a persistent vale interface %s", ifp->if_xname);
 1388         /* Linux requires that all references be released
 1389          * before the interface is unregistered.
 1390          */
 1391         netmap_detach(ifp);
 1392         if_rele(ifp);
 1393         nm_os_vi_detach(ifp);
 1394         return 0;
 1395 
 1396 err:
 1397         NMG_UNLOCK();
 1398         if_rele(ifp);
 1399         return error;
 1400 }
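
/*
 * Illustrative sketch (editorial addition): the userspace counterpart of
 * nm_vi_destroy() is a NETMAP_REQ_VALE_DELIF request, which carries no
 * body, only the port name (again assuming the nmreq API from
 * net/netmap.h):
 *
 *	int fd = open("/dev/netmap", O_RDWR);
 *	struct nmreq_header hdr;
 *
 *	memset(&hdr, 0, sizeof(hdr));
 *	hdr.nr_version = NETMAP_API;
 *	hdr.nr_reqtype = NETMAP_REQ_VALE_DELIF;
 *	strlcpy(hdr.nr_name, "myport", sizeof(hdr.nr_name));
 *	ioctl(fd, NIOCCTRL, &hdr);
 *
 * The request fails with EBUSY while any process still holds the port
 * open, as enforced above.
 */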
 1401 
 1402 static int
 1403 nm_update_info(struct nmreq_register *req, struct netmap_adapter *na)
 1404 {
 1405         req->nr_rx_rings = na->num_rx_rings;
 1406         req->nr_tx_rings = na->num_tx_rings;
 1407         req->nr_rx_slots = na->num_rx_desc;
 1408         req->nr_tx_slots = na->num_tx_desc;
 1409         return netmap_mem_get_info(na->nm_mem, &req->nr_memsize, NULL,
 1410                                         &req->nr_mem_id);
 1411 }
 1412 
 1413 
 1414 /*
 1415  * Create a virtual interface registered to the system.
 1416  * The interface will be attached to a bridge later.
 1417  */
 1418 int
 1419 netmap_vi_create(struct nmreq_header *hdr, int autodelete)
 1420 {
 1421         struct nmreq_register *req = (struct nmreq_register *)(uintptr_t)hdr->nr_body;
 1422         struct ifnet *ifp;
 1423         struct netmap_vp_adapter *vpna;
 1424         struct netmap_mem_d *nmd = NULL;
 1425         int error;
 1426 
 1427         if (hdr->nr_reqtype != NETMAP_REQ_REGISTER) {
 1428                 return EINVAL;
 1429         }
 1430 
 1431         /* the name must not include the VALE prefix */
 1432         if (!strncmp(hdr->nr_name, NM_BDG_NAME, strlen(NM_BDG_NAME)))
 1433                 return EINVAL;
 1434         if (strlen(hdr->nr_name) >= IFNAMSIZ) {
 1435                 return EINVAL;
 1436         }
 1437         ifp = ifunit_ref(hdr->nr_name);
 1438         if (ifp) { /* already exists, cannot create a new one */
 1439                 error = EEXIST;
 1440                 NMG_LOCK();
 1441                 if (NM_NA_VALID(ifp)) {
 1442                         int update_err = nm_update_info(req, NA(ifp));
 1443                         if (update_err)
 1444                                 error = update_err;
 1445                 }
 1446                 NMG_UNLOCK();
 1447                 if_rele(ifp);
 1448                 return error;
 1449         }
 1450         error = nm_os_vi_persist(hdr->nr_name, &ifp);
 1451         if (error)
 1452                 return error;
 1453 
 1454         NMG_LOCK();
 1455         if (req->nr_mem_id) {
 1456                 nmd = netmap_mem_find(req->nr_mem_id);
 1457                 if (nmd == NULL) {
 1458                         error = EINVAL;
 1459                         goto err_1;
 1460                 }
 1461         }
 1462         /* netmap_vale_vp_create creates a struct netmap_vp_adapter */
 1463         error = netmap_vale_vp_create(hdr, ifp, nmd, &vpna);
 1464         if (error) {
 1465                 if (netmap_debug & NM_DEBUG_VALE)
 1466                         nm_prerr("error %d", error);
 1467                 goto err_1;
 1468         }
 1469         /* persist-specific routines */
 1470         vpna->up.nm_bdg_ctl = netmap_vp_bdg_ctl;
 1471         if (!autodelete) {
 1472                 netmap_adapter_get(&vpna->up);
 1473         } else {
 1474                 vpna->autodelete = 1;
 1475         }
 1476         NM_ATTACH_NA(ifp, &vpna->up);
 1477         /* return the updated info */
 1478         error = nm_update_info(req, &vpna->up);
 1479         if (error) {
 1480                 goto err_2;
 1481         }
 1482         nm_prdis("returning nr_mem_id %d", req->nr_mem_id);
 1483         if (nmd)
 1484                 netmap_mem_put(nmd);
 1485         NMG_UNLOCK();
 1486         nm_prdis("created %s", ifp->if_xname);
 1487         return 0;
 1488 
 1489 err_2:
 1490         netmap_detach(ifp);
 1491 err_1:
 1492         if (nmd)
 1493                 netmap_mem_put(nmd);
 1494         NMG_UNLOCK();
 1495         nm_os_vi_detach(ifp);
 1496 
 1497         return error;
 1498 }
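
/*
 * Editorial note on the error paths above: err_2 is reached after
 * NM_ATTACH_NA(), so netmap_detach() is needed to undo the attachment;
 * err_1 only drops the reference on the nr_mem_id allocator (if one was
 * looked up) and releases the persistent ifnet via nm_os_vi_detach().
 */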
 1499 
 1500 #endif /* WITH_VALE */
