The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/net/rss_config.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2010-2011 Juniper Networks, Inc.
    3  * All rights reserved.
    4  *
    5  * This software was developed by Robert N. M. Watson under contract
    6  * to Juniper Networks, Inc.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  */
   29 
   30 #include <sys/cdefs.h>
   31 
   32 __FBSDID("$FreeBSD$");
   33 
   34 #include "opt_inet6.h"
   35 
   36 #include <sys/param.h>
   37 #include <sys/mbuf.h>
   38 #include <sys/socket.h>
   39 #include <sys/priv.h>
   40 #include <sys/kernel.h>
   41 #include <sys/smp.h>
   42 #include <sys/sysctl.h>
   43 #include <sys/sbuf.h>
   44 
   45 #include <net/if.h>
   46 #include <net/if_var.h>
   47 #include <net/netisr.h>
   48 #include <net/rss_config.h>
   49 #include <net/toeplitz.h>
   50 
   51 /*-
   52  * Operating system parts of receive-side scaling (RSS), which allows
   53  * network cards to direct flows to particular receive queues based on hashes
   54  * of header tuples.  This implementation aligns RSS buckets with connection
   55  * groups at the TCP/IP layer, so each bucket is associated with exactly one
   56  * group.  As a result, the group lookup structures (and lock) should have an
   57  * effective affinity with exactly one CPU.
   58  *
   59  * Network device drivers needing to configure RSS will query this framework
   60  * for parameters, such as the current RSS key, hashing policies, number of
   61  * bits, and indirection table mapping hashes to buckets and CPUs.  They may
   62  * provide their own supplementary information, such as queue<->CPU bindings.
   63  * It is the responsibility of the network device driver to inject packets
   64  * into the stack on as close to the right CPU as possible, if playing by RSS
   65  * rules.
   66  *
   67  * TODO:
   68  *
   69  * - Synchronization for rss_key and other future-configurable parameters.
   70  * - Event handler drivers can register to pick up RSS configuration changes.
   71  * - Should we allow rss_basecpu to be configured?
   72  * - Randomize key on boot.
   73  * - IPv6 support.
   74  * - Statistics on how often there's a misalignment between hardware
   75  *   placement and pcbgroup expectations.
   76  */
   77 
   78 SYSCTL_DECL(_net_inet);
   79 SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
   80     "Receive-side steering");
   81 
/*
 * Toeplitz is the only required hash function in the RSS spec, so use it by
 * default.
 *
 * Exported read-only as net.inet.rss.hashalgo and settable as a loader
 * tunable (CTLFLAG_RDTUN).  rss_init() coerces any value other than
 * RSS_HASH_TOEPLITZ or RSS_HASH_NAIVE back to RSS_HASH_TOEPLITZ.
 */
static u_int    rss_hashalgo = RSS_HASH_TOEPLITZ;
SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0,
    "RSS hash algorithm");
   89 
/*
 * Size of the indirection table; at most 128 entries per the RSS spec.  We
 * size it to at least 2 times the number of CPUs by default to allow useful
 * rebalancing.  If not set explicitly with a loader tunable, we tune based
 * on the number of CPUs present.
 *
 * A value of 0 means "not set": on multi-CPU systems rss_init() then derives
 * it as fls(rss_ncpus - 1) + 1, and coerces anything above RSS_MAXBITS to
 * RSS_MAXBITS.  On a single-CPU system rss_init() forces it back to 0.
 *
 * XXXRW: buckets might be better to use for the tunable than bits.
 */
static u_int    rss_bits;
SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0,
    "RSS bits");

/*
 * Bit mask that reduces a hash (or indirection index) to a bucket number;
 * rss_init() sets it to rss_buckets - 1.
 */
static u_int    rss_mask;
SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
    "RSS mask");

/*
 * Read-only export of the compile-time RSS_MAXBITS limit.  The object is
 * const, hence the __DECONST() used to hand its address to SYSCTL_INT().
 */
static const u_int      rss_maxbits = RSS_MAXBITS;
SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
    __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
  109 
/*
 * RSS's own count of the number of CPUs it could be using for processing.
 * Bounded to 64 by RSS constants.
 *
 * rss_init() computes it by counting every CPU id in 0..mp_maxid for which
 * CPU_ABSENT() is false, and then caps the result at RSS_MAXCPUS.
 */
static u_int    rss_ncpus;
SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
    "Number of CPUs available to RSS");

/*
 * Upper bound on rss_ncpus: half of the largest possible bucket count
 * (1 << RSS_MAXBITS).  Exported read-only; const, hence the __DECONST().
 */
#define RSS_MAXCPUS     (1 << (RSS_MAXBITS - 1))
static const u_int      rss_maxcpus = RSS_MAXCPUS;
SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
    __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");
  122 
/*
 * Variable exists just for reporting rss_bits in a user-friendly way.
 *
 * rss_init() sets it to (1 << rss_bits) on multi-CPU systems and to 1
 * otherwise; it is also the number of rss_table[] entries that get filled.
 */
static u_int    rss_buckets;
SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
    "RSS buckets");
  129 
/*
 * Base CPU number; devices will add this to all CPU numbers returned by the
 * RSS indirection table.  Currently unmodifiable in FreeBSD.
 *
 * The object is const and has no initializer, so it always holds the zero
 * value of static storage; it is only exported through the sysctl below.
 */
static const u_int      rss_basecpu;
SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
    __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
  137 
/*
 * Print verbose debugging messages.
 * 0 - disable
 * non-zero - enable
 *
 * Not static, so it is visible to other translation units.  Writable at
 * run time and settable as a loader tunable (CTLFLAG_RWTUN).
 * NOTE(review): presumably this is what gates the RSS_DEBUG() messages; the
 * macro is defined outside this file -- confirm in net/rss_config.h.
 */
int     rss_debug = 0;
SYSCTL_INT(_net_inet_rss, OID_AUTO, debug, CTLFLAG_RWTUN, &rss_debug, 0,
    "RSS debug level");
  146 
/*
 * RSS secret key, intended to prevent attacks on load-balancing.  Its
 * effectiveness may be limited by algorithm choice and available entropy
 * during the boot.
 *
 * XXXRW: And that we don't randomize it yet!
 *
 * This is the default Microsoft RSS specification key which is also
 * the Chelsio T5 firmware default key.
 *
 * The initializer supplies 40 bytes; RSS_KEYSIZE is defined outside this
 * file.  Nothing in this file writes the key: it is read by rss_hash(),
 * copied out by rss_getkey() and exported by the net.inet.rss.key handler.
 */
static uint8_t rss_key[RSS_KEYSIZE] = {
        0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
        0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
        0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
        0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
        0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
};
  164 
/*
 * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
 * Drivers may supplement this table with a separate CPU<->queue table when
 * programming devices.
 *
 * The first rss_buckets entries are filled round-robin by rss_init() and
 * read back through rss_getcpu().
 * NOTE(review): rte_cpu is only 8 bits wide, so a CPU id above 255 would be
 * truncated when stored -- confirm this cannot occur on supported configs.
 */
struct rss_table_entry {
        uint8_t         rte_cpu;        /* CPU affinity of bucket. */
};
static struct rss_table_entry   rss_table[RSS_TABLE_MAXLEN];
  174 
  175 static void
  176 rss_init(__unused void *arg)
  177 {
  178         u_int i;
  179         u_int cpuid;
  180 
  181         /*
  182          * Validate tunables, coerce to sensible values.
  183          */
  184         switch (rss_hashalgo) {
  185         case RSS_HASH_TOEPLITZ:
  186         case RSS_HASH_NAIVE:
  187                 break;
  188 
  189         default:
  190                 RSS_DEBUG("invalid RSS hashalgo %u, coercing to %u\n",
  191                     rss_hashalgo, RSS_HASH_TOEPLITZ);
  192                 rss_hashalgo = RSS_HASH_TOEPLITZ;
  193         }
  194 
  195         /*
  196          * Count available CPUs.
  197          *
  198          * XXXRW: Note incorrect assumptions regarding contiguity of this set
  199          * elsewhere.
  200          */
  201         rss_ncpus = 0;
  202         for (i = 0; i <= mp_maxid; i++) {
  203                 if (CPU_ABSENT(i))
  204                         continue;
  205                 rss_ncpus++;
  206         }
  207         if (rss_ncpus > RSS_MAXCPUS)
  208                 rss_ncpus = RSS_MAXCPUS;
  209 
  210         /*
  211          * Tune RSS table entries to be no less than 2x the number of CPUs
  212          * -- unless we're running uniprocessor, in which case there's not
  213          * much point in having buckets to rearrange for load-balancing!
  214          */
  215         if (rss_ncpus > 1) {
  216                 if (rss_bits == 0)
  217                         rss_bits = fls(rss_ncpus - 1) + 1;
  218 
  219                 /*
  220                  * Microsoft limits RSS table entries to 128, so apply that
  221                  * limit to both auto-detected CPU counts and user-configured
  222                  * ones.
  223                  */
  224                 if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
  225                         RSS_DEBUG("RSS bits %u not valid, coercing to %u\n",
  226                             rss_bits, RSS_MAXBITS);
  227                         rss_bits = RSS_MAXBITS;
  228                 }
  229 
  230                 /*
  231                  * Figure out how many buckets to use; warn if less than the
  232                  * number of configured CPUs, although this is not a fatal
  233                  * problem.
  234                  */
  235                 rss_buckets = (1 << rss_bits);
  236                 if (rss_buckets < rss_ncpus)
  237                         RSS_DEBUG("WARNING: rss_buckets (%u) less than "
  238                             "rss_ncpus (%u)\n", rss_buckets, rss_ncpus);
  239                 rss_mask = rss_buckets - 1;
  240         } else {
  241                 rss_bits = 0;
  242                 rss_buckets = 1;
  243                 rss_mask = 0;
  244         }
  245 
  246         /*
  247          * Set up initial CPU assignments: round-robin by default.
  248          */
  249         cpuid = CPU_FIRST();
  250         for (i = 0; i < rss_buckets; i++) {
  251                 rss_table[i].rte_cpu = cpuid;
  252                 cpuid = CPU_NEXT(cpuid);
  253         }
  254 
  255         /*
  256          * Randomize rrs_key.
  257          *
  258          * XXXRW: Not yet.  If nothing else, will require an rss_isbadkey()
  259          * loop to check for "bad" RSS keys.
  260          */
  261 }
  262 SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
  263 
  264 static uint32_t
  265 rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
  266     const uint8_t *data)
  267 {
  268         uint32_t v;
  269         u_int i;
  270 
  271         v = 0;
  272         for (i = 0; i < keylen; i++)
  273                 v += key[i];
  274         for (i = 0; i < datalen; i++)
  275                 v += data[i];
  276         return (v);
  277 }
  278 
  279 uint32_t
  280 rss_hash(u_int datalen, const uint8_t *data)
  281 {
  282 
  283         switch (rss_hashalgo) {
  284         case RSS_HASH_TOEPLITZ:
  285                 return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
  286                     data));
  287 
  288         case RSS_HASH_NAIVE:
  289                 return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
  290                     data));
  291 
  292         default:
  293                 panic("%s: unsupported/unknown hashalgo %d", __func__,
  294                     rss_hashalgo);
  295         }
  296 }
  297 
  298 /*
  299  * Query the number of RSS bits in use.
  300  */
  301 u_int
  302 rss_getbits(void)
  303 {
  304 
  305         return (rss_bits);
  306 }
  307 
  308 /*
  309  * Query the RSS bucket associated with an RSS hash.
  310  */
  311 u_int
  312 rss_getbucket(u_int hash)
  313 {
  314 
  315         return (hash & rss_mask);
  316 }
  317 
  318 /*
  319  * Query the RSS layer bucket associated with the given
  320  * entry in the RSS hash space.
  321  *
  322  * The RSS indirection table is 0 .. rss_buckets-1,
  323  * covering the low 'rss_bits' of the total 128 slot
  324  * RSS indirection table.  So just mask off rss_bits and
  325  * return that.
  326  *
  327  * NIC drivers can then iterate over the 128 slot RSS
  328  * indirection table and fetch which RSS bucket to
  329  * map it to.  This will typically be a CPU queue
  330  */
  331 u_int
  332 rss_get_indirection_to_bucket(u_int index)
  333 {
  334 
  335         return (index & rss_mask);
  336 }
  337 
  338 /*
  339  * Query the RSS CPU associated with an RSS bucket.
  340  */
  341 u_int
  342 rss_getcpu(u_int bucket)
  343 {
  344 
  345         return (rss_table[bucket].rte_cpu);
  346 }
  347 
  348 /*
  349  * netisr CPU affinity lookup given just the hash and hashtype.
  350  */
  351 u_int
  352 rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type)
  353 {
  354 
  355         switch (hash_type) {
  356         case M_HASHTYPE_RSS_IPV4:
  357         case M_HASHTYPE_RSS_TCP_IPV4:
  358         case M_HASHTYPE_RSS_UDP_IPV4:
  359         case M_HASHTYPE_RSS_IPV6:
  360         case M_HASHTYPE_RSS_TCP_IPV6:
  361         case M_HASHTYPE_RSS_UDP_IPV6:
  362                 return (rss_getcpu(rss_getbucket(hash_val)));
  363         default:
  364                 return (NETISR_CPUID_NONE);
  365         }
  366 }
  367 
  368 /*
  369  * Query the RSS bucket associated with the given hash value and
  370  * type.
  371  */
  372 int
  373 rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id)
  374 {
  375 
  376         switch (hash_type) {
  377         case M_HASHTYPE_RSS_IPV4:
  378         case M_HASHTYPE_RSS_TCP_IPV4:
  379         case M_HASHTYPE_RSS_UDP_IPV4:
  380         case M_HASHTYPE_RSS_IPV6:
  381         case M_HASHTYPE_RSS_TCP_IPV6:
  382         case M_HASHTYPE_RSS_UDP_IPV6:
  383                 *bucket_id = rss_getbucket(hash_val);
  384                 return (0);
  385         default:
  386                 return (-1);
  387         }
  388 }
  389 
  390 /*
  391  * netisr CPU affinity lookup routine for use by protocols.
  392  */
  393 struct mbuf *
  394 rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
  395 {
  396 
  397         M_ASSERTPKTHDR(m);
  398         *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
  399         return (m);
  400 }
  401 
  402 int
  403 rss_m2bucket(struct mbuf *m, uint32_t *bucket_id)
  404 {
  405 
  406         M_ASSERTPKTHDR(m);
  407 
  408         return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
  409             bucket_id));
  410 }
  411 
  412 /*
  413  * Query the RSS hash algorithm.
  414  */
  415 u_int
  416 rss_gethashalgo(void)
  417 {
  418 
  419         return (rss_hashalgo);
  420 }
  421 
  422 /*
  423  * Query the current RSS key; likely to be used by device drivers when
  424  * configuring hardware RSS.  Caller must pass an array of size RSS_KEYSIZE.
  425  *
  426  * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
  427  */
  428 void
  429 rss_getkey(uint8_t *key)
  430 {
  431 
  432         bcopy(rss_key, key, sizeof(rss_key));
  433 }
  434 
  435 /*
  436  * Query the number of buckets; this may be used by both network device
  437  * drivers, which will need to populate hardware shadows of the software
  438  * indirection table, and the network stack itself (such as when deciding how
  439  * many connection groups to allocate).
  440  */
  441 u_int
  442 rss_getnumbuckets(void)
  443 {
  444 
  445         return (rss_buckets);
  446 }
  447 
  448 /*
  449  * Query the number of CPUs in use by RSS; may be useful to device drivers
  450  * trying to figure out how to map a larger number of CPUs into a smaller
  451  * number of receive queues.
  452  */
  453 u_int
  454 rss_getnumcpus(void)
  455 {
  456 
  457         return (rss_ncpus);
  458 }
  459 
  460 /*
  461  * Return the supported RSS hash configuration.
  462  *
  463  * NICs should query this to determine what to configure in their redirection
  464  * matching table.
  465  */
  466 inline u_int
  467 rss_gethashconfig(void)
  468 {
  469 
  470         /* Return 4-tuple for TCP; 2-tuple for others */
  471         /*
  472          * UDP may fragment more often than TCP and thus we'll end up with
  473          * NICs returning 2-tuple fragments.
  474          * udp_init() and udplite_init() both currently initialise things
  475          * as 2-tuple.
  476          * So for now disable UDP 4-tuple hashing until all of the other
  477          * pieces are in place.
  478          */
  479         return (
  480             RSS_HASHTYPE_RSS_IPV4
  481         |    RSS_HASHTYPE_RSS_TCP_IPV4
  482         |    RSS_HASHTYPE_RSS_IPV6
  483         |    RSS_HASHTYPE_RSS_TCP_IPV6
  484         |    RSS_HASHTYPE_RSS_IPV6_EX
  485         |    RSS_HASHTYPE_RSS_TCP_IPV6_EX
  486 #if 0
  487         |    RSS_HASHTYPE_RSS_UDP_IPV4
  488         |    RSS_HASHTYPE_RSS_UDP_IPV6
  489         |    RSS_HASHTYPE_RSS_UDP_IPV6_EX
  490 #endif
  491         );
  492 }
  493 
  494 /*
  495  * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
  496  * it appearing in debugging output unnecessarily.
  497  */
  498 static int
  499 sysctl_rss_key(SYSCTL_HANDLER_ARGS)
  500 {
  501         uint8_t temp_rss_key[RSS_KEYSIZE];
  502         int error;
  503 
  504         error = priv_check(req->td, PRIV_NETINET_HASHKEY);
  505         if (error)
  506                 return (error);
  507 
  508         bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
  509         error = sysctl_handle_opaque(oidp, temp_rss_key,
  510             sizeof(temp_rss_key), req);
  511         if (error)
  512                 return (error);
  513         if (req->newptr != NULL) {
  514                 /* XXXRW: Not yet. */
  515                 return (EINVAL);
  516         }
  517         return (0);
  518 }
  519 SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
  520     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
  521     "", "RSS keying material");
  522 
  523 static int
  524 sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)
  525 {
  526         struct sbuf *sb;
  527         int error;
  528         int i;
  529 
  530         error = 0;
  531         error = sysctl_wire_old_buffer(req, 0);
  532         if (error != 0)
  533                 return (error);
  534         sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
  535         if (sb == NULL)
  536                 return (ENOMEM);
  537         for (i = 0; i < rss_buckets; i++) {
  538                 sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ",
  539                     i,
  540                     rss_getcpu(i));
  541         }
  542         error = sbuf_finish(sb);
  543         sbuf_delete(sb);
  544 
  545         return (error);
  546 }
  547 SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping,
  548     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
  549     sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping");

Cache object: a82455ae1f15f565fd78473a45822af1


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.