The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/in_pcbgroup.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2010-2011 Juniper Networks, Inc.
    3  * All rights reserved.
    4  *
    5  * This software was developed by Robert N. M. Watson under contract
    6  * to Juniper Networks, Inc.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  */
   29 
   30 #include <sys/cdefs.h>
   31 
   32 __FBSDID("$FreeBSD: releng/9.1/sys/netinet/in_pcbgroup.c 222748 2011-06-06 12:55:02Z rwatson $");
   33 
   34 #include "opt_inet6.h"
   35 
   36 #include <sys/param.h>
   37 #include <sys/lock.h>
   38 #include <sys/malloc.h>
   39 #include <sys/mbuf.h>
   40 #include <sys/mutex.h>
   41 #include <sys/smp.h>
   42 #include <sys/socketvar.h>
   43 
   44 #include <netinet/in.h>
   45 #include <netinet/in_pcb.h>
   46 #ifdef INET6
   47 #include <netinet6/in6_pcb.h>
   48 #endif /* INET6 */
   49 
   50 /*
   51  * pcbgroups, or "connection groups" are based on Willman, Rixner, and Cox's
   52  * 2006 USENIX paper, "An Evaluation of Network Stack Parallelization
   53  * Strategies in Modern Operating Systems".  This implementation differs
   54  * significantly from that described in the paper, in that it attempts to
   55  * introduce not just notions of affinity for connections and distribute work
   56  * so as to reduce lock contention, but also align those notions with
   57  * hardware work distribution strategies such as RSS.  In this construction,
   58  * connection groups supplement, rather than replace, existing reservation
   59  * tables for protocol 4-tuples, offering CPU-affine lookup tables with
   60  * minimal cache line migration and lock contention during steady state
   61  * operation.
   62  *
   63  * Internet protocols, such as UDP and TCP, register to use connection groups
   64  * by providing an ipi_hashfields value other than IPI_HASHFIELDS_NONE; this
   65  * indicates to the connection group code whether a 2-tuple or 4-tuple is
   66  * used as an argument to hashes that assign a connection to a particular
   67  * group.  This must be aligned with any hardware offloaded distribution
   68  * model, such as RSS or similar approaches taken in embedded network boards.
   69  * Wildcard sockets require special handling, as in Willman 2006, and are
   70  * shared between connection groups -- while being protected by group-local
   71  * locks.  This means that connection establishment and teardown can be
   72  * significantly more expensive than without connection groups, but that
   73  * steady-state processing can be significantly faster.
   74  *
   75  * Most of the implementation of connection groups is in this file; however,
   76  * connection group lookup is implemented in in_pcb.c alongside reservation
   77  * table lookups -- see in_pcblookup_group().
   78  *
   79  * TODO:
   80  *
   81  * Implement dynamic rebalancing of buckets with connection groups; when
   82  * load is unevenly distributed, search for more optimal balancing on
   83  * demand.  This might require scaling up the number of connection groups
   84  * by <<1.
   85  *
   86  * Provide an IP 2-tuple or 4-tuple netisr m2cpu handler based on connection
   87  * groups for ip_input and ip6_input, allowing non-offloaded work
   88  * distribution.
   89  *
   90  * Expose effective CPU affinity of connections to userspace using socket
   91  * options.
   92  *
   93  * Investigate per-connection affinity overrides based on socket options; an
   94  * option could be set, certainly resulting in work being distributed
   95  * differently in software, and possibly propagated to supporting hardware
   96  * with TCAMs or hardware hash tables.  This might require connections to
   97  * exist in more than one connection group at a time.
   98  *
   99  * Hook netisr thread reconfiguration events, and propagate those to RSS so
  100  * that rebalancing can occur when the thread pool grows or shrinks.
  101  *
  102  * Expose per-pcbgroup statistics to userspace monitoring tools such as
  103  * netstat, in order to allow better debugging and profiling.
  104  */
  105 
  106 void
  107 in_pcbgroup_init(struct inpcbinfo *pcbinfo, u_int hashfields,
  108     int hash_nelements)
  109 {
  110         struct inpcbgroup *pcbgroup;
  111         u_int numpcbgroups, pgn;
  112 
  113         /*
  114          * Only enable connection groups for a protocol if it has been
  115          * specifically requested.
  116          */
  117         if (hashfields == IPI_HASHFIELDS_NONE)
  118                 return;
  119 
  120         /*
  121          * Connection groups are about multi-processor load distribution,
  122          * lock contention, and connection CPU affinity.  As such, no point
  123          * in turning them on for a uniprocessor machine, it only wastes
  124          * memory.
  125          */
  126         if (mp_ncpus == 1)
  127                 return;
  128 
  129         /*
  130          * Use one group per CPU for now.  If we decide to do dynamic
  131          * rebalancing a la RSS, we'll need to shift left by at least 1.
  132          */
  133         numpcbgroups = mp_ncpus;
  134 
  135         pcbinfo->ipi_hashfields = hashfields;
  136         pcbinfo->ipi_pcbgroups = malloc(numpcbgroups *
  137             sizeof(*pcbinfo->ipi_pcbgroups), M_PCB, M_WAITOK | M_ZERO);
  138         pcbinfo->ipi_npcbgroups = numpcbgroups;
  139         pcbinfo->ipi_wildbase = hashinit(hash_nelements, M_PCB,
  140             &pcbinfo->ipi_wildmask);
  141         for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
  142                 pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
  143                 pcbgroup->ipg_hashbase = hashinit(hash_nelements, M_PCB,
  144                     &pcbgroup->ipg_hashmask);
  145                 INP_GROUP_LOCK_INIT(pcbgroup, "pcbgroup");
  146 
  147                 /*
  148                  * Initialise notional affinity of the pcbgroup -- for RSS,
  149                  * we want the same notion of affinity as NICs to be used.
  150                  * Just round robin for the time being.
  151                  */
  152                 pcbgroup->ipg_cpu = (pgn % mp_ncpus);
  153         }
  154 }
  155 
  156 void
  157 in_pcbgroup_destroy(struct inpcbinfo *pcbinfo)
  158 {
  159         struct inpcbgroup *pcbgroup;
  160         u_int pgn;
  161 
  162         if (pcbinfo->ipi_npcbgroups == 0)
  163                 return;
  164 
  165         for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++) {
  166                 pcbgroup = &pcbinfo->ipi_pcbgroups[pgn];
  167                 KASSERT(LIST_EMPTY(pcbinfo->ipi_listhead),
  168                     ("in_pcbinfo_destroy: listhead not empty"));
  169                 INP_GROUP_LOCK_DESTROY(pcbgroup);
  170                 hashdestroy(pcbgroup->ipg_hashbase, M_PCB,
  171                     pcbgroup->ipg_hashmask);
  172         }
  173         hashdestroy(pcbinfo->ipi_wildbase, M_PCB, pcbinfo->ipi_wildmask);
  174         free(pcbinfo->ipi_pcbgroups, M_PCB);
  175         pcbinfo->ipi_pcbgroups = NULL;
  176         pcbinfo->ipi_npcbgroups = 0;
  177         pcbinfo->ipi_hashfields = 0;
  178 }
  179 
  180 /*
  181  * Given a hash of whatever the covered tuple might be, return a pcbgroup
  182  * index.
  183  */
  184 static __inline u_int
  185 in_pcbgroup_getbucket(struct inpcbinfo *pcbinfo, uint32_t hash)
  186 {
  187 
  188         return (hash % pcbinfo->ipi_npcbgroups);
  189 }
  190 
  191 /*
  192  * Map a (hashtype, hash) tuple into a connection group, or NULL if the hash
  193  * information is insufficient to identify the pcbgroup.
  194  */
  195 struct inpcbgroup *
  196 in_pcbgroup_byhash(struct inpcbinfo *pcbinfo, u_int hashtype, uint32_t hash)
  197 {
  198 
  199         return (NULL);
  200 }
  201 
  202 static struct inpcbgroup *
  203 in_pcbgroup_bymbuf(struct inpcbinfo *pcbinfo, struct mbuf *m)
  204 {
  205 
  206         return (in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
  207             m->m_pkthdr.flowid));
  208 }
  209 
  210 struct inpcbgroup *
  211 in_pcbgroup_bytuple(struct inpcbinfo *pcbinfo, struct in_addr laddr,
  212     u_short lport, struct in_addr faddr, u_short fport)
  213 {
  214         uint32_t hash;
  215 
  216         switch (pcbinfo->ipi_hashfields) {
  217         case IPI_HASHFIELDS_4TUPLE:
  218                 hash = faddr.s_addr ^ fport;
  219                 break;
  220 
  221         case IPI_HASHFIELDS_2TUPLE:
  222                 hash = faddr.s_addr ^ laddr.s_addr;
  223                 break;
  224 
  225         default:
  226                 hash = 0;
  227         }
  228         return (&pcbinfo->ipi_pcbgroups[in_pcbgroup_getbucket(pcbinfo,
  229             hash)]);
  230 }
  231 
  232 struct inpcbgroup *
  233 in_pcbgroup_byinpcb(struct inpcb *inp)
  234 {
  235 
  236         return (in_pcbgroup_bytuple(inp->inp_pcbinfo, inp->inp_laddr,
  237             inp->inp_lport, inp->inp_faddr, inp->inp_fport));
  238 }
  239 
  240 static void
  241 in_pcbwild_add(struct inpcb *inp)
  242 {
  243         struct inpcbinfo *pcbinfo;
  244         struct inpcbhead *head;
  245         u_int pgn;
  246 
  247         INP_WLOCK_ASSERT(inp);
  248         KASSERT(!(inp->inp_flags2 & INP_PCBGROUPWILD),
  249             ("%s: is wild",__func__));
  250 
  251         pcbinfo = inp->inp_pcbinfo;
  252         for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
  253                 INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
  254         head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, inp->inp_lport,
  255             0, pcbinfo->ipi_wildmask)];
  256         LIST_INSERT_HEAD(head, inp, inp_pcbgroup_wild);
  257         inp->inp_flags2 |= INP_PCBGROUPWILD;
  258         for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
  259                 INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
  260 }
  261 
  262 static void
  263 in_pcbwild_remove(struct inpcb *inp)
  264 {
  265         struct inpcbinfo *pcbinfo;
  266         u_int pgn;
  267 
  268         INP_WLOCK_ASSERT(inp);
  269         KASSERT((inp->inp_flags2 & INP_PCBGROUPWILD),
  270             ("%s: not wild", __func__));
  271 
  272         pcbinfo = inp->inp_pcbinfo;
  273         for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
  274                 INP_GROUP_LOCK(&pcbinfo->ipi_pcbgroups[pgn]);
  275         LIST_REMOVE(inp, inp_pcbgroup_wild);
  276         for (pgn = 0; pgn < pcbinfo->ipi_npcbgroups; pgn++)
  277                 INP_GROUP_UNLOCK(&pcbinfo->ipi_pcbgroups[pgn]);
  278         inp->inp_flags2 &= ~INP_PCBGROUPWILD;
  279 }
  280 
  281 static __inline int
  282 in_pcbwild_needed(struct inpcb *inp)
  283 {
  284 
  285 #ifdef INET6
  286         if (inp->inp_vflag & INP_IPV6)
  287                 return (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr));
  288         else
  289 #endif
  290                 return (inp->inp_faddr.s_addr == htonl(INADDR_ANY));
  291 }
  292 
  293 static void
  294 in_pcbwild_update_internal(struct inpcb *inp)
  295 {
  296         int wildcard_needed;
  297 
  298         wildcard_needed = in_pcbwild_needed(inp);
  299         if (wildcard_needed && !(inp->inp_flags2 & INP_PCBGROUPWILD))
  300                 in_pcbwild_add(inp);
  301         else if (!wildcard_needed && (inp->inp_flags2 & INP_PCBGROUPWILD))
  302                 in_pcbwild_remove(inp);
  303 }
  304 
  305 /*
  306  * Update the pcbgroup of an inpcb, which might include removing an old
  307  * pcbgroup reference and/or adding a new one.  Wildcard processing is not
  308  * performed here, although ideally we'll never install a pcbgroup for a
  309  * wildcard inpcb (asserted below).
  310  */
  311 static void
  312 in_pcbgroup_update_internal(struct inpcbinfo *pcbinfo,
  313     struct inpcbgroup *newpcbgroup, struct inpcb *inp)
  314 {
  315         struct inpcbgroup *oldpcbgroup;
  316         struct inpcbhead *pcbhash;
  317         uint32_t hashkey_faddr;
  318 
  319         INP_WLOCK_ASSERT(inp);
  320 
  321         oldpcbgroup = inp->inp_pcbgroup;
  322         if (oldpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
  323                 INP_GROUP_LOCK(oldpcbgroup);
  324                 LIST_REMOVE(inp, inp_pcbgrouphash);
  325                 inp->inp_pcbgroup = NULL;
  326                 INP_GROUP_UNLOCK(oldpcbgroup);
  327         }
  328         if (newpcbgroup != NULL && oldpcbgroup != newpcbgroup) {
  329 #ifdef INET6
  330                 if (inp->inp_vflag & INP_IPV6)
  331                         hashkey_faddr = inp->in6p_faddr.s6_addr32[3]; /* XXX */
  332                 else
  333 #endif
  334                         hashkey_faddr = inp->inp_faddr.s_addr;
  335                 INP_GROUP_LOCK(newpcbgroup);
  336                 pcbhash = &newpcbgroup->ipg_hashbase[
  337                     INP_PCBHASH(hashkey_faddr, inp->inp_lport, inp->inp_fport,
  338                     newpcbgroup->ipg_hashmask)];
  339                 LIST_INSERT_HEAD(pcbhash, inp, inp_pcbgrouphash);
  340                 inp->inp_pcbgroup = newpcbgroup;
  341                 INP_GROUP_UNLOCK(newpcbgroup);
  342         }
  343 
  344         KASSERT(!(newpcbgroup != NULL && in_pcbwild_needed(inp)),
  345             ("%s: pcbgroup and wildcard!", __func__));
  346 }
  347 
  348 /*
  349  * Two update paths: one in which the 4-tuple on an inpcb has been updated
  350  * and therefore connection groups may need to change (or a wildcard entry
  351  * may needed to be installed), and another in which the 4-tuple has been
  352  * set as a result of a packet received, in which case we may be able to use
  353  * the hash on the mbuf to avoid doing a software hash calculation for RSS.
  354  *
  355  * In each case: first, let the wildcard code have a go at placing it as a
  356  * wildcard socket.  If it was a wildcard, or if the connection has been
  357  * dropped, then no pcbgroup is required (so potentially clear it);
  358  * otherwise, calculate and update the pcbgroup for the inpcb.
  359  */
  360 void
  361 in_pcbgroup_update(struct inpcb *inp)
  362 {
  363         struct inpcbinfo *pcbinfo;
  364         struct inpcbgroup *newpcbgroup;
  365 
  366         INP_WLOCK_ASSERT(inp);
  367 
  368         pcbinfo = inp->inp_pcbinfo;
  369         if (!in_pcbgroup_enabled(pcbinfo))
  370                 return;
  371 
  372         in_pcbwild_update_internal(inp);
  373         if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
  374             !(inp->inp_flags & INP_DROPPED)) {
  375 #ifdef INET6
  376                 if (inp->inp_vflag & INP_IPV6)
  377                         newpcbgroup = in6_pcbgroup_byinpcb(inp);
  378                 else
  379 #endif
  380                         newpcbgroup = in_pcbgroup_byinpcb(inp);
  381         } else
  382                 newpcbgroup = NULL;
  383         in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
  384 }
  385 
  386 void
  387 in_pcbgroup_update_mbuf(struct inpcb *inp, struct mbuf *m)
  388 {
  389         struct inpcbinfo *pcbinfo;
  390         struct inpcbgroup *newpcbgroup;
  391 
  392         INP_WLOCK_ASSERT(inp);
  393 
  394         pcbinfo = inp->inp_pcbinfo;
  395         if (!in_pcbgroup_enabled(pcbinfo))
  396                 return;
  397 
  398         /*
  399          * Possibly should assert !INP_PCBGROUPWILD rather than testing for
  400          * it; presumably this function should never be called for anything
  401          * other than non-wildcard socket?
  402          */
  403         in_pcbwild_update_internal(inp);
  404         if (!(inp->inp_flags2 & INP_PCBGROUPWILD) &&
  405             !(inp->inp_flags & INP_DROPPED)) {
  406                 newpcbgroup = in_pcbgroup_bymbuf(pcbinfo, m);
  407 #ifdef INET6
  408                 if (inp->inp_vflag & INP_IPV6) {
  409                         if (newpcbgroup == NULL)
  410                                 newpcbgroup = in6_pcbgroup_byinpcb(inp);
  411                 } else {
  412 #endif
  413                         if (newpcbgroup == NULL)
  414                                 newpcbgroup = in_pcbgroup_byinpcb(inp);
  415 #ifdef INET6
  416                 }
  417 #endif
  418         } else
  419                 newpcbgroup = NULL;
  420         in_pcbgroup_update_internal(pcbinfo, newpcbgroup, inp);
  421 }
  422 
  423 /*
  424  * Remove pcbgroup entry and optional pcbgroup wildcard entry for this inpcb.
  425  */
  426 void
  427 in_pcbgroup_remove(struct inpcb *inp)
  428 {
  429         struct inpcbgroup *pcbgroup;
  430 
  431         INP_WLOCK_ASSERT(inp);
  432 
  433         if (!in_pcbgroup_enabled(inp->inp_pcbinfo))
  434                 return;
  435 
  436         if (inp->inp_flags2 & INP_PCBGROUPWILD)
  437                 in_pcbwild_remove(inp);
  438 
  439         pcbgroup = inp->inp_pcbgroup;
  440         if (pcbgroup != NULL) {
  441                 INP_GROUP_LOCK(pcbgroup);
  442                 LIST_REMOVE(inp, inp_pcbgrouphash);
  443                 inp->inp_pcbgroup = NULL;
  444                 INP_GROUP_UNLOCK(pcbgroup);
  445         }
  446 }
  447 
  448 /*
  449  * Query whether or not it is appropriate to use pcbgroups to look up inpcbs
  450  * for a protocol.
  451  */
  452 int
  453 in_pcbgroup_enabled(struct inpcbinfo *pcbinfo)
  454 {
  455 
  456         return (pcbinfo->ipi_npcbgroups > 0);
  457 }

Cache object: a488280d7d367da4f698045a3ff40d5d


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.