FreeBSD/Linux Kernel Cross Reference
sys/netinet/in_fib_dxr.c

/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2012-2022 Marko Zec
 * Copyright (c) 2005, 2018 University of Zagreb
 * Copyright (c) 2005 International Computer Science Institute
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * An implementation of DXR, a simple IPv4 LPM scheme with compact lookup
 * structures and a trivial search procedure.  More significant bits of
 * the search key are used to directly index a two-stage trie, while the
 * remaining bits are used for finding the next hop in a sorted array.
 * More details in:
 *
 * M. Zec, L. Rizzo, M. Mikuc, DXR: towards a billion routing lookups per
 * second in software, ACM SIGCOMM Computer Communication Review, September
 * 2012
 *
 * M. Zec, M. Mikuc, Pushing the envelope: beyond two billion IP routing
 * lookups per second on commodity CPUs, IEEE SoftCOM, September 2017, Split
 */
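
/*
 * For illustration, with the default DXR_TRIE_BITS of 20 a lookup key
 * splits as follows (the constants are derived from the macros below):
 *
 *	dst         = 0xc6336401   (198.51.100.1)
 *	dst >> 12   = 0xc6336      (index into the direct/trie stage)
 *	dst & 0xfff = 0x401        (key for the range binary search)
 */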

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/ctype.h>
#include <sys/epoch.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>

#include <vm/uma.h>

#include <netinet/in.h>
#include <netinet/in_fib.h>

#include <net/route.h>
#include <net/route/route_ctl.h>
#include <net/route/fib_algo.h>

#define DXR_TRIE_BITS           20

CTASSERT(DXR_TRIE_BITS >= 16 && DXR_TRIE_BITS <= 24);

/* DXR2: two-stage primary trie, instead of a single direct lookup table */
#define DXR2

#if DXR_TRIE_BITS > 16
#define DXR_D                   16
#else
#define DXR_D                   (DXR_TRIE_BITS - 1)
#endif

#define D_TBL_SIZE              (1 << DXR_D)
#define DIRECT_TBL_SIZE         (1 << DXR_TRIE_BITS)
#define DXR_RANGE_MASK          (0xffffffffU >> DXR_TRIE_BITS)
#define DXR_RANGE_SHIFT         (32 - DXR_TRIE_BITS)

#define DESC_BASE_BITS          22
#define DESC_FRAGMENTS_BITS     (32 - DESC_BASE_BITS)
#define BASE_MAX                ((1 << DESC_BASE_BITS) - 1)
#define RTBL_SIZE_INCR          (BASE_MAX / 64)

#if DXR_TRIE_BITS < 24
#define FRAGS_MASK_SHORT        ((1 << (23 - DXR_TRIE_BITS)) - 1)
#else
#define FRAGS_MASK_SHORT        0
#endif
#define FRAGS_PREF_SHORT        (((1 << DESC_FRAGMENTS_BITS) - 1) & \
                                 ~FRAGS_MASK_SHORT)
#define FRAGS_MARK_XL           (FRAGS_PREF_SHORT - 1)
#define FRAGS_MARK_HIT          (FRAGS_PREF_SHORT - 2)

#define IS_SHORT_FORMAT(x)      ((x & FRAGS_PREF_SHORT) == FRAGS_PREF_SHORT)
#define IS_LONG_FORMAT(x)       ((x & FRAGS_PREF_SHORT) != FRAGS_PREF_SHORT)
#define IS_XL_FORMAT(x)         (x == FRAGS_MARK_XL)

#define RE_SHORT_MAX_NH         ((1 << (DXR_TRIE_BITS - 8)) - 1)

#define CHUNK_HASH_BITS         16
#define CHUNK_HASH_SIZE         (1 << CHUNK_HASH_BITS)
#define CHUNK_HASH_MASK         (CHUNK_HASH_SIZE - 1)

#define TRIE_HASH_BITS          16
#define TRIE_HASH_SIZE          (1 << TRIE_HASH_BITS)
#define TRIE_HASH_MASK          (TRIE_HASH_SIZE - 1)

#define XTBL_SIZE_INCR          (DIRECT_TBL_SIZE / 16)

#define UNUSED_BUCKETS          8

/* Lookup structure elements */

struct direct_entry {
        uint32_t                fragments: DESC_FRAGMENTS_BITS,
                                base: DESC_BASE_BITS;
};
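
/*
 * Interpretation of the 10-bit "fragments" field of a direct table
 * entry, with DXR_TRIE_BITS = 20 (so FRAGS_PREF_SHORT = 1016):
 *
 *	0 .. 1013	long format, fragment count within the chunk
 *	1014		FRAGS_MARK_HIT: no chunk, "base" is the nexthop
 *	1015		FRAGS_MARK_XL: the count does not fit here and is
 *			stored in the first range table element instead
 *	1016 .. 1023	short format, low bits hold the fragment count
 */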

struct range_entry_long {
        uint32_t                start: DXR_RANGE_SHIFT,
                                nexthop: DXR_TRIE_BITS;
};

#if DXR_TRIE_BITS < 24
struct range_entry_short {
        uint16_t                start: DXR_RANGE_SHIFT - 8,
                                nexthop: DXR_TRIE_BITS - 8;
};
#endif
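
/*
 * With DXR_TRIE_BITS = 20 a long range entry is 32 bits wide (12-bit
 * start offset, 20-bit nexthop index) while a short one is 16 bits wide
 * (4-bit start, 12-bit nexthop), so a short-format chunk packs two
 * entries into each 32-bit range table element.  This is why the short
 * lookup path in range_lookup() scales the fragment count by two, and
 * why update_chunk() halves rtbl_work_frags after re-encoding a chunk.
 */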

/* Auxiliary structures */

struct heap_entry {
        uint32_t                start;
        uint32_t                end;
        uint32_t                preflen;
        uint32_t                nexthop;
};

struct chunk_desc {
        LIST_ENTRY(chunk_desc)  cd_all_le;
        LIST_ENTRY(chunk_desc)  cd_hash_le;
        uint32_t                cd_hash;
        uint32_t                cd_refcnt;
        uint32_t                cd_base;
        uint32_t                cd_cur_size;
        uint32_t                cd_max_size;
};

struct trie_desc {
        LIST_ENTRY(trie_desc)   td_all_le;
        LIST_ENTRY(trie_desc)   td_hash_le;
        uint32_t                td_hash;
        uint32_t                td_index;
        uint32_t                td_refcnt;
};

struct dxr_aux {
        /* Glue to external state */
        struct fib_data         *fd;
        uint32_t                fibnum;
        int                     refcnt;

        /* Auxiliary build-time tables */
        struct direct_entry     direct_tbl[DIRECT_TBL_SIZE];
        uint16_t                d_tbl[D_TBL_SIZE];
        struct direct_entry     *x_tbl;
        union {
                struct range_entry_long re;
                uint32_t        fragments;
        }                       *range_tbl;

        /* Auxiliary internal state */
        uint32_t                updates_mask[DIRECT_TBL_SIZE / 32];
        struct trie_desc        *trietbl[D_TBL_SIZE];
        LIST_HEAD(, chunk_desc) chunk_hashtbl[CHUNK_HASH_SIZE];
        LIST_HEAD(, chunk_desc) all_chunks;
        LIST_HEAD(, chunk_desc) unused_chunks[UNUSED_BUCKETS];
        LIST_HEAD(, trie_desc)  trie_hashtbl[TRIE_HASH_SIZE];
        LIST_HEAD(, trie_desc)  all_trie;
        LIST_HEAD(, trie_desc)  unused_trie; /* abuses hash link entry */
        struct sockaddr_in      dst;
        struct sockaddr_in      mask;
        struct heap_entry       heap[33];
        uint32_t                prefixes;
        uint32_t                updates_low;
        uint32_t                updates_high;
        uint32_t                unused_chunks_size;
        uint32_t                xtbl_size;
        uint32_t                all_trie_cnt;
        uint32_t                unused_trie_cnt;
        uint32_t                trie_rebuilt_prefixes;
        uint32_t                heap_index;
        uint32_t                d_bits;
        uint32_t                rtbl_size;
        uint32_t                rtbl_top;
        uint32_t                rtbl_work_frags;
        uint32_t                work_chunk;
};

/* Main lookup structure container */

struct dxr {
        /* Lookup tables */
        void                    *d;
        void                    *x;
        void                    *r;
        struct nhop_object      **nh_tbl;

        /* Glue to external state */
        struct dxr_aux          *aux;
        struct fib_data         *fd;
        struct epoch_context    epoch_ctx;
        uint32_t                fibnum;
        uint16_t                d_shift;
        uint16_t                x_shift;
        uint32_t                x_mask;
};

static MALLOC_DEFINE(M_DXRLPM, "dxr", "DXR LPM");
static MALLOC_DEFINE(M_DXRAUX, "dxr aux", "DXR auxiliary");

uma_zone_t chunk_zone;
uma_zone_t trie_zone;

VNET_DEFINE_STATIC(int, frag_limit) = 100;
#define V_frag_limit    VNET(frag_limit)
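
/*
 * V_frag_limit is expressed in hundredths of a percent, so the default
 * of 100 stands for 1.00%: the rebuild checks in dxr_build() scale the
 * unused / total ratio by 10000 before comparing against it.
 */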

/* Binary search for a matching address range */
#define DXR_LOOKUP_STAGE                                        \
        if (masked_dst < range[middle].start) {                 \
                upperbound = middle;                            \
                middle = (middle + lowerbound) / 2;             \
        } else if (masked_dst < range[middle + 1].start)        \
                return (range[middle].nexthop);                 \
        else {                                                  \
                lowerbound = middle + 1;                        \
                middle = (upperbound + middle + 1) / 2;         \
        }                                                       \
        if (upperbound == lowerbound)                           \
                return (range[lowerbound].nexthop);

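/*
 * Each DXR_LOOKUP_STAGE expansion performs one step of a binary search
 * for the last range entry whose start offset is <= masked_dst: descend
 * left, return a hit, or descend right, then terminate once the search
 * window collapses.  range_lookup() below expands the macro twice per
 * loop iteration.
 */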
static int
range_lookup(struct range_entry_long *rt, struct direct_entry de, uint32_t dst)
{
        uint32_t base;
        uint32_t upperbound;
        uint32_t middle;
        uint32_t lowerbound;
        uint32_t masked_dst;

        base = de.base;
        lowerbound = 0;
        masked_dst = dst & DXR_RANGE_MASK;

#if DXR_TRIE_BITS < 24
        if (__predict_true(IS_SHORT_FORMAT(de.fragments))) {
                upperbound = de.fragments & FRAGS_MASK_SHORT;
                struct range_entry_short *range =
                    (struct range_entry_short *) &rt[base];

                masked_dst >>= 8;
                middle = upperbound;
                upperbound = upperbound * 2 + 1;

                for (;;) {
                        DXR_LOOKUP_STAGE
                        DXR_LOOKUP_STAGE
                }
        }
#endif

        upperbound = de.fragments;
        middle = upperbound / 2;
        struct range_entry_long *range = &rt[base];
        if (__predict_false(IS_XL_FORMAT(de.fragments))) {
                upperbound = *((uint32_t *) range);
                range++;
                middle = upperbound / 2;
        }

        for (;;) {
                DXR_LOOKUP_STAGE
                DXR_LOOKUP_STAGE
        }
}

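/*
 * DXR_LOOKUP_DEFINE() emits a lookup routine specialized for a
 * compile-time trie stride D, which lets the compiler fold all shifts
 * and masks into constants; choose_lookup_fn() picks the variant
 * matching the current d_bits after every rebuild.  For illustration,
 * with D = 16 and DXR_TRIE_BITS = 20 the direct table index reduces to
 *
 *	de = xt[(dt[dst >> 16] << 4) + ((dst >> 12) & 0xf)];
 */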
#define DXR_LOOKUP_DEFINE(D)                                            \
        static int inline                                               \
        dxr_lookup_##D(struct dxr *dxr, uint32_t dst)                   \
        {                                                               \
                struct direct_entry de;                                 \
                uint16_t *dt = dxr->d;                                  \
                struct direct_entry *xt = dxr->x;                       \
                                                                        \
                de = xt[(dt[dst >> (32 - (D))] << (DXR_TRIE_BITS - (D))) \
                    + ((dst >> DXR_RANGE_SHIFT) &                       \
                    (0xffffffffU >> (32 - DXR_TRIE_BITS + (D))))];      \
                if (__predict_true(de.fragments == FRAGS_MARK_HIT))     \
                        return (de.base);                               \
                return (range_lookup(dxr->r, de, dst));                 \
        }                                                               \
                                                                        \
        static struct nhop_object *                                     \
        dxr_fib_lookup_##D(void *algo_data,                             \
            const struct flm_lookup_key key, uint32_t scopeid __unused) \
        {                                                               \
                struct dxr *dxr = algo_data;                            \
                                                                        \
                return (dxr->nh_tbl[dxr_lookup_##D(dxr,                 \
                    ntohl(key.addr4.s_addr))]);                         \
        }

#ifdef DXR2
#if DXR_TRIE_BITS > 16
DXR_LOOKUP_DEFINE(16)
#endif
DXR_LOOKUP_DEFINE(15)
DXR_LOOKUP_DEFINE(14)
DXR_LOOKUP_DEFINE(13)
DXR_LOOKUP_DEFINE(12)
DXR_LOOKUP_DEFINE(11)
DXR_LOOKUP_DEFINE(10)
DXR_LOOKUP_DEFINE(9)
#endif /* DXR2 */

static int inline
dxr_lookup(struct dxr *dxr, uint32_t dst)
{
        struct direct_entry de;
#ifdef DXR2
        uint16_t *dt = dxr->d;
        struct direct_entry *xt = dxr->x;

        de = xt[(dt[dst >> dxr->d_shift] << dxr->x_shift) +
            ((dst >> DXR_RANGE_SHIFT) & dxr->x_mask)];
#else /* !DXR2 */
        struct direct_entry *dt = dxr->d;

        de = dt[dst >> DXR_RANGE_SHIFT];
#endif /* !DXR2 */
        if (__predict_true(de.fragments == FRAGS_MARK_HIT))
                return (de.base);
        return (range_lookup(dxr->r, de, dst));
}

static void
initheap(struct dxr_aux *da, uint32_t dst_u32, uint32_t chunk)
{
        struct heap_entry *fhp = &da->heap[0];
        struct rtentry *rt;
        struct route_nhop_data rnd;

        da->heap_index = 0;
        da->dst.sin_addr.s_addr = htonl(dst_u32);
        rt = fib4_lookup_rt(da->fibnum, da->dst.sin_addr, 0, NHR_UNLOCKED,
            &rnd);
        if (rt != NULL) {
                struct in_addr addr;
                uint32_t scopeid;

                rt_get_inet_prefix_plen(rt, &addr, &fhp->preflen, &scopeid);
                fhp->start = ntohl(addr.s_addr);
                fhp->end = fhp->start;
                if (fhp->preflen < 32)
                        fhp->end |= (0xffffffffU >> fhp->preflen);
                fhp->nexthop = fib_get_nhop_idx(da->fd, rnd.rnd_nhop);
        } else {
                fhp->preflen = fhp->nexthop = fhp->start = 0;
                fhp->end = 0xffffffffU;
        }
}

static uint32_t
chunk_size(struct dxr_aux *da, struct direct_entry *fdesc)
{

        if (IS_SHORT_FORMAT(fdesc->fragments))
                return ((fdesc->fragments & FRAGS_MASK_SHORT) + 1);
        else if (IS_XL_FORMAT(fdesc->fragments))
                return (da->range_tbl[fdesc->base].fragments + 2);
        else /* if (IS_LONG_FORMAT(fdesc->fragments)) */
                return (fdesc->fragments + 1);
}

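/*
 * Identical range chunks are shared between direct table entries:
 * chunk_hash() fingerprints a chunk's contents, and chunk_ref() first
 * searches chunk_hashtbl for an existing chunk with the same hash, size
 * and contents, bumping its reference count on a match.  Only when no
 * match is found is a new chunk carved out, preferably by recycling a
 * hole tracked in the unused_chunks[] buckets.
 */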
static uint32_t
chunk_hash(struct dxr_aux *da, struct direct_entry *fdesc)
{
        uint32_t size = chunk_size(da, fdesc);
        uint32_t *p = (uint32_t *) &da->range_tbl[fdesc->base];
        uint32_t *l = (uint32_t *) &da->range_tbl[fdesc->base + size];
        uint32_t hash = fdesc->fragments;

        for (; p < l; p++)
                hash = (hash << 7) + (hash >> 13) + *p;

        return (hash + (hash >> 16));
}

static int
chunk_ref(struct dxr_aux *da, uint32_t chunk)
{
        struct direct_entry *fdesc = &da->direct_tbl[chunk];
        struct chunk_desc *cdp, *empty_cdp;
        uint32_t base = fdesc->base;
        uint32_t size = chunk_size(da, fdesc);
        uint32_t hash = chunk_hash(da, fdesc);
        int i;

        /* Find an existing descriptor */
        LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
            cd_hash_le) {
                if (cdp->cd_hash != hash || cdp->cd_cur_size != size ||
                    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
                    sizeof(struct range_entry_long) * size))
                        continue;
                da->rtbl_top = fdesc->base;
                fdesc->base = cdp->cd_base;
                cdp->cd_refcnt++;
                return (0);
        }

        /* No matching chunks found. Find an empty one to recycle. */
        for (cdp = NULL, i = size; cdp == NULL && i < UNUSED_BUCKETS; i++)
                cdp = LIST_FIRST(&da->unused_chunks[i]);

        if (cdp == NULL)
                LIST_FOREACH(empty_cdp, &da->unused_chunks[0], cd_hash_le)
                        if (empty_cdp->cd_max_size >= size && (cdp == NULL ||
                            empty_cdp->cd_max_size < cdp->cd_max_size)) {
                                cdp = empty_cdp;
                                if (empty_cdp->cd_max_size == size)
                                        break;
                        }

        if (cdp != NULL) {
                /* Copy from heap into the recycled chunk */
                bcopy(&da->range_tbl[fdesc->base], &da->range_tbl[cdp->cd_base],
                    size * sizeof(struct range_entry_long));
                fdesc->base = cdp->cd_base;
                da->rtbl_top -= size;
                da->unused_chunks_size -= cdp->cd_max_size;
                if (cdp->cd_max_size > size) {
                        /* Split the range in two, need a new descriptor */
                        empty_cdp = uma_zalloc(chunk_zone, M_NOWAIT);
                        if (empty_cdp == NULL)
                                return (1);
                        LIST_INSERT_BEFORE(cdp, empty_cdp, cd_all_le);
                        empty_cdp->cd_base = cdp->cd_base + size;
                        empty_cdp->cd_cur_size = 0;
                        empty_cdp->cd_max_size = cdp->cd_max_size - size;

                        i = empty_cdp->cd_max_size;
                        if (i >= UNUSED_BUCKETS)
                                i = 0;
                        LIST_INSERT_HEAD(&da->unused_chunks[i], empty_cdp,
                            cd_hash_le);

                        da->unused_chunks_size += empty_cdp->cd_max_size;
                        cdp->cd_max_size = size;
                }
                LIST_REMOVE(cdp, cd_hash_le);
        } else {
                /* Alloc a new descriptor at the top of the heap */
                cdp = uma_zalloc(chunk_zone, M_NOWAIT);
                if (cdp == NULL)
                        return (1);
                cdp->cd_max_size = size;
                cdp->cd_base = fdesc->base;
                LIST_INSERT_HEAD(&da->all_chunks, cdp, cd_all_le);
                KASSERT(cdp->cd_base + cdp->cd_max_size == da->rtbl_top,
                    ("dxr: %s %d", __FUNCTION__, __LINE__));
        }

        cdp->cd_hash = hash;
        cdp->cd_refcnt = 1;
        cdp->cd_cur_size = size;
        LIST_INSERT_HEAD(&da->chunk_hashtbl[hash & CHUNK_HASH_MASK], cdp,
            cd_hash_le);
        if (da->rtbl_top >= da->rtbl_size) {
                if (da->rtbl_top >= BASE_MAX) {
                        FIB_PRINTF(LOG_ERR, da->fd,
                            "structural limit exceeded at %d "
                            "range table elements", da->rtbl_top);
                        return (1);
                }
                da->rtbl_size += RTBL_SIZE_INCR;
                i = (BASE_MAX - da->rtbl_top) * LOG_DEBUG / BASE_MAX;
                FIB_PRINTF(i, da->fd, "range table at %d%% structural limit",
                    da->rtbl_top * 100 / BASE_MAX);
                da->range_tbl = realloc(da->range_tbl,
                    sizeof(*da->range_tbl) * da->rtbl_size + FRAGS_PREF_SHORT,
                    M_DXRAUX, M_NOWAIT);
                if (da->range_tbl == NULL)
                        return (1);
        }

        return (0);
}

static void
chunk_unref(struct dxr_aux *da, uint32_t chunk)
{
        struct direct_entry *fdesc = &da->direct_tbl[chunk];
        struct chunk_desc *cdp, *cdp2;
        uint32_t base = fdesc->base;
        uint32_t size = chunk_size(da, fdesc);
        uint32_t hash = chunk_hash(da, fdesc);
        int i;

        /* Find the corresponding descriptor */
        LIST_FOREACH(cdp, &da->chunk_hashtbl[hash & CHUNK_HASH_MASK],
            cd_hash_le)
                if (cdp->cd_hash == hash && cdp->cd_cur_size == size &&
                    memcmp(&da->range_tbl[base], &da->range_tbl[cdp->cd_base],
                    sizeof(struct range_entry_long) * size) == 0)
                        break;

        KASSERT(cdp != NULL, ("dxr: dangling chunk"));
        if (--cdp->cd_refcnt > 0)
                return;

        LIST_REMOVE(cdp, cd_hash_le);
        da->unused_chunks_size += cdp->cd_max_size;
        cdp->cd_cur_size = 0;

        /* Attempt to merge with the preceding chunk, if empty */
        cdp2 = LIST_NEXT(cdp, cd_all_le);
        if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
                KASSERT(cdp2->cd_base + cdp2->cd_max_size == cdp->cd_base,
                    ("dxr: %s %d", __FUNCTION__, __LINE__));
                LIST_REMOVE(cdp, cd_all_le);
                LIST_REMOVE(cdp2, cd_hash_le);
                cdp2->cd_max_size += cdp->cd_max_size;
                uma_zfree(chunk_zone, cdp);
                cdp = cdp2;
        }

        /* Attempt to merge with the subsequent chunk, if empty */
        cdp2 = LIST_PREV(cdp, &da->all_chunks, chunk_desc, cd_all_le);
        if (cdp2 != NULL && cdp2->cd_cur_size == 0) {
                KASSERT(cdp->cd_base + cdp->cd_max_size == cdp2->cd_base,
                    ("dxr: %s %d", __FUNCTION__, __LINE__));
                LIST_REMOVE(cdp, cd_all_le);
                LIST_REMOVE(cdp2, cd_hash_le);
                cdp2->cd_max_size += cdp->cd_max_size;
                cdp2->cd_base = cdp->cd_base;
                uma_zfree(chunk_zone, cdp);
                cdp = cdp2;
        }

        if (cdp->cd_base + cdp->cd_max_size == da->rtbl_top) {
                /* Free the chunk on the top of the range heap, trim the heap */
                KASSERT(cdp == LIST_FIRST(&da->all_chunks),
                    ("dxr: %s %d", __FUNCTION__, __LINE__));
                da->rtbl_top -= cdp->cd_max_size;
                da->unused_chunks_size -= cdp->cd_max_size;
                LIST_REMOVE(cdp, cd_all_le);
                uma_zfree(chunk_zone, cdp);
                return;
        }

        i = cdp->cd_max_size;
        if (i >= UNUSED_BUCKETS)
                i = 0;
        LIST_INSERT_HEAD(&da->unused_chunks[i], cdp, cd_hash_le);
}

#ifdef DXR2
static uint32_t
trie_hash(struct dxr_aux *da, uint32_t dxr_x, uint32_t index)
{
        uint32_t i, *val;
        uint32_t hash = 0;

        for (i = 0; i < (1 << dxr_x); i++) {
                hash = (hash << 3) ^ (hash >> 3);
                val = (uint32_t *)
                    (void *) &da->direct_tbl[(index << dxr_x) + i];
                hash += (*val << 5);
                hash += (*val >> 5);
        }

        return (hash + (hash >> 16));
}

static int
trie_ref(struct dxr_aux *da, uint32_t index)
{
        struct trie_desc *tp;
        uint32_t dxr_d = da->d_bits;
        uint32_t dxr_x = DXR_TRIE_BITS - dxr_d;
        uint32_t hash = trie_hash(da, dxr_x, index);

        /* Find an existing descriptor */
        LIST_FOREACH(tp, &da->trie_hashtbl[hash & TRIE_HASH_MASK], td_hash_le)
                if (tp->td_hash == hash &&
                    memcmp(&da->direct_tbl[index << dxr_x],
                    &da->x_tbl[tp->td_index << dxr_x],
                    sizeof(*da->x_tbl) << dxr_x) == 0) {
                        tp->td_refcnt++;
                        da->trietbl[index] = tp;
                        return (tp->td_index);
                }

        tp = LIST_FIRST(&da->unused_trie);
        if (tp != NULL) {
                LIST_REMOVE(tp, td_hash_le);
                da->unused_trie_cnt--;
        } else {
                tp = uma_zalloc(trie_zone, M_NOWAIT);
                if (tp == NULL)
                        return (-1);
                LIST_INSERT_HEAD(&da->all_trie, tp, td_all_le);
                tp->td_index = da->all_trie_cnt++;
        }

        tp->td_hash = hash;
        tp->td_refcnt = 1;
        LIST_INSERT_HEAD(&da->trie_hashtbl[hash & TRIE_HASH_MASK], tp,
           td_hash_le);
        memcpy(&da->x_tbl[tp->td_index << dxr_x],
            &da->direct_tbl[index << dxr_x], sizeof(*da->x_tbl) << dxr_x);
        da->trietbl[index] = tp;
        if (da->all_trie_cnt >= da->xtbl_size >> dxr_x) {
                da->xtbl_size += XTBL_SIZE_INCR;
                da->x_tbl = realloc(da->x_tbl,
                    sizeof(*da->x_tbl) * da->xtbl_size, M_DXRAUX, M_NOWAIT);
                if (da->x_tbl == NULL)
                        return (-1);
        }
        return (tp->td_index);
}

static void
trie_unref(struct dxr_aux *da, uint32_t index)
{
        struct trie_desc *tp = da->trietbl[index];

        if (tp == NULL)
                return;
        da->trietbl[index] = NULL;
        if (--tp->td_refcnt > 0)
                return;

        LIST_REMOVE(tp, td_hash_le);
        da->unused_trie_cnt++;
        if (tp->td_index != da->all_trie_cnt - 1) {
                LIST_INSERT_HEAD(&da->unused_trie, tp, td_hash_le);
                return;
        }

        do {
                da->all_trie_cnt--;
                da->unused_trie_cnt--;
                LIST_REMOVE(tp, td_all_le);
                uma_zfree(trie_zone, tp);
                LIST_FOREACH(tp, &da->unused_trie, td_hash_le)
                        if (tp->td_index == da->all_trie_cnt - 1) {
                                LIST_REMOVE(tp, td_hash_le);
                                break;
                        }
        } while (tp != NULL);
}
#endif

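/*
 * The heap[] array in struct dxr_aux is not a binary heap but a small
 * insertion-sorted stack of the prefixes nesting around the address
 * currently being swept, ordered by ascending prefix length, so that
 * the most specific covering prefix is always on top.
 */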
static void
heap_inject(struct dxr_aux *da, uint32_t start, uint32_t end, uint32_t preflen,
    uint32_t nh)
{
        struct heap_entry *fhp;
        int i;

        for (i = da->heap_index; i >= 0; i--) {
                if (preflen > da->heap[i].preflen)
                        break;
                else if (preflen < da->heap[i].preflen)
                        da->heap[i + 1] = da->heap[i];
                else
                        return;
        }

        fhp = &da->heap[i + 1];
        fhp->preflen = preflen;
        fhp->start = start;
        fhp->end = end;
        fhp->nexthop = nh;
        da->heap_index++;
}

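/*
 * dxr_walk() is the rib_walk_from() callback invoked by update_chunk().
 * It visits routes in ascending address order within a single chunk,
 * pushing each prefix onto the heap and popping prefixes which end
 * before the current one starts; a new range fragment is emitted
 * whenever the effective next hop changes, and adjacent fragments
 * resolving to the same next hop are merged.
 */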
static int
dxr_walk(struct rtentry *rt, void *arg)
{
        struct dxr_aux *da = arg;
        uint32_t chunk = da->work_chunk;
        uint32_t first = chunk << DXR_RANGE_SHIFT;
        uint32_t last = first | DXR_RANGE_MASK;
        struct range_entry_long *fp =
            &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
        struct heap_entry *fhp = &da->heap[da->heap_index];
        uint32_t preflen, nh, start, end, scopeid;
        struct in_addr addr;

        rt_get_inet_prefix_plen(rt, &addr, &preflen, &scopeid);
        start = ntohl(addr.s_addr);
        if (start > last)
                return (-1);    /* Beyond chunk boundaries, we are done */
        if (start < first)
                return (0);     /* Skip this route */

        end = start;
        if (preflen < 32)
                end |= (0xffffffffU >> preflen);
        nh = fib_get_nhop_idx(da->fd, rt_get_raw_nhop(rt));

        if (start == fhp->start)
                heap_inject(da, start, end, preflen, nh);
        else {
                /* start > fhp->start */
                while (start > fhp->end) {
                        uint32_t oend = fhp->end;

                        if (da->heap_index > 0) {
                                fhp--;
                                da->heap_index--;
                        } else
                                initheap(da, fhp->end + 1, chunk);
                        if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
                                fp++;
                                da->rtbl_work_frags++;
                                fp->start = (oend + 1) & DXR_RANGE_MASK;
                                fp->nexthop = fhp->nexthop;
                        }
                }
                if (start > ((chunk << DXR_RANGE_SHIFT) | fp->start) &&
                    nh != fp->nexthop) {
                        fp++;
                        da->rtbl_work_frags++;
                        fp->start = start & DXR_RANGE_MASK;
                } else if (da->rtbl_work_frags) {
                        if ((--fp)->nexthop == nh)
                                da->rtbl_work_frags--;
                        else
                                fp++;
                }
                fp->nexthop = nh;
                heap_inject(da, start, end, preflen, nh);
        }

        return (0);
}

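/*
 * update_chunk() rebuilds the range fragments for a single chunk of
 * 2^DXR_RANGE_SHIFT addresses, then picks the most compact encoding
 * for the result: a direct "hit" entry when a single next hop covers
 * the whole chunk, the 16-bit short format when all fragment starts
 * are 256-aligned and no nexthop index exceeds RE_SHORT_MAX_NH, and
 * the long or XL format otherwise.
 */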
static int
update_chunk(struct dxr_aux *da, uint32_t chunk)
{
        struct range_entry_long *fp;
#if DXR_TRIE_BITS < 24
        struct range_entry_short *fps;
        uint32_t start, nh, i;
#endif
        struct heap_entry *fhp;
        uint32_t first = chunk << DXR_RANGE_SHIFT;
        uint32_t last = first | DXR_RANGE_MASK;

        if (da->direct_tbl[chunk].fragments != FRAGS_MARK_HIT)
                chunk_unref(da, chunk);

        initheap(da, first, chunk);

        fp = &da->range_tbl[da->rtbl_top].re;
        da->rtbl_work_frags = 0;
        fp->start = first & DXR_RANGE_MASK;
        fp->nexthop = da->heap[0].nexthop;

        da->dst.sin_addr.s_addr = htonl(first);
        da->mask.sin_addr.s_addr = htonl(~DXR_RANGE_MASK);

        da->work_chunk = chunk;
        rib_walk_from(da->fibnum, AF_INET, RIB_FLAG_LOCKED,
            (struct sockaddr *) &da->dst, (struct sockaddr *) &da->mask,
            dxr_walk, da);

        /* Flush any remaining objects on the heap */
        fp = &da->range_tbl[da->rtbl_top + da->rtbl_work_frags].re;
        fhp = &da->heap[da->heap_index];
        while (fhp->preflen > DXR_TRIE_BITS) {
                uint32_t oend = fhp->end;

                if (da->heap_index > 0) {
                        fhp--;
                        da->heap_index--;
                } else
                        initheap(da, fhp->end + 1, chunk);
                if (fhp->end > oend && fhp->nexthop != fp->nexthop) {
                        /* Have we crossed the upper chunk boundary? */
                        if (oend >= last)
                                break;
                        fp++;
                        da->rtbl_work_frags++;
                        fp->start = (oend + 1) & DXR_RANGE_MASK;
                        fp->nexthop = fhp->nexthop;
                }
        }

        /* Direct hit if the chunk contains only a single fragment */
        if (da->rtbl_work_frags == 0) {
                da->direct_tbl[chunk].base = fp->nexthop;
                da->direct_tbl[chunk].fragments = FRAGS_MARK_HIT;
                return (0);
        }

        da->direct_tbl[chunk].base = da->rtbl_top;
        da->direct_tbl[chunk].fragments = da->rtbl_work_frags;

#if DXR_TRIE_BITS < 24
        /* Check whether the chunk can be more compactly encoded */
        fp = &da->range_tbl[da->rtbl_top].re;
        for (i = 0; i <= da->rtbl_work_frags; i++, fp++)
                if ((fp->start & 0xff) != 0 || fp->nexthop > RE_SHORT_MAX_NH)
                        break;
        if (i == da->rtbl_work_frags + 1) {
                fp = &da->range_tbl[da->rtbl_top].re;
                fps = (void *) fp;
                for (i = 0; i <= da->rtbl_work_frags; i++, fp++, fps++) {
                        start = fp->start;
                        nh = fp->nexthop;
                        fps->start = start >> 8;
                        fps->nexthop = nh;
                }
                fps->start = start >> 8;
                fps->nexthop = nh;
                da->rtbl_work_frags >>= 1;
                da->direct_tbl[chunk].fragments =
                    da->rtbl_work_frags | FRAGS_PREF_SHORT;
        } else
#endif
        if (da->rtbl_work_frags >= FRAGS_MARK_HIT) {
                da->direct_tbl[chunk].fragments = FRAGS_MARK_XL;
                memmove(&da->range_tbl[da->rtbl_top + 1],
                   &da->range_tbl[da->rtbl_top],
                   (da->rtbl_work_frags + 1) * sizeof(*da->range_tbl));
                da->range_tbl[da->rtbl_top].fragments = da->rtbl_work_frags;
                da->rtbl_work_frags++;
        }
        da->rtbl_top += (da->rtbl_work_frags + 1);
        return (chunk_ref(da, chunk));
}

  858 dxr_build(struct dxr *dxr)
  859 {
  860         struct dxr_aux *da = dxr->aux;
  861         struct chunk_desc *cdp;
  862         struct rib_rtable_info rinfo;
  863         struct timeval t0, t1, t2, t3;
  864         uint32_t r_size, dxr_tot_size;
  865         uint32_t i, m, range_rebuild = 0;
  866         uint32_t range_frag;
  867 #ifdef DXR2
  868         struct trie_desc *tp;
  869         uint32_t d_tbl_size, dxr_x, d_size, x_size;
  870         uint32_t ti, trie_rebuild = 0, prev_size = 0;
  871         uint32_t trie_frag;
  872 #endif
  873 
  874         KASSERT(dxr->d == NULL, ("dxr: d not free"));
  875 
  876         if (da == NULL) {
  877                 da = malloc(sizeof(*dxr->aux), M_DXRAUX, M_NOWAIT);
  878                 if (da == NULL)
  879                         return;
  880                 dxr->aux = da;
  881                 da->fibnum = dxr->fibnum;
  882                 da->refcnt = 1;
  883                 LIST_INIT(&da->all_chunks);
  884                 LIST_INIT(&da->all_trie);
  885                 da->rtbl_size = RTBL_SIZE_INCR;
  886                 da->range_tbl = NULL;
  887                 da->xtbl_size = XTBL_SIZE_INCR;
  888                 da->x_tbl = NULL;
  889                 bzero(&da->dst, sizeof(da->dst));
  890                 bzero(&da->mask, sizeof(da->mask));
  891                 da->dst.sin_len = sizeof(da->dst);
  892                 da->mask.sin_len = sizeof(da->mask);
  893                 da->dst.sin_family = AF_INET;
  894                 da->mask.sin_family = AF_INET;
  895         }
  896         if (da->range_tbl == NULL) {
  897                 da->range_tbl = malloc(sizeof(*da->range_tbl) * da->rtbl_size
  898                     + FRAGS_PREF_SHORT, M_DXRAUX, M_NOWAIT);
  899                 if (da->range_tbl == NULL)
  900                         return;
  901                 range_rebuild = 1;
  902         }
  903 #ifdef DXR2
  904         if (da->x_tbl == NULL) {
  905                 da->x_tbl = malloc(sizeof(*da->x_tbl) * da->xtbl_size,
  906                     M_DXRAUX, M_NOWAIT);
  907                 if (da->x_tbl == NULL)
  908                         return;
  909                 trie_rebuild = 1;
  910         }
  911 #endif
  912         da->fd = dxr->fd;
  913 
  914         microuptime(&t0);
  915 
  916         dxr->nh_tbl = fib_get_nhop_array(da->fd);
  917         fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
  918 
  919         if (da->updates_low > da->updates_high)
  920                 range_rebuild = 1;
  921 
  922 range_build:
  923         if (range_rebuild) {
  924                 /* Bulk cleanup */
  925                 bzero(da->chunk_hashtbl, sizeof(da->chunk_hashtbl));
  926                 while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
  927                         LIST_REMOVE(cdp, cd_all_le);
  928                         uma_zfree(chunk_zone, cdp);
  929                 }
  930                 for (i = 0; i < UNUSED_BUCKETS; i++)
  931                         LIST_INIT(&da->unused_chunks[i]);
  932                 da->unused_chunks_size = 0;
  933                 da->rtbl_top = 0;
  934                 da->updates_low = 0;
  935                 da->updates_high = DIRECT_TBL_SIZE - 1;
  936                 memset(da->updates_mask, 0xff, sizeof(da->updates_mask));
  937                 for (i = 0; i < DIRECT_TBL_SIZE; i++) {
  938                         da->direct_tbl[i].fragments = FRAGS_MARK_HIT;
  939                         da->direct_tbl[i].base = 0;
  940                 }
  941         }
  942         da->prefixes = rinfo.num_prefixes;
  943 
  944         /* DXR: construct direct & range table */
  945         for (i = da->updates_low; i <= da->updates_high; i++) {
  946                 m = da->updates_mask[i >> 5] >> (i & 0x1f);
  947                 if (m == 0)
  948                         i |= 0x1f;
  949                 else if (m & 1 && update_chunk(da, i) != 0)
  950                         return;
  951         }
  952 
  953         range_frag = 0;
  954         if (da->rtbl_top)
  955                 range_frag = da->unused_chunks_size * 10000ULL / da->rtbl_top;
  956         if (range_frag > V_frag_limit) {
  957                 range_rebuild = 1;
  958                 goto range_build;
  959         }
  960 
  961         r_size = sizeof(*da->range_tbl) * da->rtbl_top;
  962         microuptime(&t1);
  963 
  964 #ifdef DXR2
  965         if (range_rebuild ||
  966             abs(fls(da->prefixes) - fls(da->trie_rebuilt_prefixes)) > 1)
  967                 trie_rebuild = 1;
  968 
  969 trie_build:
  970         if (trie_rebuild) {
  971                 da->trie_rebuilt_prefixes = da->prefixes;
  972                 da->d_bits = DXR_D;
  973                 da->updates_low = 0;
  974                 da->updates_high = DIRECT_TBL_SIZE - 1;
  975                 if (!range_rebuild)
  976                         memset(da->updates_mask, 0xff,
  977                             sizeof(da->updates_mask));
  978         }
  979 
  980 dxr2_try_squeeze:
  981         if (trie_rebuild) {
  982                 /* Bulk cleanup */
  983                 bzero(da->trietbl, sizeof(da->trietbl));
  984                 bzero(da->trie_hashtbl, sizeof(da->trie_hashtbl));
  985                 while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
  986                         LIST_REMOVE(tp, td_all_le);
  987                         uma_zfree(trie_zone, tp);
  988                 }
  989                 LIST_INIT(&da->unused_trie);
  990                 da->all_trie_cnt = da->unused_trie_cnt = 0;
  991         }
  992 
  993         /* Populate d_tbl, x_tbl */
  994         dxr_x = DXR_TRIE_BITS - da->d_bits;
  995         d_tbl_size = (1 << da->d_bits);
  996 
  997         for (i = da->updates_low >> dxr_x; i <= da->updates_high >> dxr_x;
  998             i++) {
  999                 if (!trie_rebuild) {
 1000                         m = 0;
 1001                         for (int j = 0; j < (1 << dxr_x); j += 32)
 1002                                 m |= da->updates_mask[((i << dxr_x) + j) >> 5];
 1003                         if (m == 0)
 1004                                 continue;
 1005                         trie_unref(da, i);
 1006                 }
 1007                 ti = trie_ref(da, i);
 1008                 if (ti < 0)
 1009                         return;
 1010                 da->d_tbl[i] = ti;
 1011         }
 1012 
 1013         trie_frag = 0;
 1014         if (da->all_trie_cnt)
 1015                 trie_frag = da->unused_trie_cnt * 10000ULL / da->all_trie_cnt;
 1016         if (trie_frag > V_frag_limit) {
 1017                 trie_rebuild = 1;
 1018                 goto trie_build;
 1019         }
 1020 
 1021         d_size = sizeof(*da->d_tbl) * d_tbl_size;
 1022         x_size = sizeof(*da->x_tbl) * DIRECT_TBL_SIZE / d_tbl_size
 1023             * da->all_trie_cnt;
 1024         dxr_tot_size = d_size + x_size + r_size;
 1025 
 1026         if (trie_rebuild == 1) {
 1027                 /* Try to find a more compact D/X split */
 1028                 if (prev_size == 0 || dxr_tot_size <= prev_size)
 1029                         da->d_bits--;
 1030                 else {
 1031                         da->d_bits++;
 1032                         trie_rebuild = 2;
 1033                 }
 1034                 prev_size = dxr_tot_size;
 1035                 goto dxr2_try_squeeze;
 1036         }
 1037         microuptime(&t2);
 1038 #else /* !DXR2 */
 1039         dxr_tot_size = sizeof(da->direct_tbl) + r_size;
 1040         t2 = t1;
 1041 #endif
 1042 
 1043         dxr->d = malloc(dxr_tot_size, M_DXRLPM, M_NOWAIT);
 1044         if (dxr->d == NULL)
 1045                 return;
 1046 #ifdef DXR2
 1047         memcpy(dxr->d, da->d_tbl, d_size);
 1048         dxr->x = ((char *) dxr->d) + d_size;
 1049         memcpy(dxr->x, da->x_tbl, x_size);
 1050         dxr->r = ((char *) dxr->x) + x_size;
 1051         dxr->d_shift = 32 - da->d_bits;
 1052         dxr->x_shift = dxr_x;
 1053         dxr->x_mask = 0xffffffffU >> (32 - dxr_x);
 1054 #else /* !DXR2 */
 1055         memcpy(dxr->d, da->direct_tbl, sizeof(da->direct_tbl));
 1056         dxr->r = ((char *) dxr->d) + sizeof(da->direct_tbl);
 1057 #endif
 1058         memcpy(dxr->r, da->range_tbl, r_size);
 1059 
 1060         if (da->updates_low <= da->updates_high)
 1061                 bzero(&da->updates_mask[da->updates_low / 32],
 1062                     (da->updates_high - da->updates_low) / 8 + 1);
 1063         da->updates_low = DIRECT_TBL_SIZE - 1;
 1064         da->updates_high = 0;
 1065         microuptime(&t3);
 1066 
 1067 #ifdef DXR2
 1068         FIB_PRINTF(LOG_INFO, da->fd, "D%dX%dR, %d prefixes, %d nhops (max)",
 1069             da->d_bits, dxr_x, rinfo.num_prefixes, rinfo.num_nhops);
 1070 #else
 1071         FIB_PRINTF(LOG_INFO, da->fd, "D%dR, %d prefixes, %d nhops (max)",
 1072             DXR_D, rinfo.num_prefixes, rinfo.num_nhops);
 1073 #endif
 1074         i = dxr_tot_size * 100;
 1075         if (rinfo.num_prefixes)
 1076                 i /= rinfo.num_prefixes;
 1077         FIB_PRINTF(LOG_INFO, da->fd, "%d.%02d KBytes, %d.%02d Bytes/prefix",
 1078             dxr_tot_size / 1024, dxr_tot_size * 100 / 1024 % 100,
 1079             i / 100, i % 100);
 1080 #ifdef DXR2
 1081         FIB_PRINTF(LOG_INFO, da->fd,
 1082             "%d.%02d%% trie, %d.%02d%% range fragmentation",
 1083             trie_frag / 100, trie_frag % 100,
 1084             range_frag / 100, range_frag % 100);
 1085 #else
 1086         FIB_PRINTF(LOG_INFO, da->fd, "%d.%01d%% range fragmentation",
 1087             range_frag / 100, range_frag % 100);
 1088 #endif
 1089         i = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
 1090         FIB_PRINTF(LOG_INFO, da->fd, "range table %s in %u.%03u ms",
 1091             range_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
 1092 #ifdef DXR2
 1093         i = (t2.tv_sec - t1.tv_sec) * 1000000 + t2.tv_usec - t1.tv_usec;
 1094         FIB_PRINTF(LOG_INFO, da->fd, "trie %s in %u.%03u ms",
 1095             trie_rebuild ? "rebuilt" : "updated", i / 1000, i % 1000);
 1096 #endif
 1097         i = (t3.tv_sec - t2.tv_sec) * 1000000 + t3.tv_usec - t2.tv_usec;
 1098         FIB_PRINTF(LOG_INFO, da->fd, "snapshot forked in %u.%03u ms",
 1099             i / 1000, i % 1000);
 1100 }

/*
 * Glue functions for attaching to FreeBSD 13 fib_algo infrastructure.
 */

static struct nhop_object *
dxr_fib_lookup(void *algo_data, const struct flm_lookup_key key,
    uint32_t scopeid)
{
        struct dxr *dxr = algo_data;

        return (dxr->nh_tbl[dxr_lookup(dxr, ntohl(key.addr4.s_addr))]);
}

static enum flm_op_result
dxr_init(uint32_t fibnum, struct fib_data *fd, void *old_data, void **data)
{
        struct dxr *old_dxr = old_data;
        struct dxr_aux *da = NULL;
        struct dxr *dxr;

        dxr = malloc(sizeof(*dxr), M_DXRAUX, M_NOWAIT);
        if (dxr == NULL)
                return (FLM_REBUILD);

        /* Check whether we may reuse the old auxiliary structures */
        if (old_dxr != NULL && old_dxr->aux != NULL) {
                da = old_dxr->aux;
                atomic_add_int(&da->refcnt, 1);
        }

        dxr->aux = da;
        dxr->d = NULL;
        dxr->fd = fd;
        dxr->fibnum = fibnum;
        *data = dxr;

        return (FLM_SUCCESS);
}

static void
dxr_destroy(void *data)
{
        struct dxr *dxr = data;
        struct dxr_aux *da;
        struct chunk_desc *cdp;
        struct trie_desc *tp;

        if (dxr->d != NULL)
                free(dxr->d, M_DXRLPM);

        da = dxr->aux;
        free(dxr, M_DXRAUX);

        if (da == NULL || atomic_fetchadd_int(&da->refcnt, -1) > 1)
                return;

        /* Release auxiliary structures */
        while ((cdp = LIST_FIRST(&da->all_chunks)) != NULL) {
                LIST_REMOVE(cdp, cd_all_le);
                uma_zfree(chunk_zone, cdp);
        }
        while ((tp = LIST_FIRST(&da->all_trie)) != NULL) {
                LIST_REMOVE(tp, td_all_le);
                uma_zfree(trie_zone, tp);
        }
        free(da->range_tbl, M_DXRAUX);
        free(da->x_tbl, M_DXRAUX);
        free(da, M_DXRAUX);
}

static void
epoch_dxr_destroy(epoch_context_t ctx)
{
        struct dxr *dxr = __containerof(ctx, struct dxr, epoch_ctx);

        dxr_destroy(dxr);
}

static void *
choose_lookup_fn(struct dxr_aux *da)
{

#ifdef DXR2
        switch (da->d_bits) {
#if DXR_TRIE_BITS > 16
        case 16:
                return (dxr_fib_lookup_16);
#endif
        case 15:
                return (dxr_fib_lookup_15);
        case 14:
                return (dxr_fib_lookup_14);
        case 13:
                return (dxr_fib_lookup_13);
        case 12:
                return (dxr_fib_lookup_12);
        case 11:
                return (dxr_fib_lookup_11);
        case 10:
                return (dxr_fib_lookup_10);
        case 9:
                return (dxr_fib_lookup_9);
        }
#endif /* DXR2 */
        return (dxr_fib_lookup);
}

static enum flm_op_result
dxr_dump_end(void *data, struct fib_dp *dp)
{
        struct dxr *dxr = data;
        struct dxr_aux *da;

        dxr_build(dxr);

        da = dxr->aux;
        if (da == NULL)
                return (FLM_REBUILD);

        /* Structural limit exceeded, hard error */
        if (da->rtbl_top >= BASE_MAX)
                return (FLM_ERROR);

        /* A malloc(,, M_NOWAIT) failed somewhere, retry later */
        if (dxr->d == NULL)
                return (FLM_REBUILD);

        dp->f = choose_lookup_fn(da);
        dp->arg = dxr;

        return (FLM_SUCCESS);
}

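/*
 * The fib_algo framework dumps the RIB through dxr_dump_rib_item()
 * before calling dxr_dump_end().  The per-route callback is a no-op
 * here because dxr_build() re-reads the RIB itself via rib_walk_from();
 * individual RIB changes are likewise deferred by returning FLM_BATCH
 * below, so that they can be coalesced into one incremental rebuild.
 */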
static enum flm_op_result
dxr_dump_rib_item(struct rtentry *rt, void *data)
{

        return (FLM_SUCCESS);
}

static enum flm_op_result
dxr_change_rib_item(struct rib_head *rnh, struct rib_cmd_info *rc,
    void *data)
{

        return (FLM_BATCH);
}

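/*
 * Batched RIB changes are applied copy-on-write: the affected chunks
 * are flagged in updates_mask, a new struct dxr snapshot is built
 * (sharing the auxiliary state), and the datapath pointer is switched
 * over to it atomically; the old snapshot is reclaimed through
 * fib_epoch_call() once no CPU can still be reading it.
 */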
static enum flm_op_result
dxr_change_rib_batch(struct rib_head *rnh, struct fib_change_queue *q,
    void *data)
{
        struct dxr *dxr = data;
        struct dxr *new_dxr;
        struct dxr_aux *da;
        struct fib_dp new_dp;
        enum flm_op_result res;
        uint32_t ip, plen, hmask, start, end, i, ui;
#ifdef INVARIANTS
        struct rib_rtable_info rinfo;
        int update_delta = 0;
#endif

        KASSERT(data != NULL, ("%s: NULL data", __FUNCTION__));
        KASSERT(q != NULL, ("%s: NULL q", __FUNCTION__));
        KASSERT(q->count < q->size, ("%s: q->count %d q->size %d",
            __FUNCTION__, q->count, q->size));

        da = dxr->aux;
        KASSERT(da != NULL, ("%s: NULL dxr->aux", __FUNCTION__));

        FIB_PRINTF(LOG_INFO, da->fd, "processing %d update(s)", q->count);
        for (ui = 0; ui < q->count; ui++) {
#ifdef INVARIANTS
                if (q->entries[ui].nh_new != NULL)
                        update_delta++;
                if (q->entries[ui].nh_old != NULL)
                        update_delta--;
#endif
                plen = q->entries[ui].plen;
                ip = ntohl(q->entries[ui].addr4.s_addr);
                if (plen < 32)
                        hmask = 0xffffffffU >> plen;
                else
                        hmask = 0;
                start = (ip & ~hmask) >> DXR_RANGE_SHIFT;
                end = (ip | hmask) >> DXR_RANGE_SHIFT;

                if ((start & 0x1f) == 0 && (end & 0x1f) == 0x1f)
                        for (i = start >> 5; i <= end >> 5; i++)
                                da->updates_mask[i] = 0xffffffffU;
                else
                        for (i = start; i <= end; i++)
                                da->updates_mask[i >> 5] |= (1 << (i & 0x1f));
                if (start < da->updates_low)
                        da->updates_low = start;
                if (end > da->updates_high)
                        da->updates_high = end;
        }

#ifdef INVARIANTS
        fib_get_rtable_info(fib_get_rh(da->fd), &rinfo);
        KASSERT(da->prefixes + update_delta == rinfo.num_prefixes,
            ("%s: update count mismatch", __FUNCTION__));
#endif

        res = dxr_init(0, dxr->fd, data, (void **) &new_dxr);
        if (res != FLM_SUCCESS)
                return (res);

        dxr_build(new_dxr);

        /* Structural limit exceeded, hard error */
        if (da->rtbl_top >= BASE_MAX) {
                dxr_destroy(new_dxr);
                return (FLM_ERROR);
        }

        /* A malloc(,, M_NOWAIT) failed somewhere, retry later */
        if (new_dxr->d == NULL) {
                dxr_destroy(new_dxr);
                return (FLM_REBUILD);
        }

        new_dp.f = choose_lookup_fn(da);
        new_dp.arg = new_dxr;
        if (fib_set_datapath_ptr(dxr->fd, &new_dp)) {
                fib_set_algo_ptr(dxr->fd, new_dxr);
                fib_epoch_call(epoch_dxr_destroy, &dxr->epoch_ctx);
                return (FLM_SUCCESS);
        }

        dxr_destroy(new_dxr);
        return (FLM_REBUILD);
}

static uint8_t
dxr_get_pref(const struct rib_rtable_info *rinfo)
{

        /* Below bsearch4 up to 10 prefixes. Always supersedes dpdk_lpm4. */
        return (251);
}

SYSCTL_DECL(_net_route_algo);
SYSCTL_NODE(_net_route_algo, OID_AUTO, dxr, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "DXR tunables");

static int
sysctl_dxr_frag_limit(SYSCTL_HANDLER_ARGS)
{
        char buf[8];
        int error, new, i;

        snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
            V_frag_limit % 100);
        error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (!isdigit(*buf) && *buf != '.')
                return (EINVAL);
        for (i = 0, new = 0; isdigit(buf[i]) && i < sizeof(buf); i++)
                new = new * 10 + buf[i] - '0';
        new *= 100;
        if (buf[i++] == '.') {
                if (!isdigit(buf[i]))
                        return (EINVAL);
                new += (buf[i++] - '0') * 10;
                if (isdigit(buf[i]))
                        new += buf[i++] - '0';
        }
        if (new > 1000)
                return (EINVAL);
        V_frag_limit = new;
        snprintf(buf, sizeof(buf), "%d.%02d%%", V_frag_limit / 100,
            V_frag_limit % 100);
        return (0);
}

SYSCTL_PROC(_net_route_algo_dxr, OID_AUTO, frag_limit,
    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_VNET,
    0, 0, sysctl_dxr_frag_limit, "A",
    "Fragmentation threshold to full rebuild");
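
/*
 * The handler above accepts a decimal percentage with up to two
 * fractional digits, e.g. (a hypothetical invocation):
 *
 *	sysctl net.route.algo.dxr.frag_limit=2.50
 *
 * which stores 250 in V_frag_limit; the ceiling of 1000 caps the
 * threshold at 10.00%.
 */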

static struct fib_lookup_module fib_dxr_mod = {
        .flm_name = "dxr",
        .flm_family = AF_INET,
        .flm_init_cb = dxr_init,
        .flm_destroy_cb = dxr_destroy,
        .flm_dump_rib_item_cb = dxr_dump_rib_item,
        .flm_dump_end_cb = dxr_dump_end,
        .flm_change_rib_item_cb = dxr_change_rib_item,
        .flm_change_rib_items_cb = dxr_change_rib_batch,
        .flm_get_pref = dxr_get_pref,
};

static int
dxr_modevent(module_t mod, int type, void *unused)
{
        int error;

        switch (type) {
        case MOD_LOAD:
                chunk_zone = uma_zcreate("dxr chunk", sizeof(struct chunk_desc),
                    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
                trie_zone = uma_zcreate("dxr trie", sizeof(struct trie_desc),
                    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
                fib_module_register(&fib_dxr_mod);
                return (0);
        case MOD_UNLOAD:
                error = fib_module_unregister(&fib_dxr_mod);
                if (error)
                        return (error);
                uma_zdestroy(chunk_zone);
                uma_zdestroy(trie_zone);
                return (0);
        default:
                return (EOPNOTSUPP);
        }
}

static moduledata_t dxr_mod = {"fib_dxr", dxr_modevent, 0};

DECLARE_MODULE(fib_dxr, dxr_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
MODULE_VERSION(fib_dxr, 1);
