
FreeBSD/Linux Kernel Cross Reference
sys/net/route/fib_algo.c


    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2020 Alexander V. Chernikov
    5  *
    6  * Redistribution and use in source and binary forms, with or without
    7  * modification, are permitted provided that the following conditions
    8  * are met:
    9  * 1. Redistributions of source code must retain the above copyright
   10  *    notice, this list of conditions and the following disclaimer.
   11  * 2. Redistributions in binary form must reproduce the above copyright
   12  *    notice, this list of conditions and the following disclaimer in the
   13  *    documentation and/or other materials provided with the distribution.
   14  *
   15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   25  * SUCH DAMAGE.
   26  */
   27 
   28 #include <sys/cdefs.h>
   29 __FBSDID("$FreeBSD$");
   30 #include "opt_inet.h"
   31 #include "opt_inet6.h"
   32 #include "opt_route.h"
   33 
   34 #include <sys/param.h>
   35 #include <sys/eventhandler.h>
   36 #include <sys/kernel.h>
   37 #include <sys/sbuf.h>
   38 #include <sys/lock.h>
   39 #include <sys/rmlock.h>
   40 #include <sys/malloc.h>
   41 #include <sys/mbuf.h>
   42 #include <sys/module.h>
   43 #include <sys/kernel.h>
   44 #include <sys/priv.h>
   45 #include <sys/proc.h>
   46 #include <sys/socket.h>
   47 #include <sys/socketvar.h>
   48 #include <sys/sysctl.h>
   49 #include <sys/syslog.h>
   50 #include <sys/queue.h>
   51 #include <net/vnet.h>
   52 
   53 #include <net/if.h>
   54 #include <net/if_var.h>
   55 
   56 #include <netinet/in.h>
   57 #include <netinet/in_var.h>
   58 #include <netinet/ip.h>
   59 #include <netinet/ip_var.h>
   60 #ifdef INET6
   61 #include <netinet/ip6.h>
   62 #include <netinet6/ip6_var.h>
   63 #endif
   64 
   65 #include <net/route.h>
   66 #include <net/route/nhop.h>
   67 #include <net/route/route_ctl.h>
   68 #include <net/route/route_var.h>
   69 #include <net/route/fib_algo.h>
   70 
   71 #include <machine/stdarg.h>
   72 
    73 /*
    74  * Fib lookup framework.
    75  *
    76  * This framework enables accelerated longest-prefix-match lookups for the
    77  *  routing tables by adding the ability to dynamically attach/detach lookup
    78  *  algorithm implementations to/from the datapath.
    79  *
    80  * flm - fib lookup module - implementation of a particular lookup algorithm
    81  * fd - fib data - instance of an flm bound to a specific routing table
    82  *
    83  * This file provides the main framework functionality.
    84  *
    85  * The framework provides the following features:
    86  *
    87  * 1) nexthop abstraction -> provides transparent referencing, indexing
    88  *   and efficient idx->ptr mappings for nexthops and nexthop groups.
    89  * 2) routing table synchronisation
    90  * 3) dataplane attachment points
    91  * 4) automatic algorithm selection based on the provided preference
    92  *
    93  *
    94  * DATAPATH
    95  * For each supported address family, there is an allocated array of fib_dp
    96  *  structures, indexed by fib number. Each array entry contains a callback
    97  *  function and its argument. The function is called with a family-specific
    98  *  lookup key, scope and the provided argument. This array is re-created
    99  *  every time a new algo instance is created. See the
   100  *  replace_rtables_family() function for more details.
   101  *
   102  */
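       /*
        * Illustrative sketch (not part of the original file): a datapath
        * consumer dispatches a lookup through the per-fib fib_dp entry
        * roughly as follows; the array name and key setup here are
        * hypothetical, as the real per-family arrays live outside this file:
        *
        *      struct fib_dp *dp = &fib_dp_array[fibnum];
        *      struct nhop_object *nh = dp->f(dp->arg, key, scopeid);
        *
        * dp->f points at the active algorithm's lookup function and dp->arg
        * at its private state, so switching algorithms amounts to publishing
        * a new epoch-protected array.
        */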
  103 
  104 SYSCTL_DECL(_net_route);
  105 SYSCTL_NODE(_net_route, OID_AUTO, algo, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  106     "Fib algorithm lookups");
  107 
  108 /* Algorithm sync policy */
  109 
  110 /* Time interval to bucket updates */
  111 VNET_DEFINE_STATIC(unsigned int, update_bucket_time_ms) = 50;
  112 #define V_update_bucket_time_ms VNET(update_bucket_time_ms)
  113 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_time_ms, CTLFLAG_RW | CTLFLAG_VNET,
  114     &VNET_NAME(update_bucket_time_ms), 0, "Time interval to calculate update rate");
  115 
  116 /* Minimum update rate to delay sync */
  117 VNET_DEFINE_STATIC(unsigned int, bucket_change_threshold_rate) = 500;
  118 #define V_bucket_change_threshold_rate  VNET(bucket_change_threshold_rate)
  119 SYSCTL_UINT(_net_route_algo, OID_AUTO, bucket_change_threshold_rate, CTLFLAG_RW | CTLFLAG_VNET,
  120     &VNET_NAME(bucket_change_threshold_rate), 0, "Minimum update rate to delay sync");
  121 
  122 /* Max allowed delay to sync */
  123 VNET_DEFINE_STATIC(unsigned int, fib_max_sync_delay_ms) = 1000;
  124 #define V_fib_max_sync_delay_ms VNET(fib_max_sync_delay_ms)
  125 SYSCTL_UINT(_net_route_algo, OID_AUTO, fib_max_sync_delay_ms, CTLFLAG_RW | CTLFLAG_VNET,
  126     &VNET_NAME(fib_max_sync_delay_ms), 0, "Maximum time to delay sync (ms)");
  127 
  128 
  129 #ifdef INET6
  130 VNET_DEFINE_STATIC(bool, algo_fixed_inet6) = false;
  131 #define V_algo_fixed_inet6      VNET(algo_fixed_inet6)
  132 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  133     "IPv6 longest prefix match lookups");
  134 #endif
  135 #ifdef INET
  136 VNET_DEFINE_STATIC(bool, algo_fixed_inet) = false;
  137 #define V_algo_fixed_inet       VNET(algo_fixed_inet)
  138 SYSCTL_NODE(_net_route_algo, OID_AUTO, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  139     "IPv4 longest prefix match lookups");
  140 #endif
  141 
  142 /* Fib instance counter */
  143 static uint32_t fib_gen = 0;
  144 
  145 struct nhop_ref_table {
  146         uint32_t                count;
  147         int32_t                 refcnt[0];
  148 };
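       /*
        * Illustrative note (not part of the original file): refcnt[] is a
        * zero-length (flexible) array member, so the whole table is sized
        * and allocated in one shot, mirroring try_setup_fd_instance() below:
        *
        *      size = sizeof(struct nhop_ref_table) +
        *          number_nhops * sizeof(uint32_t);
        *      nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
        */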
  149 
  150 enum fib_callout_action {
  151         FDA_NONE,       /* No callout scheduled */
  152         FDA_REBUILD,    /* Asks to rebuild algo instance */
   153         FDA_EVAL,       /* Asks to evaluate if the current algo is still the best */
  154         FDA_BATCH,      /* Asks to submit batch of updates to the algo */
  155 };
  156 
  157 struct fib_sync_status {
  158         struct timeval          diverge_time;   /* ts when diverged */
  159         uint32_t                num_changes;    /* number of changes since sync */
  160         uint32_t                bucket_changes; /* num changes within the current bucket */
  161         uint64_t                bucket_id;      /* 50ms bucket # */
  162         struct fib_change_queue fd_change_queue;/* list of scheduled entries */
  163 };
  164 
  165 /*
  166  * Data structure for the fib lookup instance tied to the particular rib.
  167  */
  168 struct fib_data {
  169         uint32_t                number_nhops;   /* current # of nhops */
  170         uint8_t                 hit_nhops;      /* true if out of nhop limit */
   171         uint8_t                 init_done;      /* true if init is completed */
  172         uint32_t                fd_dead:1;      /* Scheduled for deletion */
  173         uint32_t                fd_linked:1;    /* true if linked */
  174         uint32_t                fd_need_rebuild:1;      /* true if rebuild scheduled */
  175         uint32_t                fd_batch:1;     /* true if batched notification scheduled */
  176         uint8_t                 fd_family;      /* family */
  177         uint32_t                fd_fibnum;      /* fibnum */
  178         uint32_t                fd_failed_rebuilds;     /* stat: failed rebuilds */
  179         uint32_t                fd_gen;         /* instance gen# */
  180         struct callout          fd_callout;     /* rebuild callout */
  181         enum fib_callout_action fd_callout_action;      /* Callout action to take */
  182         void                    *fd_algo_data;  /* algorithm data */
  183         struct nhop_object      **nh_idx;       /* nhop idx->ptr array */
  184         struct nhop_ref_table   *nh_ref_table;  /* array with # of nhop references */
  185         struct rib_head         *fd_rh;         /* RIB table we're attached to */
  186         struct rib_subscription *fd_rs;         /* storing table subscription */
  187         struct fib_dp           fd_dp;          /* fib datapath data */
  188         struct vnet             *fd_vnet;       /* vnet fib belongs to */
  189         struct epoch_context    fd_epoch_ctx;   /* epoch context for deletion */
  190         struct fib_lookup_module        *fd_flm;/* pointer to the lookup module */
  191         struct fib_sync_status  fd_ss;          /* State relevant to the rib sync  */
  192         uint32_t                fd_num_changes; /* number of changes since last callout */
  193         TAILQ_ENTRY(fib_data)   entries;        /* list of all fds in vnet */
  194 };
  195 
  196 static bool rebuild_fd(struct fib_data *fd, const char *reason);
  197 static bool rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new);
  198 static void handle_fd_callout(void *_data);
  199 static void destroy_fd_instance_epoch(epoch_context_t ctx);
  200 static bool is_idx_free(struct fib_data *fd, uint32_t index);
  201 static void set_algo_fixed(struct rib_head *rh);
  202 static bool is_algo_fixed(struct rib_head *rh);
  203 
  204 static uint32_t fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh);
  205 static void fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh);
  206 
  207 static struct fib_lookup_module *fib_check_best_algo(struct rib_head *rh,
  208     struct fib_lookup_module *orig_flm);
  209 static void fib_unref_algo(struct fib_lookup_module *flm);
  210 static bool flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum);
  211 
  212 struct mtx fib_mtx;
  213 #define FIB_MOD_LOCK()          mtx_lock(&fib_mtx)
  214 #define FIB_MOD_UNLOCK()        mtx_unlock(&fib_mtx)
  215 #define FIB_MOD_LOCK_ASSERT()   mtx_assert(&fib_mtx, MA_OWNED)
  216 
  217 MTX_SYSINIT(fib_mtx, &fib_mtx, "algo list mutex", MTX_DEF);
  218 
  219 /* Algorithm has to be this percent better than the current to switch */
  220 #define BEST_DIFF_PERCENT       (5 * 256 / 100)
  221 /* Schedule algo re-evaluation X seconds after a change */
  222 #define ALGO_EVAL_DELAY_MS      30000
  223 /* Force algo re-evaluation after X changes */
  224 #define ALGO_EVAL_NUM_ROUTES    100
  225 /* Try to setup algorithm X times */
  226 #define FIB_MAX_TRIES           32
  227 /* Max amount of supported nexthops */
  228 #define FIB_MAX_NHOPS           262144
  229 #define FIB_CALLOUT_DELAY_MS    50
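       /*
        * Illustrative arithmetic (not part of the original file):
        * BEST_DIFF_PERCENT encodes 5% in 256-based fixed point:
        * 5 * 256 / 100 == 12 (12.8 truncated by integer division). It is
        * compared against algorithm preference values that appear to use
        * the same 0..255 scale (see fib_check_best_algo()).
        */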
  230 
  231 
  232 /* Debug */
  233 static int flm_debug_level = LOG_NOTICE;
  234 SYSCTL_INT(_net_route_algo, OID_AUTO, debug_level, CTLFLAG_RW | CTLFLAG_RWTUN,
  235     &flm_debug_level, 0, "debuglevel");
  236 #define FLM_MAX_DEBUG_LEVEL     LOG_DEBUG
  237 #ifndef LOG_DEBUG2
  238 #define LOG_DEBUG2      8
  239 #endif
  240 
  241 #define _PASS_MSG(_l)   (flm_debug_level >= (_l))
  242 #define ALGO_PRINTF(_l, _fmt, ...)      if (_PASS_MSG(_l)) {            \
  243         printf("[fib_algo] %s: " _fmt "\n", __func__, ##__VA_ARGS__);   \
  244 }
  245 #define _ALGO_PRINTF(_fib, _fam, _aname, _gen, _func, _fmt, ...) \
  246     printf("[fib_algo] %s.%u (%s#%u) %s: " _fmt "\n",\
  247     print_family(_fam), _fib, _aname, _gen, _func, ## __VA_ARGS__)
  248 #define _RH_PRINTF(_fib, _fam, _func, _fmt, ...) \
  249     printf("[fib_algo] %s.%u %s: " _fmt "\n", print_family(_fam), _fib, _func, ## __VA_ARGS__)
  250 #define RH_PRINTF(_l, _rh, _fmt, ...)   if (_PASS_MSG(_l)) {    \
  251     _RH_PRINTF(_rh->rib_fibnum, _rh->rib_family, __func__, _fmt, ## __VA_ARGS__);\
  252 }
  253 #define FD_PRINTF(_l, _fd, _fmt, ...)   FD_PRINTF_##_l(_l, _fd, _fmt, ## __VA_ARGS__)
  254 #define _FD_PRINTF(_l, _fd, _fmt, ...)  if (_PASS_MSG(_l)) {            \
  255     _ALGO_PRINTF(_fd->fd_fibnum, _fd->fd_family, _fd->fd_flm->flm_name, \
  256     _fd->fd_gen, __func__, _fmt, ## __VA_ARGS__);                       \
  257 }
  258 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG2
  259 #define FD_PRINTF_LOG_DEBUG2    _FD_PRINTF
  260 #else
  261 #define FD_PRINTF_LOG_DEBUG2(_l, _fd, _fmt, ...)
  262 #endif
  263 #if FLM_MAX_DEBUG_LEVEL>=LOG_DEBUG
  264 #define FD_PRINTF_LOG_DEBUG     _FD_PRINTF
  265 #else
   266 #define FD_PRINTF_LOG_DEBUG(_l, _fd, _fmt, ...)
  267 #endif
  268 #if FLM_MAX_DEBUG_LEVEL>=LOG_INFO
  269 #define FD_PRINTF_LOG_INFO      _FD_PRINTF
  270 #else
   271 #define FD_PRINTF_LOG_INFO(_l, _fd, _fmt, ...)
  272 #endif
  273 #define FD_PRINTF_LOG_NOTICE    _FD_PRINTF
  274 #define FD_PRINTF_LOG_ERR       _FD_PRINTF
  275 #define FD_PRINTF_LOG_WARNING   _FD_PRINTF
  276 
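       /*
        * Illustrative note (not part of the original file): FD_PRINTF()
        * dispatches on the compile-time log level via token pasting, e.g.
        *
        *      FD_PRINTF(LOG_DEBUG2, fd, "nhop %u", idx);
        *
        * expands to FD_PRINTF_LOG_DEBUG2(...), which is an empty macro when
        * FLM_MAX_DEBUG_LEVEL < LOG_DEBUG2, so disabled levels compile away
        * to nothing.
        */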
  277 
  278 /* List of all registered lookup algorithms */
  279 static TAILQ_HEAD(, fib_lookup_module) all_algo_list = TAILQ_HEAD_INITIALIZER(all_algo_list);
  280 
  281 /* List of all fib lookup instances in the vnet */
  282 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_data_head, fib_data), fib_data_list);
  283 #define V_fib_data_list VNET(fib_data_list)
  284 
   285 /* Data structure for storing non-transient fib lookup module failures */
  286 struct fib_error {
  287         int                             fe_family;
  288         uint32_t                        fe_fibnum;      /* failed rtable */
  289         struct fib_lookup_module        *fe_flm;        /* failed module */
  290         TAILQ_ENTRY(fib_error)          entries;/* list of all errored entries */
  291 };
  292 VNET_DEFINE_STATIC(TAILQ_HEAD(fib_error_head, fib_error), fib_error_list);
  293 #define V_fib_error_list VNET(fib_error_list)
  294 
  295 /* Per-family array of fibnum -> {func, arg} mappings used in datapath */
  296 struct fib_dp_header {
  297         struct epoch_context    fdh_epoch_ctx;
  298         uint32_t                fdh_num_tables;
  299         struct fib_dp           fdh_idx[0];
  300 };
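       /*
        * Illustrative sketch (not part of the original file): fdh_idx[0] is
        * a flexible array member, so the header plus one fib_dp slot per
        * fib come from a single allocation, e.g.:
        *
        *      sz = sizeof(struct fib_dp_header) +
        *          num_tables * sizeof(struct fib_dp);
        *      fdh = malloc(sz, M_RTABLE, M_WAITOK | M_ZERO);
        *      fdh->fdh_num_tables = num_tables;
        *
        * The embedded epoch context lets the array be freed only after all
        * concurrent datapath readers have drained.
        */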
  301 
  302 /*
   303  * Tries to add a new non-transient algorithm error to the list of
   304  *  errors.
  305  * Returns true on success.
  306  */
  307 static bool
  308 flm_error_add(struct fib_lookup_module *flm, uint32_t fibnum)
  309 {
  310         struct fib_error *fe;
  311 
  312         fe = malloc(sizeof(struct fib_error), M_TEMP, M_NOWAIT | M_ZERO);
  313         if (fe == NULL)
  314                 return (false);
  315         fe->fe_flm = flm;
  316         fe->fe_family = flm->flm_family;
  317         fe->fe_fibnum = fibnum;
  318 
  319         FIB_MOD_LOCK();
  320         /* Avoid duplicates by checking if error already exists first */
  321         if (flm_error_check(flm, fibnum)) {
  322                 FIB_MOD_UNLOCK();
  323                 free(fe, M_TEMP);
  324                 return (true);
  325         }
  326         TAILQ_INSERT_HEAD(&V_fib_error_list, fe, entries);
  327         FIB_MOD_UNLOCK();
  328 
  329         return (true);
  330 }
  331 
  332 /*
   333  * True if a non-transient error has been registered for @flm in @fibnum.
  334  */
  335 static bool
  336 flm_error_check(const struct fib_lookup_module *flm, uint32_t fibnum)
  337 {
  338         const struct fib_error *fe;
  339 
  340         TAILQ_FOREACH(fe, &V_fib_error_list, entries) {
  341                 if ((fe->fe_flm == flm) && (fe->fe_fibnum == fibnum))
  342                         return (true);
  343         }
  344 
  345         return (false);
  346 }
  347 
  348 /*
  349  * Clear all errors of algo specified by @flm.
  350  */
  351 static void
  352 fib_error_clear_flm(struct fib_lookup_module *flm)
  353 {
  354         struct fib_error *fe, *fe_tmp;
  355 
  356         FIB_MOD_LOCK_ASSERT();
  357 
  358         TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
  359                 if (fe->fe_flm == flm) {
  360                         TAILQ_REMOVE(&V_fib_error_list, fe, entries);
  361                         free(fe, M_TEMP);
  362                 }
  363         }
  364 }
  365 
  366 /*
  367  * Clears all errors in current VNET.
  368  */
  369 static void
  370 fib_error_clear(void)
  371 {
  372         struct fib_error *fe, *fe_tmp;
  373 
  374         FIB_MOD_LOCK_ASSERT();
  375 
  376         TAILQ_FOREACH_SAFE(fe, &V_fib_error_list, entries, fe_tmp) {
  377                 TAILQ_REMOVE(&V_fib_error_list, fe, entries);
  378                 free(fe, M_TEMP);
  379         }
  380 }
  381 
  382 static const char *
  383 print_op_result(enum flm_op_result result)
  384 {
  385         switch (result) {
  386         case FLM_SUCCESS:
  387                 return "success";
  388         case FLM_REBUILD:
  389                 return "rebuild";
  390         case FLM_BATCH:
  391                 return "batch";
  392         case FLM_ERROR:
  393                 return "error";
  394         }
  395 
  396         return "unknown";
  397 }
  398 
  399 static const char *
  400 print_family(int family)
  401 {
  402 
  403         if (family == AF_INET)
  404                 return ("inet");
  405         else if (family == AF_INET6)
  406                 return ("inet6");
  407         else
  408                 return ("unknown");
  409 }
  410 
  411 /*
  412  * Debug function used by lookup algorithms.
  413  * Outputs message denoted by @fmt, prepended by "[fib_algo] inetX.Y (algo) "
  414  */
  415 void
  416 fib_printf(int level, struct fib_data *fd, const char *func, char *fmt, ...)
  417 {
  418         char buf[128];
  419         va_list ap;
  420 
  421         if (level > flm_debug_level)
  422                 return;
  423 
  424         va_start(ap, fmt);
  425         vsnprintf(buf, sizeof(buf), fmt, ap);
  426         va_end(ap);
  427 
  428         _ALGO_PRINTF(fd->fd_fibnum, fd->fd_family, fd->fd_flm->flm_name,
  429             fd->fd_gen, func, "%s", buf);
  430 }
  431 
  432 /*
  433  * Outputs list of algorithms supported by the provided address family.
  434  */
  435 static int
  436 print_algos_sysctl(struct sysctl_req *req, int family)
  437 {
  438         struct fib_lookup_module *flm;
  439         struct sbuf sbuf;
  440         int error, count = 0;
  441 
  442         error = sysctl_wire_old_buffer(req, 0);
  443         if (error == 0) {
  444                 sbuf_new_for_sysctl(&sbuf, NULL, 512, req);
  445                 TAILQ_FOREACH(flm, &all_algo_list, entries) {
  446                         if (flm->flm_family == family) {
  447                                 if (count++ > 0)
  448                                         sbuf_cat(&sbuf, ", ");
  449                                 sbuf_cat(&sbuf, flm->flm_name);
  450                         }
  451                 }
  452                 error = sbuf_finish(&sbuf);
  453                 sbuf_delete(&sbuf);
  454         }
  455         return (error);
  456 }
  457 
  458 #ifdef INET6
  459 static int
  460 print_algos_sysctl_inet6(SYSCTL_HANDLER_ARGS)
  461 {
  462 
  463         return (print_algos_sysctl(req, AF_INET6));
  464 }
  465 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo_list,
  466     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
  467     print_algos_sysctl_inet6, "A", "List of IPv6 lookup algorithms");
  468 #endif
  469 
  470 #ifdef INET
  471 static int
  472 print_algos_sysctl_inet(SYSCTL_HANDLER_ARGS)
  473 {
  474 
  475         return (print_algos_sysctl(req, AF_INET));
  476 }
  477 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo_list,
  478     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
  479     print_algos_sysctl_inet, "A", "List of IPv4 lookup algorithms");
  480 #endif
  481 
  482 /*
  483  * Calculate delay between repeated failures.
  484  * Returns current delay in milliseconds.
  485  */
  486 static uint32_t
  487 callout_calc_delay_ms(struct fib_data *fd)
  488 {
  489         uint32_t shift;
  490 
  491         if (fd->fd_failed_rebuilds > 10)
  492                 shift = 10;
  493         else
  494                 shift = fd->fd_failed_rebuilds;
  495 
  496         return ((1 << shift) * FIB_CALLOUT_DELAY_MS);
  497 }
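       /*
        * Illustrative arithmetic (not part of the original file): the delay
        * doubles with each failed rebuild (50ms, 100ms, 200ms, ...); the
        * shift is clamped at 10, capping the delay at
        * (1 << 10) * 50ms = 51200ms, roughly 51 seconds.
        */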
  498 
  499 static void
  500 schedule_callout(struct fib_data *fd, enum fib_callout_action action, int delay_ms)
  501 {
  502 
  503         FD_PRINTF(LOG_DEBUG, fd, "delay=%d action=%d", delay_ms, action);
  504         fd->fd_callout_action = action;
  505         callout_reset_sbt(&fd->fd_callout, SBT_1MS * delay_ms, 0,
  506             handle_fd_callout, fd, 0);
  507 }
  508 
  509 static void
  510 schedule_fd_rebuild(struct fib_data *fd, const char *reason)
  511 {
  512 
  513         RIB_WLOCK_ASSERT(fd->fd_rh);
  514 
  515         if (!fd->fd_need_rebuild) {
  516                 fd->fd_need_rebuild = true;
  517                 /* Stop batch updates */
  518                 fd->fd_batch = false;
  519 
  520                 /*
  521                  * Potentially re-schedules pending callout
  522                  *  initiated by schedule_algo_eval.
  523                  */
  524                 FD_PRINTF(LOG_INFO, fd, "Scheduling rebuild: %s (failures=%d)",
  525                     reason, fd->fd_failed_rebuilds);
  526                 schedule_callout(fd, FDA_REBUILD, callout_calc_delay_ms(fd));
  527         }
  528 }
  529 
  530 static void
  531 sync_rib_gen(struct fib_data *fd)
  532 {
  533         FD_PRINTF(LOG_DEBUG, fd, "Sync gen %u -> %u", fd->fd_rh->rnh_gen, fd->fd_rh->rnh_gen_rib);
  534         fd->fd_rh->rnh_gen = fd->fd_rh->rnh_gen_rib;
  535 }
  536 
  537 static int64_t
  538 get_tv_diff_ms(const struct timeval *old_tv, const struct timeval *new_tv)
  539 {
  540         int64_t diff = 0;
  541 
  542         diff = ((int64_t)(new_tv->tv_sec - old_tv->tv_sec)) * 1000;
  543         diff += (new_tv->tv_usec - old_tv->tv_usec) / 1000;
  544 
  545         return (diff);
  546 }
  547 
  548 static void
  549 add_tv_diff_ms(struct timeval *tv, int ms)
  550 {
  551         tv->tv_sec += ms / 1000;
  552         ms = ms % 1000;
  553         if (ms * 1000 + tv->tv_usec < 1000000)
  554                 tv->tv_usec += ms * 1000;
  555         else {
  556                 tv->tv_sec += 1;
  557                 tv->tv_usec = ms * 1000 + tv->tv_usec - 1000000;
  558         }
  559 }
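       /*
        * Illustrative arithmetic (not part of the original file): adding
        * 1250ms to { tv_sec = 5, tv_usec = 900000 } first advances tv_sec
        * to 6 (1250 / 1000 == 1), leaving ms = 250; 250000 + 900000 then
        * overflows the microsecond field, carrying the result to
        * { tv_sec = 7, tv_usec = 150000 }.
        */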
  560 
  561 /*
  562  * Marks the time when algo state diverges from the rib state.
  563  */
  564 static void
  565 mark_diverge_time(struct fib_data *fd)
  566 {
  567         struct fib_sync_status *fd_ss = &fd->fd_ss;
  568 
  569         getmicrouptime(&fd_ss->diverge_time);
  570         fd_ss->bucket_id = 0;
  571         fd_ss->bucket_changes = 0;
  572 }
  573 
   574 /*
   575  * Calculates and updates the next algorithm sync time, based on the current activity.
   576  *
   577  * The intent is to provide a reasonable balance between update
   578  *  latency and efficient batching when changing a large number of routes.
   579  *
   580  * The high-level algorithm works as follows:
   581  * 1) all changes are bucketed into 50ms intervals
   582  * 2) if the number of changes within a bucket exceeds the threshold,
   583  *   the update gets delayed, up to the maximum delay threshold.
   584  */
  585 static void
  586 update_rebuild_delay(struct fib_data *fd, enum fib_callout_action action)
  587 {
  588         uint32_t bucket_id, new_delay = 0;
  589         struct timeval tv;
  590 
  591         /* Fetch all variables at once to ensure consistent reads */
  592         uint32_t bucket_time_ms = V_update_bucket_time_ms;
  593         uint32_t threshold_rate = V_bucket_change_threshold_rate;
  594         uint32_t max_delay_ms = V_fib_max_sync_delay_ms;
  595 
  596         if (bucket_time_ms == 0)
  597                 bucket_time_ms = 50;
  598         /* calculate per-bucket threshold rate */
  599         threshold_rate = threshold_rate * bucket_time_ms / 1000;
  600 
  601         getmicrouptime(&tv);
  602 
  603         struct fib_sync_status *fd_ss = &fd->fd_ss;
  604 
  605         bucket_id = get_tv_diff_ms(&fd_ss->diverge_time, &tv) / bucket_time_ms;
  606 
  607         if (fd_ss->bucket_id == bucket_id) {
  608                 fd_ss->bucket_changes++;
  609                 if (fd_ss->bucket_changes == threshold_rate) {
  610                         new_delay = (bucket_id + 2) * bucket_time_ms;
  611                         if (new_delay <= max_delay_ms) {
  612                                 FD_PRINTF(LOG_DEBUG, fd,
   613                                     "hit threshold of %u routes, delay update, "
  614                                     "bucket: %u, total delay: %u",
  615                                     threshold_rate, bucket_id + 1, new_delay);
  616                         } else {
  617                                 new_delay = 0;
  618                                 FD_PRINTF(LOG_DEBUG, fd,
  619                                     "maximum sync delay (%u ms) reached", max_delay_ms);
  620                         }
  621                 } else if ((bucket_id == 0) && (fd_ss->bucket_changes == 1))
  622                         new_delay = bucket_time_ms;
  623         } else {
  624                 fd_ss->bucket_id = bucket_id;
  625                 fd_ss->bucket_changes = 1;
  626         }
  627 
  628         if (new_delay > 0) {
  629                 /* Calculated time has been updated */
  630                 struct timeval new_tv = fd_ss->diverge_time;
  631                 add_tv_diff_ms(&new_tv, new_delay);
  632 
  633                 int32_t delay_ms = get_tv_diff_ms(&tv, &new_tv);
  634                 schedule_callout(fd, action, delay_ms);
  635         }
  636 }
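       /*
        * Illustrative arithmetic (not part of the original file), using the
        * defaults above (50ms buckets, 500 changes/sec threshold, 1000ms
        * cap): the per-bucket threshold is 500 * 50 / 1000 = 25 changes.
        * When the 25th change lands in bucket N, sync is pushed out to the
        * end of bucket N+1, i.e. (N + 2) * 50ms after divergence; once the
        * total delay would exceed 1000ms, no further delay is added.
        */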
  637 
  638 static void
  639 update_algo_state(struct fib_data *fd)
  640 {
  641 
  642         RIB_WLOCK_ASSERT(fd->fd_rh);
  643 
  644         if (fd->fd_batch || fd->fd_need_rebuild) {
  645                 enum fib_callout_action action = fd->fd_need_rebuild ? FDA_REBUILD : FDA_BATCH;
  646                 update_rebuild_delay(fd, action);
  647                 return;
  648         }
  649 
  650         if (fd->fd_num_changes++ == 0) {
  651                 /* Start callout to consider switch */
  652                 if (!callout_pending(&fd->fd_callout))
  653                         schedule_callout(fd, FDA_EVAL, ALGO_EVAL_DELAY_MS);
  654         } else if (fd->fd_num_changes == ALGO_EVAL_NUM_ROUTES) {
  655                 /* Reset callout to exec immediately */
  656                 if (fd->fd_callout_action == FDA_EVAL)
  657                         schedule_callout(fd, FDA_EVAL, 1);
  658         }
  659 }
  660 
  661 static bool
  662 need_immediate_sync(struct fib_data *fd, struct rib_cmd_info *rc)
  663 {
  664         struct nhop_object *nh;
  665 
  666         /* Sync addition/removal of interface routes */
  667         switch (rc->rc_cmd) {
  668         case RTM_ADD:
  669                 nh = rc->rc_nh_new;
  670                 if (!NH_IS_NHGRP(nh)) {
  671                         if (!(nh->nh_flags & NHF_GATEWAY))
  672                                 return (true);
  673                         if (nhop_get_rtflags(nh) & RTF_STATIC)
  674                                 return (true);
  675                 }
  676                 break;
  677         case RTM_DELETE:
  678                 nh = rc->rc_nh_old;
  679                 if (!NH_IS_NHGRP(nh)) {
  680                         if (!(nh->nh_flags & NHF_GATEWAY))
  681                                 return (true);
  682                         if (nhop_get_rtflags(nh) & RTF_STATIC)
  683                                 return (true);
  684                 }
  685                 break;
  686         }
  687 
  688         return (false);
  689 }
  690 
  691 static bool
  692 apply_rtable_changes(struct fib_data *fd)
  693 {
  694         enum flm_op_result result;
  695         struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
  696 
  697         result = fd->fd_flm->flm_change_rib_items_cb(fd->fd_rh, q, fd->fd_algo_data);
  698 
  699         if (result == FLM_SUCCESS) {
  700                 sync_rib_gen(fd);
  701                 for (int i = 0; i < q->count; i++)
  702                         if (q->entries[i].nh_old)
  703                                 fib_unref_nhop(fd, q->entries[i].nh_old);
  704                 q->count = 0;
  705         }
  706         fd->fd_batch = false;
  707 
  708         return (result == FLM_SUCCESS);
  709 }
  710 
  711 static bool
  712 fill_change_entry(struct fib_data *fd, struct fib_change_entry *ce, struct rib_cmd_info *rc)
  713 {
  714         int plen = 0;
  715 
  716         switch (fd->fd_family) {
  717 #ifdef INET
  718         case AF_INET:
  719                 rt_get_inet_prefix_plen(rc->rc_rt, &ce->addr4, &plen, &ce->scopeid);
  720                 break;
  721 #endif
  722 #ifdef INET6
  723         case AF_INET6:
  724                 rt_get_inet6_prefix_plen(rc->rc_rt, &ce->addr6, &plen, &ce->scopeid);
  725                 break;
  726 #endif
  727         }
  728 
  729         ce->plen = plen;
  730         ce->nh_old = rc->rc_nh_old;
  731         ce->nh_new = rc->rc_nh_new;
  732         if (ce->nh_new != NULL) {
  733                 if (fib_ref_nhop(fd, ce->nh_new) == 0)
  734                         return (false);
  735         }
  736 
  737         return (true);
  738 }
  739 
  740 static bool
  741 queue_rtable_change(struct fib_data *fd, struct rib_cmd_info *rc)
  742 {
  743         struct fib_change_queue *q = &fd->fd_ss.fd_change_queue;
  744 
  745         if (q->count >= q->size) {
  746                 uint32_t q_size;
  747 
  748                 if (q->size == 0)
  749                         q_size = 256; /* ~18k memory */
  750                 else
  751                         q_size = q->size * 2;
  752 
  753                 size_t size = q_size * sizeof(struct fib_change_entry);
  754                 void *a = realloc(q->entries, size, M_TEMP, M_NOWAIT | M_ZERO);
  755                 if (a == NULL) {
  756                         FD_PRINTF(LOG_INFO, fd, "Unable to realloc queue for %u elements",
  757                             q_size);
  758                         return (false);
  759                 }
  760                 q->entries = a;
  761                 q->size = q_size;
  762         }
  763 
  764         return (fill_change_entry(fd, &q->entries[q->count++], rc));
  765 }
  766 
  767 /*
  768  * Rib subscription handler. Checks if the algorithm is ready to
  769  *  receive updates, handles nexthop refcounting and passes change
  770  *  data to the algorithm callback.
  771  */
  772 static void
  773 handle_rtable_change_cb(struct rib_head *rnh, struct rib_cmd_info *rc,
  774     void *_data)
  775 {
  776         struct fib_data *fd = (struct fib_data *)_data;
  777         enum flm_op_result result;
  778 
  779         RIB_WLOCK_ASSERT(rnh);
  780 
  781         /*
  782          * There is a small gap between subscribing for route changes
  783          *  and initiating rtable dump. Avoid receiving route changes
  784          *  prior to finishing rtable dump by checking `init_done`.
  785          */
  786         if (!fd->init_done)
  787                 return;
  788 
  789         bool immediate_sync = need_immediate_sync(fd, rc);
  790 
  791         /* Consider scheduling algorithm re-evaluation */
  792         update_algo_state(fd);
  793 
  794         /*
  795          * If algo requested rebuild, stop sending updates by default.
  796          * This simplifies nexthop refcount handling logic.
  797          */
  798         if (fd->fd_need_rebuild) {
  799                 if (immediate_sync)
  800                         rebuild_fd(fd, "rtable change type enforced sync");
  801                 return;
  802         }
  803 
  804         /*
  805          * Algo requested updates to be delivered in batches.
  806          * Add the current change to the queue and return.
  807          */
  808         if (fd->fd_batch) {
  809                 if (immediate_sync) {
  810                         if (!queue_rtable_change(fd, rc) || !apply_rtable_changes(fd))
  811                                 rebuild_fd(fd, "batch sync failed");
  812                 } else {
  813                         if (!queue_rtable_change(fd, rc))
  814                                 schedule_fd_rebuild(fd, "batch queue failed");
  815                 }
  816                 return;
  817         }
  818 
   819         /*
   820          * Maintain the guarantee that every nexthop returned by the dataplane
   821          *  lookup has > 0 refcount, so it can be safely referenced within the
   822          *  current epoch.
   823          */
  824         if (rc->rc_nh_new != NULL) {
  825                 if (fib_ref_nhop(fd, rc->rc_nh_new) == 0) {
  826                         /* ran out of indexes */
  827                         schedule_fd_rebuild(fd, "ran out of nhop indexes");
  828                         return;
  829                 }
  830         }
  831 
  832         result = fd->fd_flm->flm_change_rib_item_cb(rnh, rc, fd->fd_algo_data);
  833 
  834         switch (result) {
  835         case FLM_SUCCESS:
  836                 sync_rib_gen(fd);
  837                 /* Unref old nexthop on success */
  838                 if (rc->rc_nh_old != NULL)
  839                         fib_unref_nhop(fd, rc->rc_nh_old);
  840                 break;
  841         case FLM_BATCH:
  842 
  843                 /*
  844                  * Algo asks to batch the changes.
  845                  */
  846                 if (queue_rtable_change(fd, rc)) {
  847                         if (!immediate_sync) {
  848                                 fd->fd_batch = true;
  849                                 mark_diverge_time(fd);
  850                                 update_rebuild_delay(fd, FDA_BATCH);
  851                                 break;
  852                         }
  853                         if (apply_rtable_changes(fd))
  854                                 break;
  855                 }
   856                 FD_PRINTF(LOG_ERR, fd, "batched sync failed, forcing rebuild");
  857 
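                       /* FALLTHROUGH */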
  858         case FLM_REBUILD:
  859 
  860                 /*
  861                  * Algo is not able to apply the update.
  862                  * Schedule algo rebuild.
  863                  */
  864                 if (!immediate_sync) {
  865                         mark_diverge_time(fd);
  866                         schedule_fd_rebuild(fd, "algo requested rebuild");
  867                         break;
  868                 }
  869 
  870                 FD_PRINTF(LOG_INFO, fd, "running sync rebuild");
  871                 rebuild_fd(fd, "rtable change type enforced sync");
  872                 break;
  873         case FLM_ERROR:
  874 
  875                 /*
  876                  * Algo reported a non-recoverable error.
  877                  * Record the error and schedule rebuild, which will
  878                  *  trigger best algo selection.
  879                  */
  880                 FD_PRINTF(LOG_ERR, fd, "algo reported non-recoverable error");
  881                 if (!flm_error_add(fd->fd_flm, fd->fd_fibnum))
  882                         FD_PRINTF(LOG_ERR, fd, "failed to ban algo");
  883                 schedule_fd_rebuild(fd, "algo reported non-recoverable error");
  884         }
  885 }
  886 
  887 static void
  888 estimate_nhop_scale(const struct fib_data *old_fd, struct fib_data *fd)
  889 {
  890 
  891         if (old_fd == NULL) {
  892                 // TODO: read from rtable
  893                 fd->number_nhops = 16;
  894                 return;
  895         }
  896 
  897         if (old_fd->hit_nhops && old_fd->number_nhops < FIB_MAX_NHOPS)
  898                 fd->number_nhops = 2 * old_fd->number_nhops;
  899         else
  900                 fd->number_nhops = old_fd->number_nhops;
  901 }
  902 
  903 struct walk_cbdata {
  904         struct fib_data         *fd;
  905         flm_dump_t              *func;
  906         enum flm_op_result      result;
  907 };
  908 
  909 /*
   910  * Handler called after all rtentries have been dumped.
  911  * Performs post-dump framework checks and calls
  912  * algo:flm_dump_end_cb().
  913  *
  914  * Updates walk_cbdata result.
  915  */
  916 static void
  917 sync_algo_end_cb(struct rib_head *rnh, enum rib_walk_hook stage, void *_data)
  918 {
  919         struct walk_cbdata *w = (struct walk_cbdata *)_data;
  920         struct fib_data *fd = w->fd;
  921 
  922         RIB_WLOCK_ASSERT(w->fd->fd_rh);
  923 
  924         if (rnh->rib_dying) {
  925                 w->result = FLM_ERROR;
  926                 return;
  927         }
  928 
  929         if (fd->hit_nhops) {
  930                 FD_PRINTF(LOG_INFO, fd, "ran out of nexthops at %u nhops",
  931                     fd->nh_ref_table->count);
  932                 if (w->result == FLM_SUCCESS)
  933                         w->result = FLM_REBUILD;
  934                 return;
  935         }
  936 
  937         if (stage != RIB_WALK_HOOK_POST || w->result != FLM_SUCCESS)
  938                 return;
  939 
  940         /* Post-dump hook, dump successful */
  941         w->result = fd->fd_flm->flm_dump_end_cb(fd->fd_algo_data, &fd->fd_dp);
  942 
  943         if (w->result == FLM_SUCCESS) {
  944                 /* Mark init as done to allow routing updates */
  945                 fd->init_done = 1;
  946         }
  947 }
  948 
  949 /*
  950  * Callback for each entry in rib.
   951  * Calls the algo:flm_dump_rib_item_cb func as part of the initial
   952  *  route table synchronisation.
  953  */
  954 static int
  955 sync_algo_cb(struct rtentry *rt, void *_data)
  956 {
  957         struct walk_cbdata *w = (struct walk_cbdata *)_data;
  958 
  959         RIB_WLOCK_ASSERT(w->fd->fd_rh);
  960 
  961         if (w->result == FLM_SUCCESS && w->func) {
  962 
  963                 /*
  964                  * Reference nexthops to maintain guarantee that
  965                  *  each nexthop returned by datapath has > 0 references
  966                  *  and can be safely referenced within current epoch.
  967                  */
  968                 struct nhop_object *nh = rt_get_raw_nhop(rt);
  969                 if (fib_ref_nhop(w->fd, nh) != 0)
  970                         w->result = w->func(rt, w->fd->fd_algo_data);
  971                 else
  972                         w->result = FLM_REBUILD;
  973         }
  974 
  975         return (0);
  976 }
  977 
  978 /*
  979  * Dump all routing table state to the algo instance.
  980  */
  981 static enum flm_op_result
  982 sync_algo(struct fib_data *fd)
  983 {
  984         struct walk_cbdata w = {
  985                 .fd = fd,
  986                 .func = fd->fd_flm->flm_dump_rib_item_cb,
  987                 .result = FLM_SUCCESS,
  988         };
  989 
  990         rib_walk_ext_locked(fd->fd_rh, sync_algo_cb, sync_algo_end_cb, &w);
  991 
  992         FD_PRINTF(LOG_INFO, fd,
  993             "initial dump completed (rtable version: %d), result: %s",
  994             fd->fd_rh->rnh_gen, print_op_result(w.result));
  995 
  996         return (w.result);
  997 }
  998 
  999 /*
 1000  * Schedules epoch-backed @fd instance deletion.
 1001  * * Unlinks @fd from the list of active algo instances.
 1002  * * Removes rib subscription.
 1003  * * Stops callout.
 1004  * * Schedules actual deletion.
 1005  *
  1006  * Assumes @fd is already unlinked from the datapath.
 1007  */
 1008 static int
 1009 schedule_destroy_fd_instance(struct fib_data *fd, bool in_callout)
 1010 {
 1011         bool is_dead;
 1012 
 1013         NET_EPOCH_ASSERT();
 1014         RIB_WLOCK_ASSERT(fd->fd_rh);
 1015 
 1016         FIB_MOD_LOCK();
 1017         is_dead = fd->fd_dead;
 1018         if (!is_dead)
 1019                 fd->fd_dead = true;
 1020         if (fd->fd_linked) {
 1021                 TAILQ_REMOVE(&V_fib_data_list, fd, entries);
 1022                 fd->fd_linked = false;
 1023         }
 1024         FIB_MOD_UNLOCK();
 1025         if (is_dead)
 1026                 return (0);
 1027 
 1028         FD_PRINTF(LOG_INFO, fd, "DETACH");
 1029 
 1030         if (fd->fd_rs != NULL)
 1031                 rib_unsubscribe_locked(fd->fd_rs);
 1032 
 1033         /*
 1034          * After rib_unsubscribe() no _new_ handle_rtable_change_cb() calls
 1035          * will be executed, hence no _new_ callout schedules will happen.
 1036          */
 1037         callout_stop(&fd->fd_callout);
 1038 
 1039         fib_epoch_call(destroy_fd_instance_epoch, &fd->fd_epoch_ctx);
 1040 
 1041         return (0);
 1042 }
 1043 
 1044 /*
 1045  * Wipe all fd instances from the list matching rib specified by @rh.
 1046  * If @keep_first is set, remove all but the first record.
 1047  */
 1048 static void
 1049 fib_cleanup_algo(struct rib_head *rh, bool keep_first, bool in_callout)
 1050 {
 1051         struct fib_data_head tmp_head = TAILQ_HEAD_INITIALIZER(tmp_head);
 1052         struct fib_data *fd, *fd_tmp;
 1053         struct epoch_tracker et;
 1054 
 1055         FIB_MOD_LOCK();
 1056         TAILQ_FOREACH_SAFE(fd, &V_fib_data_list, entries, fd_tmp) {
 1057                 if (fd->fd_rh == rh) {
 1058                         if (keep_first) {
 1059                                 keep_first = false;
 1060                                 continue;
 1061                         }
 1062                         TAILQ_REMOVE(&V_fib_data_list, fd, entries);
 1063                         fd->fd_linked = false;
 1064                         TAILQ_INSERT_TAIL(&tmp_head, fd, entries);
 1065                 }
 1066         }
 1067         FIB_MOD_UNLOCK();
 1068 
 1069         /* Pass 2: remove each entry */
 1070         NET_EPOCH_ENTER(et);
 1071         TAILQ_FOREACH_SAFE(fd, &tmp_head, entries, fd_tmp) {
 1072                 if (!in_callout)
 1073                         RIB_WLOCK(fd->fd_rh);
 1074                 schedule_destroy_fd_instance(fd, in_callout);
 1075                 if (!in_callout)
 1076                         RIB_WUNLOCK(fd->fd_rh);
 1077         }
 1078         NET_EPOCH_EXIT(et);
 1079 }
 1080 
 1081 void
 1082 fib_destroy_rib(struct rib_head *rh)
 1083 {
 1084 
  1085         /*
  1086          * rnh has the `rib_dying` flag set, so setup of new fd's will fail at
  1087          *  the sync_algo() stage, preventing new entries from being added to the
  1088          *  list of active algos. Remove all existing entries for the particular rib.
  1089          */
 1090         fib_cleanup_algo(rh, false, false);
 1091 }
 1092 
 1093 /*
 1094  * Finalises fd destruction by freeing all fd resources.
 1095  */
 1096 static void
 1097 destroy_fd_instance(struct fib_data *fd)
 1098 {
 1099 
 1100         FD_PRINTF(LOG_INFO, fd, "destroy fd %p", fd);
 1101 
 1102         /* Call destroy callback first */
 1103         if (fd->fd_algo_data != NULL)
 1104                 fd->fd_flm->flm_destroy_cb(fd->fd_algo_data);
 1105 
 1106         /* Nhop table */
 1107         if ((fd->nh_idx != NULL) && (fd->nh_ref_table != NULL)) {
 1108                 for (int i = 0; i < fd->number_nhops; i++) {
 1109                         if (!is_idx_free(fd, i)) {
 1110                                 FD_PRINTF(LOG_DEBUG2, fd, " FREE nhop %d %p",
 1111                                     i, fd->nh_idx[i]);
 1112                                 nhop_free_any(fd->nh_idx[i]);
 1113                         }
 1114                 }
 1115                 free(fd->nh_idx, M_RTABLE);
 1116         }
 1117         if (fd->nh_ref_table != NULL)
 1118                 free(fd->nh_ref_table, M_RTABLE);
 1119 
 1120         if (fd->fd_ss.fd_change_queue.entries != NULL)
 1121                 free(fd->fd_ss.fd_change_queue.entries, M_TEMP);
 1122 
 1123         fib_unref_algo(fd->fd_flm);
 1124 
 1125         free(fd, M_RTABLE);
 1126 }
 1127 
 1128 /*
 1129  * Epoch callback indicating fd is safe to destroy
 1130  */
 1131 static void
 1132 destroy_fd_instance_epoch(epoch_context_t ctx)
 1133 {
 1134         struct fib_data *fd;
 1135 
 1136         fd = __containerof(ctx, struct fib_data, fd_epoch_ctx);
 1137 
 1138         CURVNET_SET(fd->fd_vnet);
 1139         destroy_fd_instance(fd);
 1140         CURVNET_RESTORE();
 1141 }
 1142 
 1143 /*
  1144  * Tries to set up an fd instance.
 1145  * - Allocates fd/nhop table
 1146  * - Runs algo:flm_init_cb algo init
 1147  * - Subscribes fd to the rib
 1148  * - Runs rtable dump
 1149  * - Adds instance to the list of active instances.
 1150  *
 1151  * Returns: operation result. Fills in @pfd with resulting fd on success.
 1152  *
 1153  */
 1154 static enum flm_op_result
 1155 try_setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
 1156     struct fib_data *old_fd, struct fib_data **pfd)
 1157 {
 1158         struct fib_data *fd;
 1159         size_t size;
 1160         enum flm_op_result result;
 1161 
 1162         /* Allocate */
 1163         fd = malloc(sizeof(struct fib_data), M_RTABLE, M_NOWAIT | M_ZERO);
 1164         if (fd == NULL)  {
 1165                 *pfd = NULL;
 1166                 RH_PRINTF(LOG_INFO, rh, "Unable to allocate fib_data structure");
 1167                 return (FLM_REBUILD);
 1168         }
 1169         *pfd = fd;
 1170 
 1171         estimate_nhop_scale(old_fd, fd);
 1172 
 1173         fd->fd_rh = rh;
 1174         fd->fd_family = rh->rib_family;
 1175         fd->fd_fibnum = rh->rib_fibnum;
 1176         callout_init_rm(&fd->fd_callout, &rh->rib_lock, 0);
 1177         fd->fd_vnet = curvnet;
 1178         fd->fd_flm = flm;
 1179 
 1180         FIB_MOD_LOCK();
 1181         flm->flm_refcount++;
 1182         fd->fd_gen = ++fib_gen;
 1183         FIB_MOD_UNLOCK();
 1184 
 1185         FD_PRINTF(LOG_DEBUG, fd, "allocated fd %p", fd);
 1186 
 1187         /* Allocate nhidx -> nhop_ptr table */
 1188         size = fd->number_nhops * sizeof(void *);
 1189         fd->nh_idx = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
 1190         if (fd->nh_idx == NULL) {
 1191                 FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop table idx (sz:%zu)", size);
 1192                 return (FLM_REBUILD);
 1193         }
 1194 
 1195         /* Allocate nhop index refcount table */
 1196         size = sizeof(struct nhop_ref_table);
 1197         size += fd->number_nhops * sizeof(uint32_t);
 1198         fd->nh_ref_table = malloc(size, M_RTABLE, M_NOWAIT | M_ZERO);
 1199         if (fd->nh_ref_table == NULL) {
 1200                 FD_PRINTF(LOG_INFO, fd, "Unable to allocate nhop refcount table (sz:%zu)", size);
 1201                 return (FLM_REBUILD);
 1202         }
 1203         FD_PRINTF(LOG_DEBUG, fd, "Allocated %u nhop indexes", fd->number_nhops);
 1204 
 1205         /* Okay, we're ready for algo init */
 1206         void *old_algo_data = (old_fd != NULL) ? old_fd->fd_algo_data : NULL;
 1207         result = flm->flm_init_cb(fd->fd_fibnum, fd, old_algo_data, &fd->fd_algo_data);
 1208         if (result != FLM_SUCCESS) {
 1209                 FD_PRINTF(LOG_INFO, fd, "%s algo init failed", flm->flm_name);
 1210                 return (result);
 1211         }
 1212 
 1213         /* Try to subscribe */
 1214         if (flm->flm_change_rib_item_cb != NULL) {
 1215                 fd->fd_rs = rib_subscribe_locked(fd->fd_rh,
 1216                     handle_rtable_change_cb, fd, RIB_NOTIFY_IMMEDIATE);
 1217                 if (fd->fd_rs == NULL) {
 1218                         FD_PRINTF(LOG_INFO, fd, "failed to subscribe to the rib changes");
 1219                         return (FLM_REBUILD);
 1220                 }
 1221         }
 1222 
 1223         /* Dump */
 1224         result = sync_algo(fd);
 1225         if (result != FLM_SUCCESS) {
 1226                 FD_PRINTF(LOG_INFO, fd, "rib sync failed");
 1227                 return (result);
 1228         }
 1229         FD_PRINTF(LOG_INFO, fd, "DUMP completed successfully.");
 1230 
 1231         FIB_MOD_LOCK();
  1232         /*
  1233          * Insert fd at the beginning of the list, to maintain the invariant
  1234          *  that the first matching entry for the AF/fib is always the active
  1235          *  one.
  1236          */
 1237         TAILQ_INSERT_HEAD(&V_fib_data_list, fd, entries);
 1238         fd->fd_linked = true;
 1239         FIB_MOD_UNLOCK();
 1240 
 1241         return (FLM_SUCCESS);
 1242 }
 1243 
 1244 /*
 1245  * Sets up algo @flm for table @rh and links it to the datapath.
 1246  *
 1247  */
 1248 static enum flm_op_result
 1249 setup_fd_instance(struct fib_lookup_module *flm, struct rib_head *rh,
 1250     struct fib_data *orig_fd, struct fib_data **pfd, bool attach)
 1251 {
 1252         struct fib_data *prev_fd, *new_fd;
 1253         enum flm_op_result result;
 1254 
 1255         NET_EPOCH_ASSERT();
 1256         RIB_WLOCK_ASSERT(rh);
 1257 
 1258         prev_fd = orig_fd;
 1259         new_fd = NULL;
 1260         for (int i = 0; i < FIB_MAX_TRIES; i++) {
 1261                 result = try_setup_fd_instance(flm, rh, prev_fd, &new_fd);
 1262 
 1263                 if ((result == FLM_SUCCESS) && attach) {
 1264                         if (fib_set_datapath_ptr(new_fd, &new_fd->fd_dp))
 1265                                 sync_rib_gen(new_fd);
 1266                         else
 1267                                 result = FLM_REBUILD;
 1268                 }
 1269 
 1270                 if ((prev_fd != NULL) && (prev_fd != orig_fd)) {
 1271                         schedule_destroy_fd_instance(prev_fd, false);
 1272                         prev_fd = NULL;
 1273                 }
 1274 
 1275                 RH_PRINTF(LOG_INFO, rh, "try %d: fib algo result: %s", i,
 1276                     print_op_result(result));
 1277 
 1278                 if (result == FLM_REBUILD) {
 1279                         prev_fd = new_fd;
 1280                         new_fd = NULL;
 1281                         continue;
 1282                 }
 1283 
 1284                 break;
 1285         }
 1286 
 1287         if (result != FLM_SUCCESS) {
 1288                 RH_PRINTF(LOG_WARNING, rh,
 1289                     "%s algo instance setup failed, failures=%d", flm->flm_name,
 1290                     orig_fd ? orig_fd->fd_failed_rebuilds + 1 : 0);
 1291                 /* update failure count */
 1292                 FIB_MOD_LOCK();
 1293                 if (orig_fd != NULL)
 1294                         orig_fd->fd_failed_rebuilds++;
 1295                 FIB_MOD_UNLOCK();
 1296 
 1297                 /* Ban algo on non-recoverable error */
 1298                 if (result == FLM_ERROR)
 1299                         flm_error_add(flm, rh->rib_fibnum);
 1300 
 1301                 if ((prev_fd != NULL) && (prev_fd != orig_fd))
 1302                         schedule_destroy_fd_instance(prev_fd, false);
 1303                 if (new_fd != NULL) {
 1304                         schedule_destroy_fd_instance(new_fd, false);
 1305                         new_fd = NULL;
 1306                 }
 1307         }
 1308 
 1309         *pfd = new_fd;
 1310         return (result);
 1311 }
 1312 
 1313 /*
  1314  * Tries to sync the algo with the current rtable state, either
  1315  * by executing a batch update or by rebuilding.
 1316  * Returns true on success.
 1317  */
 1318 static bool
 1319 execute_callout_action(struct fib_data *fd)
 1320 {
 1321         enum fib_callout_action action = fd->fd_callout_action;
 1322         struct fib_lookup_module *flm_new = NULL;
 1323         bool result = true;
 1324 
 1325         NET_EPOCH_ASSERT();
 1326         RIB_WLOCK_ASSERT(fd->fd_rh);
 1327 
 1328         fd->fd_need_rebuild = false;
 1329         fd->fd_batch = false;
 1330         fd->fd_num_changes = 0;
 1331 
 1332         /* First, check if we're still OK to use this algo */
 1333         if (!is_algo_fixed(fd->fd_rh))
 1334                 flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
 1335         if (flm_new != NULL)
 1336                 action = FDA_REBUILD;
 1337 
 1338         if (action == FDA_BATCH) {
 1339                 /* Try to sync */
 1340                 if (!apply_rtable_changes(fd))
 1341                         action = FDA_REBUILD;
 1342         }
 1343 
 1344         if (action == FDA_REBUILD)
 1345                 result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
 1346         if (flm_new != NULL)
 1347                 fib_unref_algo(flm_new);
 1348 
 1349         return (result);
 1350 }
 1351 
 1352 /*
 1353  * Callout for all scheduled fd-related work.
 1354  * - Checks if the current algo is still the best algo
  1355  * - Synchronises the algo instance to the rtable (batch use case)
 1356  * - Creates a new instance of an algo for af/fib if desired.
 1357  */
 1358 static void
 1359 handle_fd_callout(void *_data)
 1360 {
 1361         struct fib_data *fd = (struct fib_data *)_data;
 1362         struct epoch_tracker et;
 1363 
 1364         FD_PRINTF(LOG_INFO, fd, "running callout type=%d", fd->fd_callout_action);
 1365 
 1366         NET_EPOCH_ENTER(et);
 1367         CURVNET_SET(fd->fd_vnet);
 1368         execute_callout_action(fd);
 1369         CURVNET_RESTORE();
 1370         NET_EPOCH_EXIT(et);
 1371 }
 1372 
 1373 /*
 1374  * Tries to create new algo instance based on @fd data.
 1375  * Returns true on success.
 1376  */
 1377 static bool
 1378 rebuild_fd_flm(struct fib_data *fd, struct fib_lookup_module *flm_new)
 1379 {
 1380         struct fib_data *fd_new, *fd_tmp = NULL;
  1381         enum flm_op_result result;
 1382 
 1383         if (flm_new == fd->fd_flm)
 1384                 fd_tmp = fd;
 1385         else
 1386                 FD_PRINTF(LOG_NOTICE, fd, "switching algo to %s", flm_new->flm_name);
 1387 
 1388         result = setup_fd_instance(flm_new, fd->fd_rh, fd_tmp, &fd_new, true);
 1389         if (result != FLM_SUCCESS) {
 1390                 FD_PRINTF(LOG_NOTICE, fd, "table rebuild failed");
 1391                 return (false);
 1392         }
 1393         FD_PRINTF(LOG_INFO, fd_new, "switched to new instance");
 1394 
 1395         /* Remove old instance */
 1396         schedule_destroy_fd_instance(fd, true);
 1397 
 1398         return (true);
 1399 }
 1400 
 1401 static bool
 1402 rebuild_fd(struct fib_data *fd, const char *reason)
 1403 {
 1404         struct fib_lookup_module *flm_new = NULL;
 1405         bool result;
 1406 
 1407         if (!is_algo_fixed(fd->fd_rh))
 1408                 flm_new = fib_check_best_algo(fd->fd_rh, fd->fd_flm);
 1409 
 1410         FD_PRINTF(LOG_INFO, fd, "running sync rebuild: %s", reason);
 1411         result = rebuild_fd_flm(fd, flm_new != NULL ? flm_new : fd->fd_flm);
 1412         if (flm_new != NULL)
 1413                 fib_unref_algo(flm_new);
 1414 
 1415         if (!result) {
 1416                 FD_PRINTF(LOG_ERR, fd, "sync rebuild failed");
 1417                 schedule_fd_rebuild(fd, "sync rebuild failed");
 1418         }
 1419 
 1420         return (result);
 1421 }
 1422 
 1423 /*
 1424  * Finds algo by name/family.
 1425  * Returns referenced algo or NULL.
 1426  */
 1427 static struct fib_lookup_module *
 1428 fib_find_algo(const char *algo_name, int family)
 1429 {
 1430         struct fib_lookup_module *flm;
 1431 
 1432         FIB_MOD_LOCK();
 1433         TAILQ_FOREACH(flm, &all_algo_list, entries) {
 1434                 if ((strcmp(flm->flm_name, algo_name) == 0) &&
 1435                     (family == flm->flm_family)) {
 1436                         flm->flm_refcount++;
 1437                         FIB_MOD_UNLOCK();
 1438                         return (flm);
 1439                 }
 1440         }
 1441         FIB_MOD_UNLOCK();
 1442 
 1443         return (NULL);
 1444 }
 1445 
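      /*
       * Drops a reference on @flm obtained via fib_find_algo().
       */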
 1446 static void
 1447 fib_unref_algo(struct fib_lookup_module *flm)
 1448 {
 1449 
 1450         FIB_MOD_LOCK();
 1451         flm->flm_refcount--;
 1452         FIB_MOD_UNLOCK();
 1453 }
 1454 
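      /*
       * Guts of the per-family "algo" sysctl handlers: looks up the rib and
       *  current algo for @fibnum/@family, switches to the algo named in the
       *  request and pins the choice, disabling automatic algo selection.
       */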
 1455 static int
 1456 set_fib_algo(uint32_t fibnum, int family, struct sysctl_oid *oidp, struct sysctl_req *req)
 1457 {
 1458         struct fib_lookup_module *flm = NULL;
 1459         struct fib_data *fd = NULL;
 1460         char old_algo_name[32], algo_name[32];
 1461         struct rib_head *rh = NULL;
 1462         enum flm_op_result result;
 1463         struct epoch_tracker et;
 1464         int error;
 1465 
 1466         /* Fetch the current algo/rib for the requested fib/family */
 1467         FIB_MOD_LOCK();
 1468         TAILQ_FOREACH(fd, &V_fib_data_list, entries) {
 1469                 if ((fd->fd_family == family) && (fd->fd_fibnum == fibnum))
 1470                         break;
 1471         }
 1472         if (fd == NULL) {
 1473                 FIB_MOD_UNLOCK();
 1474                 return (ENOENT);
 1475         }
 1476         rh = fd->fd_rh;
 1477         strlcpy(old_algo_name, fd->fd_flm->flm_name,
 1478             sizeof(old_algo_name));
 1479         FIB_MOD_UNLOCK();
 1480 
 1481         strlcpy(algo_name, old_algo_name, sizeof(algo_name));
 1482         error = sysctl_handle_string(oidp, algo_name, sizeof(algo_name), req);
 1483         if (error != 0 || req->newptr == NULL)
 1484                 return (error);
 1485 
 1486         if (strcmp(algo_name, old_algo_name) == 0)
 1487                 return (0);
 1488 
 1489         /* New algorithm name is different */
 1490         flm = fib_find_algo(algo_name, family);
 1491         if (flm == NULL) {
 1492                 RH_PRINTF(LOG_INFO, rh, "unable to find algo %s", algo_name);
 1493                 return (ESRCH);
 1494         }
 1495 
 1496         fd = NULL;
 1497         NET_EPOCH_ENTER(et);
 1498         RIB_WLOCK(rh);
 1499         result = setup_fd_instance(flm, rh, NULL, &fd, true);
 1500         RIB_WUNLOCK(rh);
 1501         NET_EPOCH_EXIT(et);
 1502         fib_unref_algo(flm);
 1503         if (result != FLM_SUCCESS)
 1504                 return (EINVAL);
 1505 
 1506         /* Disable automatic switching between algos */
 1507         FIB_MOD_LOCK();
 1508         set_algo_fixed(rh);
 1509         FIB_MOD_UNLOCK();
 1510         /* Remove old instance(s) */
 1511         fib_cleanup_algo(rh, true, false);
 1512 
 1513         /* Drain epoch callbacks so the user can unload the module right after returning to userland, if desired */
 1514         NET_EPOCH_DRAIN_CALLBACKS();
 1515 
 1516         return (0);
 1517 }
 1518 
 1519 #ifdef INET
 1520 static int
 1521 set_algo_inet_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1522 {
 1523 
 1524         return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET, oidp, req));
 1525 }
 1526 SYSCTL_PROC(_net_route_algo_inet, OID_AUTO, algo,
 1527     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
 1528     set_algo_inet_sysctl_handler, "A", "Set IPv4 lookup algo");
 1529 #endif
 1530 
 1531 #ifdef INET6
 1532 static int
 1533 set_algo_inet6_sysctl_handler(SYSCTL_HANDLER_ARGS)
 1534 {
 1535 
 1536         return (set_fib_algo(curthread->td_proc->p_fibnum, AF_INET6, oidp, req));
 1537 }
 1538 SYSCTL_PROC(_net_route_algo_inet6, OID_AUTO, algo,
 1539     CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
 1540     set_algo_inet6_sysctl_handler, "A", "Set IPv6 lookup algo");
 1541 #endif
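
      /*
       * Illustrative note: the handlers above are driven from userland via
       *  sysctl(8), e.g. `sysctl net.route.algo.inet.algo=radix4_lockless`.
       *  The value must name a module registered for the family;
       *  "radix4_lockless" is used here only as an example.
       */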
 1542 
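      /*
       * Placeholder lookup function installed in not-yet-initialised
       *  datapath slots; always reports no match.
       */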
 1543 static struct nhop_object *
 1544 dummy_lookup(void *algo_data, const struct flm_lookup_key key, uint32_t scopeid)
 1545 {
 1546         return (NULL);
 1547 }
 1548 
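      /*
       * Epoch callback freeing a retired datapath pointer array.
       */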
 1549 static void
 1550 destroy_fdh_epoch(epoch_context_t ctx)
 1551 {
 1552         struct fib_dp_header *fdh;
 1553 
 1554         fdh = __containerof(ctx, struct fib_dp_header, fdh_epoch_ctx);
 1555         free(fdh, M_RTABLE);
 1556 }
 1557 
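      /*
       * Allocates a datapath pointer array sized for @num_tables, with each
       *  slot pointing to dummy_lookup().  Returns NULL if @waitok is false
       *  and the allocation fails.
       */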
 1558 static struct fib_dp_header *
 1559 alloc_fib_dp_array(uint32_t num_tables, bool waitok)
 1560 {
 1561         size_t sz;
 1562         struct fib_dp_header *fdh;
 1563 
 1564         sz = sizeof(struct fib_dp_header);
 1565         sz += sizeof(struct fib_dp) * num_tables;
 1566         fdh = malloc(sz, M_RTABLE, (waitok ? M_WAITOK : M_NOWAIT) | M_ZERO);
 1567         if (fdh != NULL) {
 1568                 fdh->fdh_num_tables = num_tables;
 1569                 /*
 1570                  * Set the dummy lookup function pointer, which always returns
 1571                  * NULL, so that algo initialisation can be delayed.
 1572                  */
 1573                 for (uint32_t i = 0; i < num_tables; i++)
 1574                         fdh->fdh_idx[i].f = dummy_lookup;
 1575         }
 1576         return (fdh);
 1577 }
 1578 
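      /*
       * Maps a datapath array pointer back to its enclosing header.
       */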
 1579 static struct fib_dp_header *
 1580 get_fib_dp_header(struct fib_dp *dp)
 1581 {
 1582 
 1583         return (__containerof((void *)dp, struct fib_dp_header, fdh_idx));
 1584 }
 1585 
 1586 /*
 1587  * Replace per-family index pool @pdp with a new one which
 1588  * contains updated callback/algo data from @fd.
 1589  * Returns true on success.
 1590  */
 1591 static bool
 1592 replace_rtables_family(struct fib_dp **pdp, struct fib_data *fd, struct fib_dp *dp)
 1593 {
 1594         struct fib_dp_header *new_fdh, *old_fdh;
 1595 
 1596         NET_EPOCH_ASSERT();
 1597 
 1598         FD_PRINTF(LOG_DEBUG, fd, "[vnet %p] replace with f:%p arg:%p",
 1599             curvnet, dp->f, dp->arg);
 1600 
 1601         FIB_MOD_LOCK();
 1602         old_fdh = get_fib_dp_header(*pdp);
 1603 
 1604         if (old_fdh->fdh_idx[fd->fd_fibnum].f == dp->f) {
 1605                 /*
 1606                  * Function is the same, data pointer needs update.
 1607                  * Perform in-line replace without reallocation.
 1608                  */
 1609                 old_fdh->fdh_idx[fd->fd_fibnum].arg = dp->arg;
 1610                 FD_PRINTF(LOG_DEBUG, fd, "FDH %p inline update", old_fdh);
 1611                 FIB_MOD_UNLOCK();
 1612                 return (true);
 1613         }
 1614 
 1615         new_fdh = alloc_fib_dp_array(old_fdh->fdh_num_tables, false);
 1616         FD_PRINTF(LOG_DEBUG, fd, "OLD FDH: %p NEW FDH: %p", old_fdh, new_fdh);
 1617         if (new_fdh == NULL) {
 1618                 FIB_MOD_UNLOCK();
 1619                 FD_PRINTF(LOG_WARNING, fd, "error attaching datapath");
 1620                 return (false);
 1621         }
 1622 
 1623         memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
 1624             old_fdh->fdh_num_tables * sizeof(struct fib_dp));
 1625         /* Update relevant data structure for @fd */
 1626         new_fdh->fdh_idx[fd->fd_fibnum] = *dp;
 1627 
 1628         /* Ensure memcpy() writes have completed */
 1629         atomic_thread_fence_rel();
 1630         /* Set new datapath pointer */
 1631         *pdp = &new_fdh->fdh_idx[0];
 1632         FIB_MOD_UNLOCK();
 1633         FD_PRINTF(LOG_DEBUG, fd, "update %p -> %p", old_fdh, new_fdh);
 1634 
 1635         fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
 1636 
 1637         return (true);
 1638 }
 1639 
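      /*
       * Returns the per-vnet datapath pointer for @family, or NULL for an
       *  unsupported family.
       */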
 1640 static struct fib_dp **
 1641 get_family_dp_ptr(int family)
 1642 {
 1643         switch (family) {
 1644 #ifdef INET
 1645         case AF_INET:
 1646                 return (&V_inet_dp);
 1647 #endif
 1648 #ifdef INET6
 1649         case AF_INET6:
 1650                 return (&V_inet6_dp);
 1651 #endif
 1652         }
 1653         return (NULL);
 1654 }
 1655 
 1656 /*
 1657  * Make datapath use fib instance @fd
 1658  */
 1659 bool
 1660 fib_set_datapath_ptr(struct fib_data *fd, struct fib_dp *dp)
 1661 {
 1662         struct fib_dp **pdp;
 1663 
 1664         pdp = get_family_dp_ptr(fd->fd_family);
 1665         return (replace_rtables_family(pdp, fd, dp));
 1666 }
 1667 
 1668 /*
 1669  * Grow datapath pointers array.
 1670  * Called from the sysctl handler when the number of routing tables grows.
 1671  */
 1672 static void
 1673 grow_rtables_family(struct fib_dp **pdp, uint32_t new_num_tables)
 1674 {
 1675         struct fib_dp_header *new_fdh, *old_fdh = NULL;
 1676 
 1677         new_fdh = alloc_fib_dp_array(new_num_tables, true);
 1678 
 1679         FIB_MOD_LOCK();
 1680         if (*pdp != NULL) {
 1681                 old_fdh = get_fib_dp_header(*pdp);
 1682                 memcpy(&new_fdh->fdh_idx[0], &old_fdh->fdh_idx[0],
 1683                     old_fdh->fdh_num_tables * sizeof(struct fib_dp));
 1684         }
 1685 
 1686         /* Ensure all array writes have completed */
 1687         atomic_thread_fence_rel();
 1688 
 1689         *pdp = &new_fdh->fdh_idx[0];
 1690         FIB_MOD_UNLOCK();
 1691 
 1692         if (old_fdh != NULL)
 1693                 fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
 1694 }
 1695 
 1696 /*
 1697  * Grows per-AF arrays of datapath pointers for each supported family.
 1698  * Called from fibs resize sysctl handler.
 1699  */
 1700 void
 1701 fib_grow_rtables(uint32_t new_num_tables)
 1702 {
 1703 
 1704 #ifdef INET
 1705         grow_rtables_family(get_family_dp_ptr(AF_INET), new_num_tables);
 1706 #endif
 1707 #ifdef INET6
 1708         grow_rtables_family(get_family_dp_ptr(AF_INET6), new_num_tables);
 1709 #endif
 1710 }
 1711 
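      /*
       * Exports the current rtable size (prefixes, nexthops and, with
       *  ROUTE_MPATH, nexthop groups) of @rh into @rinfo.
       */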
 1712 void
 1713 fib_get_rtable_info(struct rib_head *rh, struct rib_rtable_info *rinfo)
 1714 {
 1715 
 1716         bzero(rinfo, sizeof(struct rib_rtable_info));
 1717         rinfo->num_prefixes = rh->rnh_prefixes;
 1718         rinfo->num_nhops = nhops_get_count(rh);
 1719 #ifdef ROUTE_MPATH
 1720         rinfo->num_nhgrp = nhgrp_get_count(rh);
 1721 #endif
 1722 }
 1723 
 1724 /*
 1725  * Updates pointer to the algo data for the @fd.
 1726  */
 1727 void
 1728 fib_set_algo_ptr(struct fib_data *fd, void *algo_data)
 1729 {
 1730         RIB_WLOCK_ASSERT(fd->fd_rh);
 1731 
 1732         fd->fd_algo_data = algo_data;
 1733 }
 1734 
 1735 /*
 1736  * Calls @callback with @ctx after the end of the current epoch.
 1737  */
 1738 void
 1739 fib_epoch_call(epoch_callback_t callback, epoch_context_t ctx)
 1740 {
 1741         NET_EPOCH_CALL(callback, ctx);
 1742 }
 1743 
 1744 /*
 1745  * Accessor to get rib instance @fd is attached to.
 1746  */
 1747 struct rib_head *
 1748 fib_get_rh(struct fib_data *fd)
 1749 {
 1750 
 1751         return (fd->fd_rh);
 1752 }
 1753 
 1754 /*
 1755  * Accessor to export idx->nhop array
 1756  */
 1757 struct nhop_object **
 1758 fib_get_nhop_array(struct fib_data *fd)
 1759 {
 1760 
 1761         return (fd->nh_idx);
 1762 }
 1763 
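      /*
       * Returns the unified index of @nh, handling both plain nexthops and
       *  nexthop groups.
       */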
 1764 static uint32_t
 1765 get_nhop_idx(struct nhop_object *nh)
 1766 {
 1767 #ifdef ROUTE_MPATH
 1768         if (NH_IS_NHGRP(nh))
 1769                 return (nhgrp_get_idx((struct nhgrp_object *)nh));
 1770         else
 1771 #endif
 1772                 return (nhop_get_idx(nh));
 1773 }
 1774 
 1775 uint32_t
 1776 fib_get_nhop_idx(struct fib_data *fd, struct nhop_object *nh)
 1777 {
 1778 
 1779         return (get_nhop_idx(nh));
 1780 }
 1781 
 1782 static bool
 1783 is_idx_free(struct fib_data *fd, uint32_t index)
 1784 {
 1785 
 1786         return (fd->nh_ref_table->refcnt[index] == 0);
 1787 }
 1788 
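      /*
       * References @nh in the per-instance index table.
       * Returns the nhop index, or 0 if the index falls outside the
       *  preallocated range (recorded in fd->hit_nhops).
       */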
 1789 static uint32_t
 1790 fib_ref_nhop(struct fib_data *fd, struct nhop_object *nh)
 1791 {
 1792         uint32_t idx = get_nhop_idx(nh);
 1793 
 1794         if (idx >= fd->number_nhops) {
 1795                 fd->hit_nhops = 1;
 1796                 return (0);
 1797         }
 1798 
 1799         if (is_idx_free(fd, idx)) {
 1800                 nhop_ref_any(nh);
 1801                 fd->nh_idx[idx] = nh;
 1802                 fd->nh_ref_table->count++;
 1803                 FD_PRINTF(LOG_DEBUG2, fd, " REF nhop %u %p", idx, fd->nh_idx[idx]);
 1804         }
 1805         fd->nh_ref_table->refcnt[idx]++;
 1806 
 1807         return (idx);
 1808 }
 1809 
 1810 struct nhop_release_data {
 1811         struct nhop_object      *nh;
 1812         struct epoch_context    ctx;
 1813 };
 1814 
 1815 static void
 1816 release_nhop_epoch(epoch_context_t ctx)
 1817 {
 1818         struct nhop_release_data *nrd;
 1819 
 1820         nrd = __containerof(ctx, struct nhop_release_data, ctx);
 1821         nhop_free_any(nrd->nh);
 1822         free(nrd, M_TEMP);
 1823 }
 1824 
 1825 /*
 1826  * Delays nexthop refcount release.
 1827  * The datapath may not have its data structures updated yet, so the
 1828  *  old nexthop may still be returned until the end of the current epoch.
 1829  *  Delay the refcount release, as we may be dropping the last reference,
 1830  *  which would trigger nexthop deletion and invalidate that nexthop.
 1831  */
 1832 static void
 1833 fib_schedule_release_nhop(struct fib_data *fd, struct nhop_object *nh)
 1834 {
 1835         struct nhop_release_data *nrd;
 1836 
 1837         nrd = malloc(sizeof(struct nhop_release_data), M_TEMP, M_NOWAIT | M_ZERO);
 1838         if (nrd != NULL) {
 1839                 nrd->nh = nh;
 1840                 fib_epoch_call(release_nhop_epoch, &nrd->ctx);
 1841         } else {
 1842                 /*
 1843                  * Unable to allocate memory. Leak the nexthop to preserve
 1844                  *  the guarantee that each referenced nhop remains valid.
 1845                  */
 1846                 FD_PRINTF(LOG_ERR, fd, "unable to schedule nhop %p deletion", nh);
 1847         }
 1848 }
 1849 
 1850 static void
 1851 fib_unref_nhop(struct fib_data *fd, struct nhop_object *nh)
 1852 {
 1853         uint32_t idx = get_nhop_idx(nh);
 1854 
 1855         KASSERT((idx < fd->number_nhops), ("invalid nhop index"));
 1856         KASSERT((nh == fd->nh_idx[idx]), ("index table contains wrong nh"));
 1857 
 1858         fd->nh_ref_table->refcnt[idx]--;
 1859         if (fd->nh_ref_table->refcnt[idx] == 0) {
 1860                 FD_PRINTF(LOG_DEBUG, fd, " FREE nhop %d %p", idx, fd->nh_idx[idx]);
 1861                 fib_schedule_release_nhop(fd, fd->nh_idx[idx]);
 1862         }
 1863 }
 1864 
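      /*
       * Pins the currently-selected algo for the family of @rh, disabling
       *  automatic algo switching.
       */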
 1865 static void
 1866 set_algo_fixed(struct rib_head *rh)
 1867 {
 1868         switch (rh->rib_family) {
 1869 #ifdef INET
 1870         case AF_INET:
 1871                 V_algo_fixed_inet = true;
 1872                 break;
 1873 #endif
 1874 #ifdef INET6
 1875         case AF_INET6:
 1876                 V_algo_fixed_inet6 = true;
 1877                 break;
 1878 #endif
 1879         }
 1880 }
 1881 
 1882 static bool
 1883 is_algo_fixed(struct rib_head *rh)
 1884 {
 1885 
 1886         switch (rh->rib_family) {
 1887 #ifdef INET
 1888         case AF_INET:
 1889                 return (V_algo_fixed_inet);
 1890 #endif
 1891 #ifdef INET6
 1892         case AF_INET6:
 1893                 return (V_algo_fixed_inet6);
 1894 #endif
 1895         }
 1896         return (false);
 1897 }
 1898 
 1899 /*
 1900  * Runs the check on what would be the best algo for rib @rh, assuming
 1901  *  that the current algo is the one specified by @orig_flm. Note that
 1902  *  it can be NULL for initial selection.
 1903  *
 1904  * Returns referenced new algo or NULL if the current one is the best.
 1905  */
 1906 static struct fib_lookup_module *
 1907 fib_check_best_algo(struct rib_head *rh, struct fib_lookup_module *orig_flm)
 1908 {
 1909         uint8_t preference, curr_preference = 0, best_preference = 0;
 1910         struct fib_lookup_module *flm, *best_flm = NULL;
 1911         struct rib_rtable_info rinfo;
 1912         int candidate_algos = 0;
 1913 
 1914         fib_get_rtable_info(rh, &rinfo);
 1915 
 1916         FIB_MOD_LOCK();
 1917         TAILQ_FOREACH(flm, &all_algo_list, entries) {
 1918                 if (flm->flm_family != rh->rib_family)
 1919                         continue;
 1920                 candidate_algos++;
 1921                 preference = flm->flm_get_pref(&rinfo);
 1922                 if (preference > best_preference) {
 1923                         if (!flm_error_check(flm, rh->rib_fibnum)) {
 1924                                 best_preference = preference;
 1925                                 best_flm = flm;
 1926                         }
 1927                 }
 1928                 if (flm == orig_flm)
 1929                         curr_preference = preference;
 1930         }
 1931         if ((best_flm != NULL) && (curr_preference + BEST_DIFF_PERCENT < best_preference))
 1932                 best_flm->flm_refcount++;
 1933         else
 1934                 best_flm = NULL;
 1935         FIB_MOD_UNLOCK();
 1936 
 1937         RH_PRINTF(LOG_DEBUG, rh, "candidate_algos: %d, curr: %s(%d) result: %s(%d)",
 1938             candidate_algos, orig_flm ? orig_flm->flm_name : "NULL", curr_preference,
 1939             best_flm ? best_flm->flm_name : (orig_flm ? orig_flm->flm_name : "NULL"),
 1940             best_preference);
 1941 
 1942         return (best_flm);
 1943 }
 1944 
 1945 /*
 1946  * Called when a new route table is created.
 1947  * Selects, allocates, and attaches a fib algo for the table.
 1948  */
 1949 static bool
 1950 fib_select_algo_initial(struct rib_head *rh, struct fib_dp *dp)
 1951 {
 1952         struct fib_lookup_module *flm;
 1953         struct fib_data *fd = NULL;
 1954         enum flm_op_result result;
 1955         struct epoch_tracker et;
 1956 
 1957         flm = fib_check_best_algo(rh, NULL);
 1958         if (flm == NULL) {
 1959                 RH_PRINTF(LOG_CRIT, rh, "no algo selected");
 1960                 return (false);
 1961         }
 1962         RH_PRINTF(LOG_INFO, rh, "selected algo %s", flm->flm_name);
 1963 
 1964         NET_EPOCH_ENTER(et);
 1965         RIB_WLOCK(rh);
 1966         result = setup_fd_instance(flm, rh, NULL, &fd, false);
 1967         RIB_WUNLOCK(rh);
 1968         NET_EPOCH_EXIT(et);
 1969 
 1970         RH_PRINTF(LOG_DEBUG, rh, "result=%d fd=%p", result, fd);
 1971         if (result == FLM_SUCCESS)
 1972                 *dp = fd->fd_dp;
 1973         else
 1974                 RH_PRINTF(LOG_CRIT, rh, "unable to setup algo %s", flm->flm_name);
 1975 
 1976         fib_unref_algo(flm);
 1977 
 1978         return (result == FLM_SUCCESS);
 1979 }
 1980 
 1981 /*
 1982  * Sets up fib algo instances for the non-initialized RIBs in the @family.
 1983  * Allocates a temporary datapath index to amortize datapath pointer
 1984  *  updates when @num_tables is large.
 1985  */
 1986 void
 1987 fib_setup_family(int family, uint32_t num_tables)
 1988 {
 1989         struct fib_dp_header *new_fdh = alloc_fib_dp_array(num_tables, false);
 1990         if (new_fdh == NULL) {
 1991                 ALGO_PRINTF(LOG_CRIT, "Unable to setup framework for %s", print_family(family));
 1992                 return;
 1993         }
 1994 
 1995         for (int i = 0; i < num_tables; i++) {
 1996                 struct rib_head *rh = rt_tables_get_rnh(i, family);
 1997                 if (rh->rib_algo_init)
 1998                         continue;
 1999                 if (!fib_select_algo_initial(rh, &new_fdh->fdh_idx[i]))
 2000                         continue;
 2001 
 2002                 rh->rib_algo_init = true;
 2003         }
 2004 
 2005         FIB_MOD_LOCK();
 2006         struct fib_dp **pdp = get_family_dp_ptr(family);
 2007         struct fib_dp_header *old_fdh = get_fib_dp_header(*pdp);
 2008 
 2009         /* Update the items not touched by the new init, from the old data pointer */
 2010         for (int i = 0; i < num_tables; i++) {
 2011                 if (new_fdh->fdh_idx[i].f == dummy_lookup)
 2012                         new_fdh->fdh_idx[i] = old_fdh->fdh_idx[i];
 2013         }
 2014 
 2015         /* Ensure all index writes have completed */
 2016         atomic_thread_fence_rel();
 2017         /* Set new datapath pointer */
 2018         *pdp = &new_fdh->fdh_idx[0];
 2019 
 2020         FIB_MOD_UNLOCK();
 2021 
 2022         fib_epoch_call(destroy_fdh_epoch, &old_fdh->fdh_epoch_ctx);
 2023 }
 2024 
 2025 /*
 2026  * Registers fib lookup module within the subsystem.
 2027  */
 2028 int
 2029 fib_module_register(struct fib_lookup_module *flm)
 2030 {
 2031 
 2032         FIB_MOD_LOCK();
 2033         ALGO_PRINTF(LOG_INFO, "attaching %s to %s", flm->flm_name,
 2034             print_family(flm->flm_family));
 2035         TAILQ_INSERT_TAIL(&all_algo_list, flm, entries);
 2036         FIB_MOD_UNLOCK();
 2037 
 2038         return (0);
 2039 }
 2040 
 2041 /*
 2042  * Tries to unregister fib lookup module.
 2043  *
 2044  * Returns 0 on success, EBUSY if module is still used
 2045  *  by some of the tables.
 2046  */
 2047 int
 2048 fib_module_unregister(struct fib_lookup_module *flm)
 2049 {
 2050 
 2051         FIB_MOD_LOCK();
 2052         if (flm->flm_refcount > 0) {
 2053                 FIB_MOD_UNLOCK();
 2054                 return (EBUSY);
 2055         }
 2056         fib_error_clear_flm(flm);
 2057         ALGO_PRINTF(LOG_INFO, "detaching %s from %s", flm->flm_name,
 2058             print_family(flm->flm_family));
 2059         TAILQ_REMOVE(&all_algo_list, flm, entries);
 2060         FIB_MOD_UNLOCK();
 2061 
 2062         return (0);
 2063 }
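
      /*
       * A minimal sketch of a hypothetical lookup module using the two
       *  functions above.  The "example" name, the get_pref callback and its
       *  exact signature are assumptions for illustration only; a real module
       *  also fills in the lookup/init/destroy callbacks elided here.  Kept
       *  under #if 0 so that the sketch is never compiled.
       */
      #if 0
      static uint8_t
      example_get_pref(const struct rib_rtable_info *rinfo)
      {

              /* Arbitrary policy: prefer this algo for small tables only. */
              return (rinfo->num_prefixes < 1000 ? 100 : 1);
      }

      static struct fib_lookup_module flm_example = {
              .flm_name = "example",
              .flm_family = AF_INET,
              .flm_get_pref = example_get_pref,
      };

      static int
      example_modevent(module_t mod, int type, void *unused)
      {
              int error = 0;

              switch (type) {
              case MOD_LOAD:
                      error = fib_module_register(&flm_example);
                      break;
              case MOD_UNLOAD:
                      /* Fails with EBUSY while any table still uses the algo. */
                      error = fib_module_unregister(&flm_example);
                      break;
              default:
                      error = EOPNOTSUPP;
                      break;
              }
              return (error);
      }
      #endif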
 2064 
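      /*
       * Per-vnet initialisation of the fib algo subsystem state.
       */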
 2065 void
 2066 vnet_fib_init(void)
 2067 {
 2068 
 2069         TAILQ_INIT(&V_fib_data_list);
 2070 }
 2071 
 2072 void
 2073 vnet_fib_destroy(void)
 2074 {
 2075 
 2076         FIB_MOD_LOCK();
 2077         fib_error_clear();
 2078         FIB_MOD_UNLOCK();
 2079 }
