The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_ratelimit.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  *
    3  * SPDX-License-Identifier: BSD-3-Clause
    4  *
    5  * Copyright (c) 2018-2020
    6  *      Netflix Inc.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  *
   29  */
   30 /**
   31  * Author: Randall Stewart <rrs@netflix.com>
   32  */
   33 
   34 #include <sys/cdefs.h>
   35 __FBSDID("$FreeBSD$");
   36 #include "opt_inet.h"
   37 #include "opt_inet6.h"
   38 #include "opt_ipsec.h"
   39 #include "opt_ratelimit.h"
   40 #include <sys/param.h>
   41 #include <sys/kernel.h>
   42 #include <sys/malloc.h>
   43 #include <sys/mbuf.h>
   44 #include <sys/socket.h>
   45 #include <sys/socketvar.h>
   46 #include <sys/sysctl.h>
   47 #include <sys/eventhandler.h>
   48 #include <sys/mutex.h>
   49 #include <sys/ck.h>
   50 #include <net/if.h>
   51 #include <net/if_var.h>
   52 #include <netinet/in.h>
   53 #include <netinet/in_pcb.h>
   54 #define TCPSTATES               /* for logging */
   55 #include <netinet/tcp_var.h>
   56 #include <netinet/tcp_hpts.h>
   57 #include <netinet/tcp_log_buf.h>
   58 #include <netinet/tcp_ratelimit.h>
   59 #ifndef USECS_IN_SECOND
   60 #define USECS_IN_SECOND 1000000
   61 #endif
   62 /*
   63  * For the purposes of each send, what is the size
   64  * of an ethernet frame.
   65  */
   66 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
   67 #ifdef RATELIMIT
   68 
   69 /*
   70  * The following preferred table will seem weird to
   71  * the casual viewer. Why do we not have any rates below
   72  * 1Mbps? Why do we have a rate at 1.44Mbps called common?
   73  * Why do the rates cluster in the 1-100Mbps range more
   74  * than others? Why does the table jump around at the beginning
   75  * and then rise more consistently?
   76  *
   77  * Let me try to answer those questions. A lot of
   78  * this is dependent on the hardware. We have three basic
   79  * supporters of rate limiting:
   80  *
   81  * Chelsio - Supporting 16 configurable rates.
   82  * Mlx  - c4 supporting 13 fixed rates.
   83  * Mlx  - c5 & c6 supporting 127 configurable rates.
   84  *
   85  * The c4 is why we have a common rate that is available
   86  * in all rate tables. This is a selected rate from the
   87  * c4 table and we assure its available in all ratelimit
   88  * tables. This way the tcp_ratelimit code has an assured
   89  * rate it should always be able to get. This answers a
   90  * couple of the questions above.
   91  *
   92  * So what about the rest, well the table is built to
   93  * try to get the most out of a joint hardware/software
   94  * pacing system.  The software pacer will always pick
   95  * a rate higher than the b/w that it is estimating
   96  *
   97  * on the path. This is done for two reasons.
   98  * a) So we can discover more b/w
   99  * and
  100  * b) So we can send a block of MSS's down and then
  101  *    have the software timer go off after the previous
  102  *    send is completely out of the hardware.
  103  *
  104  * But when we do <b> we don't want to have the delay
  105  * between the last packet sent by the hardware be
  106  * excessively long (to reach our desired rate).
  107  *
  108  * So let me give an example for clarity.
  109  *
  110  * Lets assume that the tcp stack sees that 29,110,000 bps is
  111  * what the bw of the path is. The stack would select the
  112  * rate 31Mbps. 31Mbps means that each send that is done
  113  * by the hardware will cause a 390 micro-second gap between
  114  * the packets sent at that rate. For 29,110,000 bps we
  115  * would need 416 micro-seconds gap between each send.
  116  *
  117  * Note that we are calculating a complete time for pacing
  118  * which includes the ethernet, IP and TCP overhead. So
  119  * a full 1514 bytes is used for the above calculations.
  120  * My testing has shown that both cards are also using this
  121  * as their basis i.e. full payload size of the ethernet frame.
  122  * The TCP stack caller needs to be aware of this and make the
  123  * appropriate overhead calculations be included in its choices.
  124  *
  125  * Now, continuing our example, we pick a MSS size based on the
  126  * delta between the two rates (416 - 390) divided into the rate
  127  * we really wish to send at rounded up.  That results in a MSS
  128  * send of 17 mss's at once. The hardware then will
  129  * run out of data in a single 17MSS send in 6,630 micro-seconds.
  130  *
  131  * On the other hand the software pacer will send more data
  132  * in 7,072 micro-seconds. This means that we will refill
  133  * the hardware 52 microseconds after it would have sent
  134  * next if it had not run out of data. This is a win since we are
  135  * only sending every 7ms or so and yet all the packets are spaced on
  136  * the wire with 94% of what they should be and only
  137  * the last packet is delayed extra to make up for the
  138  * difference.
  139  *
  140  * Note that the above formula has two important caveats.
  141  * First, if we are above 100Mbps (b/w wise) we double the result
  142  * of the MSS calculation. The second caveat is if we are 500Mbps
  143  * or more we just send the maximum MSS at once i.e. 45MSS. At
  144  * the higher b/w's even the cards have limits to what times (timer granularity)
  145  * they can insert between packets and start to send more than one
  146  * packet at a time on the wire.
  147  *
  148  */
  149 #define COMMON_RATE 180500      /* Bytes/sec: the 1.44Mbps "common" rate assured in all rate tables */
  150 const uint64_t desired_rates[] = {      /* Preferred rates in bytes per second; "rate N" is the Nth entry */
  151         122500,                 /* 1Mbps (nominal; 122500 B/s = 0.98Mbps) - rate 1 */
  152         180500,                 /* 1.44Mbps - rate 2  common rate */
  153         375000,                 /* 3Mbps    - rate 3 */
  154         625000,                 /* 5Mbps    - rate 4 */
  155         1250000,                /* 10Mbps   - rate 5 */
  156         1875000,                /* 15Mbps   - rate 6 */
  157         2500000,                /* 20Mbps   - rate 7 */
  158         3125000,                /* 25Mbps   - rate 8 */
  159         3750000,                /* 30Mbps   - rate 9 */
  160         4375000,                /* 35Mbps   - rate 10 */
  161         5000000,                /* 40Mbps   - rate 11 */
  162         6250000,                /* 50Mbps   - rate 12 */
  163         12500000,               /* 100Mbps  - rate 13 */
  164         25000000,               /* 200Mbps  - rate 14 */
  165         50000000,               /* 400Mbps  - rate 15 */
  166         100000000,              /* 800Mbps  - rate 16 (end of first ordered group) */
  167         5625000,                /* 45Mbps   - rate 17 (start of second ordered group) */
  168         6875000,                /* 55Mbps   - rate 18 */
  169         7500000,                /* 60Mbps   - rate 19 */
  170         8125000,                /* 65Mbps   - rate 20 */
  171         8750000,                /* 70Mbps   - rate 21 */
  172         9375000,                /* 75Mbps   - rate 22 */
  173         10000000,               /* 80Mbps   - rate 23 */
  174         10625000,               /* 85Mbps   - rate 24 */
  175         11250000,               /* 90Mbps   - rate 25 */
  176         11875000,               /* 95Mbps   - rate 26 */
  177         12500000,               /* 100Mbps  - rate 27 (same value as rate 13) */
  178         13750000,               /* 110Mbps  - rate 28 */
  179         15000000,               /* 120Mbps  - rate 29 */
  180         16250000,               /* 130Mbps  - rate 30 */
  181         17500000,               /* 140Mbps  - rate 31 */
  182         18750000,               /* 150Mbps  - rate 32 */
  183         20000000,               /* 160Mbps  - rate 33 */
  184         21250000,               /* 170Mbps  - rate 34 */
  185         22500000,               /* 180Mbps  - rate 35 */
  186         23750000,               /* 190Mbps  - rate 36 */
  187         26250000,               /* 210Mbps  - rate 37 */
  188         27500000,               /* 220Mbps  - rate 38 */
  189         28750000,               /* 230Mbps  - rate 39 */
  190         30000000,               /* 240Mbps  - rate 40 */
  191         31250000,               /* 250Mbps  - rate 41 */
  192         34375000,               /* 275Mbps  - rate 42 */
  193         37500000,               /* 300Mbps  - rate 43 */
  194         40625000,               /* 325Mbps  - rate 44 */
  195         43750000,               /* 350Mbps  - rate 45 */
  196         46875000,               /* 375Mbps  - rate 46 */
  197         53125000,               /* 425Mbps  - rate 47 */
  198         56250000,               /* 450Mbps  - rate 48 */
  199         59375000,               /* 475Mbps  - rate 49 */
  200         62500000,               /* 500Mbps  - rate 50 */
  201         68750000,               /* 550Mbps  - rate 51 */
  202         75000000,               /* 600Mbps  - rate 52 */
  203         81250000,               /* 650Mbps  - rate 53 */
  204         87500000,               /* 700Mbps  - rate 54 */
  205         93750000,               /* 750Mbps  - rate 55 */
  206         106250000,              /* 850Mbps  - rate 56 */
  207         112500000,              /* 900Mbps  - rate 57 */
  208         125000000,              /* 1Gbps    - rate 58 */
  209         156250000,              /* 1.25Gbps - rate 59 */
  210         187500000,              /* 1.5Gbps  - rate 60 */
  211         218750000,              /* 1.75Gbps - rate 61 */
  212         250000000,              /* 2Gbps    - rate 62 */
  213         281250000,              /* 2.25Gbps - rate 63 */
  214         312500000,              /* 2.5Gbps  - rate 64 */
  215         343750000,              /* 2.75Gbps - rate 65 */
  216         375000000,              /* 3Gbps    - rate 66 */
  217         500000000,              /* 4Gbps    - rate 67 */
  218         625000000,              /* 5Gbps    - rate 68 */
  219         750000000,              /* 6Gbps    - rate 69 */
  220         875000000,              /* 7Gbps    - rate 70 */
  221         1000000000,             /* 8Gbps    - rate 71 */
  222         1125000000,             /* 9Gbps    - rate 72 */
  223         1250000000,             /* 10Gbps   - rate 73 */
  224         1875000000,             /* 15Gbps   - rate 74 */
  225         2500000000              /* 20Gbps   - rate 75 */
  226 };
  227 
  228 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t)) /* Entry count of desired_rates[] */
  229 #define RS_ORDERED_COUNT 16     /*
  230                                  * Number of entries at the beginning
  231                                  * of the table that are in ascending
  232                                  * order; using more needs a merge.
  233                                  */
  234 #define RS_NEXT_ORDER_GROUP 16  /*
  235                                  * Index in the table at which the
  236                                  * second ordered group starts (the
  237                                  * 17th entry, i.e. index 16).
  238                                  */
  239 #define ALL_HARDWARE_RATES 1004 /*
  240                                  * 1Meg - 1Gig in 1 Meg steps
  241                                  * plus 100k, 200k and 500k and
  242                                  * 10Gig
  243                                  */
  244 
  245 #define RS_ONE_MEGABIT_PERSEC 1000000
  246 #define RS_ONE_GIGABIT_PERSEC 1000000000
  247 #define RS_TEN_GIGABIT_PERSEC 10000000000 /* Too large for a 32-bit integer */
  248 
  249 static struct head_tcp_rate_set int_rs; /* List head that new rate sets are inserted on */
  250 static struct mtx rs_mtx;       /* Held while int_rs and rs_number_dead are modified */
  251 uint32_t rs_number_alive;       /* Interfaces initialized for ratelimiting (sysctl "alive") */
  252 uint32_t rs_number_dead;        /* Rate sets with a deferred destroy pending (sysctl "dead") */
  253 static uint32_t rs_floor_mss = 0;       /* MSS count overriding the normal minimums; 0 = don't enforce */
  254 static uint32_t wait_time_floor = 8000; /* 8 ms (so presumably in microseconds -- confirm) */
  255 static uint32_t rs_hw_floor_mss = 16;   /* Minimum number of MSS for hardware pacing */
  256 static uint32_t num_of_waits_allowed = 1; /* How many time blocks are we willing to wait */
  257 
  258 static uint32_t mss_divisor = RL_DEFAULT_DIVISOR;       /* Divided into bytes/sec to help establish the mss size */
  259 static uint32_t even_num_segs = 1;      /* Non-zero: round up to an even number of segments (delayed ack) */
  260 static uint32_t even_threshold = 4;     /* Number of MSS at which rounding up to even starts */
  261 
  262 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  263     "TCP Ratelimit stats");
  264 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
  265     &rs_number_alive, 0,
  266     "Number of interfaces initialized for ratelimiting");
  267 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
  268     &rs_number_dead, 0,
  269     "Number of interfaces departing from ratelimiting");
  270 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, floor_mss, CTLFLAG_RW,
  271     &rs_floor_mss, 0,
  272     "Number of MSS that will override the normal minimums (0 means don't enforce)");
  273 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, wait_floor, CTLFLAG_RW,
  274     &wait_time_floor, 2000,
  275     "Has b/w increases what is the wait floor we are willing to wait at the end?");
  276 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, time_blocks, CTLFLAG_RW,
  277     &num_of_waits_allowed, 1,
  278     "How many time blocks on the end should software pacing be willing to wait?");
  279 
  280 SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, hw_floor_mss, CTLFLAG_RW,
  281     &rs_hw_floor_mss, 16,
  282     "Number of mss that are a minum for hardware pacing?");
  283 
  284 SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, divisor, CTLFLAG_RW,
  285     &mss_divisor, RL_DEFAULT_DIVISOR,
  286     "The value divided into bytes per second to help establish mss size");
  287 SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, even, CTLFLAG_RW,
  288     &even_num_segs, 1,
  289     "Do we round mss size up to an even number of segments for delayed ack");
  290 SYSCTL_INT(_net_inet_tcp_rl, OID_AUTO, eventhresh, CTLFLAG_RW,
  291     &even_threshold, 4,
  292     "At what number of mss do we start rounding up to an even number of mss?");
  293 
  294 static void
  295 rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
  296 {
  297         /*
  298          * Add sysctl entries for thus interface.
  299          */
  300         if (rs->rs_flags & RS_INTF_NO_SUP) {
  301                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
  302                    SYSCTL_CHILDREN(rl_sysctl_root),
  303                    OID_AUTO, "disable", CTLFLAG_RD,
  304                    &rs->rs_disable, 0,
  305                    "Disable this interface from new hdwr limiting?");
  306         } else {
  307                 SYSCTL_ADD_S32(&rs->sysctl_ctx,
  308                    SYSCTL_CHILDREN(rl_sysctl_root),
  309                    OID_AUTO, "disable", CTLFLAG_RW,
  310                    &rs->rs_disable, 0,
  311                    "Disable this interface from new hdwr limiting?");
  312         }
  313         SYSCTL_ADD_S32(&rs->sysctl_ctx,
  314             SYSCTL_CHILDREN(rl_sysctl_root),
  315             OID_AUTO, "minseg", CTLFLAG_RW,
  316             &rs->rs_min_seg, 0,
  317             "What is the minimum we need to send on this interface?");
  318         SYSCTL_ADD_U64(&rs->sysctl_ctx,
  319             SYSCTL_CHILDREN(rl_sysctl_root),
  320             OID_AUTO, "flow_limit", CTLFLAG_RW,
  321             &rs->rs_flow_limit, 0,
  322             "What is the limit for number of flows (0=unlimited)?");
  323         SYSCTL_ADD_S32(&rs->sysctl_ctx,
  324             SYSCTL_CHILDREN(rl_sysctl_root),
  325             OID_AUTO, "highest", CTLFLAG_RD,
  326             &rs->rs_highest_valid, 0,
  327             "Highest valid rate");
  328         SYSCTL_ADD_S32(&rs->sysctl_ctx,
  329             SYSCTL_CHILDREN(rl_sysctl_root),
  330             OID_AUTO, "lowest", CTLFLAG_RD,
  331             &rs->rs_lowest_valid, 0,
  332             "Lowest valid rate");
  333         SYSCTL_ADD_S32(&rs->sysctl_ctx,
  334             SYSCTL_CHILDREN(rl_sysctl_root),
  335             OID_AUTO, "flags", CTLFLAG_RD,
  336             &rs->rs_flags, 0,
  337             "What lags are on the entry?");
  338         SYSCTL_ADD_S32(&rs->sysctl_ctx,
  339             SYSCTL_CHILDREN(rl_sysctl_root),
  340             OID_AUTO, "numrates", CTLFLAG_RD,
  341             &rs->rs_rate_cnt, 0,
  342             "How many rates re there?");
  343         SYSCTL_ADD_U64(&rs->sysctl_ctx,
  344             SYSCTL_CHILDREN(rl_sysctl_root),
  345             OID_AUTO, "flows_using", CTLFLAG_RD,
  346             &rs->rs_flows_using, 0,
  347             "How many flows are using this interface now?");
  348 #ifdef DETAILED_RATELIMIT_SYSCTL
  349         if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
  350                 /*  Lets display the rates */
  351                 int i;
  352                 struct sysctl_oid *rl_rates;
  353                 struct sysctl_oid *rl_rate_num;
  354                 char rate_num[16];
  355                 rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
  356                                             SYSCTL_CHILDREN(rl_sysctl_root),
  357                                             OID_AUTO,
  358                                             "rate",
  359                                             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  360                                             "Ratelist");
  361                 for( i = 0; i < rs->rs_rate_cnt; i++) {
  362                         sprintf(rate_num, "%d", i);
  363                         rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
  364                                             SYSCTL_CHILDREN(rl_rates),
  365                                             OID_AUTO,
  366                                             rate_num,
  367                                             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  368                                             "Individual Rate");
  369                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
  370                                        SYSCTL_CHILDREN(rl_rate_num),
  371                                        OID_AUTO, "flags", CTLFLAG_RD,
  372                                        &rs->rs_rlt[i].flags, 0,
  373                                        "Flags on this rate");
  374                         SYSCTL_ADD_U32(&rs->sysctl_ctx,
  375                                        SYSCTL_CHILDREN(rl_rate_num),
  376                                        OID_AUTO, "pacetime", CTLFLAG_RD,
  377                                        &rs->rs_rlt[i].time_between, 0,
  378                                        "Time hardware inserts between 1500 byte sends");
  379                         SYSCTL_ADD_LONG(&rs->sysctl_ctx,
  380                                        SYSCTL_CHILDREN(rl_rate_num),
  381                                        OID_AUTO, "rate", CTLFLAG_RD,
  382                                        &rs->rs_rlt[i].rate,
  383                                        "Rate in bytes per second");
  384                         SYSCTL_ADD_LONG(&rs->sysctl_ctx,
  385                                        SYSCTL_CHILDREN(rl_rate_num),
  386                                        OID_AUTO, "using", CTLFLAG_RD,
  387                                        &rs->rs_rlt[i].using,
  388                                        "Number of flows using");
  389                         SYSCTL_ADD_LONG(&rs->sysctl_ctx,
  390                                        SYSCTL_CHILDREN(rl_rate_num),
  391                                        OID_AUTO, "enobufs", CTLFLAG_RD,
  392                                        &rs->rs_rlt[i].rs_num_enobufs,
  393                                        "Number of enobufs logged on this rate");
  394 
  395                 }
  396         }
  397 #endif
  398 }
  399 
  400 static void
  401 rs_destroy(epoch_context_t ctx)
  402 {
  403         struct tcp_rate_set *rs;
  404         bool do_free_rs;
  405 
  406         rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
  407 
  408         mtx_lock(&rs_mtx);
  409         rs->rs_flags &= ~RS_FUNERAL_SCHD;
  410         /*
  411          * In theory its possible (but unlikely)
  412          * that while the delete was occuring
  413          * and we were applying the DEAD flag
  414          * someone slipped in and found the
  415          * interface in a lookup. While we
  416          * decided rs_flows_using were 0 and
  417          * scheduling the epoch_call, the other
  418          * thread incremented rs_flow_using. This
  419          * is because users have a pointer and
  420          * we only use the rs_flows_using in an
  421          * atomic fashion, i.e. the other entities
  422          * are not protected. To assure this did
  423          * not occur, we check rs_flows_using here
  424          * before deleting.
  425          */
  426         do_free_rs = (rs->rs_flows_using == 0);
  427         rs_number_dead--;
  428         mtx_unlock(&rs_mtx);
  429 
  430         if (do_free_rs) {
  431                 sysctl_ctx_free(&rs->sysctl_ctx);
  432                 free(rs->rs_rlt, M_TCPPACE);
  433                 free(rs, M_TCPPACE);
  434         }
  435 }
  436 
  437 static void
  438 rs_defer_destroy(struct tcp_rate_set *rs)
  439 {
  440 
  441         mtx_assert(&rs_mtx, MA_OWNED);
  442 
  443         /* Check if already pending. */
  444         if (rs->rs_flags & RS_FUNERAL_SCHD)
  445                 return;
  446 
  447         rs_number_dead++;
  448 
  449         /* Set flag to only defer once. */
  450         rs->rs_flags |= RS_FUNERAL_SCHD;
  451         NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
  452 }
  453 
  454 #ifdef INET
  455 extern counter_u64_t rate_limit_new;
  456 extern counter_u64_t rate_limit_chg;
  457 extern counter_u64_t rate_limit_set_ok;
  458 extern counter_u64_t rate_limit_active;
  459 extern counter_u64_t rate_limit_alloc_fail;
  460 #endif
  461 
  462 static int
  463 rl_attach_txrtlmt(struct ifnet *ifp,
  464     uint32_t flowtype,
  465     int flowid,
  466     uint64_t cfg_rate,
  467     struct m_snd_tag **tag)
  468 {
  469         int error;
  470         union if_snd_tag_alloc_params params = {
  471                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
  472                 .rate_limit.hdr.flowid = flowid,
  473                 .rate_limit.hdr.flowtype = flowtype,
  474                 .rate_limit.max_rate = cfg_rate,
  475                 .rate_limit.flags = M_NOWAIT,
  476         };
  477 
  478         error = m_snd_tag_alloc(ifp, &params, tag);
  479 #ifdef INET
  480         if (error == 0) {
  481                 counter_u64_add(rate_limit_set_ok, 1);
  482                 counter_u64_add(rate_limit_active, 1);
  483         } else if (error != EOPNOTSUPP)
  484                 counter_u64_add(rate_limit_alloc_fail, 1);
  485 #endif
  486         return (error);
  487 }
  488 
  489 static void
  490 populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
  491 {
  492         /*
  493          * The internal table is "special", it
  494          * is two seperate ordered tables that
  495          * must be merged. We get here when the
  496          * adapter specifies a number of rates that
  497          * covers both ranges in the table in some
  498          * form.
  499          */
  500         int i, at_low, at_high;
  501         uint8_t low_disabled = 0, high_disabled = 0;
  502 
  503         for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
  504                 rs->rs_rlt[i].flags = 0;
  505                 rs->rs_rlt[i].time_between = 0;
  506                 if ((low_disabled == 0) &&
  507                     (high_disabled ||
  508                      (rate_table_act[at_low] < rate_table_act[at_high]))) {
  509                         rs->rs_rlt[i].rate = rate_table_act[at_low];
  510                         at_low++;
  511                         if (at_low == RS_NEXT_ORDER_GROUP)
  512                                 low_disabled = 1;
  513                 } else if (high_disabled == 0) {
  514                         rs->rs_rlt[i].rate = rate_table_act[at_high];
  515                         at_high++;
  516                         if (at_high == MAX_HDWR_RATES)
  517                                 high_disabled = 1;
  518                 }
  519         }
  520 }
  521 
  522 static struct tcp_rate_set *
  523 rt_setup_new_rs(struct ifnet *ifp, int *error)
  524 {
  525         struct tcp_rate_set *rs;
  526         const uint64_t *rate_table_act;
  527         uint64_t lentim, res;
  528         size_t sz;
  529         uint32_t hash_type;
  530         int i;
  531         struct if_ratelimit_query_results rl;
  532         struct sysctl_oid *rl_sysctl_root;
  533         struct epoch_tracker et;
  534         /*
  535          * We expect to enter with the
  536          * mutex locked.
  537          */
  538 
  539         if (ifp->if_ratelimit_query == NULL) {
  540                 /*
  541                  * We can do nothing if we cannot
  542                  * get a query back from the driver.
  543                  */
  544                 printf("Warning:No query functions for %s:%d-- failed\n",
  545                        ifp->if_dname, ifp->if_dunit);
  546                 return (NULL);
  547         }
  548         rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
  549         if (rs == NULL) {
  550                 if (error)
  551                         *error = ENOMEM;
  552                 printf("Warning:No memory for malloc of tcp_rate_set\n");
  553                 return (NULL);
  554         }
  555         memset(&rl, 0, sizeof(rl));
  556         rl.flags = RT_NOSUPPORT;
  557         ifp->if_ratelimit_query(ifp, &rl);
  558         if (rl.flags & RT_IS_UNUSABLE) {
  559                 /*
  560                  * The interface does not really support
  561                  * the rate-limiting.
  562                  */
  563                 memset(rs, 0, sizeof(struct tcp_rate_set));
  564                 rs->rs_ifp = ifp;
  565                 rs->rs_if_dunit = ifp->if_dunit;
  566                 rs->rs_flags = RS_INTF_NO_SUP;
  567                 rs->rs_disable = 1;
  568                 rs_number_alive++;
  569                 sysctl_ctx_init(&rs->sysctl_ctx);
  570                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
  571                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
  572                     OID_AUTO,
  573                     rs->rs_ifp->if_xname,
  574                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  575                     "");
  576                 rl_add_syctl_entries(rl_sysctl_root, rs);
  577                 NET_EPOCH_ENTER(et);
  578                 mtx_lock(&rs_mtx);
  579                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
  580                 mtx_unlock(&rs_mtx);
  581                 NET_EPOCH_EXIT(et);
  582                 return (rs);
  583         } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
  584                 memset(rs, 0, sizeof(struct tcp_rate_set));
  585                 rs->rs_ifp = ifp;
  586                 rs->rs_if_dunit = ifp->if_dunit;
  587                 rs->rs_flags = RS_IS_DEFF;
  588                 rs_number_alive++;
  589                 sysctl_ctx_init(&rs->sysctl_ctx);
  590                 rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
  591                     SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
  592                     OID_AUTO,
  593                     rs->rs_ifp->if_xname,
  594                     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  595                     "");
  596                 rl_add_syctl_entries(rl_sysctl_root, rs);
  597                 NET_EPOCH_ENTER(et);
  598                 mtx_lock(&rs_mtx);
  599                 CK_LIST_INSERT_HEAD(&int_rs, rs, next);
  600                 mtx_unlock(&rs_mtx);
  601                 NET_EPOCH_EXIT(et);
  602                 return (rs);
  603         } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
  604                 /* Mellanox C4 likely */
  605                 rs->rs_ifp = ifp;
  606                 rs->rs_if_dunit = ifp->if_dunit;
  607                 rs->rs_rate_cnt = rl.number_of_rates;
  608                 rs->rs_min_seg = rl.min_segment_burst;
  609                 rs->rs_highest_valid = 0;
  610                 rs->rs_flow_limit = rl.max_flows;
  611                 rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
  612                 rs->rs_disable = 0;
  613                 rate_table_act = rl.rate_table;
  614         } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
  615                 /* Chelsio, C5 and C6 of Mellanox? */
  616                 rs->rs_ifp = ifp;
  617                 rs->rs_if_dunit = ifp->if_dunit;
  618                 rs->rs_rate_cnt = rl.number_of_rates;
  619                 rs->rs_min_seg = rl.min_segment_burst;
  620                 rs->rs_disable = 0;
  621                 rs->rs_flow_limit = rl.max_flows;
  622                 rate_table_act = desired_rates;
  623                 if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
  624                     (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
  625                         /*
  626                          * Our desired table is not big
  627                          * enough, do what we can.
  628                          */
  629                         rs->rs_rate_cnt = MAX_HDWR_RATES;
  630                  }
  631                 if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
  632                         rs->rs_flags = RS_IS_INTF;
  633                 else
  634                         rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
  635                 if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
  636                         rs->rs_rate_cnt = ALL_HARDWARE_RATES;
  637         } else {
  638                 free(rs, M_TCPPACE);
  639                 return (NULL);
  640         }
  641         sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
  642         rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
  643         if (rs->rs_rlt == NULL) {
  644                 if (error)
  645                         *error = ENOMEM;
  646 bail:
  647                 free(rs, M_TCPPACE);
  648                 return (NULL);
  649         }
  650         if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
  651                 /*
  652                  * The interface supports all
  653                  * the rates we could possibly want.
  654                  */
  655                 uint64_t rat;
  656 
  657                 rs->rs_rlt[0].rate = 12500;     /* 100k */
  658                 rs->rs_rlt[1].rate = 25000;     /* 200k */
  659                 rs->rs_rlt[2].rate = 62500;     /* 500k */
  660                 /* Note 125000 == 1Megabit
  661                  * populate 1Meg - 1000meg.
  662                  */
  663                 for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
  664                         rs->rs_rlt[i].rate = rat;
  665                         rat += 125000;
  666                 }
  667                 rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
  668         } else if (rs->rs_flags & RS_INT_TBL) {
  669                 /* We populate this in a special way */
  670                 populate_canned_table(rs, rate_table_act);
  671         } else {
  672                 /*
  673                  * Just copy in the rates from
  674                  * the table, it is in order.
  675                  */
  676                 for (i=0; i<rs->rs_rate_cnt; i++) {
  677                         rs->rs_rlt[i].rate = rate_table_act[i];
  678                         rs->rs_rlt[i].time_between = 0;
  679                         rs->rs_rlt[i].flags = 0;
  680                 }
  681         }
  682         for (i = (rs->rs_rate_cnt - 1); i >= 0; i--) {
  683                 /*
  684                  * We go backwards through the list so that if we can't get
  685                  * a rate and fail to init one, we have at least a chance of
  686                  * getting the highest one.
  687                  */
  688                 rs->rs_rlt[i].ptbl = rs;
  689                 rs->rs_rlt[i].tag = NULL;
  690                 rs->rs_rlt[i].using = 0;
  691                 rs->rs_rlt[i].rs_num_enobufs = 0;
  692                 /*
  693                  * Calculate the time between.
  694                  */
  695                 lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
  696                 res = lentim / rs->rs_rlt[i].rate;
  697                 if (res > 0)
  698                         rs->rs_rlt[i].time_between = res;
  699                 else
  700                         rs->rs_rlt[i].time_between = 1;
  701                 if (rs->rs_flags & RS_NO_PRE) {
  702                         rs->rs_rlt[i].flags = HDWRPACE_INITED;
  703                         rs->rs_lowest_valid = i;
  704                 } else {
  705                         int err;
  706 
  707                         if ((rl.flags & RT_IS_SETUP_REQ)  &&
  708                             (ifp->if_ratelimit_query)) {
  709                                 err = ifp->if_ratelimit_setup(ifp,
  710                                          rs->rs_rlt[i].rate, i);
  711                                 if (err)
  712                                         goto handle_err;
  713                         }
  714 #ifdef RSS
  715                         hash_type = M_HASHTYPE_RSS_TCP_IPV4;
  716 #else
  717                         hash_type = M_HASHTYPE_OPAQUE_HASH;
  718 #endif
  719                         err = rl_attach_txrtlmt(ifp,
  720                             hash_type,
  721                             (i + 1),
  722                             rs->rs_rlt[i].rate,
  723                             &rs->rs_rlt[i].tag);
  724                         if (err) {
  725 handle_err:
  726                                 if (i == (rs->rs_rate_cnt - 1)) {
  727                                         /*
  728                                          * Huh - first rate and we can't get
  729                                          * it?
  730                                          */
  731                                         free(rs->rs_rlt, M_TCPPACE);
  732                                         if (error)
  733                                                 *error = err;
  734                                         goto bail;
  735                                 } else {
  736                                         if (error)
  737                                                 *error = err;
  738                                 }
  739                                 break;
  740                         } else {
  741                                 rs->rs_rlt[i].flags = HDWRPACE_INITED | HDWRPACE_TAGPRESENT;
  742                                 rs->rs_lowest_valid = i;
  743                         }
  744                 }
  745         }
  746         /* Did we get at least 1 rate? */
  747         if (rs->rs_rlt[(rs->rs_rate_cnt - 1)].flags & HDWRPACE_INITED)
  748                 rs->rs_highest_valid = rs->rs_rate_cnt - 1;
  749         else {
  750                 free(rs->rs_rlt, M_TCPPACE);
  751                 goto bail;
  752         }
  753         rs_number_alive++;
  754         sysctl_ctx_init(&rs->sysctl_ctx);
  755         rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
  756             SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
  757             OID_AUTO,
  758             rs->rs_ifp->if_xname,
  759             CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  760             "");
  761         rl_add_syctl_entries(rl_sysctl_root, rs);
  762         NET_EPOCH_ENTER(et);
  763         mtx_lock(&rs_mtx);
  764         CK_LIST_INSERT_HEAD(&int_rs, rs, next);
  765         mtx_unlock(&rs_mtx);
  766         NET_EPOCH_EXIT(et);
  767         return (rs);
  768 }
  769 
  770 /*
  771  * For an explanation of why the argument is volatile please
  772  * look at the comments around rt_setup_rate().
  773  */
  774 static const struct tcp_hwrate_limit_table *
  775 tcp_int_find_suitable_rate(const volatile struct tcp_rate_set *rs,
  776     uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
  777 {
  778         struct tcp_hwrate_limit_table *arte = NULL, *rte = NULL;
  779         uint64_t mbits_per_sec, ind_calc, previous_rate = 0;
  780         int i;
  781 
  782         mbits_per_sec = (bytes_per_sec * 8);
  783         if (flags & RS_PACING_LT) {
  784                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
  785                     (rs->rs_lowest_valid <= 2)){
  786                         /*
  787                          * Smaller than 1Meg, only
  788                          * 3 entries can match it.
  789                          */
  790                         previous_rate = 0;
  791                         for(i = rs->rs_lowest_valid; i < 3; i++) {
  792                                 if (bytes_per_sec <= rs->rs_rlt[i].rate) {
  793                                         rte = &rs->rs_rlt[i];
  794                                         break;
  795                                 } else if (rs->rs_rlt[i].flags & HDWRPACE_INITED) {
  796                                         arte = &rs->rs_rlt[i];
  797                                 }
  798                                 previous_rate = rs->rs_rlt[i].rate;
  799                         }
  800                         goto done;
  801                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
  802                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
  803                         /*
  804                          * Larger than 1G (the majority of
  805                          * our table.
  806                          */
  807                         if (mbits_per_sec < RS_TEN_GIGABIT_PERSEC)
  808                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
  809                         else
  810                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
  811                         previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
  812                         goto done;
  813                 }
  814                 /*
  815                  * If we reach here its in our table (between 1Meg - 1000Meg),
  816                  * just take the rounded down mbits per second, and add
  817                  * 1Megabit to it, from this we can calculate
  818                  * the index in the table.
  819                  */
  820                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
  821                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) != mbits_per_sec)
  822                         ind_calc++;
  823                 /* our table is offset by 3, we add 2 */
  824                 ind_calc += 2;
  825                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
  826                         /* This should not happen */
  827                         ind_calc = ALL_HARDWARE_RATES-1;
  828                 }
  829                 if ((ind_calc >= rs->rs_lowest_valid) &&
  830                     (ind_calc <= rs->rs_highest_valid)) {
  831                         rte = &rs->rs_rlt[ind_calc];
  832                         if (ind_calc >= 1)
  833                                 previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
  834                 }
  835         } else if (flags & RS_PACING_EXACT_MATCH) {
  836                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
  837                     (rs->rs_lowest_valid <= 2)){
  838                         for(i = rs->rs_lowest_valid; i < 3; i++) {
  839                                 if (bytes_per_sec == rs->rs_rlt[i].rate) {
  840                                         rte = &rs->rs_rlt[i];
  841                                         break;
  842                                 }
  843                         }
  844                 } else if ((mbits_per_sec > RS_ONE_GIGABIT_PERSEC) &&
  845                            (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
  846                         /* > 1Gbps only one rate */
  847                         if (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) {
  848                                 /* Its 10G wow */
  849                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
  850                         }
  851                 } else {
  852                         /* Ok it must be a exact meg (its between 1G and 1Meg) */
  853                         ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
  854                         if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
  855                                 /* its an exact Mbps */
  856                                 ind_calc += 2;
  857                                 if (ind_calc > (ALL_HARDWARE_RATES-1)) {
  858                                         /* This should not happen */
  859                                         ind_calc = ALL_HARDWARE_RATES-1;
  860                                 }
  861                                 if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED)
  862                                         rte = &rs->rs_rlt[ind_calc];
  863                         }
  864                 }
  865         } else {
  866                 /* we want greater than the requested rate */
  867                 if ((mbits_per_sec < RS_ONE_MEGABIT_PERSEC) &&
  868                     (rs->rs_lowest_valid <= 2)){
  869                         arte = &rs->rs_rlt[3]; /* set alternate to 1Meg */
  870                         for (i=2; i>=rs->rs_lowest_valid; i--) {
  871                                 if (bytes_per_sec < rs->rs_rlt[i].rate) {
  872                                         rte = &rs->rs_rlt[i];
  873                                         if (i >= 1) {
  874                                                 previous_rate = rs->rs_rlt[(i-1)].rate;
  875                                         }
  876                                         break;
  877                                 } else if ((flags & RS_PACING_GEQ) &&
  878                                            (bytes_per_sec == rs->rs_rlt[i].rate)) {
  879                                         rte = &rs->rs_rlt[i];
  880                                         if (i >= 1) {
  881                                                 previous_rate = rs->rs_rlt[(i-1)].rate;
  882                                         }
  883                                         break;
  884                                 } else {
  885                                         arte = &rs->rs_rlt[i]; /* new alternate */
  886                                 }
  887                         }
  888                 } else if (mbits_per_sec > RS_ONE_GIGABIT_PERSEC) {
  889                         if ((bytes_per_sec < rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
  890                             (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)){
  891                                 /* Our top rate is larger than the request */
  892                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
  893                         } else if ((flags & RS_PACING_GEQ) &&
  894                                    (bytes_per_sec == rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate) &&
  895                                    (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED)) {
  896                                 /* It matches our top rate */
  897                                 rte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
  898                         } else if (rs->rs_rlt[(ALL_HARDWARE_RATES-1)].flags & HDWRPACE_INITED) {
  899                                 /* The top rate is an alternative */
  900                                 arte = &rs->rs_rlt[(ALL_HARDWARE_RATES-1)];
  901                         }
  902                         previous_rate = rs->rs_rlt[(ALL_HARDWARE_RATES-2)].rate;
  903                 } else {
  904                         /* Its in our range 1Meg - 1Gig */
  905                         if (flags & RS_PACING_GEQ) {
  906                                 ind_calc = mbits_per_sec/RS_ONE_MEGABIT_PERSEC;
  907                                 if ((ind_calc * RS_ONE_MEGABIT_PERSEC) == mbits_per_sec) {
  908                                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
  909                                                 /* This should not happen */
  910                                                 ind_calc = (ALL_HARDWARE_RATES-1);
  911                                         }
  912                                         rte = &rs->rs_rlt[ind_calc];
  913                                         if (ind_calc >= 1)
  914                                                 previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
  915                                 }
  916                                 goto done;
  917                         }
  918                         ind_calc = (mbits_per_sec + (RS_ONE_MEGABIT_PERSEC-1))/RS_ONE_MEGABIT_PERSEC;
  919                         ind_calc += 2;
  920                         if (ind_calc > (ALL_HARDWARE_RATES-1)) {
  921                                 /* This should not happen */
  922                                 ind_calc = ALL_HARDWARE_RATES-1;
  923                         }
  924                         if (rs->rs_rlt[ind_calc].flags & HDWRPACE_INITED) {
  925                                 rte = &rs->rs_rlt[ind_calc];
  926                                 if (ind_calc >= 1)
  927                                         previous_rate = rs->rs_rlt[(ind_calc-1)].rate;
  928                         }
  929                 }
  930         }
  931 done:
  932         if ((rte == NULL) &&
  933             (arte != NULL) &&
  934             (flags & RS_PACING_SUB_OK)) {
  935                 /* We can use the substitute */
  936                 rte = arte;
  937         }
  938         if (lower_rate)
  939                 *lower_rate = previous_rate;
  940         return (rte);
  941 }
  942 
  943 /*
  944  * For an explanation of why the argument is volatile please
  945  * look at the comments around rt_setup_rate().
  946  */
  947 static const struct tcp_hwrate_limit_table *
  948 tcp_find_suitable_rate(const volatile struct tcp_rate_set *rs, uint64_t bytes_per_sec, uint32_t flags, uint64_t *lower_rate)
  949 {
  950         /**
  951          * Hunt the rate table with the restrictions in flags and find a
  952          * suitable rate if possible.
  953          * RS_PACING_EXACT_MATCH - look for an exact match to rate.
  954          * RS_PACING_GT     - must be greater than.
  955          * RS_PACING_GEQ    - must be greater than or equal.
  956          * RS_PACING_LT     - must be less than.
  957          * RS_PACING_SUB_OK - If we don't meet criteria a
  958          *                    substitute is ok.
  959          */
  960         int i, matched;
  961         struct tcp_hwrate_limit_table *rte = NULL;
  962         uint64_t previous_rate = 0;
  963 
  964         if ((rs->rs_flags & RS_INT_TBL) &&
  965             (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)) {
  966                 /*
  967                  * Here we don't want to paw thru
  968                  * a big table, we have everything
  969                  * from 1Meg - 1000Meg in 1Meg increments.
  970                  * Use an alternate method to "lookup".
  971                  */
  972                 return (tcp_int_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate));
  973         }
  974         if ((flags & RS_PACING_LT) ||
  975             (flags & RS_PACING_EXACT_MATCH)) {
  976                 /*
  977                  * For exact and less than we go forward through the table.
  978                  * This way when we find one larger we stop (exact was a
  979                  * toss up).
  980                  */
  981                 for (i = rs->rs_lowest_valid, matched = 0; i <= rs->rs_highest_valid; i++) {
  982                         if ((flags & RS_PACING_EXACT_MATCH) &&
  983                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
  984                                 rte = &rs->rs_rlt[i];
  985                                 matched = 1;
  986                                 if (lower_rate != NULL)
  987                                         *lower_rate = previous_rate;
  988                                 break;
  989                         } else if ((flags & RS_PACING_LT) &&
  990                             (bytes_per_sec <= rs->rs_rlt[i].rate)) {
  991                                 rte = &rs->rs_rlt[i];
  992                                 matched = 1;
  993                                 if (lower_rate != NULL)
  994                                         *lower_rate = previous_rate;
  995                                 break;
  996                         }
  997                         previous_rate = rs->rs_rlt[i].rate;
  998                         if (bytes_per_sec > rs->rs_rlt[i].rate)
  999                                 break;
 1000                 }
 1001                 if ((matched == 0) &&
 1002                     (flags & RS_PACING_LT) &&
 1003                     (flags & RS_PACING_SUB_OK)) {
 1004                         /* Kick in a substitute (the lowest) */
 1005                         rte = &rs->rs_rlt[rs->rs_lowest_valid];
 1006                 }
 1007         } else {
 1008                 /*
 1009                  * Here we go backward through the table so that we can find
 1010                  * the one greater in theory faster (but its probably a
 1011                  * wash).
 1012                  */
 1013                 for (i = rs->rs_highest_valid, matched = 0; i >= rs->rs_lowest_valid; i--) {
 1014                         if (rs->rs_rlt[i].rate > bytes_per_sec) {
 1015                                 /* A possible candidate */
 1016                                 rte = &rs->rs_rlt[i];
 1017                         }
 1018                         if ((flags & RS_PACING_GEQ) &&
 1019                             (bytes_per_sec == rs->rs_rlt[i].rate)) {
 1020                                 /* An exact match and we want equal */
 1021                                 matched = 1;
 1022                                 rte = &rs->rs_rlt[i];
 1023                                 break;
 1024                         } else if (rte) {
 1025                                 /*
 1026                                  * Found one that is larger than but don't
 1027                                  * stop, there may be a more closer match.
 1028                                  */
 1029                                 matched = 1;
 1030                         }
 1031                         if (rs->rs_rlt[i].rate < bytes_per_sec) {
 1032                                 /*
 1033                                  * We found a table entry that is smaller,
 1034                                  * stop there will be none greater or equal.
 1035                                  */
 1036                                 if (lower_rate != NULL)
 1037                                         *lower_rate = rs->rs_rlt[i].rate;
 1038                                 break;
 1039                         }
 1040                 }
 1041                 if ((matched == 0) &&
 1042                     (flags & RS_PACING_SUB_OK)) {
 1043                         /* Kick in a substitute (the highest) */
 1044                         rte = &rs->rs_rlt[rs->rs_highest_valid];
 1045                 }
 1046         }
 1047         return (rte);
 1048 }
 1049 
 1050 static struct ifnet *
 1051 rt_find_real_interface(struct ifnet *ifp, struct inpcb *inp, int *error)
 1052 {
 1053         struct ifnet *tifp;
 1054         struct m_snd_tag *tag, *ntag;
 1055         union if_snd_tag_alloc_params params = {
 1056                 .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
 1057                 .rate_limit.hdr.flowid = inp->inp_flowid,
 1058                 .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
 1059                 .rate_limit.max_rate = COMMON_RATE,
 1060                 .rate_limit.flags = M_NOWAIT,
 1061         };
 1062         int err;
 1063 #ifdef RSS
 1064         params.rate_limit.hdr.flowtype = ((inp->inp_vflag & INP_IPV6) ?
 1065             M_HASHTYPE_RSS_TCP_IPV6 : M_HASHTYPE_RSS_TCP_IPV4);
 1066 #else
 1067         params.rate_limit.hdr.flowtype = M_HASHTYPE_OPAQUE_HASH;
 1068 #endif
 1069         err = m_snd_tag_alloc(ifp, &params, &tag);
 1070         if (err) {
 1071                 /* Failed to setup a tag? */
 1072                 if (error)
 1073                         *error = err;
 1074                 return (NULL);
 1075         }
 1076         ntag = tag;
 1077         while (ntag->sw->next_snd_tag != NULL) {
 1078                 ntag = ntag->sw->next_snd_tag(ntag);
 1079         }
 1080         tifp = ntag->ifp;
 1081         m_snd_tag_rele(tag);
 1082         return (tifp);
 1083 }
 1084 
 1085 static void
 1086 rl_increment_using(const struct tcp_hwrate_limit_table *rte)
 1087 {
 1088         struct tcp_hwrate_limit_table *decon_rte;
 1089 
 1090         decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
 1091         atomic_add_long(&decon_rte->using, 1);
 1092 }
 1093 
 1094 static void
 1095 rl_decrement_using(const struct tcp_hwrate_limit_table *rte)
 1096 {
 1097         struct tcp_hwrate_limit_table *decon_rte;
 1098 
 1099         decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
 1100         atomic_subtract_long(&decon_rte->using, 1);
 1101 }
 1102 
 1103 void
 1104 tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
 1105 {
 1106         struct tcp_hwrate_limit_table *decon_rte;
 1107 
 1108         decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte);
 1109         atomic_add_long(&decon_rte->rs_num_enobufs, 1);
 1110 }
 1111 
/*
 * Do NOT take the __noinline out of the
 * find_rs_for_ifp() function. If you do, inlining
 * it into rt_setup_rate() will expose a
 * compiler bug. For some reason the compiler thinks
 * the list can never be empty. The consequence of
 * this will be a crash when we dereference NULL
 * if an ifp is removed just as a hw rate limit
 * is attempted. If you are working on the compiler
 * and want to "test" this go ahead and take the noinline
 * out, otherwise let sleeping dogs lie until such time
 * as we get a compiler fix 10/2/20 -- RRS
 */
 1125 static __noinline struct tcp_rate_set *
 1126 find_rs_for_ifp(struct ifnet *ifp)
 1127 {
 1128         struct tcp_rate_set *rs;
 1129 
 1130         CK_LIST_FOREACH(rs, &int_rs, next) {
 1131                 if ((rs->rs_ifp == ifp) &&
 1132                     (rs->rs_if_dunit == ifp->if_dunit)) {
 1133                         /* Ok we found it */
 1134                         return (rs);
 1135                 }
 1136         }
 1137         return (NULL);
 1138 }
 1139 
 1140 
 1141 static const struct tcp_hwrate_limit_table *
 1142 rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, uint64_t bytes_per_sec,
 1143     uint32_t flags, int *error, uint64_t *lower_rate)
 1144 {
 1145         /* First lets find the interface if it exists */
 1146         const struct tcp_hwrate_limit_table *rte;
 1147         /*
 1148          * So why is rs volatile? This is to defeat a
 1149          * compiler bug where in the compiler is convinced
 1150          * that rs can never be NULL (which is not true). Because
 1151          * of its conviction it nicely optimizes out the if ((rs == NULL
 1152          * below which means if you get a NULL back you dereference it.
 1153          */
 1154         volatile struct tcp_rate_set *rs;
 1155         struct epoch_tracker et;
 1156         struct ifnet *oifp = ifp;
 1157         int err;
 1158 
 1159         NET_EPOCH_ENTER(et);
 1160 use_real_interface:
 1161         rs = find_rs_for_ifp(ifp);
 1162         if ((rs == NULL) ||
 1163             (rs->rs_flags & RS_INTF_NO_SUP) ||
 1164             (rs->rs_flags & RS_IS_DEAD)) {
 1165                 /*
 1166                  * This means we got a packet *before*
 1167                  * the IF-UP was processed below, <or>
 1168                  * while or after we already received an interface
 1169                  * departed event. In either case we really don't
 1170                  * want to do anything with pacing, in
 1171                  * the departing case the packet is not
 1172                  * going to go very far. The new case
 1173                  * might be arguable, but its impossible
 1174                  * to tell from the departing case.
 1175                  */
 1176                 if (error)
 1177                         *error = ENODEV;
 1178                 NET_EPOCH_EXIT(et);
 1179                 return (NULL);
 1180         }
 1181 
 1182         if ((rs == NULL) || (rs->rs_disable != 0)) {
 1183                 if (error)
 1184                         *error = ENOSPC;
 1185                 NET_EPOCH_EXIT(et);
 1186                 return (NULL);
 1187         }
 1188         if (rs->rs_flags & RS_IS_DEFF) {
 1189                 /* We need to find the real interface */
 1190                 struct ifnet *tifp;
 1191 
 1192                 tifp = rt_find_real_interface(ifp, inp, error);
 1193                 if (tifp == NULL) {
 1194                         if (rs->rs_disable && error)
 1195                                 *error = ENOTSUP;
 1196                         NET_EPOCH_EXIT(et);
 1197                         return (NULL);
 1198                 }
 1199                 KASSERT((tifp != ifp),
 1200                         ("Lookup failure ifp:%p inp:%p rt_find_real_interface() returns the same interface tifp:%p?\n",
 1201                          ifp, inp, tifp));
 1202                 ifp = tifp;
 1203                 goto use_real_interface;
 1204         }
 1205         if (rs->rs_flow_limit &&
 1206             ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
 1207                 if (error)
 1208                         *error = ENOSPC;
 1209                 NET_EPOCH_EXIT(et);
 1210                 return (NULL);
 1211         }
 1212         rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
 1213         if (rte) {
 1214                 err = in_pcbattach_txrtlmt(inp, oifp,
 1215                     inp->inp_flowtype,
 1216                     inp->inp_flowid,
 1217                     rte->rate,
 1218                     &inp->inp_snd_tag);
 1219                 if (err) {
 1220                         /* Failed to attach */
 1221                         if (error)
 1222                                 *error = err;
 1223                         rte = NULL;
 1224                 } else {
 1225                         KASSERT((inp->inp_snd_tag != NULL) ,
 1226                                 ("Setup rate has no snd_tag inp:%p rte:%p rate:%llu rs:%p",
 1227                                  inp, rte, (unsigned long long)rte->rate, rs));
 1228 #ifdef INET
 1229                         counter_u64_add(rate_limit_new, 1);
 1230 #endif
 1231                 }
 1232         }
 1233         if (rte) {
 1234                 /*
 1235                  * We use an atomic here for accounting so we don't have to
 1236                  * use locks when freeing.
 1237                  */
 1238                 atomic_add_64(&rs->rs_flows_using, 1);
 1239         }
 1240         NET_EPOCH_EXIT(et);
 1241         return (rte);
 1242 }
 1243 
 1244 static void
 1245 tcp_rl_ifnet_link(void *arg __unused, struct ifnet *ifp, int link_state)
 1246 {
 1247         int error;
 1248         struct tcp_rate_set *rs;
 1249         struct epoch_tracker et;
 1250 
 1251         if (((ifp->if_capenable & IFCAP_TXRTLMT) == 0) ||
 1252             (link_state != LINK_STATE_UP)) {
 1253                 /*
 1254                  * We only care on an interface going up that is rate-limit
 1255                  * capable.
 1256                  */
 1257                 return;
 1258         }
 1259         NET_EPOCH_ENTER(et);
 1260         mtx_lock(&rs_mtx);
 1261         rs = find_rs_for_ifp(ifp);
 1262         if (rs) {
 1263                 /* We already have initialized this guy */
 1264                 mtx_unlock(&rs_mtx);
 1265                 NET_EPOCH_EXIT(et);
 1266                 return;
 1267         }
 1268         mtx_unlock(&rs_mtx);
 1269         NET_EPOCH_EXIT(et);
 1270         rt_setup_new_rs(ifp, &error);
 1271 }
 1272 
 1273 static void
 1274 tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
 1275 {
 1276         struct tcp_rate_set *rs;
 1277         struct epoch_tracker et;
 1278         int i;
 1279 
 1280         NET_EPOCH_ENTER(et);
 1281         mtx_lock(&rs_mtx);
 1282         rs = find_rs_for_ifp(ifp);
 1283         if (rs) {
 1284                 CK_LIST_REMOVE(rs, next);
 1285                 rs_number_alive--;
 1286                 rs->rs_flags |= RS_IS_DEAD;
 1287                 for (i = 0; i < rs->rs_rate_cnt; i++) {
 1288                         if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
 1289                                 in_pcbdetach_tag(rs->rs_rlt[i].tag);
 1290                                 rs->rs_rlt[i].tag = NULL;
 1291                         }
 1292                         rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
 1293                 }
 1294                 if (rs->rs_flows_using == 0)
 1295                         rs_defer_destroy(rs);
 1296         }
 1297         mtx_unlock(&rs_mtx);
 1298         NET_EPOCH_EXIT(et);
 1299 }
 1300 
 1301 static void
 1302 tcp_rl_shutdown(void *arg __unused, int howto __unused)
 1303 {
 1304         struct tcp_rate_set *rs, *nrs;
 1305         struct epoch_tracker et;
 1306         int i;
 1307 
 1308         NET_EPOCH_ENTER(et);
 1309         mtx_lock(&rs_mtx);
 1310         CK_LIST_FOREACH_SAFE(rs, &int_rs, next, nrs) {
 1311                 CK_LIST_REMOVE(rs, next);
 1312                 rs_number_alive--;
 1313                 rs->rs_flags |= RS_IS_DEAD;
 1314                 for (i = 0; i < rs->rs_rate_cnt; i++) {
 1315                         if (rs->rs_rlt[i].flags & HDWRPACE_TAGPRESENT) {
 1316                                 in_pcbdetach_tag(rs->rs_rlt[i].tag);
 1317                                 rs->rs_rlt[i].tag = NULL;
 1318                         }
 1319                         rs->rs_rlt[i].flags = HDWRPACE_IFPDEPARTED;
 1320                 }
 1321                 if (rs->rs_flows_using == 0)
 1322                         rs_defer_destroy(rs);
 1323         }
 1324         mtx_unlock(&rs_mtx);
 1325         NET_EPOCH_EXIT(et);
 1326 }
 1327 
 1328 const struct tcp_hwrate_limit_table *
 1329 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
 1330     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
 1331 {
 1332         struct inpcb *inp = tptoinpcb(tp);
 1333         const struct tcp_hwrate_limit_table *rte;
 1334 #ifdef KERN_TLS
 1335         struct ktls_session *tls;
 1336 #endif
 1337 
 1338         INP_WLOCK_ASSERT(inp);
 1339 
 1340         if (inp->inp_snd_tag == NULL) {
 1341                 /*
 1342                  * We are setting up a rate for the first time.
 1343                  */
 1344                 if ((ifp->if_capenable & IFCAP_TXRTLMT) == 0) {
 1345                         /* Not supported by the egress */
 1346                         if (error)
 1347                                 *error = ENODEV;
 1348                         return (NULL);
 1349                 }
 1350 #ifdef KERN_TLS
 1351                 tls = NULL;
 1352                 if (tptosocket(tp)->so_snd.sb_flags & SB_TLS_IFNET) {
 1353                         tls = tptosocket(tp)->so_snd.sb_tls_info;
 1354 
 1355                         if ((ifp->if_capenable & IFCAP_TXTLS_RTLMT) == 0 ||
 1356                             tls->mode != TCP_TLS_MODE_IFNET) {
 1357                                 if (error)
 1358                                         *error = ENODEV;
 1359                                 return (NULL);
 1360                         }
 1361                 }
 1362 #endif
 1363                 rte = rt_setup_rate(inp, ifp, bytes_per_sec, flags, error, lower_rate);
 1364                 if (rte)
 1365                         rl_increment_using(rte);
 1366 #ifdef KERN_TLS
 1367                 if (rte != NULL && tls != NULL && tls->snd_tag != NULL) {
 1368                         /*
 1369                          * Fake a route change error to reset the TLS
 1370                          * send tag.  This will convert the existing
 1371                          * tag to a TLS ratelimit tag.
 1372                          */
 1373                         MPASS(tls->snd_tag->sw->type == IF_SND_TAG_TYPE_TLS);
 1374                         ktls_output_eagain(inp, tls);
 1375                 }
 1376 #endif
 1377         } else {
 1378                 /*
 1379                  * We are modifying a rate, wrong interface?
 1380                  */
 1381                 if (error)
 1382                         *error = EINVAL;
 1383                 rte = NULL;
 1384         }
 1385         if (rte != NULL) {
 1386                 tp->t_pacing_rate = rte->rate;
 1387                 *error = 0;
 1388         }
 1389         return (rte);
 1390 }
 1391 
 1392 const struct tcp_hwrate_limit_table *
 1393 tcp_chg_pacing_rate(const struct tcp_hwrate_limit_table *crte,
 1394     struct tcpcb *tp, struct ifnet *ifp,
 1395     uint64_t bytes_per_sec, int flags, int *error, uint64_t *lower_rate)
 1396 {
 1397         struct inpcb *inp = tptoinpcb(tp);
 1398         const struct tcp_hwrate_limit_table *nrte;
 1399         const struct tcp_rate_set *rs;
 1400 #ifdef KERN_TLS
 1401         struct ktls_session *tls = NULL;
 1402 #endif
 1403         int err;
 1404 
 1405         INP_WLOCK_ASSERT(inp);
 1406 
 1407         if (crte == NULL) {
 1408                 /* Wrong interface */
 1409                 if (error)
 1410                         *error = EINVAL;
 1411                 return (NULL);
 1412         }
 1413 
 1414 #ifdef KERN_TLS
 1415         if (tptosocket(tp)->so_snd.sb_flags & SB_TLS_IFNET) {
 1416                 tls = tptosocket(tp)->so_snd.sb_tls_info;
 1417                 if (tls->mode != TCP_TLS_MODE_IFNET)
 1418                         tls = NULL;
 1419                 else if (tls->snd_tag != NULL &&
 1420                     tls->snd_tag->sw->type != IF_SND_TAG_TYPE_TLS_RATE_LIMIT) {
 1421                         if (!tls->reset_pending) {
 1422                                 /*
 1423                                  * NIC probably doesn't support
 1424                                  * ratelimit TLS tags if it didn't
 1425                                  * allocate one when an existing rate
 1426                                  * was present, so ignore.
 1427                                  */
 1428                                 tcp_rel_pacing_rate(crte, tp);
 1429                                 if (error)
 1430                                         *error = EOPNOTSUPP;
 1431                                 return (NULL);
 1432                         }
 1433 
 1434                         /*
 1435                          * The send tag is being converted, so set the
 1436                          * rate limit on the inpcb tag.  There is a
 1437                          * race that the new NIC send tag might use
 1438                          * the current rate instead of this one.
 1439                          */
 1440                         tls = NULL;
 1441                 }
 1442         }
 1443 #endif
 1444         if (inp->inp_snd_tag == NULL) {
 1445                 /* Wrong interface */
 1446                 tcp_rel_pacing_rate(crte, tp);
 1447                 if (error)
 1448                         *error = EINVAL;
 1449                 return (NULL);
 1450         }
 1451         rs = crte->ptbl;
 1452         if ((rs->rs_flags & RS_IS_DEAD) ||
 1453             (crte->flags & HDWRPACE_IFPDEPARTED)) {
 1454                 /* Release the rate, and try anew */
 1455 
 1456                 tcp_rel_pacing_rate(crte, tp);
 1457                 nrte = tcp_set_pacing_rate(tp, ifp,
 1458                     bytes_per_sec, flags, error, lower_rate);
 1459                 return (nrte);
 1460         }
 1461         nrte = tcp_find_suitable_rate(rs, bytes_per_sec, flags, lower_rate);
 1462         if (nrte == crte) {
 1463                 /* No change */
 1464                 if (error)
 1465                         *error = 0;
 1466                 return (crte);
 1467         }
 1468         if (nrte == NULL) {
 1469                 /* Release the old rate */
 1470                 if (error)
 1471                         *error = ENOENT;
 1472                 tcp_rel_pacing_rate(crte, tp);
 1473                 return (NULL);
 1474         }
 1475         rl_decrement_using(crte);
 1476         rl_increment_using(nrte);
 1477         /* Change rates to our new entry */
 1478 #ifdef KERN_TLS
 1479         if (tls != NULL)
 1480                 err = ktls_modify_txrtlmt(tls, nrte->rate);
 1481         else
 1482 #endif
 1483                 err = in_pcbmodify_txrtlmt(inp, nrte->rate);
 1484         if (err) {
 1485                 struct tcp_rate_set *lrs;
 1486                 uint64_t pre;
 1487 
 1488                 rl_decrement_using(nrte);
 1489                 lrs = __DECONST(struct tcp_rate_set *, rs);
 1490                 pre = atomic_fetchadd_64(&lrs->rs_flows_using, -1);
 1491                 /* Do we still have a snd-tag attached? */
 1492                 if (inp->inp_snd_tag)
 1493                         in_pcbdetach_txrtlmt(inp);
 1494 
 1495                 if (pre == 1) {
 1496                         struct epoch_tracker et;
 1497 
 1498                         NET_EPOCH_ENTER(et);
 1499                         mtx_lock(&rs_mtx);
 1500                         /*
 1501                          * Is it dead?
 1502                          */
 1503                         if (lrs->rs_flags & RS_IS_DEAD)
 1504                                 rs_defer_destroy(lrs);
 1505                         mtx_unlock(&rs_mtx);
 1506                         NET_EPOCH_EXIT(et);
 1507                 }
 1508                 if (error)
 1509                         *error = err;
 1510                 return (NULL);
 1511         } else {
 1512 #ifdef INET
 1513                 counter_u64_add(rate_limit_chg, 1);
 1514 #endif
 1515         }
 1516         if (error)
 1517                 *error = 0;
 1518         tp->t_pacing_rate = nrte->rate;
 1519         return (nrte);
 1520 }
 1521 
 1522 void
 1523 tcp_rel_pacing_rate(const struct tcp_hwrate_limit_table *crte, struct tcpcb *tp)
 1524 {
 1525         struct inpcb *inp = tptoinpcb(tp);
 1526         const struct tcp_rate_set *crs;
 1527         struct tcp_rate_set *rs;
 1528         uint64_t pre;
 1529 
 1530         INP_WLOCK_ASSERT(inp);
 1531 
 1532         tp->t_pacing_rate = -1;
 1533         crs = crte->ptbl;
 1534         /*
 1535          * Now we must break the const
 1536          * in order to release our refcount.
 1537          */
 1538         rs = __DECONST(struct tcp_rate_set *, crs);
 1539         rl_decrement_using(crte);
 1540         pre = atomic_fetchadd_64(&rs->rs_flows_using, -1);
 1541         if (pre == 1) {
 1542                 struct epoch_tracker et;
 1543 
 1544                 NET_EPOCH_ENTER(et);
 1545                 mtx_lock(&rs_mtx);
 1546                 /*
 1547                  * Is it dead?
 1548                  */
 1549                 if (rs->rs_flags & RS_IS_DEAD)
 1550                         rs_defer_destroy(rs);
 1551                 mtx_unlock(&rs_mtx);
 1552                 NET_EPOCH_EXIT(et);
 1553         }
 1554 
 1555         /*
 1556          * XXX: If this connection is using ifnet TLS, should we
 1557          * switch it to using an unlimited rate, or perhaps use
 1558          * ktls_output_eagain() to reset the send tag to a plain
 1559          * TLS tag?
 1560          */
 1561         in_pcbdetach_txrtlmt(inp);
 1562 }
 1563 
 1564 #define ONE_POINT_TWO_MEG 150000 /* 1.2Mbps in bytes per second */
 1565 #define ONE_HUNDRED_MBPS 12500000       /* 100Mbps in bytes per second */
 1566 #define FIVE_HUNDRED_MBPS 62500000      /* 500Mbps in bytes per second */
 1567 #define MAX_MSS_SENT 43 /* Cap on segments per burst: 43 x 1500 = 64,500 bytes */
 1568 
 1569 static void
 1570 tcp_log_pacing_size(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, uint32_t new_tso,
 1571                     uint64_t hw_rate, uint32_t time_between, uint32_t calc_time_between,
 1572                     uint32_t segs, uint32_t res_div, uint16_t mult, uint8_t mod)
 1573 {
 1574         if (tp->t_logstate != TCP_LOG_STATE_OFF) {
 1575                 union tcp_log_stackspecific log;
 1576                 struct timeval tv;
 1577 
 1578                 memset(&log, 0, sizeof(log));
 1579                 log.u_bbr.flex1 = segsiz;
 1580                 log.u_bbr.flex2 = new_tso;
 1581                 log.u_bbr.flex3 = time_between;
 1582                 log.u_bbr.flex4 = calc_time_between;
 1583                 log.u_bbr.flex5 = segs;
 1584                 log.u_bbr.flex6 = res_div;
 1585                 log.u_bbr.flex7 = mult;
 1586                 log.u_bbr.flex8 = mod;
 1587                 log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 1588                 log.u_bbr.cur_del_rate = bw;
 1589                 log.u_bbr.delRate = hw_rate;
 1590                 TCP_LOG_EVENTP(tp, NULL,
 1591                     &tptosocket(tp)->so_rcv,
 1592                     &tptosocket(tp)->so_snd,
 1593                     TCP_HDWR_PACE_SIZE, 0,
 1594                     0, &log, false, &tv);
 1595         }
 1596 }
 1597 
 1598 uint32_t
 1599 tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segsiz, int can_use_1mss,
 1600    const struct tcp_hwrate_limit_table *te, int *err, int divisor)
 1601 {
 1602         /*
 1603          * We use the google formula to calculate the
 1604          * TSO size. I.E.
 1605          * bw < 24Meg
 1606          *   tso = 2mss
 1607          * else
 1608          *   tso = min(bw/(div=1000), 64k)
 1609          *
 1610          * Note for these calculations we ignore the
 1611          * packet overhead (enet hdr, ip hdr and tcp hdr).
 1612          * We only get the google formula when we have
 1613          * divisor = 1000, which is the default for now.
 1614          */
 1615         uint64_t lentim, res, bytes;
 1616         uint32_t new_tso, min_tso_segs;
 1617 
 1618         /* It can't be zero */
 1619         if ((divisor == 0) ||
 1620             (divisor < RL_MIN_DIVISOR)) {
 1621                 if (mss_divisor)
 1622                         bytes = bw / mss_divisor;
 1623                 else
 1624                         bytes = bw / 1000;
 1625         } else
 1626                 bytes = bw / divisor;
 1627         /* We can't ever send more than 65k in a TSO */
 1628         if (bytes > 0xffff) {
 1629                 bytes = 0xffff;
 1630         }
 1631         /* Round up */
 1632         new_tso = (bytes + segsiz - 1) / segsiz;
 1633         /* Are we enforcing even boundaries? */
 1634         if (even_num_segs && (new_tso & 1) && (new_tso > even_threshold))
 1635                 new_tso++;
 1636         if (can_use_1mss)
 1637                 min_tso_segs = 1;
 1638         else
 1639                 min_tso_segs = 2;
 1640         if (rs_floor_mss && (new_tso < rs_floor_mss))
 1641                 new_tso = rs_floor_mss;
 1642         else if (new_tso < min_tso_segs)
 1643                 new_tso = min_tso_segs;
 1644         if (new_tso > MAX_MSS_SENT)
 1645                 new_tso = MAX_MSS_SENT;
 1646         new_tso *= segsiz;
 1647         tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 1648                             0, 0, 0, 0, 0, 0, 1);
 1649         /*
 1650          * If we are not doing hardware pacing
 1651          * then we are done.
 1652          */
 1653         if (te == NULL) {
 1654                 if (err)
 1655                         *err = 0;
 1656                 return(new_tso);
 1657         }
 1658         /*
 1659          * For hardware pacing we look at the
 1660          * rate you are sending at and compare
 1661          * that to the rate you have in hardware.
 1662          *
 1663          * If the hardware rate is slower than your
 1664          * software rate then you are in error and
 1665          * we will build a queue in our hardware whic
 1666          * is probably not desired, in such a case
 1667          * just return the non-hardware TSO size.
 1668          *
 1669          * If the rate in hardware is faster (which
 1670          * it should be) then look at how long it
 1671          * takes to send one ethernet segment size at
 1672          * your b/w and compare that to the time it
 1673          * takes to send at the rate you had selected.
 1674          *
 1675          * If your time is greater (which we hope it is)
 1676          * we get the delta between the two, and then
 1677          * divide that into your pacing time. This tells
 1678          * us how many MSS you can send down at once (rounded up).
 1679          *
 1680          * Note we also double this value if the b/w is over
 1681          * 100Mbps. If its over 500meg we just set you to the
 1682          * max (43 segments).
 1683          */
 1684         if (te->rate > FIVE_HUNDRED_MBPS)
 1685                 goto max;
 1686         if (te->rate == bw) {
 1687                 /* We are pacing at exactly the hdwr rate */
 1688 max:
 1689                 tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 1690                                     te->rate, te->time_between, (uint32_t)0,
 1691                                     (segsiz * MAX_MSS_SENT), 0, 0, 3);
 1692                 return (segsiz * MAX_MSS_SENT);
 1693         }
 1694         lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
 1695         res = lentim / bw;
 1696         if (res > te->time_between) {
 1697                 uint32_t delta, segs, res_div;
 1698 
 1699                 res_div = ((res * num_of_waits_allowed) + wait_time_floor);
 1700                 delta = res - te->time_between;
 1701                 segs = (res_div + delta - 1)/delta;
 1702                 if (segs < min_tso_segs)
 1703                         segs = min_tso_segs;
 1704                 if (segs < rs_hw_floor_mss)
 1705                         segs = rs_hw_floor_mss;
 1706                 if (segs > MAX_MSS_SENT)
 1707                         segs = MAX_MSS_SENT;
 1708                 segs *= segsiz;
 1709                 tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 1710                                     te->rate, te->time_between, (uint32_t)res,
 1711                                     segs, res_div, 1, 3);
 1712                 if (err)
 1713                         *err = 0;
 1714                 if (segs < new_tso) {
 1715                         /* unexpected ? */
 1716                         return(new_tso);
 1717                 } else {
 1718                         return (segs);
 1719                 }
 1720         } else {
 1721                 /*
 1722                  * Your time is smaller which means
 1723                  * we will grow a queue on our
 1724                  * hardware. Send back the non-hardware
 1725                  * rate.
 1726                  */
 1727                 tcp_log_pacing_size(tp, bw, segsiz, new_tso,
 1728                                     te->rate, te->time_between, (uint32_t)res,
 1729                                     0, 0, 0, 4);
 1730                 if (err)
 1731                         *err = -1;
 1732                 return (new_tso);
 1733         }
 1734 }
 1735 
 1736 uint64_t
 1737 tcp_hw_highest_rate_ifp(struct ifnet *ifp, struct inpcb *inp)
 1738 {
 1739         struct epoch_tracker et;
 1740         struct tcp_rate_set *rs;
 1741         uint64_t rate_ret;
 1742 
 1743         NET_EPOCH_ENTER(et);
 1744 use_next_interface:
 1745         rs = find_rs_for_ifp(ifp);
 1746         if (rs == NULL) {
 1747                 /* This interface does not do ratelimiting */
 1748                 rate_ret = 0;
 1749         } else if (rs->rs_flags & RS_IS_DEFF) {
 1750                 /* We need to find the real interface */
 1751                 struct ifnet *tifp;
 1752 
 1753                 tifp = rt_find_real_interface(ifp, inp, NULL);
 1754                 if (tifp == NULL) {
 1755                         NET_EPOCH_EXIT(et);
 1756                         return (0);
 1757                 }
 1758                 ifp = tifp;
 1759                 goto use_next_interface;
 1760         } else {
 1761                 /* Lets return the highest rate this guy has */
 1762                 rate_ret = rs->rs_rlt[rs->rs_highest_valid].rate;
 1763         }
 1764         NET_EPOCH_EXIT(et);
 1765         return(rate_ret);
 1766 }
 1767 
 1768 static eventhandler_tag rl_ifnet_departs;
 1769 static eventhandler_tag rl_ifnet_arrives;
 1770 static eventhandler_tag rl_shutdown_start;
 1771 
 1772 static void
 1773 tcp_rs_init(void *st __unused)
 1774 {
 1775         CK_LIST_INIT(&int_rs);
 1776         rs_number_alive = 0;
 1777         rs_number_dead = 0;
 1778         mtx_init(&rs_mtx, "tcp_rs_mtx", "rsmtx", MTX_DEF);
 1779         rl_ifnet_departs = EVENTHANDLER_REGISTER(ifnet_departure_event,
 1780             tcp_rl_ifnet_departure,
 1781             NULL, EVENTHANDLER_PRI_ANY);
 1782         rl_ifnet_arrives = EVENTHANDLER_REGISTER(ifnet_link_event,
 1783             tcp_rl_ifnet_link,
 1784             NULL, EVENTHANDLER_PRI_ANY);
 1785         rl_shutdown_start = EVENTHANDLER_REGISTER(shutdown_pre_sync,
 1786             tcp_rl_shutdown, NULL,
 1787             SHUTDOWN_PRI_FIRST);
 1788         printf("TCP_ratelimit: Is now initialized\n");
 1789 }
 1790 
 1791 SYSINIT(tcp_rl_init, SI_SUB_SMP + 1, SI_ORDER_ANY, tcp_rs_init, NULL);
 1792 #endif

Cache object: 51b5ce2e48200a7c5d277e5c88d90ba9


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.