The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/cc/cc_chd.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 2009-2010
    5  *      Swinburne University of Technology, Melbourne, Australia
    6  * Copyright (c) 2010-2011 The FreeBSD Foundation
    7  * All rights reserved.
    8  *
    9  * This software was developed at the Centre for Advanced Internet
   10  * Architectures, Swinburne University of Technology, by David Hayes and
   11  * Lawrence Stewart, made possible in part by a grant from the Cisco University
   12  * Research Program Fund at Community Foundation Silicon Valley.
   13  *
   14  * Portions of this software were developed at the Centre for Advanced Internet
   15  * Architectures, Swinburne University of Technology, Melbourne, Australia by
   16  * David Hayes under sponsorship from the FreeBSD Foundation.
   17  *
   18  * Redistribution and use in source and binary forms, with or without
   19  * modification, are permitted provided that the following conditions
   20  * are met:
   21  * 1. Redistributions of source code must retain the above copyright
   22  *    notice, this list of conditions and the following disclaimer.
   23  * 2. Redistributions in binary form must reproduce the above copyright
   24  *    notice, this list of conditions and the following disclaimer in the
   25  *    documentation and/or other materials provided with the distribution.
   26  *
   27  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   30  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   37  * SUCH DAMAGE.
   38  */
   39 
   40 /*
   41  * An implementation of the CAIA-Hamilton delay based congestion control
   42  * algorithm, based on "Improved coexistence and loss tolerance for delay based
   43  * TCP congestion control" by D. A. Hayes and G. Armitage., in 35th Annual IEEE
   44  * Conference on Local Computer Networks (LCN 2010), Denver, Colorado, USA,
   45  * 11-14 October 2010.
   46  *
   47  * Originally released as part of the NewTCP research project at Swinburne
   48  * University of Technology's Centre for Advanced Internet Architectures,
   49  * Melbourne, Australia, which was made possible in part by a grant from the
   50  * Cisco University Research Program Fund at Community Foundation Silicon
   51  * Valley. More details are available at:
   52  *   http://caia.swin.edu.au/urp/newtcp/
   53  */
   54 
   55 #include <sys/cdefs.h>
   56 __FBSDID("$FreeBSD$");
   57 
   58 #include <sys/param.h>
   59 #include <sys/kernel.h>
   60 #include <sys/khelp.h>
   61 #include <sys/limits.h>
   62 #include <sys/malloc.h>
   63 #include <sys/module.h>
   64 #include <sys/queue.h>
   65 #include <sys/socket.h>
   66 #include <sys/socketvar.h>
   67 #include <sys/sysctl.h>
   68 #include <sys/systm.h>
   69 
   70 #include <net/vnet.h>
   71 
   72 #include <net/route.h>
   73 #include <net/route/nhop.h>
   74 
   75 #include <netinet/in_pcb.h>
   76 #include <netinet/tcp.h>
   77 #include <netinet/tcp_seq.h>
   78 #include <netinet/tcp_timer.h>
   79 #include <netinet/tcp_var.h>
   80 #include <netinet/cc/cc.h>
   81 #include <netinet/cc/cc_module.h>
   82 
   83 #include <netinet/khelp/h_ertt.h>
   84 
   85 /*
   86  * Private signal type for rate based congestion signal.
   87  * See <netinet/cc.h> for appropriate bit-range to use for private signals.
   88  */
   89 #define CC_CHD_DELAY    0x02000000
   90 
   91 /* Largest possible number returned by random(). */
   92 #define RANDOM_MAX      INT_MAX
   93 
   94 static void     chd_ack_received(struct cc_var *ccv, uint16_t ack_type);
   95 static void     chd_cb_destroy(struct cc_var *ccv);
   96 static int      chd_cb_init(struct cc_var *ccv, void *ptr);
   97 static void     chd_cong_signal(struct cc_var *ccv, uint32_t signal_type);
   98 static void     chd_conn_init(struct cc_var *ccv);
   99 static int      chd_mod_init(void);
  100 static size_t   chd_data_sz(void);
  101 
  102 struct chd {
  103         /*
  104          * Shadow window - keeps track of what the NewReno congestion window
  105          * would have been if delay-based cwnd backoffs had not been made. This
  106          * functionality aids coexistence with loss-based TCP flows which may be
  107          * sharing links along the path.
  108          */
  109         unsigned long shadow_w;
  110         /*
  111          * Loss-based TCP compatibility flag - When set, it turns on the shadow
  112          * window functionality.
  113          */
  114         int loss_compete;
  115          /* The maximum round trip time seen within a measured rtt period. */
  116         int maxrtt_in_rtt;
  117         /* The previous qdly that caused cwnd to backoff. */
  118         int prev_backoff_qdly;
  119 };
  120 
  121 static int ertt_id;
  122 
  123 VNET_DEFINE_STATIC(uint32_t, chd_qmin) = 5;
  124 VNET_DEFINE_STATIC(uint32_t, chd_pmax) = 50;
  125 VNET_DEFINE_STATIC(uint32_t, chd_loss_fair) = 1;
  126 VNET_DEFINE_STATIC(uint32_t, chd_use_max) = 1;
  127 VNET_DEFINE_STATIC(uint32_t, chd_qthresh) = 20;
  128 #define V_chd_qthresh   VNET(chd_qthresh)
  129 #define V_chd_qmin      VNET(chd_qmin)
  130 #define V_chd_pmax      VNET(chd_pmax)
  131 #define V_chd_loss_fair VNET(chd_loss_fair)
  132 #define V_chd_use_max   VNET(chd_use_max)
  133 
  134 
  135 struct cc_algo chd_cc_algo = {
  136         .name = "chd",
  137         .ack_received = chd_ack_received,
  138         .cb_destroy = chd_cb_destroy,
  139         .cb_init = chd_cb_init,
  140         .cong_signal = chd_cong_signal,
  141         .conn_init = chd_conn_init,
  142         .mod_init = chd_mod_init,
  143         .cc_data_sz = chd_data_sz,
  144         .after_idle = newreno_cc_after_idle,
  145         .post_recovery = newreno_cc_post_recovery,
  146 };
  147 
  148 static __inline void
  149 chd_window_decrease(struct cc_var *ccv)
  150 {
  151         unsigned long win;
  152 
  153         win = min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / CCV(ccv, t_maxseg);
  154         win -= max((win / 2), 1);
  155         CCV(ccv, snd_ssthresh) = max(win, 2) * CCV(ccv, t_maxseg);
  156 }
  157 
  158 /*
  159  * Probabilistic backoff function. Returns 1 if we should backoff or 0
  160  * otherwise. The calculation of p is similar to the calculation of p in cc_hd.
  161  */
  162 static __inline int
  163 should_backoff(int qdly, int maxqdly, struct chd *chd_data)
  164 {
  165         unsigned long p, rand;
  166 
  167         rand = random();
  168 
  169         if (qdly < V_chd_qthresh) {
  170                 chd_data->loss_compete = 0;
  171                 p = (((RANDOM_MAX / 100) * V_chd_pmax) /
  172                     (V_chd_qthresh - V_chd_qmin)) *
  173                     (qdly - V_chd_qmin);
  174         } else {
  175                 if (qdly > V_chd_qthresh) {
  176                         p = (((RANDOM_MAX / 100) * V_chd_pmax) /
  177                             (maxqdly - V_chd_qthresh)) *
  178                             (maxqdly - qdly);
  179                         if (V_chd_loss_fair && rand < p)
  180                                 chd_data->loss_compete = 1;
  181                 } else {
  182                         p = (RANDOM_MAX / 100) * V_chd_pmax;
  183                         chd_data->loss_compete = 0;
  184                 }
  185         }
  186 
  187         return (rand < p);
  188 }
  189 
  190 static __inline void
  191 chd_window_increase(struct cc_var *ccv, int new_measurement)
  192 {
  193         struct chd *chd_data;
  194         int incr;
  195 
  196         chd_data = ccv->cc_data;
  197         incr = 0;
  198 
  199         if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) {
  200                 /* Adapted from NewReno slow start. */
  201                 if (V_tcp_do_rfc3465) {
  202                         /* In slow-start with ABC enabled. */
  203                         if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) {
  204                                 /* Not due to RTO. */
  205                                 incr = min(ccv->bytes_this_ack,
  206                                     V_tcp_abc_l_var * CCV(ccv, t_maxseg));
  207                         } else {
  208                                 /* Due to RTO. */
  209                                 incr = min(ccv->bytes_this_ack,
  210                                     CCV(ccv, t_maxseg));
  211                         }
  212                 } else
  213                         incr = CCV(ccv, t_maxseg);
  214 
  215         } else { /* Congestion avoidance. */
  216                 if (V_tcp_do_rfc3465) {
  217                         if (ccv->flags & CCF_ABC_SENTAWND) {
  218                                 ccv->flags &= ~CCF_ABC_SENTAWND;
  219                                 incr = CCV(ccv, t_maxseg);
  220                         }
  221                 } else if (new_measurement)
  222                         incr = CCV(ccv, t_maxseg);
  223         }
  224 
  225         if (chd_data->shadow_w > 0) {
  226                 /* Track NewReno window. */
  227                 chd_data->shadow_w = min(chd_data->shadow_w + incr,
  228                     TCP_MAXWIN << CCV(ccv, snd_scale));
  229         }
  230 
  231         CCV(ccv,snd_cwnd) = min(CCV(ccv, snd_cwnd) + incr,
  232             TCP_MAXWIN << CCV(ccv, snd_scale));
  233 }
  234 
  235 /*
  236  * All ACK signals are used for timing measurements to determine delay-based
  237  * congestion. However, window increases are only performed when
  238  * ack_type == CC_ACK.
  239  */
  240 static void
  241 chd_ack_received(struct cc_var *ccv, uint16_t ack_type)
  242 {
  243         struct chd *chd_data;
  244         struct ertt *e_t;
  245         int backoff, new_measurement, qdly, rtt;
  246 
  247         e_t = khelp_get_osd(&CCV(ccv, t_osd), ertt_id);
  248         chd_data = ccv->cc_data;
  249         new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
  250         backoff = qdly = 0;
  251 
  252         chd_data->maxrtt_in_rtt = imax(e_t->rtt, chd_data->maxrtt_in_rtt);
  253 
  254         if (new_measurement) {
  255                 /*
  256                  * There is a new per RTT measurement, so check to see if there
  257                  * is delay based congestion.
  258                  */
  259                 rtt = V_chd_use_max ? chd_data->maxrtt_in_rtt : e_t->rtt;
  260                 chd_data->maxrtt_in_rtt = 0;
  261 
  262                 if (rtt && e_t->minrtt && !IN_RECOVERY(CCV(ccv, t_flags))) {
  263                         qdly = rtt - e_t->minrtt;
  264                         if (qdly > V_chd_qmin) {
  265                                 /*
  266                                  * Probabilistic delay based congestion
  267                                  * indication.
  268                                  */
  269                                 backoff = should_backoff(qdly,
  270                                     e_t->maxrtt - e_t->minrtt, chd_data);
  271                         } else
  272                                 chd_data->loss_compete = 0;
  273                 }
  274                 /* Reset per RTT measurement flag to start a new measurement. */
  275                 e_t->flags &= ~ERTT_NEW_MEASUREMENT;
  276         }
  277 
  278         if (backoff) {
  279                 /*
  280                  * Update shadow_w before delay based backoff.
  281                  */
  282                 if (chd_data->loss_compete ||
  283                     qdly > chd_data->prev_backoff_qdly) {
  284                         /*
  285                          * Delay is higher than when we backed off previously,
  286                          * so it is possible that this flow is competing with
  287                          * loss based flows.
  288                          */
  289                         chd_data->shadow_w = max(CCV(ccv, snd_cwnd),
  290                             chd_data->shadow_w);
  291                 } else {
  292                         /*
  293                          * Reset shadow_w, as it is probable that this flow is
  294                          * not competing with loss based flows at the moment.
  295                          */
  296                         chd_data->shadow_w = 0;
  297                 }
  298 
  299                 chd_data->prev_backoff_qdly = qdly;
  300                 /*
  301                  * Send delay-based congestion signal to the congestion signal
  302                  * handler.
  303                  */
  304                 chd_cong_signal(ccv, CC_CHD_DELAY);
  305 
  306         } else if (ack_type == CC_ACK)
  307                 chd_window_increase(ccv, new_measurement);
  308 }
  309 
  310 static void
  311 chd_cb_destroy(struct cc_var *ccv)
  312 {
  313         free(ccv->cc_data, M_CC_MEM);
  314 }
  315 
  316 size_t
  317 chd_data_sz(void)
  318 {
  319         return (sizeof(struct chd));
  320 }
  321 
  322 static int
  323 chd_cb_init(struct cc_var *ccv, void *ptr)
  324 {
  325         struct chd *chd_data;
  326 
  327         INP_WLOCK_ASSERT(tptoinpcb(ccv->ccvc.tcp));
  328         if (ptr == NULL) {
  329                 chd_data = malloc(sizeof(struct chd), M_CC_MEM, M_NOWAIT);
  330                 if (chd_data == NULL)
  331                         return (ENOMEM);
  332         } else
  333                 chd_data = ptr;
  334 
  335         chd_data->shadow_w = 0;
  336         ccv->cc_data = chd_data;
  337 
  338         return (0);
  339 }
  340 
  341 static void
  342 chd_cong_signal(struct cc_var *ccv, uint32_t signal_type)
  343 {
  344         struct ertt *e_t;
  345         struct chd *chd_data;
  346         int qdly;
  347 
  348         e_t = khelp_get_osd(&CCV(ccv, t_osd), ertt_id);
  349         chd_data = ccv->cc_data;
  350         qdly = imax(e_t->rtt, chd_data->maxrtt_in_rtt) - e_t->minrtt;
  351 
  352         switch(signal_type) {
  353         case CC_CHD_DELAY:
  354                 chd_window_decrease(ccv); /* Set new ssthresh. */
  355                 CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
  356                 CCV(ccv, snd_recover) = CCV(ccv, snd_max);
  357                 ENTER_CONGRECOVERY(CCV(ccv, t_flags));
  358                 break;
  359 
  360         case CC_NDUPACK: /* Packet loss. */
  361                 /*
  362                  * Only react to loss as a congestion signal if qdly >
  363                  * V_chd_qthresh.  If qdly is less than qthresh, presume that
  364                  * this is a non congestion related loss. If qdly is greater
  365                  * than qthresh, assume that we are competing with loss based
  366                  * tcp flows and restore window from any unnecessary backoffs,
  367                  * before the decrease.
  368                  */
  369                 if (!IN_RECOVERY(CCV(ccv, t_flags)) && qdly > V_chd_qthresh) {
  370                         if (chd_data->loss_compete) {
  371                                 CCV(ccv, snd_cwnd) = max(CCV(ccv, snd_cwnd),
  372                                     chd_data->shadow_w);
  373                         }
  374                         chd_window_decrease(ccv);
  375                 } else {
  376                          /*
  377                           * This loss isn't congestion related, or already
  378                           * recovering from congestion.
  379                           */
  380                         CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
  381                         CCV(ccv, snd_recover) = CCV(ccv, snd_max);
  382                 }
  383 
  384                 if (chd_data->shadow_w > 0) {
  385                         chd_data->shadow_w = max(chd_data->shadow_w /
  386                             CCV(ccv, t_maxseg) / 2, 2) * CCV(ccv, t_maxseg);
  387                 }
  388                 ENTER_FASTRECOVERY(CCV(ccv, t_flags));
  389                 break;
  390 
  391         default:
  392                 newreno_cc_cong_signal(ccv, signal_type);
  393         }
  394 }
  395 
  396 static void
  397 chd_conn_init(struct cc_var *ccv)
  398 {
  399         struct chd *chd_data;
  400 
  401         chd_data = ccv->cc_data;
  402         chd_data->prev_backoff_qdly = 0;
  403         chd_data->maxrtt_in_rtt = 0;
  404         chd_data->loss_compete = 0;
  405         /*
  406          * Initialise the shadow_cwnd to be equal to snd_cwnd in case we are
  407          * competing with loss based flows from the start.
  408          */
  409         chd_data->shadow_w = CCV(ccv, snd_cwnd);
  410 }
  411 
  412 static int
  413 chd_mod_init(void)
  414 {
  415 
  416         ertt_id = khelp_get_id("ertt");
  417         if (ertt_id <= 0) {
  418                 printf("%s: h_ertt module not found\n", __func__);
  419                 return (ENOENT);
  420         }
  421         return (0);
  422 }
  423 
  424 static int
  425 chd_loss_fair_handler(SYSCTL_HANDLER_ARGS)
  426 {
  427         int error;
  428         uint32_t new;
  429 
  430         new = V_chd_loss_fair;
  431         error = sysctl_handle_int(oidp, &new, 0, req);
  432         if (error == 0 && req->newptr != NULL) {
  433                 if (new > 1)
  434                         error = EINVAL;
  435                 else
  436                         V_chd_loss_fair = new;
  437         }
  438 
  439         return (error);
  440 }
  441 
  442 static int
  443 chd_pmax_handler(SYSCTL_HANDLER_ARGS)
  444 {
  445         int error;
  446         uint32_t new;
  447 
  448         new = V_chd_pmax;
  449         error = sysctl_handle_int(oidp, &new, 0, req);
  450         if (error == 0 && req->newptr != NULL) {
  451                 if (new == 0 || new > 100)
  452                         error = EINVAL;
  453                 else
  454                         V_chd_pmax = new;
  455         }
  456 
  457         return (error);
  458 }
  459 
  460 static int
  461 chd_qthresh_handler(SYSCTL_HANDLER_ARGS)
  462 {
  463         int error;
  464         uint32_t new;
  465 
  466         new = V_chd_qthresh;
  467         error = sysctl_handle_int(oidp, &new, 0, req);
  468         if (error == 0 && req->newptr != NULL) {
  469                 if (new <= V_chd_qmin)
  470                         error = EINVAL;
  471                 else
  472                         V_chd_qthresh = new;
  473         }
  474 
  475         return (error);
  476 }
  477 
  478 SYSCTL_DECL(_net_inet_tcp_cc_chd);
  479 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, chd, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL,
  480     "CAIA Hamilton delay-based congestion control related settings");
  481 
  482 SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, loss_fair,
  483     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  484     &VNET_NAME(chd_loss_fair), 1, &chd_loss_fair_handler,
  485     "IU", "Flag to enable shadow window functionality.");
  486 
  487 SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, pmax,
  488     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  489     &VNET_NAME(chd_pmax), 5, &chd_pmax_handler,
  490     "IU", "Per RTT maximum backoff probability as a percentage");
  491 
  492 SYSCTL_PROC(_net_inet_tcp_cc_chd, OID_AUTO, queue_threshold,
  493     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  494     &VNET_NAME(chd_qthresh), 20, &chd_qthresh_handler,
  495     "IU", "Queueing congestion threshold in ticks");
  496 
  497 SYSCTL_UINT(_net_inet_tcp_cc_chd, OID_AUTO, queue_min,
  498     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(chd_qmin), 5,
  499     "Minimum queueing delay threshold in ticks");
  500 
  501 SYSCTL_UINT(_net_inet_tcp_cc_chd,  OID_AUTO, use_max,
  502     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(chd_use_max), 1,
  503     "Use the maximum RTT seen within the measurement period (RTT) "
  504     "as the basic delay measurement for the algorithm.");
  505 
  506 DECLARE_CC_MODULE(chd, &chd_cc_algo);
  507 MODULE_VERSION(chd, 2);
  508 MODULE_DEPEND(chd, ertt, 1, 1, 1);

Cache object: 2ba1989f8f7d6582303f03deef2bba29


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.