The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/cc/cc_chd.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 2009-2010
    3  *      Swinburne University of Technology, Melbourne, Australia
    4  * Copyright (c) 2010-2011 The FreeBSD Foundation
    5  * All rights reserved.
    6  *
    7  * This software was developed at the Centre for Advanced Internet
    8  * Architectures, Swinburne University of Technology, by David Hayes and
    9  * Lawrence Stewart, made possible in part by a grant from the Cisco University
   10  * Research Program Fund at Community Foundation Silicon Valley.
   11  *
   12  * Portions of this software were developed at the Centre for Advanced Internet
   13  * Architectures, Swinburne University of Technology, Melbourne, Australia by
   14  * David Hayes under sponsorship from the FreeBSD Foundation.
   15  *
   16  * Redistribution and use in source and binary forms, with or without
   17  * modification, are permitted provided that the following conditions
   18  * are met:
   19  * 1. Redistributions of source code must retain the above copyright
   20  *    notice, this list of conditions and the following disclaimer.
   21  * 2. Redistributions in binary form must reproduce the above copyright
   22  *    notice, this list of conditions and the following disclaimer in the
   23  *    documentation and/or other materials provided with the distribution.
   24  *
   25  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   28  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   35  * SUCH DAMAGE.
   36  */
   37 
   38 /*
   39  * An implementation of the CAIA-Hamilton delay based congestion control
   40  * algorithm, based on "Improved coexistence and loss tolerance for delay based
   41  * TCP congestion control" by D. A. Hayes and G. Armitage., in 35th Annual IEEE
   42  * Conference on Local Computer Networks (LCN 2010), Denver, Colorado, USA,
   43  * 11-14 October 2010.
   44  *
   45  * Originally released as part of the NewTCP research project at Swinburne
   46  * University of Technology's Centre for Advanced Internet Architectures,
   47  * Melbourne, Australia, which was made possible in part by a grant from the
   48  * Cisco University Research Program Fund at Community Foundation Silicon
   49  * Valley. More details are available at:
   50  *   http://caia.swin.edu.au/urp/newtcp/
   51  */
   52 
   53 #include <sys/cdefs.h>
   54 __FBSDID("$FreeBSD: releng/10.3/sys/netinet/cc/cc_chd.c 273736 2014-10-27 14:38:00Z hselasky $");
   55 
   56 #include <sys/param.h>
   57 #include <sys/kernel.h>
   58 #include <sys/khelp.h>
   59 #include <sys/limits.h>
   60 #include <sys/malloc.h>
   61 #include <sys/module.h>
   62 #include <sys/queue.h>
   63 #include <sys/socket.h>
   64 #include <sys/socketvar.h>
   65 #include <sys/sysctl.h>
   66 #include <sys/systm.h>
   67 
   68 #include <net/if.h>
   69 #include <net/vnet.h>
   70 
   71 #include <netinet/cc.h>
   72 #include <netinet/tcp_seq.h>
   73 #include <netinet/tcp_timer.h>
   74 #include <netinet/tcp_var.h>
   75 
   76 #include <netinet/cc/cc_module.h>
   77 
   78 #include <netinet/khelp/h_ertt.h>
   79 
   80 #define CAST_PTR_INT(X) (*((int*)(X)))
   81 
   82 /*
   83  * Private signal type for rate based congestion signal.
   84  * See <netinet/cc.h> for appropriate bit-range to use for private signals.
   85  */
   86 #define CC_CHD_DELAY    0x02000000
   87 
   88 /* Largest possible number returned by random(). */
   89 #define RANDOM_MAX      INT_MAX
   90 
   91 static void     chd_ack_received(struct cc_var *ccv, uint16_t ack_type);
   92 static void     chd_cb_destroy(struct cc_var *ccv);
   93 static int      chd_cb_init(struct cc_var *ccv);
   94 static void     chd_cong_signal(struct cc_var *ccv, uint32_t signal_type);
   95 static void     chd_conn_init(struct cc_var *ccv);
   96 static int      chd_mod_init(void);
   97 
   98 struct chd {
   99         /*
  100          * Shadow window - keeps track of what the NewReno congestion window
  101          * would have been if delay-based cwnd backoffs had not been made. This
  102          * functionality aids coexistence with loss-based TCP flows which may be
  103          * sharing links along the path.
  104          */
  105         unsigned long shadow_w;
  106         /*
  107          * Loss-based TCP compatibility flag - When set, it turns on the shadow
  108          * window functionality.
  109          */
  110         int loss_compete;
  111          /* The maximum round trip time seen within a measured rtt period. */
  112         int maxrtt_in_rtt;
  113         /* The previous qdly that caused cwnd to backoff. */
  114         int prev_backoff_qdly;
  115 };
  116 
  117 static int ertt_id;
  118 
  119 static VNET_DEFINE(uint32_t, chd_qmin) = 5;
  120 static VNET_DEFINE(uint32_t, chd_pmax) = 50;
  121 static VNET_DEFINE(uint32_t, chd_loss_fair) = 1;
  122 static VNET_DEFINE(uint32_t, chd_use_max) = 1;
  123 static VNET_DEFINE(uint32_t, chd_qthresh) = 20;
  124 #define V_chd_qthresh   VNET(chd_qthresh)
  125 #define V_chd_qmin      VNET(chd_qmin)
  126 #define V_chd_pmax      VNET(chd_pmax)
  127 #define V_chd_loss_fair VNET(chd_loss_fair)
  128 #define V_chd_use_max   VNET(chd_use_max)
  129 
  130 static MALLOC_DEFINE(M_CHD, "chd data",
  131     "Per connection data required for the CHD congestion control algorithm");
  132 
  133 struct cc_algo chd_cc_algo = {
  134         .name = "chd",
  135         .ack_received = chd_ack_received,
  136         .cb_destroy = chd_cb_destroy,
  137         .cb_init = chd_cb_init,
  138         .cong_signal = chd_cong_signal,
  139         .conn_init = chd_conn_init,
  140         .mod_init = chd_mod_init
  141 };
  142 
  143 static __inline void
  144 chd_window_decrease(struct cc_var *ccv)
  145 {
  146         unsigned long win;
  147 
  148         win = min(CCV(ccv, snd_wnd), CCV(ccv, snd_cwnd)) / CCV(ccv, t_maxseg);
  149         win -= max((win / 2), 1);
  150         CCV(ccv, snd_ssthresh) = max(win, 2) * CCV(ccv, t_maxseg);
  151 }
  152 
  153 /*
  154  * Probabilistic backoff function. Returns 1 if we should backoff or 0
  155  * otherwise. The calculation of p is similar to the calculation of p in cc_hd.
  156  */
  157 static __inline int
  158 should_backoff(int qdly, int maxqdly, struct chd *chd_data)
  159 {
  160         unsigned long p, rand;
  161 
  162         rand = random();
  163 
  164         if (qdly < V_chd_qthresh) {
  165                 chd_data->loss_compete = 0;
  166                 p = (((RANDOM_MAX / 100) * V_chd_pmax) /
  167                     (V_chd_qthresh - V_chd_qmin)) *
  168                     (qdly - V_chd_qmin);
  169         } else {
  170                 if (qdly > V_chd_qthresh) {
  171                         p = (((RANDOM_MAX / 100) * V_chd_pmax) /
  172                             (maxqdly - V_chd_qthresh)) *
  173                             (maxqdly - qdly);
  174                         if (V_chd_loss_fair && rand < p)
  175                                 chd_data->loss_compete = 1;
  176                 } else {
  177                         p = (RANDOM_MAX / 100) * V_chd_pmax;
  178                         chd_data->loss_compete = 0;
  179                 }
  180         }
  181 
  182         return (rand < p);
  183 }
  184 
  185 static __inline void
  186 chd_window_increase(struct cc_var *ccv, int new_measurement)
  187 {
  188         struct chd *chd_data;
  189         int incr;
  190 
  191         chd_data = ccv->cc_data;
  192         incr = 0;
  193 
  194         if (CCV(ccv, snd_cwnd) <= CCV(ccv, snd_ssthresh)) {
  195                 /* Adapted from NewReno slow start. */
  196                 if (V_tcp_do_rfc3465) {
  197                         /* In slow-start with ABC enabled. */
  198                         if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) {
  199                                 /* Not due to RTO. */
  200                                 incr = min(ccv->bytes_this_ack,
  201                                     V_tcp_abc_l_var * CCV(ccv, t_maxseg));
  202                         } else {
  203                                 /* Due to RTO. */
  204                                 incr = min(ccv->bytes_this_ack,
  205                                     CCV(ccv, t_maxseg));
  206                         }
  207                 } else
  208                         incr = CCV(ccv, t_maxseg);
  209 
  210         } else { /* Congestion avoidance. */
  211                 if (V_tcp_do_rfc3465) {
  212                         if (ccv->flags & CCF_ABC_SENTAWND) {
  213                                 ccv->flags &= ~CCF_ABC_SENTAWND;
  214                                 incr = CCV(ccv, t_maxseg);
  215                         }
  216                 } else if (new_measurement)
  217                         incr = CCV(ccv, t_maxseg);
  218         }
  219 
  220         if (chd_data->shadow_w > 0) {
  221                 /* Track NewReno window. */
  222                 chd_data->shadow_w = min(chd_data->shadow_w + incr,
  223                     TCP_MAXWIN << CCV(ccv, snd_scale));
  224         }
  225 
  226         CCV(ccv,snd_cwnd) = min(CCV(ccv, snd_cwnd) + incr,
  227             TCP_MAXWIN << CCV(ccv, snd_scale));
  228 }
  229 
  230 /*
  231  * All ACK signals are used for timing measurements to determine delay-based
  232  * congestion. However, window increases are only performed when
  233  * ack_type == CC_ACK.
  234  */
  235 static void
  236 chd_ack_received(struct cc_var *ccv, uint16_t ack_type)
  237 {
  238         struct chd *chd_data;
  239         struct ertt *e_t;
  240         int backoff, new_measurement, qdly, rtt;
  241 
  242         e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);
  243         chd_data = ccv->cc_data;
  244         new_measurement = e_t->flags & ERTT_NEW_MEASUREMENT;
  245         backoff = qdly = 0;
  246 
  247         chd_data->maxrtt_in_rtt = imax(e_t->rtt, chd_data->maxrtt_in_rtt);
  248 
  249         if (new_measurement) {
  250                 /*
  251                  * There is a new per RTT measurement, so check to see if there
  252                  * is delay based congestion.
  253                  */
  254                 rtt = V_chd_use_max ? chd_data->maxrtt_in_rtt : e_t->rtt;
  255                 chd_data->maxrtt_in_rtt = 0;
  256 
  257                 if (rtt && e_t->minrtt && !IN_RECOVERY(CCV(ccv, t_flags))) {
  258                         qdly = rtt - e_t->minrtt;
  259                         if (qdly > V_chd_qmin) {
  260                                 /*
  261                                  * Probabilistic delay based congestion
  262                                  * indication.
  263                                  */
  264                                 backoff = should_backoff(qdly,
  265                                     e_t->maxrtt - e_t->minrtt, chd_data);
  266                         } else
  267                                 chd_data->loss_compete = 0;
  268                 }
  269                 /* Reset per RTT measurement flag to start a new measurement. */
  270                 e_t->flags &= ~ERTT_NEW_MEASUREMENT;
  271         }
  272 
  273         if (backoff) {
  274                 /*
  275                  * Update shadow_w before delay based backoff.
  276                  */
  277                 if (chd_data->loss_compete ||
  278                     qdly > chd_data->prev_backoff_qdly) {
  279                         /*
  280                          * Delay is higher than when we backed off previously,
  281                          * so it is possible that this flow is competing with
  282                          * loss based flows.
  283                          */
  284                         chd_data->shadow_w = max(CCV(ccv, snd_cwnd),
  285                             chd_data->shadow_w);
  286                 } else {
  287                         /*
  288                          * Reset shadow_w, as it is probable that this flow is
  289                          * not competing with loss based flows at the moment.
  290                          */
  291                         chd_data->shadow_w = 0;
  292                 }
  293 
  294                 chd_data->prev_backoff_qdly = qdly;
  295                 /*
  296                  * Send delay-based congestion signal to the congestion signal
  297                  * handler.
  298                  */
  299                 chd_cong_signal(ccv, CC_CHD_DELAY);
  300 
  301         } else if (ack_type == CC_ACK)
  302                 chd_window_increase(ccv, new_measurement);
  303 }
  304 
  305 static void
  306 chd_cb_destroy(struct cc_var *ccv)
  307 {
  308 
  309         if (ccv->cc_data != NULL)
  310                 free(ccv->cc_data, M_CHD);
  311 }
  312 
  313 static int
  314 chd_cb_init(struct cc_var *ccv)
  315 {
  316         struct chd *chd_data;
  317 
  318         chd_data = malloc(sizeof(struct chd), M_CHD, M_NOWAIT);
  319         if (chd_data == NULL)
  320                 return (ENOMEM);
  321 
  322         chd_data->shadow_w = 0;
  323         ccv->cc_data = chd_data;
  324 
  325         return (0);
  326 }
  327 
  328 static void
  329 chd_cong_signal(struct cc_var *ccv, uint32_t signal_type)
  330 {
  331         struct ertt *e_t;
  332         struct chd *chd_data;
  333         int qdly;
  334 
  335         e_t = khelp_get_osd(CCV(ccv, osd), ertt_id);
  336         chd_data = ccv->cc_data;
  337         qdly = imax(e_t->rtt, chd_data->maxrtt_in_rtt) - e_t->minrtt;
  338 
  339         switch(signal_type) {
  340         case CC_CHD_DELAY:
  341                 chd_window_decrease(ccv); /* Set new ssthresh. */
  342                 CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
  343                 CCV(ccv, snd_recover) = CCV(ccv, snd_max);
  344                 ENTER_CONGRECOVERY(CCV(ccv, t_flags));
  345                 break;
  346 
  347         case CC_NDUPACK: /* Packet loss. */
  348                 /*
  349                  * Only react to loss as a congestion signal if qdly >
  350                  * V_chd_qthresh.  If qdly is less than qthresh, presume that
  351                  * this is a non congestion related loss. If qdly is greater
  352                  * than qthresh, assume that we are competing with loss based
  353                  * tcp flows and restore window from any unnecessary backoffs,
  354                  * before the decrease.
  355                  */
  356                 if (!IN_RECOVERY(CCV(ccv, t_flags)) && qdly > V_chd_qthresh) {
  357                         if (chd_data->loss_compete) {
  358                                 CCV(ccv, snd_cwnd) = max(CCV(ccv, snd_cwnd),
  359                                     chd_data->shadow_w);
  360                         }
  361                         chd_window_decrease(ccv);
  362                 } else {
  363                          /*
  364                           * This loss isn't congestion related, or already
  365                           * recovering from congestion.
  366                           */
  367                         CCV(ccv, snd_ssthresh) = CCV(ccv, snd_cwnd);
  368                         CCV(ccv, snd_recover) = CCV(ccv, snd_max);
  369                 }
  370 
  371                 if (chd_data->shadow_w > 0) {
  372                         chd_data->shadow_w = max(chd_data->shadow_w /
  373                             CCV(ccv, t_maxseg) / 2, 2) * CCV(ccv, t_maxseg);
  374                 }
  375                 ENTER_FASTRECOVERY(CCV(ccv, t_flags));
  376                 break;
  377 
  378         default:
  379                 newreno_cc_algo.cong_signal(ccv, signal_type);
  380         }
  381 }
  382 
  383 static void
  384 chd_conn_init(struct cc_var *ccv)
  385 {
  386         struct chd *chd_data;
  387 
  388         chd_data = ccv->cc_data;
  389         chd_data->prev_backoff_qdly = 0;
  390         chd_data->maxrtt_in_rtt = 0;
  391         chd_data->loss_compete = 0;
  392         /*
  393          * Initialise the shadow_cwnd to be equal to snd_cwnd in case we are
  394          * competing with loss based flows from the start.
  395          */
  396         chd_data->shadow_w = CCV(ccv, snd_cwnd);
  397 }
  398 
  399 static int
  400 chd_mod_init(void)
  401 {
  402 
  403         ertt_id = khelp_get_id("ertt");
  404         if (ertt_id <= 0) {
  405                 printf("%s: h_ertt module not found\n", __func__);
  406                 return (ENOENT);
  407         }
  408 
  409         chd_cc_algo.after_idle = newreno_cc_algo.after_idle;
  410         chd_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
  411 
  412         return (0);
  413 }
  414 
  415 static int
  416 chd_loss_fair_handler(SYSCTL_HANDLER_ARGS)
  417 {
  418         int error;
  419         uint32_t new;
  420 
  421         new = V_chd_loss_fair;
  422         error = sysctl_handle_int(oidp, &new, 0, req);
  423         if (error == 0 && req->newptr != NULL) {
  424                 if (CAST_PTR_INT(req->newptr) > 1)
  425                         error = EINVAL;
  426                 else
  427                         V_chd_loss_fair = new;
  428         }
  429 
  430         return (error);
  431 }
  432 
  433 static int
  434 chd_pmax_handler(SYSCTL_HANDLER_ARGS)
  435 {
  436         int error;
  437         uint32_t new;
  438 
  439         new = V_chd_pmax;
  440         error = sysctl_handle_int(oidp, &new, 0, req);
  441         if (error == 0 && req->newptr != NULL) {
  442                 if (CAST_PTR_INT(req->newptr) == 0 ||
  443                     CAST_PTR_INT(req->newptr) > 100)
  444                         error = EINVAL;
  445                 else
  446                         V_chd_pmax = new;
  447         }
  448 
  449         return (error);
  450 }
  451 
  452 static int
  453 chd_qthresh_handler(SYSCTL_HANDLER_ARGS)
  454 {
  455         int error;
  456         uint32_t new;
  457 
  458         new = V_chd_qthresh;
  459         error = sysctl_handle_int(oidp, &new, 0, req);
  460         if (error == 0 && req->newptr != NULL) {
  461                 if (CAST_PTR_INT(req->newptr) <= V_chd_qmin)
  462                         error = EINVAL;
  463                 else
  464                         V_chd_qthresh = new;
  465         }
  466 
  467         return (error);
  468 }
  469 
  470 SYSCTL_DECL(_net_inet_tcp_cc_chd);
  471 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, chd, CTLFLAG_RW, NULL,
  472     "CAIA Hamilton delay-based congestion control related settings");
  473 
  474 SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, loss_fair,
  475     CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_loss_fair), 1, &chd_loss_fair_handler,
  476     "IU", "Flag to enable shadow window functionality.");
  477 
  478 SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, pmax,
  479     CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_pmax), 5, &chd_pmax_handler,
  480     "IU", "Per RTT maximum backoff probability as a percentage");
  481 
  482 SYSCTL_VNET_PROC(_net_inet_tcp_cc_chd, OID_AUTO, queue_threshold,
  483     CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(chd_qthresh), 20, &chd_qthresh_handler,
  484     "IU", "Queueing congestion threshold in ticks");
  485 
  486 SYSCTL_VNET_UINT(_net_inet_tcp_cc_chd, OID_AUTO, queue_min,
  487     CTLFLAG_RW, &VNET_NAME(chd_qmin), 5,
  488     "Minimum queueing delay threshold in ticks");
  489 
  490 SYSCTL_VNET_UINT(_net_inet_tcp_cc_chd,  OID_AUTO, use_max,
  491     CTLFLAG_RW, &VNET_NAME(chd_use_max), 1,
  492     "Use the maximum RTT seen within the measurement period (RTT) "
  493     "as the basic delay measurement for the algorithm.");
  494 
  495 DECLARE_CC_MODULE(chd, &chd_cc_algo);
  496 MODULE_DEPEND(chd, ertt, 1, 1, 1);

Cache object: 0f97c3f03a8730134b198aff48c56ad2


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.