The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_congctl.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $  */
    2 
    3 /*-
    4  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
    5  * All rights reserved.
    6  *
    7  * This code is derived from software contributed to The NetBSD Foundation
    8  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
    9  * Facility, NASA Ames Research Center.
   10  * This code is derived from software contributed to The NetBSD Foundation
   11  * by Charles M. Hannum.
   12  * This code is derived from software contributed to The NetBSD Foundation
   13  * by Rui Paulo.
   14  *
   15  * Redistribution and use in source and binary forms, with or without
   16  * modification, are permitted provided that the following conditions
   17  * are met:
   18  * 1. Redistributions of source code must retain the above copyright
   19  *    notice, this list of conditions and the following disclaimer.
   20  * 2. Redistributions in binary form must reproduce the above copyright
   21  *    notice, this list of conditions and the following disclaimer in the
   22  *    documentation and/or other materials provided with the distribution.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   25  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   26  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   27  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   28  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   29  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   30  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   31  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   32  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   33  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   34  * POSSIBILITY OF SUCH DAMAGE.
   35  */
   36 
   37 /*
   38  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
   39  * All rights reserved.
   40  *
   41  * Redistribution and use in source and binary forms, with or without
   42  * modification, are permitted provided that the following conditions
   43  * are met:
   44  * 1. Redistributions of source code must retain the above copyright
   45  *    notice, this list of conditions and the following disclaimer.
   46  * 2. Redistributions in binary form must reproduce the above copyright
   47  *    notice, this list of conditions and the following disclaimer in the
   48  *    documentation and/or other materials provided with the distribution.
   49  * 3. Neither the name of the project nor the names of its contributors
   50  *    may be used to endorse or promote products derived from this software
   51  *    without specific prior written permission.
   52  *
   53  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
   54  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   55  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   56  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
   57  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   58  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   59  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   60  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   61  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   62  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   63  * SUCH DAMAGE.
   64  */
   65 
   66 /*
   67  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
   68  *
   69  * NRL grants permission for redistribution and use in source and binary
   70  * forms, with or without modification, of the software and documentation
   71  * created at NRL provided that the following conditions are met:
   72  *
   73  * 1. Redistributions of source code must retain the above copyright
   74  *    notice, this list of conditions and the following disclaimer.
   75  * 2. Redistributions in binary form must reproduce the above copyright
   76  *    notice, this list of conditions and the following disclaimer in the
   77  *    documentation and/or other materials provided with the distribution.
   78  * 3. All advertising materials mentioning features or use of this software
   79  *    must display the following acknowledgements:
   80  *      This product includes software developed by the University of
   81  *      California, Berkeley and its contributors.
   82  *      This product includes software developed at the Information
   83  *      Technology Division, US Naval Research Laboratory.
   84  * 4. Neither the name of the NRL nor the names of its contributors
   85  *    may be used to endorse or promote products derived from this software
   86  *    without specific prior written permission.
   87  *
   88  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
   89  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   90  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   91  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
   92  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   93  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   94  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   95  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   96  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   97  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   98  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   99  *
  100  * The views and conclusions contained in the software and documentation
  101  * are those of the authors and should not be interpreted as representing
  102  * official policies, either expressed or implied, of the US Naval
  103  * Research Laboratory (NRL).
  104  */
  105 
  106 /*
  107  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  108  *      The Regents of the University of California.  All rights reserved.
  109  *
  110  * Redistribution and use in source and binary forms, with or without
  111  * modification, are permitted provided that the following conditions
  112  * are met:
  113  * 1. Redistributions of source code must retain the above copyright
  114  *    notice, this list of conditions and the following disclaimer.
  115  * 2. Redistributions in binary form must reproduce the above copyright
  116  *    notice, this list of conditions and the following disclaimer in the
  117  *    documentation and/or other materials provided with the distribution.
  118  * 3. Neither the name of the University nor the names of its contributors
  119  *    may be used to endorse or promote products derived from this software
  120  *    without specific prior written permission.
  121  *
  122  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  123  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  124  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  125  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  126  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  127  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  128  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  129  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  130  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  131  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  132  * SUCH DAMAGE.
  133  *
  134  *      @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
  135  */
  136 
  137 #include <sys/cdefs.h>
  138 __KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $");
  139 
  140 #ifdef _KERNEL_OPT
  141 #include "opt_inet.h"
  142 #include "opt_tcp_debug.h"
  143 #include "opt_tcp_congctl.h"
  144 #endif
  145 
  146 #include <sys/param.h>
  147 #include <sys/systm.h>
  148 #include <sys/malloc.h>
  149 #include <sys/mbuf.h>
  150 #include <sys/protosw.h>
  151 #include <sys/socket.h>
  152 #include <sys/socketvar.h>
  153 #include <sys/errno.h>
  154 #include <sys/syslog.h>
  155 #include <sys/pool.h>
  156 #include <sys/domain.h>
  157 #include <sys/kernel.h>
  158 #include <sys/mutex.h>
  159 
  160 #include <net/if.h>
  161 
  162 #include <netinet/in.h>
  163 #include <netinet/in_systm.h>
  164 #include <netinet/ip.h>
  165 #include <netinet/in_pcb.h>
  166 #include <netinet/in_var.h>
  167 #include <netinet/ip_var.h>
  168 
  169 #ifdef INET6
  170 #include <netinet/ip6.h>
  171 #include <netinet6/ip6_var.h>
  172 #include <netinet6/in6_pcb.h>
  173 #include <netinet6/ip6_var.h>
  174 #include <netinet6/in6_var.h>
  175 #include <netinet/icmp6.h>
  176 #endif
  177 
  178 #include <netinet/tcp.h>
  179 #include <netinet/tcp_fsm.h>
  180 #include <netinet/tcp_seq.h>
  181 #include <netinet/tcp_timer.h>
  182 #include <netinet/tcp_var.h>
  183 #include <netinet/tcp_congctl.h>
  184 #ifdef TCP_DEBUG
  185 #include <netinet/tcp_debug.h>
  186 #endif
  187 
  188 /*
  189  * TODO:
  190  *   consider separating the actual implementations in another file.
  191  */
  192 
  193 static void tcp_common_congestion_exp(struct tcpcb *, int, int);
  194 
  195 static int  tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
  196 static int  tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
  197 static void tcp_reno_slow_retransmit(struct tcpcb *);
  198 static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
  199     const struct tcphdr *);
  200 static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
  201 static void tcp_reno_congestion_exp(struct tcpcb *tp);
  202 
  203 static int  tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
  204 static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
  205         const struct tcphdr *);
  206 static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);
  207 
  208 static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
  209 static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
  210 static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
  211 static void tcp_cubic_congestion_exp(struct tcpcb *);
  212 
  213 static void tcp_congctl_fillnames(void);
  214 
  215 extern int tcprexmtthresh;
  216 
  217 MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");
  218 
  219 /* currently selected global congestion control */
  220 char tcp_congctl_global_name[TCPCC_MAXLEN];
  221 
  222 /* available global congestion control algorithms */
  223 char tcp_congctl_avail[10 * TCPCC_MAXLEN];
  224 
  225 /*
  226  * Used to list the available congestion control algorithms.
  227  */
  228 TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
  229     TAILQ_HEAD_INITIALIZER(tcp_congctlhd);
  230 
  231 static struct tcp_congctlent * tcp_congctl_global;
  232 
  233 static kmutex_t tcp_congctl_mtx;
  234 
  235 void
  236 tcp_congctl_init(void)
  237 {
  238         int r __diagused;
  239         
  240         mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);
  241 
  242         /* Base algorithms. */
  243         r = tcp_congctl_register("reno", &tcp_reno_ctl);
  244         KASSERT(r == 0);
  245         r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
  246         KASSERT(r == 0);
  247         r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
  248         KASSERT(r == 0);
  249 
  250         /* NewReno is the default. */
  251 #ifndef TCP_CONGCTL_DEFAULT
  252 #define TCP_CONGCTL_DEFAULT "newreno"
  253 #endif
  254 
  255         r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
  256         KASSERT(r == 0);
  257 }
  258 
  259 /*
  260  * Register a congestion algorithm and select it if we have none.
  261  */
  262 int
  263 tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
  264 {
  265         struct tcp_congctlent *ntcc, *tccp;
  266 
  267         TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) 
  268                 if (!strcmp(name, tccp->congctl_name)) {
  269                         /* name already registered */
  270                         return EEXIST;
  271                 }
  272 
  273         ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);
  274 
  275         strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1);
  276         ntcc->congctl_ctl = tcc;
  277 
  278         TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
  279         tcp_congctl_fillnames();
  280 
  281         if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
  282                 tcp_congctl_select(NULL, name);
  283                 
  284         return 0;
  285 }
  286 
  287 int
  288 tcp_congctl_unregister(const char *name)
  289 {
  290         struct tcp_congctlent *tccp, *rtccp;
  291         unsigned int size;
  292         
  293         rtccp = NULL;
  294         size = 0;
  295         TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
  296                 if (!strcmp(name, tccp->congctl_name))
  297                         rtccp = tccp;
  298                 size++;
  299         }
  300         
  301         if (!rtccp)
  302                 return ENOENT;
  303 
  304         if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
  305                 return EBUSY;
  306 
  307         TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
  308         free(rtccp, M_TCPCONGCTL);
  309         tcp_congctl_fillnames();
  310 
  311         return 0;
  312 }
  313 
  314 /*
  315  * Select a congestion algorithm by name.
  316  */
  317 int
  318 tcp_congctl_select(struct tcpcb *tp, const char *name)
  319 {
  320         struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
  321         bool old_found, new_found;
  322 
  323         KASSERT(name);
  324 
  325         old_found = (tp == NULL || tp->t_congctl == NULL);
  326         old_tccp = NULL;
  327         new_found = false;
  328         new_tccp = NULL;
  329 
  330         TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
  331                 if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
  332                         old_tccp = tccp;
  333                         old_found = true;
  334                 }
  335 
  336                 if (!new_found && !strcmp(name, tccp->congctl_name)) {
  337                         new_tccp = tccp;
  338                         new_found = true;
  339                 }
  340 
  341                 if (new_found && old_found) {
  342                         if (tp) {
  343                                 mutex_enter(&tcp_congctl_mtx);
  344                                 if (old_tccp)
  345                                         old_tccp->congctl_refcnt--;
  346                                 tp->t_congctl = new_tccp->congctl_ctl;
  347                                 new_tccp->congctl_refcnt++;
  348                                 mutex_exit(&tcp_congctl_mtx);
  349                         } else {
  350                                 tcp_congctl_global = new_tccp;
  351                                 strlcpy(tcp_congctl_global_name,
  352                                     new_tccp->congctl_name,
  353                                     sizeof(tcp_congctl_global_name) - 1);
  354                         }
  355                         return 0;
  356                 }
  357         }
  358 
  359         return EINVAL;
  360 }
  361 
  362 void
  363 tcp_congctl_release(struct tcpcb *tp)
  364 {
  365         struct tcp_congctlent *tccp;
  366 
  367         KASSERT(tp->t_congctl);
  368         
  369         TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
  370                 if (tccp->congctl_ctl == tp->t_congctl) {
  371                         tccp->congctl_refcnt--;
  372                         return;
  373                 }
  374         }
  375 }
  376 
  377 /*
  378  * Returns the name of a congestion algorithm.
  379  */
  380 const char *
  381 tcp_congctl_bystruct(const struct tcp_congctl *tcc)
  382 {
  383         struct tcp_congctlent *tccp;
  384         
  385         KASSERT(tcc);
  386         
  387         TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
  388                 if (tccp->congctl_ctl == tcc)
  389                         return tccp->congctl_name;
  390 
  391         return NULL;
  392 }
  393 
  394 static void
  395 tcp_congctl_fillnames(void)
  396 {
  397         struct tcp_congctlent *tccp;
  398         const char *delim = " ";
  399         
  400         tcp_congctl_avail[0] = '\0';
  401         TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
  402                 strlcat(tcp_congctl_avail, tccp->congctl_name,
  403                     sizeof(tcp_congctl_avail) - 1);
  404                 if (TAILQ_NEXT(tccp, congctl_ent))
  405                         strlcat(tcp_congctl_avail, delim, 
  406                             sizeof(tcp_congctl_avail) - 1);
  407         }       
  408         
  409 }
  410 
  411 /* ------------------------------------------------------------------------ */
  412 
  413 /*
  414  * Common stuff
  415  */
  416 
  417 /* Window reduction (1-beta) for [New]Reno: 0.5 */
  418 #define RENO_BETAA 1
  419 #define RENO_BETAB 2
  420 /* Window reduction (1-beta) for Cubic: 0.8 */
  421 #define CUBIC_BETAA 4
  422 #define CUBIC_BETAB 5
  423 /* Draft Rhee Section 4.1 */
  424 #define CUBIC_CA 4
  425 #define CUBIC_CB 10
  426 
  427 static void
  428 tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
  429 {
  430         u_long win;
  431 
  432         /* 
  433          * Reduce the congestion window and the slow start threshold.
  434          */
  435         win = ulmin(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
  436         if (win < 2)
  437                 win = 2;
  438 
  439         tp->snd_ssthresh = win * tp->t_segsz;
  440         tp->snd_recover = tp->snd_max;
  441         tp->snd_cwnd = tp->snd_ssthresh;
  442 
  443         /*
  444          * When using TCP ECN, notify the peer that
  445          * we reduced the cwnd.
  446          */
  447         if (TCP_ECN_ALLOWED(tp))
  448                 tp->t_flags |= TF_ECN_SND_CWR;
  449 }
  450 
  451 
  452 /* ------------------------------------------------------------------------ */
  453 
  454 /*
  455  * TCP/Reno congestion control.
  456  */
  457 static void
  458 tcp_reno_congestion_exp(struct tcpcb *tp)
  459 {
  460 
  461         tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
  462 }
  463 
  464 static int
  465 tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
  466 {
  467         /*
  468          * Dup acks mean that packets have left the
  469          * network (they're now cached at the receiver)
  470          * so bump cwnd by the amount in the receiver
  471          * to keep a constant cwnd packets in the
  472          * network.
  473          *
  474          * If we are using TCP/SACK, then enter
  475          * Fast Recovery if the receiver SACKs
  476          * data that is tcprexmtthresh * MSS
  477          * bytes past the last ACKed segment,
  478          * irrespective of the number of DupAcks.
  479          */
  480         
  481         tcp_seq onxt = tp->snd_nxt;
  482 
  483         tp->t_partialacks = 0;
  484         TCP_TIMER_DISARM(tp, TCPT_REXMT);
  485         tp->t_rtttime = 0;
  486         if (TCP_SACK_ENABLED(tp)) {
  487                 tp->t_dupacks = tcprexmtthresh;
  488                 tp->sack_newdata = tp->snd_nxt;
  489                 tp->snd_cwnd = tp->t_segsz;
  490                 (void) tcp_output(tp);
  491                 return 0;
  492         }
  493         tp->snd_nxt = th->th_ack;
  494         tp->snd_cwnd = tp->t_segsz;
  495         (void) tcp_output(tp);
  496         tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
  497         if (SEQ_GT(onxt, tp->snd_nxt))
  498                 tp->snd_nxt = onxt;
  499 
  500         return 0;
  501 }
  502 
  503 static int
  504 tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
  505 {
  506 
  507         /*
  508          * We know we're losing at the current
  509          * window size so do congestion avoidance
  510          * (set ssthresh to half the current window
  511          * and pull our congestion window back to
  512          * the new ssthresh).
  513          */
  514 
  515         tcp_reno_congestion_exp(tp);
  516         return tcp_reno_do_fast_retransmit(tp, th);
  517 }
  518 
  519 static void
  520 tcp_reno_slow_retransmit(struct tcpcb *tp)
  521 {
  522         u_long win;
  523 
  524         /*
  525          * Close the congestion window down to one segment
  526          * (we'll open it by one segment for each ack we get).
  527          * Since we probably have a window's worth of unacked
  528          * data accumulated, this "slow start" keeps us from
  529          * dumping all that data as back-to-back packets (which
  530          * might overwhelm an intermediate gateway).
  531          *
  532          * There are two phases to the opening: Initially we
  533          * open by one mss on each ack.  This makes the window
  534          * size increase exponentially with time.  If the
  535          * window is larger than the path can handle, this
  536          * exponential growth results in dropped packet(s)
  537          * almost immediately.  To get more time between
  538          * drops but still "push" the network to take advantage
  539          * of improving conditions, we switch from exponential
  540          * to linear window opening at some threshold size.
  541          * For a threshold, we use half the current window
  542          * size, truncated to a multiple of the mss.
  543          *
  544          * (the minimum cwnd that will give us exponential
  545          * growth is 2 mss.  We don't allow the threshold
  546          * to go below this.)
  547          */
  548 
  549         win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
  550         if (win < 2)
  551                 win = 2;
  552         /* Loss Window MUST be one segment. */
  553         tp->snd_cwnd = tp->t_segsz;
  554         tp->snd_ssthresh = win * tp->t_segsz;
  555         tp->t_partialacks = -1;
  556         tp->t_dupacks = 0;
  557         tp->t_bytes_acked = 0;
  558 
  559         if (TCP_ECN_ALLOWED(tp))
  560                 tp->t_flags |= TF_ECN_SND_CWR;
  561 }
  562 
  563 static void
  564 tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
  565     const struct tcphdr *th)
  566 {
  567         if (tp->t_partialacks < 0) {
  568                 /*
  569                  * We were not in fast recovery.  Reset the duplicate ack
  570                  * counter.
  571                  */
  572                 tp->t_dupacks = 0;
  573         } else {
  574                 /*
  575                  * Clamp the congestion window to the crossover point and
  576                  * exit fast recovery.
  577                  */
  578                 if (tp->snd_cwnd > tp->snd_ssthresh)
  579                         tp->snd_cwnd = tp->snd_ssthresh;
  580                 tp->t_partialacks = -1;
  581                 tp->t_dupacks = 0;
  582                 tp->t_bytes_acked = 0;
  583                 if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
  584                         tp->snd_fack = th->th_ack;
  585         }
  586 }
  587 
  588 static void
  589 tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
  590 {
  591         /*
  592          * When new data is acked, open the congestion window.
  593          */
  594 
  595         u_int cw = tp->snd_cwnd;
  596         u_int incr = tp->t_segsz;
  597 
  598         if (tcp_do_abc) {
  599 
  600                 /*
  601                  * RFC 3465 Appropriate Byte Counting (ABC)
  602                  */
  603 
  604                 int acked = th->th_ack - tp->snd_una;
  605 
  606                 if (cw >= tp->snd_ssthresh) {
  607                         tp->t_bytes_acked += acked;
  608                         if (tp->t_bytes_acked >= cw) {
  609                                 /* Time to increase the window. */
  610                                 tp->t_bytes_acked -= cw;
  611                         } else {
  612                                 /* No need to increase yet. */
  613                                 incr = 0;
  614                         }
  615                 } else {
  616                         /*
  617                          * use 2*SMSS or 1*SMSS for the "L" param,
  618                          * depending on sysctl setting.
  619                          *
  620                          * (See RFC 3465 2.3 Choosing the Limit)
  621                          */
  622                         u_int abc_lim;
  623 
  624                         abc_lim = (tcp_abc_aggressive == 0 ||
  625                             tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
  626                         incr = uimin(acked, abc_lim);
  627                 }
  628         } else {
  629 
  630                 /*
  631                  * If the window gives us less than ssthresh packets
  632                  * in flight, open exponentially (segsz per packet).
  633                  * Otherwise open linearly: segsz per window
  634                  * (segsz^2 / cwnd per packet).
  635                  */
  636 
  637                 if (cw >= tp->snd_ssthresh) {
  638                         incr = incr * incr / cw;
  639                 }
  640         }
  641 
  642         tp->snd_cwnd = uimin(cw + incr, TCP_MAXWIN << tp->snd_scale);
  643 }
  644 
  645 const struct tcp_congctl tcp_reno_ctl = {
  646         .fast_retransmit = tcp_reno_fast_retransmit,
  647         .slow_retransmit = tcp_reno_slow_retransmit,
  648         .fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
  649         .newack = tcp_reno_newack,
  650         .cong_exp = tcp_reno_congestion_exp,
  651 };
  652 
  653 /*
  654  * TCP/NewReno Congestion control.
  655  */
  656 static int
  657 tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
  658 {
  659 
  660         if (SEQ_LT(th->th_ack, tp->snd_high)) {
  661                 /*
  662                  * False fast retransmit after timeout.
  663                  * Do not enter fast recovery
  664                  */
  665                 tp->t_dupacks = 0;
  666                 return 1;
  667         }
  668         /*
  669          * Fast retransmit is same as reno.
  670          */
  671         return tcp_reno_fast_retransmit(tp, th);
  672 }
  673 
  674 /*
  675  * Implement the NewReno response to a new ack, checking for partial acks in
  676  * fast recovery.
  677  */
  678 static void
  679 tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
  680 {
  681         if (tp->t_partialacks < 0) {
  682                 /*
  683                  * We were not in fast recovery.  Reset the duplicate ack
  684                  * counter.
  685                  */
  686                 tp->t_dupacks = 0;
  687         } else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
  688                 /*
  689                  * This is a partial ack.  Retransmit the first unacknowledged
  690                  * segment and deflate the congestion window by the amount of
  691                  * acknowledged data.  Do not exit fast recovery.
  692                  */
  693                 tcp_seq onxt = tp->snd_nxt;
  694                 u_long ocwnd = tp->snd_cwnd;
  695                 int sack_num_segs = 1, sack_bytes_rxmt = 0;
  696 
  697                 /*
  698                  * snd_una has not yet been updated and the socket's send
  699                  * buffer has not yet drained off the ACK'd data, so we
  700                  * have to leave snd_una as it was to get the correct data
  701                  * offset in tcp_output().
  702                  */
  703                 tp->t_partialacks++;
  704                 TCP_TIMER_DISARM(tp, TCPT_REXMT);
  705                 tp->t_rtttime = 0;
  706 
  707                 if (TCP_SACK_ENABLED(tp)) {
  708                         /*
  709                          * Partial ack handling within a sack recovery episode.
  710                          * Keeping this very simple for now. When a partial ack
  711                          * is received, force snd_cwnd to a value that will
  712                          * allow the sender to transmit no more than 2 segments.
  713                          * If necessary, a fancier scheme can be adopted at a
  714                          * later point, but for now, the goal is to prevent the
  715                          * sender from bursting a large amount of data in the
  716                          * midst of sack recovery.
  717                          */
  718 
  719                         /*
  720                          * send one or 2 segments based on how much
  721                          * new data was acked
  722                          */
  723                         if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
  724                                 sack_num_segs = 2;
  725                         (void)tcp_sack_output(tp, &sack_bytes_rxmt);
  726                         tp->snd_cwnd = sack_bytes_rxmt +
  727                             (tp->snd_nxt - tp->sack_newdata) +
  728                             sack_num_segs * tp->t_segsz;
  729                         tp->t_flags |= TF_ACKNOW;
  730                         (void) tcp_output(tp);
  731                 } else {
  732                         tp->snd_nxt = th->th_ack;
  733                         /*
  734                          * Set snd_cwnd to one segment beyond ACK'd offset
  735                          * snd_una is not yet updated when we're called
  736                          */
  737                         tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
  738                         (void) tcp_output(tp);
  739                         tp->snd_cwnd = ocwnd;
  740                         if (SEQ_GT(onxt, tp->snd_nxt))
  741                                 tp->snd_nxt = onxt;
  742                         /*
  743                          * Partial window deflation.  Relies on fact that
  744                          * tp->snd_una not updated yet.
  745                          */
  746                         tp->snd_cwnd -= (th->th_ack - tp->snd_una -
  747                             tp->t_segsz);
  748                 }
  749         } else {
  750                 /*
  751                  * Complete ack.  Inflate the congestion window to ssthresh
  752                  * and exit fast recovery.
  753                  *
  754                  * Window inflation should have left us with approx.
  755                  * snd_ssthresh outstanding data.  But in case we
  756                  * would be inclined to send a burst, better to do
  757                  * it via the slow start mechanism.
  758                  */
  759                 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
  760                         tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
  761                             + tp->t_segsz;
  762                 else
  763                         tp->snd_cwnd = tp->snd_ssthresh;
  764                 tp->t_partialacks = -1;
  765                 tp->t_dupacks = 0;
  766                 tp->t_bytes_acked = 0;
  767                 if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
  768                         tp->snd_fack = th->th_ack;
  769         }
  770 }
  771 
  772 static void
  773 tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
  774 {
  775         /*
  776          * If we are still in fast recovery (meaning we are using
  777          * NewReno and we have only received partial acks), do not
  778          * inflate the window yet.
  779          */
  780         if (tp->t_partialacks < 0)
  781                 tcp_reno_newack(tp, th);
  782 }
  783 
  784 
  785 const struct tcp_congctl tcp_newreno_ctl = {
  786         .fast_retransmit = tcp_newreno_fast_retransmit,
  787         .slow_retransmit = tcp_reno_slow_retransmit,
  788         .fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
  789         .newack = tcp_newreno_newack,
  790         .cong_exp = tcp_reno_congestion_exp,
  791 };
  792 
  793 /*
  794  * CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
  795  */
  796 
  797 /* Cubic prototypes */
  798 static void     tcp_cubic_update_ctime(struct tcpcb *tp);
  799 static uint32_t tcp_cubic_diff_ctime(struct tcpcb *);
  800 static uint32_t tcp_cubic_cbrt(uint32_t);
  801 static ulong    tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);
  802 
  803 /* Cubic TIME functions - XXX I don't like using timevals and microuptime */
  804 /*
  805  * Set congestion timer to now
  806  */
  807 static void
  808 tcp_cubic_update_ctime(struct tcpcb *tp)
  809 {
  810         struct timeval now_timeval;
  811 
  812         getmicrouptime(&now_timeval);
  813         tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
  814             now_timeval.tv_usec / 1000;
  815 }
  816 
  817 /*
  818  * miliseconds from last congestion
  819  */
  820 static uint32_t
  821 tcp_cubic_diff_ctime(struct tcpcb *tp)
  822 {
  823         struct timeval now_timeval;
  824 
  825         getmicrouptime(&now_timeval);
  826         return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
  827             tp->snd_cubic_ctime;
  828 }
  829 
  830 /*
  831  * Approximate cubic root
  832  */
  833 #define CBRT_ROUNDS 30
  834 static uint32_t
  835 tcp_cubic_cbrt(uint32_t v)
  836 {
  837         int i, rounds = CBRT_ROUNDS;
  838         uint64_t x = v / 3;
  839 
  840         /* We fail to calculate correct for small numbers */
  841         if (v == 0)
  842                 return 0;
  843         else if (v < 4)
  844                 return 1;
  845 
  846         /*
  847          * largest x that 2*x^3+3*x fits 64bit
  848          * Avoid overflow for a time cost
  849          */
  850         if (x > 2097151)
  851                 rounds += 10;
  852 
  853         for (i = 0; i < rounds; i++)
  854                 if (rounds == CBRT_ROUNDS)
  855                         x = (v + 2 * x * x * x) / (3 * x * x);
  856                 else
  857                         /* Avoid overflow */
  858                         x = v / (3 * x * x) + 2 * x / 3;
  859 
  860         return (uint32_t)x;
  861 }
  862 
  863 /* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
  864 static ulong
  865 tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
  866 {
  867         uint32_t K;
  868         long tK3;
  869 
  870         /* Section 3.1 Eq. 2 */
  871         K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
  872             CUBIC_CB / CUBIC_CA);
  873         /*  (t-K)^3 - not clear why is the measure unit mattering */
  874         tK3 = (long)(ms_elapsed + rtt) - (long)K;
  875         tK3 = tK3 * tK3 * tK3;
  876 
  877         return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
  878 }
  879 
  880 static void
  881 tcp_cubic_congestion_exp(struct tcpcb *tp)
  882 {
  883 
  884         /*
  885          * Congestion - Set WMax and shrink cwnd
  886          */
  887         tcp_cubic_update_ctime(tp);
  888 
  889         /* Section 3.6 - Fast Convergence */
  890         if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
  891                 tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
  892                 tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
  893                     tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
  894         } else {
  895                 tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
  896                 tp->snd_cubic_wmax = tp->snd_cwnd;
  897         }
  898 
  899         tp->snd_cubic_wmax = uimax(tp->t_segsz, tp->snd_cubic_wmax);
  900 
  901         /* Shrink CWND */
  902         tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
  903 }
  904 
  905 static int
  906 tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
  907 {
  908 
  909         if (SEQ_LT(th->th_ack, tp->snd_high)) {
  910                 /* See newreno */
  911                 tp->t_dupacks = 0;
  912                 return 1;
  913         }
  914 
  915         /*
  916          * mark WMax
  917          */
  918         tcp_cubic_congestion_exp(tp);
  919 
  920         /* Do fast retransmit */
  921         return tcp_reno_do_fast_retransmit(tp, th);
  922 }
  923 
  924 static void
  925 tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
  926 {
  927         uint32_t ms_elapsed, rtt;
  928         u_long w_tcp;
  929 
  930         /* Congestion avoidance and not in fast recovery and usable rtt */
  931         if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
  932             /*
  933              * t_srtt is 1/32 units of slow ticks
  934              * converting it in ms would be equal to
  935              * (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
  936              */
  937             (rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
  938                 ms_elapsed = tcp_cubic_diff_ctime(tp);
  939 
  940                 /* Compute W_tcp(t) */
  941                 w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
  942                     ms_elapsed / rtt / 3;
  943 
  944                 if (tp->snd_cwnd > w_tcp) {
  945                         /* Not in TCP friendly mode */
  946                         tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
  947                             tp->snd_cwnd) / tp->snd_cwnd;
  948                 } else {
  949                         /* friendly TCP mode */
  950                         tp->snd_cwnd = w_tcp;
  951                 }
  952 
  953                 /* Make sure we are within limits */
  954                 tp->snd_cwnd = uimax(tp->snd_cwnd, tp->t_segsz);
  955                 tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
  956         } else {
  957                 /* Use New Reno */
  958                 tcp_newreno_newack(tp, th);
  959         }
  960 }
  961 
  962 static void
  963 tcp_cubic_slow_retransmit(struct tcpcb *tp)
  964 {
  965 
  966         /* Timeout - Mark new congestion */
  967         tcp_cubic_congestion_exp(tp);
  968 
  969         /* Loss Window MUST be one segment. */
  970         tp->snd_cwnd = tp->t_segsz;
  971         tp->t_partialacks = -1;
  972         tp->t_dupacks = 0;
  973         tp->t_bytes_acked = 0;
  974 
  975         if (TCP_ECN_ALLOWED(tp))
  976                 tp->t_flags |= TF_ECN_SND_CWR;
  977 }
  978 
  979 const struct tcp_congctl tcp_cubic_ctl = {
  980         .fast_retransmit = tcp_cubic_fast_retransmit,
  981         .slow_retransmit = tcp_cubic_slow_retransmit,
  982         .fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
  983         .newack = tcp_cubic_newack,
  984         .cong_exp = tcp_cubic_congestion_exp,
  985 };

Cache object: cc8d7ccf60642ddabfb5b82cea4e28de


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.