/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_timer.c	8.2 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#include <net/if.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>
#include <net/netisr.h>

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/in_rss.h>
#include <netinet/in_systm.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>

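/*
 * The timer knobs below are stored in ticks inside the kernel, but each one
 * is exported in milliseconds through the sysctl_msec_to_ticks() handler,
 * e.g. (illustrative value, the ms-to-ticks conversion is done by the
 * handler):
 *
 *	sysctl net.inet.tcp.keepidle=7200000	# two hours, in milliseconds
 */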
int tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I",
    "minimum persistence interval");

int tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I",
    "maximum persistence interval");

int tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I",
    "time to establish connection");

int tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I",
    "time before keepalive probes begin");

int tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I",
    "time between keepalive probes");

int tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

VNET_DEFINE(int, tcp_msl);
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_VNET,
    &VNET_NAME(tcp_msl), 0, sysctl_msec_to_ticks, "I",
    "Maximum segment lifetime");

int tcp_rexmit_initial;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_initial,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_initial, 0, sysctl_msec_to_ticks, "I",
    "Initial Retransmission Timeout");

int tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

VNET_DEFINE(int, tcp_always_keepalive) = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_VNET|CTLFLAG_RW,
    &VNET_NAME(tcp_always_keepalive), 0,
    "Assume SO_KEEPALIVE on all TCP connections");

int tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I",
    "FIN-WAIT2 timeout");

int tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

/* max idle probes */
int tcp_maxpersistidle;

int tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

int tcp_maxunacktime = TCPTV_MAXUNACKTIME;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I",
    "Maximum time (in ms) that a session can linger without making progress");

VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

#ifdef INET
VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

#ifdef RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers, 0, "run tcp timers on all cpus");

/*
 * Map the given inp to a CPU id.
 *
 * This queries RSS if it's compiled in, else it defaults to the current
 * CPU ID.
 */
inline int
inp_to_cpuid(struct inpcb *inp)
{
        u_int cpuid;

        if (per_cpu_timers) {
#ifdef RSS
                cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
                if (cpuid == NETISR_CPUID_NONE)
                        return (curcpu);	/* XXX */
                else
                        return (cpuid);
#endif
                /*
                 * We don't have a flowid -> cpuid mapping, so cheat and
                 * just map unknown cpuids to curcpu.  Not the best, but
                 * apparently better than defaulting to swi 0.
                 */
                cpuid = inp->inp_flowid % (mp_maxid + 1);
                if (!CPU_ABSENT(cpuid))
                        return (cpuid);
                return (curcpu);
        } else {
                return (0);
        }
}

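/*
 * Retransmission timer backoff: the base interval is multiplied by
 * tcp_backoff[t_rxtshift], an exponential series capped at 512.
 */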
int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */

/*
 * TCP timer processing.
 *
 * Each connection has 5 timers associated with it, which can be scheduled
 * simultaneously.  They are all serviced by a single callout, which runs
 * tcp_timer_enter().  This function executes the next due timer via the
 * tcp_timersw[] vector.  Each timer is supposed to return 'true' unless the
 * connection was destroyed; if it returns 'true', tcp_timer_enter() will
 * schedule the callout for the next pending timer.
 */

typedef bool tcp_timer_t(struct tcpcb *);
static tcp_timer_t tcp_timer_delack;
static tcp_timer_t tcp_timer_2msl;
static tcp_timer_t tcp_timer_keep;
static tcp_timer_t tcp_timer_persist;
static tcp_timer_t tcp_timer_rexmt;

static tcp_timer_t * const tcp_timersw[TT_N] = {
        [TT_DELACK] = tcp_timer_delack,
        [TT_REXMT] = tcp_timer_rexmt,
        [TT_PERSIST] = tcp_timer_persist,
        [TT_KEEP] = tcp_timer_keep,
        [TT_2MSL] = tcp_timer_2msl,
};

/*
 * tcp_output_locked() is a timer-specific variation of a call to tcp_output()
 * (see tcp_var.h for the rest).  It handles a drop request from advanced
 * stacks, but keeps the tcpcb locked unless tcp_drop() destroyed it.
 * Returns true if the tcpcb is still valid and locked.
 */
static inline bool
tcp_output_locked(struct tcpcb *tp)
{
        int rv;

        INP_WLOCK_ASSERT(tptoinpcb(tp));

        if ((rv = tp->t_fb->tfb_tcp_output(tp)) < 0) {
                KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
                    ("TCP stack %s requested tcp_drop(%p)",
                    tp->t_fb->tfb_tcp_block_name, tp));
                /* The stack returns a negative errno to request a drop. */
                tp = tcp_drop(tp, -rv);
        }

        return (tp != NULL);
}

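/*
 * Delayed ACK timer: force an ACK to be sent immediately via tcp_output().
 */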
static bool
tcp_timer_delack(struct tcpcb *tp)
{
        struct epoch_tracker et;
#if defined(INVARIANTS) || defined(VIMAGE)
        struct inpcb *inp = tptoinpcb(tp);
#endif
        bool rv;

        INP_WLOCK_ASSERT(inp);

        CURVNET_SET(inp->inp_vnet);
        tp->t_flags |= TF_ACKNOW;
        TCPSTAT_INC(tcps_delack);
        NET_EPOCH_ENTER(et);
        rv = tcp_output_locked(tp);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (rv);
}

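/*
 * 2MSL timer: handles the TIME_WAIT state timeout and reaps FIN_WAIT_2
 * connections that have been idle, or whose receive side was shut down,
 * for too long.
 */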
static bool
tcp_timer_2msl(struct tcpcb *tp)
{
        struct inpcb *inp = tptoinpcb(tp);
        bool close = false;

        INP_WLOCK_ASSERT(inp);

        TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
        CURVNET_SET(inp->inp_vnet);
        tcp_log_end_status(tp, TCP_EI_STATUS_2MSL);
        tcp_free_sackholes(tp);
        /*
         * The 2 MSL shutdown timeout went off.  If we're closed but still
         * waiting for the peer to close, and the connection has been idle
         * too long, delete the connection control block.  Otherwise, check
         * again in a bit.
         *
         * If fast recycling of FIN_WAIT_2 is enabled, we are in FIN_WAIT_2
         * and the receiver has closed, there's no point in hanging onto the
         * FIN_WAIT_2 socket; just close it, ignoring the fact that there
         * were recent incoming segments.
         *
         * XXXGL: check if inp_socket shall always be !NULL here?
         */
        if (tp->t_state == TCPS_TIME_WAIT) {
                close = true;
        } else if (tp->t_state == TCPS_FIN_WAIT_2 &&
            tcp_fast_finwait2_recycle && inp->inp_socket &&
            (inp->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
                TCPSTAT_INC(tcps_finwait2_drops);
                close = true;
        } else {
                if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp))
                        tcp_timer_activate(tp, TT_2MSL, TP_KEEPINTVL(tp));
                else
                        close = true;
        }
        if (close) {
                struct epoch_tracker et;

                NET_EPOCH_ENTER(et);
                tp = tcp_close(tp);
                NET_EPOCH_EXIT(et);
        }
        CURVNET_RESTORE();

        return (tp != NULL);
}

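/*
 * Keepalive timer: send a keepalive probe once the connection has been idle
 * long enough, or drop the connection when the probe limit is exceeded.
 */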
static bool
tcp_timer_keep(struct tcpcb *tp)
{
        struct epoch_tracker et;
        struct inpcb *inp = tptoinpcb(tp);
        struct tcptemp *t_template;

        INP_WLOCK_ASSERT(inp);

        TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
        CURVNET_SET(inp->inp_vnet);
        /*
         * Because we don't regularly reset the keepalive callout in
         * the ESTABLISHED state, it may be that we don't actually need
         * to send a keepalive yet.  If that occurs, schedule another
         * call for the next time the keepalive timer might expire.
         */
        if (TCPS_HAVEESTABLISHED(tp->t_state)) {
                u_int idletime;

                idletime = ticks - tp->t_rcvtime;
                if (idletime < TP_KEEPIDLE(tp)) {
                        tcp_timer_activate(tp, TT_KEEP,
                            TP_KEEPIDLE(tp) - idletime);
                        CURVNET_RESTORE();
                        return (true);
                }
        }

        /*
         * Keep-alive timer went off; send something
         * or drop connection if idle for too long.
         */
        TCPSTAT_INC(tcps_keeptimeo);
        if (tp->t_state < TCPS_ESTABLISHED)
                goto dropit;
        if ((V_tcp_always_keepalive ||
            inp->inp_socket->so_options & SO_KEEPALIVE) &&
            tp->t_state <= TCPS_CLOSING) {
                if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
                        goto dropit;
                /*
                 * Send a packet designed to force a response
                 * if the peer is up and reachable:
                 * either an ACK if the connection is still alive,
                 * or an RST if the peer has closed the connection
                 * due to timeout or reboot.
                 * Using sequence number tp->snd_una-1
                 * causes the transmitted zero-length segment
                 * to lie outside the receive window;
                 * by the protocol spec, this requires the
                 * correspondent TCP to respond.
                 */
                TCPSTAT_INC(tcps_keepprobe);
                t_template = tcpip_maketemplate(inp);
                if (t_template) {
                        NET_EPOCH_ENTER(et);
                        tcp_respond(tp, t_template->tt_ipgen,
                            &t_template->tt_t, (struct mbuf *)NULL,
                            tp->rcv_nxt, tp->snd_una - 1, 0);
                        NET_EPOCH_EXIT(et);
                        free(t_template, M_TEMP);
                }
                tcp_timer_activate(tp, TT_KEEP, TP_KEEPINTVL(tp));
        } else
                tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));

        CURVNET_RESTORE();
        return (true);

dropit:
        TCPSTAT_INC(tcps_keepdrops);
        NET_EPOCH_ENTER(et);
        tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX);
        tp = tcp_drop(tp, ETIMEDOUT);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (tp != NULL);
}

/*
 * Has this session exceeded the maximum time without seeing a substantive
 * acknowledgement?  If so, return true; otherwise false.
 */
static bool
tcp_maxunacktime_check(struct tcpcb *tp)
{

        /* Are we tracking this timer for this session? */
        if (TP_MAXUNACKTIME(tp) == 0)
                return false;

        /* Do we have a current measurement? */
        if (tp->t_acktime == 0)
                return false;

        /* Are we within the acceptable range? */
        if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks))
                return false;

        /* We exceeded the timer. */
        TCPSTAT_INC(tcps_progdrops);
        return true;
}

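/*
 * Persist timer: the peer is advertising a zero window.  Force out a window
 * probe, or drop the connection if the peer has been unresponsive for too
 * long or the session has stopped making progress.
 */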
static bool
tcp_timer_persist(struct tcpcb *tp)
{
        struct epoch_tracker et;
#if defined(INVARIANTS) || defined(VIMAGE)
        struct inpcb *inp = tptoinpcb(tp);
#endif
        bool progdrop, rv;

        INP_WLOCK_ASSERT(inp);

        TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
        CURVNET_SET(inp->inp_vnet);
        /*
         * Persistence timer into zero window.
         * Force a byte to be output, if possible.
         */
        TCPSTAT_INC(tcps_persisttimeo);
        /*
         * Hack: if the peer is dead/unreachable, we do not
         * time out if the window is closed.  After a full
         * backoff, drop the connection if the idle time
         * (no responses to probes) reaches the maximum
         * backoff that we would use if retransmitting.
         * Also, drop the connection if we haven't been making
         * progress.
         */
        progdrop = tcp_maxunacktime_check(tp);
        if (progdrop || (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
            (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
            ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) {
                if (!progdrop)
                        TCPSTAT_INC(tcps_persistdrop);
                tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
                goto dropit;
        }
        /*
         * If the user has closed the socket then drop a persisting
         * connection after a much reduced timeout.
         */
        if (tp->t_state > TCPS_CLOSE_WAIT &&
            (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
                TCPSTAT_INC(tcps_persistdrop);
                tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX);
                goto dropit;
        }
        tcp_setpersist(tp);
        tp->t_flags |= TF_FORCEDATA;
        NET_EPOCH_ENTER(et);
        if ((rv = tcp_output_locked(tp)))
                tp->t_flags &= ~TF_FORCEDATA;
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (rv);

dropit:
        NET_EPOCH_ENTER(et);
        tp = tcp_drop(tp, ETIMEDOUT);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (tp != NULL);
}

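/*
 * Retransmission timer: back off the retransmit interval, run Path MTU
 * blackhole detection where enabled, and retransmit; drop the connection
 * once the retransmission or progress limits are exceeded.
 */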
static bool
tcp_timer_rexmt(struct tcpcb *tp)
{
        struct epoch_tracker et;
        struct inpcb *inp = tptoinpcb(tp);
        int rexmt;
        bool isipv6, rv;

        INP_WLOCK_ASSERT(inp);

        TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
        CURVNET_SET(inp->inp_vnet);
        tcp_free_sackholes(tp);
        TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
        if (tp->t_fb->tfb_tcp_rexmit_tmr) {
                /* The stack has a timer action too. */
                (*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
        }
        /*
         * Retransmission timer went off.  Message has not
         * been acked within retransmit interval.  Back off
         * to a longer retransmit interval and retransmit one segment.
         *
         * If we've either exceeded the maximum number of retransmissions,
         * or we've gone long enough without making progress, then drop
         * the session.
         */
        if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || tcp_maxunacktime_check(tp)) {
                if (tp->t_rxtshift > TCP_MAXRXTSHIFT)
                        TCPSTAT_INC(tcps_timeoutdrop);
                tp->t_rxtshift = TCP_MAXRXTSHIFT;
                tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
                NET_EPOCH_ENTER(et);
                tp = tcp_drop(tp, ETIMEDOUT);
                NET_EPOCH_EXIT(et);
                CURVNET_RESTORE();

                return (tp != NULL);
        }
        if (tp->t_state == TCPS_SYN_SENT) {
                /*
                 * If the SYN was retransmitted, indicate CWND to be
                 * limited to 1 segment in cc_conn_init().
                 */
                tp->snd_cwnd = 1;
        } else if (tp->t_rxtshift == 1) {
                /*
                 * first retransmit; record ssthresh and cwnd so they can
                 * be recovered if this turns out to be a "bad" retransmit.
                 * A retransmit is considered "bad" if an ACK for this
                 * segment is received within RTT/2 interval; the assumption
                 * here is that the ACK was already in flight.  See
                 * "On Estimating End-to-End Network Path Properties" by
                 * Allman and Paxson for more details.
                 */
                tp->snd_cwnd_prev = tp->snd_cwnd;
                tp->snd_ssthresh_prev = tp->snd_ssthresh;
                tp->snd_recover_prev = tp->snd_recover;
                if (IN_FASTRECOVERY(tp->t_flags))
                        tp->t_flags |= TF_WASFRECOVERY;
                else
                        tp->t_flags &= ~TF_WASFRECOVERY;
                if (IN_CONGRECOVERY(tp->t_flags))
                        tp->t_flags |= TF_WASCRECOVERY;
                else
                        tp->t_flags &= ~TF_WASCRECOVERY;
                if ((tp->t_flags & TF_RCVD_TSTMP) == 0)
                        tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
                /*
                 * In the event that we've negotiated timestamps, badrxtwin
                 * will be set to the value that we set the retransmitted
                 * packet's to_tsval to by tcp_output().
                 */
                tp->t_flags |= TF_PREVVALID;
        } else
                tp->t_flags &= ~TF_PREVVALID;
        TCPSTAT_INC(tcps_rexmttimeo);
        if ((tp->t_state == TCPS_SYN_SENT) ||
            (tp->t_state == TCPS_SYN_RECEIVED))
                rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
        else
                rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
        TCPT_RANGESET(tp->t_rxtcur, rexmt,
            tp->t_rttmin, TCPTV_REXMTMAX);

        /*
         * We enter the path for PLMTUD if the connection is in the
         * ESTABLISHED or FIN_WAIT_1 state.  The reason for including
         * FIN_WAIT_1 is that if the amount of data we send is very small,
         * we could send it in a couple of packets and proceed straight to
         * FIN, in which case we never observe the ESTABLISHED state.
         */
#ifdef INET6
        isipv6 = (inp->inp_vflag & INP_IPV6) ? true : false;
#else
        isipv6 = false;
#endif
        if (((V_tcp_pmtud_blackhole_detect == 1) ||
            (V_tcp_pmtud_blackhole_detect == 2 && !isipv6) ||
            (V_tcp_pmtud_blackhole_detect == 3 && isipv6)) &&
            ((tp->t_state == TCPS_ESTABLISHED) ||
            (tp->t_state == TCPS_FIN_WAIT_1))) {
                if (tp->t_rxtshift == 1) {
                        /*
                         * We enter blackhole detection after the first
                         * unsuccessful timer-based retransmission.  We then
                         * reduce the MSS up to two times, giving each
                         * candidate value two retransmission attempts, but
                         * only if the candidate actually reduces the MSS.
                         */
                        tp->t_blackhole_enter = 2;
                        tp->t_blackhole_exit = tp->t_blackhole_enter;
                        if (isipv6) {
#ifdef INET6
                                if (tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss)
                                        tp->t_blackhole_exit += 2;
                                if (tp->t_maxseg > V_tcp_v6mssdflt &&
                                    V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt)
                                        tp->t_blackhole_exit += 2;
#endif
                        } else {
#ifdef INET
                                if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss)
                                        tp->t_blackhole_exit += 2;
                                if (tp->t_maxseg > V_tcp_mssdflt &&
                                    V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt)
                                        tp->t_blackhole_exit += 2;
#endif
                        }
                }
                if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
                    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
                    (tp->t_rxtshift >= tp->t_blackhole_enter &&
                    tp->t_rxtshift < tp->t_blackhole_exit &&
                    (tp->t_rxtshift - tp->t_blackhole_enter) % 2 == 0)) {
                        /*
                         * Enter Path MTU Black-hole Detection mechanism:
                         * - Disable Path MTU Discovery (IP "DF" bit).
                         * - Reduce MTU to lower value than what we
                         *   negotiated with peer.
                         */
                        if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) == 0) {
                                /* Record that we may have found a black hole. */
                                tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;
                                /* Keep track of previous MSS. */
                                tp->t_pmtud_saved_maxseg = tp->t_maxseg;
                        }

                        /*
                         * Reduce the MSS to blackhole value or to the default
                         * in an attempt to retransmit.
                         */
#ifdef INET6
                        if (isipv6 &&
                            tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss &&
                            V_tcp_v6pmtud_blackhole_mss > V_tcp_v6mssdflt) {
                                /* Use the sysctl tuneable blackhole MSS. */
                                tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated);
                        } else if (isipv6) {
                                /* Use the default MSS. */
                                tp->t_maxseg = V_tcp_v6mssdflt;
                                /*
                                 * Disable Path MTU Discovery when we switch to
                                 * minmss.
                                 */
                                tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
                        }
#endif
#if defined(INET6) && defined(INET)
                        else
#endif
#ifdef INET
                        if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss &&
                            V_tcp_pmtud_blackhole_mss > V_tcp_mssdflt) {
                                /* Use the sysctl tuneable blackhole MSS. */
                                tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated);
                        } else {
                                /* Use the default MSS. */
                                tp->t_maxseg = V_tcp_mssdflt;
                                /*
                                 * Disable Path MTU Discovery when we switch to
                                 * minmss.
                                 */
                                tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
                                TCPSTAT_INC(tcps_pmtud_blackhole_activated_min_mss);
                        }
#endif
                        /*
                         * Reset the slow-start flight size
                         * as it may depend on the new MSS.
                         */
                        if (CC_ALGO(tp)->conn_init != NULL)
                                CC_ALGO(tp)->conn_init(&tp->t_ccv);
                } else {
                        /*
                         * If further retransmissions are still unsuccessful
                         * with a lowered MSS, this may not be a blackhole
                         * after all, so restore the previous MSS and the
                         * blackhole detection flags.
                         */
                        if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
                            (tp->t_rxtshift >= tp->t_blackhole_exit)) {
                                tp->t_flags2 |= TF2_PLPMTU_PMTUD;
                                tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
                                tp->t_maxseg = tp->t_pmtud_saved_maxseg;
                                TCPSTAT_INC(tcps_pmtud_blackhole_failed);
                                /*
                                 * Reset the slow-start flight size as it
                                 * may depend on the new MSS.
                                 */
                                if (CC_ALGO(tp)->conn_init != NULL)
                                        CC_ALGO(tp)->conn_init(&tp->t_ccv);
                        }
                }
        }

        /*
         * Disable RFC1323 and SACK if we haven't got any response to
         * our third SYN to work-around some broken terminal servers
         * (most of which have hopefully been retired) that have bad VJ
         * header compression code which trashes TCP segments containing
         * unknown-to-them TCP options.
         */
        if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
            (tp->t_rxtshift == 3))
                tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
        /*
         * If we backed off this far, notify the L3 protocol that we're having
         * connection problems.
         */
        if (tp->t_rxtshift > TCP_RTT_INVALIDATE) {
#ifdef INET6
                if ((inp->inp_vflag & INP_IPV6) != 0)
                        in6_losing(inp);
                else
#endif
                        in_losing(inp);
        }
        tp->snd_nxt = tp->snd_una;
        tp->snd_recover = tp->snd_max;
        /*
         * Force a segment to be sent.
         */
        tp->t_flags |= TF_ACKNOW;
        /*
         * If timing a segment in this window, stop the timer.
         */
        tp->t_rtttime = 0;

        cc_cong_signal(tp, NULL, CC_RTO);
        NET_EPOCH_ENTER(et);
        rv = tcp_output_locked(tp);
        NET_EPOCH_EXIT(et);
        CURVNET_RESTORE();

        return (rv);
}

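/*
 * Return the timer that is due to fire first, i.e. the one with the smallest
 * absolute time in tp->t_timers[], or TT_N if no timer is scheduled.  If
 * 'precision' is not NULL, store the acceptable callout precision there.
 */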
static inline tt_which
tcp_timer_next(struct tcpcb *tp, sbintime_t *precision)
{
        tt_which i, rv;
        sbintime_t after, before;

        for (i = 0, rv = TT_N, after = before = SBT_MAX; i < TT_N; i++) {
                if (tp->t_timers[i] < after) {
                        after = tp->t_timers[i];
                        rv = i;
                }
                before = MIN(before, tp->t_timers[i] + tp->t_precisions[i]);
        }
        if (precision != NULL)
                *precision = before - after;

        return (rv);
}

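/*
 * Callout handler servicing all TCP timers of a connection.  Run the timer
 * that is due and, if the tcpcb survived, reschedule the callout for the
 * next pending timer.
 */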
static void
tcp_timer_enter(void *xtp)
{
        struct tcpcb *tp = xtp;
        struct inpcb *inp = tptoinpcb(tp);
        sbintime_t precision;
        tt_which which;

        INP_WLOCK_ASSERT(inp);
        MPASS((curthread->td_pflags & TDP_INTCPCALLOUT) == 0);

        curthread->td_pflags |= TDP_INTCPCALLOUT;

        which = tcp_timer_next(tp, NULL);
        MPASS(which < TT_N);
        tp->t_timers[which] = SBT_MAX;
        tp->t_precisions[which] = 0;

        if (tcp_timersw[which](tp)) {
                if ((which = tcp_timer_next(tp, &precision)) != TT_N) {
                        callout_reset_sbt_on(&tp->t_callout,
                            tp->t_timers[which], precision, tcp_timer_enter,
                            tp, inp_to_cpuid(inp), C_ABSOLUTE);
                }
                INP_WUNLOCK(inp);
        }

        curthread->td_pflags &= ~TDP_INTCPCALLOUT;
}

/*
 * Activate or stop (delta == 0) a TCP timer.
 */
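/*
 * Illustrative caller pattern (not from this file): the retransmit timer is
 * typically armed for the current RTO, in ticks, with
 *
 *	tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 *
 * and later disarmed by passing a delta of 0.
 */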
void
tcp_timer_activate(struct tcpcb *tp, tt_which which, u_int delta)
{
        struct inpcb *inp = tptoinpcb(tp);
        sbintime_t precision;

#ifdef TCP_OFFLOAD
        if (tp->t_flags & TF_TOE)
                return;
#endif

        INP_WLOCK_ASSERT(inp);

        if (delta > 0)
                callout_when(tick_sbt * delta, 0, C_HARDCLOCK,
                    &tp->t_timers[which], &tp->t_precisions[which]);
        else
                tp->t_timers[which] = SBT_MAX;

        if ((which = tcp_timer_next(tp, &precision)) != TT_N)
                callout_reset_sbt_on(&tp->t_callout, tp->t_timers[which],
                    precision, tcp_timer_enter, tp, inp_to_cpuid(inp),
                    C_ABSOLUTE);
        else
                callout_stop(&tp->t_callout);
}

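/*
 * Return true if the given timer is currently scheduled.
 */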
bool
tcp_timer_active(struct tcpcb *tp, tt_which which)
{

        INP_WLOCK_ASSERT(tptoinpcb(tp));

        return (tp->t_timers[which] != SBT_MAX);
}

/*
 * Stop all timers associated with the tcpcb.
 *
 * Called only on tcpcb destruction.  The tcpcb shall already be dropped from
 * the pcb lookup database, and the socket shall not be losing its last
 * reference.
 *
 * XXXGL: unfortunately our callout(9) is not able to fully stop a locked
 * callout even when only two threads are involved: the callout itself and the
 * thread that does callout_stop().  See where softclock_call_cc() swaps the
 * callwheel lock to the callout lock and then checks cc_exec_cancel().  This
 * is the race window.  If it happens, tcp_timer_enter() won't be executed,
 * however the pcb lock will be acquired and released, hence we can't free the
 * memory.  Until callout(9) is improved, just keep retrying.  In my profiling
 * I've seen such an event happening less than 1 time per hour with
 * 20-30 Gbit/s of traffic.
 */
void
tcp_timer_stop(struct tcpcb *tp)
{
        struct inpcb *inp = tptoinpcb(tp);

        INP_WLOCK_ASSERT(inp);

        if (curthread->td_pflags & TDP_INTCPCALLOUT) {
                int stopped __diagused;

                stopped = callout_stop(&tp->t_callout);
                MPASS(stopped == 0);
        } else while (__predict_false(callout_stop(&tp->t_callout) == 0)) {
                INP_WUNLOCK(inp);
                kern_yield(PRI_UNCHANGED);
                INP_WLOCK(inp);
        }
}