1 /*-
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 4. Neither the name of the University nor the names of its contributors
14 * may be used to endorse or promote products derived from this software
15 * without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 *
29 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
30 */
31
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD: releng/11.0/sys/netinet/tcp_timer.c 300042 2016-05-17 09:53:22Z rrs $");
34
35 #include "opt_inet.h"
36 #include "opt_inet6.h"
37 #include "opt_tcpdebug.h"
38 #include "opt_rss.h"
39
40 #include <sys/param.h>
41 #include <sys/kernel.h>
42 #include <sys/lock.h>
43 #include <sys/mbuf.h>
44 #include <sys/mutex.h>
45 #include <sys/protosw.h>
46 #include <sys/smp.h>
47 #include <sys/socket.h>
48 #include <sys/socketvar.h>
49 #include <sys/sysctl.h>
50 #include <sys/systm.h>
51
52 #include <net/if.h>
53 #include <net/route.h>
54 #include <net/rss_config.h>
55 #include <net/vnet.h>
56 #include <net/netisr.h>
57
58 #include <netinet/in.h>
59 #include <netinet/in_kdtrace.h>
60 #include <netinet/in_pcb.h>
61 #include <netinet/in_rss.h>
62 #include <netinet/in_systm.h>
63 #ifdef INET6
64 #include <netinet6/in6_pcb.h>
65 #endif
66 #include <netinet/ip_var.h>
67 #include <netinet/tcp.h>
68 #include <netinet/tcp_fsm.h>
69 #include <netinet/tcp_timer.h>
70 #include <netinet/tcp_var.h>
71 #include <netinet/cc/cc.h>
72 #ifdef INET6
73 #include <netinet6/tcp6_var.h>
74 #endif
75 #include <netinet/tcpip.h>
76 #ifdef TCPDEBUG
77 #include <netinet/tcp_debug.h>
78 #endif
79
/*
 * Tunable timer intervals.  All of the SYSCTL_PROC knobs below are
 * exposed to userland in milliseconds and converted to/from ticks by
 * sysctl_msec_to_ticks(); the kernel-side variables hold ticks.
 */
int	tcp_persmin;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmin, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmin, 0, sysctl_msec_to_ticks, "I", "minimum persistence interval");

int	tcp_persmax;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, persmax, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_persmax, 0, sysctl_msec_to_ticks, "I", "maximum persistence interval");

int	tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");

int	tcp_keepidle;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "time before keepalive probes begin");

int	tcp_keepintvl;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "time between keepalive probes");

int	tcp_delacktime;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_DELACKTIME, delacktime, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_delacktime, 0, sysctl_msec_to_ticks, "I",
    "Time before a delayed ACK is sent");

int	tcp_msl;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");

int	tcp_rexmit_min;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_min, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_min, 0, sysctl_msec_to_ticks, "I",
    "Minimum Retransmission Timeout");

int	tcp_rexmit_slop;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmit_slop, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_rexmit_slop, 0, sysctl_msec_to_ticks, "I",
    "Retransmission Timer Slop");

/* When set, keepalives are sent even if SO_KEEPALIVE was not requested. */
static int	always_keepalive = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, always_keepalive, CTLFLAG_RW,
    &always_keepalive , 0, "Assume SO_KEEPALIVE on all TCP connections");

int    tcp_fast_finwait2_recycle = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, fast_finwait2_recycle, CTLFLAG_RW,
    &tcp_fast_finwait2_recycle, 0,
    "Recycle closed FIN_WAIT_2 connections faster");

int    tcp_finwait2_timeout;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, finwait2_timeout, CTLTYPE_INT|CTLFLAG_RW,
    &tcp_finwait2_timeout, 0, sysctl_msec_to_ticks, "I", "FIN-WAIT2 timeout");

int	tcp_keepcnt = TCPTV_KEEPCNT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt, CTLFLAG_RW, &tcp_keepcnt, 0,
    "Number of keepalive probes to send");

	/* max idle probes */
int	tcp_maxpersistidle;

static int	tcp_rexmit_drop_options = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, rexmit_drop_options, CTLFLAG_RW,
    &tcp_rexmit_drop_options, 0,
    "Drop TCP options from 3rd and later retransmitted SYN");

/*
 * Path MTU blackhole detection (RFC 4821-style probing) state and
 * counters.  All are per-VNET.
 */
static VNET_DEFINE(int, tcp_pmtud_blackhole_detect);
#define	V_tcp_pmtud_blackhole_detect	VNET(tcp_pmtud_blackhole_detect)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_detect), 0,
    "Path MTU Discovery Black Hole Detection Enabled");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated);
#define	V_tcp_pmtud_blackhole_activated \
    VNET(tcp_pmtud_blackhole_activated)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count");

static VNET_DEFINE(int, tcp_pmtud_blackhole_activated_min_mss);
#define	V_tcp_pmtud_blackhole_activated_min_mss \
    VNET(tcp_pmtud_blackhole_activated_min_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_activated_min_mss,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_activated_min_mss), 0,
    "Path MTU Discovery Black Hole Detection, Activation Count at min MSS");

static VNET_DEFINE(int, tcp_pmtud_blackhole_failed);
#define	V_tcp_pmtud_blackhole_failed	VNET(tcp_pmtud_blackhole_failed)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_failed,
    CTLFLAG_RD|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_failed), 0,
    "Path MTU Discovery Black Hole Detection, Failure Count");

#ifdef INET
/* MSS to clamp to when an IPv4 PMTU blackhole is suspected. */
static VNET_DEFINE(int, tcp_pmtud_blackhole_mss) = 1200;
#define	V_tcp_pmtud_blackhole_mss	VNET(tcp_pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_pmtud_blackhole_mss), 0,
    "Path MTU Discovery Black Hole Detection lowered MSS");
#endif

#ifdef INET6
/* MSS to clamp to when an IPv6 PMTU blackhole is suspected. */
static VNET_DEFINE(int, tcp_v6pmtud_blackhole_mss) = 1220;
#define	V_tcp_v6pmtud_blackhole_mss	VNET(tcp_v6pmtud_blackhole_mss)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, v6pmtud_blackhole_mss,
    CTLFLAG_RW|CTLFLAG_VNET,
    &VNET_NAME(tcp_v6pmtud_blackhole_mss), 0,
    "Path MTU Discovery IPv6 Black Hole Detection lowered MSS");
#endif

/* Run timer callouts on the CPU the flow hashes to; default on with RSS. */
#ifdef	RSS
static int	per_cpu_timers = 1;
#else
static int	per_cpu_timers = 0;
#endif
SYSCTL_INT(_net_inet_tcp, OID_AUTO, per_cpu_timers, CTLFLAG_RW,
    &per_cpu_timers , 0, "run tcp timers on all cpus");

/* Dead code: superseded by inp_to_cpuid() below. */
#if 0
#define	INP_CPU(inp)	(per_cpu_timers ? (!CPU_ABSENT(((inp)->inp_flowid % (mp_maxid+1))) ? \
		((inp)->inp_flowid % (mp_maxid+1)) : curcpu) : 0)
#endif
204
205 /*
206 * Map the given inp to a CPU id.
207 *
208 * This queries RSS if it's compiled in, else it defaults to the current
209 * CPU ID.
210 */
211 static inline int
212 inp_to_cpuid(struct inpcb *inp)
213 {
214 u_int cpuid;
215
216 #ifdef RSS
217 if (per_cpu_timers) {
218 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
219 if (cpuid == NETISR_CPUID_NONE)
220 return (curcpu); /* XXX */
221 else
222 return (cpuid);
223 }
224 #else
225 /* Legacy, pre-RSS behaviour */
226 if (per_cpu_timers) {
227 /*
228 * We don't have a flowid -> cpuid mapping, so cheat and
229 * just map unknown cpuids to curcpu. Not the best, but
230 * apparently better than defaulting to swi 0.
231 */
232 cpuid = inp->inp_flowid % (mp_maxid + 1);
233 if (! CPU_ABSENT(cpuid))
234 return (cpuid);
235 return (curcpu);
236 }
237 #endif
238 /* Default for RSS and non-RSS - cpuid 0 */
239 else {
240 return (0);
241 }
242 }
243
/*
 * Tcp protocol timeout routine called every 500 ms.
 * Updates timestamps used for TCP
 * causes finite state machine actions if timers expire.
 *
 * The only remaining slow-timeout work is scanning the TIME_WAIT list of
 * every vnet via tcp_tw_2msl_scan(); all other TCP timers are per-connection
 * callouts handled below.
 */
void
tcp_slowtimo(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		/* Reap expired TIME_WAIT entries for this vnet. */
		(void) tcp_tw_2msl_scan(0);
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
}
262
/*
 * Exponential backoff multiplier tables, indexed by t_rxtshift
 * (0..TCP_MAXRXTSHIFT).  The SYN table backs off more gently early on.
 */
int	tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
    { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 };

static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
270
271 /*
272 * TCP timer processing.
273 */
274
/*
 * Delayed-ACK timer expired: mark the connection as owing an immediate
 * ACK (TF_ACKNOW) and push output.  Callout function; xtp is the tcpcb.
 */
void
tcp_timer_delack(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);

	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/*
	 * If the callout was rescheduled (pending) or stopped (inactive)
	 * while we waited for the inp lock, this invocation is stale.
	 */
	if (callout_pending(&tp->t_timers->tt_delack) ||
	    !callout_active(&tp->t_timers->tt_delack)) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_delack);
	/* Connection may have been dropped while the callout was queued. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
		("%s: tp %p delack callout should be running", __func__, tp));

	tp->t_flags |= TF_ACKNOW;
	TCPSTAT_INC(tcps_delack);
	(void) tp->t_fb->tfb_tcp_output(tp);
	INP_WUNLOCK(inp);
	CURVNET_RESTORE();
}
308
309 void
310 tcp_timer_2msl(void *xtp)
311 {
312 struct tcpcb *tp = xtp;
313 struct inpcb *inp;
314 CURVNET_SET(tp->t_vnet);
315 #ifdef TCPDEBUG
316 int ostate;
317
318 ostate = tp->t_state;
319 #endif
320 INP_INFO_RLOCK(&V_tcbinfo);
321 inp = tp->t_inpcb;
322 KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
323 INP_WLOCK(inp);
324 tcp_free_sackholes(tp);
325 if (callout_pending(&tp->t_timers->tt_2msl) ||
326 !callout_active(&tp->t_timers->tt_2msl)) {
327 INP_WUNLOCK(tp->t_inpcb);
328 INP_INFO_RUNLOCK(&V_tcbinfo);
329 CURVNET_RESTORE();
330 return;
331 }
332 callout_deactivate(&tp->t_timers->tt_2msl);
333 if ((inp->inp_flags & INP_DROPPED) != 0) {
334 INP_WUNLOCK(inp);
335 INP_INFO_RUNLOCK(&V_tcbinfo);
336 CURVNET_RESTORE();
337 return;
338 }
339 KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
340 ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
341 KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
342 ("%s: tp %p 2msl callout should be running", __func__, tp));
343 /*
344 * 2 MSL timeout in shutdown went off. If we're closed but
345 * still waiting for peer to close and connection has been idle
346 * too long delete connection control block. Otherwise, check
347 * again in a bit.
348 *
349 * If in TIME_WAIT state just ignore as this timeout is handled in
350 * tcp_tw_2msl_scan().
351 *
352 * If fastrecycle of FIN_WAIT_2, in FIN_WAIT_2 and receiver has closed,
353 * there's no point in hanging onto FIN_WAIT_2 socket. Just close it.
354 * Ignore fact that there were recent incoming segments.
355 */
356 if ((inp->inp_flags & INP_TIMEWAIT) != 0) {
357 INP_WUNLOCK(inp);
358 INP_INFO_RUNLOCK(&V_tcbinfo);
359 CURVNET_RESTORE();
360 return;
361 }
362 if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 &&
363 tp->t_inpcb && tp->t_inpcb->inp_socket &&
364 (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) {
365 TCPSTAT_INC(tcps_finwait2_drops);
366 tp = tcp_close(tp);
367 } else {
368 if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
369 if (!callout_reset(&tp->t_timers->tt_2msl,
370 TP_KEEPINTVL(tp), tcp_timer_2msl, tp)) {
371 tp->t_timers->tt_flags &= ~TT_2MSL_RST;
372 }
373 } else
374 tp = tcp_close(tp);
375 }
376
377 #ifdef TCPDEBUG
378 if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
379 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
380 PRU_SLOWTIMO);
381 #endif
382 TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
383
384 if (tp != NULL)
385 INP_WUNLOCK(inp);
386 INP_INFO_RUNLOCK(&V_tcbinfo);
387 CURVNET_RESTORE();
388 }
389
/*
 * Keepalive timer expired: either probe the peer with an out-of-window
 * ACK, re-arm for the next idle check, or drop the connection if it has
 * been unresponsive for the full keepalive budget.  Callout function;
 * xtp is the tcpcb.
 */
void
tcp_timer_keep(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct tcptemp *t_template;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/*
	 * If the callout was rescheduled (pending) or stopped (inactive)
	 * while we waited for the locks, this invocation is stale.
	 */
	if (callout_pending(&tp->t_timers->tt_keep) ||
	    !callout_active(&tp->t_timers->tt_keep)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_keep);
	/* Connection may have been dropped while the callout was queued. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
		("%s: tp %p keep callout should be running", __func__, tp));
	/*
	 * Keep-alive timer went off; send something
	 * or drop connection if idle for too long.
	 */
	TCPSTAT_INC(tcps_keeptimeo);
	/* Before ESTABLISHED this timer doubles as the connection-establish
	 * (keepinit) timeout. */
	if (tp->t_state < TCPS_ESTABLISHED)
		goto dropit;
	if ((always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) &&
	    tp->t_state <= TCPS_CLOSING) {
		/* Idle past keepidle + keepcnt*keepintvl: give up. */
		if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
			goto dropit;
		/*
		 * Send a packet designed to force a response
		 * if the peer is up and reachable:
		 * either an ACK if the connection is still alive,
		 * or an RST if the peer has closed the connection
		 * due to timeout or reboot.
		 * Using sequence number tp->snd_una-1
		 * causes the transmitted zero-length segment
		 * to lie outside the receive window;
		 * by the protocol spec, this requires the
		 * correspondent TCP to respond.
		 */
		TCPSTAT_INC(tcps_keepprobe);
		t_template = tcpip_maketemplate(inp);
		/* Allocation may fail; in that case just re-arm and retry. */
		if (t_template) {
			tcp_respond(tp, t_template->tt_ipgen,
				    &t_template->tt_t, (struct mbuf *)NULL,
				    tp->rcv_nxt, tp->snd_una - 1, 0);
			free(t_template, M_TEMP);
		}
		/* Probe again after keepintvl. */
		if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPINTVL(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
		}
	/* Keepalives disabled or state past CLOSING: just re-check after
	 * a full keepidle interval. */
	} else if (!callout_reset(&tp->t_timers->tt_keep, TP_KEEPIDLE(tp),
		    tcp_timer_keep, tp)) {
			tp->t_timers->tt_flags &= ~TT_KEEP_RST;
	}

#ifdef TCPDEBUG
	if (inp->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
	return;

dropit:
	TCPSTAT_INC(tcps_keepdrops);
	tp = tcp_drop(tp, ETIMEDOUT);

#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	/* tcp_drop() may have freed the tcpcb. */
	if (tp != NULL)
		INP_WUNLOCK(tp->t_inpcb);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
490
/*
 * Persist timer expired: the peer advertised a zero window, so force a
 * window-probe byte out, or drop the connection if the peer has been
 * unresponsive past the retransmit budget.  Callout function; xtp is
 * the tcpcb.
 */
void
tcp_timer_persist(void *xtp)
{
	struct tcpcb *tp = xtp;
	struct inpcb *inp;
	CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif
	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/*
	 * If the callout was rescheduled (pending) or stopped (inactive)
	 * while we waited for the locks, this invocation is stale.
	 */
	if (callout_pending(&tp->t_timers->tt_persist) ||
	    !callout_active(&tp->t_timers->tt_persist)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_persist);
	/* Connection may have been dropped while the callout was queued. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
		("%s: tp %p persist callout should be running", __func__, tp));
	/*
	 * Persistence timer into zero window.
	 * Force a byte to be output, if possible.
	 */
	TCPSTAT_INC(tcps_persisttimeo);
	/*
	 * Hack: if the peer is dead/unreachable, we do not
	 * time out if the window is closed.  After a full
	 * backoff, drop the connection if the idle time
	 * (no responses to probes) reaches the maximum
	 * backoff that we would use if retransmitting.
	 */
	if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
	    (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
	     ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/*
	 * If the user has closed the socket then drop a persisting
	 * connection after a much reduced timeout.
	 */
	if (tp->t_state > TCPS_CLOSE_WAIT &&
	    (ticks - tp->t_rcvtime) >= TCPTV_PERSMAX) {
		TCPSTAT_INC(tcps_persistdrop);
		tp = tcp_drop(tp, ETIMEDOUT);
		goto out;
	}
	/* Re-arm the persist timer, then force out one probe segment. */
	tcp_setpersist(tp);
	tp->t_flags |= TF_FORCEDATA;
	(void) tp->t_fb->tfb_tcp_output(tp);
	tp->t_flags &= ~TF_FORCEDATA;

out:
#ifdef TCPDEBUG
	if (tp != NULL && tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
		tcp_trace(TA_USER, ostate, tp, NULL, NULL, PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	/* tcp_drop() may have freed the tcpcb; inp lock survives either way. */
	if (tp != NULL)
		INP_WUNLOCK(inp);
	INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
569
/*
 * Retransmission timer expired: back off the retransmit interval,
 * optionally run PMTU blackhole probing, signal congestion recovery
 * (CC_RTO) and retransmit from snd_una.  Drops the connection once
 * TCP_MAXRXTSHIFT consecutive timeouts accumulate.  Callout function;
 * xtp is the tcpcb.
 */
void
tcp_timer_rexmt(void * xtp)
{
	struct tcpcb *tp = xtp;
	CURVNET_SET(tp->t_vnet);
	int rexmt;
	/* Tracks whether V_tcbinfo is still read-locked at 'out'. */
	int headlocked;
	struct inpcb *inp;
#ifdef TCPDEBUG
	int ostate;

	ostate = tp->t_state;
#endif

	INP_INFO_RLOCK(&V_tcbinfo);
	inp = tp->t_inpcb;
	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
	INP_WLOCK(inp);
	/*
	 * If the callout was rescheduled (pending) or stopped (inactive)
	 * while we waited for the locks, this invocation is stale.
	 */
	if (callout_pending(&tp->t_timers->tt_rexmt) ||
	    !callout_active(&tp->t_timers->tt_rexmt)) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	callout_deactivate(&tp->t_timers->tt_rexmt);
	/* Connection may have been dropped while the callout was queued. */
	if ((inp->inp_flags & INP_DROPPED) != 0) {
		INP_WUNLOCK(inp);
		INP_INFO_RUNLOCK(&V_tcbinfo);
		CURVNET_RESTORE();
		return;
	}
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
		("%s: tp %p rexmt callout should be running", __func__, tp));
	tcp_free_sackholes(tp);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
		/* The stack has a timer action too. */
		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	}
	/*
	 * Retransmission timer went off.  Message has not
	 * been acked within retransmit interval.  Back off
	 * to a longer retransmit interval and retransmit one segment.
	 */
	if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
		tp->t_rxtshift = TCP_MAXRXTSHIFT;
		TCPSTAT_INC(tcps_timeoutdrop);

		tp = tcp_drop(tp, tp->t_softerror ?
			      tp->t_softerror : ETIMEDOUT);
		headlocked = 1;
		goto out;
	}
	/* The info lock is only needed for the drop path above. */
	INP_INFO_RUNLOCK(&V_tcbinfo);
	headlocked = 0;
	if (tp->t_state == TCPS_SYN_SENT) {
		/*
		 * If the SYN was retransmitted, indicate CWND to be
		 * limited to 1 segment in cc_conn_init().
		 */
		tp->snd_cwnd = 1;
	} else if (tp->t_rxtshift == 1) {
		/*
		 * first retransmit; record ssthresh and cwnd so they can
		 * be recovered if this turns out to be a "bad" retransmit.
		 * A retransmit is considered "bad" if an ACK for this
		 * segment is received within RTT/2 interval; the assumption
		 * here is that the ACK was already in flight.  See
		 * "On Estimating End-to-End Network Path Properties" by
		 * Allman and Paxson for more details.
		 */
		tp->snd_cwnd_prev = tp->snd_cwnd;
		tp->snd_ssthresh_prev = tp->snd_ssthresh;
		tp->snd_recover_prev = tp->snd_recover;
		if (IN_FASTRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASFRECOVERY;
		else
			tp->t_flags &= ~TF_WASFRECOVERY;
		if (IN_CONGRECOVERY(tp->t_flags))
			tp->t_flags |= TF_WASCRECOVERY;
		else
			tp->t_flags &= ~TF_WASCRECOVERY;
		tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
		tp->t_flags |= TF_PREVVALID;
	} else
		tp->t_flags &= ~TF_PREVVALID;
	TCPSTAT_INC(tcps_rexmttimeo);
	/* SYN exchanges back off on their own (gentler) schedule. */
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	TCPT_RANGESET(tp->t_rxtcur, rexmt,
		      tp->t_rttmin, TCPTV_REXMTMAX);

	/*
	 * We enter the path for PLMTUD if connection is established or, if
	 * connection is FIN_WAIT_1 status, reason for the last is that if
	 * amount of data we send is very small, we could send it in couple of
	 * packets and process straight to FIN. In that case we won't catch
	 * ESTABLISHED state.
	 */
	if (V_tcp_pmtud_blackhole_detect && (((tp->t_state == TCPS_ESTABLISHED))
	    || (tp->t_state == TCPS_FIN_WAIT_1))) {
#ifdef INET6
		int isipv6;
#endif

		/*
		 * Idea here is that at each stage of mtu probe (usually, 1448
		 * -> 1188 -> 524) should be given 2 chances to recover before
		 *  further clamping down. 'tp->t_rxtshift % 2 == 0' should
		 *  take care of that.
		 */
		if (((tp->t_flags2 & (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) ==
		    (TF2_PLPMTU_PMTUD|TF2_PLPMTU_MAXSEGSNT)) &&
		    (tp->t_rxtshift >= 2 && tp->t_rxtshift % 2 == 0)) {
			/*
			 * Enter Path MTU Black-hole Detection mechanism:
			 * - Disable Path MTU Discovery (IP "DF" bit).
			 * - Reduce MTU to lower value than what we
			 *   negotiated with peer.
			 */
			/* Record that we may have found a black hole. */
			tp->t_flags2 |= TF2_PLPMTU_BLACKHOLE;

			/* Keep track of previous MSS. */
			tp->t_pmtud_saved_maxseg = tp->t_maxseg;

			/*
			 * Reduce the MSS to blackhole value or to the default
			 * in an attempt to retransmit.
			 */
#ifdef INET6
			isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) ? 1 : 0;
			if (isipv6 &&
			    tp->t_maxseg > V_tcp_v6pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_v6pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else if (isipv6) {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_v6mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
#if defined(INET6) && defined(INET)
			else
#endif
#ifdef INET
			if (tp->t_maxseg > V_tcp_pmtud_blackhole_mss) {
				/* Use the sysctl tuneable blackhole MSS. */
				tp->t_maxseg = V_tcp_pmtud_blackhole_mss;
				V_tcp_pmtud_blackhole_activated++;
			} else {
				/* Use the default MSS. */
				tp->t_maxseg = V_tcp_mssdflt;
				/*
				 * Disable Path MTU Discovery when we switch to
				 * minmss.
				 */
				tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
				V_tcp_pmtud_blackhole_activated_min_mss++;
			}
#endif
			/*
			 * Reset the slow-start flight size
			 * as it may depend on the new MSS.
			 */
			if (CC_ALGO(tp)->conn_init != NULL)
				CC_ALGO(tp)->conn_init(tp->ccv);
		} else {
			/*
			 * If further retransmissions are still unsuccessful
			 * with a lowered MTU, maybe this isn't a blackhole and
			 * we restore the previous MSS and blackhole detection
			 * flags.
			 * The limit '6' is determined by giving each probe
			 * stage (1448, 1188, 524) 2 chances to recover.
			 */
			if ((tp->t_flags2 & TF2_PLPMTU_BLACKHOLE) &&
			    (tp->t_rxtshift > 6)) {
				tp->t_flags2 |= TF2_PLPMTU_PMTUD;
				tp->t_flags2 &= ~TF2_PLPMTU_BLACKHOLE;
				tp->t_maxseg = tp->t_pmtud_saved_maxseg;
				V_tcp_pmtud_blackhole_failed++;
				/*
				 * Reset the slow-start flight size as it
				 * may depend on the new MSS.
				 */
				if (CC_ALGO(tp)->conn_init != NULL)
					CC_ALGO(tp)->conn_init(tp->ccv);
			}
		}
	}

	/*
	 * Disable RFC1323 and SACK if we haven't got any response to
	 * our third SYN to work-around some broken terminal servers
	 * (most of which have hopefully been retired) that have bad VJ
	 * header compression code which trashes TCP segments containing
	 * unknown-to-them TCP options.
	 */
	if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) &&
	    (tp->t_rxtshift == 3))
		tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT);
	/*
	 * If we backed off this far, our srtt estimate is probably bogus.
	 * Clobber it so we'll take the next rtt measurement as our srtt;
	 * move the current srtt into rttvar to keep the current
	 * retransmit times until then.
	 */
	if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
#ifdef INET6
		if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
			in6_losing(tp->t_inpcb);
		else
#endif
			in_losing(tp->t_inpcb);
		tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
		tp->t_srtt = 0;
	}
	/* Go back and retransmit everything not yet acknowledged. */
	tp->snd_nxt = tp->snd_una;
	tp->snd_recover = tp->snd_max;
	/*
	 * Force a segment to be sent.
	 */
	tp->t_flags |= TF_ACKNOW;
	/*
	 * If timing a segment in this window, stop the timer.
	 */
	tp->t_rtttime = 0;

	cc_cong_signal(tp, NULL, CC_RTO);

	(void) tp->t_fb->tfb_tcp_output(tp);

out:
#ifdef TCPDEBUG
	if (tp != NULL && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
			  PRU_SLOWTIMO);
#endif
	TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO);
	/* tcp_drop() may have freed the tcpcb; inp lock survives either way. */
	if (tp != NULL)
		INP_WUNLOCK(inp);
	if (headlocked)
		INP_INFO_RUNLOCK(&V_tcbinfo);
	CURVNET_RESTORE();
}
827
/*
 * Arm (delta != 0) or disarm (delta == 0) one of the per-connection TCP
 * timers.  Unknown timer types are forwarded to the stack-specific hook
 * when one is registered, otherwise panic.  No-op for TOE connections
 * and for tcpcbs whose timers have been stopped (TT_STOPPED).
 *
 * delta is the timeout in ticks.  The TT_*_RST flag bookkeeping mirrors
 * each callout's state so tcp_timer_stop() can tell whether a reset is
 * outstanding.
 */
void
tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
{
	struct callout *t_callout;
	timeout_t *f_callout;
	struct inpcb *inp = tp->t_inpcb;
	/* Pin new callouts to the CPU this flow maps to (RSS-aware). */
	int cpu = inp_to_cpuid(inp);
	uint32_t f_reset;

#ifdef TCP_OFFLOAD
	if (tp->t_flags & TF_TOE)
		return;
#endif

	if (tp->t_timers->tt_flags & TT_STOPPED)
		return;

	/* Map the timer type to its callout, handler and reset flag. */
	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_callout = tcp_timer_delack;
			f_reset = TT_DELACK_RST;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_callout = tcp_timer_rexmt;
			f_reset = TT_REXMT_RST;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_callout = tcp_timer_persist;
			f_reset = TT_PERSIST_RST;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_callout = tcp_timer_keep;
			f_reset = TT_KEEP_RST;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_callout = tcp_timer_2msl;
			f_reset = TT_2MSL_RST;
			break;
		default:
			if (tp->t_fb->tfb_tcp_timer_activate) {
				tp->t_fb->tfb_tcp_timer_activate(tp,
				    timer_type, delta);
				return;
			}
			panic("tp %p bad timer_type %#x", tp, timer_type);
		}
	if (delta == 0) {
		/* Disarm: clear the flags only if the stop actually took. */
		if ((tp->t_timers->tt_flags & timer_type) &&
		    (callout_stop(t_callout) > 0) &&
		    (tp->t_timers->tt_flags & f_reset)) {
			tp->t_timers->tt_flags &= ~(timer_type | f_reset);
		}
	} else {
		if ((tp->t_timers->tt_flags & timer_type) == 0) {
			/* First activation: schedule on the chosen CPU. */
			tp->t_timers->tt_flags |= (timer_type | f_reset);
			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
		} else {
			/* Reset already running callout on the same CPU. */
			if (!callout_reset(t_callout, delta, f_callout, tp)) {
				/*
				 * Callout not cancelled, consider it as not
				 * properly restarted. */
				tp->t_timers->tt_flags &= ~f_reset;
			}
		}
	}
}
899
900 int
901 tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
902 {
903 struct callout *t_callout;
904
905 switch (timer_type) {
906 case TT_DELACK:
907 t_callout = &tp->t_timers->tt_delack;
908 break;
909 case TT_REXMT:
910 t_callout = &tp->t_timers->tt_rexmt;
911 break;
912 case TT_PERSIST:
913 t_callout = &tp->t_timers->tt_persist;
914 break;
915 case TT_KEEP:
916 t_callout = &tp->t_timers->tt_keep;
917 break;
918 case TT_2MSL:
919 t_callout = &tp->t_timers->tt_2msl;
920 break;
921 default:
922 if (tp->t_fb->tfb_tcp_timer_active) {
923 return(tp->t_fb->tfb_tcp_timer_active(tp, timer_type));
924 }
925 panic("tp %p bad timer_type %#x", tp, timer_type);
926 }
927 return callout_active(t_callout);
928 }
929
/*
 * Permanently stop one of the per-connection TCP timers as part of tcpcb
 * teardown.  Marks the timer set TT_STOPPED so tcp_timer_activate() will
 * refuse to re-arm anything.  If the callout cannot be stopped
 * synchronously (it is already running), tcp_timer_discard() is scheduled
 * to run when it drains and tt_draincnt tracks how many such drains are
 * outstanding.
 */
void
tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
{
	struct callout *t_callout;
	uint32_t f_reset;

	tp->t_timers->tt_flags |= TT_STOPPED;

	/* Map the timer type to its callout and reset flag. */
	switch (timer_type) {
		case TT_DELACK:
			t_callout = &tp->t_timers->tt_delack;
			f_reset = TT_DELACK_RST;
			break;
		case TT_REXMT:
			t_callout = &tp->t_timers->tt_rexmt;
			f_reset = TT_REXMT_RST;
			break;
		case TT_PERSIST:
			t_callout = &tp->t_timers->tt_persist;
			f_reset = TT_PERSIST_RST;
			break;
		case TT_KEEP:
			t_callout = &tp->t_timers->tt_keep;
			f_reset = TT_KEEP_RST;
			break;
		case TT_2MSL:
			t_callout = &tp->t_timers->tt_2msl;
			f_reset = TT_2MSL_RST;
			break;
		default:
			if (tp->t_fb->tfb_tcp_timer_stop) {
				/*
				 * XXXrrs we need to look at this with the
				 * stop case below (flags).
				 */
				tp->t_fb->tfb_tcp_timer_stop(tp, timer_type);
				return;
			}
			panic("tp %p bad timer_type %#x", tp, timer_type);
		}

	if (tp->t_timers->tt_flags & timer_type) {
		if (callout_async_drain(t_callout, tcp_timer_discard) == 0) {
			/*
			 * Can't stop the callout, defer tcpcb actual deletion
			 * to the last one. We do this using the async drain
			 * function and incrementing the count in
			 * tt_draincnt; tcp_timer_discard() decrements it and
			 * frees the tcpcb when the last drain completes.
			 */
			tp->t_timers->tt_draincnt++;
		}
	}
}
982
983 #define ticks_to_msecs(t) (1000*(t) / hz)
984
985 void
986 tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
987 struct xtcp_timer *xtimer)
988 {
989 sbintime_t now;
990
991 bzero(xtimer, sizeof(*xtimer));
992 if (timer == NULL)
993 return;
994 now = getsbinuptime();
995 if (callout_active(&timer->tt_delack))
996 xtimer->tt_delack = (timer->tt_delack.c_time - now) / SBT_1MS;
997 if (callout_active(&timer->tt_rexmt))
998 xtimer->tt_rexmt = (timer->tt_rexmt.c_time - now) / SBT_1MS;
999 if (callout_active(&timer->tt_persist))
1000 xtimer->tt_persist = (timer->tt_persist.c_time - now) / SBT_1MS;
1001 if (callout_active(&timer->tt_keep))
1002 xtimer->tt_keep = (timer->tt_keep.c_time - now) / SBT_1MS;
1003 if (callout_active(&timer->tt_2msl))
1004 xtimer->tt_2msl = (timer->tt_2msl.c_time - now) / SBT_1MS;
1005 xtimer->t_rcvtime = ticks_to_msecs(ticks - tp->t_rcvtime);
1006 }
Cache object: 44d2911ceec0365c24f5d0b75b038856
|