FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_ecn.c
1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
5 * The Regents of the University of California. All rights reserved.
6 * Copyright (c) 2007-2008,2010
7 * Swinburne University of Technology, Melbourne, Australia.
8 * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
9 * Copyright (c) 2010 The FreeBSD Foundation
10 * Copyright (c) 2010-2011 Juniper Networks, Inc.
11 * Copyright (c) 2019 Richard Scheffenegger <srichard@netapp.com>
12 * All rights reserved.
13 *
14 * Portions of this software were developed at the Centre for Advanced Internet
15 * Architectures, Swinburne University of Technology, by Lawrence Stewart,
16 * James Healy and David Hayes, made possible in part by a grant from the Cisco
17 * University Research Program Fund at Community Foundation Silicon Valley.
18 *
19 * Portions of this software were developed at the Centre for Advanced
20 * Internet Architectures, Swinburne University of Technology, Melbourne,
21 * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
22 *
23 * Portions of this software were developed by Robert N. M. Watson under
24 * contract to Juniper Networks, Inc.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 * 1. Redistributions of source code must retain the above copyright
30 * notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 * notice, this list of conditions and the following disclaimer in the
33 * documentation and/or other materials provided with the distribution.
34 * 3. Neither the name of the University nor the names of its contributors
35 * may be used to endorse or promote products derived from this software
36 * without specific prior written permission.
37 *
38 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48 * SUCH DAMAGE.
49 *
50 * @(#)tcp_ecn.c 8.12 (Berkeley) 5/24/95
51 */
52
53 /*
54 * Utility functions to deal with Explicit Congestion Notification in TCP
55 * implementing the essential parts of the Accurate ECN extension
56 * https://tools.ietf.org/html/draft-ietf-tcpm-accurate-ecn-09
57 */
58
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
61
62 #include "opt_inet.h"
63 #include "opt_inet6.h"
64
65 #include <sys/param.h>
66 #include <sys/systm.h>
67 #include <sys/kernel.h>
68 #include <sys/sysctl.h>
69 #include <sys/malloc.h>
70 #include <sys/mbuf.h>
71 #include <sys/socket.h>
72 #include <sys/socketvar.h>
73
74 #include <machine/cpu.h>
75
76 #include <vm/uma.h>
77
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/route.h>
81 #include <net/vnet.h>
82
83 #include <netinet/in.h>
84 #include <netinet/in_systm.h>
85 #include <netinet/ip.h>
86 #include <netinet/in_var.h>
87 #include <netinet/in_pcb.h>
88 #include <netinet/ip_var.h>
89 #include <netinet/ip6.h>
90 #include <netinet/icmp6.h>
91 #include <netinet6/nd6.h>
92 #include <netinet6/ip6_var.h>
93 #include <netinet6/in6_pcb.h>
94 #include <netinet/tcp.h>
95 #include <netinet/tcp_fsm.h>
96 #include <netinet/tcp_seq.h>
97 #include <netinet/tcp_var.h>
98 #include <netinet/tcp_syncache.h>
99 #include <netinet/tcp_timer.h>
100 #include <netinet/tcpip.h>
101 #include <netinet/tcp_ecn.h>
102
/*
 * Sysctl subtree "ecn" below _net_inet_tcp; the two tunables defined
 * further down are attached to it.
 */
103 static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn,
104 CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
105 "TCP ECN");
106
/*
 * Per-VNET ECN operating mode (sysctl leaf "enable", default 2).
 * As interpreted by the functions in this file:
 *   0    - the handshake input handlers return without negotiating and
 *          no ECN flags are requested on an outgoing <SYN>;
 *   1, 2 - incoming handshake segments are evaluated with RFC3168 rules;
 *   3, 4 - incoming handshake segments are decoded as Accurate ECN.
 * Only the values 1 (ECE|CWR) and 3 (ECE|CWR|AE) make
 * tcp_ecn_output_syn_sent() put ECN setup flags on an outgoing <SYN>;
 * with 2 or 4 that function returns no flags.
 */
107 VNET_DEFINE(int, tcp_do_ecn) = 2;
108 SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable,
109 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0,
110 "TCP ECN support");
111
/*
 * Per-VNET retry limit (sysctl leaf "maxretries", default 1).
 * tcp_ecn_output_syn_sent() keeps the ECN setup flags on a retransmitted
 * <SYN> only while tp->t_rxtshift does not exceed this value.
 */
112 VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
113 SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries,
114 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0,
115 "Max retries before giving up on ECN");
116
117 /*
118 * Process incoming SYN,ACK packet
119 */
120 void
121 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
122 {
123
124 if (V_tcp_do_ecn == 0)
125 return;
126 if ((V_tcp_do_ecn == 1) ||
127 (V_tcp_do_ecn == 2)) {
128 /* RFC3168 ECN handling */
129 if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
130 tp->t_flags2 |= TF2_ECN_PERMIT;
131 tp->t_flags2 &= ~TF2_ACE_PERMIT;
132 TCPSTAT_INC(tcps_ecn_shs);
133 }
134 } else
135 /* decoding Accurate ECN according to table in section 3.1.1 */
136 if ((V_tcp_do_ecn == 3) ||
137 (V_tcp_do_ecn == 4)) {
138 /*
139 * on the SYN,ACK, process the AccECN
140 * flags indicating the state the SYN
141 * was delivered.
142 * Reactions to Path ECN mangling can
143 * come here.
144 */
145 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
146 /* RFC3168 SYN */
147 case (0|0|TH_ECE):
148 tp->t_flags2 |= TF2_ECN_PERMIT;
149 tp->t_flags2 &= ~TF2_ACE_PERMIT;
150 TCPSTAT_INC(tcps_ecn_shs);
151 break;
152 /* non-ECT SYN */
153 case (0|TH_CWR|0):
154 tp->t_flags2 |= TF2_ACE_PERMIT;
155 tp->t_flags2 &= ~TF2_ECN_PERMIT;
156 tp->t_scep = 5;
157 TCPSTAT_INC(tcps_ecn_shs);
158 TCPSTAT_INC(tcps_ace_nect);
159 break;
160 /* ECT0 SYN */
161 case (TH_AE|0|0):
162 tp->t_flags2 |= TF2_ACE_PERMIT;
163 tp->t_flags2 &= ~TF2_ECN_PERMIT;
164 tp->t_scep = 5;
165 TCPSTAT_INC(tcps_ecn_shs);
166 TCPSTAT_INC(tcps_ace_ect0);
167 break;
168 /* ECT1 SYN */
169 case (0|TH_CWR|TH_ECE):
170 tp->t_flags2 |= TF2_ACE_PERMIT;
171 tp->t_flags2 &= ~TF2_ECN_PERMIT;
172 tp->t_scep = 5;
173 TCPSTAT_INC(tcps_ecn_shs);
174 TCPSTAT_INC(tcps_ace_ect1);
175 break;
176 /* CE SYN */
177 case (TH_AE|TH_CWR|0):
178 tp->t_flags2 |= TF2_ACE_PERMIT;
179 tp->t_flags2 &= ~TF2_ECN_PERMIT;
180 tp->t_scep = 6;
181 /*
182 * reduce the IW to 2 MSS (to
183 * account for delayed acks) if
184 * the SYN,ACK was CE marked
185 */
186 tp->snd_cwnd = 2 * tcp_maxseg(tp);
187 TCPSTAT_INC(tcps_ecn_shs);
188 TCPSTAT_INC(tcps_ace_nect);
189 break;
190 default:
191 tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
192 break;
193 }
194 /*
195 * Set the AccECN Codepoints on
196 * the outgoing <ACK> to the ECN
197 * state of the <SYN,ACK>
198 * according to table 3 in the
199 * AccECN draft
200 */
201 switch (iptos & IPTOS_ECN_MASK) {
202 case (IPTOS_ECN_NOTECT):
203 tp->t_rcep = 0b010;
204 break;
205 case (IPTOS_ECN_ECT0):
206 tp->t_rcep = 0b100;
207 break;
208 case (IPTOS_ECN_ECT1):
209 tp->t_rcep = 0b011;
210 break;
211 case (IPTOS_ECN_CE):
212 tp->t_rcep = 0b110;
213 break;
214 }
215 }
216 }
217
218 /*
219 * Handle parallel SYN for ECN
220 */
221 void
222 tcp_ecn_input_parallel_syn(struct tcpcb *tp, uint16_t thflags, int iptos)
223 {
224 if (thflags & TH_ACK)
225 return;
226 if (V_tcp_do_ecn == 0)
227 return;
228 if ((V_tcp_do_ecn == 1) ||
229 (V_tcp_do_ecn == 2)) {
230 /* RFC3168 ECN handling */
231 if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
232 tp->t_flags2 |= TF2_ECN_PERMIT;
233 tp->t_flags2 &= ~TF2_ACE_PERMIT;
234 tp->t_flags2 |= TF2_ECN_SND_ECE;
235 TCPSTAT_INC(tcps_ecn_shs);
236 }
237 } else
238 if ((V_tcp_do_ecn == 3) ||
239 (V_tcp_do_ecn == 4)) {
240 /* AccECN handling */
241 switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
242 default:
243 case (0|0|0):
244 tp->t_flags2 &= ~(TF2_ECN_PERMIT | TF2_ACE_PERMIT);
245 break;
246 case (0|TH_CWR|TH_ECE):
247 tp->t_flags2 |= TF2_ECN_PERMIT;
248 tp->t_flags2 &= ~TF2_ACE_PERMIT;
249 tp->t_flags2 |= TF2_ECN_SND_ECE;
250 TCPSTAT_INC(tcps_ecn_shs);
251 break;
252 case (TH_AE|TH_CWR|TH_ECE):
253 tp->t_flags2 |= TF2_ACE_PERMIT;
254 tp->t_flags2 &= ~TF2_ECN_PERMIT;
255 TCPSTAT_INC(tcps_ecn_shs);
256 /*
257 * Set the AccECN Codepoints on
258 * the outgoing <ACK> to the ECN
259 * state of the <SYN,ACK>
260 * according to table 3 in the
261 * AccECN draft
262 */
263 switch (iptos & IPTOS_ECN_MASK) {
264 case (IPTOS_ECN_NOTECT):
265 tp->t_rcep = 0b010;
266 break;
267 case (IPTOS_ECN_ECT0):
268 tp->t_rcep = 0b100;
269 break;
270 case (IPTOS_ECN_ECT1):
271 tp->t_rcep = 0b011;
272 break;
273 case (IPTOS_ECN_CE):
274 tp->t_rcep = 0b110;
275 break;
276 }
277 break;
278 }
279 }
280 }
281
282 /*
283 * TCP ECN processing.
284 */
285 int
286 tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int tlen, int pkts, int iptos)
287 {
288 int delta_cep = 0;
289
290 switch (iptos & IPTOS_ECN_MASK) {
291 case IPTOS_ECN_CE:
292 TCPSTAT_INC(tcps_ecn_rcvce);
293 break;
294 case IPTOS_ECN_ECT0:
295 TCPSTAT_INC(tcps_ecn_rcvect0);
296 break;
297 case IPTOS_ECN_ECT1:
298 TCPSTAT_INC(tcps_ecn_rcvect1);
299 break;
300 }
301
302 if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
303 if (tp->t_flags2 & TF2_ACE_PERMIT) {
304 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
305 tp->t_rcep += 1;
306 if (tp->t_flags2 & TF2_ECN_PERMIT) {
307 delta_cep = (tcp_ecn_get_ace(thflags) + 8 -
308 (tp->t_scep & 7)) & 7;
309 if (delta_cep < pkts)
310 delta_cep = pkts -
311 ((pkts - delta_cep) & 7);
312 tp->t_scep += delta_cep;
313 } else {
314 /*
315 * process the final ACK of the 3WHS
316 * see table 3 in draft-ietf-tcpm-accurate-ecn
317 */
318 switch (tcp_ecn_get_ace(thflags)) {
319 case 0b010:
320 /* nonECT SYN or SYN,ACK */
321 /* Fallthrough */
322 case 0b011:
323 /* ECT1 SYN or SYN,ACK */
324 /* Fallthrough */
325 case 0b100:
326 /* ECT0 SYN or SYN,ACK */
327 tp->t_scep = 5;
328 break;
329 case 0b110:
330 /* CE SYN or SYN,ACK */
331 tp->t_scep = 6;
332 tp->snd_cwnd = 2 * tcp_maxseg(tp);
333 break;
334 default:
335 /* mangled AccECN handshake */
336 tp->t_scep = 5;
337 break;
338 }
339 tp->t_flags2 |= TF2_ECN_PERMIT;
340 }
341 } else {
342 /* RFC3168 ECN handling */
343 if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE) {
344 delta_cep = 1;
345 tp->t_scep++;
346 }
347 if (thflags & TH_CWR) {
348 tp->t_flags2 &= ~TF2_ECN_SND_ECE;
349 tp->t_flags |= TF_ACKNOW;
350 }
351 if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
352 tp->t_flags2 |= TF2_ECN_SND_ECE;
353 }
354
355 /* Process a packet differently from RFC3168. */
356 cc_ecnpkt_handler_flags(tp, thflags, iptos);
357 }
358
359 return delta_cep;
360 }
361
362 /*
363 * Send ECN setup <SYN> packet header flags
364 */
365 uint16_t
366 tcp_ecn_output_syn_sent(struct tcpcb *tp)
367 {
368 uint16_t thflags = 0;
369
370 if (V_tcp_do_ecn == 0)
371 return thflags;
372 if (V_tcp_do_ecn == 1) {
373 /* Send a RFC3168 ECN setup <SYN> packet */
374 if (tp->t_rxtshift >= 1) {
375 if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
376 thflags = TH_ECE|TH_CWR;
377 } else
378 thflags = TH_ECE|TH_CWR;
379 } else
380 if (V_tcp_do_ecn == 3) {
381 /* Send an Accurate ECN setup <SYN> packet */
382 if (tp->t_rxtshift >= 1) {
383 if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
384 thflags = TH_ECE|TH_CWR|TH_AE;
385 } else
386 thflags = TH_ECE|TH_CWR|TH_AE;
387 }
388
389 return thflags;
390 }
391
392 /*
393 * output processing of ECN feature
394 * returning IP ECN header codepoint
395 */
396 int
397 tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit)
398 {
399 int ipecn = IPTOS_ECN_NOTECT;
400 bool newdata;
401
402 /*
403 * If the peer has ECN, mark data packets with
404 * ECN capable transmission (ECT).
405 * Ignore pure control packets, retransmissions
406 * and window probes.
407 */
408 newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
409 !rxmit &&
410 !((tp->t_flags & TF_FORCEDATA) && len == 1));
411 /* RFC3168 ECN marking, only new data segments */
412 if (newdata) {
413 if (tp->t_flags2 & TF2_ECN_USE_ECT1) {
414 ipecn = IPTOS_ECN_ECT1;
415 TCPSTAT_INC(tcps_ecn_sndect1);
416 } else {
417 ipecn = IPTOS_ECN_ECT0;
418 TCPSTAT_INC(tcps_ecn_sndect0);
419 }
420 }
421 /*
422 * Reply with proper ECN notifications.
423 */
424 if (tp->t_flags2 & TF2_ACE_PERMIT) {
425 *thflags &= ~(TH_AE|TH_CWR|TH_ECE);
426 if (tp->t_rcep & 0x01)
427 *thflags |= TH_ECE;
428 if (tp->t_rcep & 0x02)
429 *thflags |= TH_CWR;
430 if (tp->t_rcep & 0x04)
431 *thflags |= TH_AE;
432 if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
433 /*
434 * here we process the final
435 * ACK of the 3WHS
436 */
437 if (tp->t_rcep == 0b110) {
438 tp->t_rcep = 6;
439 } else {
440 tp->t_rcep = 5;
441 }
442 tp->t_flags2 |= TF2_ECN_PERMIT;
443 }
444 } else {
445 if (newdata &&
446 (tp->t_flags2 & TF2_ECN_SND_CWR)) {
447 *thflags |= TH_CWR;
448 tp->t_flags2 &= ~TF2_ECN_SND_CWR;
449 }
450 if (tp->t_flags2 & TF2_ECN_SND_ECE)
451 *thflags |= TH_ECE;
452 }
453
454 return ipecn;
455 }
456
457 /*
458 * Set up the ECN related tcpcb fields from
459 * a syncache entry
460 */
461 void
462 tcp_ecn_syncache_socket(struct tcpcb *tp, struct syncache *sc)
463 {
464 if (sc->sc_flags & SCF_ECN_MASK) {
465 switch (sc->sc_flags & SCF_ECN_MASK) {
466 case SCF_ECN:
467 tp->t_flags2 |= TF2_ECN_PERMIT;
468 break;
469 case SCF_ACE_N:
470 /* Fallthrough */
471 case SCF_ACE_0:
472 /* Fallthrough */
473 case SCF_ACE_1:
474 tp->t_flags2 |= TF2_ACE_PERMIT;
475 tp->t_scep = 5;
476 tp->t_rcep = 5;
477 break;
478 case SCF_ACE_CE:
479 tp->t_flags2 |= TF2_ACE_PERMIT;
480 tp->t_scep = 6;
481 tp->t_rcep = 6;
482 break;
483 /* undefined SCF codepoint */
484 default:
485 break;
486 }
487 }
488 }
489
490 /*
491 * Process a <SYN> packets ECN information, and provide the
492 * syncache with the relevant information.
493 */
494 int
495 tcp_ecn_syncache_add(uint16_t thflags, int iptos)
496 {
497 int scflags = 0;
498
499 switch (iptos & IPTOS_ECN_MASK) {
500 case IPTOS_ECN_CE:
501 TCPSTAT_INC(tcps_ecn_rcvce);
502 break;
503 case IPTOS_ECN_ECT0:
504 TCPSTAT_INC(tcps_ecn_rcvect0);
505 break;
506 case IPTOS_ECN_ECT1:
507 TCPSTAT_INC(tcps_ecn_rcvect1);
508 break;
509 }
510
511 switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
512 /* no ECN */
513 case (0|0|0):
514 break;
515 /* legacy ECN */
516 case (0|TH_CWR|TH_ECE):
517 scflags = SCF_ECN;
518 break;
519 /* Accurate ECN */
520 case (TH_AE|TH_CWR|TH_ECE):
521 if ((V_tcp_do_ecn == 3) ||
522 (V_tcp_do_ecn == 4)) {
523 switch (iptos & IPTOS_ECN_MASK) {
524 case IPTOS_ECN_CE:
525 scflags = SCF_ACE_CE;
526 break;
527 case IPTOS_ECN_ECT0:
528 scflags = SCF_ACE_0;
529 break;
530 case IPTOS_ECN_ECT1:
531 scflags = SCF_ACE_1;
532 break;
533 case IPTOS_ECN_NOTECT:
534 scflags = SCF_ACE_N;
535 break;
536 }
537 } else
538 scflags = SCF_ECN;
539 break;
540 /* Default Case (section 3.1.2) */
541 default:
542 if ((V_tcp_do_ecn == 3) ||
543 (V_tcp_do_ecn == 4)) {
544 switch (iptos & IPTOS_ECN_MASK) {
545 case IPTOS_ECN_CE:
546 scflags = SCF_ACE_CE;
547 break;
548 case IPTOS_ECN_ECT0:
549 scflags = SCF_ACE_0;
550 break;
551 case IPTOS_ECN_ECT1:
552 scflags = SCF_ACE_1;
553 break;
554 case IPTOS_ECN_NOTECT:
555 scflags = SCF_ACE_N;
556 break;
557 }
558 }
559 break;
560 }
561 return scflags;
562 }
563
564 /*
565 * Set up the ECN information for the <SYN,ACK> from
566 * syncache information.
567 */
568 uint16_t
569 tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc)
570 {
571 if ((thflags & TH_SYN) &&
572 (sc->sc_flags & SCF_ECN_MASK)) {
573 switch (sc->sc_flags & SCF_ECN_MASK) {
574 case SCF_ECN:
575 thflags |= (0 | 0 | TH_ECE);
576 TCPSTAT_INC(tcps_ecn_shs);
577 break;
578 case SCF_ACE_N:
579 thflags |= (0 | TH_CWR | 0);
580 TCPSTAT_INC(tcps_ecn_shs);
581 TCPSTAT_INC(tcps_ace_nect);
582 break;
583 case SCF_ACE_0:
584 thflags |= (TH_AE | 0 | 0);
585 TCPSTAT_INC(tcps_ecn_shs);
586 TCPSTAT_INC(tcps_ace_ect0);
587 break;
588 case SCF_ACE_1:
589 thflags |= (0 | TH_ECE | TH_CWR);
590 TCPSTAT_INC(tcps_ecn_shs);
591 TCPSTAT_INC(tcps_ace_ect1);
592 break;
593 case SCF_ACE_CE:
594 thflags |= (TH_AE | TH_CWR | 0);
595 TCPSTAT_INC(tcps_ecn_shs);
596 TCPSTAT_INC(tcps_ace_ce);
597 break;
598 /* undefined SCF codepoint */
599 default:
600 break;
601 }
602 }
603 return thflags;
604 }
605
606 int
607 tcp_ecn_get_ace(uint16_t thflags)
608 {
609 int ace = 0;
610
611 if (thflags & TH_ECE)
612 ace += 1;
613 if (thflags & TH_CWR)
614 ace += 2;
615 if (thflags & TH_AE)
616 ace += 4;
617 return ace;
618 }
Cache object: 63dafb88c9910528dffc64ac793b7c49
|