1 /* $NetBSD: tcp_sack.c,v 1.22 2006/10/21 10:26:21 yamt Exp $ */
2
3 /*
4 * Copyright (c) 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Kentaro A. Kurahone.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the NetBSD
21 * Foundation, Inc. and its contributors.
22 * 4. Neither the name of The NetBSD Foundation nor the names of its
23 * contributors may be used to endorse or promote products derived
24 * from this software without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
27 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
30 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 */
38
39 /*
40 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
41 * The Regents of the University of California. All rights reserved.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * 4. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
68 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
69 */
70
71 /*
72 * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
73 *
74 * NRL grants permission for redistribution and use in source and binary
75 * forms, with or without modification, of the software and documentation
76 * created at NRL provided that the following conditions are met:
77 *
78 * 1. Redistributions of source code must retain the above copyright
79 * notice, this list of conditions and the following disclaimer.
80 * 2. Redistributions in binary form must reproduce the above copyright
81 * notice, this list of conditions and the following disclaimer in the
82 * documentation and/or other materials provided with the distribution.
83 * 3. All advertising materials mentioning features or use of this software
84 * must display the following acknowledgements:
85 * This product includes software developed by the University of
86 * California, Berkeley and its contributors.
87 * This product includes software developed at the Information
88 * Technology Division, US Naval Research Laboratory.
89 * 4. Neither the name of the NRL nor the names of its contributors
90 * may be used to endorse or promote products derived from this software
91 * without specific prior written permission.
92 *
93 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
94 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
95 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
96 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
97 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
98 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
99 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
100 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
101 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
102 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
103 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
104 *
105 * The views and conclusions contained in the software and documentation
106 * are those of the authors and should not be interpreted as representing
107 * official policies, either expressed or implied, of the US Naval
108 * Research Laboratory (NRL).
109 */
110
111 #include <sys/cdefs.h>
112 __KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.22 2006/10/21 10:26:21 yamt Exp $");
113
114 #include "opt_inet.h"
115 #include "opt_ipsec.h"
116 #include "opt_inet_csum.h"
117 #include "opt_tcp_debug.h"
118 #include "opt_ddb.h"
119
120 #include <sys/param.h>
121 #include <sys/systm.h>
122 #include <sys/malloc.h>
123 #include <sys/mbuf.h>
124 #include <sys/protosw.h>
125 #include <sys/socket.h>
126 #include <sys/socketvar.h>
127 #include <sys/errno.h>
128 #include <sys/syslog.h>
129 #include <sys/pool.h>
130 #include <sys/domain.h>
131 #include <sys/kernel.h>
132
133 #include <net/if.h>
134 #include <net/route.h>
135 #include <net/if_types.h>
136
137 #include <netinet/in.h>
138 #include <netinet/in_systm.h>
139 #include <netinet/ip.h>
140 #include <netinet/in_pcb.h>
141 #include <netinet/in_var.h>
142 #include <netinet/ip_var.h>
143
144 #ifdef INET6
145 #ifndef INET
146 #include <netinet/in.h>
147 #endif
148 #include <netinet/ip6.h>
149 #include <netinet6/ip6_var.h>
150 #include <netinet6/in6_pcb.h>
151 #include <netinet6/ip6_var.h>
152 #include <netinet6/in6_var.h>
153 #include <netinet/icmp6.h>
154 #include <netinet6/nd6.h>
155 #endif
156
157 #ifndef INET6
158 /* always need ip6.h for IP6_EXTHDR_GET */
159 #include <netinet/ip6.h>
160 #endif
161
162 #include <netinet/tcp.h>
163 #include <netinet/tcp_fsm.h>
164 #include <netinet/tcp_seq.h>
165 #include <netinet/tcp_timer.h>
166 #include <netinet/tcp_var.h>
167 #include <netinet/tcpip.h>
168 #include <netinet/tcp_debug.h>
169
170 #include <machine/stdarg.h>
171
/*
 * Pool of struct sackhole objects (pool name "sackholepl").
 * Holes are taken from it in sack_allochole() and handed back in
 * sack_removehole().
 */
static POOL_INIT(sackhole_pool, sizeof(struct sackhole), 0, 0, 0, "sackholepl",
    NULL);
175
176 static struct sackhole *
177 sack_allochole(struct tcpcb *tp)
178 {
179 struct sackhole *hole;
180
181 if (tp->snd_numholes >= tcp_sack_tp_maxholes ||
182 tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
183 return NULL;
184 }
185 hole = pool_get(&sackhole_pool, PR_NOWAIT);
186 if (hole == NULL) {
187 return NULL;
188 }
189 tp->snd_numholes++;
190 tcp_sack_globalholes++;
191
192 return hole;
193 }
194
195 static struct sackhole *
196 sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end,
197 struct sackhole *prev)
198 {
199 struct sackhole *hole;
200
201 hole = sack_allochole(tp);
202 if (hole == NULL) {
203 return NULL;
204 }
205 hole->start = hole->rxmit = start;
206 hole->end = end;
207 if (prev != NULL) {
208 TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q);
209 } else {
210 TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q);
211 }
212 return hole;
213 }
214
215 static struct sackhole *
216 sack_removehole(struct tcpcb *tp, struct sackhole *hole)
217 {
218 struct sackhole *next;
219
220 next = TAILQ_NEXT(hole, sackhole_q);
221 tp->snd_numholes--;
222 tcp_sack_globalholes--;
223 TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q);
224 pool_put(&sackhole_pool, hole);
225
226 return next;
227 }
228
229 void
230 tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len)
231 {
232 if (TCP_SACK_ENABLED(tp)) {
233 tp->rcv_dsack_block.left = seq;
234 tp->rcv_dsack_block.right = seq + len;
235 tp->rcv_sack_flags |= TCPSACK_HAVED;
236 }
237 }
238
239 void
240 tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp,
241 int optlen)
242 {
243 struct sackblk
244 t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)];
245 struct sackblk *sack = NULL;
246 struct sackhole *cur = NULL;
247 struct sackhole *tmp = NULL;
248 const char *lp = cp + 2;
249 int i, j, num_sack_blks;
250 tcp_seq left, right, acked;
251
252 /*
253 * If we aren't processing SACK responses, this is not an ACK
254 * or the peer sends us a sack option with invalid length, don't
255 * update the scoreboard.
256 */
257 if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) ||
258 (optlen % 8 != 2 || optlen < 10)) {
259 return;
260 }
261
262 /*
263 * If we don't want any SACK holes to be allocated, just return.
264 */
265 if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) {
266 return;
267 }
268
269 /* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */
270 if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))
271 return;
272
273 /*
274 * Extract SACK blocks.
275 *
276 * Note that t_sack_block is sorted so that we only need to do
277 * one pass over the sequence number space. (SACK "fast-path")
278 */
279 num_sack_blks = optlen / 8;
280 acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una;
281 for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) {
282 memcpy(&left, lp, sizeof(uint32_t));
283 memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t));
284 left = ntohl(left);
285 right = ntohl(right);
286
287 if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) ||
288 SEQ_GEQ(left, right)) {
289 /* SACK entry that's old, or invalid. */
290 i--;
291 num_sack_blks--;
292 continue;
293 }
294
295 /* Insertion sort. */
296 for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left);
297 j--) {
298 t_sack_block[j].left = t_sack_block[j - 1].left;
299 t_sack_block[j].right = t_sack_block[j - 1].right;
300 }
301 t_sack_block[j].left = left;
302 t_sack_block[j].right = right;
303 }
304
305 /* Update the scoreboard. */
306 cur = TAILQ_FIRST(&tp->snd_holes);
307 for (i = 0; i < num_sack_blks; i++) {
308 sack = &t_sack_block[i];
309 /*
310 * FACK TCP. Update snd_fack so we can enter Fast
311 * Recovery early.
312 */
313 if (SEQ_GEQ(sack->right, tp->snd_fack))
314 tp->snd_fack = sack->right;
315
316 if (TAILQ_EMPTY(&tp->snd_holes)) {
317 /* First hole. */
318 cur = sack_inserthole(tp, th->th_ack, sack->left, NULL);
319 if (cur == NULL) {
320 /* ENOBUFS, bail out*/
321 return;
322 }
323 tp->rcv_lastsack = sack->right;
324 continue; /* With next sack block */
325 }
326
327 /* Go through the list of holes. */
328 while (cur) {
329 if (SEQ_LEQ(sack->right, cur->start))
330 /* SACKs data before the current hole */
331 break; /* No use going through more holes */
332
333 if (SEQ_GEQ(sack->left, cur->end)) {
334 /* SACKs data beyond the current hole */
335 cur = TAILQ_NEXT(cur, sackhole_q);
336 continue;
337 }
338
339 if (SEQ_LEQ(sack->left, cur->start)) {
340 /* Data acks at least the beginning of hole */
341 if (SEQ_GEQ(sack->right, cur->end)) {
342 /* Acks entire hole, so delete hole */
343 cur = sack_removehole(tp, cur);
344 break;
345 }
346
347 /* Otherwise, move start of hole forward */
348 cur->start = sack->right;
349 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
350 break;
351 }
352
353 if (SEQ_GEQ(sack->right, cur->end)) {
354 /* Move end of hole backward. */
355 cur->end = sack->left;
356 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
357 cur = TAILQ_NEXT(cur, sackhole_q);
358 break;
359 }
360
361 if (SEQ_LT(cur->start, sack->left) &&
362 SEQ_GT(cur->end, sack->right)) {
363 /*
364 * ACKs some data in middle of a hole; need to
365 * split current hole
366 */
367 tmp = sack_inserthole(tp, sack->right, cur->end,
368 cur);
369 if (tmp == NULL) {
370 return;
371 }
372 tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start);
373 cur->end = sack->left;
374 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
375 cur = tmp;
376 break;
377 }
378 }
379
380 /* At this point, we have reached the tail of the list. */
381 if (SEQ_LT(tp->rcv_lastsack, sack->left)) {
382 /*
383 * Need to append new hole at end.
384 */
385 cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left,
386 NULL);
387 if (cur == NULL) {
388 return;
389 }
390 }
391 if (SEQ_LT(tp->rcv_lastsack, sack->right)) {
392 tp->rcv_lastsack = sack->right;
393 }
394 }
395 }
396
397 void
398 tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th)
399 {
400 /* Max because this could be an older ack that just arrived. */
401 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
402 th->th_ack : tp->snd_una;
403 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
404
405 while (cur) {
406 if (SEQ_LEQ(cur->end, lastack)) {
407 cur = sack_removehole(tp, cur);
408 } else if (SEQ_LT(cur->start, lastack)) {
409 cur->start = lastack;
410 if (SEQ_LT(cur->rxmit, cur->start))
411 cur->rxmit = cur->start;
412 break;
413 } else
414 break;
415 }
416 }
417
418 void
419 tcp_free_sackholes(struct tcpcb *tp)
420 {
421 struct sackhole *sack;
422
423 /* Free up the SACK hole list. */
424 while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
425 sack_removehole(tp, sack);
426 }
427 KASSERT(tp->snd_numholes == 0);
428 }
429
430 /*
431 * Implements the SACK response to a new ack, checking for partial acks
432 * in fast recovery.
433 */
434 void
435 tcp_sack_newack(struct tcpcb *tp, const struct tcphdr *th)
436 {
437 if (tp->t_partialacks < 0) {
438 /*
439 * Not in fast recovery. Reset the duplicate ack
440 * counter.
441 */
442 tp->t_dupacks = 0;
443 } else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
444 /*
445 * Partial ack handling within a sack recovery episode.
446 * Keeping this very simple for now. When a partial ack
447 * is received, force snd_cwnd to a value that will allow
448 * the sender to transmit no more than 2 segments.
449 * If necessary, a fancier scheme can be adopted at a
450 * later point, but for now, the goal is to prevent the
451 * sender from bursting a large amount of data in the midst
452 * of sack recovery.
453 */
454 int num_segs = 1;
455 int sack_bytes_rxmt = 0;
456
457 tp->t_partialacks++;
458 TCP_TIMER_DISARM(tp, TCPT_REXMT);
459 tp->t_rtttime = 0;
460
461 /*
462 * send one or 2 segments based on how much new data was acked
463 */
464 if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
465 num_segs = 2;
466 (void)tcp_sack_output(tp, &sack_bytes_rxmt);
467 tp->snd_cwnd = sack_bytes_rxmt +
468 (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_segsz;
469 tp->t_flags |= TF_ACKNOW;
470 (void) tcp_output(tp);
471 } else {
472 /*
473 * Complete ack, inflate the congestion window to
474 * ssthresh and exit fast recovery.
475 *
476 * Window inflation should have left us with approx.
477 * snd_ssthresh outstanding data. But in case we
478 * would be inclined to send a burst, better to do
479 * it via the slow start mechanism.
480 */
481 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
482 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
483 + tp->t_segsz;
484 else
485 tp->snd_cwnd = tp->snd_ssthresh;
486 tp->t_partialacks = -1;
487 tp->t_dupacks = 0;
488 if (SEQ_GT(th->th_ack, tp->snd_fack))
489 tp->snd_fack = th->th_ack;
490 }
491 }
492
493 /*
494 * Returns pointer to a sackhole if there are any pending retransmissions;
495 * NULL otherwise.
496 */
497 struct sackhole *
498 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
499 {
500 struct sackhole *cur = NULL;
501
502 if (!TCP_SACK_ENABLED(tp))
503 return (NULL);
504
505 *sack_bytes_rexmt = 0;
506 TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
507 if (SEQ_LT(cur->rxmit, cur->end)) {
508 if (SEQ_LT(cur->rxmit, tp->snd_una)) {
509 /* old SACK hole */
510 continue;
511 }
512 *sack_bytes_rexmt += (cur->rxmit - cur->start);
513 break;
514 }
515 *sack_bytes_rexmt += (cur->rxmit - cur->start);
516 }
517
518 return (cur);
519 }
520
521 /*
522 * After a timeout, the SACK list may be rebuilt. This SACK information
523 * should be used to avoid retransmitting SACKed data. This function
524 * traverses the SACK list to see if snd_nxt should be moved forward.
525 */
526 void
527 tcp_sack_adjust(struct tcpcb *tp)
528 {
529 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
530 struct sackhole *n = NULL;
531
532 if (TAILQ_EMPTY(&tp->snd_holes))
533 return; /* No holes */
534 if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
535 return; /* We're already beyond any SACKed blocks */
536
537 /*
538 * Two cases for which we want to advance snd_nxt:
539 * i) snd_nxt lies between end of one hole and beginning of another
540 * ii) snd_nxt lies between end of last hole and rcv_lastsack
541 */
542 while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) {
543 if (SEQ_LT(tp->snd_nxt, cur->end))
544 return;
545 if (SEQ_GEQ(tp->snd_nxt, n->start))
546 cur = n;
547 else {
548 tp->snd_nxt = n->start;
549 return;
550 }
551 }
552 if (SEQ_LT(tp->snd_nxt, cur->end))
553 return;
554 tp->snd_nxt = tp->rcv_lastsack;
555
556 return;
557 }
558
559 int
560 tcp_sack_numblks(const struct tcpcb *tp)
561 {
562 int numblks;
563
564 if (!TCP_SACK_ENABLED(tp)) {
565 return 0;
566 }
567
568 numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) +
569 tp->t_segqlen;
570
571 if (numblks == 0) {
572 return 0;
573 }
574
575 if (numblks > TCP_SACK_MAX) {
576 numblks = TCP_SACK_MAX;
577 }
578
579 return numblks;
580 }
581
#if defined(DDB)
void sack_dump(const struct tcpcb *);

/*
 * Debugging aid, built only with DDB: print the connection's SACK-related
 * sequence numbers, its hole count, and one line per hole on
 * tp->snd_holes.
 */
void
sack_dump(const struct tcpcb *tp)
{
	const struct sackhole *hole;

	printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n",
	    tp->snd_una, tp->snd_max);
	printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n",
	    tp->rcv_lastsack, tp->snd_fack);
	printf("numholes=%d\n", tp->snd_numholes);

	for (hole = TAILQ_FIRST(&tp->snd_holes); hole != NULL;
	    hole = TAILQ_NEXT(hole, sackhole_q)) {
		printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n",
		    hole->start, hole->end, hole->rxmit);
	}
}
#endif /* defined(DDB) */
Cache object: 36f4f28b1ef01ac1da93e7edc4b15f88
|