1 /* $NetBSD: tcp_sack.c,v 1.24 2008/04/28 20:24:09 martin Exp $ */
2
3 /*
4 * Copyright (c) 2005 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Kentaro A. Kurahone.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32 /*
33 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
34 * The Regents of the University of California. All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 * notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 * notice, this list of conditions and the following disclaimer in the
43 * documentation and/or other materials provided with the distribution.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
62 */
63
64 /*
65 * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
66 *
67 * NRL grants permission for redistribution and use in source and binary
68 * forms, with or without modification, of the software and documentation
69 * created at NRL provided that the following conditions are met:
70 *
71 * 1. Redistributions of source code must retain the above copyright
72 * notice, this list of conditions and the following disclaimer.
73 * 2. Redistributions in binary form must reproduce the above copyright
74 * notice, this list of conditions and the following disclaimer in the
75 * documentation and/or other materials provided with the distribution.
76 * 3. All advertising materials mentioning features or use of this software
77 * must display the following acknowledgements:
78 * This product includes software developed by the University of
79 * California, Berkeley and its contributors.
80 * This product includes software developed at the Information
81 * Technology Division, US Naval Research Laboratory.
82 * 4. Neither the name of the NRL nor the names of its contributors
83 * may be used to endorse or promote products derived from this software
84 * without specific prior written permission.
85 *
86 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
87 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
88 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
89 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
90 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
91 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
92 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
93 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
94 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
95 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
96 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97 *
98 * The views and conclusions contained in the software and documentation
99 * are those of the authors and should not be interpreted as representing
100 * official policies, either expressed or implied, of the US Naval
101 * Research Laboratory (NRL).
102 */
103
104 #include <sys/cdefs.h>
105 __KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.24 2008/04/28 20:24:09 martin Exp $");
106
107 #include "opt_inet.h"
108 #include "opt_ipsec.h"
109 #include "opt_inet_csum.h"
110 #include "opt_tcp_debug.h"
111 #include "opt_ddb.h"
112
113 #include <sys/param.h>
114 #include <sys/systm.h>
115 #include <sys/malloc.h>
116 #include <sys/mbuf.h>
117 #include <sys/protosw.h>
118 #include <sys/socket.h>
119 #include <sys/socketvar.h>
120 #include <sys/errno.h>
121 #include <sys/syslog.h>
122 #include <sys/pool.h>
123 #include <sys/domain.h>
124 #include <sys/kernel.h>
125
126 #include <net/if.h>
127 #include <net/route.h>
128 #include <net/if_types.h>
129
130 #include <netinet/in.h>
131 #include <netinet/in_systm.h>
132 #include <netinet/ip.h>
133 #include <netinet/in_pcb.h>
134 #include <netinet/in_var.h>
135 #include <netinet/ip_var.h>
136
137 #ifdef INET6
138 #ifndef INET
139 #include <netinet/in.h>
140 #endif
141 #include <netinet/ip6.h>
142 #include <netinet6/ip6_var.h>
143 #include <netinet6/in6_pcb.h>
144 #include <netinet6/ip6_var.h>
145 #include <netinet6/in6_var.h>
146 #include <netinet/icmp6.h>
147 #include <netinet6/nd6.h>
148 #endif
149
150 #ifndef INET6
151 /* always need ip6.h for IP6_EXTHDR_GET */
152 #include <netinet/ip6.h>
153 #endif
154
155 #include <netinet/tcp.h>
156 #include <netinet/tcp_fsm.h>
157 #include <netinet/tcp_seq.h>
158 #include <netinet/tcp_timer.h>
159 #include <netinet/tcp_var.h>
160 #include <netinet/tcpip.h>
161 #include <netinet/tcp_debug.h>
162
163 #include <machine/stdarg.h>
164
165 /* SACK block pool. */
166 static POOL_INIT(sackhole_pool, sizeof(struct sackhole), 0, 0, 0, "sackholepl",
167 NULL, IPL_SOFTNET);
168
169 static struct sackhole *
170 sack_allochole(struct tcpcb *tp)
171 {
172 struct sackhole *hole;
173
174 if (tp->snd_numholes >= tcp_sack_tp_maxholes ||
175 tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
176 return NULL;
177 }
178 hole = pool_get(&sackhole_pool, PR_NOWAIT);
179 if (hole == NULL) {
180 return NULL;
181 }
182 tp->snd_numholes++;
183 tcp_sack_globalholes++;
184
185 return hole;
186 }
187
188 static struct sackhole *
189 sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end,
190 struct sackhole *prev)
191 {
192 struct sackhole *hole;
193
194 hole = sack_allochole(tp);
195 if (hole == NULL) {
196 return NULL;
197 }
198 hole->start = hole->rxmit = start;
199 hole->end = end;
200 if (prev != NULL) {
201 TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q);
202 } else {
203 TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q);
204 }
205 return hole;
206 }
207
208 static struct sackhole *
209 sack_removehole(struct tcpcb *tp, struct sackhole *hole)
210 {
211 struct sackhole *next;
212
213 next = TAILQ_NEXT(hole, sackhole_q);
214 tp->snd_numholes--;
215 tcp_sack_globalholes--;
216 TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q);
217 pool_put(&sackhole_pool, hole);
218
219 return next;
220 }
221
222 void
223 tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len)
224 {
225 if (TCP_SACK_ENABLED(tp)) {
226 tp->rcv_dsack_block.left = seq;
227 tp->rcv_dsack_block.right = seq + len;
228 tp->rcv_sack_flags |= TCPSACK_HAVED;
229 }
230 }
231
232 void
233 tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp,
234 int optlen)
235 {
236 struct sackblk
237 t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)];
238 struct sackblk *sack = NULL;
239 struct sackhole *cur = NULL;
240 struct sackhole *tmp = NULL;
241 const char *lp = cp + 2;
242 int i, j, num_sack_blks;
243 tcp_seq left, right, acked;
244
245 /*
246 * If we aren't processing SACK responses, this is not an ACK
247 * or the peer sends us a sack option with invalid length, don't
248 * update the scoreboard.
249 */
250 if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) ||
251 (optlen % 8 != 2 || optlen < 10)) {
252 return;
253 }
254
255 /*
256 * If we don't want any SACK holes to be allocated, just return.
257 */
258 if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) {
259 return;
260 }
261
262 /* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */
263 if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))
264 return;
265
266 /*
267 * Extract SACK blocks.
268 *
269 * Note that t_sack_block is sorted so that we only need to do
270 * one pass over the sequence number space. (SACK "fast-path")
271 */
272 num_sack_blks = optlen / 8;
273 acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una;
274 for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) {
275 memcpy(&left, lp, sizeof(uint32_t));
276 memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t));
277 left = ntohl(left);
278 right = ntohl(right);
279
280 if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) ||
281 SEQ_GEQ(left, right)) {
282 /* SACK entry that's old, or invalid. */
283 i--;
284 num_sack_blks--;
285 continue;
286 }
287
288 /* Insertion sort. */
289 for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left);
290 j--) {
291 t_sack_block[j].left = t_sack_block[j - 1].left;
292 t_sack_block[j].right = t_sack_block[j - 1].right;
293 }
294 t_sack_block[j].left = left;
295 t_sack_block[j].right = right;
296 }
297
298 /* Update the scoreboard. */
299 cur = TAILQ_FIRST(&tp->snd_holes);
300 for (i = 0; i < num_sack_blks; i++) {
301 sack = &t_sack_block[i];
302 /*
303 * FACK TCP. Update snd_fack so we can enter Fast
304 * Recovery early.
305 */
306 if (SEQ_GEQ(sack->right, tp->snd_fack))
307 tp->snd_fack = sack->right;
308
309 if (TAILQ_EMPTY(&tp->snd_holes)) {
310 /* First hole. */
311 cur = sack_inserthole(tp, th->th_ack, sack->left, NULL);
312 if (cur == NULL) {
313 /* ENOBUFS, bail out*/
314 return;
315 }
316 tp->rcv_lastsack = sack->right;
317 continue; /* With next sack block */
318 }
319
320 /* Go through the list of holes. */
321 while (cur) {
322 if (SEQ_LEQ(sack->right, cur->start))
323 /* SACKs data before the current hole */
324 break; /* No use going through more holes */
325
326 if (SEQ_GEQ(sack->left, cur->end)) {
327 /* SACKs data beyond the current hole */
328 cur = TAILQ_NEXT(cur, sackhole_q);
329 continue;
330 }
331
332 if (SEQ_LEQ(sack->left, cur->start)) {
333 /* Data acks at least the beginning of hole */
334 if (SEQ_GEQ(sack->right, cur->end)) {
335 /* Acks entire hole, so delete hole */
336 cur = sack_removehole(tp, cur);
337 break;
338 }
339
340 /* Otherwise, move start of hole forward */
341 cur->start = sack->right;
342 cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
343 break;
344 }
345
346 if (SEQ_GEQ(sack->right, cur->end)) {
347 /* Move end of hole backward. */
348 cur->end = sack->left;
349 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
350 cur = TAILQ_NEXT(cur, sackhole_q);
351 break;
352 }
353
354 if (SEQ_LT(cur->start, sack->left) &&
355 SEQ_GT(cur->end, sack->right)) {
356 /*
357 * ACKs some data in middle of a hole; need to
358 * split current hole
359 */
360 tmp = sack_inserthole(tp, sack->right, cur->end,
361 cur);
362 if (tmp == NULL) {
363 return;
364 }
365 tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start);
366 cur->end = sack->left;
367 cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
368 cur = tmp;
369 break;
370 }
371 }
372
373 /* At this point, we have reached the tail of the list. */
374 if (SEQ_LT(tp->rcv_lastsack, sack->left)) {
375 /*
376 * Need to append new hole at end.
377 */
378 cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left,
379 NULL);
380 if (cur == NULL) {
381 return;
382 }
383 }
384 if (SEQ_LT(tp->rcv_lastsack, sack->right)) {
385 tp->rcv_lastsack = sack->right;
386 }
387 }
388 }
389
390 void
391 tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th)
392 {
393 /* Max because this could be an older ack that just arrived. */
394 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
395 th->th_ack : tp->snd_una;
396 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
397
398 while (cur) {
399 if (SEQ_LEQ(cur->end, lastack)) {
400 cur = sack_removehole(tp, cur);
401 } else if (SEQ_LT(cur->start, lastack)) {
402 cur->start = lastack;
403 if (SEQ_LT(cur->rxmit, cur->start))
404 cur->rxmit = cur->start;
405 break;
406 } else
407 break;
408 }
409 }
410
411 void
412 tcp_free_sackholes(struct tcpcb *tp)
413 {
414 struct sackhole *sack;
415
416 /* Free up the SACK hole list. */
417 while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
418 sack_removehole(tp, sack);
419 }
420 KASSERT(tp->snd_numholes == 0);
421 }
422
423 /*
424 * Implements the SACK response to a new ack, checking for partial acks
425 * in fast recovery.
426 */
427 void
428 tcp_sack_newack(struct tcpcb *tp, const struct tcphdr *th)
429 {
430 if (tp->t_partialacks < 0) {
431 /*
432 * Not in fast recovery. Reset the duplicate ack
433 * counter.
434 */
435 tp->t_dupacks = 0;
436 } else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
437 /*
438 * Partial ack handling within a sack recovery episode.
439 * Keeping this very simple for now. When a partial ack
440 * is received, force snd_cwnd to a value that will allow
441 * the sender to transmit no more than 2 segments.
442 * If necessary, a fancier scheme can be adopted at a
443 * later point, but for now, the goal is to prevent the
444 * sender from bursting a large amount of data in the midst
445 * of sack recovery.
446 */
447 int num_segs = 1;
448 int sack_bytes_rxmt = 0;
449
450 tp->t_partialacks++;
451 TCP_TIMER_DISARM(tp, TCPT_REXMT);
452 tp->t_rtttime = 0;
453
454 /*
455 * send one or 2 segments based on how much new data was acked
456 */
457 if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
458 num_segs = 2;
459 (void)tcp_sack_output(tp, &sack_bytes_rxmt);
460 tp->snd_cwnd = sack_bytes_rxmt +
461 (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_segsz;
462 tp->t_flags |= TF_ACKNOW;
463 (void) tcp_output(tp);
464 } else {
465 /*
466 * Complete ack, inflate the congestion window to
467 * ssthresh and exit fast recovery.
468 *
469 * Window inflation should have left us with approx.
470 * snd_ssthresh outstanding data. But in case we
471 * would be inclined to send a burst, better to do
472 * it via the slow start mechanism.
473 */
474 if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
475 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
476 + tp->t_segsz;
477 else
478 tp->snd_cwnd = tp->snd_ssthresh;
479 tp->t_partialacks = -1;
480 tp->t_dupacks = 0;
481 if (SEQ_GT(th->th_ack, tp->snd_fack))
482 tp->snd_fack = th->th_ack;
483 }
484 }
485
486 /*
487 * Returns pointer to a sackhole if there are any pending retransmissions;
488 * NULL otherwise.
489 */
490 struct sackhole *
491 tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
492 {
493 struct sackhole *cur = NULL;
494
495 if (!TCP_SACK_ENABLED(tp))
496 return (NULL);
497
498 *sack_bytes_rexmt = 0;
499 TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
500 if (SEQ_LT(cur->rxmit, cur->end)) {
501 if (SEQ_LT(cur->rxmit, tp->snd_una)) {
502 /* old SACK hole */
503 continue;
504 }
505 *sack_bytes_rexmt += (cur->rxmit - cur->start);
506 break;
507 }
508 *sack_bytes_rexmt += (cur->rxmit - cur->start);
509 }
510
511 return (cur);
512 }
513
514 /*
515 * After a timeout, the SACK list may be rebuilt. This SACK information
516 * should be used to avoid retransmitting SACKed data. This function
517 * traverses the SACK list to see if snd_nxt should be moved forward.
518 */
519 void
520 tcp_sack_adjust(struct tcpcb *tp)
521 {
522 struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
523 struct sackhole *n = NULL;
524
525 if (TAILQ_EMPTY(&tp->snd_holes))
526 return; /* No holes */
527 if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
528 return; /* We're already beyond any SACKed blocks */
529
530 /*
531 * Two cases for which we want to advance snd_nxt:
532 * i) snd_nxt lies between end of one hole and beginning of another
533 * ii) snd_nxt lies between end of last hole and rcv_lastsack
534 */
535 while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) {
536 if (SEQ_LT(tp->snd_nxt, cur->end))
537 return;
538 if (SEQ_GEQ(tp->snd_nxt, n->start))
539 cur = n;
540 else {
541 tp->snd_nxt = n->start;
542 return;
543 }
544 }
545 if (SEQ_LT(tp->snd_nxt, cur->end))
546 return;
547 tp->snd_nxt = tp->rcv_lastsack;
548
549 return;
550 }
551
552 int
553 tcp_sack_numblks(const struct tcpcb *tp)
554 {
555 int numblks;
556
557 if (!TCP_SACK_ENABLED(tp)) {
558 return 0;
559 }
560
561 numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) +
562 tp->t_segqlen;
563
564 if (numblks == 0) {
565 return 0;
566 }
567
568 if (numblks > TCP_SACK_MAX) {
569 numblks = TCP_SACK_MAX;
570 }
571
572 return numblks;
573 }
574
#if defined(DDB)
void sack_dump(const struct tcpcb *);

/*
 * Debugging aid (built only with DDB): print the SACK-related sequence
 * variables of `tp', the hole count, and one line per hole in
 * tp->snd_holes giving its start, end and rxmit values.
 */
void
sack_dump(const struct tcpcb *tp)
{
	const struct sackhole *hole;

	printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n",
	    tp->snd_una, tp->snd_max);
	printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n",
	    tp->rcv_lastsack, tp->snd_fack);
	printf("numholes=%d\n", tp->snd_numholes);

	for (hole = TAILQ_FIRST(&tp->snd_holes); hole != NULL;
	    hole = TAILQ_NEXT(hole, sackhole_q)) {
		printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n",
		    hole->start, hole->end, hole->rxmit);
	}
}
#endif /* defined(DDB) */
Cache object: 4657d54a0f715481075e66f085957d82
|