FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_lro.c
1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 2007, Myricom Inc.
5 * Copyright (c) 2008, Intel Corporation.
6 * Copyright (c) 2012 The FreeBSD Foundation
7 * Copyright (c) 2016 Mellanox Technologies.
8 * All rights reserved.
9 *
10 * Portions of this software were developed by Bjoern Zeeb
11 * under sponsorship from the FreeBSD Foundation.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37
38 #include "opt_inet.h"
39 #include "opt_inet6.h"
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/malloc.h>
45 #include <sys/mbuf.h>
46 #include <sys/socket.h>
47 #include <sys/sysctl.h>
48
49 #include <net/if.h>
50 #include <net/if_var.h>
51 #include <net/ethernet.h>
52 #include <net/vnet.h>
53
54 #include <netinet/in_systm.h>
55 #include <netinet/in.h>
56 #include <netinet/ip6.h>
57 #include <netinet/ip.h>
58 #include <netinet/ip_var.h>
59 #include <netinet/tcp.h>
60 #include <netinet/tcp_seq.h>
61 #include <netinet/tcp_lro.h>
62 #include <netinet/tcp_var.h>
63
64 #include <netinet6/ip6_var.h>
65
66 #include <machine/in_cksum.h>
67
/* Malloc type tag passed to the malloc()/free() calls made in this file. */
68 static MALLOC_DEFINE(M_LRO, "LRO", "LRO control structures");
69
/*
 * TCP_LRO_UPDATE_CSUM is defined here, so the checksum-updating code
 * guarded by "#ifdef TCP_LRO_UPDATE_CSUM" is compiled in and the
 * TCP_LRO_INVALID_CSUM placeholder below is left undefined.
 */
70 #define TCP_LRO_UPDATE_CSUM 1
71 #ifndef TCP_LRO_UPDATE_CSUM
72 #define TCP_LRO_INVALID_CSUM 0x0000
73 #endif
74
/* Forward declarations for helpers that are used before their definition. */
75 static void tcp_lro_rx_done(struct lro_ctrl *lc);
76 static int tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m,
77 uint32_t csum, int use_hash);
78
/* Sysctl node under which the LRO knobs of this file are registered. */
79 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, lro, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
80 "TCP LRO");
81
/* Entry count that tcp_lro_init() hands to tcp_lro_init_args(). */
82 static unsigned tcp_lro_entries = TCP_LRO_ENTRIES;
83 SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, entries,
84 CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
85 "default number of LRO entries");
86
87 static __inline void
88 tcp_lro_active_insert(struct lro_ctrl *lc, struct lro_head *bucket,
89 struct lro_entry *le)
90 {
91
92 LIST_INSERT_HEAD(&lc->lro_active, le, next);
93 LIST_INSERT_HEAD(bucket, le, hash_next);
94 }
95
96 static __inline void
97 tcp_lro_active_remove(struct lro_entry *le)
98 {
99
100 LIST_REMOVE(le, next); /* active list */
101 LIST_REMOVE(le, hash_next); /* hash bucket */
102 }
103
104 int
105 tcp_lro_init(struct lro_ctrl *lc)
106 {
107 return (tcp_lro_init_args(lc, NULL, tcp_lro_entries, 0));
108 }
109
110 int
111 tcp_lro_init_args(struct lro_ctrl *lc, struct ifnet *ifp,
112 unsigned lro_entries, unsigned lro_mbufs)
113 {
114 struct lro_entry *le;
115 size_t size;
116 unsigned i, elements;
117
118 lc->lro_bad_csum = 0;
119 lc->lro_queued = 0;
120 lc->lro_flushed = 0;
121 lc->lro_mbuf_count = 0;
122 lc->lro_mbuf_max = lro_mbufs;
123 lc->lro_cnt = lro_entries;
124 lc->lro_ackcnt_lim = TCP_LRO_ACKCNT_MAX;
125 lc->lro_length_lim = TCP_LRO_LENGTH_MAX;
126 lc->ifp = ifp;
127 LIST_INIT(&lc->lro_free);
128 LIST_INIT(&lc->lro_active);
129
130 /* create hash table to accelerate entry lookup */
131 if (lro_entries > lro_mbufs)
132 elements = lro_entries;
133 else
134 elements = lro_mbufs;
135 lc->lro_hash = phashinit_flags(elements, M_LRO, &lc->lro_hashsz,
136 HASH_NOWAIT);
137 if (lc->lro_hash == NULL) {
138 memset(lc, 0, sizeof(*lc));
139 return (ENOMEM);
140 }
141
142 /* compute size to allocate */
143 size = (lro_mbufs * sizeof(struct lro_mbuf_sort)) +
144 (lro_entries * sizeof(*le));
145 lc->lro_mbuf_data = (struct lro_mbuf_sort *)
146 malloc(size, M_LRO, M_NOWAIT | M_ZERO);
147
148 /* check for out of memory */
149 if (lc->lro_mbuf_data == NULL) {
150 free(lc->lro_hash, M_LRO);
151 memset(lc, 0, sizeof(*lc));
152 return (ENOMEM);
153 }
154 /* compute offset for LRO entries */
155 le = (struct lro_entry *)
156 (lc->lro_mbuf_data + lro_mbufs);
157
158 /* setup linked list */
159 for (i = 0; i != lro_entries; i++)
160 LIST_INSERT_HEAD(&lc->lro_free, le + i, next);
161
162 return (0);
163 }
164
165 void
166 tcp_lro_free(struct lro_ctrl *lc)
167 {
168 struct lro_entry *le;
169 unsigned x;
170
171 /* reset LRO free list */
172 LIST_INIT(&lc->lro_free);
173
174 /* free active mbufs, if any */
175 while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
176 tcp_lro_active_remove(le);
177 m_freem(le->m_head);
178 }
179
180 /* free hash table */
181 free(lc->lro_hash, M_LRO);
182 lc->lro_hash = NULL;
183 lc->lro_hashsz = 0;
184
185 /* free mbuf array, if any */
186 for (x = 0; x != lc->lro_mbuf_count; x++)
187 m_freem(lc->lro_mbuf_data[x].mb);
188 lc->lro_mbuf_count = 0;
189
190 /* free allocated memory, if any */
191 free(lc->lro_mbuf_data, M_LRO);
192 lc->lro_mbuf_data = NULL;
193 }
194
195 #ifdef TCP_LRO_UPDATE_CSUM
196 static uint16_t
197 tcp_lro_csum_th(struct tcphdr *th)
198 {
199 uint32_t ch;
200 uint16_t *p, l;
201
202 ch = th->th_sum = 0x0000;
203 l = th->th_off;
204 p = (uint16_t *)th;
205 while (l > 0) {
206 ch += *p;
207 p++;
208 ch += *p;
209 p++;
210 l--;
211 }
212 while (ch > 0xffff)
213 ch = (ch >> 16) + (ch & 0xffff);
214
215 return (ch & 0xffff);
216 }
217
218 static uint16_t
219 tcp_lro_rx_csum_fixup(struct lro_entry *le, void *l3hdr, struct tcphdr *th,
220 uint16_t tcp_data_len, uint16_t csum)
221 {
222 uint32_t c;
223 uint16_t cs;
224
225 c = csum;
226
227 /* Remove length from checksum. */
228 switch (le->eh_type) {
229 #ifdef INET6
230 case ETHERTYPE_IPV6:
231 {
232 struct ip6_hdr *ip6;
233
234 ip6 = (struct ip6_hdr *)l3hdr;
235 if (le->append_cnt == 0)
236 cs = ip6->ip6_plen;
237 else {
238 uint32_t cx;
239
240 cx = ntohs(ip6->ip6_plen);
241 cs = in6_cksum_pseudo(ip6, cx, ip6->ip6_nxt, 0);
242 }
243 break;
244 }
245 #endif
246 #ifdef INET
247 case ETHERTYPE_IP:
248 {
249 struct ip *ip4;
250
251 ip4 = (struct ip *)l3hdr;
252 if (le->append_cnt == 0)
253 cs = ip4->ip_len;
254 else {
255 cs = in_addword(ntohs(ip4->ip_len) - sizeof(*ip4),
256 IPPROTO_TCP);
257 cs = in_pseudo(ip4->ip_src.s_addr, ip4->ip_dst.s_addr,
258 htons(cs));
259 }
260 break;
261 }
262 #endif
263 default:
264 cs = 0; /* Keep compiler happy. */
265 }
266
267 cs = ~cs;
268 c += cs;
269
270 /* Remove TCP header csum. */
271 cs = ~tcp_lro_csum_th(th);
272 c += cs;
273 while (c > 0xffff)
274 c = (c >> 16) + (c & 0xffff);
275
276 return (c & 0xffff);
277 }
278 #endif
279
280 static void
281 tcp_lro_rx_done(struct lro_ctrl *lc)
282 {
283 struct lro_entry *le;
284
285 while ((le = LIST_FIRST(&lc->lro_active)) != NULL) {
286 tcp_lro_active_remove(le);
287 tcp_lro_flush(lc, le);
288 }
289 }
290
291 void
292 tcp_lro_flush_inactive(struct lro_ctrl *lc, const struct timeval *timeout)
293 {
294 struct lro_entry *le, *le_tmp;
295 struct timeval tv;
296
297 if (LIST_EMPTY(&lc->lro_active))
298 return;
299
300 getmicrotime(&tv);
301 timevalsub(&tv, timeout);
302 LIST_FOREACH_SAFE(le, &lc->lro_active, next, le_tmp) {
303 if (timevalcmp(&tv, &le->mtime, >=)) {
304 tcp_lro_active_remove(le);
305 tcp_lro_flush(lc, le);
306 }
307 }
308 }
309
310 void
311 tcp_lro_flush(struct lro_ctrl *lc, struct lro_entry *le)
312 {
313
314 if (le->append_cnt > 0) {
315 struct tcphdr *th;
316 uint16_t p_len;
317
318 p_len = htons(le->p_len);
319 switch (le->eh_type) {
320 #ifdef INET6
321 case ETHERTYPE_IPV6:
322 {
323 struct ip6_hdr *ip6;
324
325 ip6 = le->le_ip6;
326 ip6->ip6_plen = p_len;
327 th = (struct tcphdr *)(ip6 + 1);
328 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
329 CSUM_PSEUDO_HDR;
330 le->p_len += ETHER_HDR_LEN + sizeof(*ip6);
331 break;
332 }
333 #endif
334 #ifdef INET
335 case ETHERTYPE_IP:
336 {
337 struct ip *ip4;
338 #ifdef TCP_LRO_UPDATE_CSUM
339 uint32_t cl;
340 uint16_t c;
341 #endif
342
343 ip4 = le->le_ip4;
344 #ifdef TCP_LRO_UPDATE_CSUM
345 /* Fix IP header checksum for new length. */
346 c = ~ip4->ip_sum;
347 cl = c;
348 c = ~ip4->ip_len;
349 cl += c + p_len;
350 while (cl > 0xffff)
351 cl = (cl >> 16) + (cl & 0xffff);
352 c = cl;
353 ip4->ip_sum = ~c;
354 #else
355 ip4->ip_sum = TCP_LRO_INVALID_CSUM;
356 #endif
357 ip4->ip_len = p_len;
358 th = (struct tcphdr *)(ip4 + 1);
359 le->m_head->m_pkthdr.csum_flags = CSUM_DATA_VALID |
360 CSUM_PSEUDO_HDR | CSUM_IP_CHECKED | CSUM_IP_VALID;
361 le->p_len += ETHER_HDR_LEN;
362 break;
363 }
364 #endif
365 default:
366 th = NULL; /* Keep compiler happy. */
367 }
368 le->m_head->m_pkthdr.csum_data = 0xffff;
369 le->m_head->m_pkthdr.len = le->p_len;
370
371 /* Incorporate the latest ACK into the TCP header. */
372 th->th_ack = le->ack_seq;
373 th->th_win = le->window;
374 /* Incorporate latest timestamp into the TCP header. */
375 if (le->timestamp != 0) {
376 uint32_t *ts_ptr;
377
378 ts_ptr = (uint32_t *)(th + 1);
379 ts_ptr[1] = htonl(le->tsval);
380 ts_ptr[2] = le->tsecr;
381 }
382 #ifdef TCP_LRO_UPDATE_CSUM
383 /* Update the TCP header checksum. */
384 le->ulp_csum += p_len;
385 le->ulp_csum += tcp_lro_csum_th(th);
386 while (le->ulp_csum > 0xffff)
387 le->ulp_csum = (le->ulp_csum >> 16) +
388 (le->ulp_csum & 0xffff);
389 th->th_sum = (le->ulp_csum & 0xffff);
390 th->th_sum = ~th->th_sum;
391 #else
392 th->th_sum = TCP_LRO_INVALID_CSUM;
393 #endif
394 }
395
396 le->m_head->m_pkthdr.lro_nsegs = le->append_cnt + 1;
397 (*lc->ifp->if_input)(lc->ifp, le->m_head);
398 lc->lro_queued += le->append_cnt + 1;
399 lc->lro_flushed++;
400 bzero(le, sizeof(*le));
401 LIST_INSERT_HEAD(&lc->lro_free, le, next);
402 }
403
/*
 * tcp_lro_msb_64(x) yields a value with only the most significant set
 * bit of x set.  The function variant returns 0 for x == 0.
 */
#ifdef HAVE_INLINE_FLSLL
#define	tcp_lro_msb_64(x) (1ULL << (flsll(x) - 1))
#else
static inline uint64_t
tcp_lro_msb_64(uint64_t x)
{
	unsigned shift;

	/* Smear the top set bit into every lower bit position. */
	for (shift = 1; shift < 64; shift <<= 1)
		x |= (x >> shift);

	/* x is now a solid run of ones; keep only its top bit. */
	return (x ^ (x >> 1));
}
#endif
419
420 /*
421 * The tcp_lro_sort() routine is comparable to qsort(), except it has
422 * a worst case complexity limit of O(MIN(N,64)*N), where N is the
423 * number of elements to sort and 64 is the number of sequence bits
424 * available. The algorithm is bit-slicing the 64-bit sequence number,
425 * sorting one bit at a time from the most significant bit until the
426 * least significant one, skipping the constant bits. This is
427 * typically called a radix sort.
428 */
429 static void
430 tcp_lro_sort(struct lro_mbuf_sort *parray, uint32_t size)
431 {
432 struct lro_mbuf_sort temp;
433 uint64_t ones;
434 uint64_t zeros;
435 uint32_t x;
436 uint32_t y;
437
438 repeat:
439 /* for small arrays insertion sort is faster */
440 if (size <= 12) {
441 for (x = 1; x < size; x++) {
442 temp = parray[x];
443 for (y = x; y > 0 && temp.seq < parray[y - 1].seq; y--)
444 parray[y] = parray[y - 1];
445 parray[y] = temp;
446 }
447 return;
448 }
449
450 /* compute sequence bits which are constant */
451 ones = 0;
452 zeros = 0;
453 for (x = 0; x != size; x++) {
454 ones |= parray[x].seq;
455 zeros |= ~parray[x].seq;
456 }
457
458 /* compute bits which are not constant into "ones" */
459 ones &= zeros;
460 if (ones == 0)
461 return;
462
463 /* pick the most significant bit which is not constant */
464 ones = tcp_lro_msb_64(ones);
465
466 /*
467 * Move entries having cleared sequence bits to the beginning
468 * of the array:
469 */
470 for (x = y = 0; y != size; y++) {
471 /* skip set bits */
472 if (parray[y].seq & ones)
473 continue;
474 /* swap entries */
475 temp = parray[x];
476 parray[x] = parray[y];
477 parray[y] = temp;
478 x++;
479 }
480
481 KASSERT(x != 0 && x != size, ("Memory is corrupted\n"));
482
483 /* sort zeros */
484 tcp_lro_sort(parray, x);
485
486 /* sort ones */
487 parray += x;
488 size -= x;
489 goto repeat;
490 }
491
492 void
493 tcp_lro_flush_all(struct lro_ctrl *lc)
494 {
495 uint64_t seq;
496 uint64_t nseq;
497 unsigned x;
498
499 /* check if no mbufs to flush */
500 if (lc->lro_mbuf_count == 0)
501 goto done;
502
503 /* sort all mbufs according to stream */
504 tcp_lro_sort(lc->lro_mbuf_data, lc->lro_mbuf_count);
505
506 /* input data into LRO engine, stream by stream */
507 seq = 0;
508 for (x = 0; x != lc->lro_mbuf_count; x++) {
509 struct mbuf *mb;
510
511 /* get mbuf */
512 mb = lc->lro_mbuf_data[x].mb;
513
514 /* get sequence number, masking away the packet index */
515 nseq = lc->lro_mbuf_data[x].seq & (-1ULL << 24);
516
517 /* check for new stream */
518 if (seq != nseq) {
519 seq = nseq;
520
521 /* flush active streams */
522 tcp_lro_rx_done(lc);
523 }
524
525 /* add packet to LRO engine */
526 if (tcp_lro_rx2(lc, mb, 0, 0) != 0) {
527 /* input packet to network layer */
528 (*lc->ifp->if_input)(lc->ifp, mb);
529 lc->lro_queued++;
530 lc->lro_flushed++;
531 }
532 }
533 done:
534 /* flush active streams */
535 tcp_lro_rx_done(lc);
536
537 lc->lro_mbuf_count = 0;
538 }
539
#ifdef INET6
/*
 * Locate the TCP header of an IPv6 packet.
 *
 * Only packets whose next header is TCP are accepted; on success *th is
 * set to the byte right after the fixed IPv6 header and 0 is returned.
 * Anything else yields TCP_LRO_NOT_SUPPORTED.  "lc" and "m" are not
 * used.
 */
static int
tcp_lro_rx_ipv6(struct lro_ctrl *lc, struct mbuf *m, struct ip6_hdr *ip6,
    struct tcphdr **th)
{

	/* XXX-BZ we should check the flow-label. */

	/* XXX-BZ We do not yet support ext. hdrs. */
	if (ip6->ip6_nxt == IPPROTO_TCP) {
		/* The TCP header directly follows the IPv6 header. */
		*th = (struct tcphdr *)(ip6 + 1);
		return (0);
	}

	return (TCP_LRO_NOT_SUPPORTED);
}
#endif
558
#ifdef INET
/*
 * Check that an IPv4 packet qualifies for LRO and locate its TCP header.
 *
 * Returns TCP_LRO_NOT_SUPPORTED when the protocol is not TCP, and
 * TCP_LRO_CANNOT when the header carries options, the packet is a
 * fragment, or the IP header checksum is bad (lc->lro_bad_csum is
 * incremented in the last case).  Otherwise *th is set and 0 returned.
 */
static int
tcp_lro_rx_ipv4(struct lro_ctrl *lc, struct mbuf *m, struct ip *ip4,
    struct tcphdr **th)
{
	int flags;
	int hdr_ok;

	if (ip4->ip_p != IPPROTO_TCP)
		return (TCP_LRO_NOT_SUPPORTED);

	/* Ensure there are no options. */
	if ((ip4->ip_hl << 2) != sizeof (*ip4))
		return (TCP_LRO_CANNOT);

	/* .. and the packet is not fragmented. */
	if ((ip4->ip_off & htons(IP_MF|IP_OFFMASK)) != 0)
		return (TCP_LRO_CANNOT);

	/*
	 * Legacy IP has a header checksum that needs to be correct.
	 * Use the result in the csum flags when the header was already
	 * checked; otherwise verify it in software.
	 */
	flags = m->m_pkthdr.csum_flags;
	if ((flags & CSUM_IP_CHECKED) != 0)
		hdr_ok = ((flags & CSUM_IP_VALID) != 0);
	else
		hdr_ok = (in_cksum_hdr(ip4) == 0);

	if (__predict_false(!hdr_ok)) {
		lc->lro_bad_csum++;
		return (TCP_LRO_CANNOT);
	}

	/* Find the TCP header (we assured there are no IP options). */
	*th = (struct tcphdr *)(ip4 + 1);

	return (0);
}
#endif
599
600 static int
601 tcp_lro_rx2(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum, int use_hash)
602 {
603 struct lro_entry *le;
604 struct ether_header *eh;
605 #ifdef INET6
606 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
607 #endif
608 #ifdef INET
609 struct ip *ip4 = NULL; /* Keep compiler happy. */
610 #endif
611 struct tcphdr *th;
612 void *l3hdr = NULL; /* Keep compiler happy. */
613 uint32_t *ts_ptr;
614 tcp_seq seq;
615 int error, ip_len, l;
616 uint16_t eh_type, tcp_data_len;
617 struct lro_head *bucket;
618 int force_flush = 0;
619
620 /* We expect a contiguous header [eh, ip, tcp]. */
621
622 eh = mtod(m, struct ether_header *);
623 eh_type = ntohs(eh->ether_type);
624 switch (eh_type) {
625 #ifdef INET6
626 case ETHERTYPE_IPV6:
627 {
628 CURVNET_SET(lc->ifp->if_vnet);
629 if (V_ip6_forwarding != 0) {
630 /* XXX-BZ stats but changing lro_ctrl is a problem. */
631 CURVNET_RESTORE();
632 return (TCP_LRO_CANNOT);
633 }
634 CURVNET_RESTORE();
635 l3hdr = ip6 = (struct ip6_hdr *)(eh + 1);
636 error = tcp_lro_rx_ipv6(lc, m, ip6, &th);
637 if (error != 0)
638 return (error);
639 tcp_data_len = ntohs(ip6->ip6_plen);
640 ip_len = sizeof(*ip6) + tcp_data_len;
641 break;
642 }
643 #endif
644 #ifdef INET
645 case ETHERTYPE_IP:
646 {
647 CURVNET_SET(lc->ifp->if_vnet);
648 if (V_ipforwarding != 0) {
649 /* XXX-BZ stats but changing lro_ctrl is a problem. */
650 CURVNET_RESTORE();
651 return (TCP_LRO_CANNOT);
652 }
653 CURVNET_RESTORE();
654 l3hdr = ip4 = (struct ip *)(eh + 1);
655 error = tcp_lro_rx_ipv4(lc, m, ip4, &th);
656 if (error != 0)
657 return (error);
658 ip_len = ntohs(ip4->ip_len);
659 tcp_data_len = ip_len - sizeof(*ip4);
660 break;
661 }
662 #endif
663 /* XXX-BZ what happens in case of VLAN(s)? */
664 default:
665 return (TCP_LRO_NOT_SUPPORTED);
666 }
667
668 /*
669 * If the frame is padded beyond the end of the IP packet, then we must
670 * trim the extra bytes off.
671 */
672 l = m->m_pkthdr.len - (ETHER_HDR_LEN + ip_len);
673 if (l != 0) {
674 if (l < 0)
675 /* Truncated packet. */
676 return (TCP_LRO_CANNOT);
677
678 m_adj(m, -l);
679 }
680
681 /*
682 * Check TCP header constraints.
683 */
684 /* Ensure no bits set besides ACK or PSH. */
685 if ((th->th_flags & ~(TH_ACK | TH_PUSH)) != 0) {
686 if (th->th_flags & TH_SYN)
687 return (TCP_LRO_CANNOT);
688 /*
689 * Make sure that previously seen segements/ACKs are delivered
690 * before this segement, e.g. FIN.
691 */
692 force_flush = 1;
693 }
694
695 /* XXX-BZ We lose a ACK|PUSH flag concatenating multiple segments. */
696 /* XXX-BZ Ideally we'd flush on PUSH? */
697
698 /*
699 * Check for timestamps.
700 * Since the only option we handle are timestamps, we only have to
701 * handle the simple case of aligned timestamps.
702 */
703 l = (th->th_off << 2);
704 tcp_data_len -= l;
705 l -= sizeof(*th);
706 ts_ptr = (uint32_t *)(th + 1);
707 if (l != 0 && (__predict_false(l != TCPOLEN_TSTAMP_APPA) ||
708 (*ts_ptr != ntohl(TCPOPT_NOP<<24|TCPOPT_NOP<<16|
709 TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)))) {
710 /*
711 * Make sure that previously seen segements/ACKs are delivered
712 * before this segement.
713 */
714 force_flush = 1;
715 }
716
717 /* If the driver did not pass in the checksum, set it now. */
718 if (csum == 0x0000)
719 csum = th->th_sum;
720
721 seq = ntohl(th->th_seq);
722
723 if (!use_hash) {
724 bucket = &lc->lro_hash[0];
725 } else if (M_HASHTYPE_ISHASH(m)) {
726 bucket = &lc->lro_hash[m->m_pkthdr.flowid % lc->lro_hashsz];
727 } else {
728 uint32_t hash;
729
730 switch (eh_type) {
731 #ifdef INET
732 case ETHERTYPE_IP:
733 hash = ip4->ip_src.s_addr + ip4->ip_dst.s_addr;
734 break;
735 #endif
736 #ifdef INET6
737 case ETHERTYPE_IPV6:
738 hash = ip6->ip6_src.s6_addr32[0] +
739 ip6->ip6_dst.s6_addr32[0];
740 hash += ip6->ip6_src.s6_addr32[1] +
741 ip6->ip6_dst.s6_addr32[1];
742 hash += ip6->ip6_src.s6_addr32[2] +
743 ip6->ip6_dst.s6_addr32[2];
744 hash += ip6->ip6_src.s6_addr32[3] +
745 ip6->ip6_dst.s6_addr32[3];
746 break;
747 #endif
748 default:
749 hash = 0;
750 break;
751 }
752 hash += th->th_sport + th->th_dport;
753 bucket = &lc->lro_hash[hash % lc->lro_hashsz];
754 }
755
756 /* Try to find a matching previous segment. */
757 LIST_FOREACH(le, bucket, hash_next) {
758 if (le->eh_type != eh_type)
759 continue;
760 if (le->source_port != th->th_sport ||
761 le->dest_port != th->th_dport)
762 continue;
763 switch (eh_type) {
764 #ifdef INET6
765 case ETHERTYPE_IPV6:
766 if (bcmp(&le->source_ip6, &ip6->ip6_src,
767 sizeof(struct in6_addr)) != 0 ||
768 bcmp(&le->dest_ip6, &ip6->ip6_dst,
769 sizeof(struct in6_addr)) != 0)
770 continue;
771 break;
772 #endif
773 #ifdef INET
774 case ETHERTYPE_IP:
775 if (le->source_ip4 != ip4->ip_src.s_addr ||
776 le->dest_ip4 != ip4->ip_dst.s_addr)
777 continue;
778 break;
779 #endif
780 }
781
782 if (force_flush) {
783 /* Timestamps mismatch; this is a FIN, etc */
784 tcp_lro_active_remove(le);
785 tcp_lro_flush(lc, le);
786 return (TCP_LRO_CANNOT);
787 }
788
789 /* Flush now if appending will result in overflow. */
790 if (le->p_len > (lc->lro_length_lim - tcp_data_len)) {
791 tcp_lro_active_remove(le);
792 tcp_lro_flush(lc, le);
793 break;
794 }
795
796 /* Try to append the new segment. */
797 if (__predict_false(seq != le->next_seq ||
798 (tcp_data_len == 0 &&
799 le->ack_seq == th->th_ack &&
800 le->window == th->th_win))) {
801 /* Out of order packet or duplicate ACK. */
802 tcp_lro_active_remove(le);
803 tcp_lro_flush(lc, le);
804 return (TCP_LRO_CANNOT);
805 }
806
807 if (l != 0) {
808 uint32_t tsval = ntohl(*(ts_ptr + 1));
809 /* Make sure timestamp values are increasing. */
810 /* XXX-BZ flip and use TSTMP_GEQ macro for this? */
811 if (__predict_false(le->tsval > tsval ||
812 *(ts_ptr + 2) == 0))
813 return (TCP_LRO_CANNOT);
814 le->tsval = tsval;
815 le->tsecr = *(ts_ptr + 2);
816 }
817 if (tcp_data_len || SEQ_GT(ntohl(th->th_ack), ntohl(le->ack_seq))) {
818 le->next_seq += tcp_data_len;
819 le->ack_seq = th->th_ack;
820 le->window = th->th_win;
821 le->append_cnt++;
822 } else if (th->th_ack == le->ack_seq) {
823 le->window = WIN_MAX(le->window, th->th_win);
824 le->append_cnt++;
825 } else {
826 /* no data and old ack */
827 le->append_cnt++;
828 m_freem(m);
829 return (0);
830 }
831 #ifdef TCP_LRO_UPDATE_CSUM
832 le->ulp_csum += tcp_lro_rx_csum_fixup(le, l3hdr, th,
833 tcp_data_len, ~csum);
834 #endif
835
836 if (tcp_data_len == 0) {
837 m_freem(m);
838 /*
839 * Flush this LRO entry, if this ACK should not
840 * be further delayed.
841 */
842 if (le->append_cnt >= lc->lro_ackcnt_lim) {
843 tcp_lro_active_remove(le);
844 tcp_lro_flush(lc, le);
845 }
846 return (0);
847 }
848
849 le->p_len += tcp_data_len;
850
851 /*
852 * Adjust the mbuf so that m_data points to the first byte of
853 * the ULP payload. Adjust the mbuf to avoid complications and
854 * append new segment to existing mbuf chain.
855 */
856 m_adj(m, m->m_pkthdr.len - tcp_data_len);
857 m_demote_pkthdr(m);
858
859 le->m_tail->m_next = m;
860 le->m_tail = m_last(m);
861
862 /*
863 * If a possible next full length packet would cause an
864 * overflow, pro-actively flush now.
865 */
866 if (le->p_len > (lc->lro_length_lim - lc->ifp->if_mtu)) {
867 tcp_lro_active_remove(le);
868 tcp_lro_flush(lc, le);
869 } else
870 getmicrotime(&le->mtime);
871
872 return (0);
873 }
874
875 if (force_flush) {
876 /*
877 * Nothing to flush, but this segment can not be further
878 * aggregated/delayed.
879 */
880 return (TCP_LRO_CANNOT);
881 }
882
883 /* Try to find an empty slot. */
884 if (LIST_EMPTY(&lc->lro_free))
885 return (TCP_LRO_NO_ENTRIES);
886
887 /* Start a new segment chain. */
888 le = LIST_FIRST(&lc->lro_free);
889 LIST_REMOVE(le, next);
890 tcp_lro_active_insert(lc, bucket, le);
891 getmicrotime(&le->mtime);
892
893 /* Start filling in details. */
894 switch (eh_type) {
895 #ifdef INET6
896 case ETHERTYPE_IPV6:
897 le->le_ip6 = ip6;
898 le->source_ip6 = ip6->ip6_src;
899 le->dest_ip6 = ip6->ip6_dst;
900 le->eh_type = eh_type;
901 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN - sizeof(*ip6);
902 break;
903 #endif
904 #ifdef INET
905 case ETHERTYPE_IP:
906 le->le_ip4 = ip4;
907 le->source_ip4 = ip4->ip_src.s_addr;
908 le->dest_ip4 = ip4->ip_dst.s_addr;
909 le->eh_type = eh_type;
910 le->p_len = m->m_pkthdr.len - ETHER_HDR_LEN;
911 break;
912 #endif
913 }
914 le->source_port = th->th_sport;
915 le->dest_port = th->th_dport;
916
917 le->next_seq = seq + tcp_data_len;
918 le->ack_seq = th->th_ack;
919 le->window = th->th_win;
920 if (l != 0) {
921 le->timestamp = 1;
922 le->tsval = ntohl(*(ts_ptr + 1));
923 le->tsecr = *(ts_ptr + 2);
924 }
925
926 #ifdef TCP_LRO_UPDATE_CSUM
927 /*
928 * Do not touch the csum of the first packet. However save the
929 * "adjusted" checksum of just the source and destination addresses,
930 * the next header and the TCP payload. The length and TCP header
931 * parts may change, so we remove those from the saved checksum and
932 * re-add with final values on tcp_lro_flush() if needed.
933 */
934 KASSERT(le->ulp_csum == 0, ("%s: le=%p le->ulp_csum=0x%04x\n",
935 __func__, le, le->ulp_csum));
936
937 le->ulp_csum = tcp_lro_rx_csum_fixup(le, l3hdr, th, tcp_data_len,
938 ~csum);
939 th->th_sum = csum; /* Restore checksum on first packet. */
940 #endif
941
942 le->m_head = m;
943 le->m_tail = m_last(m);
944
945 return (0);
946 }
947
/*
 * Public entry point: feed one received frame to the LRO engine with
 * hash-based bucket selection enabled.  Returns the result of
 * tcp_lro_rx2() unchanged.
 */
int
tcp_lro_rx(struct lro_ctrl *lc, struct mbuf *m, uint32_t csum)
{
	int error;

	error = tcp_lro_rx2(lc, m, csum, 1);
	return (error);
}
954
955 void
956 tcp_lro_queue_mbuf(struct lro_ctrl *lc, struct mbuf *mb)
957 {
958 /* sanity checks */
959 if (__predict_false(lc->ifp == NULL || lc->lro_mbuf_data == NULL ||
960 lc->lro_mbuf_max == 0)) {
961 /* packet drop */
962 m_freem(mb);
963 return;
964 }
965
966 /* check if packet is not LRO capable */
967 if (__predict_false(mb->m_pkthdr.csum_flags == 0 ||
968 (lc->ifp->if_capenable & IFCAP_LRO) == 0)) {
969
970 /* input packet to network layer */
971 (*lc->ifp->if_input) (lc->ifp, mb);
972 return;
973 }
974
975 /* create sequence number */
976 lc->lro_mbuf_data[lc->lro_mbuf_count].seq =
977 (((uint64_t)M_HASHTYPE_GET(mb)) << 56) |
978 (((uint64_t)mb->m_pkthdr.flowid) << 24) |
979 ((uint64_t)lc->lro_mbuf_count);
980
981 /* enter mbuf */
982 lc->lro_mbuf_data[lc->lro_mbuf_count].mb = mb;
983
984 /* flush if array is full */
985 if (__predict_false(++lc->lro_mbuf_count == lc->lro_mbuf_max))
986 tcp_lro_flush_all(lc);
987 }
988
989 /* end */
Cache object: 1b3cc4379fca989b93de1874bafe307b
|