1 /*-
2 * Copyright (c) 2010-2011 Solarflare Communications, Inc.
3 * All rights reserved.
4 *
5 * This software was developed in part by Philip Paeps under contract for
6 * Solarflare Communications, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD: releng/9.2/sys/dev/sfxge/sfxge_rx.c 227569 2011-11-16 17:11:13Z philip $");
32
33 #include <sys/types.h>
34 #include <sys/mbuf.h>
35 #include <sys/smp.h>
36 #include <sys/socket.h>
37 #include <sys/sysctl.h>
38 #include <sys/limits.h>
39
40 #include <net/ethernet.h>
41 #include <net/if.h>
42 #include <net/if_vlan_var.h>
43
44 #include <netinet/in.h>
45 #include <netinet/ip.h>
46 #include <netinet/ip6.h>
47 #include <netinet/tcp.h>
48
49 #include <machine/in_cksum.h>
50
51 #include "common/efx.h"
52
53
54 #include "sfxge.h"
55 #include "sfxge_rx.h"
56
57 #define RX_REFILL_THRESHOLD (EFX_RXQ_LIMIT(SFXGE_NDESCS) * 9 / 10)
58 #define RX_REFILL_THRESHOLD_2 (RX_REFILL_THRESHOLD / 2)
59
60 /* Size of the LRO hash table. Must be a power of 2. A larger table
61 * means we can accelerate a larger number of streams.
62 */
63 static unsigned lro_table_size = 128;
64
65 /* Maximum length of a hash chain. If chains get too long then the lookup
66 * time increases and may exceed the benefit of LRO.
67 */
68 static unsigned lro_chain_max = 20;
69
70 /* Maximum time (in ticks) that a connection can be idle before it's LRO
71 * state is discarded.
72 */
73 static unsigned lro_idle_ticks; /* initialised in sfxge_rx_init() */
74
75 /* Number of packets with payload that must arrive in-order before a
76 * connection is eligible for LRO. The idea is we should avoid coalescing
77 * segments when the sender is in slow-start because reducing the ACK rate
78 * can damage performance.
79 */
80 static int lro_slow_start_packets = 2000;
81
82 /* Number of packets with payload that must arrive in-order following loss
83 * before a connection is eligible for LRO. The idea is we should avoid
84 * coalescing segments when the sender is recovering from loss, because
85 * reducing the ACK rate can damage performance.
86 */
87 static int lro_loss_packets = 20;
88
89 /* Flags for sfxge_lro_conn::l2_id; must not collide with EVL_VLID_MASK */
90 #define SFXGE_LRO_L2_ID_VLAN 0x4000
91 #define SFXGE_LRO_L2_ID_IPV6 0x8000
92 #define SFXGE_LRO_CONN_IS_VLAN_ENCAP(c) ((c)->l2_id & SFXGE_LRO_L2_ID_VLAN)
93 #define SFXGE_LRO_CONN_IS_TCPIPV4(c) (!((c)->l2_id & SFXGE_LRO_L2_ID_IPV6))
94
95 /* Compare IPv6 addresses, avoiding conditional branches */
/*
 * Compare IPv6 addresses, avoiding conditional branches.
 *
 * Returns zero if and only if *left and *right are equal; any non-zero
 * value means they differ (callers only test for zero/non-zero).
 *
 * The addresses are copied into aligned 64-bit halves with memcpy
 * rather than accessed through casted pointers: casting in6_addr* to
 * uint64_t* violates C strict-aliasing rules and assumes 8-byte
 * alignment, neither of which is guaranteed.  The two 64-bit XOR
 * differences are folded down so that truncation to unsigned long on
 * 32-bit platforms cannot discard a non-zero result.
 */
static __inline unsigned long ipv6_addr_cmp(const struct in6_addr *left,
					    const struct in6_addr *right)
{
	uint64_t l[2], r[2], diff;

	memcpy(l, left, sizeof(l));
	memcpy(r, right, sizeof(r));

	/* Zero iff both 64-bit halves match. */
	diff = (l[0] ^ r[0]) | (l[1] ^ r[1]);

	/* Fold the high half in so a 32-bit unsigned long keeps the result. */
	return ((unsigned long)(diff | (diff >> 32)));
}
110
/* Event callback: the hardware acknowledged the RX queue flush. */
void
sfxge_rx_qflush_done(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_DONE;
}
117
/*
 * Event callback: the RX queue flush failed.  sfxge_rx_qstop() watches
 * flush_state and re-issues the flush when it sees this value.
 */
void
sfxge_rx_qflush_failed(struct sfxge_rxq *rxq)
{

	rxq->flush_state = SFXGE_FLUSH_FAILED;
}
124
/*
 * RSS hash key for the Toeplitz algorithm; programmed into the
 * controller by sfxge_rx_start() via efx_rx_scale_toeplitz_ipv4_key_set().
 */
static uint8_t toep_key[] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
132
133 static void
134 sfxge_rx_post_refill(void *arg)
135 {
136 struct sfxge_rxq *rxq = arg;
137 struct sfxge_softc *sc;
138 unsigned int index;
139 struct sfxge_evq *evq;
140 uint16_t magic;
141
142 sc = rxq->sc;
143 index = rxq->index;
144 evq = sc->evq[index];
145
146 magic = SFXGE_MAGIC_RX_QREFILL | index;
147
148 /* This is guaranteed due to the start/stop order of rx and ev */
149 KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
150 ("evq not started"));
151 KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
152 ("rxq not started"));
153 efx_ev_qpost(evq->common, magic);
154 }
155
156 static void
157 sfxge_rx_schedule_refill(struct sfxge_rxq *rxq, boolean_t retrying)
158 {
159 /* Initially retry after 100 ms, but back off in case of
160 * repeated failures as we probably have to wait for the
161 * administrator to raise the pool limit. */
162 if (retrying)
163 rxq->refill_delay = min(rxq->refill_delay * 2, 10 * hz);
164 else
165 rxq->refill_delay = hz / 10;
166
167 callout_reset_curcpu(&rxq->refill_callout, rxq->refill_delay,
168 sfxge_rx_post_refill, rxq);
169 }
170
171 static inline struct mbuf *sfxge_rx_alloc_mbuf(struct sfxge_softc *sc)
172 {
173 struct mb_args args;
174 struct mbuf *m;
175
176 /* Allocate mbuf structure */
177 args.flags = M_PKTHDR;
178 args.type = MT_DATA;
179 m = (struct mbuf *)uma_zalloc_arg(zone_mbuf, &args, M_DONTWAIT);
180
181 /* Allocate (and attach) packet buffer */
182 if (m && !uma_zalloc_arg(sc->rx_buffer_zone, m, M_DONTWAIT)) {
183 uma_zfree(zone_mbuf, m);
184 m = NULL;
185 }
186
187 return m;
188 }
189
190 #define SFXGE_REFILL_BATCH 64
191
/*
 * Fill the RX ring towards "target" outstanding descriptors, allocating
 * an mbuf + cluster for each and posting the DMA addresses to the
 * hardware in batches of up to SFXGE_REFILL_BATCH.
 *
 * Called with the owning event queue lock held.  If allocation fails
 * part-way, a delayed retry is scheduled (backing off if "retrying").
 */
static void
sfxge_rx_qfill(struct sfxge_rxq *rxq, unsigned int target, boolean_t retrying)
{
	struct sfxge_softc *sc;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int batch;
	unsigned int rxfill;
	unsigned int mblksize;
	int ntodo;
	efsys_dma_addr_t addr[SFXGE_REFILL_BATCH];

	sc = rxq->sc;
	index = rxq->index;
	evq = sc->evq[index];

	prefetch_read_many(sc->enp);
	prefetch_read_many(rxq->common);

	mtx_assert(&evq->lock, MA_OWNED);

	if (rxq->init_state != SFXGE_RXQ_STARTED)
		return;

	/* Descriptors currently outstanding in the ring. */
	rxfill = rxq->added - rxq->completed;
	KASSERT(rxfill <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("rxfill > EFX_RXQ_LIMIT(SFXGE_NDESCS)"));
	ntodo = min(EFX_RXQ_LIMIT(SFXGE_NDESCS) - rxfill, target);
	KASSERT(ntodo <= EFX_RXQ_LIMIT(SFXGE_NDESCS),
	    ("ntodo > EFX_RQX_LIMIT(SFXGE_NDESCS)"));

	if (ntodo == 0)
		return;

	batch = 0;
	mblksize = sc->rx_buffer_size;
	while (ntodo-- > 0) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;
		bus_dma_segment_t seg;
		struct mbuf *m;

		/* Ring index; SFXGE_NDESCS is a power of two. */
		id = (rxq->added + batch) & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		KASSERT(rx_desc->mbuf == NULL, ("rx_desc->mbuf != NULL"));

		/* Default to discard until a completion sets real flags. */
		rx_desc->flags = EFX_DISCARD;
		m = rx_desc->mbuf = sfxge_rx_alloc_mbuf(sc);
		if (m == NULL)
			break;
		sfxge_map_mbuf_fast(rxq->mem.esm_tag, rxq->mem.esm_map, m, &seg);
		addr[batch++] = seg.ds_addr;

		if (batch == SFXGE_REFILL_BATCH) {
			efx_rx_qpost(rxq->common, addr, mblksize, batch,
			    rxq->completed, rxq->added);
			rxq->added += batch;
			batch = 0;
		}
	}

	/*
	 * NOTE(review): because of the post-decrement in the loop
	 * condition, ntodo is -1 here after a fully successful fill (so a
	 * refill is scheduled even on success), while an allocation
	 * failure on the final iteration leaves ntodo == 0 and schedules
	 * nothing.  Looks like an off-by-one — confirm intent before
	 * changing; a spurious refill is benign, a missed one is not.
	 */
	if (ntodo != 0)
		sfxge_rx_schedule_refill(rxq, retrying);

	/* Post any final partial batch. */
	if (batch != 0) {
		efx_rx_qpost(rxq->common, addr, mblksize, batch,
		    rxq->completed, rxq->added);
		rxq->added += batch;
	}

	/* Make the descriptors visible to the hardware */
	bus_dmamap_sync(rxq->mem.esm_tag, rxq->mem.esm_map,
	    BUS_DMASYNC_PREWRITE);

	efx_rx_qpush(rxq->common, rxq->added);
}
268
269 void
270 sfxge_rx_qrefill(struct sfxge_rxq *rxq)
271 {
272
273 if (rxq->init_state != SFXGE_RXQ_STARTED)
274 return;
275
276 /* Make sure the queue is full */
277 sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_TRUE);
278 }
279
/*
 * Final hand-off of a completed mbuf chain to the network stack.
 * The caller has already set lengths and csum_flags; csum_data is set
 * to 0xffff, the value the stack expects alongside CSUM_PSEUDO_HDR.
 */
static void __sfxge_rx_deliver(struct sfxge_softc *sc, struct mbuf *m)
{
	struct ifnet *ifp = sc->ifnet;

	m->m_pkthdr.rcvif = ifp;
	m->m_pkthdr.header = m->m_data;
	m->m_pkthdr.csum_data = 0xffff;
	ifp->if_input(ifp, m);
}
289
/*
 * Deliver one received packet to the stack without LRO: convert the
 * hardware checksum flags, attach the RSS hash as the flow ID (MQ
 * builds only), strip the hardware RX prefix and set the lengths.
 * On return the descriptor no longer owns an mbuf.
 */
static void
sfxge_rx_deliver(struct sfxge_softc *sc, struct sfxge_rx_sw_desc *rx_desc)
{
	struct mbuf *m = rx_desc->mbuf;
	int csum_flags;

	/* Convert checksum flags */
	csum_flags = (rx_desc->flags & EFX_CKSUM_IPV4) ?
	    (CSUM_IP_CHECKED | CSUM_IP_VALID) : 0;
	if (rx_desc->flags & EFX_CKSUM_TCPUDP)
		csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;

#ifdef SFXGE_HAVE_MQ
	/* The hash covers a 4-tuple for TCP only */
	if (rx_desc->flags & EFX_PKT_TCP) {
		/* Hash lives in the RX prefix, so read before advancing m_data. */
		m->m_pkthdr.flowid = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
		    mtod(m, uint8_t *));
		m->m_flags |= M_FLOWID;
	}
#endif
	/* Strip the hardware prefix before handing the packet upward. */
	m->m_data += sc->rx_prefix_size;
	m->m_len = rx_desc->size - sc->rx_prefix_size;
	m->m_pkthdr.len = m->m_len;
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, rx_desc->mbuf);

	rx_desc->flags = EFX_DISCARD;
	rx_desc->mbuf = NULL;
}
319
/*
 * Deliver the coalesced super-packet held by connection "c" to the
 * stack.  Undoes the header mangling done during merging (lengths are
 * kept in host order while coalescing), recomputes the IPv4 header
 * checksum, and copies the latest window/ACK (and any TCP options of
 * matching length) from the last segment seen.
 */
static void
sfxge_lro_deliver(struct sfxge_lro_state *st, struct sfxge_lro_conn *c)
{
	struct sfxge_softc *sc = st->sc;
	struct mbuf *m = c->mbuf;
	struct tcphdr *c_th;
	int csum_flags;

	KASSERT(m, ("no mbuf to deliver"));

	++st->n_bursts;

	/* Finish off packet munging and recalculate IP header checksum. */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len = htons(iph->ip_len);
		iph->ip_sum = 0;
		iph->ip_sum = in_cksum_hdr(iph);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = (CSUM_DATA_VALID | CSUM_PSEUDO_HDR |
		    CSUM_IP_CHECKED | CSUM_IP_VALID);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen = htons(iph->ip6_plen);
		c_th = (struct tcphdr *)(iph + 1);
		csum_flags = CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
	}

	/* Present the most recent window and ACK to the receiver. */
	c_th->th_win = c->th_last->th_win;
	c_th->th_ack = c->th_last->th_ack;
	if (c_th->th_off == c->th_last->th_off) {
		/* Copy TCP options (take care to avoid going negative). */
		int optlen = ((c_th->th_off - 5) & 0xf) << 2u;
		memcpy(c_th + 1, c->th_last + 1, optlen);
	}

#ifdef SFXGE_HAVE_MQ
	m->m_pkthdr.flowid = c->conn_hash;
	m->m_flags |= M_FLOWID;
#endif
	m->m_pkthdr.csum_flags = csum_flags;
	__sfxge_rx_deliver(sc, m);

	c->mbuf = NULL;
	c->delivered = 1;
}
366
/*
 * Drop the given connection, and add it to the free list.
 * Any held (not-yet-merged) buffer is delivered to the stack first;
 * a held buffer also means the connection is on the active list, so
 * it must be unlinked from there as well.
 */
static void sfxge_lro_drop(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	unsigned bucket;

	/* Caller must already have delivered any coalesced mbuf. */
	KASSERT(!c->mbuf, ("found orphaned mbuf"));

	if (c->next_buf.mbuf) {
		sfxge_rx_deliver(rxq->sc, &c->next_buf);
		LIST_REMOVE(c, active_link);
	}

	/* Unlink from its hash bucket and recycle onto the free list. */
	bucket = c->conn_hash & rxq->lro.conns_mask;
	KASSERT(rxq->lro.conns_n[bucket] > 0, ("LRO: bucket fill level wrong"));
	--rxq->lro.conns_n[bucket];
	TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
	TAILQ_INSERT_HEAD(&rxq->lro.free_conns, c, link);
}
385
/* Stop tracking connections that have gone idle in order to keep hash
 * chains short.  Only the tail (least recently used, since lookups
 * re-insert at the head) of each bucket is examined per pass.
 */
static void sfxge_lro_purge_idle(struct sfxge_rxq *rxq, unsigned now)
{
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Must only run between bursts, when nothing is buffered. */
	KASSERT(LIST_EMPTY(&rxq->lro.active_conns),
	    ("found active connections"));

	rxq->lro.last_purge_ticks = now;
	for (i = 0; i <= rxq->lro.conns_mask; ++i) {
		if (TAILQ_EMPTY(&rxq->lro.conns[i]))
			continue;

		/* Oldest entry sits at the tail of the bucket. */
		c = TAILQ_LAST(&rxq->lro.conns[i], sfxge_lro_tailq);
		if (now - c->last_pkt_ticks > lro_idle_ticks) {
			++rxq->lro.n_drop_idle;
			sfxge_lro_drop(rxq, c);
		}
	}
}
409
/*
 * Append segment "mbuf" (already trimmed to payload only by the caller)
 * to the coalesced packet of connection "c".  "th" is the segment's TCP
 * header, recorded so the final delivery can use its window/ACK.
 * IP length fields are in host order while coalescing (see
 * sfxge_lro_start()) and are converted back on delivery.
 */
static void
sfxge_lro_merge(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, struct tcphdr *th)
{
	struct tcphdr *c_th;

	/* Tack the new mbuf onto the chain. */
	KASSERT(!mbuf->m_next, ("mbuf already chained"));
	c->mbuf_tail->m_next = mbuf;
	c->mbuf_tail = mbuf;

	/* Increase length appropriately */
	c->mbuf->m_pkthdr.len += mbuf->m_len;

	/* Update the connection state flags */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->nh;
		iph->ip_len += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	} else {
		struct ip6_hdr *iph = c->nh;
		iph->ip6_plen += mbuf->m_len;
		c_th = (struct tcphdr *)(iph + 1);
	}
	/* Preserve PUSH across the merged segments. */
	c_th->th_flags |= (th->th_flags & TH_PUSH);
	c->th_last = th;
	++st->n_merges;

	/* Pass packet up now if another segment could overflow the IP
	 * length.
	 */
	if (c->mbuf->m_pkthdr.len > 65536 - 9200)
		sfxge_lro_deliver(st, c);
}
444
/*
 * Begin a new coalesced packet for connection "c" with "mbuf" as its
 * first segment.  "nh" points at the network header within the mbuf
 * and "th" at the TCP header.  The IP length field is converted to
 * host order so sfxge_lro_merge() can accumulate into it directly;
 * sfxge_lro_deliver() converts it back.
 */
static void
sfxge_lro_start(struct sfxge_lro_state *st, struct sfxge_lro_conn *c,
		struct mbuf *mbuf, void *nh, struct tcphdr *th)
{
	/* Start the chain */
	c->mbuf = mbuf;
	c->mbuf_tail = c->mbuf;
	c->nh = nh;
	c->th_last = th;

	mbuf->m_pkthdr.len = mbuf->m_len;

	/* Mangle header fields for later processing */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = nh;
		iph->ip_len = ntohs(iph->ip_len);
	} else {
		struct ip6_hdr *iph = nh;
		iph->ip6_plen = ntohs(iph->ip6_plen);
	}
}
466
/* Try to merge or otherwise hold or deliver (as appropriate) the
 * packet buffered for this connection (c->next_buf). Return a flag
 * indicating whether the connection is still active for LRO purposes.
 */
static int
sfxge_lro_try_merge(struct sfxge_rxq *rxq, struct sfxge_lro_conn *c)
{
	struct sfxge_rx_sw_desc *rx_buf = &c->next_buf;
	char *eh = c->next_eh;
	int data_length, hdr_length, dont_merge;
	unsigned th_seq, pkt_length;
	struct tcphdr *th;
	unsigned now;

	/*
	 * Compute the total packet length from the IP length field:
	 * ip_len includes the IPv4 header, ip6_plen does not include
	 * the IPv6 header, hence the different base pointers.
	 */
	if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
		struct ip *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip_len) + (char *) iph - eh;
	} else {
		struct ip6_hdr *iph = c->next_nh;
		th = (struct tcphdr *)(iph + 1);
		pkt_length = ntohs(iph->ip6_plen) + (char *) th - eh;
	}

	hdr_length = (char *) th + th->th_off * 4 - eh;
	/* Clamp to the buffer size in case the IP length field lies. */
	data_length = (min(pkt_length, rx_buf->size - rxq->sc->rx_prefix_size) -
	    hdr_length);
	th_seq = ntohl(th->th_seq);
	/* Branch-free accumulation: any set bit means "don't merge". */
	dont_merge = ((data_length <= 0)
	    | (th->th_flags & (TH_URG | TH_SYN | TH_RST | TH_FIN)));

	/* Check for options other than aligned timestamp. */
	if (th->th_off != 5) {
		const uint32_t *opt_ptr = (const uint32_t *) (th + 1);
		if (th->th_off == 8 &&
		    opt_ptr[0] == ntohl((TCPOPT_NOP << 24) |
					(TCPOPT_NOP << 16) |
					(TCPOPT_TIMESTAMP << 8) |
					TCPOLEN_TIMESTAMP)) {
			/* timestamp option -- okay */
		} else {
			dont_merge = 1;
		}
	}

	if (__predict_false(th_seq != c->next_seq)) {
		/* Out-of-order, so start counting again. */
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		c->n_in_order_pkts -= lro_loss_packets;
		c->next_seq = th_seq + data_length;
		++rxq->lro.n_misorder;
		goto deliver_buf_out;
	}
	c->next_seq = th_seq + data_length;

	/* Drop connections that have been idle for too long. */
	now = ticks;
	if (now - c->last_pkt_ticks > lro_idle_ticks) {
		++rxq->lro.n_drop_idle;
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		sfxge_lro_drop(rxq, c);
		return 0;
	}
	c->last_pkt_ticks = ticks;

	if (c->n_in_order_pkts < lro_slow_start_packets) {
		/* May be in slow-start, so don't merge. */
		++rxq->lro.n_slow_start;
		++c->n_in_order_pkts;
		goto deliver_buf_out;
	}

	if (__predict_false(dont_merge)) {
		if (c->mbuf)
			sfxge_lro_deliver(&rxq->lro, c);
		if (th->th_flags & (TH_FIN | TH_RST)) {
			/* Connection is closing; stop tracking it. */
			++rxq->lro.n_drop_closed;
			sfxge_lro_drop(rxq, c);
			return 0;
		}
		goto deliver_buf_out;
	}

	/* Skip the hardware RX prefix. */
	rx_buf->mbuf->m_data += rxq->sc->rx_prefix_size;

	if (__predict_true(c->mbuf != NULL)) {
		/* Remove headers and any padding */
		rx_buf->mbuf->m_data += hdr_length;
		rx_buf->mbuf->m_len = data_length;

		sfxge_lro_merge(&rxq->lro, c, rx_buf->mbuf, th);
	} else {
		/* Remove any padding */
		rx_buf->mbuf->m_len = pkt_length;

		sfxge_lro_start(&rxq->lro, c, rx_buf->mbuf, c->next_nh, th);
	}

	/* Ownership of the mbuf has moved to the connection. */
	rx_buf->mbuf = NULL;
	return 1;

 deliver_buf_out:
	/* Pass the buffered packet straight up the stack. */
	sfxge_rx_deliver(rxq->sc, rx_buf);
	return 1;
}
573
/*
 * Start tracking a new connection, keyed by (l2_id, conn_hash, ports).
 * Recycles an entry from the free list when possible; gives up silently
 * if the hash chain is already at lro_chain_max or allocation fails
 * (the packet is simply not accelerated).
 */
static void sfxge_lro_new_conn(struct sfxge_lro_state *st, uint32_t conn_hash,
			       uint16_t l2_id, void *nh, struct tcphdr *th)
{
	unsigned bucket = conn_hash & st->conns_mask;
	struct sfxge_lro_conn *c;

	/* Keep lookup cost bounded: refuse to grow long chains. */
	if (st->conns_n[bucket] >= lro_chain_max) {
		++st->n_too_many;
		return;
	}

	if (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
	} else {
		c = malloc(sizeof(*c), M_SFXGE, M_DONTWAIT);
		if (c == NULL)
			return;
		c->mbuf = NULL;
		c->next_buf.mbuf = NULL;
	}

	/* Create the connection tracking data */
	++st->conns_n[bucket];
	TAILQ_INSERT_HEAD(&st->conns[bucket], c, link);
	c->l2_id = l2_id;
	c->conn_hash = conn_hash;
	c->source = th->th_sport;
	c->dest = th->th_dport;
	c->n_in_order_pkts = 0;
	c->last_pkt_ticks = *(volatile int *)&ticks;
	c->delivered = 0;
	++st->n_new_stream;
	/* NB. We don't initialise c->next_seq, and it doesn't matter what
	 * value it has. Most likely the next packet received for this
	 * connection will not match -- no harm done.
	 */
}
612
/* Process mbuf and decide whether to dispatch it to the stack now or
 * later.  Eligible packets are buffered on their connection (one
 * pending packet per connection, in c->next_buf); anything else goes
 * straight to sfxge_rx_deliver().
 */
static void
sfxge_lro(struct sfxge_rxq *rxq, struct sfxge_rx_sw_desc *rx_buf)
{
	struct sfxge_softc *sc = rxq->sc;
	struct mbuf *m = rx_buf->mbuf;
	struct ether_header *eh;
	struct sfxge_lro_conn *c;
	uint16_t l2_id;
	uint16_t l3_proto;
	void *nh;
	struct tcphdr *th;
	uint32_t conn_hash;
	unsigned bucket;

	/* Get the hardware hash */
	conn_hash = EFX_RX_HASH_VALUE(EFX_RX_HASHALG_TOEPLITZ,
				      mtod(m, uint8_t *));

	/* Parse the L2 header; fold the VLAN ID into l2_id if present. */
	eh = (struct ether_header *)(m->m_data + sc->rx_prefix_size);
	if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
		struct ether_vlan_header *veh = (struct ether_vlan_header *)eh;
		l2_id = EVL_VLANOFTAG(ntohs(veh->evl_tag)) |
			SFXGE_LRO_L2_ID_VLAN;
		l3_proto = veh->evl_proto;
		nh = veh + 1;
	} else {
		l2_id = 0;
		l3_proto = eh->ether_type;
		nh = eh + 1;
	}

	/* Check whether this is a suitable packet (unfragmented
	 * TCP/IPv4 or TCP/IPv6). If so, find the TCP header and
	 * length, and compute a hash if necessary. If not, return.
	 */
	if (l3_proto == htons(ETHERTYPE_IP)) {
		struct ip *iph = nh;
		/* Branch-free test: TCP, no options, not fragmented. */
		if ((iph->ip_p - IPPROTO_TCP) |
		    (iph->ip_hl - (sizeof(*iph) >> 2u)) |
		    (iph->ip_off & htons(IP_MF | IP_OFFMASK)))
			goto deliver_now;
		th = (struct tcphdr *)(iph + 1);
	} else if (l3_proto == htons(ETHERTYPE_IPV6)) {
		struct ip6_hdr *iph = nh;
		if (iph->ip6_nxt != IPPROTO_TCP)
			goto deliver_now;
		l2_id |= SFXGE_LRO_L2_ID_IPV6;
		th = (struct tcphdr *)(iph + 1);
	} else {
		goto deliver_now;
	}

	/* Look up the connection in its hash bucket. */
	bucket = conn_hash & rxq->lro.conns_mask;

	TAILQ_FOREACH(c, &rxq->lro.conns[bucket], link) {
		/* Subtraction/OR comparisons avoid conditional branches. */
		if ((c->l2_id - l2_id) | (c->conn_hash - conn_hash))
			continue;
		if ((c->source - th->th_sport) | (c->dest - th->th_dport))
			continue;
		if (c->mbuf) {
			/* Guard against hash collisions: verify addresses. */
			if (SFXGE_LRO_CONN_IS_TCPIPV4(c)) {
				struct ip *c_iph, *iph = nh;
				c_iph = c->nh;
				if ((c_iph->ip_src.s_addr - iph->ip_src.s_addr) |
				    (c_iph->ip_dst.s_addr - iph->ip_dst.s_addr))
					continue;
			} else {
				struct ip6_hdr *c_iph, *iph = nh;
				c_iph = c->nh;
				if (ipv6_addr_cmp(&c_iph->ip6_src, &iph->ip6_src) |
				    ipv6_addr_cmp(&c_iph->ip6_dst, &iph->ip6_dst))
					continue;
			}
		}

		/* Re-insert at head of list to reduce lookup time. */
		TAILQ_REMOVE(&rxq->lro.conns[bucket], c, link);
		TAILQ_INSERT_HEAD(&rxq->lro.conns[bucket], c, link);

		if (c->next_buf.mbuf) {
			/* Already holding a packet: try to merge it first. */
			if (!sfxge_lro_try_merge(rxq, c))
				goto deliver_now;
		} else {
			/* First buffered packet makes the connection active. */
			LIST_INSERT_HEAD(&rxq->lro.active_conns, c,
			    active_link);
		}
		/* Buffer this packet on the connection until the next one. */
		c->next_buf = *rx_buf;
		c->next_eh = eh;
		c->next_nh = nh;

		rx_buf->mbuf = NULL;
		rx_buf->flags = EFX_DISCARD;
		return;
	}

	/* No match: maybe start tracking this stream, deliver as-is. */
	sfxge_lro_new_conn(&rxq->lro, conn_hash, l2_id, nh, th);
 deliver_now:
	sfxge_rx_deliver(sc, rx_buf);
}
715
/*
 * End-of-poll processing: flush every active connection's buffered
 * packet (merging or delivering as appropriate) and periodically purge
 * idle connections.  A connection leaves the active list either here
 * or inside sfxge_lro_drop() when sfxge_lro_try_merge() drops it.
 */
static void sfxge_lro_end_of_burst(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned t;

	while (!LIST_EMPTY(&st->active_conns)) {
		c = LIST_FIRST(&st->active_conns);
		/* Flush any coalesced data not delivered this burst. */
		if (!c->delivered && c->mbuf)
			sfxge_lro_deliver(st, c);
		if (sfxge_lro_try_merge(rxq, c)) {
			if (c->mbuf)
				sfxge_lro_deliver(st, c);
			LIST_REMOVE(c, active_link);
		}
		/* Reset for the next burst (c is still valid: entries are
		 * recycled to the free list, never freed here). */
		c->delivered = 0;
	}

	/* At most one idle-purge pass per tick. */
	t = *(volatile int *)&ticks;
	if (__predict_false(t != st->last_purge_ticks))
		sfxge_lro_purge_idle(rxq, t);
}
738
/*
 * Process RX completions from rxq->completed up to rxq->pending.
 * Delivery is pipelined one descriptor behind processing ("prev") so
 * the next packet's data can be prefetched while the previous one is
 * handed to LRO or the stack.  Called with the event queue lock held.
 * "eop" marks the end of the poll, triggering LRO burst flush.
 */
void
sfxge_rx_qcomplete(struct sfxge_rxq *rxq, boolean_t eop)
{
	struct sfxge_softc *sc = rxq->sc;
	int lro_enabled = sc->ifnet->if_capenable & IFCAP_LRO;
	unsigned int index;
	struct sfxge_evq *evq;
	unsigned int completed;
	unsigned int level;
	struct mbuf *m;
	struct sfxge_rx_sw_desc *prev = NULL;

	index = rxq->index;
	evq = sc->evq[index];

	mtx_assert(&evq->lock, MA_OWNED);

	completed = rxq->completed;
	while (completed != rxq->pending) {
		unsigned int id;
		struct sfxge_rx_sw_desc *rx_desc;

		/* Ring index; SFXGE_NDESCS is a power of two. */
		id = completed++ & (SFXGE_NDESCS - 1);
		rx_desc = &rxq->queue[id];
		m = rx_desc->mbuf;

		if (rxq->init_state != SFXGE_RXQ_STARTED)
			goto discard;

		if (rx_desc->flags & (EFX_ADDR_MISMATCH | EFX_DISCARD))
			goto discard;

		prefetch_read_many(mtod(m, caddr_t));

		/* Check for loopback packets */
		if (!(rx_desc->flags & EFX_PKT_IPV4) &&
		    !(rx_desc->flags & EFX_PKT_IPV6)) {
			struct ether_header *etherhp;

			/*LINTED*/
			etherhp = mtod(m, struct ether_header *);

			if (etherhp->ether_type ==
			    htons(SFXGE_ETHERTYPE_LOOPBACK)) {
				EFSYS_PROBE(loopback);

				rxq->loopback++;
				goto discard;
			}
		}

		/* Pass packet up the stack or into LRO (pipelined) */
		if (prev != NULL) {
			if (lro_enabled)
				sfxge_lro(rxq, prev);
			else
				sfxge_rx_deliver(sc, prev);
		}
		prev = rx_desc;
		continue;

discard:
		/* Return the packet to the pool */
		m_free(m);
		rx_desc->mbuf = NULL;
	}
	rxq->completed = completed;

	level = rxq->added - rxq->completed;

	/* Pass last packet up the stack or into LRO */
	if (prev != NULL) {
		if (lro_enabled)
			sfxge_lro(rxq, prev);
		else
			sfxge_rx_deliver(sc, prev);
	}

	/*
	 * If there are any pending flows and this is the end of the
	 * poll then they must be completed.
	 */
	if (eop)
		sfxge_lro_end_of_burst(rxq);

	/* Top up the queue if necessary */
	if (level < RX_REFILL_THRESHOLD)
		sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);
}
828
/*
 * Stop RX queue "index": flush it in hardware (waiting up to ~2 s,
 * polling every 100 ms), drain remaining completions, then destroy the
 * common-code queue and release its buffer table entries.
 *
 * NOTE(review): on SFXGE_FLUSH_FAILED this retries the flush with no
 * upper bound — confirm the hardware guarantees eventual success.
 */
static void
sfxge_rx_qstop(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	unsigned int count;

	rxq = sc->rxq[index];
	evq = sc->evq[index];

	mtx_lock(&evq->lock);

	KASSERT(rxq->init_state == SFXGE_RXQ_STARTED,
	    ("rxq not started"));

	/* Mark stopped first so refills and completions become no-ops. */
	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	callout_stop(&rxq->refill_callout);

again:
	rxq->flush_state = SFXGE_FLUSH_PENDING;

	/* Flush the receive queue */
	efx_rx_qflush(rxq->common);

	/* Drop the lock so the flush-done/failed events can be handled. */
	mtx_unlock(&evq->lock);

	count = 0;
	do {
		/* Spin for 100 ms */
		DELAY(100000);

		if (rxq->flush_state != SFXGE_FLUSH_PENDING)
			break;

	} while (++count < 20);

	mtx_lock(&evq->lock);

	if (rxq->flush_state == SFXGE_FLUSH_FAILED)
		goto again;

	rxq->flush_state = SFXGE_FLUSH_DONE;

	/* Claim everything posted as pending and drain it. */
	rxq->pending = rxq->added;
	sfxge_rx_qcomplete(rxq, B_TRUE);

	KASSERT(rxq->completed == rxq->pending,
	    ("rxq->completed != rxq->pending"));

	rxq->added = 0;
	rxq->pending = 0;
	rxq->completed = 0;
	rxq->loopback = 0;

	/* Destroy the common code receive queue. */
	efx_rx_qdestroy(rxq->common);

	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));

	mtx_unlock(&evq->lock);
}
892
/*
 * Start RX queue "index": program the SRAM buffer table, create and
 * enable the common-code queue, then fill it with receive buffers.
 * Returns 0 on success or an errno-style value; on failure the buffer
 * table entries are released again.
 */
static int
sfxge_rx_qstart(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	efsys_mem_t *esmp;
	struct sfxge_evq *evq;
	int rc;

	rxq = sc->rxq[index];
	esmp = &rxq->mem;
	evq = sc->evq[index];

	/* The event queue must be started before its RX queue. */
	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));
	KASSERT(evq->init_state == SFXGE_EVQ_STARTED,
	    ("evq->init_state != SFXGE_EVQ_STARTED"));

	/* Program the buffer table. */
	if ((rc = efx_sram_buf_tbl_set(sc->enp, rxq->buf_base_id, esmp,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS))) != 0)
		return rc;

	/* Create the common code receive queue. */
	if ((rc = efx_rx_qcreate(sc->enp, index, index, EFX_RXQ_TYPE_DEFAULT,
	    esmp, SFXGE_NDESCS, rxq->buf_base_id, evq->common,
	    &rxq->common)) != 0)
		goto fail;

	mtx_lock(&evq->lock);

	/* Enable the receive queue. */
	efx_rx_qenable(rxq->common);

	rxq->init_state = SFXGE_RXQ_STARTED;

	/* Try to fill the queue from the pool. */
	sfxge_rx_qfill(rxq, EFX_RXQ_LIMIT(SFXGE_NDESCS), B_FALSE);

	mtx_unlock(&evq->lock);

	return (0);

fail:
	efx_sram_buf_tbl_clear(sc->enp, rxq->buf_base_id,
	    EFX_RXQ_NBUFS(SFXGE_NDESCS));
	return rc;
}
940
941 void
942 sfxge_rx_stop(struct sfxge_softc *sc)
943 {
944 struct sfxge_intr *intr;
945 int index;
946
947 intr = &sc->intr;
948
949 /* Stop the receive queue(s) */
950 index = intr->n_alloc;
951 while (--index >= 0)
952 sfxge_rx_qstop(sc, index);
953
954 sc->rx_prefix_size = 0;
955 sc->rx_buffer_size = 0;
956
957 efx_rx_fini(sc->enp);
958 }
959
/*
 * Bring up the whole RX path: initialise the common-code RX module,
 * size the receive buffers for the current MTU, pick the matching
 * cluster zone, program the RSS indirection table and Toeplitz key,
 * and start one RX queue per interrupt.  Returns 0 or errno; on
 * failure everything started so far is torn down again.
 */
int
sfxge_rx_start(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	intr = &sc->intr;

	/* Initialize the common code receive module. */
	if ((rc = efx_rx_init(sc->enp)) != 0)
		return (rc);

	/* Calculate the receive packet buffer size. */
	sc->rx_prefix_size = EFX_RX_PREFIX_SIZE;
	sc->rx_buffer_size = (EFX_MAC_PDU(sc->ifnet->if_mtu) +
			      sc->rx_prefix_size);

	/* Select zone for packet buffers */
	if (sc->rx_buffer_size <= MCLBYTES)
		sc->rx_buffer_zone = zone_clust;
	else if (sc->rx_buffer_size <= MJUMPAGESIZE)
		sc->rx_buffer_zone = zone_jumbop;
	else if (sc->rx_buffer_size <= MJUM9BYTES)
		sc->rx_buffer_zone = zone_jumbo9;
	else
		sc->rx_buffer_zone = zone_jumbo16;

	/*
	 * Set up the scale table. Enable all hash types and hash insertion.
	 */
	for (index = 0; index < SFXGE_RX_SCALE_MAX; index++)
		sc->rx_indir_table[index] = index % sc->intr.n_alloc;
	if ((rc = efx_rx_scale_tbl_set(sc->enp, sc->rx_indir_table,
				       SFXGE_RX_SCALE_MAX)) != 0)
		goto fail;
	/* Best-effort: hash mode failure is not fatal. */
	(void)efx_rx_scale_mode_set(sc->enp, EFX_RX_HASHALG_TOEPLITZ,
	    (1 << EFX_RX_HASH_IPV4) | (1 << EFX_RX_HASH_TCPIPV4) |
	    (1 << EFX_RX_HASH_IPV6) | (1 << EFX_RX_HASH_TCPIPV6), B_TRUE);

	if ((rc = efx_rx_scale_toeplitz_ipv4_key_set(sc->enp, toep_key,
	    sizeof(toep_key))) != 0)
		goto fail;

	/* Start the receive queue(s). */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qstart(sc, index)) != 0)
			goto fail2;
	}

	return (0);

fail2:
	while (--index >= 0)
		sfxge_rx_qstop(sc, index);

fail:
	efx_rx_fini(sc->enp);

	return (rc);
}
1021
/*
 * Initialise the per-queue LRO state: allocate the connection hash
 * table (lro_table_size buckets) and its fill-level counters, and set
 * up the active/free lists.  Allocations use M_WAITOK and cannot fail.
 */
static void sfxge_lro_init(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	unsigned i;

	st->conns_mask = lro_table_size - 1;
	KASSERT(!((st->conns_mask + 1) & st->conns_mask),
	    ("lro_table_size must be a power of 2"));
	st->sc = rxq->sc;
	st->conns = malloc((st->conns_mask + 1) * sizeof(st->conns[0]),
			   M_SFXGE, M_WAITOK);
	st->conns_n = malloc((st->conns_mask + 1) * sizeof(st->conns_n[0]),
			     M_SFXGE, M_WAITOK);
	for (i = 0; i <= st->conns_mask; ++i) {
		TAILQ_INIT(&st->conns[i]);
		st->conns_n[i] = 0;
	}
	LIST_INIT(&st->active_conns);
	TAILQ_INIT(&st->free_conns);
}
1042
/*
 * Tear down the per-queue LRO state: drop every tracked connection
 * (delivering any held packet), free the recycled connection entries
 * and then the hash table itself.  Safe to call more than once.
 */
static void sfxge_lro_fini(struct sfxge_rxq *rxq)
{
	struct sfxge_lro_state *st = &rxq->lro;
	struct sfxge_lro_conn *c;
	unsigned i;

	/* Return cleanly if sfxge_lro_init() has not been called. */
	if (st->conns == NULL)
		return;

	KASSERT(LIST_EMPTY(&st->active_conns), ("found active connections"));

	/* sfxge_lro_drop() moves each entry onto the free list... */
	for (i = 0; i <= st->conns_mask; ++i) {
		while (!TAILQ_EMPTY(&st->conns[i])) {
			c = TAILQ_LAST(&st->conns[i], sfxge_lro_tailq);
			sfxge_lro_drop(rxq, c);
		}
	}

	/* ...and the free list is then released in one pass. */
	while (!TAILQ_EMPTY(&st->free_conns)) {
		c = TAILQ_FIRST(&st->free_conns);
		TAILQ_REMOVE(&st->free_conns, c, link);
		KASSERT(!c->mbuf, ("found orphaned mbuf"));
		free(c, M_SFXGE);
	}

	free(st->conns_n, M_SFXGE);
	free(st->conns, M_SFXGE);
	st->conns = NULL;
}
1073
/*
 * Release everything allocated by sfxge_rx_qinit() for queue "index":
 * the software descriptor array, the LRO state, the descriptor-ring
 * DMA memory, and finally the queue structure itself.
 */
static void
sfxge_rx_qfini(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;

	rxq = sc->rxq[index];

	/* Queue must already be stopped. */
	KASSERT(rxq->init_state == SFXGE_RXQ_INITIALIZED,
	    ("rxq->init_state != SFXGE_RXQ_INITIALIZED"));

	/* Free the context array and the flow table. */
	free(rxq->queue, M_SFXGE);
	sfxge_lro_fini(rxq);

	/* Release DMA memory. */
	sfxge_dma_free(&rxq->mem);

	sc->rxq[index] = NULL;

	free(rxq, M_SFXGE);
}
1095
/*
 * Allocate and initialise RX queue "index": the queue structure, the
 * descriptor-ring DMA memory, SRAM buffer table entries, the software
 * descriptor array, LRO state and the refill callout.  Returns 0 or
 * errno (only DMA allocation can fail; other allocations sleep).
 */
static int
sfxge_rx_qinit(struct sfxge_softc *sc, unsigned int index)
{
	struct sfxge_rxq *rxq;
	struct sfxge_evq *evq;
	efsys_mem_t *esmp;
	int rc;

	KASSERT(index < sc->intr.n_alloc, ("index >= %d", sc->intr.n_alloc));

	rxq = malloc(sizeof(struct sfxge_rxq), M_SFXGE, M_ZERO | M_WAITOK);
	rxq->sc = sc;
	rxq->index = index;

	sc->rxq[index] = rxq;
	esmp = &rxq->mem;

	evq = sc->evq[index];

	/* Allocate and zero DMA space. */
	if ((rc = sfxge_dma_alloc(sc, EFX_RXQ_SIZE(SFXGE_NDESCS), esmp)) != 0)
		return (rc);
	(void)memset(esmp->esm_base, 0, EFX_RXQ_SIZE(SFXGE_NDESCS));

	/* Allocate buffer table entries. */
	sfxge_sram_buf_tbl_alloc(sc, EFX_RXQ_NBUFS(SFXGE_NDESCS),
				 &rxq->buf_base_id);

	/* Allocate the context array and the flow table. */
	rxq->queue = malloc(sizeof(struct sfxge_rx_sw_desc) * SFXGE_NDESCS,
			    M_SFXGE, M_WAITOK | M_ZERO);
	sfxge_lro_init(rxq);

	/* B_TRUE: callout runs with Giant not required. */
	callout_init(&rxq->refill_callout, B_TRUE);

	rxq->init_state = SFXGE_RXQ_INITIALIZED;

	return (0);
}
1135
/*
 * Table of LRO statistics exported via sysctl: each entry maps a
 * sysctl name to the offset of a per-queue counter inside struct
 * sfxge_rxq (summed across queues by sfxge_rx_stat_handler()).
 */
static const struct {
	const char *name;
	size_t offset;
} sfxge_rx_stats[] = {
#define SFXGE_RX_STAT(name, member) \
	{ #name, offsetof(struct sfxge_rxq, member) }
	SFXGE_RX_STAT(lro_merges, lro.n_merges),
	SFXGE_RX_STAT(lro_bursts, lro.n_bursts),
	SFXGE_RX_STAT(lro_slow_start, lro.n_slow_start),
	SFXGE_RX_STAT(lro_misorder, lro.n_misorder),
	SFXGE_RX_STAT(lro_too_many, lro.n_too_many),
	SFXGE_RX_STAT(lro_new_stream, lro.n_new_stream),
	SFXGE_RX_STAT(lro_drop_idle, lro.n_drop_idle),
	SFXGE_RX_STAT(lro_drop_closed, lro.n_drop_closed)
};
1151
1152 static int
1153 sfxge_rx_stat_handler(SYSCTL_HANDLER_ARGS)
1154 {
1155 struct sfxge_softc *sc = arg1;
1156 unsigned int id = arg2;
1157 unsigned int sum, index;
1158
1159 /* Sum across all RX queues */
1160 sum = 0;
1161 for (index = 0; index < sc->intr.n_alloc; index++)
1162 sum += *(unsigned int *)((caddr_t)sc->rxq[index] +
1163 sfxge_rx_stats[id].offset);
1164
1165 return SYSCTL_OUT(req, &sum, sizeof(sum));
1166 }
1167
/*
 * Register one read-only sysctl node per entry of sfxge_rx_stats[]
 * under the device's stats node, all served by sfxge_rx_stat_handler()
 * (the table index is passed as arg2).
 */
static void
sfxge_rx_stat_init(struct sfxge_softc *sc)
{
	struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(sc->dev);
	struct sysctl_oid_list *stat_list;
	unsigned int id;

	stat_list = SYSCTL_CHILDREN(sc->stats_node);

	for (id = 0;
	     id < sizeof(sfxge_rx_stats) / sizeof(sfxge_rx_stats[0]);
	     id++) {
		SYSCTL_ADD_PROC(
			ctx, stat_list,
			OID_AUTO, sfxge_rx_stats[id].name,
			CTLTYPE_UINT|CTLFLAG_RD,
			sc, id, sfxge_rx_stat_handler, "IU",
			"");
	}
}
1188
1189 void
1190 sfxge_rx_fini(struct sfxge_softc *sc)
1191 {
1192 struct sfxge_intr *intr;
1193 int index;
1194
1195 intr = &sc->intr;
1196
1197 index = intr->n_alloc;
1198 while (--index >= 0)
1199 sfxge_rx_qfini(sc, index);
1200 }
1201
/*
 * One-time RX initialisation: set the LRO idle timeout (which depends
 * on hz, so it cannot be a static initialiser) and create one RX queue
 * per allocated interrupt.  Returns 0 or errno; on failure all queues
 * created so far are destroyed again.
 */
int
sfxge_rx_init(struct sfxge_softc *sc)
{
	struct sfxge_intr *intr;
	int index;
	int rc;

	if (lro_idle_ticks == 0)
		lro_idle_ticks = hz / 10 + 1; /* 100 ms */

	intr = &sc->intr;

	KASSERT(intr->state == SFXGE_INTR_INITIALIZED,
	    ("intr->state != SFXGE_INTR_INITIALIZED"));

	/* Initialize the receive queue(s) - one per interrupt. */
	for (index = 0; index < intr->n_alloc; index++) {
		if ((rc = sfxge_rx_qinit(sc, index)) != 0)
			goto fail;
	}

	sfxge_rx_stat_init(sc);

	return (0);

fail:
	/* Tear down the receive queue(s). */
	while (--index >= 0)
		sfxge_rx_qfini(sc, index);

	return (rc);
}
Cache object: f427e957d9c82322328ae8dd925cf7a1
|