/*
 * FreeBSD/Linux Kernel Cross Reference extract: sys/net/flowtable.c
 */
1 /**************************************************************************
2
3 Copyright (c) 2008-2010, BitGravity Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
11
12 2. Neither the name of the BitGravity Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include "opt_route.h"
31 #include "opt_mpath.h"
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD: releng/9.1/sys/net/flowtable.c 232292 2012-02-29 09:47:26Z bz $");
38
39 #include <sys/param.h>
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/kernel.h>
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/socket.h>
54 #include <sys/syslog.h>
55 #include <sys/sysctl.h>
56
57 #include <net/if.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_var.h>
60 #include <net/route.h>
61 #include <net/flowtable.h>
62 #include <net/vnet.h>
63
64 #include <netinet/in.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
69 #ifdef INET6
70 #include <netinet/ip6.h>
71 #endif
72 #include <netinet/tcp.h>
73 #include <netinet/udp.h>
74 #include <netinet/sctp.h>
75
76 #include <libkern/jenkins.h>
77 #include <ddb/ddb.h>
78
/*
 * IPv4 flow tuple: ports first so that, overlaid as ipf_key below,
 * they share key word 0.
 */
struct ipv4_tuple {
	uint16_t 	ip_sport;	/* source port */
	uint16_t 	ip_dport;	/* destination port */
	in_addr_t 	ip_saddr;	/* source address */
	in_addr_t 	ip_daddr;	/* destination address */
};

/* View of the IPv4 tuple as the 3-word key fed to the hash. */
union ipv4_flow {
	struct ipv4_tuple ipf_ipt;
	uint32_t 	ipf_key[3];
};

/* IPv6 flow tuple; layout mirrors ipv4_tuple. */
struct ipv6_tuple {
	uint16_t 	ip_sport;	/* source port */
	uint16_t 	ip_dport;	/* destination port */
	struct in6_addr	ip_saddr;	/* source address */
	struct in6_addr	ip_daddr;	/* destination address */
};

/* View of the IPv6 tuple as the 9-word key fed to the hash. */
union ipv6_flow {
	struct ipv6_tuple ipf_ipt;
	uint32_t 	ipf_key[9];
};
102
/*
 * One cached flow: forward hash, protocol/state flags, fib, time of
 * last access, collision chain link and the cached route + L2 entry.
 * Family-specific key material follows in flentry_v4/flentry_v6.
 */
struct flentry {
	volatile uint32_t	f_fhash;	/* hash flowing forward */
	uint16_t		f_flags;	/* flow flags */
	uint8_t			f_pad;
	uint8_t			f_proto;	/* protocol */
	uint32_t		f_fibnum;	/* fib index */
	uint32_t		f_uptime;	/* uptime at last access */
	struct flentry		*f_next;	/* pointer to collision entry */
	volatile struct rtentry *f_rt;		/* rtentry for flow */
	volatile struct llentry *f_lle;		/* llentry for flow */
};

struct flentry_v4 {
	struct flentry	fl_entry;
	union ipv4_flow	fl_flow;
};

struct flentry_v6 {
	struct flentry	fl_entry;
	union ipv6_flow	fl_flow;
};
124
/*
 * NOTE(review): struct flentry names its members f_*, so these fl_*
 * accessors expand to a nonexistent fl_entry.fl_* member.  They appear
 * to be unused; confirm (f_* presumably intended) before relying on
 * them.
 */
#define	fl_fhash	fl_entry.fl_fhash
#define	fl_flags	fl_entry.fl_flags
#define	fl_proto	fl_entry.fl_proto
#define	fl_uptime	fl_entry.fl_uptime
#define	fl_rt		fl_entry.fl_rt
#define	fl_lle		fl_entry.fl_lle

#define	SECS_PER_HOUR		3600
#define	SECS_PER_DAY		(24*SECS_PER_HOUR)

/* Default idle timeouts, in seconds, for each flow class. */
#define	SYN_IDLE		300
#define	UDP_IDLE		300
#define	FIN_WAIT_IDLE		600
#define	TCP_IDLE		SECS_PER_DAY
140
/* Per-table locking and route-allocation strategy hooks. */
typedef void fl_lock_t(struct flowtable *, uint32_t);
typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);

/* Bucket array: one shared array, or one array per cpu (FL_PCPU). */
union flentryp {
	struct flentry		**global;
	struct flentry		**pcpu[MAXCPU];
};

/* Counters; aligned so per-cpu instances do not false-share. */
struct flowtable_stats {
	uint64_t	ft_collisions;
	uint64_t	ft_allocated;
	uint64_t	ft_misses;
	uint64_t	ft_max_depth;
	uint64_t	ft_free_checks;
	uint64_t	ft_frees;
	uint64_t	ft_hits;
	uint64_t	ft_lookups;
} __aligned(CACHE_LINE_SIZE);
159
struct flowtable {
	struct	flowtable_stats ft_stats[MAXCPU];	/* per-cpu counters */
	int 		ft_size;	/* number of hash buckets */
	int 		ft_lock_count;	/* lock stripes (power of two) */
	uint32_t	ft_flags;	/* FL_* behavior flags */
	char		*ft_name;
	fl_lock_t	*ft_lock;	/* global mtx or critical section */
	fl_lock_t 	*ft_unlock;
	fl_rtalloc_t	*ft_rtalloc;
	/*
	 * XXX need to pad out 
	 */ 
	struct mtx	*ft_locks;	/* stripe array (global mode only) */
	union flentryp	ft_table;	/* bucket array(s) */
	bitstr_t 	*ft_masks[MAXCPU];	/* occupied-bucket bitmaps */
	bitstr_t	*ft_tmpmask;	/* scratch bitmap for the cleaner */
	struct flowtable *ft_next;	/* next table in this vnet's list */

	uint32_t	 ft_count __aligned(CACHE_LINE_SIZE);	/* live flows */
	uint32_t	 ft_udp_idle __aligned(CACHE_LINE_SIZE);	/* timeouts (s) */
	uint32_t	 ft_fin_wait_idle;
	uint32_t	 ft_syn_idle;
	uint32_t	 ft_tcp_idle;
	boolean_t	 ft_full;	/* near capacity; see flow_full() */
} __aligned(CACHE_LINE_SIZE);
185
static struct proc *flowcleanerproc;	/* kthread that expires stale flows */
static VNET_DEFINE(struct flowtable *, flow_list_head);	/* per-vnet table list */
static VNET_DEFINE(uint32_t, flow_hashjitter);	/* per-vnet hash seed */
static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);

#define	V_flow_list_head VNET(flow_list_head)
#define	V_flow_hashjitter VNET(flow_hashjitter)
#define	V_flow_ipv4_zone VNET(flow_ipv4_zone)
#define	V_flow_ipv6_zone VNET(flow_ipv6_zone)


/* Cleaner-thread synchronization state (shared across vnets). */
static struct cv 	flowclean_f_cv;
static struct cv 	flowclean_c_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;
static uint32_t		flowclean_freq;	/* cleaner wakeup period, in ticks */
203
#ifdef FLOWTABLE_DEBUG
/*
 * Conditional debug printf: emits only when one of 'flags' is set in
 * the table's ft_flags.  The do/while(0) wrapper must NOT carry a
 * trailing semicolon (the previous definition did, which would break
 * any "if (x) FLDPRINTF(...); else" construct).
 */
#define FLDPRINTF(ft, flags, fmt, ...) 					\
do {		 							\
	if ((ft)->ft_flags & (flags))					\
		printf((fmt), __VA_ARGS__);				\
} while (0)
#else
#define FLDPRINTF(ft, flags, fmt, ...)
#endif
215
216
/*
 * TODO:
 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
 *   to avoid extra cache evictions caused by incrementing a shared
 *   counter
 * - add sysctls to resize && flush flow tables 
 * - Add per flowtable sysctls for statistics and configuring timeouts
 * - add saturation counter to rtentry to support per-packet load-balancing
 *   add flag to indicate round-robin flow, add list lookup from head
 *   for flows
 * - add sysctl / device node / syscall to support exporting and importing
 *   of flows with flag to indicate that a flow was imported so should
 *   not be considered for auto-cleaning
 * - support explicit connection state (currently only ad-hoc for DSR)
 * - idetach() cleanup for options VIMAGE builds.
 */
VNET_DEFINE(int, flowtable_enable) = 1;		/* master on/off switch */
static VNET_DEFINE(int, flowtable_debug);
static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
static VNET_DEFINE(int, flowtable_nmbflows);	/* flow cap; see sysctl_nmbflows */
static VNET_DEFINE(int, flowtable_ready) = 0;	/* set once tables exist */

#define	V_flowtable_enable		VNET(flowtable_enable)
#define	V_flowtable_debug		VNET(flowtable_debug)
#define	V_flowtable_syn_expire		VNET(flowtable_syn_expire)
#define	V_flowtable_udp_expire		VNET(flowtable_udp_expire)
#define	V_flowtable_fin_wait_expire	VNET(flowtable_fin_wait_expire)
#define	V_flowtable_tcp_expire		VNET(flowtable_tcp_expire)
#define	V_flowtable_nmbflows		VNET(flowtable_nmbflows)
#define	V_flowtable_ready		VNET(flowtable_ready)

SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
    &VNET_NAME(flowtable_debug), 0, "print debug info.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");

/*
 * XXX This does not end up updating timeouts at runtime
 * and only reflects the value for the last table added :-/
 */
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_syn_expire), 0,
    "seconds after which to remove syn allocated flow.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_udp_expire), 0,
    "seconds after which to remove flow allocated to UDP.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_fin_wait_expire), 0,
    "seconds after which to remove a flow in FIN_WAIT.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_tcp_expire), 0,
    "seconds after which to remove flow allocated to a TCP connection.");
274
275 /*
276 * Maximum number of flows that can be allocated of a given type.
277 *
278 * The table is allocated at boot time (for the pure caching case
279 * there is no reason why this could not be changed at runtime)
280 * and thus (currently) needs to be set with a tunable.
281 */
282 static int
283 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
284 {
285 int error, newnmbflows;
286
287 newnmbflows = V_flowtable_nmbflows;
288 error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
289 if (error == 0 && req->newptr) {
290 if (newnmbflows > V_flowtable_nmbflows) {
291 V_flowtable_nmbflows = newnmbflows;
292 uma_zone_set_max(V_flow_ipv4_zone,
293 V_flowtable_nmbflows);
294 uma_zone_set_max(V_flow_ipv6_zone,
295 V_flowtable_nmbflows);
296 } else
297 error = EINVAL;
298 }
299 return (error);
300 }
301 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
302 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
303 "Maximum number of flows allowed");
304
305
306
307 #define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
308
309 static void
310 fs_print(struct sbuf *sb, struct flowtable_stats *fs)
311 {
312
313 FS_PRINT(sb, collisions);
314 FS_PRINT(sb, allocated);
315 FS_PRINT(sb, misses);
316 FS_PRINT(sb, max_depth);
317 FS_PRINT(sb, free_checks);
318 FS_PRINT(sb, frees);
319 FS_PRINT(sb, hits);
320 FS_PRINT(sb, lookups);
321 }
322
323 static void
324 flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
325 {
326 int i;
327 struct flowtable_stats fs, *pfs;
328
329 if (ft->ft_flags & FL_PCPU) {
330 bzero(&fs, sizeof(fs));
331 pfs = &fs;
332 CPU_FOREACH(i) {
333 pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
334 pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
335 pfs->ft_misses += ft->ft_stats[i].ft_misses;
336 pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
337 pfs->ft_frees += ft->ft_stats[i].ft_frees;
338 pfs->ft_hits += ft->ft_stats[i].ft_hits;
339 pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
340 if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
341 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
342 }
343 } else {
344 pfs = &ft->ft_stats[0];
345 }
346 fs_print(sb, pfs);
347 }
348
349 static int
350 sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
351 {
352 struct flowtable *ft;
353 struct sbuf *sb;
354 int error;
355
356 sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
357
358 ft = V_flow_list_head;
359 while (ft != NULL) {
360 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
361 flowtable_show_stats(sb, ft);
362 ft = ft->ft_next;
363 }
364 sbuf_finish(sb);
365 error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
366 sbuf_delete(sb);
367
368 return (error);
369 }
370 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
371 NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
372
373
#ifndef RADIX_MPATH
/*
 * Adapt rtalloc_ign_fib() to the fl_rtalloc_t signature.  The flow
 * hash argument is intentionally unused when multipath routing is not
 * compiled in.
 */
static void
rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
{

	rtalloc_ign_fib(ro, 0, fibnum);
}
#endif
382
383 static void
384 flowtable_global_lock(struct flowtable *table, uint32_t hash)
385 {
386 int lock_index = (hash)&(table->ft_lock_count - 1);
387
388 mtx_lock(&table->ft_locks[lock_index]);
389 }
390
391 static void
392 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
393 {
394 int lock_index = (hash)&(table->ft_lock_count - 1);
395
396 mtx_unlock(&table->ft_locks[lock_index]);
397 }
398
/*
 * Per-cpu tables are only touched by the cpu that owns them, so a
 * critical section (no preemption) suffices; both arguments exist only
 * to satisfy the fl_lock_t signature.
 */
static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

	critical_enter();
}

static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

	critical_exit();
}
412
/* Bucket/lock helpers: dispatch through the table's strategy hooks. */
#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
#define FL_ENTRY_LOCK(table, hash)  (table)->ft_lock((table), (hash))
#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))

#define FL_STALE 	(1<<8)		/* entry should be reclaimed */
#define FL_OVERWRITE	(1<<10)		/* insert may replace a live entry */

/*
 * Mark a flow so that flow_stale() reports it reclaimable on the next
 * lookup or cleaning pass.
 */
void
flow_invalidate(struct flentry *fle)
{

	fle->f_flags |= FL_STALE;
}
427
428 static __inline int
429 proto_to_flags(uint8_t proto)
430 {
431 int flag;
432
433 switch (proto) {
434 case IPPROTO_TCP:
435 flag = FL_TCP;
436 break;
437 case IPPROTO_SCTP:
438 flag = FL_SCTP;
439 break;
440 case IPPROTO_UDP:
441 flag = FL_UDP;
442 break;
443 default:
444 flag = 0;
445 break;
446 }
447
448 return (flag);
449 }
450
451 static __inline int
452 flags_to_proto(int flags)
453 {
454 int proto, protoflags;
455
456 protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
457 switch (protoflags) {
458 case FL_TCP:
459 proto = IPPROTO_TCP;
460 break;
461 case FL_SCTP:
462 proto = IPPROTO_SCTP;
463 break;
464 case FL_UDP:
465 proto = IPPROTO_UDP;
466 break;
467 default:
468 proto = 0;
469 break;
470 }
471 return (proto);
472 }
473
#ifdef INET
#ifdef FLOWTABLE_DEBUG
/*
 * Debug helper: print the 5-tuple (with FL_HASH_ALL) or just the
 * destination address of a harvested IPv4 flow.
 */
static void
ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
    struct sockaddr_in *dsin)
{
	/* Room for a dotted quad plus NUL. */
	char saddr[4*sizeof "123"], daddr[4*sizeof "123"];

	if (flags & FL_HASH_ALL) {
		inet_ntoa_r(ssin->sin_addr, saddr);
		inet_ntoa_r(dsin->sin_addr, daddr);
		printf("proto=%d %s:%d->%s:%d\n",
		    proto, saddr, ntohs(ssin->sin_port), daddr,
		    ntohs(dsin->sin_port));
	} else {
		inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
		printf("proto=%d %s\n", proto, daddr);
	}

}
#endif
495
/*
 * Extract the IPv4 endpoints (and, for port-hashing tables, the
 * transport ports) from 'm' into 'ssin'/'dsin', merging the protocol
 * class bits into *flags.  Assumes the IP and transport headers are
 * contiguous in the first mbuf -- TODO confirm callers guarantee this.
 * Returns ENOTSUP for protocols the flowtable does not track.
 */
static int
ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
    struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
{
	struct ip *ip;
	uint8_t proto;
	int iphlen;
	struct tcphdr *th;
	struct udphdr *uh;
	struct sctphdr *sh;
	uint16_t sport, dport;

	proto = sport = dport = 0;
	ip = mtod(m, struct ip *);
	dsin->sin_family = AF_INET;
	dsin->sin_len = sizeof(*dsin);
	dsin->sin_addr = ip->ip_dst;
	ssin->sin_family = AF_INET;
	ssin->sin_len = sizeof(*ssin);
	ssin->sin_addr = ip->ip_src;

	proto = ip->ip_p;
	if ((*flags & FL_HASH_ALL) == 0) {
		/* Destination-only table: ports are not part of the key. */
		FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
		    *flags);
		goto skipports;
	}

	iphlen = ip->ip_hl << 2; /* XXX options? */

	switch (proto) {
	case IPPROTO_TCP:
		th = (struct tcphdr *)((caddr_t)ip + iphlen);
		sport = th->th_sport;
		dport = th->th_dport;
		/* Connection teardown: mark the flow for eviction. */
		if ((*flags & FL_HASH_ALL) &&
		    (th->th_flags & (TH_RST|TH_FIN)))
			*flags |= FL_STALE;
		break;
	case IPPROTO_UDP:
		uh = (struct udphdr *)((caddr_t)ip + iphlen);
		sport = uh->uh_sport;
		dport = uh->uh_dport;
		break;
	case IPPROTO_SCTP:
		sh = (struct sctphdr *)((caddr_t)ip + iphlen);
		sport = sh->src_port;
		dport = sh->dest_port;
		break;
	default:
		FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
		return (ENOTSUP);
		/* no port - hence not a protocol we care about */
		break;

	}

skipports:
	*flags |= proto_to_flags(proto);
	ssin->sin_port = sport;
	dsin->sin_port = dport;
	return (0);
}
559
/*
 * Build the 3-word IPv4 hash key and return its Jenkins hash.  With
 * FL_HASH_ALL the two ports are packed into the halves of key[0];
 * otherwise key[0] stays 0 and a per-protocol jitter is folded into
 * the hash offset so different protocols hash differently.  Returns 0
 * ("no flow") when the flowtable is disabled or not yet initialized.
 */
static uint32_t
ipv4_flow_lookup_hash_internal(
	struct sockaddr_in *ssin, struct sockaddr_in *dsin, 
	    uint32_t *key, uint16_t flags)
{
	uint16_t sport, dport;
	uint8_t proto;
	int offset = 0;

	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
		return (0);
	proto = flags_to_proto(flags);
	sport = dport = key[2] = key[1] = key[0] = 0;
	if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
		key[1] = ssin->sin_addr.s_addr;
		sport = ssin->sin_port;
	}
	if (dsin != NULL) {
		key[2] = dsin->sin_addr.s_addr;
		dport = dsin->sin_port;
	}
	if (flags & FL_HASH_ALL) {
		/* Pack sport/dport into the two halves of key[0]. */
		((uint16_t *)key)[0] = sport;
		((uint16_t *)key)[1] = dport;
	} else
		offset = V_flow_hashjitter + proto;

	return (jenkins_hashword(key, 3, offset));
}
589
590 static struct flentry *
591 flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
592 {
593 struct sockaddr_storage ssa, dsa;
594 uint16_t flags;
595 struct sockaddr_in *dsin, *ssin;
596
597 dsin = (struct sockaddr_in *)&dsa;
598 ssin = (struct sockaddr_in *)&ssa;
599 bzero(dsin, sizeof(*dsin));
600 bzero(ssin, sizeof(*ssin));
601 flags = ft->ft_flags;
602 if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
603 return (NULL);
604
605 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
606 }
607
608 void
609 flow_to_route(struct flentry *fle, struct route *ro)
610 {
611 uint32_t *hashkey = NULL;
612 struct sockaddr_in *sin;
613
614 sin = (struct sockaddr_in *)&ro->ro_dst;
615 sin->sin_family = AF_INET;
616 sin->sin_len = sizeof(*sin);
617 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
618 sin->sin_addr.s_addr = hashkey[2];
619 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
620 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
621 }
622 #endif /* INET */
623
#ifdef INET6
/*
 * PULLUP_TO(len, p, T): despite the name this never calls m_pullup();
 * if len + sizeof(T) is not already contiguous in the first mbuf it
 * bails out via the enclosing function's receive_failed label.
 * Otherwise it points p at the offset "len" in the mbuf.  WARNING: the
 * pointer might become stale after other pullups (but we never use it
 * this way).
 */
#define PULLUP_TO(_len, p, T)						\
do {									\
	int x = (_len) + sizeof(T);					\
	if ((m)->m_len < x) {						\
		goto receive_failed;					\
	}								\
	p = (mtod(m, char *) + (_len));					\
} while (0)

/* Typed views of the upper-layer-protocol pointer. */
#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))
643
/*
 * Extract the IPv6 endpoints (and, for port-hashing tables, the
 * transport ports) from 'm' into 'ssin6'/'dsin6', merging the protocol
 * class bits into *flags.  For FL_HASH_ALL tables the extension-header
 * chain is walked until a transport header (or a header that cannot
 * carry one) is found.  Returns ENOTSUP when no usable transport
 * header is found, including non-contiguous headers (receive_failed)
 * and flows that end the walk with src_port still 0.
 */
static int
ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
    struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
{
	struct ip6_hdr *ip6;
	uint8_t proto;
	int hlen;
	uint16_t src_port, dst_port;
	u_short offset;
	void *ulp;

	offset = hlen = src_port = dst_port = 0;
	ulp = NULL;
	ip6 = mtod(m, struct ip6_hdr *);
	hlen = sizeof(struct ip6_hdr);
	proto = ip6->ip6_nxt;

	if ((*flags & FL_HASH_ALL) == 0)
		goto skipports;

	/* Walk extension headers until ulp points at the transport header. */
	while (ulp == NULL) {
		switch (proto) {
		case IPPROTO_ICMPV6:
		case IPPROTO_OSPFIGP:
		case IPPROTO_PIM:
		case IPPROTO_CARP:
		case IPPROTO_ESP:
		case IPPROTO_NONE:
			/* No ports to harvest; terminate the walk. */
			ulp = ip6;
			break;
		case IPPROTO_TCP:
			PULLUP_TO(hlen, ulp, struct tcphdr);
			dst_port = TCP(ulp)->th_dport;
			src_port = TCP(ulp)->th_sport;
			/* Connection teardown: mark the flow for eviction. */
			if ((*flags & FL_HASH_ALL) &&
			    (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
				*flags |= FL_STALE;
			break;
		case IPPROTO_SCTP:
			PULLUP_TO(hlen, ulp, struct sctphdr);
			src_port = SCTP(ulp)->src_port;
			dst_port = SCTP(ulp)->dest_port;
			break;
		case IPPROTO_UDP:
			PULLUP_TO(hlen, ulp, struct udphdr);
			dst_port = UDP(ulp)->uh_dport;
			src_port = UDP(ulp)->uh_sport;
			break;
		case IPPROTO_HOPOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_ROUTING:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_rthdr);
			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
			ulp = NULL;
			break;
		case IPPROTO_FRAGMENT:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_frag);
			hlen += sizeof (struct ip6_frag);
			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
			offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
			    IP6F_OFF_MASK;
			ulp = NULL;
			break;
		case IPPROTO_DSTOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_AH:	/* RFC 2402 */
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
			ulp = NULL;
			break;
		default:
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			break;
		}
	}

	if (src_port == 0) {
	receive_failed:
		return (ENOTSUP);
	}

skipports:
	dsin6->sin6_family = AF_INET6;
	dsin6->sin6_len = sizeof(*dsin6);
	dsin6->sin6_port = dst_port;
	memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));

	ssin6->sin6_family = AF_INET6;
	ssin6->sin6_len = sizeof(*ssin6);
	ssin6->sin6_port = src_port;
	memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
	*flags |= proto_to_flags(proto);

	return (0);
}
749
/*
 * Clear all nine words of a (v6-sized) flow hash key.  The argument is
 * parenthesized so any pointer-valued expression may be passed (the
 * previous definition expanded "key[0]" unparenthesized, which broke
 * for arguments such as "p + 1").
 */
#define zero_key(key) 		\
do {				\
	(key)[0] = 0;		\
	(key)[1] = 0;		\
	(key)[2] = 0;		\
	(key)[3] = 0;		\
	(key)[4] = 0;		\
	(key)[5] = 0;		\
	(key)[6] = 0;		\
	(key)[7] = 0;		\
	(key)[8] = 0;		\
} while (0)
762
/*
 * Build the 9-word IPv6 hash key and return its Jenkins hash: key[0]
 * carries the packed ports (or stays 0), key[1..4] the destination
 * address and key[5..8] the source.  Without FL_HASH_ALL a
 * per-protocol jitter is folded into the hash offset instead.  Returns
 * 0 ("no flow") when the flowtable is disabled or not initialized.
 */
static uint32_t
ipv6_flow_lookup_hash_internal(
	struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, 
	    uint32_t *key, uint16_t flags)
{
	uint16_t sport, dport;
	uint8_t proto;
	int offset = 0;

	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
		return (0);

	proto = flags_to_proto(flags);
	zero_key(key);
	sport = dport = 0;
	if (dsin6 != NULL) {
		memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
		dport = dsin6->sin6_port;
	}
	if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
		memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
		sport = ssin6->sin6_port;
	}
	if (flags & FL_HASH_ALL) {
		/* Pack sport/dport into the two halves of key[0]. */
		((uint16_t *)key)[0] = sport;
		((uint16_t *)key)[1] = dport;
	} else
		offset = V_flow_hashjitter + proto;

	return (jenkins_hashword(key, 9, offset));
}
794
795 static struct flentry *
796 flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
797 {
798 struct sockaddr_storage ssa, dsa;
799 struct sockaddr_in6 *dsin6, *ssin6;
800 uint16_t flags;
801
802 dsin6 = (struct sockaddr_in6 *)&dsa;
803 ssin6 = (struct sockaddr_in6 *)&ssa;
804 bzero(dsin6, sizeof(*dsin6));
805 bzero(ssin6, sizeof(*ssin6));
806 flags = ft->ft_flags;
807
808 if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
809 return (NULL);
810
811 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
812 }
813
814 void
815 flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
816 {
817 uint32_t *hashkey = NULL;
818 struct sockaddr_in6 *sin6;
819
820 sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
821
822 sin6->sin6_family = AF_INET6;
823 sin6->sin6_len = sizeof(*sin6);
824 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
825 memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
826 ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
827 ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
828
829 }
830 #endif /* INET6 */
831
832 static bitstr_t *
833 flowtable_mask(struct flowtable *ft)
834 {
835 bitstr_t *mask;
836
837 if (ft->ft_flags & FL_PCPU)
838 mask = ft->ft_masks[curcpu];
839 else
840 mask = ft->ft_masks[0];
841
842 return (mask);
843 }
844
845 static struct flentry **
846 flowtable_entry(struct flowtable *ft, uint32_t hash)
847 {
848 struct flentry **fle;
849 int index = (hash % ft->ft_size);
850
851 if (ft->ft_flags & FL_PCPU) {
852 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
853 fle = &ft->ft_table.pcpu[curcpu][index];
854 } else {
855 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
856 fle = &ft->ft_table.global[index];
857 }
858
859 return (fle);
860 }
861
/*
 * Return 1 when 'fle' may be reclaimed: the slot is unused (hash 0),
 * its cached route is unusable (host route down, no interface, or link
 * down), it was marked FL_STALE, or it has sat idle longer than the
 * timeout for its class.  TCP state is encoded with TH_* bits in
 * f_flags: no bits means a non-TCP (UDP-class) flow, SYN alone means
 * embryonic, SYN|ACK established, FIN closing.
 */
static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
	time_t idle_time;

	if ((fle->f_fhash == 0)
	    || ((fle->f_rt->rt_flags & RTF_HOST) &&
		((fle->f_rt->rt_flags & (RTF_UP))
		    != (RTF_UP)))
	    || (fle->f_rt->rt_ifp == NULL)
	    || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
		return (1);

	idle_time = time_uptime - fle->f_uptime;

	/*
	 * NOTE(review): the final RTF_UP/rt_ifp clause duplicates the
	 * checks above and can never newly fire here.
	 */
	if ((fle->f_flags & FL_STALE) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
		&& (idle_time > ft->ft_udp_idle)) ||
	    ((fle->f_flags & TH_FIN)
		&& (idle_time > ft->ft_fin_wait_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
		&& (idle_time > ft->ft_syn_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
		&& (idle_time > ft->ft_tcp_idle)) ||
	    ((fle->f_rt->rt_flags & RTF_UP) == 0 || 
		(fle->f_rt->rt_ifp == NULL)))
		return (1);

	return (0);
}
892
893 static void
894 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
895 {
896 uint32_t *hashkey;
897 int i, nwords;
898
899 if (fle->f_flags & FL_IPV6) {
900 nwords = 9;
901 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
902 } else {
903 nwords = 3;
904 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
905 }
906
907 for (i = 0; i < nwords; i++)
908 hashkey[i] = key[i];
909 }
910
911 static struct flentry *
912 flow_alloc(struct flowtable *ft)
913 {
914 struct flentry *newfle;
915 uma_zone_t zone;
916
917 newfle = NULL;
918 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
919
920 newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
921 if (newfle != NULL)
922 atomic_add_int(&ft->ft_count, 1);
923 return (newfle);
924 }
925
926 static void
927 flow_free(struct flentry *fle, struct flowtable *ft)
928 {
929 uma_zone_t zone;
930
931 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
932 atomic_add_int(&ft->ft_count, -1);
933 uma_zfree(zone, fle);
934 }
935
/*
 * Hysteresis check on table occupancy: ft_full is set once the count
 * climbs within 1/32 of the flow limit and cleared only after it drops
 * below 7/8 of it.  A transition also retunes the cleaner period (and,
 * for destination-only tables, the idle timeouts).  Runs unlocked;
 * ft_full/ft_count are read racily, which at worst delays a
 * transition.  NOTE(review): the two transition branches look
 * inverted -- becoming full selects the slower 20*hz period while
 * clearing selects the aggressive 4*hz one; confirm intent.
 */
static int
flow_full(struct flowtable *ft)
{
	boolean_t full;
	uint32_t count;
	
	full = ft->ft_full;
	count = ft->ft_count;

	if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
		ft->ft_full = FALSE;
	else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
		ft->ft_full = TRUE;
	
	if (full && !ft->ft_full) {
		flowclean_freq = 4*hz;
		if ((ft->ft_flags & FL_HASH_ALL) == 0)
			ft->ft_udp_idle = ft->ft_fin_wait_idle =
			    ft->ft_syn_idle = ft->ft_tcp_idle = 5;
		cv_broadcast(&flowclean_c_cv);
	} else if (!full && ft->ft_full) {
		flowclean_freq = 20*hz;
		if ((ft->ft_flags & FL_HASH_ALL) == 0)
			ft->ft_udp_idle = ft->ft_fin_wait_idle =
			    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
	}

	return (ft->ft_full);
}
965
966 static int
967 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
968 uint32_t fibnum, struct route *ro, uint16_t flags)
969 {
970 struct flentry *fle, *fletail, *newfle, **flep;
971 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
972 int depth;
973 bitstr_t *mask;
974 uint8_t proto;
975
976 newfle = flow_alloc(ft);
977 if (newfle == NULL)
978 return (ENOMEM);
979
980 newfle->f_flags |= (flags & FL_IPV6);
981 proto = flags_to_proto(flags);
982
983 FL_ENTRY_LOCK(ft, hash);
984 mask = flowtable_mask(ft);
985 flep = flowtable_entry(ft, hash);
986 fletail = fle = *flep;
987
988 if (fle == NULL) {
989 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
990 *flep = fle = newfle;
991 goto skip;
992 }
993
994 depth = 0;
995 fs->ft_collisions++;
996 /*
997 * find end of list and make sure that we were not
998 * preempted by another thread handling this flow
999 */
1000 while (fle != NULL) {
1001 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
1002 /*
1003 * there was either a hash collision
1004 * or we lost a race to insert
1005 */
1006 FL_ENTRY_UNLOCK(ft, hash);
1007 flow_free(newfle, ft);
1008
1009 if (flags & FL_OVERWRITE)
1010 goto skip;
1011 return (EEXIST);
1012 }
1013 /*
1014 * re-visit this double condition XXX
1015 */
1016 if (fletail->f_next != NULL)
1017 fletail = fle->f_next;
1018
1019 depth++;
1020 fle = fle->f_next;
1021 }
1022
1023 if (depth > fs->ft_max_depth)
1024 fs->ft_max_depth = depth;
1025 fletail->f_next = newfle;
1026 fle = newfle;
1027 skip:
1028 flowtable_set_hashkey(fle, key);
1029
1030 fle->f_proto = proto;
1031 fle->f_rt = ro->ro_rt;
1032 fle->f_lle = ro->ro_lle;
1033 fle->f_fhash = hash;
1034 fle->f_fibnum = fibnum;
1035 fle->f_uptime = time_uptime;
1036 FL_ENTRY_UNLOCK(ft, hash);
1037 return (0);
1038 }
1039
1040 int
1041 kern_flowtable_insert(struct flowtable *ft,
1042 struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1043 struct route *ro, uint32_t fibnum, int flags)
1044 {
1045 uint32_t key[9], hash;
1046
1047 flags = (ft->ft_flags | flags | FL_OVERWRITE);
1048 hash = 0;
1049
1050 #ifdef INET
1051 if (ssa->ss_family == AF_INET)
1052 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1053 (struct sockaddr_in *)dsa, key, flags);
1054 #endif
1055 #ifdef INET6
1056 if (ssa->ss_family == AF_INET6)
1057 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1058 (struct sockaddr_in6 *)dsa, key, flags);
1059 #endif
1060 if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1061 return (EINVAL);
1062
1063 FLDPRINTF(ft, FL_DEBUG,
1064 "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1065 key[0], key[1], key[2], hash, fibnum, flags);
1066 return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
1067 }
1068
1069 static int
1070 flowtable_key_equal(struct flentry *fle, uint32_t *key)
1071 {
1072 uint32_t *hashkey;
1073 int i, nwords;
1074
1075 if (fle->f_flags & FL_IPV6) {
1076 nwords = 9;
1077 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1078 } else {
1079 nwords = 3;
1080 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1081 }
1082
1083 for (i = 0; i < nwords; i++)
1084 if (hashkey[i] != key[i])
1085 return (0);
1086
1087 return (1);
1088 }
1089
/*
 * Look up (or create) the flow for packet 'm' in address family 'af',
 * dispatching to the per-family helper.  On a hit the packet is
 * stamped with the flow hash as its flowid if it does not already
 * carry one.  Returns NULL when the family is not compiled in or the
 * lookup fails.
 */
struct flentry *
flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
{
	struct flentry *fle = NULL;

#ifdef INET
	if (af == AF_INET)
		fle = flowtable_lookup_mbuf4(ft, m);
#endif
#ifdef INET6
	if (af == AF_INET6)
		fle = flowtable_lookup_mbuf6(ft, m);
#endif	
	if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
		m->m_flags |= M_FLOWID;
		m->m_pkthdr.flowid = fle->f_fhash;
	}
	return (fle);
}
1109
/*
 * Look up the flow entry for the (ssa, dsa, fibnum) tuple in 'ft',
 * lazily inserting a new entry (with route and link-layer entry
 * resolved) on a miss unless FL_NOAUTO is set or the table is full.
 *
 * Returns NULL when the flow is not cacheable (self-directed or
 * loopback traffic, zero hash), the route is unreachable, or the
 * insert fails.
 */
struct flentry *
flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
    struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
{
	uint32_t key[9], hash;	/* sized for the 9-word IPv6 key */
	struct flentry *fle;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];
	uint8_t proto = 0;
	int error = 0;
	struct rtentry *rt;
	struct llentry *lle;
	struct route sro, *ro;
	struct route_in6 sro6;

	sro.ro_rt = sro6.ro_rt = NULL;
	sro.ro_lle = sro6.ro_lle = NULL;
	ro = NULL;
	hash = 0;
	flags |= ft->ft_flags;
	proto = flags_to_proto(flags);
#ifdef INET
	if (ssa->ss_family == AF_INET) {
		struct sockaddr_in *ssin, *dsin;

		ro = &sro;
		memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
		/*
		 * The harvested source and destination addresses
		 * may contain port information if the packet is
		 * from a transport protocol (e.g. TCP/UDP). The
		 * port field must be cleared before performing
		 * a route lookup.
		 */
		((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
		dsin = (struct sockaddr_in *)dsa;
		ssin = (struct sockaddr_in *)ssa;
		/* Self-directed and loopback-net flows are never cached. */
		if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
		    (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
		    (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
			return (NULL);

		hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
	}
#endif
#ifdef INET6
	if (ssa->ss_family == AF_INET6) {
		struct sockaddr_in6 *ssin6, *dsin6;

		ro = (struct route *)&sro6;
		memcpy(&sro6.ro_dst, dsa,
		    sizeof(struct sockaddr_in6));
		/* Clear the port before the route lookup, as above. */
		((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
		dsin6 = (struct sockaddr_in6 *)dsa;
		ssin6 = (struct sockaddr_in6 *)ssa;

		flags |= FL_IPV6;
		hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
	}
#endif
	/*
	 * Ports are zero and this isn't a transmit cache
	 * - thus not a protocol for which we need to keep
	 * state
	 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
	 */
	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
		return (NULL);

	fs->ft_lookups++;
	FL_ENTRY_LOCK(ft, hash);
	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
		FL_ENTRY_UNLOCK(ft, hash);
		goto uncached;
	}
keycheck:
	/*
	 * Walk the collision chain under the bucket lock.  An entry only
	 * counts as a hit when hash, key, protocol, and fib all match
	 * and both the cached route and link-layer entry are still
	 * usable.
	 */
	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	if ((rt != NULL)
	    && lle != NULL
	    && fle->f_fhash == hash
	    && flowtable_key_equal(fle, key)
	    && (proto == fle->f_proto)
	    && (fibnum == fle->f_fibnum)
	    && (rt->rt_flags & RTF_UP)
	    && (rt->rt_ifp != NULL)
	    && (lle->la_flags & LLE_VALID)) {
		fs->ft_hits++;
		fle->f_uptime = time_uptime;
		fle->f_flags |= flags;
		FL_ENTRY_UNLOCK(ft, hash);
		return (fle);
	} else if (fle->f_next != NULL) {
		fle = fle->f_next;
		goto keycheck;
	}
	FL_ENTRY_UNLOCK(ft, hash);
uncached:
	if (flags & FL_NOAUTO || flow_full(ft))
		return (NULL);

	fs->ft_misses++;
	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */

#ifdef INVARIANTS
	if ((ro->ro_dst.sa_family != AF_INET) &&
	    (ro->ro_dst.sa_family != AF_INET6))
		panic("sa_family == %d\n", ro->ro_dst.sa_family);
#endif

	ft->ft_rtalloc(ro, hash, fibnum);
	if (ro->ro_rt == NULL)
		error = ENETUNREACH;
	else {
		/* NOTE: these shadow the outer rt/lle used by keycheck. */
		struct llentry *lle = NULL;
		struct sockaddr_storage *l3addr;
		struct rtentry *rt = ro->ro_rt;
		struct ifnet *ifp = rt->rt_ifp;

		/* Flows over loopback/point-to-point interfaces are not cached. */
		if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
#ifdef INET6
		if (ssa->ss_family == AF_INET6) {
			struct sockaddr_in6 *dsin6;

			dsin6 = (struct sockaddr_in6 *)dsa;
			if (in6_localaddr(&dsin6->sin6_addr)) {
				RTFREE(rt);
				ro->ro_rt = NULL;
				return (NULL);
			}

			/* Resolve L2 against the gateway for indirect routes. */
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;

			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp);
		}
#endif
#ifdef INET
		if (ssa->ss_family == AF_INET) {
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;
			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
		}

#endif
		ro->ro_lle = lle;

		if (lle == NULL) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
		error = flowtable_insert(ft, hash, key, fibnum, ro, flags);

		if (error) {
			/* Insert failed: drop the references taken above. */
			RTFREE(rt);
			LLE_FREE(lle);
			ro->ro_rt = NULL;
			ro->ro_lle = NULL;
		}
	}

	/*
	 * NOTE(review): on the miss path 'fle' still points at the last
	 * chain entry inspected (or is NULL), so a successful insert may
	 * return NULL or an unrelated entry -- confirm callers tolerate
	 * this before relying on the return value after a miss.
	 */
	return ((error) ? NULL : fle);
}
1291
1292 /*
1293 * used by the bit_alloc macro
1294 */
1295 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
1296
/*
 * Allocate and initialize a flow table with 'nentry' buckets and link
 * it onto the per-vnet cleaner list.  FL_PCPU tables get one bucket
 * array and occupancy bitmask per CPU and lock-free per-CPU locking
 * stubs; otherwise a single global array protected by an array of
 * mutexes is used.
 */
struct flowtable *
flowtable_alloc(char *name, int nentry, int flags)
{
	struct flowtable *ft, *fttail;
	int i;

	/* Lazily seed the hash jitter once per vnet. */
	if (V_flow_hashjitter == 0)
		V_flow_hashjitter = arc4random();

	KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));

	ft = malloc(sizeof(struct flowtable),
	    M_RTABLE, M_WAITOK | M_ZERO);

	ft->ft_name = name;
	ft->ft_flags = flags;
	ft->ft_size = nentry;
#ifdef RADIX_MPATH
	ft->ft_rtalloc = rtalloc_mpath_fib;
#else
	ft->ft_rtalloc = rtalloc_ign_wrapper;
#endif
	if (flags & FL_PCPU) {
		ft->ft_lock = flowtable_pcpu_lock;
		ft->ft_unlock = flowtable_pcpu_unlock;

		/* One bucket array and mask per possible CPU. */
		for (i = 0; i <= mp_maxid; i++) {
			ft->ft_table.pcpu[i] =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
			ft->ft_masks[i] = bit_alloc(nentry);
		}
	} else {
		/* Scale the bucket-lock count with the CPU count. */
		ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
		    (fls(mp_maxid + 1) << 1));

		ft->ft_lock = flowtable_global_lock;
		ft->ft_unlock = flowtable_global_unlock;
		ft->ft_table.global =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
		ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
				M_RTABLE, M_WAITOK | M_ZERO);
		for (i = 0; i < ft->ft_lock_count; i++)
			mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);

		ft->ft_masks[0] = bit_alloc(nentry);
	}
	/* Scratch mask reused by the cleaner and DDB walkers. */
	ft->ft_tmpmask = bit_alloc(nentry);

	/*
	 * In the local transmit case the table truly is
	 * just a cache - so everything is eligible for
	 * replacement after 5s of non-use
	 */
	if (flags & FL_HASH_ALL) {
		ft->ft_udp_idle = V_flowtable_udp_expire;
		ft->ft_syn_idle = V_flowtable_syn_expire;
		ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
		/*
		 * NOTE(review): ft_tcp_idle is seeded from the fin_wait
		 * expire value rather than V_flowtable_tcp_expire; this
		 * looks unintentional -- confirm against the sysctl
		 * definitions before changing.
		 */
		ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
	} else {
		ft->ft_udp_idle = ft->ft_fin_wait_idle =
		    ft->ft_syn_idle = ft->ft_tcp_idle = 30;

	}

	/*
	 * hook in to the cleaner list
	 */
	if (V_flow_list_head == NULL)
		V_flow_list_head = ft;
	else {
		fttail = V_flow_list_head;
		while (fttail->ft_next != NULL)
			fttail = fttail->ft_next;
		fttail->ft_next = ft;
	}

	return (ft);
}
1377
1378 /*
1379 * The rest of the code is devoted to garbage collection of expired entries.
1380 * It is a new additon made necessary by the switch to dynamically allocating
1381 * flow tables.
1382 *
1383 */
1384 static void
1385 fle_free(struct flentry *fle, struct flowtable *ft)
1386 {
1387 struct rtentry *rt;
1388 struct llentry *lle;
1389
1390 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1391 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1392 if (rt != NULL)
1393 RTFREE(rt);
1394 if (lle != NULL)
1395 LLE_FREE(lle);
1396 flow_free(fle, ft);
1397 }
1398
/*
 * Reap flow entries from 'ft'.  With rt != NULL only entries caching
 * that specific route are unlinked (route teardown); with rt == NULL
 * any entry that flow_stale() deems expired is unlinked (periodic
 * cleaning).  Victims are collected on a private singly-linked list
 * and freed after all bucket locks have been dropped.  For FL_PCPU
 * tables this operates on the current CPU's view, so callers bind to
 * each CPU in turn.
 */
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
{
	int curbit = 0, count;
	struct flentry *fle,  **flehead, *fleprev;
	struct flentry *flefreehead, *flefreetail, *fletmp;
	bitstr_t *mask, *tmpmask;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];

	flefreehead = flefreetail = NULL;
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	/* Scan a scratch copy so bits can be cleared as buckets finish. */
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FL_ENTRY_LOCK(ft, curbit);
		flehead = flowtable_entry(ft, curbit);
		fle = fleprev = *flehead;

		fs->ft_free_checks++;
#ifdef DIAGNOSTIC
		if (fle == NULL && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif
		while (fle != NULL) {
			/* Skip entries the removal criterion doesn't match. */
			if (rt != NULL) {
				if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
					fleprev = fle;
					fle = fle->f_next;
					continue;
				}
			} else if (!flow_stale(ft, fle)) {
				fleprev = fle;
				fle = fle->f_next;
				continue;
			}
			/*
			 * delete head of the list
			 */
			if (fleprev == *flehead) {
				fletmp = fleprev;
				if (fle == fleprev) {
					fleprev = *flehead = fle->f_next;
				} else
					fleprev = *flehead = fle;
				fle = fle->f_next;
			} else {
				/*
				 * don't advance fleprev
				 */
				fletmp = fle;
				fleprev->f_next = fle->f_next;
				fle = fleprev->f_next;
			}

			/* Append the victim to the deferred free list. */
			if (flefreehead == NULL)
				flefreehead = flefreetail = fletmp;
			else {
				flefreetail->f_next = fletmp;
				flefreetail = fletmp;
			}
			fletmp->f_next = NULL;
		}
		/* Bucket emptied: clear its bit in the live mask too. */
		if (*flehead == NULL)
			bit_clear(mask, curbit);
		FL_ENTRY_UNLOCK(ft, curbit);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
	/* Free the victims now that no bucket lock is held. */
	count = 0;
	while ((fle = flefreehead) != NULL) {
		flefreehead = fle->f_next;
		count++;
		fs->ft_frees++;
		fle_free(fle, ft);
	}
	if (V_flowtable_debug && count)
		log(LOG_DEBUG, "freed %d flow entries\n", count);
}
1492
1493 void
1494 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1495 {
1496 int i;
1497
1498 if (ft->ft_flags & FL_PCPU) {
1499 CPU_FOREACH(i) {
1500 if (smp_started == 1) {
1501 thread_lock(curthread);
1502 sched_bind(curthread, i);
1503 thread_unlock(curthread);
1504 }
1505
1506 flowtable_free_stale(ft, rt);
1507
1508 if (smp_started == 1) {
1509 thread_lock(curthread);
1510 sched_unbind(curthread);
1511 thread_unlock(curthread);
1512 }
1513 }
1514 } else {
1515 flowtable_free_stale(ft, rt);
1516 }
1517 }
1518
1519 static void
1520 flowtable_clean_vnet(void)
1521 {
1522 struct flowtable *ft;
1523 int i;
1524
1525 ft = V_flow_list_head;
1526 while (ft != NULL) {
1527 if (ft->ft_flags & FL_PCPU) {
1528 CPU_FOREACH(i) {
1529 if (smp_started == 1) {
1530 thread_lock(curthread);
1531 sched_bind(curthread, i);
1532 thread_unlock(curthread);
1533 }
1534
1535 flowtable_free_stale(ft, NULL);
1536
1537 if (smp_started == 1) {
1538 thread_lock(curthread);
1539 sched_unbind(curthread);
1540 thread_unlock(curthread);
1541 }
1542 }
1543 } else {
1544 flowtable_free_stale(ft, NULL);
1545 }
1546 ft = ft->ft_next;
1547 }
1548 }
1549
/*
 * Main loop of the "flowcleaner" kthread: periodically sweep stale
 * entries in every vnet, then signal the completed cycle to any
 * flowtable_flush() waiters and sleep until the next period (or until
 * a flush request wakes us early via flowclean_c_cv).
 */
static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);
	struct thread *td;

	if (bootverbose)
		log(LOG_INFO, "flowtable cleaner started\n");
	td = curthread;
	while (1) {
		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
			flowtable_clean_vnet();
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		/*
		 * The 10 second interval between cleaning checks
		 * is arbitrary
		 */
		mtx_lock(&flowclean_lock);
		/* Run at low priority so cleaning never starves real work. */
		thread_lock(td);
		sched_prio(td, PPAUSE);
		thread_unlock(td);
		/* Publish the finished cycle and wake flush waiters. */
		flowclean_cycles++;
		cv_broadcast(&flowclean_f_cv);
		cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
		mtx_unlock(&flowclean_lock);
	}
}
1582
1583 static void
1584 flowtable_flush(void *unused __unused)
1585 {
1586 uint64_t start;
1587
1588 mtx_lock(&flowclean_lock);
1589 start = flowclean_cycles;
1590 while (start == flowclean_cycles) {
1591 cv_broadcast(&flowclean_c_cv);
1592 cv_wait(&flowclean_f_cv, &flowclean_lock);
1593 }
1594 mtx_unlock(&flowclean_lock);
1595 }
1596
/*
 * Kernel-process descriptor for the cleaner thread; kproc_start()
 * launches flowtable_cleaner() at SI_SUB_KTHREAD_IDLE and records the
 * proc pointer in flowcleanerproc.
 */
static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1603
/*
 * Per-vnet initialization: derive the flow-entry limit from maxusers
 * and the CPU count, create the UMA zones backing IPv4 and IPv6 flow
 * entries, cap both zones at that limit, and mark the subsystem ready.
 */
static void
flowtable_init_vnet(const void *unused __unused)
{

	V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
	V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
	uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
	V_flowtable_ready = 1;
}
VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
    flowtable_init_vnet, NULL);
1619
/*
 * One-time global initialization: the condvar/mutex pair used for the
 * cleaner handshake, and an ifnet-departure hook that forces a full
 * flush so cached flows cannot outlive their interface.
 */
static void
flowtable_init(const void *unused __unused)
{

	cv_init(&flowclean_c_cv, "c_flowcleanwait");
	cv_init(&flowclean_f_cv, "f_flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
	flowclean_freq = 20*hz;	/* cleaner period: 20 seconds of ticks */
}
SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
    flowtable_init, NULL);
1633
1634
1635 #ifdef VIMAGE
/*
 * Per-vnet teardown (VIMAGE only): mark the subsystem unavailable and
 * destroy the per-vnet flow-entry zones.
 */
static void
flowtable_uninit(const void *unused __unused)
{

	V_flowtable_ready = 0;
	uma_zdestroy(V_flow_ipv4_zone);
	uma_zdestroy(V_flow_ipv6_zone);
}

VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_uninit, NULL);
1647 #endif
1648
1649 #ifdef DDB
1650 static uint32_t *
1651 flowtable_get_hashkey(struct flentry *fle)
1652 {
1653 uint32_t *hashkey;
1654
1655 if (fle->f_flags & FL_IPV6)
1656 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1657 else
1658 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1659
1660 return (hashkey);
1661 }
1662
1663 static bitstr_t *
1664 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1665 {
1666 bitstr_t *mask;
1667
1668 if (ft->ft_flags & FL_PCPU)
1669 mask = ft->ft_masks[cpuid];
1670 else
1671 mask = ft->ft_masks[0];
1672
1673 return (mask);
1674 }
1675
1676 static struct flentry **
1677 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1678 {
1679 struct flentry **fle;
1680 int index = (hash % ft->ft_size);
1681
1682 if (ft->ft_flags & FL_PCPU) {
1683 fle = &ft->ft_table.pcpu[cpuid][index];
1684 } else {
1685 fle = &ft->ft_table.global[index];
1686 }
1687
1688 return (fle);
1689 }
1690
/*
 * DDB helper: pretty-print one flow entry -- addresses/ports (decoded
 * for IPv4 only), flag bits of the entry/route/interface, the raw key
 * words, and the hash/idle-time/fib metadata.
 */
static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	int rt_valid, ifp_valid;
	uint16_t sport, dport;
	uint32_t *hashkey;
	/* Each buffer holds a dotted-quad: 4 x "123" + separators/NUL. */
	char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
	volatile struct rtentry *rt;
	struct ifnet *ifp = NULL;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt = fle->f_rt;
	rt_valid = rt != NULL;
	if (rt_valid)
		ifp = rt->rt_ifp;
	ifp_valid = ifp != NULL;
	hashkey = flowtable_get_hashkey(fle);
	/* IPv6 addresses are not decoded; only raw key words are dumped. */
	if (fle->f_flags & FL_IPV6)
		goto skipaddr;

	inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
	if (ft->ft_flags & FL_HASH_ALL) {
		/* Under FL_HASH_ALL, key word 0 carries the port pair. */
		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
		sport = ntohs(((uint16_t *)hashkey)[0]);
		dport = ntohs(((uint16_t *)hashkey)[1]);
		db_printf("%s:%d->%s:%d",
		    saddr, sport, daddr,
		    dport);
	} else
		db_printf("%s ", daddr);

skipaddr:
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
	if (fle->f_flags & FL_TCP)
		db_printf(" FL_TCP ");
	if (fle->f_flags & FL_UDP)
		db_printf(" FL_UDP ");
	if (rt_valid) {
		if (rt->rt_flags & RTF_UP)
			db_printf(" RTF_UP ");
	}
	if (ifp_valid) {
		if (ifp->if_flags & IFF_LOOPBACK)
			db_printf(" IFF_LOOPBACK ");
		if (ifp->if_flags & IFF_UP)
			db_printf(" IFF_UP ");
		if (ifp->if_flags & IFF_POINTOPOINT)
			db_printf(" IFF_POINTOPOINT ");
	}
	if (fle->f_flags & FL_IPV6)
		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
		    hashkey[0], hashkey[1], hashkey[2],
		    hashkey[3], hashkey[4], hashkey[5],
		    hashkey[6], hashkey[7], hashkey[8]);
	else
		db_printf("\n\tkey=%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2]);
	db_printf("hash=%08x idle_time=%03d"
	    "\n\tfibnum=%02d rt=%p",
	    fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
	db_printf("\n");
}
1755
/*
 * DDB helper: walk one table's occupancy mask (one CPU's view when
 * cpuid != -1 on a per-CPU table) and dump every chained entry via
 * flow_show().  Mirrors the scan loop in flowtable_free_stale() but
 * takes no locks -- acceptable only in the debugger context.
 */
static void
flowtable_show(struct flowtable *ft, int cpuid)
{
	int curbit = 0;
	struct flentry *fle,  **flehead;
	bitstr_t *mask, *tmpmask;

	if (cpuid != -1)
		db_printf("cpu: %d\n", cpuid);
	mask = flowtable_mask_pcpu(ft, cpuid);
	tmpmask = ft->ft_tmpmask;
	/* Scan a scratch copy; bits are cleared as buckets are visited. */
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			db_printf("warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
		fle = *flehead;

		while (fle != NULL) {
			flow_show(ft, fle);
			fle = fle->f_next;
			continue;
		}
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
}
1792
1793 static void
1794 flowtable_show_vnet(void)
1795 {
1796 struct flowtable *ft;
1797 int i;
1798
1799 ft = V_flow_list_head;
1800 while (ft != NULL) {
1801 printf("name: %s\n", ft->ft_name);
1802 if (ft->ft_flags & FL_PCPU) {
1803 CPU_FOREACH(i) {
1804 flowtable_show(ft, i);
1805 }
1806 } else {
1807 flowtable_show(ft, -1);
1808 }
1809 ft = ft->ft_next;
1810 }
1811 }
1812
/*
 * DDB "show flowtables" command: dump the flow tables of every vnet,
 * labeling each vnet when VIMAGE is compiled in.
 */
DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
#ifdef VIMAGE
		db_printf("vnet %p\n", vnet_iter);
#endif
		flowtable_show_vnet();
		CURVNET_RESTORE();
	}
}
1826 #endif
Cache object: 3fcbf877752016ea23644fe7ba4c2b4a
|