FreeBSD/Linux Kernel Cross Reference
sys/net/flowtable.c
1 /**************************************************************************
2
3 Copyright (c) 2008-2010, BitGravity Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
11
12 2. Neither the name of the BitGravity Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include "opt_route.h"
31 #include "opt_mpath.h"
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD: releng/8.4/sys/net/flowtable.c 232552 2012-03-05 17:33:01Z bz $");
38
39 #include <sys/param.h>
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/kernel.h>
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/socket.h>
54 #include <sys/syslog.h>
55 #include <sys/sysctl.h>
56
57 #include <net/if.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_var.h>
60 #include <net/route.h>
61 #include <net/flowtable.h>
62 #include <net/vnet.h>
63
64 #include <netinet/in.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
69 #ifdef INET6
70 #include <netinet/ip6.h>
71 #endif
72 #include <netinet/tcp.h>
73 #include <netinet/udp.h>
74 #include <netinet/sctp.h>
75
76 #include <libkern/jenkins.h>
77 #include <ddb/ddb.h>
78
/*
 * Hash-tuple layouts.  Each per-family tuple is overlaid (via a union)
 * with an array of 32-bit words so it can be fed directly to the
 * Jenkins hash: the two 16-bit ports occupy key word 0, followed by
 * the source and destination addresses.
 */
struct ipv4_tuple {
	uint16_t 	ip_sport;	/* source port */
	uint16_t 	ip_dport;	/* destination port */
	in_addr_t 	ip_saddr;	/* source address */
	in_addr_t 	ip_daddr;	/* destination address */
};

union ipv4_flow {
	struct ipv4_tuple ipf_ipt;
	uint32_t 	ipf_key[3];	/* ports + 2 x 32-bit address */
};

struct ipv6_tuple {
	uint16_t 	ip_sport;	/* source port */
	uint16_t 	ip_dport;	/* destination port */
	struct in6_addr	ip_saddr;	/* source address */
	struct in6_addr	ip_daddr;	/* destination address */
};

union ipv6_flow {
	struct ipv6_tuple ipf_ipt;
	uint32_t 	ipf_key[9];	/* ports + 2 x 128-bit address */
};
102
/*
 * A cached flow.  Entries hash into a bucket and chain through f_next
 * on collision.  f_rt/f_lle are volatile-qualified; the rtentry and
 * llentry references are handed out to callers via __DEVOLATILE in
 * flow_to_route()/flow_to_route_in6().
 */
struct flentry {
	volatile uint32_t	f_fhash;	/* hash flowing forward */
	uint16_t		f_flags;	/* flow flags */
	uint8_t			f_pad;		/* alignment padding */
	uint8_t			f_proto;	/* protocol */
	uint32_t		f_fibnum;	/* fib index */
	uint32_t		f_uptime;	/* uptime at last access */
	struct flentry		*f_next;	/* pointer to collision entry */
	volatile struct rtentry *f_rt;		/* rtentry for flow */
	volatile struct llentry *f_lle;		/* llentry for flow */
};

/* Family-specific entries: the common entry followed by its hash key. */
struct flentry_v4 {
	struct flentry	fl_entry;
	union ipv4_flow	fl_flow;
};

struct flentry_v6 {
	struct flentry	fl_entry;
	union ipv6_flow	fl_flow;
};
124
/*
 * Accessor shorthands for the embedded struct flentry inside the
 * per-family entry types.  These must name the real f_* members of
 * struct flentry: the previous fl_entry.fl_* spellings referenced
 * nonexistent members and would fail to compile if ever used.
 */
#define	fl_fhash	fl_entry.f_fhash
#define	fl_flags	fl_entry.f_flags
#define	fl_proto	fl_entry.f_proto
#define	fl_uptime	fl_entry.f_uptime
#define	fl_rt		fl_entry.f_rt
#define	fl_lle		fl_entry.f_lle

#define	SECS_PER_HOUR		3600
#define	SECS_PER_DAY		(24*SECS_PER_HOUR)

/* Default idle times (seconds) after which a flow of each class is stale. */
#define	SYN_IDLE		300
#define	UDP_IDLE		300
#define	FIN_WAIT_IDLE		600
#define	TCP_IDLE		SECS_PER_DAY
139
140
/* Per-table lock/unlock and route-allocation method signatures. */
typedef void fl_lock_t(struct flowtable *, uint32_t);
typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);

/*
 * Bucket storage: a single shared bucket array for global tables, or
 * one private array per CPU for per-cpu (FL_PCPU) tables.
 */
union flentryp {
	struct flentry		**global;
	struct flentry		**pcpu[MAXCPU];
};

/* Statistics; cache-line aligned so per-cpu instances don't false-share. */
struct flowtable_stats {
	uint64_t	ft_collisions;
	uint64_t	ft_allocated;
	uint64_t	ft_misses;
	uint64_t	ft_max_depth;
	uint64_t	ft_free_checks;
	uint64_t	ft_frees;
	uint64_t	ft_hits;
	uint64_t	ft_lookups;
} __aligned(CACHE_LINE_SIZE);
159
struct flowtable {
	struct flowtable_stats ft_stats[MAXCPU];	/* per-cpu statistics */
	int 		ft_size;	/* number of hash buckets */
	int 		ft_lock_count;	/* # bucket mutexes; power of two
					   (lock index is hash & (count-1)) */
	uint32_t	ft_flags;	/* FL_* behavior flags */
	char		*ft_name;	/* name reported by the stats sysctl */
	fl_lock_t	*ft_lock;	/* bucket lock method */
	fl_lock_t	*ft_unlock;	/* bucket unlock method */
	fl_rtalloc_t	*ft_rtalloc;	/* route allocation method */
	/*
	 * XXX need to pad out
	 */
	struct mtx	*ft_locks;	/* bucket mutexes (global tables) */
	union flentryp	ft_table;	/* bucket array(s), see union flentryp */
	bitstr_t 	*ft_masks[MAXCPU];	/* occupied-bucket bitmaps */
	bitstr_t	*ft_tmpmask;	/* scratch bitmap (cleaner, presumably) */
	struct flowtable *ft_next;	/* next table on V_flow_list_head */

	/* Hot, frequently-written fields on their own cache lines. */
	uint32_t	ft_count __aligned(CACHE_LINE_SIZE);	/* live flows */
	uint32_t	ft_udp_idle __aligned(CACHE_LINE_SIZE);	/* idle limits (s) */
	uint32_t	ft_fin_wait_idle;
	uint32_t	ft_syn_idle;
	uint32_t	ft_tcp_idle;
	boolean_t	ft_full;	/* occupancy hysteresis state, see flow_full() */
} __aligned(CACHE_LINE_SIZE);
185
/* Kernel process running the periodic flow cleaner. */
static struct proc *flowcleanerproc;
/* Per-vnet list of all flowtables, hash seed, and flow allocation zones. */
static VNET_DEFINE(struct flowtable *, flow_list_head);
static VNET_DEFINE(uint32_t, flow_hashjitter);	/* folded into non-HASH_ALL hashes */
static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);

#define	V_flow_list_head	VNET(flow_list_head)
#define	V_flow_hashjitter	VNET(flow_hashjitter)
#define	V_flow_ipv4_zone	VNET(flow_ipv4_zone)
#define	V_flow_ipv6_zone	VNET(flow_ipv6_zone)


/* Synchronization between the cleaner thread and its waiters. */
static struct cv 	flowclean_f_cv;
static struct cv 	flowclean_c_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;	/* presumably counts completed passes */
static uint32_t		flowclean_freq;		/* cleaner period, in ticks */
203
#ifdef FLOWTABLE_DEBUG
/*
 * printf() iff any of 'flags' is set in the table's debug flags.
 * The do/while(0) wrapper must NOT end in a semicolon: the original
 * expansion ended with ';' plus a stray line continuation, which broke
 * "if (x) FLDPRINTF(...); else ..." and swallowed the following line.
 */
#define FLDPRINTF(ft, flags, fmt, ...) 		\
do {						\
	if ((ft)->ft_flags & (flags))		\
		printf((fmt), __VA_ARGS__);	\
} while (0)

#else
#define FLDPRINTF(ft, flags, fmt, ...)

#endif
215
216
217 /*
218 * TODO:
219 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
220 * to avoid extra cache evictions caused by incrementing a shared
221 * counter
222 * - add sysctls to resize && flush flow tables
223 * - Add per flowtable sysctls for statistics and configuring timeouts
224 * - add saturation counter to rtentry to support per-packet load-balancing
225 * add flag to indicate round-robin flow, add list lookup from head
226 for flows
227 * - add sysctl / device node / syscall to support exporting and importing
228 * of flows with flag to indicate that a flow was imported so should
229 * not be considered for auto-cleaning
230 * - support explicit connection state (currently only ad-hoc for DSR)
231 * - idetach() cleanup for options VIMAGE builds.
232 */
/*
 * Per-vnet configuration.  The *_expire values default the per-class
 * idle timeouts (see the sysctl caveat below: they are presumably only
 * consumed when a table is created).  flowtable_ready gates hashing
 * until initialization has completed.
 */
VNET_DEFINE(int, flowtable_enable) = 1;
static VNET_DEFINE(int, flowtable_debug);
static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
static VNET_DEFINE(int, flowtable_nmbflows);	/* zone limit, see sysctl_nmbflows() */
static VNET_DEFINE(int, flowtable_ready) = 0;

#define	V_flowtable_enable		VNET(flowtable_enable)
#define	V_flowtable_debug		VNET(flowtable_debug)
#define	V_flowtable_syn_expire		VNET(flowtable_syn_expire)
#define	V_flowtable_udp_expire		VNET(flowtable_udp_expire)
#define	V_flowtable_fin_wait_expire	VNET(flowtable_fin_wait_expire)
#define	V_flowtable_tcp_expire		VNET(flowtable_tcp_expire)
#define	V_flowtable_nmbflows		VNET(flowtable_nmbflows)
#define	V_flowtable_ready		VNET(flowtable_ready)
250
/* net.inet.flowtable sysctl tree. */
SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
    &VNET_NAME(flowtable_debug), 0, "print debug info.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");

/*
 * XXX This does not end up updating timeouts at runtime
 * and only reflects the value for the last table added :-/
 */
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_syn_expire), 0,
    "seconds after which to remove syn allocated flow.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_udp_expire), 0,
    "seconds after which to remove flow allocated to UDP.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_fin_wait_expire), 0,
    "seconds after which to remove a flow in FIN_WAIT.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_tcp_expire), 0,
    "seconds after which to remove flow allocated to a TCP connection.");
273
274
275 /*
276 * Maximum number of flows that can be allocated of a given type.
277 *
278 * The table is allocated at boot time (for the pure caching case
279 * there is no reason why this could not be changed at runtime)
280 * and thus (currently) needs to be set with a tunable.
281 */
282 static int
283 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
284 {
285 int error, newnmbflows;
286
287 newnmbflows = V_flowtable_nmbflows;
288 error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
289 if (error == 0 && req->newptr) {
290 if (newnmbflows > V_flowtable_nmbflows) {
291 V_flowtable_nmbflows = newnmbflows;
292 uma_zone_set_max(V_flow_ipv4_zone,
293 V_flowtable_nmbflows);
294 uma_zone_set_max(V_flow_ipv6_zone,
295 V_flowtable_nmbflows);
296 } else
297 error = EINVAL;
298 }
299 return (error);
300 }
301 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
302 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
303 "Maximum number of flows allowed");
304
305
306
307 #define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
308
309 static void
310 fs_print(struct sbuf *sb, struct flowtable_stats *fs)
311 {
312
313 FS_PRINT(sb, collisions);
314 FS_PRINT(sb, allocated);
315 FS_PRINT(sb, misses);
316 FS_PRINT(sb, max_depth);
317 FS_PRINT(sb, free_checks);
318 FS_PRINT(sb, frees);
319 FS_PRINT(sb, hits);
320 FS_PRINT(sb, lookups);
321 }
322
323 static void
324 flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
325 {
326 int i;
327 struct flowtable_stats fs, *pfs;
328
329 if (ft->ft_flags & FL_PCPU) {
330 bzero(&fs, sizeof(fs));
331 pfs = &fs;
332 CPU_FOREACH(i) {
333 pfs->ft_collisions += ft->ft_stats[i].ft_collisions;
334 pfs->ft_allocated += ft->ft_stats[i].ft_allocated;
335 pfs->ft_misses += ft->ft_stats[i].ft_misses;
336 pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
337 pfs->ft_frees += ft->ft_stats[i].ft_frees;
338 pfs->ft_hits += ft->ft_stats[i].ft_hits;
339 pfs->ft_lookups += ft->ft_stats[i].ft_lookups;
340 if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
341 pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
342 }
343 } else {
344 pfs = &ft->ft_stats[0];
345 }
346 fs_print(sb, pfs);
347 }
348
349 static int
350 sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
351 {
352 struct flowtable *ft;
353 struct sbuf *sb;
354 int error;
355
356 sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
357
358 ft = V_flow_list_head;
359 while (ft != NULL) {
360 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
361 flowtable_show_stats(sb, ft);
362 ft = ft->ft_next;
363 }
364 sbuf_finish(sb);
365 error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
366 sbuf_delete(sb);
367
368 return (error);
369 }
370 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
371 NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
372
373
#ifndef RADIX_MPATH
/*
 * Adapt rtalloc_ign_fib() to the fl_rtalloc_t signature; the flow
 * hash argument is unused here (it is presumably only meaningful to
 * the RADIX_MPATH variant, which selects among equal-cost routes).
 */
static void
rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
{

	rtalloc_ign_fib(ro, 0, fibnum);
}
#endif
382
383 static void
384 flowtable_global_lock(struct flowtable *table, uint32_t hash)
385 {
386 int lock_index = (hash)&(table->ft_lock_count - 1);
387
388 mtx_lock(&table->ft_locks[lock_index]);
389 }
390
391 static void
392 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
393 {
394 int lock_index = (hash)&(table->ft_lock_count - 1);
395
396 mtx_unlock(&table->ft_locks[lock_index]);
397 }
398
/*
 * "Locking" for per-cpu tables: the bucket arrays are per-CPU, so it
 * suffices to enter a critical section to pin the thread to the
 * current CPU.  The table and hash arguments are unused; they exist
 * to satisfy the fl_lock_t signature.
 */
static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

	critical_enter();
}

static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

	critical_exit();
}
412
/* Map a flow hash to its bucket index, bucket slot, and bucket lock. */
#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
#define FL_ENTRY_LOCK(table, hash)  (table)->ft_lock((table), (hash))
#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))

/* Private f_flags bits (presumably chosen above the public FL_* bits). */
#define FL_STALE 	(1<<8)		/* entry should be discarded */
#define FL_IPV6  	(1<<9)		/* entry is a struct flentry_v6 */
#define FL_OVERWRITE	(1<<10)		/* insert may update a live entry */

/*
 * Mark a flow stale so that flow_stale() reports it for reclamation on
 * the next lookup or cleaning pass.
 */
void
flow_invalidate(struct flentry *fle)
{

	fle->f_flags |= FL_STALE;
}
428
429 static __inline int
430 proto_to_flags(uint8_t proto)
431 {
432 int flag;
433
434 switch (proto) {
435 case IPPROTO_TCP:
436 flag = FL_TCP;
437 break;
438 case IPPROTO_SCTP:
439 flag = FL_SCTP;
440 break;
441 case IPPROTO_UDP:
442 flag = FL_UDP;
443 break;
444 default:
445 flag = 0;
446 break;
447 }
448
449 return (flag);
450 }
451
452 static __inline int
453 flags_to_proto(int flags)
454 {
455 int proto, protoflags;
456
457 protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
458 switch (protoflags) {
459 case FL_TCP:
460 proto = IPPROTO_TCP;
461 break;
462 case FL_SCTP:
463 proto = IPPROTO_SCTP;
464 break;
465 case FL_UDP:
466 proto = IPPROTO_UDP;
467 break;
468 default:
469 proto = 0;
470 break;
471 }
472 return (proto);
473 }
474
475 #ifdef INET
476 #ifdef FLOWTABLE_DEBUG
/*
 * Debug helper: print an IPv4 flow tuple.  With FL_HASH_ALL the full
 * 5-tuple is printed; otherwise only the protocol and destination.
 */
static void
ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
    struct sockaddr_in *dsin)
{
	/* Room for dotted-quad plus NUL: 4 x "123\0". */
	char saddr[4*sizeof "123"], daddr[4*sizeof "123"];

	if (flags & FL_HASH_ALL) {
		inet_ntoa_r(ssin->sin_addr, saddr);
		inet_ntoa_r(dsin->sin_addr, daddr);
		printf("proto=%d %s:%d->%s:%d\n",
		    proto, saddr, ntohs(ssin->sin_port), daddr,
		    ntohs(dsin->sin_port));
	} else {
		inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
		printf("proto=%d %s\n", proto, daddr);
	}

}
495 #endif
496
/*
 * Harvest the IPv4 flow tuple from 'm' into the caller-supplied
 * source/destination sockaddrs.  When FL_HASH_ALL is set the transport
 * ports are also extracted (TCP/UDP/SCTP only; any other protocol is
 * rejected with ENOTSUP), and TCP RST/FIN segments set FL_STALE in
 * *flags so the flow is torn down.  Ports are left in network byte
 * order.  Returns 0 on success.
 *
 * NOTE(review): assumes the IP header (and transport header, when
 * ports are needed) is contiguous in the first mbuf -- no pullup is
 * performed here, unlike the IPv6 path.
 */
static int
ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
    struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
{
	struct ip *ip;
	uint8_t proto;
	int iphlen;
	struct tcphdr *th;
	struct udphdr *uh;
	struct sctphdr *sh;
	uint16_t sport, dport;

	proto = sport = dport = 0;
	ip = mtod(m, struct ip *);
	dsin->sin_family = AF_INET;
	dsin->sin_len = sizeof(*dsin);
	dsin->sin_addr = ip->ip_dst;
	ssin->sin_family = AF_INET;
	ssin->sin_len = sizeof(*ssin);
	ssin->sin_addr = ip->ip_src;

	proto = ip->ip_p;
	if ((*flags & FL_HASH_ALL) == 0) {
		/* Destination-only hashing: ports are not needed. */
		FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
		    *flags);
		goto skipports;
	}

	iphlen = ip->ip_hl << 2; /* XXX options? */

	switch (proto) {
	case IPPROTO_TCP:
		th = (struct tcphdr *)((caddr_t)ip + iphlen);
		sport = th->th_sport;
		dport = th->th_dport;
		/* Connection teardown: mark the flow for removal. */
		if ((*flags & FL_HASH_ALL) &&
		    (th->th_flags & (TH_RST|TH_FIN)))
			*flags |= FL_STALE;
		break;
	case IPPROTO_UDP:
		uh = (struct udphdr *)((caddr_t)ip + iphlen);
		sport = uh->uh_sport;
		dport = uh->uh_dport;
		break;
	case IPPROTO_SCTP:
		sh = (struct sctphdr *)((caddr_t)ip + iphlen);
		sport = sh->src_port;
		dport = sh->dest_port;
		break;
	default:
		FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
		return (ENOTSUP);
		/* no port - hence not a protocol we care about */
		break;

	}

skipports:
	*flags |= proto_to_flags(proto);
	ssin->sin_port = sport;
	dsin->sin_port = dport;
	return (0);
}
560
/*
 * Compute the Jenkins hash of an IPv4 flow.
 *
 * Key layout: key[1] = source address, key[2] = destination address;
 * with FL_HASH_ALL the two 16-bit ports are stored into key[0] through
 * a uint16_t alias.  Without FL_HASH_ALL only the destination is
 * hashed and the per-boot jitter plus protocol number is folded into
 * the hash initializer instead, keeping protocols distinct.
 *
 * Returns 0 -- treated by callers as "no hash" -- when the flowtable
 * is disabled or not yet initialized.
 */
static uint32_t
ipv4_flow_lookup_hash_internal(
	struct sockaddr_in *ssin, struct sockaddr_in *dsin,
	    uint32_t *key, uint16_t flags)
{
	uint16_t sport, dport;
	uint8_t proto;
	int offset = 0;

	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
		return (0);
	proto = flags_to_proto(flags);
	sport = dport = key[2] = key[1] = key[0] = 0;
	/* Source address/port only participate in full 5-tuple hashing. */
	if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
		key[1] = ssin->sin_addr.s_addr;
		sport = ssin->sin_port;
	}
	if (dsin != NULL) {
		key[2] = dsin->sin_addr.s_addr;
		dport = dsin->sin_port;
	}
	if (flags & FL_HASH_ALL) {
		/* Pack both ports (network byte order) into key[0]. */
		((uint16_t *)key)[0] = sport;
		((uint16_t *)key)[1] = dport;
	} else
		offset = V_flow_hashjitter + proto;

	return (jenkins_hashword(key, 3, offset));
}
590
591 static struct flentry *
592 flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
593 {
594 struct sockaddr_storage ssa, dsa;
595 uint16_t flags;
596 struct sockaddr_in *dsin, *ssin;
597
598 dsin = (struct sockaddr_in *)&dsa;
599 ssin = (struct sockaddr_in *)&ssa;
600 bzero(dsin, sizeof(*dsin));
601 bzero(ssin, sizeof(*ssin));
602 flags = ft->ft_flags;
603 if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
604 return (NULL);
605
606 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
607 }
608
/*
 * Populate 'ro' from a cached IPv4 flow: the destination address is
 * recovered from key word 2 of the stored hash key (the layout written
 * by ipv4_flow_lookup_hash_internal), and the cached rtentry/llentry
 * references are copied out with their volatile qualifiers removed.
 */
void
flow_to_route(struct flentry *fle, struct route *ro)
{
	uint32_t *hashkey = NULL;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)&ro->ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
	sin->sin_addr.s_addr = hashkey[2];	/* key[2] == destination */
	ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
}
623 #endif /* INET */
624
625 #ifdef INET6
626 /*
627 * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
628 * then it sets p to point at the offset "len" in the mbuf. WARNING: the
629 * pointer might become stale after other pullups (but we never use it
630 * this way).
631 */
/* On a short mbuf, bail out through the receive_failed label. */
#define PULLUP_TO(_len, p, T) \
do { \
	int x = (_len) + sizeof(T); \
	if ((m)->m_len < x) { \
		goto receive_failed; \
	} \
	p = (mtod(m, char *) + (_len)); \
} while (0)

/* Typed views of the upper-layer protocol pointer. */
#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))
644
/*
 * Harvest the IPv6 flow tuple from 'm'.  With FL_HASH_ALL the
 * extension-header chain is walked (hop-by-hop, routing, fragment,
 * destination options, AH) until an upper-layer protocol is found;
 * TCP/UDP/SCTP yield ports, while the protocols in the first case
 * group terminate the walk with ports still zero and are therefore
 * rejected by the src_port == 0 check below.  TCP RST/FIN marks the
 * flow FL_STALE.  Ports remain in network byte order.  Returns 0 on
 * success, ENOTSUP on unsupported protocol or truncated mbuf.
 */
static int
ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
    struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
{
	struct ip6_hdr *ip6;
	uint8_t proto;
	int hlen;
	uint16_t src_port, dst_port;
	u_short offset;		/* fragment offset; recorded but unused below */
	void *ulp;

	offset = hlen = src_port = dst_port = 0;
	ulp = NULL;
	ip6 = mtod(m, struct ip6_hdr *);
	hlen = sizeof(struct ip6_hdr);
	proto = ip6->ip6_nxt;

	if ((*flags & FL_HASH_ALL) == 0)
		goto skipports;

	/* Walk the header chain; loop exits when ulp is set. */
	while (ulp == NULL) {
		switch (proto) {
		case IPPROTO_ICMPV6:
		case IPPROTO_OSPFIGP:
		case IPPROTO_PIM:
		case IPPROTO_CARP:
		case IPPROTO_ESP:
		case IPPROTO_NONE:
			/* No ports to extract; terminate the walk. */
			ulp = ip6;
			break;
		case IPPROTO_TCP:
			PULLUP_TO(hlen, ulp, struct tcphdr);
			dst_port = TCP(ulp)->th_dport;
			src_port = TCP(ulp)->th_sport;
			/* Connection teardown: mark the flow for removal. */
			if ((*flags & FL_HASH_ALL) &&
			    (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
				*flags |= FL_STALE;
			break;
		case IPPROTO_SCTP:
			PULLUP_TO(hlen, ulp, struct sctphdr);
			src_port = SCTP(ulp)->src_port;
			dst_port = SCTP(ulp)->dest_port;
			break;
		case IPPROTO_UDP:
			PULLUP_TO(hlen, ulp, struct udphdr);
			dst_port = UDP(ulp)->uh_dport;
			src_port = UDP(ulp)->uh_sport;
			break;
		case IPPROTO_HOPOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;	/* keep walking */
			break;
		case IPPROTO_ROUTING:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_rthdr);
			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
			ulp = NULL;
			break;
		case IPPROTO_FRAGMENT:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_frag);
			hlen += sizeof (struct ip6_frag);
			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
			offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
			    IP6F_OFF_MASK;
			ulp = NULL;
			break;
		case IPPROTO_DSTOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_AH:	/* RFC 2402 */
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
			ulp = NULL;
			break;
		default:
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			break;
		}
	}

	if (src_port == 0) {
receive_failed:		/* PULLUP_TO jumps here on a truncated mbuf */
		return (ENOTSUP);
	}

skipports:
	dsin6->sin6_family = AF_INET6;
	dsin6->sin6_len = sizeof(*dsin6);
	dsin6->sin6_port = dst_port;
	memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));

	ssin6->sin6_family = AF_INET6;
	ssin6->sin6_len = sizeof(*ssin6);
	ssin6->sin6_port = src_port;
	memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
	*flags |= proto_to_flags(proto);

	return (0);
}
750
/* Clear all nine 32-bit words of an IPv6 flow hash key. */
#define zero_key(key) 		\
do {				\
	key[0] = 0;		\
	key[1] = 0;		\
	key[2] = 0;		\
	key[3] = 0;		\
	key[4] = 0;		\
	key[5] = 0;		\
	key[6] = 0;		\
	key[7] = 0;		\
	key[8] = 0;		\
} while (0)
763
/*
 * Compute the Jenkins hash of an IPv6 flow.
 *
 * Key layout: key[1..4] = destination address, key[5..8] = source
 * address (only filled in under FL_HASH_ALL), with the two ports
 * packed into key[0] through a uint16_t alias.  Note this is the
 * opposite src/dst ordering from the ipv4 variant and from the
 * ipv6_tuple field naming.  Without FL_HASH_ALL the per-boot jitter
 * plus protocol number seeds the hash instead of the ports.
 *
 * Returns 0 ("no hash") when the flowtable is disabled or not ready.
 */
static uint32_t
ipv6_flow_lookup_hash_internal(
	struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
	    uint32_t *key, uint16_t flags)
{
	uint16_t sport, dport;
	uint8_t proto;
	int offset = 0;

	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
		return (0);

	proto = flags_to_proto(flags);
	zero_key(key);
	sport = dport = 0;
	if (dsin6 != NULL) {
		memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
		dport = dsin6->sin6_port;
	}
	if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
		memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
		sport = ssin6->sin6_port;
	}
	if (flags & FL_HASH_ALL) {
		/* Pack both ports (network byte order) into key[0]. */
		((uint16_t *)key)[0] = sport;
		((uint16_t *)key)[1] = dport;
	} else
		offset = V_flow_hashjitter + proto;

	return (jenkins_hashword(key, 9, offset));
}
795
796 static struct flentry *
797 flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
798 {
799 struct sockaddr_storage ssa, dsa;
800 struct sockaddr_in6 *dsin6, *ssin6;
801 uint16_t flags;
802
803 dsin6 = (struct sockaddr_in6 *)&dsa;
804 ssin6 = (struct sockaddr_in6 *)&ssa;
805 bzero(dsin6, sizeof(*dsin6));
806 bzero(ssin6, sizeof(*ssin6));
807 flags = ft->ft_flags;
808
809 if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
810 return (NULL);
811
812 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
813 }
814
/*
 * Populate 'ro' from a cached IPv6 flow.
 *
 * NOTE(review): the address is recovered from key words 5-8, which is
 * where ipv6_flow_lookup_hash_internal stores the *source* address
 * (the destination goes into words 1-4).  Either the key layout or
 * this offset looks inconsistent with the IPv4 variant, which reads
 * the destination word -- verify before relying on ro_dst here.
 */
void
flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
{
	uint32_t *hashkey = NULL;
	struct sockaddr_in6 *sin6;

	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;

	sin6->sin6_family = AF_INET6;
	sin6->sin6_len = sizeof(*sin6);
	hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
	memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
	ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);

}
831 #endif /* INET6 */
832
833 static bitstr_t *
834 flowtable_mask(struct flowtable *ft)
835 {
836 bitstr_t *mask;
837
838 if (ft->ft_flags & FL_PCPU)
839 mask = ft->ft_masks[curcpu];
840 else
841 mask = ft->ft_masks[0];
842
843 return (mask);
844 }
845
846 static struct flentry **
847 flowtable_entry(struct flowtable *ft, uint32_t hash)
848 {
849 struct flentry **fle;
850 int index = (hash % ft->ft_size);
851
852 if (ft->ft_flags & FL_PCPU) {
853 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
854 fle = &ft->ft_table.pcpu[curcpu][index];
855 } else {
856 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
857 fle = &ft->ft_table.global[index];
858 }
859
860 return (fle);
861 }
862
/*
 * Return non-zero if 'fle' should no longer be used: the entry is
 * unhashed, its route is down or has no usable interface, it was
 * explicitly invalidated (FL_STALE), or it has exceeded the idle
 * limit for its class.
 *
 * NOTE(review): the class checks test TH_SYN/TH_ACK/TH_FIN bits in
 * f_flags, which assumes the TCP header flags are mirrored into the
 * flow flags somewhere outside this view -- confirm against the rest
 * of the file.  The final RTF_UP/rt_ifp clause repeats checks already
 * made in the first condition.
 */
static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
	time_t idle_time;

	if ((fle->f_fhash == 0)
	    || ((fle->f_rt->rt_flags & RTF_HOST) &&
		((fle->f_rt->rt_flags & (RTF_UP))
		    != (RTF_UP)))
	    || (fle->f_rt->rt_ifp == NULL)
	    || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
		return (1);

	idle_time = time_uptime - fle->f_uptime;

	/*
	 * No TCP state bits -> UDP-style timeout; FIN seen -> FIN_WAIT
	 * timeout; SYN without ACK -> embryonic timeout; SYN+ACK ->
	 * established-connection timeout.
	 */
	if ((fle->f_flags & FL_STALE) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
		&& (idle_time > ft->ft_udp_idle)) ||
	    ((fle->f_flags & TH_FIN)
		&& (idle_time > ft->ft_fin_wait_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
		&& (idle_time > ft->ft_syn_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
		&& (idle_time > ft->ft_tcp_idle)) ||
	    ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
		(fle->f_rt->rt_ifp == NULL)))
		return (1);

	return (0);
}
893
894 static void
895 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
896 {
897 uint32_t *hashkey;
898 int i, nwords;
899
900 if (fle->f_flags & FL_IPV6) {
901 nwords = 9;
902 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
903 } else {
904 nwords = 3;
905 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
906 }
907
908 for (i = 0; i < nwords; i++)
909 hashkey[i] = key[i];
910 }
911
912 static struct flentry *
913 flow_alloc(struct flowtable *ft)
914 {
915 struct flentry *newfle;
916 uma_zone_t zone;
917
918 newfle = NULL;
919 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
920
921 newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
922 if (newfle != NULL)
923 atomic_add_int(&ft->ft_count, 1);
924 return (newfle);
925 }
926
927 static void
928 flow_free(struct flentry *fle, struct flowtable *ft)
929 {
930 uma_zone_t zone;
931
932 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
933 atomic_add_int(&ft->ft_count, -1);
934 uma_zfree(zone, fle);
935 }
936
/*
 * Hysteresis check on table occupancy against the global flow limit:
 * ft_full is set once the count rises above ~31/32 of the limit and
 * cleared once it falls below ~7/8, avoiding flapping.  On a state
 * transition the cleaner frequency and (for non-5-tuple tables) the
 * idle timeouts are adjusted.
 *
 * NOTE(review): the transition handling looks inverted -- leaving the
 * full state selects the aggressive settings (4*hz, 5s timeouts, and
 * the cv_broadcast), while entering it selects the relaxed ones.
 * Verify against upstream before changing.
 */
static int
flow_full(struct flowtable *ft)
{
	boolean_t full;
	uint32_t count;

	full = ft->ft_full;
	count = ft->ft_count;

	if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
		ft->ft_full = FALSE;
	else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
		ft->ft_full = TRUE;

	if (full && !ft->ft_full) {
		flowclean_freq = 4*hz;
		if ((ft->ft_flags & FL_HASH_ALL) == 0)
			ft->ft_udp_idle = ft->ft_fin_wait_idle =
			    ft->ft_syn_idle = ft->ft_tcp_idle = 5;
		cv_broadcast(&flowclean_c_cv);
	} else if (!full && ft->ft_full) {
		flowclean_freq = 20*hz;
		if ((ft->ft_flags & FL_HASH_ALL) == 0)
			ft->ft_udp_idle = ft->ft_fin_wait_idle =
			    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
	}

	return (ft->ft_full);
}
966
/*
 * Insert a new flow keyed by 'hash'/'key' referencing the route in
 * 'ro'.  A fresh entry is allocated up front; under the bucket lock
 * the chain is scanned for a live entry with the same hash -- if one
 * is found the new entry is freed and either EEXIST is returned or,
 * with FL_OVERWRITE, the existing entry is refreshed in place.
 * Returns 0 on success, ENOMEM if no entry could be allocated.
 */
static int
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
    uint32_t fibnum, struct route *ro, uint16_t flags)
{
	struct flentry *fle, *fletail, *newfle, **flep;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];
	int depth;
	bitstr_t *mask;
	uint8_t proto;

	newfle = flow_alloc(ft);
	if (newfle == NULL)
		return (ENOMEM);

	newfle->f_flags |= (flags & FL_IPV6);
	proto = flags_to_proto(flags);

	FL_ENTRY_LOCK(ft, hash);
	mask = flowtable_mask(ft);
	flep = flowtable_entry(ft, hash);
	fletail = fle = *flep;

	if (fle == NULL) {
		/* Empty bucket: mark it occupied and publish the entry. */
		bit_set(mask, FL_ENTRY_INDEX(ft, hash));
		*flep = fle = newfle;
		goto skip;
	}

	depth = 0;
	fs->ft_collisions++;
	/*
	 * find end of list and make sure that we were not
	 * preempted by another thread handling this flow
	 */
	while (fle != NULL) {
		if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
			/*
			 * there was either a hash collision
			 * or we lost a race to insert
			 */
			FL_ENTRY_UNLOCK(ft, hash);
			flow_free(newfle, ft);

			/*
			 * NOTE(review): the FL_OVERWRITE path jumps to
			 * "skip" after the bucket has already been
			 * unlocked, so the existing entry is updated
			 * without the lock held and FL_ENTRY_UNLOCK
			 * runs a second time at the end -- verify.
			 */
			if (flags & FL_OVERWRITE)
				goto skip;
			return (EEXIST);
		}
		/*
		 * re-visit this double condition XXX
		 */
		if (fletail->f_next != NULL)
			fletail = fle->f_next;

		depth++;
		fle = fle->f_next;
	}

	if (depth > fs->ft_max_depth)
		fs->ft_max_depth = depth;
	/* Append the new entry at the tail of the collision chain. */
	fletail->f_next = newfle;
	fle = newfle;
skip:
	/* Fill in the entry (new or overwritten) from the caller's data. */
	flowtable_set_hashkey(fle, key);

	fle->f_proto = proto;
	fle->f_rt = ro->ro_rt;
	fle->f_lle = ro->ro_lle;
	fle->f_fhash = hash;
	fle->f_fibnum = fibnum;
	fle->f_uptime = time_uptime;
	FL_ENTRY_UNLOCK(ft, hash);
	return (0);
}
1040
1041 int
1042 kern_flowtable_insert(struct flowtable *ft,
1043 struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
1044 struct route *ro, uint32_t fibnum, int flags)
1045 {
1046 uint32_t key[9], hash;
1047
1048 flags = (ft->ft_flags | flags | FL_OVERWRITE);
1049 hash = 0;
1050
1051 #ifdef INET
1052 if (ssa->ss_family == AF_INET)
1053 hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
1054 (struct sockaddr_in *)dsa, key, flags);
1055 #endif
1056 #ifdef INET6
1057 if (ssa->ss_family == AF_INET6)
1058 hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
1059 (struct sockaddr_in6 *)dsa, key, flags);
1060 #endif
1061 if (ro->ro_rt == NULL || ro->ro_lle == NULL)
1062 return (EINVAL);
1063
1064 FLDPRINTF(ft, FL_DEBUG,
1065 "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
1066 key[0], key[1], key[2], hash, fibnum, flags);
1067 return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
1068 }
1069
1070 static int
1071 flowtable_key_equal(struct flentry *fle, uint32_t *key)
1072 {
1073 uint32_t *hashkey;
1074 int i, nwords;
1075
1076 if (fle->f_flags & FL_IPV6) {
1077 nwords = 9;
1078 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1079 } else {
1080 nwords = 3;
1081 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1082 }
1083
1084 for (i = 0; i < nwords; i++)
1085 if (hashkey[i] != key[i])
1086 return (0);
1087
1088 return (1);
1089 }
1090
1091 struct flentry *
1092 flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
1093 {
1094 struct flentry *fle = NULL;
1095
1096 #ifdef INET
1097 if (af == AF_INET)
1098 fle = flowtable_lookup_mbuf4(ft, m);
1099 #endif
1100 #ifdef INET6
1101 if (af == AF_INET6)
1102 fle = flowtable_lookup_mbuf6(ft, m);
1103 #endif
1104 if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
1105 m->m_flags |= M_FLOWID;
1106 m->m_pkthdr.flowid = fle->f_fhash;
1107 }
1108 return (fle);
1109 }
1110
/*
 * Core flow lookup by sockaddr pair.
 *
 * Hashes (ssa, dsa) into the table, walks the hash chain for a fully
 * matching, still-valid cached entry and returns it on a hit.  On a
 * miss (unless FL_NOAUTO is set or the table is full) it performs a
 * route and L2 (llentry) lookup and inserts a new entry via
 * flowtable_insert().
 *
 * Returns the flow entry on success, NULL on failure (loopback or
 * self-addressed traffic, zero hash, unreachable route, missing
 * llentry, or insert failure).  A returned entry holds references to
 * its rtentry and llentry for as long as it lives in the table.
 */
struct flentry *
flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
    struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
{
	uint32_t key[9], hash;
	struct flentry *fle;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];
	uint8_t proto = 0;
	int error = 0;
	struct rtentry *rt;
	struct llentry *lle;
	struct route sro, *ro;
	struct route_in6 sro6;

	sro.ro_rt = sro6.ro_rt = NULL;
	sro.ro_lle = sro6.ro_lle = NULL;
	ro = NULL;
	hash = 0;
	flags |= ft->ft_flags;
	proto = flags_to_proto(flags);
#ifdef INET
	if (ssa->ss_family == AF_INET) {
		struct sockaddr_in *ssin, *dsin;

		ro = &sro;
		memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
		/*
		 * The harvested source and destination addresses
		 * may contain port information if the packet is
		 * from a transport protocol (e.g. TCP/UDP). The
		 * port field must be cleared before performing
		 * a route lookup.
		 */
		((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
		dsin = (struct sockaddr_in *)dsa;
		ssin = (struct sockaddr_in *)ssa;
		/* Never cache loopback or self-addressed flows. */
		if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
		    (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
		    (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
			return (NULL);

		hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
	}
#endif
#ifdef INET6
	if (ssa->ss_family == AF_INET6) {
		struct sockaddr_in6 *ssin6, *dsin6;

		ro = (struct route *)&sro6;
		memcpy(&sro6.ro_dst, dsa,
		    sizeof(struct sockaddr_in6));
		/* As above: strip transport port before the route lookup. */
		((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
		dsin6 = (struct sockaddr_in6 *)dsa;
		ssin6 = (struct sockaddr_in6 *)ssa;

		flags |= FL_IPV6;
		hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
	}
#endif
	/*
	 * Ports are zero and this isn't a transmit cache
	 * - thus not a protocol for which we need to keep
	 * state
	 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
	 */
	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
		return (NULL);

	fs->ft_lookups++;
	FL_ENTRY_LOCK(ft, hash);
	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
		FL_ENTRY_UNLOCK(ft, hash);
		goto uncached;
	}
keycheck:
	/* Snapshot the volatile route/llentry pointers for validation. */
	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	/*
	 * A hit requires a full key/proto/fib match and that the cached
	 * route and L2 entry are both still usable.
	 */
	if ((rt != NULL)
	    && lle != NULL
	    && fle->f_fhash == hash
	    && flowtable_key_equal(fle, key)
	    && (proto == fle->f_proto)
	    && (fibnum == fle->f_fibnum)
	    && (rt->rt_flags & RTF_UP)
	    && (rt->rt_ifp != NULL)
	    && (lle->la_flags & LLE_VALID)) {
		fs->ft_hits++;
		fle->f_uptime = time_uptime;
		fle->f_flags |= flags;
		FL_ENTRY_UNLOCK(ft, hash);
		return (fle);
	} else if (fle->f_next != NULL) {
		fle = fle->f_next;
		goto keycheck;
	}
	FL_ENTRY_UNLOCK(ft, hash);
uncached:
	if (flags & FL_NOAUTO || flow_full(ft))
		return (NULL);

	fs->ft_misses++;
	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */

#ifdef INVARIANTS
	if ((ro->ro_dst.sa_family != AF_INET) &&
	    (ro->ro_dst.sa_family != AF_INET6))
		panic("sa_family == %d\n", ro->ro_dst.sa_family);
#endif

	ft->ft_rtalloc(ro, hash, fibnum);
	if (ro->ro_rt == NULL)
		error = ENETUNREACH;
	else {
		struct llentry *lle = NULL;
		struct sockaddr_storage *l3addr;
		struct rtentry *rt = ro->ro_rt;
		struct ifnet *ifp = rt->rt_ifp;

		/* Point-to-point and loopback flows are not cached. */
		if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
#ifdef INET6
		if (ssa->ss_family == AF_INET6) {
			struct sockaddr_in6 *dsin6;

			dsin6 = (struct sockaddr_in6 *)dsa;
			if (in6_localaddr(&dsin6->sin6_addr)) {
				RTFREE(rt);
				ro->ro_rt = NULL;
				return (NULL);
			}

			/* Resolve L2 via the gateway for indirect routes. */
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;

			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			llentry_update(&lle, LLTABLE6(ifp), l3addr, ifp);
		}
#endif
#ifdef INET
		if (ssa->ss_family == AF_INET) {
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;
			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
		}

#endif
		ro->ro_lle = lle;

		if (lle == NULL) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
		error = flowtable_insert(ft, hash, key, fibnum, ro, flags);

		/* On insert failure drop the references we acquired above. */
		if (error) {
			RTFREE(rt);
			LLE_FREE(lle);
			ro->ro_rt = NULL;
			ro->ro_lle = NULL;
		}
	}

	return ((error) ? NULL : fle);
}
1292
1293 /*
1294 * used by the bit_alloc macro
1295 */
1296 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
1297
1298 struct flowtable *
1299 flowtable_alloc(char *name, int nentry, int flags)
1300 {
1301 struct flowtable *ft, *fttail;
1302 int i;
1303
1304 if (V_flow_hashjitter == 0)
1305 V_flow_hashjitter = arc4random();
1306
1307 KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
1308
1309 ft = malloc(sizeof(struct flowtable),
1310 M_RTABLE, M_WAITOK | M_ZERO);
1311
1312 ft->ft_name = name;
1313 ft->ft_flags = flags;
1314 ft->ft_size = nentry;
1315 #ifdef RADIX_MPATH
1316 ft->ft_rtalloc = rtalloc_mpath_fib;
1317 #else
1318 ft->ft_rtalloc = rtalloc_ign_wrapper;
1319 #endif
1320 if (flags & FL_PCPU) {
1321 ft->ft_lock = flowtable_pcpu_lock;
1322 ft->ft_unlock = flowtable_pcpu_unlock;
1323
1324 for (i = 0; i <= mp_maxid; i++) {
1325 ft->ft_table.pcpu[i] =
1326 malloc(nentry*sizeof(struct flentry *),
1327 M_RTABLE, M_WAITOK | M_ZERO);
1328 ft->ft_masks[i] = bit_alloc(nentry);
1329 }
1330 } else {
1331 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
1332 (fls(mp_maxid + 1) << 1));
1333
1334 ft->ft_lock = flowtable_global_lock;
1335 ft->ft_unlock = flowtable_global_unlock;
1336 ft->ft_table.global =
1337 malloc(nentry*sizeof(struct flentry *),
1338 M_RTABLE, M_WAITOK | M_ZERO);
1339 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
1340 M_RTABLE, M_WAITOK | M_ZERO);
1341 for (i = 0; i < ft->ft_lock_count; i++)
1342 mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
1343
1344 ft->ft_masks[0] = bit_alloc(nentry);
1345 }
1346 ft->ft_tmpmask = bit_alloc(nentry);
1347
1348 /*
1349 * In the local transmit case the table truly is
1350 * just a cache - so everything is eligible for
1351 * replacement after 5s of non-use
1352 */
1353 if (flags & FL_HASH_ALL) {
1354 ft->ft_udp_idle = V_flowtable_udp_expire;
1355 ft->ft_syn_idle = V_flowtable_syn_expire;
1356 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
1357 ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
1358 } else {
1359 ft->ft_udp_idle = ft->ft_fin_wait_idle =
1360 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
1361
1362 }
1363
1364 /*
1365 * hook in to the cleaner list
1366 */
1367 if (V_flow_list_head == NULL)
1368 V_flow_list_head = ft;
1369 else {
1370 fttail = V_flow_list_head;
1371 while (fttail->ft_next != NULL)
1372 fttail = fttail->ft_next;
1373 fttail->ft_next = ft;
1374 }
1375
1376 return (ft);
1377 }
1378
1379 /*
1380 * The rest of the code is devoted to garbage collection of expired entries.
1381 * It is a new additon made necessary by the switch to dynamically allocating
1382 * flow tables.
1383 *
1384 */
1385 static void
1386 fle_free(struct flentry *fle, struct flowtable *ft)
1387 {
1388 struct rtentry *rt;
1389 struct llentry *lle;
1390
1391 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1392 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1393 RTFREE(rt);
1394 LLE_FREE(lle);
1395 flow_free(fle, ft);
1396 }
1397
/*
 * Sweep the table (for the current CPU's view, if per-CPU) and free
 * entries that are stale, or — when rt is non-NULL — every entry that
 * references that specific route (used on route deletion).
 *
 * Walks only buckets whose bit is set in the occupancy mask, unlinks
 * victims under the bucket lock onto a private free list, and frees
 * them after all locks are dropped.
 */
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
{
	int curbit = 0, count;
	struct flentry *fle, **flehead, *fleprev;
	struct flentry *flefreehead, *flefreetail, *fletmp;
	bitstr_t *mask, *tmpmask;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];

	flefreehead = flefreetail = NULL;
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	/* Work on a scratch copy so bits can be consumed as we go. */
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FL_ENTRY_LOCK(ft, curbit);
		flehead = flowtable_entry(ft, curbit);
		fle = fleprev = *flehead;

		fs->ft_free_checks++;
#ifdef DIAGNOSTIC
		if (fle == NULL && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif
		while (fle != NULL) {
			/* Keep entries that don't match the victim test. */
			if (rt != NULL) {
				if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
					fleprev = fle;
					fle = fle->f_next;
					continue;
				}
			} else if (!flow_stale(ft, fle)) {
				fleprev = fle;
				fle = fle->f_next;
				continue;
			}
			/*
			 * delete head of the list
			 */
			if (fleprev == *flehead) {
				fletmp = fleprev;
				if (fle == fleprev) {
					fleprev = *flehead = fle->f_next;
				} else
					fleprev = *flehead = fle;
				fle = fle->f_next;
			} else {
				/*
				 * don't advance fleprev
				 */
				fletmp = fle;
				fleprev->f_next = fle->f_next;
				fle = fleprev->f_next;
			}

			/* Append the victim to the private free list. */
			if (flefreehead == NULL)
				flefreehead = flefreetail = fletmp;
			else {
				flefreetail->f_next = fletmp;
				flefreetail = fletmp;
			}
			fletmp->f_next = NULL;
		}
		/* Clear the occupancy bit if the bucket emptied out. */
		if (*flehead == NULL)
			bit_clear(mask, curbit);
		FL_ENTRY_UNLOCK(ft, curbit);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
	/* Free the collected victims outside of any bucket lock. */
	count = 0;
	while ((fle = flefreehead) != NULL) {
		flefreehead = fle->f_next;
		count++;
		fs->ft_frees++;
		fle_free(fle, ft);
	}
	if (V_flowtable_debug && count)
		log(LOG_DEBUG, "freed %d flow entries\n", count);
}
1491
1492 void
1493 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1494 {
1495 int i;
1496
1497 if (ft->ft_flags & FL_PCPU) {
1498 CPU_FOREACH(i) {
1499 if (smp_started == 1) {
1500 thread_lock(curthread);
1501 sched_bind(curthread, i);
1502 thread_unlock(curthread);
1503 }
1504
1505 flowtable_free_stale(ft, rt);
1506
1507 if (smp_started == 1) {
1508 thread_lock(curthread);
1509 sched_unbind(curthread);
1510 thread_unlock(curthread);
1511 }
1512 }
1513 } else {
1514 flowtable_free_stale(ft, rt);
1515 }
1516 }
1517
1518 static void
1519 flowtable_clean_vnet(void)
1520 {
1521 struct flowtable *ft;
1522 int i;
1523
1524 ft = V_flow_list_head;
1525 while (ft != NULL) {
1526 if (ft->ft_flags & FL_PCPU) {
1527 CPU_FOREACH(i) {
1528 if (smp_started == 1) {
1529 thread_lock(curthread);
1530 sched_bind(curthread, i);
1531 thread_unlock(curthread);
1532 }
1533
1534 flowtable_free_stale(ft, NULL);
1535
1536 if (smp_started == 1) {
1537 thread_lock(curthread);
1538 sched_unbind(curthread);
1539 thread_unlock(curthread);
1540 }
1541 }
1542 } else {
1543 flowtable_free_stale(ft, NULL);
1544 }
1545 ft = ft->ft_next;
1546 }
1547 }
1548
1549 static void
1550 flowtable_cleaner(void)
1551 {
1552 VNET_ITERATOR_DECL(vnet_iter);
1553 struct thread *td;
1554
1555 if (bootverbose)
1556 log(LOG_INFO, "flowtable cleaner started\n");
1557 td = curthread;
1558 while (1) {
1559 VNET_LIST_RLOCK();
1560 VNET_FOREACH(vnet_iter) {
1561 CURVNET_SET(vnet_iter);
1562 flowtable_clean_vnet();
1563 CURVNET_RESTORE();
1564 }
1565 VNET_LIST_RUNLOCK();
1566
1567 /*
1568 * The 10 second interval between cleaning checks
1569 * is arbitrary
1570 */
1571 mtx_lock(&flowclean_lock);
1572 thread_lock(td);
1573 sched_prio(td, PPAUSE);
1574 thread_unlock(td);
1575 flowclean_cycles++;
1576 cv_broadcast(&flowclean_f_cv);
1577 cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
1578 mtx_unlock(&flowclean_lock);
1579 }
1580 }
1581
1582 static void
1583 flowtable_flush(void *unused __unused)
1584 {
1585 uint64_t start;
1586
1587 mtx_lock(&flowclean_lock);
1588 start = flowclean_cycles;
1589 while (start == flowclean_cycles) {
1590 cv_broadcast(&flowclean_c_cv);
1591 cv_wait(&flowclean_f_cv, &flowclean_lock);
1592 }
1593 mtx_unlock(&flowclean_lock);
1594 }
1595
/*
 * Kernel process descriptor for the flowcleaner daemon; started at
 * SI_SUB_KTHREAD_IDLE via kproc_start.
 */
static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1602
/*
 * Per-vnet initialization: size the flow-entry limit from maxusers and
 * CPU count, create the v4/v6 UMA zones capped at that limit, and mark
 * the vnet's flowtable machinery ready.  Runs at SI_SUB_SMP so that
 * mp_ncpus is final.
 */
static void
flowtable_init_vnet(const void *unused __unused)
{

	V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
	V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
	uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
	V_flowtable_ready = 1;
}
VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
    flowtable_init_vnet, NULL);
1618
/*
 * Global (non-vnet) initialization: set up the cleaner's condvars,
 * mutex and wakeup frequency, and register the flush handler for
 * interface departure.  The condvars and mutex must exist before the
 * event handler can fire, hence they are initialized first.
 */
static void
flowtable_init(const void *unused __unused)
{

	cv_init(&flowclean_c_cv, "c_flowcleanwait");
	cv_init(&flowclean_f_cv, "f_flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
	/* Wake the cleaner every 20 seconds. */
	flowclean_freq = 20*hz;
}
SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
	flowtable_init, NULL);
1632
1633
#ifdef VIMAGE
/*
 * Per-vnet teardown: mark the machinery unavailable, then destroy the
 * flow-entry zones.  Only needed when virtualized network stacks can
 * be destroyed.
 */
static void
flowtable_uninit(const void *unused __unused)
{

	V_flowtable_ready = 0;
	uma_zdestroy(V_flow_ipv4_zone);
	uma_zdestroy(V_flow_ipv6_zone);
}

VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_uninit, NULL);
#endif
1647
1648 #ifdef DDB
1649 static uint32_t *
1650 flowtable_get_hashkey(struct flentry *fle)
1651 {
1652 uint32_t *hashkey;
1653
1654 if (fle->f_flags & FL_IPV6)
1655 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1656 else
1657 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1658
1659 return (hashkey);
1660 }
1661
1662 static bitstr_t *
1663 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1664 {
1665 bitstr_t *mask;
1666
1667 if (ft->ft_flags & FL_PCPU)
1668 mask = ft->ft_masks[cpuid];
1669 else
1670 mask = ft->ft_masks[0];
1671
1672 return (mask);
1673 }
1674
1675 static struct flentry **
1676 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1677 {
1678 struct flentry **fle;
1679 int index = (hash % ft->ft_size);
1680
1681 if (ft->ft_flags & FL_PCPU) {
1682 fle = &ft->ft_table.pcpu[cpuid][index];
1683 } else {
1684 fle = &ft->ft_table.global[index];
1685 }
1686
1687 return (fle);
1688 }
1689
/*
 * DDB helper: print a one-entry summary — addresses/ports (IPv4 only),
 * flag names, raw key words, hash, idle time, fib and route pointer.
 * Runs from the debugger, so structures are read without locking.
 */
static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	int rt_valid, ifp_valid;
	uint16_t sport, dport;
	uint32_t *hashkey;
	char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
	volatile struct rtentry *rt;
	struct ifnet *ifp = NULL;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt = fle->f_rt;
	rt_valid = rt != NULL;
	if (rt_valid)
		ifp = rt->rt_ifp;
	ifp_valid = ifp != NULL;
	hashkey = flowtable_get_hashkey(fle);
	/* Only IPv4 keys are decoded into dotted-quad form. */
	if (fle->f_flags & FL_IPV6)
		goto skipaddr;

	inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
	if (ft->ft_flags & FL_HASH_ALL) {
		/* FL_HASH_ALL keys carry src addr + ports as well. */
		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
		sport = ntohs(((uint16_t *)hashkey)[0]);
		dport = ntohs(((uint16_t *)hashkey)[1]);
		db_printf("%s:%d->%s:%d",
		    saddr, sport, daddr,
		    dport);
	} else
		db_printf("%s ", daddr);

skipaddr:
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
	if (fle->f_flags & FL_TCP)
		db_printf(" FL_TCP ");
	if (fle->f_flags & FL_UDP)
		db_printf(" FL_UDP ");
	if (rt_valid) {
		if (rt->rt_flags & RTF_UP)
			db_printf(" RTF_UP ");
	}
	if (ifp_valid) {
		if (ifp->if_flags & IFF_LOOPBACK)
			db_printf(" IFF_LOOPBACK ");
		if (ifp->if_flags & IFF_UP)
			db_printf(" IFF_UP ");
		if (ifp->if_flags & IFF_POINTOPOINT)
			db_printf(" IFF_POINTOPOINT ");
	}
	/*
	 * NOTE(review): the IPv6 format string omits separators between
	 * words 3/4 and 6/7 — possibly intentional address grouping,
	 * possibly a typo; confirm before changing debugger output.
	 */
	if (fle->f_flags & FL_IPV6)
		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
		    hashkey[0], hashkey[1], hashkey[2],
		    hashkey[3], hashkey[4], hashkey[5],
		    hashkey[6], hashkey[7], hashkey[8]);
	else
		db_printf("\n\tkey=%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2]);
	db_printf("hash=%08x idle_time=%03d"
	    "\n\tfibnum=%02d rt=%p",
	    fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
	db_printf("\n");
}
1754
1755 static void
1756 flowtable_show(struct flowtable *ft, int cpuid)
1757 {
1758 int curbit = 0;
1759 struct flentry *fle, **flehead;
1760 bitstr_t *mask, *tmpmask;
1761
1762 if (cpuid != -1)
1763 db_printf("cpu: %d\n", cpuid);
1764 mask = flowtable_mask_pcpu(ft, cpuid);
1765 tmpmask = ft->ft_tmpmask;
1766 memcpy(tmpmask, mask, ft->ft_size/8);
1767 /*
1768 * XXX Note to self, bit_ffs operates at the byte level
1769 * and thus adds gratuitous overhead
1770 */
1771 bit_ffs(tmpmask, ft->ft_size, &curbit);
1772 while (curbit != -1) {
1773 if (curbit >= ft->ft_size || curbit < -1) {
1774 db_printf("warning: bad curbit value %d \n",
1775 curbit);
1776 break;
1777 }
1778
1779 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1780 fle = *flehead;
1781
1782 while (fle != NULL) {
1783 flow_show(ft, fle);
1784 fle = fle->f_next;
1785 continue;
1786 }
1787 bit_clear(tmpmask, curbit);
1788 bit_ffs(tmpmask, ft->ft_size, &curbit);
1789 }
1790 }
1791
1792 static void
1793 flowtable_show_vnet(void)
1794 {
1795 struct flowtable *ft;
1796 int i;
1797
1798 ft = V_flow_list_head;
1799 while (ft != NULL) {
1800 printf("name: %s\n", ft->ft_name);
1801 if (ft->ft_flags & FL_PCPU) {
1802 CPU_FOREACH(i) {
1803 flowtable_show(ft, i);
1804 }
1805 } else {
1806 flowtable_show(ft, -1);
1807 }
1808 ft = ft->ft_next;
1809 }
1810 }
1811
/*
 * "show flowtables" DDB command: dump the flow tables of every vnet.
 * The vnet pointer is printed only when VIMAGE is compiled in.
 */
DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
#ifdef VIMAGE
		db_printf("vnet %p\n", vnet_iter);
#endif
		flowtable_show_vnet();
		CURVNET_RESTORE();
	}
}
1825 #endif
Cache object: eea60e264fe7b99a092f6958e245b174
|