FreeBSD/Linux Kernel Cross Reference
sys/net/flowtable.c
1 /**************************************************************************
2
3 Copyright (c) 2008-2010, BitGravity Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
11
12 2. Neither the name of the BitGravity Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include "opt_route.h"
31 #include "opt_mpath.h"
32 #include "opt_ddb.h"
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38
39 #include <sys/param.h>
40 #include <sys/types.h>
41 #include <sys/bitstring.h>
42 #include <sys/condvar.h>
43 #include <sys/callout.h>
44 #include <sys/kernel.h>
45 #include <sys/kthread.h>
46 #include <sys/limits.h>
47 #include <sys/malloc.h>
48 #include <sys/mbuf.h>
49 #include <sys/proc.h>
50 #include <sys/sbuf.h>
51 #include <sys/sched.h>
52 #include <sys/smp.h>
53 #include <sys/socket.h>
54 #include <sys/syslog.h>
55 #include <sys/sysctl.h>
56
57 #include <net/if.h>
58 #include <net/if_llatbl.h>
59 #include <net/if_var.h>
60 #include <net/route.h>
61 #include <net/flowtable.h>
62 #include <net/vnet.h>
63
64 #include <netinet/in.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in_var.h>
67 #include <netinet/if_ether.h>
68 #include <netinet/ip.h>
69 #ifdef INET6
70 #include <netinet/ip6.h>
71 #endif
72 #include <netinet/tcp.h>
73 #include <netinet/udp.h>
74 #include <netinet/sctp.h>
75
76 #include <libkern/jenkins.h>
77 #include <ddb/ddb.h>
78
/*
 * IPv4 flow signature.  Ports are laid out first so that, viewed
 * through the union below, the 12-byte tuple overlays ipf_key[3]:
 * word 0 holds both ports, words 1 and 2 the addresses (see
 * ipv4_flow_lookup_hash_internal(), which fills the key in exactly
 * that order).
 */
struct ipv4_tuple {
	uint16_t 	ip_sport;	/* source port */
	uint16_t 	ip_dport;	/* destination port */
	in_addr_t 	ip_saddr;	/* source address */
	in_addr_t 	ip_daddr;	/* destination address */
};

/* Overlay of the IPv4 tuple as three 32-bit words for hashing. */
union ipv4_flow {
	struct ipv4_tuple ipf_ipt;
	uint32_t 	ipf_key[3];
};
90
/*
 * IPv6 flow signature: 2+2 port bytes plus two 16-byte addresses,
 * 36 bytes total, overlaid as nine 32-bit hash words by the union
 * below.  Note the hashing code fills the key with the destination
 * address before the source (see ipv6_flow_lookup_hash_internal()),
 * so the key layout is not simply this struct's byte order.
 */
struct ipv6_tuple {
	uint16_t 	ip_sport;	/* source port */
	uint16_t 	ip_dport;	/* destination port */
	struct in6_addr	ip_saddr;	/* source address */
	struct in6_addr	ip_daddr;	/* destination address */
};

/* Overlay of the IPv6 tuple as nine 32-bit words for hashing. */
union ipv6_flow {
	struct ipv6_tuple ipf_ipt;
	uint32_t 	ipf_key[9];
};
102
/*
 * State common to every cached flow; embedded as the first member of
 * the per-address-family entries below.  f_fhash == 0 marks an
 * unused/invalid entry (see flow_stale()).  f_rt and f_lle are
 * volatile because they are re-read and validated after the initial
 * bucket lookup.
 */
struct flentry {
	volatile uint32_t	f_fhash;	/* hash flowing forward */
	uint16_t	f_flags;	/* flow flags */
	uint8_t		f_pad;		/* explicit padding */
	uint8_t		f_proto;	/* protocol */
	uint32_t	f_fibnum;	/* fib index */
	uint32_t	f_uptime;	/* uptime at last access */
	struct flentry	*f_next;	/* pointer to collision entry */
	volatile struct rtentry *f_rt;	/* rtentry for flow */
	volatile struct llentry *f_lle;	/* llentry for flow */
};
114
/*
 * Address-family specific flow entries: the common header followed by
 * the family's hash-key union.  Both variants place fl_flow at the
 * same offset, immediately after struct flentry.
 */
struct flentry_v4 {
	struct flentry	fl_entry;
	union ipv4_flow	fl_flow;
};

struct flentry_v6 {
	struct flentry	fl_entry;
	union ipv6_flow	fl_flow;
};
124
/*
 * Shorthand accessors for the common header embedded in the v4/v6
 * entries.  NB: these previously expanded to "fl_entry.fl_xxx",
 * naming fields that do not exist in struct flentry (its members are
 * f_xxx), so any use would have failed to compile and the macros were
 * effectively dead.  Point them at the real member names.
 */
#define	fl_fhash	fl_entry.f_fhash
#define	fl_flags	fl_entry.f_flags
#define	fl_proto	fl_entry.f_proto
#define	fl_uptime	fl_entry.f_uptime
#define	fl_rt		fl_entry.f_rt
#define	fl_lle		fl_entry.f_lle

#define	SECS_PER_HOUR		3600
#define	SECS_PER_DAY		(24*SECS_PER_HOUR)

/* Default idle timeouts (seconds) before a flow is considered stale. */
#define	SYN_IDLE		300
#define	UDP_IDLE		300
#define	FIN_WAIT_IDLE		600
#define	TCP_IDLE		SECS_PER_DAY
140
/* Per-table lock/unlock and route-allocation strategy hooks. */
typedef	void fl_lock_t(struct flowtable *, uint32_t);
typedef	void fl_rtalloc_t(struct route *, uint32_t, u_int);

/*
 * Bucket-array storage: a single shared array for global tables, or
 * one array per CPU for FL_PCPU tables (selected by ft_flags).
 */
union flentryp {
	struct flentry		**global;
	struct flentry		**pcpu[MAXCPU];
};
148
/*
 * Per-CPU counters, cache-line aligned so CPUs do not share a line.
 * Aggregated (summed, except ft_max_depth which is maxed) for display
 * by flowtable_show_stats().
 */
struct flowtable_stats {
	uint64_t	ft_collisions;	/* bucket already occupied on insert */
	uint64_t	ft_allocated;
	uint64_t	ft_misses;
	uint64_t	ft_max_depth;	/* longest collision chain walked */
	uint64_t	ft_free_checks;
	uint64_t	ft_frees;
	uint64_t	ft_hits;
	uint64_t	ft_lookups;
} __aligned(CACHE_LINE_SIZE);
159
/*
 * A flow cache instance.  Mostly-read lookup state sits first; the
 * hot-path counter ft_count and the idle-timeout knobs are pushed onto
 * separate cache lines with __aligned to limit false sharing.
 */
struct flowtable {
	struct	flowtable_stats ft_stats[MAXCPU];
	int 		ft_size;	/* number of buckets */
	int 		ft_lock_count;	/* lock stripes; power of two (see flowtable_global_lock) */
	uint32_t	ft_flags;	/* FL_PCPU, FL_HASH_ALL, FL_IPV6, ... */
	char		*ft_name;
	fl_lock_t	*ft_lock;	/* per-table locking strategy */
	fl_lock_t 	*ft_unlock;
	fl_rtalloc_t	*ft_rtalloc;
	/*
	 * XXX need to pad out 
	 */ 
	struct mtx	*ft_locks;	/* stripe array for global tables */
	union flentryp	ft_table;	/* bucket heads (global or per-CPU) */
	bitstr_t 	*ft_masks[MAXCPU]; /* occupied-bucket bitmaps */
	bitstr_t	*ft_tmpmask;
	struct flowtable *ft_next;	/* next table on V_flow_list_head */

	uint32_t	ft_count __aligned(CACHE_LINE_SIZE); /* live entries */
	uint32_t	ft_udp_idle __aligned(CACHE_LINE_SIZE);
	uint32_t	ft_fin_wait_idle;
	uint32_t	ft_syn_idle;
	uint32_t	ft_tcp_idle;
	boolean_t	ft_full;	/* hysteresis flag, see flow_full() */
} __aligned(CACHE_LINE_SIZE);
185
/* Kernel thread that periodically expires stale flows. */
static struct proc *flowcleanerproc;

/* Per-vnet state: table list, hash perturbation, and entry zones. */
static VNET_DEFINE(struct flowtable *, flow_list_head);
static VNET_DEFINE(uint32_t, flow_hashjitter);
static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);

#define	V_flow_list_head	VNET(flow_list_head)
#define	V_flow_hashjitter	VNET(flow_hashjitter)
#define	V_flow_ipv4_zone	VNET(flow_ipv4_zone)
#define	V_flow_ipv6_zone	VNET(flow_ipv6_zone)


/* Cleaner-thread handshake: freed/clean condvars, their lock, and pacing. */
static struct cv 	flowclean_f_cv;
static struct cv 	flowclean_c_cv;
static struct mtx	flowclean_lock;
static uint32_t		flowclean_cycles;
static uint32_t		flowclean_freq;
203
#ifdef FLOWTABLE_DEBUG
/*
 * Conditional debug printf: emit only when one of "flags" is set in
 * the table's ft_flags.  Wrapped in do { } while (0) *without* a
 * trailing semicolon so it behaves as a single statement everywhere
 * (the previous definition ended in "while (0);" followed by a
 * line-continuation backslash, which both broke
 * "if (x) FLDPRINTF(...); else ..." and glued the following line
 * into the macro).
 */
#define FLDPRINTF(ft, flags, fmt, ...) 		\
do {		  				\
	if ((ft)->ft_flags & (flags))		\
		printf((fmt), __VA_ARGS__);	\
} while (0)
#else
#define FLDPRINTF(ft, flags, fmt, ...)
#endif
215
216
217 /*
218 * TODO:
219 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
220 * to avoid extra cache evictions caused by incrementing a shared
221 * counter
222 * - add sysctls to resize && flush flow tables
223 * - Add per flowtable sysctls for statistics and configuring timeouts
224 * - add saturation counter to rtentry to support per-packet load-balancing
225 * add flag to indicate round-robin flow, add list lookup from head
226 for flows
227 * - add sysctl / device node / syscall to support exporting and importing
228 * of flows with flag to indicate that a flow was imported so should
229 * not be considered for auto-cleaning
230 * - support explicit connection state (currently only ad-hoc for DSR)
231 * - idetach() cleanup for options VIMAGE builds.
232 */
/*
 * Per-vnet tunables.  flowtable_ready gates hashing until
 * initialization completes (the *_hash_internal helpers return 0
 * while it is clear, which callers treat as "no flow").
 */
VNET_DEFINE(int, flowtable_enable) = 1;
static VNET_DEFINE(int, flowtable_debug);
static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
static VNET_DEFINE(int, flowtable_nmbflows);
static VNET_DEFINE(int, flowtable_ready) = 0;

#define	V_flowtable_enable		VNET(flowtable_enable)
#define	V_flowtable_debug		VNET(flowtable_debug)
#define	V_flowtable_syn_expire		VNET(flowtable_syn_expire)
#define	V_flowtable_udp_expire		VNET(flowtable_udp_expire)
#define	V_flowtable_fin_wait_expire	VNET(flowtable_fin_wait_expire)
#define	V_flowtable_tcp_expire		VNET(flowtable_tcp_expire)
#define	V_flowtable_nmbflows		VNET(flowtable_nmbflows)
#define	V_flowtable_ready		VNET(flowtable_ready)
250
/* sysctl tree: net.inet.flowtable.* */
static SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL,
    "flowtable");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
    &VNET_NAME(flowtable_debug), 0, "print debug info.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
    &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");

/*
 * XXX This does not end up updating timeouts at runtime
 * and only reflects the value for the last table added :-/
 * (the ft_*_idle fields are copied from these at table creation)
 */
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_syn_expire), 0,
    "seconds after which to remove syn allocated flow.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_udp_expire), 0,
    "seconds after which to remove flow allocated to UDP.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_fin_wait_expire), 0,
    "seconds after which to remove a flow in FIN_WAIT.");
SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
    &VNET_NAME(flowtable_tcp_expire), 0,
    "seconds after which to remove flow allocated to a TCP connection.");
274
275
276 /*
277 * Maximum number of flows that can be allocated of a given type.
278 *
279 * The table is allocated at boot time (for the pure caching case
280 * there is no reason why this could not be changed at runtime)
281 * and thus (currently) needs to be set with a tunable.
282 */
283 static int
284 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
285 {
286 int error, newnmbflows;
287
288 newnmbflows = V_flowtable_nmbflows;
289 error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
290 if (error == 0 && req->newptr) {
291 if (newnmbflows > V_flowtable_nmbflows) {
292 V_flowtable_nmbflows = newnmbflows;
293 uma_zone_set_max(V_flow_ipv4_zone,
294 V_flowtable_nmbflows);
295 uma_zone_set_max(V_flow_ipv6_zone,
296 V_flowtable_nmbflows);
297 } else
298 error = EINVAL;
299 }
300 return (error);
301 }
302 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
303 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
304 "Maximum number of flows allowed");
305
306
307
308 #define FS_PRINT(sb, field) sbuf_printf((sb), "\t%s: %jd\n", #field, fs->ft_##field)
309
310 static void
311 fs_print(struct sbuf *sb, struct flowtable_stats *fs)
312 {
313
314 FS_PRINT(sb, collisions);
315 FS_PRINT(sb, allocated);
316 FS_PRINT(sb, misses);
317 FS_PRINT(sb, max_depth);
318 FS_PRINT(sb, free_checks);
319 FS_PRINT(sb, frees);
320 FS_PRINT(sb, hits);
321 FS_PRINT(sb, lookups);
322 }
323
/*
 * Append "ft"'s statistics to "sb".  For per-CPU tables the per-CPU
 * counters are summed into a stack copy (ft_max_depth is maxed, not
 * summed); global tables keep all counters in slot 0.  Counters are
 * read without synchronization, so the snapshot is approximate.
 */
static void
flowtable_show_stats(struct sbuf *sb, struct flowtable *ft)
{
	int i;
	struct flowtable_stats fs, *pfs;

	if (ft->ft_flags & FL_PCPU) {
		bzero(&fs, sizeof(fs));
		pfs = &fs;
		CPU_FOREACH(i) {
			pfs->ft_collisions  += ft->ft_stats[i].ft_collisions;
			pfs->ft_allocated   += ft->ft_stats[i].ft_allocated;
			pfs->ft_misses      += ft->ft_stats[i].ft_misses;
			pfs->ft_free_checks += ft->ft_stats[i].ft_free_checks;
			pfs->ft_frees       += ft->ft_stats[i].ft_frees;
			pfs->ft_hits        += ft->ft_stats[i].ft_hits;
			pfs->ft_lookups     += ft->ft_stats[i].ft_lookups;
			if (ft->ft_stats[i].ft_max_depth > pfs->ft_max_depth)
				pfs->ft_max_depth = ft->ft_stats[i].ft_max_depth;
		}
	} else {
		pfs = &ft->ft_stats[0];
	}
	fs_print(sb, pfs);
}
349
350 static int
351 sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
352 {
353 struct flowtable *ft;
354 struct sbuf *sb;
355 int error;
356
357 sb = sbuf_new(NULL, NULL, 64*1024, SBUF_FIXEDLEN);
358
359 ft = V_flow_list_head;
360 while (ft != NULL) {
361 sbuf_printf(sb, "\ntable name: %s\n", ft->ft_name);
362 flowtable_show_stats(sb, ft);
363 ft = ft->ft_next;
364 }
365 sbuf_finish(sb);
366 error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
367 sbuf_delete(sb);
368
369 return (error);
370 }
371 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats, CTLTYPE_STRING|CTLFLAG_RD,
372 NULL, 0, sysctl_flowtable_stats, "A", "flowtable statistics");
373
374
#ifndef RADIX_MPATH
/*
 * Adapt rtalloc_ign_fib() to the fl_rtalloc_t signature.  The flow
 * hash argument is unused here; it only matters to the multipath
 * allocator used when RADIX_MPATH is configured.
 */
static void
rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
{

	rtalloc_ign_fib(ro, 0, fibnum);
}
#endif
383
384 static void
385 flowtable_global_lock(struct flowtable *table, uint32_t hash)
386 {
387 int lock_index = (hash)&(table->ft_lock_count - 1);
388
389 mtx_lock(&table->ft_locks[lock_index]);
390 }
391
392 static void
393 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
394 {
395 int lock_index = (hash)&(table->ft_lock_count - 1);
396
397 mtx_unlock(&table->ft_locks[lock_index]);
398 }
399
/*
 * Per-CPU tables need no mutex: entering a critical section pins the
 * thread to this CPU, which is the only writer of its table.  Both
 * arguments exist only to satisfy fl_lock_t.
 */
static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

	critical_enter();
}
406
/* Leave the critical section entered by flowtable_pcpu_lock(). */
static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

	critical_exit();
}
413
/* Bucket index / bucket head / per-bucket locking helpers. */
#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
#define FL_ENTRY_LOCK(table, hash)  (table)->ft_lock((table), (hash))
#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))

/* Entry-local f_flags bits (kept clear of the FL_* table flags). */
#define FL_STALE 	(1<<8)		/* entry must be discarded */
#define FL_OVERWRITE	(1<<10)		/* insert may replace a live entry */
421
/*
 * Mark "fle" stale so the next lookup or cleaner pass discards it
 * (flow_stale() checks FL_STALE first).
 */
void
flow_invalidate(struct flentry *fle)
{

	fle->f_flags |= FL_STALE;
}
428
429 static __inline int
430 proto_to_flags(uint8_t proto)
431 {
432 int flag;
433
434 switch (proto) {
435 case IPPROTO_TCP:
436 flag = FL_TCP;
437 break;
438 case IPPROTO_SCTP:
439 flag = FL_SCTP;
440 break;
441 case IPPROTO_UDP:
442 flag = FL_UDP;
443 break;
444 default:
445 flag = 0;
446 break;
447 }
448
449 return (flag);
450 }
451
452 static __inline int
453 flags_to_proto(int flags)
454 {
455 int proto, protoflags;
456
457 protoflags = flags & (FL_TCP|FL_SCTP|FL_UDP);
458 switch (protoflags) {
459 case FL_TCP:
460 proto = IPPROTO_TCP;
461 break;
462 case FL_SCTP:
463 proto = IPPROTO_SCTP;
464 break;
465 case FL_UDP:
466 proto = IPPROTO_UDP;
467 break;
468 default:
469 proto = 0;
470 break;
471 }
472 return (proto);
473 }
474
475 #ifdef INET
476 #ifdef FLOWTABLE_DEBUG
/*
 * Debug helper (FLOWTABLE_DEBUG only): print the tuple being looked
 * up.  With FL_HASH_ALL the full 4-tuple is printed; otherwise only
 * the destination address, since ports/source are not part of the key.
 * Buffers are sized for dotted-quad plus NUL.
 */
static void
ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
    struct sockaddr_in *dsin)
{
	char saddr[4*sizeof "123"], daddr[4*sizeof "123"];

	if (flags & FL_HASH_ALL) {
		inet_ntoa_r(ssin->sin_addr, saddr);
		inet_ntoa_r(dsin->sin_addr, daddr);
		printf("proto=%d %s:%d->%s:%d\n",
		    proto, saddr, ntohs(ssin->sin_port), daddr,
		    ntohs(dsin->sin_port));
	} else {
		inet_ntoa_r(*(struct in_addr *) &dsin->sin_addr, daddr);
		printf("proto=%d %s\n", proto, daddr);
	}

}
495 #endif
496
497 static int
498 ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
499 struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
500 {
501 struct ip *ip;
502 uint8_t proto;
503 int iphlen;
504 struct tcphdr *th;
505 struct udphdr *uh;
506 struct sctphdr *sh;
507 uint16_t sport, dport;
508
509 proto = sport = dport = 0;
510 ip = mtod(m, struct ip *);
511 dsin->sin_family = AF_INET;
512 dsin->sin_len = sizeof(*dsin);
513 dsin->sin_addr = ip->ip_dst;
514 ssin->sin_family = AF_INET;
515 ssin->sin_len = sizeof(*ssin);
516 ssin->sin_addr = ip->ip_src;
517
518 proto = ip->ip_p;
519 if ((*flags & FL_HASH_ALL) == 0) {
520 FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
521 *flags);
522 goto skipports;
523 }
524
525 iphlen = ip->ip_hl << 2; /* XXX options? */
526
527 switch (proto) {
528 case IPPROTO_TCP:
529 th = (struct tcphdr *)((caddr_t)ip + iphlen);
530 sport = th->th_sport;
531 dport = th->th_dport;
532 if ((*flags & FL_HASH_ALL) &&
533 (th->th_flags & (TH_RST|TH_FIN)))
534 *flags |= FL_STALE;
535 break;
536 case IPPROTO_UDP:
537 uh = (struct udphdr *)((caddr_t)ip + iphlen);
538 sport = uh->uh_sport;
539 dport = uh->uh_dport;
540 break;
541 case IPPROTO_SCTP:
542 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
543 sport = sh->src_port;
544 dport = sh->dest_port;
545 break;
546 default:
547 FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", proto);
548 return (ENOTSUP);
549 /* no port - hence not a protocol we care about */
550 break;
551
552 }
553
554 skipports:
555 *flags |= proto_to_flags(proto);
556 ssin->sin_port = sport;
557 dsin->sin_port = dport;
558 return (0);
559 }
560
/*
 * Build the 3-word IPv4 hash key from the sockaddrs and return its
 * Jenkins hash.  key[1] = source address, key[2] = destination
 * address; with FL_HASH_ALL the two (network-order) ports are packed
 * into key[0], otherwise key[0] stays 0 and the per-vnet jitter plus
 * protocol number is used as the hash seed instead.  Returns 0 (no
 * flow) when the flowtable is disabled or not yet initialized.
 */
static uint32_t
ipv4_flow_lookup_hash_internal(
	struct sockaddr_in *ssin, struct sockaddr_in *dsin, 
	    uint32_t *key, uint16_t flags)
{
	uint16_t sport, dport;
	uint8_t proto;
	int offset = 0;

	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
		return (0);
	proto = flags_to_proto(flags);
	sport = dport = key[2] = key[1] = key[0] = 0;
	if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
		key[1] = ssin->sin_addr.s_addr;
		sport = ssin->sin_port;
	}
	if (dsin != NULL) {
		key[2] = dsin->sin_addr.s_addr;
		dport = dsin->sin_port;
	}
	if (flags & FL_HASH_ALL) {
		((uint16_t *)key)[0] = sport;
		((uint16_t *)key)[1] = dport;
	} else
		offset = V_flow_hashjitter + proto;

	return (jenkins_hashword(key, 3, offset));
}
590
591 static struct flentry *
592 flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
593 {
594 struct sockaddr_storage ssa, dsa;
595 uint16_t flags;
596 struct sockaddr_in *dsin, *ssin;
597
598 dsin = (struct sockaddr_in *)&dsa;
599 ssin = (struct sockaddr_in *)&ssa;
600 bzero(dsin, sizeof(*dsin));
601 bzero(ssin, sizeof(*ssin));
602 flags = ft->ft_flags;
603 if (ipv4_mbuf_demarshal(ft, m, ssin, dsin, &flags) != 0)
604 return (NULL);
605
606 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
607 }
608
/*
 * Populate "ro" from a cached IPv4 flow: destination address comes
 * from key word 2 (where ipv4_flow_lookup_hash_internal stored it),
 * rtentry/llentry from the entry.  RT_NORTREF tells the consumer the
 * route reference is borrowed from the flowtable, not owned by "ro".
 */
void
flow_to_route(struct flentry *fle, struct route *ro)
{
	uint32_t *hashkey = NULL;
	struct sockaddr_in *sin;

	sin = (struct sockaddr_in *)&ro->ro_dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
	sin->sin_addr.s_addr = hashkey[2];
	ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	ro->ro_flags |= RT_NORTREF;
}
624 #endif /* INET */
625
626 #ifdef INET6
/*
 * PULLUP_TO(len, p, T) checks that len + sizeof(T) is contiguous in
 * the first mbuf and sets p to the offset "len"; despite the name it
 * does NOT pull data up -- if the data is not already contiguous it
 * jumps to the enclosing function's receive_failed label.  Relies on
 * "m" and that label being in scope at the expansion site.  WARNING:
 * the pointer might become stale after other pullups (but we never
 * use it this way).
 */
#define PULLUP_TO(_len, p, T)						\
do {									\
	int x = (_len) + sizeof(T);					\
	if ((m)->m_len < x) {						\
		goto receive_failed;					\
	}								\
	p = (mtod(m, char *) + (_len));					\
} while (0)

/* Typed views of the transport header located by PULLUP_TO(). */
#define	TCP(p)		((struct tcphdr *)(p))
#define	SCTP(p)		((struct sctphdr *)(p))
#define	UDP(p)		((struct udphdr *)(p))
645
/*
 * Extract the IPv6 flow tuple from "m" into "ssin6"/"dsin6".  With
 * FL_HASH_ALL the extension-header chain is walked (hop-by-hop,
 * routing, fragment, dstopts, AH) until a transport header yields
 * ports, or a portless protocol is reached.  Ports stay in network
 * byte order; FL_STALE is set for TCP RST/FIN.  Returns ENOTSUP if
 * the headers are not contiguous (via PULLUP_TO) or no source port
 * was found.  NOTE(review): the receive_failed label sits inside the
 * "src_port == 0" if-body -- legal C, but easy to misread.
 */
static int
ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
    struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
{
	struct ip6_hdr *ip6;
	uint8_t proto;
	int hlen;
	uint16_t src_port, dst_port;
	u_short offset;
	void *ulp;

	offset = hlen = src_port = dst_port = 0;
	ulp = NULL;
	ip6 = mtod(m, struct ip6_hdr *);
	hlen = sizeof(struct ip6_hdr);
	proto = ip6->ip6_nxt;

	if ((*flags & FL_HASH_ALL) == 0)
		goto skipports;

	/* Walk extension headers; ulp != NULL terminates the loop. */
	while (ulp == NULL) {
		switch (proto) {
		case IPPROTO_ICMPV6:
		case IPPROTO_OSPFIGP:
		case IPPROTO_PIM:
		case IPPROTO_CARP:
		case IPPROTO_ESP:
		case IPPROTO_NONE:
			/* Portless protocols: stop walking, ports stay 0. */
			ulp = ip6;
			break;
		case IPPROTO_TCP:
			PULLUP_TO(hlen, ulp, struct tcphdr);
			dst_port = TCP(ulp)->th_dport;
			src_port = TCP(ulp)->th_sport;
			/* A terminating segment marks the flow for cleanup. */
			if ((*flags & FL_HASH_ALL) &&
			    (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
				*flags |= FL_STALE;
			break;
		case IPPROTO_SCTP:
			PULLUP_TO(hlen, ulp, struct sctphdr);
			src_port = SCTP(ulp)->src_port;
			dst_port = SCTP(ulp)->dest_port;
			break;
		case IPPROTO_UDP:
			PULLUP_TO(hlen, ulp, struct udphdr);
			dst_port = UDP(ulp)->uh_dport;
			src_port = UDP(ulp)->uh_sport;
			break;
		case IPPROTO_HOPOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_ROUTING:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_rthdr);
			hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
			proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
			ulp = NULL;
			break;
		case IPPROTO_FRAGMENT:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_frag);
			hlen += sizeof (struct ip6_frag);
			proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
			offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
			    IP6F_OFF_MASK;
			ulp = NULL;
			break;
		case IPPROTO_DSTOPTS:	/* RFC 2460 */
			PULLUP_TO(hlen, ulp, struct ip6_hbh);
			hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
			proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
			ulp = NULL;
			break;
		case IPPROTO_AH:	/* RFC 2402 */
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
			proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
			ulp = NULL;
			break;
		default:
			PULLUP_TO(hlen, ulp, struct ip6_ext);
			break;
		}
	}

	if (src_port == 0) {
	receive_failed:
		return (ENOTSUP);
	}

skipports:
	dsin6->sin6_family = AF_INET6;
	dsin6->sin6_len = sizeof(*dsin6);
	dsin6->sin6_port = dst_port;
	memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));

	ssin6->sin6_family = AF_INET6;
	ssin6->sin6_len = sizeof(*ssin6);
	ssin6->sin6_port = src_port;
	memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
	*flags |= proto_to_flags(proto);

	return (0);
}
751
/* Clear all nine words of an IPv6 hash key. */
#define zero_key(key) 		\
do {				\
	key[0] = 0;		\
	key[1] = 0;		\
	key[2] = 0;		\
	key[3] = 0;		\
	key[4] = 0;		\
	key[5] = 0;		\
	key[6] = 0;		\
	key[7] = 0;		\
	key[8] = 0;		\
} while (0)
764
/*
 * Build the 9-word IPv6 hash key and return its Jenkins hash.
 * Destination address fills key[1..4], source address key[5..8]
 * (source only with FL_HASH_ALL); the two network-order ports are
 * packed into key[0] for FL_HASH_ALL, otherwise the per-vnet jitter
 * plus protocol seeds the hash.  Returns 0 (no flow) when the
 * flowtable is disabled or not yet initialized.
 */
static uint32_t
ipv6_flow_lookup_hash_internal(
	struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, 
	    uint32_t *key, uint16_t flags)
{
	uint16_t sport, dport;
	uint8_t proto;
	int offset = 0;

	if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
		return (0);

	proto = flags_to_proto(flags);
	zero_key(key);
	sport = dport = 0;
	if (dsin6 != NULL) {
		memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
		dport = dsin6->sin6_port;
	}
	if ((ssin6 != NULL) && (flags & FL_HASH_ALL)) {
		memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
		sport = ssin6->sin6_port;
	}
	if (flags & FL_HASH_ALL) {
		((uint16_t *)key)[0] = sport;
		((uint16_t *)key)[1] = dport;
	} else
		offset = V_flow_hashjitter + proto;

	return (jenkins_hashword(key, 9, offset));
}
796
797 static struct flentry *
798 flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
799 {
800 struct sockaddr_storage ssa, dsa;
801 struct sockaddr_in6 *dsin6, *ssin6;
802 uint16_t flags;
803
804 dsin6 = (struct sockaddr_in6 *)&dsa;
805 ssin6 = (struct sockaddr_in6 *)&ssa;
806 bzero(dsin6, sizeof(*dsin6));
807 bzero(ssin6, sizeof(*ssin6));
808 flags = ft->ft_flags;
809
810 if (ipv6_mbuf_demarshal(ft, m, ssin6, dsin6, &flags) != 0)
811 return (NULL);
812
813 return (flowtable_lookup(ft, &ssa, &dsa, M_GETFIB(m), flags));
814 }
815
/*
 * Populate "ro" from a cached IPv6 flow: destination address comes
 * from key words 5..8 (NOTE(review): the v6 hash stores the
 * destination at key[1..4] and the source at key[5..8] -- confirm
 * this copies the intended address), rtentry/llentry from the entry.
 * RT_NORTREF marks the route reference as borrowed.
 */
void
flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
{
	uint32_t *hashkey = NULL;
	struct sockaddr_in6 *sin6;

	sin6 = (struct sockaddr_in6 *)&ro->ro_dst;

	sin6->sin6_family = AF_INET6;
	sin6->sin6_len = sizeof(*sin6);
	hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
	memcpy(&sin6->sin6_addr, &hashkey[5], sizeof (struct in6_addr));
	ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	ro->ro_flags |= RT_NORTREF;
}
832 #endif /* INET6 */
833
834 static bitstr_t *
835 flowtable_mask(struct flowtable *ft)
836 {
837 bitstr_t *mask;
838
839 if (ft->ft_flags & FL_PCPU)
840 mask = ft->ft_masks[curcpu];
841 else
842 mask = ft->ft_masks[0];
843
844 return (mask);
845 }
846
847 static struct flentry **
848 flowtable_entry(struct flowtable *ft, uint32_t hash)
849 {
850 struct flentry **fle;
851 int index = (hash % ft->ft_size);
852
853 if (ft->ft_flags & FL_PCPU) {
854 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
855 fle = &ft->ft_table.pcpu[curcpu][index];
856 } else {
857 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
858 fle = &ft->ft_table.global[index];
859 }
860
861 return (fle);
862 }
863
/*
 * Return non-zero if "fle" must not be used: the slot is empty
 * (f_fhash == 0), its cached route is down/detached, it was marked
 * FL_STALE, or it has been idle past the timeout selected by its
 * flag bits (presumably TH_SYN/ACK/FIN track observed TCP state and
 * flows with none of them use the UDP timeout -- the FL_* protocol
 * flag values live in net/flowtable.h; confirm the overlap there).
 * NOTE(review): the final RTF_UP/rt_ifp clause repeats conditions the
 * first test already returns on; it looks redundant, but f_rt is
 * volatile and re-read here, so confirm before simplifying.
 */
static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
	time_t idle_time;

	if ((fle->f_fhash == 0)
	    || ((fle->f_rt->rt_flags & RTF_HOST) &&
		((fle->f_rt->rt_flags & (RTF_UP))
		    != (RTF_UP)))
	    || (fle->f_rt->rt_ifp == NULL)
	    || !RT_LINK_IS_UP(fle->f_rt->rt_ifp))
		return (1);

	idle_time = time_uptime - fle->f_uptime;

	if ((fle->f_flags & FL_STALE) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
		&& (idle_time > ft->ft_udp_idle)) ||
	    ((fle->f_flags & TH_FIN)
		&& (idle_time > ft->ft_fin_wait_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
		&& (idle_time > ft->ft_syn_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
		&& (idle_time > ft->ft_tcp_idle)) ||
	    ((fle->f_rt->rt_flags & RTF_UP) == 0 || 
		(fle->f_rt->rt_ifp == NULL)))
		return (1);

	return (0);
}
894
895 static void
896 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
897 {
898 uint32_t *hashkey;
899 int i, nwords;
900
901 if (fle->f_flags & FL_IPV6) {
902 nwords = 9;
903 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
904 } else {
905 nwords = 3;
906 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
907 }
908
909 for (i = 0; i < nwords; i++)
910 hashkey[i] = key[i];
911 }
912
913 static struct flentry *
914 flow_alloc(struct flowtable *ft)
915 {
916 struct flentry *newfle;
917 uma_zone_t zone;
918
919 newfle = NULL;
920 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
921
922 newfle = uma_zalloc(zone, M_NOWAIT | M_ZERO);
923 if (newfle != NULL)
924 atomic_add_int(&ft->ft_count, 1);
925 return (newfle);
926 }
927
928 static void
929 flow_free(struct flentry *fle, struct flowtable *ft)
930 {
931 uma_zone_t zone;
932
933 zone = (ft->ft_flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
934 atomic_add_int(&ft->ft_count, -1);
935 uma_zfree(zone, fle);
936 }
937
/*
 * Hysteresis check of occupancy against the global cap: ft_full is
 * set above ~31/32 of V_flowtable_nmbflows and cleared again below
 * ~7/8.  On a state transition the cleaner pacing and (for
 * non-port-hashing tables) the idle timeouts are adjusted and the
 * cleaner woken.  ft_full/ft_count are read unsynchronized; a stale
 * read only delays a transition.  NOTE(review): the aggressive
 * settings (4*hz, 5s timeouts, wakeup) fire on the full -> not-full
 * transition and the relaxed ones on not-full -> full, which reads
 * inverted -- confirm against upstream before changing.
 */
static int
flow_full(struct flowtable *ft)
{
	boolean_t full;
	uint32_t count;
	
	full = ft->ft_full;
	count = ft->ft_count;

	if (full && (count < (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 3))))
		ft->ft_full = FALSE;
	else if (!full && (count > (V_flowtable_nmbflows - (V_flowtable_nmbflows >> 5))))
		ft->ft_full = TRUE;
	
	if (full && !ft->ft_full) {
		flowclean_freq = 4*hz;
		if ((ft->ft_flags & FL_HASH_ALL) == 0)
			ft->ft_udp_idle = ft->ft_fin_wait_idle =
			    ft->ft_syn_idle = ft->ft_tcp_idle = 5;
		cv_broadcast(&flowclean_c_cv);
	} else if (!full && ft->ft_full) {
		flowclean_freq = 20*hz;
		if ((ft->ft_flags & FL_HASH_ALL) == 0)
			ft->ft_udp_idle = ft->ft_fin_wait_idle =
			    ft->ft_syn_idle = ft->ft_tcp_idle = 30;
	}

	return (ft->ft_full);
}
967
968 static int
969 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
970 uint32_t fibnum, struct route *ro, uint16_t flags)
971 {
972 struct flentry *fle, *fletail, *newfle, **flep;
973 struct flowtable_stats *fs = &ft->ft_stats[curcpu];
974 int depth;
975 bitstr_t *mask;
976 uint8_t proto;
977
978 newfle = flow_alloc(ft);
979 if (newfle == NULL)
980 return (ENOMEM);
981
982 newfle->f_flags |= (flags & FL_IPV6);
983 proto = flags_to_proto(flags);
984
985 FL_ENTRY_LOCK(ft, hash);
986 mask = flowtable_mask(ft);
987 flep = flowtable_entry(ft, hash);
988 fletail = fle = *flep;
989
990 if (fle == NULL) {
991 bit_set(mask, FL_ENTRY_INDEX(ft, hash));
992 *flep = fle = newfle;
993 goto skip;
994 }
995
996 depth = 0;
997 fs->ft_collisions++;
998 /*
999 * find end of list and make sure that we were not
1000 * preempted by another thread handling this flow
1001 */
1002 while (fle != NULL) {
1003 if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
1004 /*
1005 * there was either a hash collision
1006 * or we lost a race to insert
1007 */
1008 FL_ENTRY_UNLOCK(ft, hash);
1009 flow_free(newfle, ft);
1010
1011 if (flags & FL_OVERWRITE)
1012 goto skip;
1013 return (EEXIST);
1014 }
1015 /*
1016 * re-visit this double condition XXX
1017 */
1018 if (fletail->f_next != NULL)
1019 fletail = fle->f_next;
1020
1021 depth++;
1022 fle = fle->f_next;
1023 }
1024
1025 if (depth > fs->ft_max_depth)
1026 fs->ft_max_depth = depth;
1027 fletail->f_next = newfle;
1028 fle = newfle;
1029 skip:
1030 flowtable_set_hashkey(fle, key);
1031
1032 fle->f_proto = proto;
1033 fle->f_rt = ro->ro_rt;
1034 fle->f_lle = ro->ro_lle;
1035 fle->f_fhash = hash;
1036 fle->f_fibnum = fibnum;
1037 fle->f_uptime = time_uptime;
1038 FL_ENTRY_UNLOCK(ft, hash);
1039 return (0);
1040 }
1041
/*
 * Kernel API: insert (or overwrite) a flow for the given src/dst
 * sockaddrs with an already-resolved route and llentry.  Both
 * ro->ro_rt and ro->ro_lle must be set; returns EINVAL otherwise,
 * ENOMEM/EEXIST from flowtable_insert(), or 0 on success.
 * NOTE(review): for an address family that is neither INET nor INET6
 * (or a compiled-out one), hash stays 0 and "key" is uninitialized,
 * yet the insert still proceeds -- confirm callers only pass
 * supported families.
 */
int
kern_flowtable_insert(struct flowtable *ft,
    struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
    struct route *ro, uint32_t fibnum, int flags)
{
	uint32_t key[9], hash;

	flags = (ft->ft_flags | flags | FL_OVERWRITE);
	hash = 0;

#ifdef INET
	if (ssa->ss_family == AF_INET) 
		hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
		    (struct sockaddr_in *)dsa, key, flags);
#endif
#ifdef INET6
	if (ssa->ss_family == AF_INET6) 
		hash = ipv6_flow_lookup_hash_internal((struct sockaddr_in6 *)ssa,
		    (struct sockaddr_in6 *)dsa, key, flags);
#endif	
	if (ro->ro_rt == NULL || ro->ro_lle == NULL)
		return (EINVAL);

	FLDPRINTF(ft, FL_DEBUG,
	    "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
	    key[0], key[1], key[2], hash, fibnum, flags);
	return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
}
1070
1071 static int
1072 flowtable_key_equal(struct flentry *fle, uint32_t *key)
1073 {
1074 uint32_t *hashkey;
1075 int i, nwords;
1076
1077 if (fle->f_flags & FL_IPV6) {
1078 nwords = 9;
1079 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1080 } else {
1081 nwords = 3;
1082 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1083 }
1084
1085 for (i = 0; i < nwords; i++)
1086 if (hashkey[i] != key[i])
1087 return (0);
1088
1089 return (1);
1090 }
1091
/*
 * Public lookup entry point: dispatch on address family to the
 * per-family mbuf demarshal + lookup.  On a hit, the flow hash is
 * recorded in the mbuf as its flowid (M_FLOWID) if one was not
 * already set.  Returns NULL on miss or unsupported family.
 */
struct flentry *
flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
{
	struct flentry *fle = NULL;

#ifdef INET
	if (af == AF_INET)
		fle = flowtable_lookup_mbuf4(ft, m);
#endif
#ifdef INET6
	if (af == AF_INET6)
		fle = flowtable_lookup_mbuf6(ft, m);
#endif	
	if (fle != NULL && m != NULL && (m->m_flags & M_FLOWID) == 0) {
		m->m_flags |= M_FLOWID;
		m->m_pkthdr.flowid = fle->f_fhash;
	}
	return (fle);
}
1111
/*
 * Look up the cached flow for the source/destination sockaddr pair in
 * the given fib; on a miss, resolve the route and L2 entry and insert
 * a new flow.  Returns NULL when the flow is uncacheable (src == dst,
 * loopback net, loopback/point-to-point ifp, IPv6 local destination,
 * or zero hash), when FL_NOAUTO is set or the table is full, or when
 * route/lle resolution or insertion fails.
 *
 * NOTE(review): on the miss path fle still holds whatever the failed
 * chain walk left (possibly NULL), so a *successful* insert can return
 * NULL here — presumably callers treat NULL as "not cached"; confirm
 * against the call sites.
 */
struct flentry *
flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
    struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
{
	uint32_t key[9], hash;
	struct flentry *fle;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];
	uint8_t proto = 0;
	int error = 0;
	struct rtentry *rt;
	struct llentry *lle;
	struct route sro, *ro;
	struct route_in6 sro6;

	sro.ro_rt = sro6.ro_rt = NULL;
	sro.ro_lle = sro6.ro_lle = NULL;
	ro = NULL;
	hash = 0;
	flags |= ft->ft_flags;
	proto = flags_to_proto(flags);
#ifdef INET
	if (ssa->ss_family == AF_INET) {
		struct sockaddr_in *ssin, *dsin;

		ro = &sro;
		memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
		/*
		 * The harvested source and destination addresses
		 * may contain port information if the packet is
		 * from a transport protocol (e.g. TCP/UDP). The
		 * port field must be cleared before performing
		 * a route lookup.
		 */
		((struct sockaddr_in *)&ro->ro_dst)->sin_port = 0;
		dsin = (struct sockaddr_in *)dsa;
		ssin = (struct sockaddr_in *)ssa;
		/* Refuse to cache self-directed or loopback-net traffic. */
		if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
		    (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
		    (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)
			return (NULL);

		hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
	}
#endif
#ifdef INET6
	if (ssa->ss_family == AF_INET6) {
		struct sockaddr_in6 *ssin6, *dsin6;

		ro = (struct route *)&sro6;
		memcpy(&sro6.ro_dst, dsa,
		    sizeof(struct sockaddr_in6));
		((struct sockaddr_in6 *)&ro->ro_dst)->sin6_port = 0;
		dsin6 = (struct sockaddr_in6 *)dsa;
		ssin6 = (struct sockaddr_in6 *)ssa;

		flags |= FL_IPV6;
		hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
	}
#endif
	/*
	 * Ports are zero and this isn't a transmit cache
	 * - thus not a protocol for which we need to keep
	 * state
	 * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
	 */
	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
		return (NULL);

	fs->ft_lookups++;
	FL_ENTRY_LOCK(ft, hash);
	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
		FL_ENTRY_UNLOCK(ft, hash);
		goto uncached;
	}
keycheck:
	/*
	 * Walk the bucket's collision chain under the bucket lock.  A
	 * usable hit requires matching hash/key/proto/fib plus a route
	 * that is still RTF_UP with an ifp and a valid L2 entry.
	 */
	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	if ((rt != NULL)
	    && lle != NULL
	    && fle->f_fhash == hash
	    && flowtable_key_equal(fle, key)
	    && (proto == fle->f_proto)
	    && (fibnum == fle->f_fibnum)
	    && (rt->rt_flags & RTF_UP)
	    && (rt->rt_ifp != NULL)
	    && (lle->la_flags & LLE_VALID)) {
		fs->ft_hits++;
		fle->f_uptime = time_uptime;	/* refresh idle timer */
		fle->f_flags |= flags;
		FL_ENTRY_UNLOCK(ft, hash);
		return (fle);
	} else if (fle->f_next != NULL) {
		fle = fle->f_next;
		goto keycheck;
	}
	FL_ENTRY_UNLOCK(ft, hash);
uncached:
	if (flags & FL_NOAUTO || flow_full(ft))
		return (NULL);

	fs->ft_misses++;
	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */

#ifdef INVARIANTS
	if ((ro->ro_dst.sa_family != AF_INET) &&
	    (ro->ro_dst.sa_family != AF_INET6))
		panic("sa_family == %d\n", ro->ro_dst.sa_family);
#endif

	ft->ft_rtalloc(ro, hash, fibnum);
	if (ro->ro_rt == NULL)
		error = ENETUNREACH;
	else {
		/*
		 * NOTE(review): these locals shadow the function-scope
		 * rt/lle declared above; the outer pair is only used on
		 * the cached (keycheck) path.
		 */
		struct llentry *lle = NULL;
		struct sockaddr_storage *l3addr;
		struct rtentry *rt = ro->ro_rt;
		struct ifnet *ifp = rt->rt_ifp;

		if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
#ifdef INET6
		if (ssa->ss_family == AF_INET6) {
			struct sockaddr_in6 *dsin6;

			dsin6 = (struct sockaddr_in6 *)dsa;
			if (in6_localaddr(&dsin6->sin6_addr)) {
				RTFREE(rt);
				ro->ro_rt = NULL;
				return (NULL);
			}

			/* Resolve L2 against the gateway for indirect routes. */
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;

			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			lle = llentry_alloc(ifp, LLTABLE6(ifp), l3addr);
		}
#endif
#ifdef INET
		if (ssa->ss_family == AF_INET) {
			if (rt->rt_flags & RTF_GATEWAY)
				l3addr = (struct sockaddr_storage *)rt->rt_gateway;
			else
				l3addr = (struct sockaddr_storage *)&ro->ro_dst;
			lle = llentry_alloc(ifp, LLTABLE(ifp), l3addr);
		}

#endif
		ro->ro_lle = lle;

		if (lle == NULL) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (NULL);
		}
		/* On success the flow entry takes over the rt/lle refs. */
		error = flowtable_insert(ft, hash, key, fibnum, ro, flags);

		if (error) {
			RTFREE(rt);
			LLE_FREE(lle);
			ro->ro_rt = NULL;
			ro->ro_lle = NULL;
		}
	}

	return ((error) ? NULL : fle);
}
1293
1294 /*
1295 * used by the bit_alloc macro
1296 */
1297 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
1298
/*
 * Allocate and initialize a flow table of nentry buckets and link it
 * onto the per-vnet cleaner list (V_flow_list_head).  With FL_PCPU
 * each cpu gets its own bucket array and occupancy bitmask; otherwise
 * a single global bucket array is protected by a pool of mutexes.
 * The caller's name pointer is kept by reference, not copied.
 */
struct flowtable *
flowtable_alloc(char *name, int nentry, int flags)
{
	struct flowtable *ft, *fttail;
	int i;

	/* Lazily seed the per-vnet jitter mixed into flow hashes. */
	if (V_flow_hashjitter == 0)
		V_flow_hashjitter = arc4random();

	KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));

	ft = malloc(sizeof(struct flowtable),
	    M_RTABLE, M_WAITOK | M_ZERO);

	ft->ft_name = name;
	ft->ft_flags = flags;
	ft->ft_size = nentry;
#ifdef RADIX_MPATH
	ft->ft_rtalloc = rtalloc_mpath_fib;
#else
	ft->ft_rtalloc = rtalloc_ign_wrapper;
#endif
	if (flags & FL_PCPU) {
		ft->ft_lock = flowtable_pcpu_lock;
		ft->ft_unlock = flowtable_pcpu_unlock;

		/* One bucket array + occupancy bitmask per cpu id. */
		for (i = 0; i <= mp_maxid; i++) {
			ft->ft_table.pcpu[i] =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
			ft->ft_masks[i] = bit_alloc(nentry);
		}
	} else {
		/*
		 * NOTE(review): lock-count heuristic — 2x ncpus when
		 * ncpus is a power of two, otherwise derived from
		 * fls(); presumably sized to limit lock contention.
		 */
		ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
		    (fls(mp_maxid + 1) << 1));

		ft->ft_lock = flowtable_global_lock;
		ft->ft_unlock = flowtable_global_unlock;
		ft->ft_table.global =
			    malloc(nentry*sizeof(struct flentry *),
				M_RTABLE, M_WAITOK | M_ZERO);
		ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
				M_RTABLE, M_WAITOK | M_ZERO);
		for (i = 0; i < ft->ft_lock_count; i++)
			mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);

		ft->ft_masks[0] = bit_alloc(nentry);
	}
	/* Scratch bitmask used by the cleaner / DDB walkers. */
	ft->ft_tmpmask = bit_alloc(nentry);

	/*
	 * In the local transmit case the table truly is
	 * just a cache - so everything is eligible for
	 * replacement after 5s of non-use
	 */
	if (flags & FL_HASH_ALL) {
		ft->ft_udp_idle = V_flowtable_udp_expire;
		ft->ft_syn_idle = V_flowtable_syn_expire;
		ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
		ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
	} else {
		ft->ft_udp_idle = ft->ft_fin_wait_idle =
		    ft->ft_syn_idle = ft->ft_tcp_idle = 30;

	}

	/*
	 * hook in to the cleaner list
	 */
	if (V_flow_list_head == NULL)
		V_flow_list_head = ft;
	else {
		fttail = V_flow_list_head;
		while (fttail->ft_next != NULL)
			fttail = fttail->ft_next;
		fttail->ft_next = ft;
	}

	return (ft);
}
1379
1380 /*
1381 * The rest of the code is devoted to garbage collection of expired entries.
1382 * It is a new additon made necessary by the switch to dynamically allocating
1383 * flow tables.
1384 *
1385 */
1386 static void
1387 fle_free(struct flentry *fle, struct flowtable *ft)
1388 {
1389 struct rtentry *rt;
1390 struct llentry *lle;
1391
1392 rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
1393 lle = __DEVOLATILE(struct llentry *, fle->f_lle);
1394 if (rt != NULL)
1395 RTFREE(rt);
1396 if (lle != NULL)
1397 LLE_FREE(lle);
1398 flow_free(fle, ft);
1399 }
1400
/*
 * Sweep the current cpu's view of the table and free flow entries
 * that are no longer wanted.  When rt is non-NULL only entries
 * referencing that route are reclaimed; when rt is NULL, entries for
 * which flow_stale() is true are reclaimed.  Matching entries are
 * unlinked under the bucket lock onto a private list and freed after
 * all locks are dropped.
 */
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
{
	int curbit = 0, count;
	struct flentry *fle,  **flehead, *fleprev;
	struct flentry *flefreehead, *flefreetail, *fletmp;
	bitstr_t *mask, *tmpmask;
	struct flowtable_stats *fs = &ft->ft_stats[curcpu];

	flefreehead = flefreetail = NULL;
	/* Work from a snapshot of the occupancy mask in ft_tmpmask. */
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FL_ENTRY_LOCK(ft, curbit);
		flehead = flowtable_entry(ft, curbit);
		fle = fleprev = *flehead;

		fs->ft_free_checks++;
#ifdef DIAGNOSTIC
		if (fle == NULL && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif
		while (fle != NULL) {
			/* Keep entries that don't match the reclaim criterion. */
			if (rt != NULL) {
				if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
					fleprev = fle;
					fle = fle->f_next;
					continue;
				}
			} else if (!flow_stale(ft, fle)) {
				fleprev = fle;
				fle = fle->f_next;
				continue;
			}
			/*
			 * delete head of the list
			 */
			if (fleprev == *flehead) {
				fletmp = fleprev;
				if (fle == fleprev) {
					fleprev = *flehead = fle->f_next;
				} else
					fleprev = *flehead = fle;
				fle = fle->f_next;
			} else {
				/*
				 * don't advance fleprev
				 */
				fletmp = fle;
				fleprev->f_next = fle->f_next;
				fle = fleprev->f_next;
			}

			/* Append the victim to the private free list. */
			if (flefreehead == NULL)
				flefreehead = flefreetail = fletmp;
			else {
				flefreetail->f_next = fletmp;
				flefreetail = fletmp;
			}
			fletmp->f_next = NULL;
		}
		/* Bucket emptied: clear its bit in the live mask. */
		if (*flehead == NULL)
			bit_clear(mask, curbit);
		FL_ENTRY_UNLOCK(ft, curbit);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
	/* Free the collected entries outside of any bucket lock. */
	count = 0;
	while ((fle = flefreehead) != NULL) {
		flefreehead = fle->f_next;
		count++;
		fs->ft_frees++;
		fle_free(fle, ft);
	}
	if (V_flowtable_debug && count)
		log(LOG_DEBUG, "freed %d flow entries\n", count);
}
1494
1495 void
1496 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
1497 {
1498 int i;
1499
1500 if (ft->ft_flags & FL_PCPU) {
1501 CPU_FOREACH(i) {
1502 if (smp_started == 1) {
1503 thread_lock(curthread);
1504 sched_bind(curthread, i);
1505 thread_unlock(curthread);
1506 }
1507
1508 flowtable_free_stale(ft, rt);
1509
1510 if (smp_started == 1) {
1511 thread_lock(curthread);
1512 sched_unbind(curthread);
1513 thread_unlock(curthread);
1514 }
1515 }
1516 } else {
1517 flowtable_free_stale(ft, rt);
1518 }
1519 }
1520
1521 static void
1522 flowtable_clean_vnet(void)
1523 {
1524 struct flowtable *ft;
1525 int i;
1526
1527 ft = V_flow_list_head;
1528 while (ft != NULL) {
1529 if (ft->ft_flags & FL_PCPU) {
1530 CPU_FOREACH(i) {
1531 if (smp_started == 1) {
1532 thread_lock(curthread);
1533 sched_bind(curthread, i);
1534 thread_unlock(curthread);
1535 }
1536
1537 flowtable_free_stale(ft, NULL);
1538
1539 if (smp_started == 1) {
1540 thread_lock(curthread);
1541 sched_unbind(curthread);
1542 thread_unlock(curthread);
1543 }
1544 }
1545 } else {
1546 flowtable_free_stale(ft, NULL);
1547 }
1548 ft = ft->ft_next;
1549 }
1550 }
1551
/*
 * Main loop of the "flowcleaner" kernel process: sweep the flow
 * tables of every vnet, then sleep on flowclean_c_cv for
 * flowclean_freq ticks (or until kicked early, e.g. by
 * flowtable_flush()).  flowclean_cycles is incremented and
 * flowclean_f_cv broadcast after each pass so that waiters can
 * observe sweep completion.  Never returns.
 */
static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);
	struct thread *td;

	if (bootverbose)
		log(LOG_INFO, "flowtable cleaner started\n");
	td = curthread;
	while (1) {
		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
			flowtable_clean_vnet();
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		/*
		 * The 10 second interval between cleaning checks
		 * is arbitrary
		 * NOTE(review): stale comment — flowtable_init() sets
		 * flowclean_freq to 20*hz, i.e. 20 seconds.
		 */
		mtx_lock(&flowclean_lock);
		thread_lock(td);
		sched_prio(td, PPAUSE);
		thread_unlock(td);
		flowclean_cycles++;
		cv_broadcast(&flowclean_f_cv);
		cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq);
		mtx_unlock(&flowclean_lock);
	}
}
1584
1585 static void
1586 flowtable_flush(void *unused __unused)
1587 {
1588 uint64_t start;
1589
1590 mtx_lock(&flowclean_lock);
1591 start = flowclean_cycles;
1592 while (start == flowclean_cycles) {
1593 cv_broadcast(&flowclean_c_cv);
1594 cv_wait(&flowclean_f_cv, &flowclean_lock);
1595 }
1596 mtx_unlock(&flowclean_lock);
1597 }
1598
/*
 * Kernel process descriptor for the flow table cleaner; kproc_start()
 * launches flowtable_cleaner() at SI_SUB_KTHREAD_IDLE.
 */
static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1605
/*
 * Per-vnet initialization: derive the flow-entry cap from maxusers
 * and cpu count, create the UMA zones for IPv4/IPv6 flow entries,
 * and mark the flowtable ready for use.  Runs at SI_SUB_SMP.
 */
static void
flowtable_init_vnet(const void *unused __unused)
{

	V_flowtable_nmbflows = 1024 + maxusers * 64 * mp_ncpus;
	V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
	uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
	V_flowtable_ready = 1;
}
VNET_SYSINIT(flowtable_init_vnet, SI_SUB_SMP, SI_ORDER_ANY,
    flowtable_init_vnet, NULL);
1621
/*
 * Global (non-vnet) initialization: set up the cleaner/flusher
 * condvars and lock, register the ifnet departure handler that
 * forces a sweep, and set the cleaning interval to 20 seconds.
 */
static void
flowtable_init(const void *unused __unused)
{

	cv_init(&flowclean_c_cv, "c_flowcleanwait");
	cv_init(&flowclean_f_cv, "f_flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
	flowclean_freq = 20*hz;
}
SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST,
	flowtable_init, NULL);
1635
1636
1637 #ifdef VIMAGE
/*
 * Per-vnet teardown (VIMAGE only): clear the ready flag and destroy
 * the IPv4/IPv6 flow-entry zones.
 */
static void
flowtable_uninit(const void *unused __unused)
{

	V_flowtable_ready = 0;
	uma_zdestroy(V_flow_ipv4_zone);
	uma_zdestroy(V_flow_ipv6_zone);
}

VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_uninit, NULL);
1649 #endif
1650
1651 #ifdef DDB
1652 static uint32_t *
1653 flowtable_get_hashkey(struct flentry *fle)
1654 {
1655 uint32_t *hashkey;
1656
1657 if (fle->f_flags & FL_IPV6)
1658 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
1659 else
1660 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
1661
1662 return (hashkey);
1663 }
1664
1665 static bitstr_t *
1666 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1667 {
1668 bitstr_t *mask;
1669
1670 if (ft->ft_flags & FL_PCPU)
1671 mask = ft->ft_masks[cpuid];
1672 else
1673 mask = ft->ft_masks[0];
1674
1675 return (mask);
1676 }
1677
1678 static struct flentry **
1679 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1680 {
1681 struct flentry **fle;
1682 int index = (hash % ft->ft_size);
1683
1684 if (ft->ft_flags & FL_PCPU) {
1685 fle = &ft->ft_table.pcpu[cpuid][index];
1686 } else {
1687 fle = &ft->ft_table.global[index];
1688 }
1689
1690 return (fle);
1691 }
1692
/*
 * DDB helper: print one flow entry — addresses/ports (IPv4 only),
 * flag names, raw hash key words, hash, idle time, fib and rtentry
 * pointer.  Runs from the debugger, so no locking is taken.
 */
static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	int rt_valid, ifp_valid;
	uint16_t sport, dport;
	uint32_t *hashkey;
	char saddr[4*sizeof "123"], daddr[4*sizeof "123"];
	volatile struct rtentry *rt;
	struct ifnet *ifp = NULL;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt = fle->f_rt;
	rt_valid = rt != NULL;
	if (rt_valid)
		ifp = rt->rt_ifp;
	ifp_valid = ifp != NULL;
	hashkey = flowtable_get_hashkey(fle);
	/* No dotted-quad rendering for IPv6; the raw key is printed below. */
	if (fle->f_flags & FL_IPV6)
		goto skipaddr;

	/* key layout (IPv4): [0] ports, [1] source, [2] destination. */
	inet_ntoa_r(*(struct in_addr *) &hashkey[2], daddr);
	if (ft->ft_flags & FL_HASH_ALL) {
		inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr);
		sport = ntohs(((uint16_t *)hashkey)[0]);
		dport = ntohs(((uint16_t *)hashkey)[1]);
		db_printf("%s:%d->%s:%d",
		    saddr, sport, daddr,
		    dport);
	} else
		db_printf("%s ", daddr);

skipaddr:
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
	if (fle->f_flags & FL_TCP)
		db_printf(" FL_TCP ");
	if (fle->f_flags & FL_UDP)
		db_printf(" FL_UDP ");
	if (rt_valid) {
		if (rt->rt_flags & RTF_UP)
			db_printf(" RTF_UP ");
	}
	if (ifp_valid) {
		if (ifp->if_flags & IFF_LOOPBACK)
			db_printf(" IFF_LOOPBACK ");
		if (ifp->if_flags & IFF_UP)
			db_printf(" IFF_UP ");
		if (ifp->if_flags & IFF_POINTOPOINT)
			db_printf(" IFF_POINTOPOINT ");
	}
	/*
	 * NOTE(review): the IPv6 format groups words as 3/3/3 but omits
	 * the separator after words 2 and 5 ("%08x%08x:") — presumably
	 * cosmetic; confirm intended grouping before changing.
	 */
	if (fle->f_flags & FL_IPV6)
		db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x",
		    hashkey[0], hashkey[1], hashkey[2],
		    hashkey[3], hashkey[4], hashkey[5],
		    hashkey[6], hashkey[7], hashkey[8]);
	else
		db_printf("\n\tkey=%08x:%08x:%08x ",
		    hashkey[0], hashkey[1], hashkey[2]);
	db_printf("hash=%08x idle_time=%03d"
	    "\n\tfibnum=%02d rt=%p",
	    fle->f_fhash, idle_time, fle->f_fibnum, fle->f_rt);
	db_printf("\n");
}
1757
1758 static void
1759 flowtable_show(struct flowtable *ft, int cpuid)
1760 {
1761 int curbit = 0;
1762 struct flentry *fle, **flehead;
1763 bitstr_t *mask, *tmpmask;
1764
1765 if (cpuid != -1)
1766 db_printf("cpu: %d\n", cpuid);
1767 mask = flowtable_mask_pcpu(ft, cpuid);
1768 tmpmask = ft->ft_tmpmask;
1769 memcpy(tmpmask, mask, ft->ft_size/8);
1770 /*
1771 * XXX Note to self, bit_ffs operates at the byte level
1772 * and thus adds gratuitous overhead
1773 */
1774 bit_ffs(tmpmask, ft->ft_size, &curbit);
1775 while (curbit != -1) {
1776 if (curbit >= ft->ft_size || curbit < -1) {
1777 db_printf("warning: bad curbit value %d \n",
1778 curbit);
1779 break;
1780 }
1781
1782 flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
1783 fle = *flehead;
1784
1785 while (fle != NULL) {
1786 flow_show(ft, fle);
1787 fle = fle->f_next;
1788 continue;
1789 }
1790 bit_clear(tmpmask, curbit);
1791 bit_ffs(tmpmask, ft->ft_size, &curbit);
1792 }
1793 }
1794
1795 static void
1796 flowtable_show_vnet(void)
1797 {
1798 struct flowtable *ft;
1799 int i;
1800
1801 ft = V_flow_list_head;
1802 while (ft != NULL) {
1803 printf("name: %s\n", ft->ft_name);
1804 if (ft->ft_flags & FL_PCPU) {
1805 CPU_FOREACH(i) {
1806 flowtable_show(ft, i);
1807 }
1808 } else {
1809 flowtable_show(ft, -1);
1810 }
1811 ft = ft->ft_next;
1812 }
1813 }
1814
/*
 * DDB "show flowtables" command: dump the flow tables of every vnet,
 * printing the vnet pointer first when VIMAGE is compiled in.
 */
DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
#ifdef VIMAGE
		db_printf("vnet %p\n", vnet_iter);
#endif
		flowtable_show_vnet();
		CURVNET_RESTORE();
	}
}
1828 #endif
/* Cache object: 26c7d7324815e67cfdbff6b3e349baf1 */