FreeBSD/Linux Kernel Cross Reference
sys/net/flowtable.c
1 /**************************************************************************
2
3 Copyright (c) 2008-2009, BitGravity Inc.
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
8
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
11
12 2. Neither the name of the BitGravity Corporation nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
27
28 ***************************************************************************/
29
30 #include "opt_route.h"
31 #include "opt_mpath.h"
32 #include "opt_ddb.h"
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD: releng/8.0/sys/net/flowtable.c 198568 2009-10-28 22:00:49Z qingli $");
36
37 #include <sys/param.h>
38 #include <sys/types.h>
39 #include <sys/bitstring.h>
40 #include <sys/condvar.h>
41 #include <sys/callout.h>
42 #include <sys/kernel.h>
43 #include <sys/kthread.h>
44 #include <sys/limits.h>
45 #include <sys/malloc.h>
46 #include <sys/mbuf.h>
47 #include <sys/proc.h>
48 #include <sys/sched.h>
49 #include <sys/smp.h>
50 #include <sys/socket.h>
51 #include <sys/syslog.h>
52 #include <sys/sysctl.h>
53
54 #include <net/if.h>
55 #include <net/if_llatbl.h>
56 #include <net/if_var.h>
57 #include <net/route.h>
58 #include <net/flowtable.h>
59 #include <net/vnet.h>
60
61 #include <netinet/in.h>
62 #include <netinet/in_systm.h>
63 #include <netinet/in_var.h>
64 #include <netinet/if_ether.h>
65 #include <netinet/ip.h>
66 #include <netinet/tcp.h>
67 #include <netinet/udp.h>
68 #include <netinet/sctp.h>
69
70 #include <libkern/jenkins.h>
71 #include <ddb/ddb.h>
72
/*
 * 4-tuple identifying an IPv4 flow.  Ports come first so the struct
 * overlays the hash key words written by the lookup code (key[0] =
 * ports, key[1] = src, key[2] = dst).
 */
struct ipv4_tuple {
	uint16_t 	ip_sport;	/* source port */
	uint16_t 	ip_dport;	/* destination port */
	in_addr_t 	ip_saddr;	/* source address */
	in_addr_t 	ip_daddr;	/* destination address */
};

/* IPv4 tuple viewed as 3 32-bit words for hashing and comparison. */
union ipv4_flow {
	struct ipv4_tuple ipf_ipt;
	uint32_t 	ipf_key[3];
};

/* 4-tuple identifying an IPv6 flow; same layout idea as ipv4_tuple. */
struct ipv6_tuple {
	uint16_t 	ip_sport;	/* source port */
	uint16_t 	ip_dport;	/* destination port */
	struct in6_addr	ip_saddr;	/* source address */
	struct in6_addr	ip_daddr;	/* destination address */
};

/* IPv6 tuple viewed as 9 32-bit words for hashing and comparison. */
union ipv6_flow {
	struct ipv6_tuple ipf_ipt;
	uint32_t 	ipf_key[9];
};
96
/*
 * Common header of a cached flow entry.  The address-family specific
 * key immediately follows in struct flentry_v4 / struct flentry_v6,
 * so fl_flow sits at the same offset in both layouts.
 */
struct flentry {
	volatile uint32_t	f_fhash;	/* hash flowing forward */
	uint16_t		f_flags;	/* flow flags (TH_* + FL_STALE/FL_IPV6) */
	uint8_t			f_pad;		/* explicit padding */
	uint8_t			f_proto;	/* protocol */
	uint32_t		f_fibnum;	/* fib index */
	uint32_t		f_uptime;	/* uptime at last access */
	struct flentry		*f_next;	/* pointer to collision entry */
	volatile struct rtentry *f_rt;		/* rtentry for flow */
	volatile struct llentry *f_lle;		/* llentry for flow */
};

/* IPv4 flow entry: common header plus the 3-word key. */
struct flentry_v4 {
	struct flentry	fl_entry;
	union ipv4_flow	fl_flow;
};

/* IPv6 flow entry: common header plus the 9-word key. */
struct flentry_v6 {
	struct flentry	fl_entry;
	union ipv6_flow	fl_flow;
};
118
/*
 * Shorthand accessors for the embedded struct flentry of the per-AF
 * entry types.  The members of struct flentry are named f_*, so the
 * expansions must use the f_ prefix: the previous fl_* expansions
 * named nonexistent members (the macro name is not re-expanded inside
 * its own replacement) and would not have compiled had they been used.
 */
#define	fl_fhash	fl_entry.f_fhash
#define	fl_flags	fl_entry.f_flags
#define	fl_proto	fl_entry.f_proto
#define	fl_uptime	fl_entry.f_uptime
#define	fl_rt		fl_entry.f_rt
#define	fl_lle		fl_entry.f_lle
125
#define SECS_PER_HOUR		3600
#define SECS_PER_DAY		(24*SECS_PER_HOUR)

/* Default idle timeouts (seconds) before a flow is considered stale. */
#define SYN_IDLE		300	/* SYN seen but no ACK (half-open) */
#define UDP_IDLE		300	/* no TCP flags recorded (non-TCP) */
#define FIN_WAIT_IDLE		600	/* FIN observed */
#define TCP_IDLE		SECS_PER_DAY	/* SYN+ACK seen (established) */
133
134
/* Bucket lock/unlock operation, keyed by the flow hash. */
typedef void fl_lock_t(struct flowtable *, uint32_t);
/* Route allocation operation: (route, flow hash, fib number). */
typedef void fl_rtalloc_t(struct route *, uint32_t, u_int);

/*
 * Bucket storage: one shared array for global tables, or one array
 * per CPU for FL_PCPU tables.
 */
union flentryp {
	struct flentry		**global;
	struct flentry		**pcpu[MAXCPU];
};
142
/* A flow cache instance; linked on the per-vnet cleaner list. */
struct flowtable {
	int 		ft_size;		/* number of buckets */
	int 		ft_lock_count;		/* # bucket mutexes (global mode) */
	uint32_t	ft_flags;		/* FL_PCPU, FL_HASH_PORTS, ... */
	uint32_t	ft_collisions;		/* statistics */
	uint32_t	ft_allocated;
	uint32_t	ft_misses;
	uint64_t	ft_hits;

	uint32_t	ft_udp_idle;		/* per-state idle timeouts (s) */
	uint32_t	ft_fin_wait_idle;
	uint32_t	ft_syn_idle;
	uint32_t	ft_tcp_idle;

	fl_lock_t	*ft_lock;		/* bucket lock/unlock ops */
	fl_lock_t 	*ft_unlock;
	fl_rtalloc_t	*ft_rtalloc;		/* route lookup op */
	struct mtx	*ft_locks;		/* bucket mutex pool (global mode) */


	union flentryp	ft_table;		/* bucket array(s) */
	bitstr_t 	*ft_masks[MAXCPU];	/* occupied-bucket bitmaps */
	bitstr_t	*ft_tmpmask;		/* scratch bitmap for scans */
	struct flowtable *ft_next;		/* next table on cleaner list */
};
168
169 static struct proc *flowcleanerproc;
170 static VNET_DEFINE(struct flowtable *, flow_list_head);
171 static VNET_DEFINE(uint32_t, flow_hashjitter);
172 static VNET_DEFINE(uma_zone_t, flow_ipv4_zone);
173 static VNET_DEFINE(uma_zone_t, flow_ipv6_zone);
174
175 #define V_flow_list_head VNET(flow_list_head)
176 #define V_flow_hashjitter VNET(flow_hashjitter)
177 #define V_flow_ipv4_zone VNET(flow_ipv4_zone)
178 #define V_flow_ipv6_zone VNET(flow_ipv6_zone)
179
180 static struct cv flowclean_cv;
181 static struct mtx flowclean_lock;
182 static uint32_t flowclean_cycles;
183
184 /*
185 * TODO:
186 * - Make flowtable stats per-cpu, aggregated at sysctl call time,
187 * to avoid extra cache evictions caused by incrementing a shared
188 * counter
189 * - add IPv6 support to flow lookup
190 * - add sysctls to resize && flush flow tables
191 * - Add per flowtable sysctls for statistics and configuring timeouts
192 * - add saturation counter to rtentry to support per-packet load-balancing
193 * add flag to indicate round-robin flow, add list lookup from head
194 for flows
195 * - add sysctl / device node / syscall to support exporting and importing
196 * of flows with flag to indicate that a flow was imported so should
197 * not be considered for auto-cleaning
198 * - support explicit connection state (currently only ad-hoc for DSR)
199 * - idetach() cleanup for options VIMAGE builds.
200 */
201 VNET_DEFINE(int, flowtable_enable) = 1;
202 static VNET_DEFINE(int, flowtable_debug);
203 static VNET_DEFINE(int, flowtable_hits);
204 static VNET_DEFINE(int, flowtable_lookups);
205 static VNET_DEFINE(int, flowtable_misses);
206 static VNET_DEFINE(int, flowtable_frees);
207 static VNET_DEFINE(int, flowtable_free_checks);
208 static VNET_DEFINE(int, flowtable_max_depth);
209 static VNET_DEFINE(int, flowtable_collisions);
210 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
211 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
212 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
213 static VNET_DEFINE(int, flowtable_tcp_expire) = TCP_IDLE;
214 static VNET_DEFINE(int, flowtable_nmbflows) = 4096;
215 static VNET_DEFINE(int, flowtable_ready) = 0;
216
217 #define V_flowtable_enable VNET(flowtable_enable)
218 #define V_flowtable_debug VNET(flowtable_debug)
219 #define V_flowtable_hits VNET(flowtable_hits)
220 #define V_flowtable_lookups VNET(flowtable_lookups)
221 #define V_flowtable_misses VNET(flowtable_misses)
222 #define V_flowtable_frees VNET(flowtable_frees)
223 #define V_flowtable_free_checks VNET(flowtable_free_checks)
224 #define V_flowtable_max_depth VNET(flowtable_max_depth)
225 #define V_flowtable_collisions VNET(flowtable_collisions)
226 #define V_flowtable_syn_expire VNET(flowtable_syn_expire)
227 #define V_flowtable_udp_expire VNET(flowtable_udp_expire)
228 #define V_flowtable_fin_wait_expire VNET(flowtable_fin_wait_expire)
229 #define V_flowtable_tcp_expire VNET(flowtable_tcp_expire)
230 #define V_flowtable_nmbflows VNET(flowtable_nmbflows)
231 #define V_flowtable_ready VNET(flowtable_ready)
232
233 SYSCTL_NODE(_net_inet, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable");
234 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, debug, CTLFLAG_RW,
235 &VNET_NAME(flowtable_debug), 0, "print debug info.");
236 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
237 &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
238 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
239 &VNET_NAME(flowtable_hits), 0, "# flowtable hits.");
240 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
241 &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups.");
242 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
243 &VNET_NAME(flowtable_misses), 0, "#flowtable misses.");
244 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
245 &VNET_NAME(flowtable_frees), 0, "#flows freed.");
246 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
247 &VNET_NAME(flowtable_free_checks), 0, "#flows free checks.");
248 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
249 &VNET_NAME(flowtable_max_depth), 0, "max collision list length.");
250 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
251 &VNET_NAME(flowtable_collisions), 0, "#flowtable collisions.");
252
253 /*
254 * XXX This does not end up updating timeouts at runtime
255 * and only reflects the value for the last table added :-/
256 */
257 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, syn_expire, CTLFLAG_RW,
258 &VNET_NAME(flowtable_syn_expire), 0,
259 "seconds after which to remove syn allocated flow.");
260 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, udp_expire, CTLFLAG_RW,
261 &VNET_NAME(flowtable_udp_expire), 0,
262 "seconds after which to remove flow allocated to UDP.");
263 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, fin_wait_expire, CTLFLAG_RW,
264 &VNET_NAME(flowtable_fin_wait_expire), 0,
265 "seconds after which to remove a flow in FIN_WAIT.");
266 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, tcp_expire, CTLFLAG_RW,
267 &VNET_NAME(flowtable_tcp_expire), 0,
268 "seconds after which to remove flow allocated to a TCP connection.");
269
270
271 /*
272 * Maximum number of flows that can be allocated of a given type.
273 *
274 * The table is allocated at boot time (for the pure caching case
275 * there is no reason why this could not be changed at runtime)
276 * and thus (currently) needs to be set with a tunable.
277 */
278 static int
279 sysctl_nmbflows(SYSCTL_HANDLER_ARGS)
280 {
281 int error, newnmbflows;
282
283 newnmbflows = V_flowtable_nmbflows;
284 error = sysctl_handle_int(oidp, &newnmbflows, 0, req);
285 if (error == 0 && req->newptr) {
286 if (newnmbflows > V_flowtable_nmbflows) {
287 V_flowtable_nmbflows = newnmbflows;
288 uma_zone_set_max(V_flow_ipv4_zone,
289 V_flowtable_nmbflows);
290 uma_zone_set_max(V_flow_ipv6_zone,
291 V_flowtable_nmbflows);
292 } else
293 error = EINVAL;
294 }
295 return (error);
296 }
297 SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, nmbflows,
298 CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
299 "Maximum number of flows allowed");
300
#ifndef RADIX_MPATH
/*
 * Adapt the plain FIB-aware route lookup to the fl_rtalloc_t
 * signature; the flow hash argument is unused in this case (it only
 * matters for multipath route selection).
 */
static void
in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
{

	rtalloc_ign_fib(ro, 0, fibnum);
}
#endif
309
310 static void
311 flowtable_global_lock(struct flowtable *table, uint32_t hash)
312 {
313 int lock_index = (hash)&(table->ft_lock_count - 1);
314
315 mtx_lock(&table->ft_locks[lock_index]);
316 }
317
318 static void
319 flowtable_global_unlock(struct flowtable *table, uint32_t hash)
320 {
321 int lock_index = (hash)&(table->ft_lock_count - 1);
322
323 mtx_unlock(&table->ft_locks[lock_index]);
324 }
325
/*
 * Per-cpu tables are only touched from the owning CPU (the cleaner
 * binds itself before scanning - see sched_bind callers), so a
 * critical section suffices as the "bucket lock"; hash is unused.
 */
static void
flowtable_pcpu_lock(struct flowtable *table, uint32_t hash)
{

	critical_enter();
}
332
/* Leave the critical section entered by flowtable_pcpu_lock(). */
static void
flowtable_pcpu_unlock(struct flowtable *table, uint32_t hash)
{

	critical_exit();
}
339
/* Map a flow hash to its bucket index / entry / lock operations. */
#define FL_ENTRY_INDEX(table, hash)((hash) % (table)->ft_size)
#define FL_ENTRY(table, hash) *flowtable_entry((table), (hash))
#define FL_ENTRY_LOCK(table, hash) (table)->ft_lock((table), (hash))
#define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))

/* f_flags bits above the 8-bit TH_* range stored from th_flags. */
#define FL_STALE (1<<8)
#define FL_IPV6 (1<<9)
347
348 static uint32_t
349 ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
350 uint32_t *key, uint16_t *flags, uint8_t *protop)
351 {
352 uint16_t sport = 0, dport = 0;
353 struct ip *ip = NULL;
354 uint8_t proto = 0;
355 int iphlen;
356 uint32_t hash;
357 struct sockaddr_in *sin;
358 struct tcphdr *th;
359 struct udphdr *uh;
360 struct sctphdr *sh;
361
362 if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
363 return (0);
364
365 key[1] = key[0] = 0;
366 sin = (struct sockaddr_in *)&ro->ro_dst;
367 if (m != NULL) {
368 ip = mtod(m, struct ip *);
369 sin->sin_family = AF_INET;
370 sin->sin_len = sizeof(*sin);
371 sin->sin_addr = ip->ip_dst;
372 } else
373 *flags &= ~FL_HASH_PORTS;
374
375 key[2] = sin->sin_addr.s_addr;
376
377 if ((*flags & FL_HASH_PORTS) == 0)
378 goto skipports;
379
380 proto = ip->ip_p;
381 iphlen = ip->ip_hl << 2; /* XXX options? */
382 key[1] = ip->ip_src.s_addr;
383
384 switch (proto) {
385 case IPPROTO_TCP:
386 th = (struct tcphdr *)((caddr_t)ip + iphlen);
387 sport = ntohs(th->th_sport);
388 dport = ntohs(th->th_dport);
389 *flags |= th->th_flags;
390 if (*flags & TH_RST)
391 *flags |= FL_STALE;
392 break;
393 case IPPROTO_UDP:
394 uh = (struct udphdr *)((caddr_t)ip + iphlen);
395 sport = uh->uh_sport;
396 dport = uh->uh_dport;
397 break;
398 case IPPROTO_SCTP:
399 sh = (struct sctphdr *)((caddr_t)ip + iphlen);
400 sport = sh->src_port;
401 dport = sh->dest_port;
402 break;
403 default:
404 if (*flags & FL_HASH_PORTS)
405 goto noop;
406 /* no port - hence not a protocol we care about */
407 break;;
408
409 }
410 *protop = proto;
411
412 /*
413 * If this is a transmit route cache then
414 * hash all flows to a given destination to
415 * the same bucket
416 */
417 if ((*flags & FL_HASH_PORTS) == 0)
418 proto = sport = dport = 0;
419
420 ((uint16_t *)key)[0] = sport;
421 ((uint16_t *)key)[1] = dport;
422
423 skipports:
424 hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto);
425 if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
426 m->m_flags |= M_FLOWID;
427 m->m_pkthdr.flowid = hash;
428 }
429
430 return (hash);
431 noop:
432 *protop = proto;
433 return (0);
434 }
435
436 static bitstr_t *
437 flowtable_mask(struct flowtable *ft)
438 {
439 bitstr_t *mask;
440
441 if (ft->ft_flags & FL_PCPU)
442 mask = ft->ft_masks[curcpu];
443 else
444 mask = ft->ft_masks[0];
445
446 return (mask);
447 }
448
449 static struct flentry **
450 flowtable_entry(struct flowtable *ft, uint32_t hash)
451 {
452 struct flentry **fle;
453 int index = (hash % ft->ft_size);
454
455 if (ft->ft_flags & FL_PCPU) {
456 KASSERT(&ft->ft_table.pcpu[curcpu][0] != NULL, ("pcpu not set"));
457 fle = &ft->ft_table.pcpu[curcpu][index];
458 } else {
459 KASSERT(&ft->ft_table.global[0] != NULL, ("global not set"));
460 fle = &ft->ft_table.global[index];
461 }
462
463 return (fle);
464 }
465
/*
 * Return non-zero if a flow entry may be reclaimed: it is unused
 * (zero hash), its route is down or detached from its interface, it
 * was explicitly marked FL_STALE (set on RST), or it has been idle
 * longer than the timeout for its recorded connection state.
 */
static int
flow_stale(struct flowtable *ft, struct flentry *fle)
{
	time_t idle_time;

	/* unused entry, host route no longer up, or no interface */
	if ((fle->f_fhash == 0)
	    || ((fle->f_rt->rt_flags & RTF_HOST) &&
		((fle->f_rt->rt_flags & (RTF_UP))
		    != (RTF_UP)))
	    || (fle->f_rt->rt_ifp == NULL))
		return (1);

	idle_time = time_uptime - fle->f_uptime;

	/*
	 * Pick the idle timeout from the TCP flags recorded on the
	 * entry: no SYN/ACK/FIN => non-TCP (udp) timeout; FIN seen =>
	 * fin_wait timeout; SYN without ACK => half-open (syn)
	 * timeout; SYN+ACK => established (tcp) timeout.  The final
	 * clause repeats the RTF_UP/rt_ifp test for non-host routes.
	 */
	if ((fle->f_flags & FL_STALE) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK|TH_FIN)) == 0
		&& (idle_time > ft->ft_udp_idle)) ||
	    ((fle->f_flags & TH_FIN)
		&& (idle_time > ft->ft_fin_wait_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == TH_SYN
		&& (idle_time > ft->ft_syn_idle)) ||
	    ((fle->f_flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)
		&& (idle_time > ft->ft_tcp_idle)) ||
	    ((fle->f_rt->rt_flags & RTF_UP) == 0 ||
		(fle->f_rt->rt_ifp == NULL)))
		return (1);

	return (0);
}
495
496 static void
497 flowtable_set_hashkey(struct flentry *fle, uint32_t *key)
498 {
499 uint32_t *hashkey;
500 int i, nwords;
501
502 if (fle->f_flags & FL_IPV6) {
503 nwords = 9;
504 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
505 } else {
506 nwords = 3;
507 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
508 }
509
510 for (i = 0; i < nwords; i++)
511 hashkey[i] = key[i];
512 }
513
/*
 * Insert a new entry for (key, proto, fibnum) caching the rtentry and
 * llentry held in ro.  Returns ENOMEM if allocation fails, EEXIST if
 * a live entry with the same hash is already chained in the bucket
 * (hash collision or a lost insertion race - the caller keeps its ro
 * references in that case), and 0 on success.
 */
static int
flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
    uint8_t proto, uint32_t fibnum, struct route *ro, uint16_t flags)
{
	struct flentry *fle, *fletail, *newfle, **flep;
	int depth;
	uma_zone_t flezone;
	bitstr_t *mask;

	/* allocate before taking the bucket lock; M_NOWAIT, may fail */
	flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
	newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO);
	if (newfle == NULL)
		return (ENOMEM);

	newfle->f_flags |= (flags & FL_IPV6);

	FL_ENTRY_LOCK(ft, hash);
	mask = flowtable_mask(ft);
	flep = flowtable_entry(ft, hash);
	fletail = fle = *flep;

	if (fle == NULL) {
		/* empty bucket: mark it occupied and link the new entry */
		bit_set(mask, FL_ENTRY_INDEX(ft, hash));
		*flep = fle = newfle;
		goto skip;
	}

	depth = 0;
	V_flowtable_collisions++;
	/*
	 * find end of list and make sure that we were not
	 * preempted by another thread handling this flow
	 */
	while (fle != NULL) {
		if (fle->f_fhash == hash && !flow_stale(ft, fle)) {
			/*
			 * there was either a hash collision
			 * or we lost a race to insert
			 */
			FL_ENTRY_UNLOCK(ft, hash);
			uma_zfree((newfle->f_flags & FL_IPV6) ?
			    V_flow_ipv6_zone : V_flow_ipv4_zone, newfle);
			return (EEXIST);
		}
		/*
		 * re-visit this double condition XXX
		 * (note: fletail is advanced from fle, so it can lag
		 * behind the true tail by one)
		 */
		if (fletail->f_next != NULL)
			fletail = fle->f_next;

		depth++;
		fle = fle->f_next;
	}

	if (depth > V_flowtable_max_depth)
		V_flowtable_max_depth = depth;
	fletail->f_next = newfle;
	fle = newfle;
skip:
	/* fill in the entry while still holding the bucket lock */
	flowtable_set_hashkey(fle, key);

	fle->f_proto = proto;
	fle->f_rt = ro->ro_rt;
	fle->f_lle = ro->ro_lle;
	fle->f_fhash = hash;
	fle->f_fibnum = fibnum;
	fle->f_uptime = time_uptime;
	FL_ENTRY_UNLOCK(ft, hash);
	return (0);
}
584
585 static int
586 flowtable_key_equal(struct flentry *fle, uint32_t *key)
587 {
588 uint32_t *hashkey;
589 int i, nwords;
590
591 if (fle->f_flags & FL_IPV6) {
592 nwords = 9;
593 hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
594 } else {
595 nwords = 3;
596 hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
597 }
598
599 for (i = 0; i < nwords; i++)
600 if (hashkey[i] != key[i])
601 return (0);
602
603 return (1);
604 }
605
/*
 * Find (or create) the cached route and llentry for the flow that m
 * belongs to (or ro->ro_dst when m is NULL) in fib fibnum.  On
 * success ro->ro_rt / ro->ro_lle are set and 0 is returned.  Returns
 * ENOENT for flows that are not cached (untracked protocol,
 * point-to-point/loopback routes, unresolvable L2), ENETUNREACH when
 * no route exists, or an error from flowtable_insert(); in all error
 * cases ro is left empty.
 */
int
flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro, uint32_t fibnum)
{
	uint32_t key[9], hash;
	struct flentry *fle;
	uint16_t flags;
	uint8_t proto = 0;
	int error = 0;
	struct rtentry *rt;
	struct llentry *lle;

	flags = ft->ft_flags;
	ro->ro_rt = NULL;
	ro->ro_lle = NULL;

	/*
	 * The internal hash lookup is the only IPv4 specific bit
	 * remaining
	 *
	 * XXX BZ: to add IPv6 support just add a check for the
	 * address type in m and ro and an equivalent ipv6 lookup
	 * function - the rest of the code should automatically
	 * handle an ipv6 flow (note that m can be NULL in which
	 * case ro will be set)
	 */
	hash = ipv4_flow_lookup_hash_internal(m, ro, key,
	    &flags, &proto);

	/*
	 * Ports are zero and this isn't a transmit cache
	 * - thus not a protocol for which we need to keep
	 * state
	 * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP
	 */
	if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS)))
		return (ENOENT);

	V_flowtable_lookups++;
	FL_ENTRY_LOCK(ft, hash);
	if ((fle = FL_ENTRY(ft, hash)) == NULL) {
		FL_ENTRY_UNLOCK(ft, hash);
		goto uncached;
	}
keycheck:
	/* walk the collision chain looking for an exact, live match */
	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	if ((rt != NULL)
	    && fle->f_fhash == hash
	    && flowtable_key_equal(fle, key)
	    && (proto == fle->f_proto)
	    && (fibnum == fle->f_fibnum)
	    && (rt->rt_flags & RTF_UP)
	    && (rt->rt_ifp != NULL)) {
		V_flowtable_hits++;
		fle->f_uptime = time_uptime;
		fle->f_flags |= flags;
		ro->ro_rt = rt;
		ro->ro_lle = lle;
		FL_ENTRY_UNLOCK(ft, hash);
		return (0);
	} else if (fle->f_next != NULL) {
		fle = fle->f_next;
		goto keycheck;
	}
	FL_ENTRY_UNLOCK(ft, hash);

uncached:
	V_flowtable_misses++;
	/*
	 * This bit of code ends up locking the
	 * same route 3 times (just like ip_output + ether_output)
	 * - at lookup
	 * - in rt_check when called by arpresolve
	 * - dropping the refcount for the rtentry
	 *
	 * This could be consolidated to one if we wrote a variant
	 * of arpresolve with an rt_check variant that expected to
	 * receive the route locked
	 */

	ft->ft_rtalloc(ro, hash, fibnum);
	if (ro->ro_rt == NULL)
		error = ENETUNREACH;
	else {
		/* NB: these intentionally shadow the outer rt/lle */
		struct llentry *lle = NULL;
		struct sockaddr *l3addr;
		struct rtentry *rt = ro->ro_rt;
		struct ifnet *ifp = rt->rt_ifp;

		/* point-to-point and loopback routes are not cached */
		if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (ENOENT);
		}

		/* resolve the L2 entry for the gateway (or destination) */
		if (rt->rt_flags & RTF_GATEWAY)
			l3addr = rt->rt_gateway;
		else
			l3addr = &ro->ro_dst;
		llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
		ro->ro_lle = lle;

		if (lle == NULL) {
			RTFREE(rt);
			ro->ro_rt = NULL;
			return (ENOENT);
		}
		error = flowtable_insert(ft, hash, key, proto, fibnum,
		    ro, flags);

		/* on insertion failure drop both references */
		if (error) {
			RTFREE(rt);
			LLE_FREE(lle);
			ro->ro_rt = NULL;
			ro->ro_lle = NULL;
		}
	}

	return (error);
}
726
727 /*
728 * used by the bit_alloc macro
729 */
730 #define calloc(count, size) malloc((count)*(size), M_DEVBUF, M_WAITOK|M_ZERO)
731
732 struct flowtable *
733 flowtable_alloc(int nentry, int flags)
734 {
735 struct flowtable *ft, *fttail;
736 int i;
737
738 if (V_flow_hashjitter == 0)
739 V_flow_hashjitter = arc4random();
740
741 KASSERT(nentry > 0, ("nentry must be > 0, is %d\n", nentry));
742
743 ft = malloc(sizeof(struct flowtable),
744 M_RTABLE, M_WAITOK | M_ZERO);
745
746 ft->ft_flags = flags;
747 ft->ft_size = nentry;
748 #ifdef RADIX_MPATH
749 ft->ft_rtalloc = rtalloc_mpath_fib;
750 #else
751 ft->ft_rtalloc = in_rtalloc_ign_wrapper;
752 #endif
753 if (flags & FL_PCPU) {
754 ft->ft_lock = flowtable_pcpu_lock;
755 ft->ft_unlock = flowtable_pcpu_unlock;
756
757 for (i = 0; i <= mp_maxid; i++) {
758 ft->ft_table.pcpu[i] =
759 malloc(nentry*sizeof(struct flentry *),
760 M_RTABLE, M_WAITOK | M_ZERO);
761 ft->ft_masks[i] = bit_alloc(nentry);
762 }
763 } else {
764 ft->ft_lock_count = 2*(powerof2(mp_maxid + 1) ? (mp_maxid + 1):
765 (fls(mp_maxid + 1) << 1));
766
767 ft->ft_lock = flowtable_global_lock;
768 ft->ft_unlock = flowtable_global_unlock;
769 ft->ft_table.global =
770 malloc(nentry*sizeof(struct flentry *),
771 M_RTABLE, M_WAITOK | M_ZERO);
772 ft->ft_locks = malloc(ft->ft_lock_count*sizeof(struct mtx),
773 M_RTABLE, M_WAITOK | M_ZERO);
774 for (i = 0; i < ft->ft_lock_count; i++)
775 mtx_init(&ft->ft_locks[i], "flow", NULL, MTX_DEF|MTX_DUPOK);
776
777 ft->ft_masks[0] = bit_alloc(nentry);
778 }
779 ft->ft_tmpmask = bit_alloc(nentry);
780
781 /*
782 * In the local transmit case the table truly is
783 * just a cache - so everything is eligible for
784 * replacement after 5s of non-use
785 */
786 if (flags & FL_HASH_PORTS) {
787 ft->ft_udp_idle = V_flowtable_udp_expire;
788 ft->ft_syn_idle = V_flowtable_syn_expire;
789 ft->ft_fin_wait_idle = V_flowtable_fin_wait_expire;
790 ft->ft_tcp_idle = V_flowtable_fin_wait_expire;
791 } else {
792 ft->ft_udp_idle = ft->ft_fin_wait_idle =
793 ft->ft_syn_idle = ft->ft_tcp_idle = 30;
794
795 }
796
797 /*
798 * hook in to the cleaner list
799 */
800 if (V_flow_list_head == NULL)
801 V_flow_list_head = ft;
802 else {
803 fttail = V_flow_list_head;
804 while (fttail->ft_next != NULL)
805 fttail = fttail->ft_next;
806 fttail->ft_next = ft;
807 }
808
809 return (ft);
810 }
811
812 /*
813 * The rest of the code is devoted to garbage collection of expired entries.
 * It is a new addition made necessary by the switch to dynamically allocating
815 * flow tables.
816 *
817 */
/*
 * Release a flow entry: drop its route and llentry references and
 * return it to the UMA zone matching its address family.
 */
static void
fle_free(struct flentry *fle)
{
	struct rtentry *rt;
	struct llentry *lle;

	rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
	lle = __DEVOLATILE(struct llentry *, fle->f_lle);
	RTFREE(rt);
	LLE_FREE(lle);
	uma_zfree((fle->f_flags & FL_IPV6) ?
	    V_flow_ipv6_zone : V_flow_ipv4_zone, fle);
}
831
/*
 * Scan one (CPU's) view of a flowtable and unlink entries that are
 * stale or, when rt is non-NULL, that reference that rtentry,
 * collecting them on a private list.  Only buckets whose bit is set
 * in the occupied bitmap are visited, via a scratch copy in
 * ft_tmpmask.  The collected entries are freed after the scan so that
 * no bucket lock is held across RTFREE/LLE_FREE in fle_free().
 */
static void
flowtable_free_stale(struct flowtable *ft, struct rtentry *rt)
{
	int curbit = 0, count;
	struct flentry *fle,  **flehead, *fleprev;
	struct flentry *flefreehead, *flefreetail, *fletmp;
	bitstr_t *mask, *tmpmask;

	flefreehead = flefreetail = NULL;
	mask = flowtable_mask(ft);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			log(LOG_ALERT,
			    "warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		FL_ENTRY_LOCK(ft, curbit);
		flehead = flowtable_entry(ft, curbit);
		fle = fleprev = *flehead;

		V_flowtable_free_checks++;
#ifdef DIAGNOSTIC
		if (fle == NULL && curbit > 0) {
			log(LOG_ALERT,
			    "warning bit=%d set, but no fle found\n",
			    curbit);
		}
#endif
		while (fle != NULL) {
			/*
			 * skip entries we keep: wrong route when
			 * flushing a specific route, or still live
			 * when doing a stale sweep
			 */
			if (rt != NULL) {
				if (__DEVOLATILE(struct rtentry *, fle->f_rt) != rt) {
					fleprev = fle;
					fle = fle->f_next;
					continue;
				}
			} else if (!flow_stale(ft, fle)) {
				fleprev = fle;
				fle = fle->f_next;
				continue;
			}
			/*
			 * delete head of the list
			 */
			if (fleprev == *flehead) {
				fletmp = fleprev;
				if (fle == fleprev) {
					fleprev = *flehead = fle->f_next;
				} else
					fleprev = *flehead = fle;
				fle = fle->f_next;
			} else {
				/*
				 * don't advance fleprev
				 */
				fletmp = fle;
				fleprev->f_next = fle->f_next;
				fle = fleprev->f_next;
			}

			/* append the victim to the deferred-free list */
			if (flefreehead == NULL)
				flefreehead = flefreetail = fletmp;
			else {
				flefreetail->f_next = fletmp;
				flefreetail = fletmp;
			}
			fletmp->f_next = NULL;
		}
		/* bucket emptied - clear its bit in the live bitmap */
		if (*flehead == NULL)
			bit_clear(mask, curbit);
		FL_ENTRY_UNLOCK(ft, curbit);
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
	count = 0;
	/* free the collected entries with no bucket lock held */
	while ((fle = flefreehead) != NULL) {
		flefreehead = fle->f_next;
		count++;
		V_flowtable_frees++;
		fle_free(fle);
	}
	if (V_flowtable_debug && count)
		log(LOG_DEBUG, "freed %d flow entries\n", count);
}
924
925 void
926 flowtable_route_flush(struct flowtable *ft, struct rtentry *rt)
927 {
928 int i;
929 if (ft->ft_flags & FL_PCPU) {
930 for (i = 0; i <= mp_maxid; i++) {
931 if (CPU_ABSENT(i))
932 continue;
933
934 if (smp_started == 1) {
935 thread_lock(curthread);
936 sched_bind(curthread, i);
937 thread_unlock(curthread);
938 }
939
940 flowtable_free_stale(ft, rt);
941
942 if (smp_started == 1) {
943 thread_lock(curthread);
944 sched_unbind(curthread);
945 thread_unlock(curthread);
946 }
947 }
948 } else {
949 flowtable_free_stale(ft, rt);
950 }
951 }
952
/*
 * One cleaning pass over every flowtable in the current vnet.  For
 * per-cpu tables the cleaner binds itself to each CPU in turn (once
 * SMP is up) so it scans that CPU's private buckets.
 */
static void
flowtable_clean_vnet(void)
{
	struct flowtable *ft;
	int i;

	ft = V_flow_list_head;
	while (ft != NULL) {
		if (ft->ft_flags & FL_PCPU) {
			for (i = 0; i <= mp_maxid; i++) {
				if (CPU_ABSENT(i))
					continue;

				if (smp_started == 1) {
					thread_lock(curthread);
					sched_bind(curthread, i);
					thread_unlock(curthread);
				}

				flowtable_free_stale(ft, NULL);

				if (smp_started == 1) {
					thread_lock(curthread);
					sched_unbind(curthread);
					thread_unlock(curthread);
				}
			}
		} else {
			flowtable_free_stale(ft, NULL);
		}
		ft = ft->ft_next;
	}
}
986
/*
 * Main loop of the flowcleaner kernel process: periodically clean all
 * flowtables in all vnets, then bump flowclean_cycles and broadcast
 * on flowclean_cv so flowtable_flush() waiters can observe that a
 * full cycle completed.
 */
static void
flowtable_cleaner(void)
{
	VNET_ITERATOR_DECL(vnet_iter);

	if (bootverbose)
		log(LOG_INFO, "flowtable cleaner started\n");
	while (1) {
		VNET_LIST_RLOCK();
		VNET_FOREACH(vnet_iter) {
			CURVNET_SET(vnet_iter);
			flowtable_clean_vnet();
			CURVNET_RESTORE();
		}
		VNET_LIST_RUNLOCK();

		flowclean_cycles++;
		/*
		 * The 10 second interval between cleaning checks
		 * is arbitrary
		 */
		mtx_lock(&flowclean_lock);
		cv_broadcast(&flowclean_cv);
		cv_timedwait(&flowclean_cv, &flowclean_lock, 10*hz);
		mtx_unlock(&flowclean_lock);
	}
}
1014
/*
 * Block until the cleaner completes at least one full cycle.
 * Registered as an ifnet departure handler (see flowtable_init) so
 * flows referencing a departing interface are reclaimed first.
 */
static void
flowtable_flush(void *unused __unused)
{
	uint64_t start;

	mtx_lock(&flowclean_lock);
	start = flowclean_cycles;
	while (start == flowclean_cycles) {
		/* wake the cleaner, then wait for it to bump the count */
		cv_broadcast(&flowclean_cv);
		cv_wait(&flowclean_cv, &flowclean_lock);
	}
	mtx_unlock(&flowclean_lock);
}
1028
/*
 * Kernel process descriptor for the flow cleaner, started via
 * kproc_start at SI_SUB_KTHREAD_IDLE.
 */
static struct kproc_desc flow_kp = {
	"flowcleaner",
	flowtable_cleaner,
	&flowcleanerproc
};
SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp);
1035
/*
 * Per-vnet initialization: create the IPv4/IPv6 flow entry zones,
 * apply the nmbflows cap, and mark the subsystem ready for lookups.
 */
static void
flowtable_init_vnet(const void *unused __unused)
{

	V_flow_ipv4_zone = uma_zcreate("ip4flow", sizeof(struct flentry_v4),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	V_flow_ipv6_zone = uma_zcreate("ip6flow", sizeof(struct flentry_v6),
	    NULL, NULL, NULL, NULL, 64, UMA_ZONE_MAXBUCKET);
	uma_zone_set_max(V_flow_ipv4_zone, V_flowtable_nmbflows);
	uma_zone_set_max(V_flow_ipv6_zone, V_flowtable_nmbflows);
	V_flowtable_ready = 1;
}
VNET_SYSINIT(flowtable_init_vnet, SI_SUB_KTHREAD_INIT, SI_ORDER_MIDDLE,
    flowtable_init_vnet, NULL);
1050
/*
 * Global (once per host) initialization: cleaner synchronization
 * primitives and the interface-departure flush hook.
 */
static void
flowtable_init(const void *unused __unused)
{

	cv_init(&flowclean_cv, "flowcleanwait");
	mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF);
	EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL,
	    EVENTHANDLER_PRI_ANY);
}
SYSINIT(flowtable_init, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_init, NULL);
1062
1063
#ifdef VIMAGE
/*
 * Per-vnet teardown: stop new lookups and destroy the entry zones.
 * NOTE(review): the tables and their entries are not freed here -
 * see the "idetach() cleanup" TODO at the top of the file.
 */
static void
flowtable_uninit(const void *unused __unused)
{

	V_flowtable_ready = 0;
	uma_zdestroy(V_flow_ipv4_zone);
	uma_zdestroy(V_flow_ipv6_zone);
}

VNET_SYSUNINIT(flowtable_uninit, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY,
    flowtable_uninit, NULL);
#endif
1077
1078 #ifdef DDB
1079 static bitstr_t *
1080 flowtable_mask_pcpu(struct flowtable *ft, int cpuid)
1081 {
1082 bitstr_t *mask;
1083
1084 if (ft->ft_flags & FL_PCPU)
1085 mask = ft->ft_masks[cpuid];
1086 else
1087 mask = ft->ft_masks[0];
1088
1089 return (mask);
1090 }
1091
1092 static struct flentry **
1093 flowtable_entry_pcpu(struct flowtable *ft, uint32_t hash, int cpuid)
1094 {
1095 struct flentry **fle;
1096 int index = (hash % ft->ft_size);
1097
1098 if (ft->ft_flags & FL_PCPU) {
1099 fle = &ft->ft_table.pcpu[cpuid][index];
1100 } else {
1101 fle = &ft->ft_table.global[index];
1102 }
1103
1104 return (fle);
1105 }
1106
/*
 * DDB: print one flow entry - hash, idle time, route and interface
 * pointers, plus the interesting state bits.
 */
static void
flow_show(struct flowtable *ft, struct flentry *fle)
{
	int idle_time;
	int rt_valid;

	idle_time = (int)(time_uptime - fle->f_uptime);
	rt_valid = fle->f_rt != NULL;
	db_printf("hash=0x%08x idle_time=%03d rt=%p ifp=%p",
	    fle->f_fhash, idle_time,
	    fle->f_rt, rt_valid ? fle->f_rt->rt_ifp : NULL);
	if (rt_valid && (fle->f_rt->rt_flags & RTF_UP))
		db_printf(" RTF_UP ");
	if (fle->f_flags & FL_STALE)
		db_printf(" FL_STALE ");
	db_printf("\n");
}
1124
/*
 * DDB: dump every flow in one CPU's view of a table, visiting only
 * occupied buckets via a scratch copy of the bitmap (no locking -
 * this runs from the debugger).
 */
static void
flowtable_show(struct flowtable *ft, int cpuid)
{
	int curbit = 0;
	struct flentry *fle,  **flehead;
	bitstr_t *mask, *tmpmask;

	db_printf("cpu: %d\n", cpuid);
	mask = flowtable_mask_pcpu(ft, cpuid);
	tmpmask = ft->ft_tmpmask;
	memcpy(tmpmask, mask, ft->ft_size/8);
	/*
	 * XXX Note to self, bit_ffs operates at the byte level
	 * and thus adds gratuitous overhead
	 */
	bit_ffs(tmpmask, ft->ft_size, &curbit);
	while (curbit != -1) {
		if (curbit >= ft->ft_size || curbit < -1) {
			db_printf("warning: bad curbit value %d \n",
			    curbit);
			break;
		}

		flehead = flowtable_entry_pcpu(ft, curbit, cpuid);
		fle = *flehead;

		while (fle != NULL) {
			flow_show(ft, fle);
			fle = fle->f_next;
			continue;
		}
		bit_clear(tmpmask, curbit);
		bit_ffs(tmpmask, ft->ft_size, &curbit);
	}
}
1160
/*
 * DDB: walk the current vnet's flowtable list and dump each table
 * (each present CPU's view for per-cpu tables).
 */
static void
flowtable_show_vnet(void)
{
	struct flowtable *ft;
	int i;

	ft = V_flow_list_head;
	while (ft != NULL) {
		if (ft->ft_flags & FL_PCPU) {
			for (i = 0; i <= mp_maxid; i++) {
				if (CPU_ABSENT(i))
					continue;
				flowtable_show(ft, i);
			}
		} else {
			flowtable_show(ft, 0);
		}
		ft = ft->ft_next;
	}
}
1181
/* DDB "show flowtables": dump the flowtables of every vnet. */
DB_SHOW_COMMAND(flowtables, db_show_flowtables)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);
		flowtable_show_vnet();
		CURVNET_RESTORE();
	}
}
1192 #endif
Cache object: 0b88812e88a80908734462a43b8623ba
|