1 /*-
2 * Copyright (c) 2001 McAfee, Inc.
3 * All rights reserved.
4 *
5 * This software was developed for the FreeBSD Project by Jonathan Lemon
6 * and McAfee Research, the Security Research Division of McAfee, Inc. under
7 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
8 * DARPA CHATS research program.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 * $FreeBSD: releng/6.3/sys/netinet/tcp_syncache.c 172384 2007-09-28 17:39:45Z maxim $
32 */
33
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 #include "opt_ipsec.h"
37 #include "opt_mac.h"
38 #include "opt_tcpdebug.h"
39 #include "opt_tcp_sack.h"
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/sysctl.h>
45 #include <sys/malloc.h>
46 #include <sys/mac.h>
47 #include <sys/mbuf.h>
48 #include <sys/md5.h>
49 #include <sys/proc.h> /* for proc0 declaration */
50 #include <sys/random.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53
54 #include <net/if.h>
55 #include <net/route.h>
56
57 #include <netinet/in.h>
58 #include <netinet/in_systm.h>
59 #include <netinet/ip.h>
60 #include <netinet/in_var.h>
61 #include <netinet/in_pcb.h>
62 #include <netinet/ip_var.h>
63 #ifdef INET6
64 #include <netinet/ip6.h>
65 #include <netinet/icmp6.h>
66 #include <netinet6/nd6.h>
67 #include <netinet6/ip6_var.h>
68 #include <netinet6/in6_pcb.h>
69 #endif
70 #include <netinet/tcp.h>
71 #ifdef TCPDEBUG
72 #include <netinet/tcpip.h>
73 #endif
74 #include <netinet/tcp_fsm.h>
75 #include <netinet/tcp_seq.h>
76 #include <netinet/tcp_timer.h>
77 #include <netinet/tcp_var.h>
78 #ifdef TCPDEBUG
79 #include <netinet/tcp_debug.h>
80 #endif
81 #ifdef INET6
82 #include <netinet6/tcp6_var.h>
83 #endif
84
85 #ifdef IPSEC
86 #include <netinet6/ipsec.h>
87 #ifdef INET6
88 #include <netinet6/ipsec6.h>
89 #endif
90 #endif /*IPSEC*/
91
92 #ifdef FAST_IPSEC
93 #include <netipsec/ipsec.h>
94 #ifdef INET6
95 #include <netipsec/ipsec6.h>
96 #endif
97 #include <netipsec/key.h>
98 #endif /*FAST_IPSEC*/
99
100 #include <machine/in_cksum.h>
101 #include <vm/uma.h>
102
103 static int tcp_syncookies = 1;
104 SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
105 &tcp_syncookies, 0,
106 "Use TCP SYN cookies if the syncache overflows");
107
108 static void syncache_drop(struct syncache *, struct syncache_head *);
109 static void syncache_free(struct syncache *);
110 static void syncache_insert(struct syncache *, struct syncache_head *);
111 struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
112 #ifdef TCPDEBUG
113 static int syncache_respond(struct syncache *, struct mbuf *, struct socket *);
114 #else
115 static int syncache_respond(struct syncache *, struct mbuf *);
116 #endif
117 static struct socket *syncache_socket(struct syncache *, struct socket *,
118 struct mbuf *m);
119 static void syncache_timer(void *);
120 static u_int32_t syncookie_generate(struct syncache *, u_int32_t *);
121 static struct syncache *syncookie_lookup(struct in_conninfo *,
122 struct tcphdr *, struct socket *);
123
/*
 * Upper bound on SYN,ACK retransmissions; deliberately smaller than
 * TCP_MAXRXTSHIFT.  Three retransmits add up to a timeout of
 * (1 + 2 + 4 + 8 == 15) seconds, by which time the peer has most likely
 * given up on the connection attempt.  This value also sizes the
 * per-slot timer queues in struct tcp_syncache.
 */
#define	SYNCACHE_MAXREXMTS		3

/* Default hash table geometry; the values are arbitrary. */
#define	TCP_SYNCACHE_HASHSIZE		512
#define	TCP_SYNCACHE_BUCKETLIMIT	30
134
135 struct tcp_syncache {
136 struct syncache_head *hashbase;
137 uma_zone_t zone;
138 u_int hashsize;
139 u_int hashmask;
140 u_int bucket_limit;
141 u_int cache_count;
142 u_int cache_limit;
143 u_int rexmt_limit;
144 u_int hash_secret;
145 TAILQ_HEAD(, syncache) timerq[SYNCACHE_MAXREXMTS + 1];
146 struct callout tt_timerq[SYNCACHE_MAXREXMTS + 1];
147 };
148 static struct tcp_syncache tcp_syncache;
149
150 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");
151
152 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
153 &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache");
154
155 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
156 &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache");
157
158 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD,
159 &tcp_syncache.cache_count, 0, "Current number of entries in syncache");
160
161 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
162 &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable");
163
164 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
165 &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions");
166
167 static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
168
/*
 * Hash an IPv4 connection (foreign address, both ports) together with the
 * per-boot secret and reduce it to a bucket index with `mask'.
 */
#define	SYNCACHE_HASH(inc, mask)					\
	((tcp_syncache.hash_secret ^					\
	  (inc)->inc_faddr.s_addr ^					\
	  ((inc)->inc_faddr.s_addr >> 16) ^				\
	  (inc)->inc_fport ^ (inc)->inc_lport) & (mask))

/*
 * IPv6 variant: mixes the first and last 32-bit words of the foreign
 * address with the ports and the secret.
 */
#define	SYNCACHE_HASH6(inc, mask)					\
	((tcp_syncache.hash_secret ^					\
	  (inc)->inc6_faddr.s6_addr32[0] ^				\
	  (inc)->inc6_faddr.s6_addr32[3] ^				\
	  (inc)->inc_fport ^ (inc)->inc_lport) & (mask))

/* True when two IPv4 endpoint tuples name the same connection. */
#define	ENDPTS_EQ(a, b) (						\
	(a)->ie_fport == (b)->ie_fport &&				\
	(a)->ie_lport == (b)->ie_lport &&				\
	(a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr &&			\
	(a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr			\
)

/* IPv6 endpoints are compared bytewise over the whole structure. */
#define	ENDPTS6_EQ(a, b)	(memcmp((a), (b), sizeof(*(a))) == 0)

/*
 * Queue `sc' on the timer queue for retransmit slot `slot', record its
 * expiry time, and arm that slot's callout if it is not already active.
 * Note that both arguments are evaluated more than once, so they must be
 * free of side effects.
 */
#define	SYNCACHE_TIMEOUT(sc, slot) do {					\
	(sc)->sc_rxtslot = (slot);					\
	(sc)->sc_rxttime = ticks + TCPTV_RTOBASE * tcp_backoff[(slot)];	\
	TAILQ_INSERT_TAIL(&tcp_syncache.timerq[(slot)], (sc), sc_timerq); \
	if (!callout_active(&tcp_syncache.tt_timerq[(slot)]))		\
		callout_reset(&tcp_syncache.tt_timerq[(slot)],		\
		    TCPTV_RTOBASE * tcp_backoff[(slot)],		\
		    syncache_timer, (void *)((intptr_t)(slot)));	\
} while (0)
199
200 static void
201 syncache_free(struct syncache *sc)
202 {
203 if (sc->sc_ipopts)
204 (void) m_free(sc->sc_ipopts);
205
206 uma_zfree(tcp_syncache.zone, sc);
207 }
208
209 void
210 syncache_init(void)
211 {
212 int i;
213
214 tcp_syncache.cache_count = 0;
215 tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
216 tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
217 tcp_syncache.cache_limit =
218 tcp_syncache.hashsize * tcp_syncache.bucket_limit;
219 tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
220 tcp_syncache.hash_secret = arc4random();
221
222 TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
223 &tcp_syncache.hashsize);
224 TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
225 &tcp_syncache.cache_limit);
226 TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
227 &tcp_syncache.bucket_limit);
228 if (!powerof2(tcp_syncache.hashsize) || tcp_syncache.hashsize == 0) {
229 printf("WARNING: syncache hash size is not a power of 2.\n");
230 tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
231 }
232 tcp_syncache.hashmask = tcp_syncache.hashsize - 1;
233
234 /* Allocate the hash table. */
235 MALLOC(tcp_syncache.hashbase, struct syncache_head *,
236 tcp_syncache.hashsize * sizeof(struct syncache_head),
237 M_SYNCACHE, M_WAITOK);
238
239 /* Initialize the hash buckets. */
240 for (i = 0; i < tcp_syncache.hashsize; i++) {
241 TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket);
242 tcp_syncache.hashbase[i].sch_length = 0;
243 }
244
245 /* Initialize the timer queues. */
246 for (i = 0; i <= SYNCACHE_MAXREXMTS; i++) {
247 TAILQ_INIT(&tcp_syncache.timerq[i]);
248 callout_init(&tcp_syncache.tt_timerq[i], NET_CALLOUT_MPSAFE);
249 }
250
251 /*
252 * Allocate the syncache entries. Allow the zone to allocate one
253 * more entry than cache limit, so a new entry can bump out an
254 * older one.
255 */
256 tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
257 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
258 uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit);
259 tcp_syncache.cache_limit -= 1;
260 }
261
262 static void
263 syncache_insert(sc, sch)
264 struct syncache *sc;
265 struct syncache_head *sch;
266 {
267 struct syncache *sc2;
268 int i;
269
270 INP_INFO_WLOCK_ASSERT(&tcbinfo);
271
272 /*
273 * Make sure that we don't overflow the per-bucket
274 * limit or the total cache size limit.
275 */
276 if (sch->sch_length >= tcp_syncache.bucket_limit) {
277 /*
278 * The bucket is full, toss the oldest element.
279 */
280 sc2 = TAILQ_FIRST(&sch->sch_bucket);
281 sc2->sc_tp->ts_recent = ticks;
282 syncache_drop(sc2, sch);
283 tcpstat.tcps_sc_bucketoverflow++;
284 } else if (tcp_syncache.cache_count >= tcp_syncache.cache_limit) {
285 /*
286 * The cache is full. Toss the oldest entry in the
287 * entire cache. This is the front entry in the
288 * first non-empty timer queue with the largest
289 * timeout value.
290 */
291 for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
292 sc2 = TAILQ_FIRST(&tcp_syncache.timerq[i]);
293 if (sc2 != NULL)
294 break;
295 }
296 sc2->sc_tp->ts_recent = ticks;
297 syncache_drop(sc2, NULL);
298 tcpstat.tcps_sc_cacheoverflow++;
299 }
300
301 /* Initialize the entry's timer. */
302 SYNCACHE_TIMEOUT(sc, 0);
303
304 /* Put it into the bucket. */
305 TAILQ_INSERT_TAIL(&sch->sch_bucket, sc, sc_hash);
306 sch->sch_length++;
307 tcp_syncache.cache_count++;
308 tcpstat.tcps_sc_added++;
309 }
310
311 static void
312 syncache_drop(sc, sch)
313 struct syncache *sc;
314 struct syncache_head *sch;
315 {
316 INP_INFO_WLOCK_ASSERT(&tcbinfo);
317
318 if (sch == NULL) {
319 #ifdef INET6
320 if (sc->sc_inc.inc_isipv6) {
321 sch = &tcp_syncache.hashbase[
322 SYNCACHE_HASH6(&sc->sc_inc, tcp_syncache.hashmask)];
323 } else
324 #endif
325 {
326 sch = &tcp_syncache.hashbase[
327 SYNCACHE_HASH(&sc->sc_inc, tcp_syncache.hashmask)];
328 }
329 }
330
331 TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
332 sch->sch_length--;
333 tcp_syncache.cache_count--;
334
335 TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq);
336 if (TAILQ_EMPTY(&tcp_syncache.timerq[sc->sc_rxtslot]))
337 callout_stop(&tcp_syncache.tt_timerq[sc->sc_rxtslot]);
338
339 syncache_free(sc);
340 }
341
342 /*
343 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
344 * If we have retransmitted an entry the maximum number of times, expire it.
345 */
346 static void
347 syncache_timer(xslot)
348 void *xslot;
349 {
350 intptr_t slot = (intptr_t)xslot;
351 struct syncache *sc, *nsc;
352 struct inpcb *inp;
353
354 INP_INFO_WLOCK(&tcbinfo);
355 if (callout_pending(&tcp_syncache.tt_timerq[slot]) ||
356 !callout_active(&tcp_syncache.tt_timerq[slot])) {
357 /* XXX can this happen? */
358 INP_INFO_WUNLOCK(&tcbinfo);
359 return;
360 }
361 callout_deactivate(&tcp_syncache.tt_timerq[slot]);
362
363 nsc = TAILQ_FIRST(&tcp_syncache.timerq[slot]);
364 while (nsc != NULL) {
365 if (ticks < nsc->sc_rxttime)
366 break;
367 sc = nsc;
368 inp = sc->sc_tp->t_inpcb;
369 if (slot == SYNCACHE_MAXREXMTS ||
370 slot >= tcp_syncache.rexmt_limit ||
371 inp == NULL || inp->inp_gencnt != sc->sc_inp_gencnt) {
372 nsc = TAILQ_NEXT(sc, sc_timerq);
373 syncache_drop(sc, NULL);
374 tcpstat.tcps_sc_stale++;
375 continue;
376 }
377 /*
378 * syncache_respond() may call back into the syncache to
379 * to modify another entry, so do not obtain the next
380 * entry on the timer chain until it has completed.
381 */
382 #ifdef TCPDEBUG
383 (void) syncache_respond(sc, NULL, NULL);
384 #else
385 (void) syncache_respond(sc, NULL);
386 #endif
387 nsc = TAILQ_NEXT(sc, sc_timerq);
388 tcpstat.tcps_sc_retransmitted++;
389 TAILQ_REMOVE(&tcp_syncache.timerq[slot], sc, sc_timerq);
390 SYNCACHE_TIMEOUT(sc, slot + 1);
391 }
392 if (nsc != NULL)
393 callout_reset(&tcp_syncache.tt_timerq[slot],
394 nsc->sc_rxttime - ticks, syncache_timer, (void *)(slot));
395 INP_INFO_WUNLOCK(&tcbinfo);
396 }
397
398 /*
399 * Find an entry in the syncache.
400 */
401 struct syncache *
402 syncache_lookup(inc, schp)
403 struct in_conninfo *inc;
404 struct syncache_head **schp;
405 {
406 struct syncache *sc;
407 struct syncache_head *sch;
408
409 INP_INFO_WLOCK_ASSERT(&tcbinfo);
410
411 #ifdef INET6
412 if (inc->inc_isipv6) {
413 sch = &tcp_syncache.hashbase[
414 SYNCACHE_HASH6(inc, tcp_syncache.hashmask)];
415 *schp = sch;
416 TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
417 if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
418 return (sc);
419 }
420 } else
421 #endif
422 {
423 sch = &tcp_syncache.hashbase[
424 SYNCACHE_HASH(inc, tcp_syncache.hashmask)];
425 *schp = sch;
426 TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
427 #ifdef INET6
428 if (sc->sc_inc.inc_isipv6)
429 continue;
430 #endif
431 if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
432 return (sc);
433 }
434 }
435 return (NULL);
436 }
437
438 /*
439 * This function is called when we get a RST for a
440 * non-existent connection, so that we can see if the
441 * connection is in the syn cache. If it is, zap it.
442 */
443 void
444 syncache_chkrst(inc, th)
445 struct in_conninfo *inc;
446 struct tcphdr *th;
447 {
448 struct syncache *sc;
449 struct syncache_head *sch;
450
451 INP_INFO_WLOCK_ASSERT(&tcbinfo);
452
453 sc = syncache_lookup(inc, &sch);
454 if (sc == NULL)
455 return;
456 /*
457 * If the RST bit is set, check the sequence number to see
458 * if this is a valid reset segment.
459 * RFC 793 page 37:
460 * In all states except SYN-SENT, all reset (RST) segments
461 * are validated by checking their SEQ-fields. A reset is
462 * valid if its sequence number is in the window.
463 *
464 * The sequence number in the reset segment is normally an
465 * echo of our outgoing acknowlegement numbers, but some hosts
466 * send a reset with the sequence number at the rightmost edge
467 * of our receive window, and we have to handle this case.
468 */
469 if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
470 SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
471 syncache_drop(sc, sch);
472 tcpstat.tcps_sc_reset++;
473 }
474 }
475
476 void
477 syncache_badack(inc)
478 struct in_conninfo *inc;
479 {
480 struct syncache *sc;
481 struct syncache_head *sch;
482
483 INP_INFO_WLOCK_ASSERT(&tcbinfo);
484
485 sc = syncache_lookup(inc, &sch);
486 if (sc != NULL) {
487 syncache_drop(sc, sch);
488 tcpstat.tcps_sc_badack++;
489 }
490 }
491
492 void
493 syncache_unreach(inc, th)
494 struct in_conninfo *inc;
495 struct tcphdr *th;
496 {
497 struct syncache *sc;
498 struct syncache_head *sch;
499
500 INP_INFO_WLOCK_ASSERT(&tcbinfo);
501
502 sc = syncache_lookup(inc, &sch);
503 if (sc == NULL)
504 return;
505
506 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
507 if (ntohl(th->th_seq) != sc->sc_iss)
508 return;
509
510 /*
511 * If we've rertransmitted 3 times and this is our second error,
512 * we remove the entry. Otherwise, we allow it to continue on.
513 * This prevents us from incorrectly nuking an entry during a
514 * spurious network outage.
515 *
516 * See tcp_notify().
517 */
518 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtslot < 3) {
519 sc->sc_flags |= SCF_UNREACH;
520 return;
521 }
522 syncache_drop(sc, sch);
523 tcpstat.tcps_sc_unreach++;
524 }
525
526 /*
527 * Build a new TCP socket structure from a syncache entry.
528 */
529 static struct socket *
530 syncache_socket(sc, lso, m)
531 struct syncache *sc;
532 struct socket *lso;
533 struct mbuf *m;
534 {
535 struct inpcb *inp = NULL;
536 struct socket *so;
537 struct tcpcb *tp;
538
539 NET_ASSERT_GIANT();
540 INP_INFO_WLOCK_ASSERT(&tcbinfo);
541
542 /*
543 * Ok, create the full blown connection, and set things up
544 * as they would have been set up if we had created the
545 * connection when the SYN arrived. If we can't create
546 * the connection, abort it.
547 */
548 so = sonewconn(lso, SS_ISCONNECTED);
549 if (so == NULL) {
550 /*
551 * Drop the connection; we will send a RST if the peer
552 * retransmits the ACK,
553 */
554 tcpstat.tcps_listendrop++;
555 goto abort2;
556 }
557 #ifdef MAC
558 SOCK_LOCK(so);
559 mac_set_socket_peer_from_mbuf(m, so);
560 SOCK_UNLOCK(so);
561 #endif
562
563 inp = sotoinpcb(so);
564 INP_LOCK(inp);
565
566 /*
567 * Insert new socket into hash list.
568 */
569 inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6;
570 #ifdef INET6
571 if (sc->sc_inc.inc_isipv6) {
572 inp->in6p_laddr = sc->sc_inc.inc6_laddr;
573 } else {
574 inp->inp_vflag &= ~INP_IPV6;
575 inp->inp_vflag |= INP_IPV4;
576 #endif
577 inp->inp_laddr = sc->sc_inc.inc_laddr;
578 #ifdef INET6
579 }
580 #endif
581 inp->inp_lport = sc->sc_inc.inc_lport;
582 if (in_pcbinshash(inp) != 0) {
583 /*
584 * Undo the assignments above if we failed to
585 * put the PCB on the hash lists.
586 */
587 #ifdef INET6
588 if (sc->sc_inc.inc_isipv6)
589 inp->in6p_laddr = in6addr_any;
590 else
591 #endif
592 inp->inp_laddr.s_addr = INADDR_ANY;
593 inp->inp_lport = 0;
594 goto abort;
595 }
596 #ifdef IPSEC
597 /* copy old policy into new socket's */
598 if (ipsec_copy_pcbpolicy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
599 printf("syncache_expand: could not copy policy\n");
600 #endif
601 #ifdef FAST_IPSEC
602 /* copy old policy into new socket's */
603 if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
604 printf("syncache_expand: could not copy policy\n");
605 #endif
606 #ifdef INET6
607 if (sc->sc_inc.inc_isipv6) {
608 struct inpcb *oinp = sotoinpcb(lso);
609 struct in6_addr laddr6;
610 struct sockaddr_in6 sin6;
611 /*
612 * Inherit socket options from the listening socket.
613 * Note that in6p_inputopts are not (and should not be)
614 * copied, since it stores previously received options and is
615 * used to detect if each new option is different than the
616 * previous one and hence should be passed to a user.
617 * If we copied in6p_inputopts, a user would not be able to
618 * receive options just after calling the accept system call.
619 */
620 inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
621 if (oinp->in6p_outputopts)
622 inp->in6p_outputopts =
623 ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
624
625 sin6.sin6_family = AF_INET6;
626 sin6.sin6_len = sizeof(sin6);
627 sin6.sin6_addr = sc->sc_inc.inc6_faddr;
628 sin6.sin6_port = sc->sc_inc.inc_fport;
629 sin6.sin6_flowinfo = sin6.sin6_scope_id = 0;
630 laddr6 = inp->in6p_laddr;
631 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
632 inp->in6p_laddr = sc->sc_inc.inc6_laddr;
633 if (in6_pcbconnect(inp, (struct sockaddr *)&sin6,
634 thread0.td_ucred)) {
635 inp->in6p_laddr = laddr6;
636 goto abort;
637 }
638 /* Override flowlabel from in6_pcbconnect. */
639 inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
640 inp->in6p_flowinfo |= sc->sc_flowlabel;
641 } else
642 #endif
643 {
644 struct in_addr laddr;
645 struct sockaddr_in sin;
646
647 inp->inp_options = ip_srcroute(m);
648 if (inp->inp_options == NULL) {
649 inp->inp_options = sc->sc_ipopts;
650 sc->sc_ipopts = NULL;
651 }
652
653 sin.sin_family = AF_INET;
654 sin.sin_len = sizeof(sin);
655 sin.sin_addr = sc->sc_inc.inc_faddr;
656 sin.sin_port = sc->sc_inc.inc_fport;
657 bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero));
658 laddr = inp->inp_laddr;
659 if (inp->inp_laddr.s_addr == INADDR_ANY)
660 inp->inp_laddr = sc->sc_inc.inc_laddr;
661 if (in_pcbconnect(inp, (struct sockaddr *)&sin,
662 thread0.td_ucred)) {
663 inp->inp_laddr = laddr;
664 goto abort;
665 }
666 }
667
668 tp = intotcpcb(inp);
669 tp->t_state = TCPS_SYN_RECEIVED;
670 tp->iss = sc->sc_iss;
671 tp->irs = sc->sc_irs;
672 tcp_rcvseqinit(tp);
673 tcp_sendseqinit(tp);
674 tp->snd_wl1 = sc->sc_irs;
675 tp->rcv_up = sc->sc_irs + 1;
676 tp->rcv_wnd = sc->sc_wnd;
677 tp->rcv_adv += tp->rcv_wnd;
678
679 tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
680 if (sc->sc_flags & SCF_NOOPT)
681 tp->t_flags |= TF_NOOPT;
682 if (sc->sc_flags & SCF_WINSCALE) {
683 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
684 tp->requested_s_scale = sc->sc_requested_s_scale;
685 tp->request_r_scale = sc->sc_request_r_scale;
686 }
687 if (sc->sc_flags & SCF_TIMESTAMP) {
688 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
689 tp->ts_recent = sc->sc_tsrecent;
690 tp->ts_recent_age = ticks;
691 }
692 #ifdef TCP_SIGNATURE
693 if (sc->sc_flags & SCF_SIGNATURE)
694 tp->t_flags |= TF_SIGNATURE;
695 #endif
696 if (sc->sc_flags & SCF_SACK) {
697 tp->sack_enable = 1;
698 tp->t_flags |= TF_SACK_PERMIT;
699 }
700 /*
701 * Set up MSS and get cached values from tcp_hostcache.
702 * This might overwrite some of the defaults we just set.
703 */
704 tcp_mss(tp, sc->sc_peer_mss);
705
706 /*
707 * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
708 */
709 if (sc->sc_rxtslot != 0)
710 tp->snd_cwnd = tp->t_maxseg;
711 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
712
713 INP_UNLOCK(inp);
714
715 tcpstat.tcps_accepts++;
716 return (so);
717
718 abort:
719 INP_UNLOCK(inp);
720 abort2:
721 if (so != NULL)
722 (void) soabort(so);
723 return (NULL);
724 }
725
726 /*
727 * This function gets called when we receive an ACK for a
728 * socket in the LISTEN state. We look up the connection
729 * in the syncache, and if its there, we pull it out of
730 * the cache and turn it into a full-blown connection in
731 * the SYN-RECEIVED state.
732 */
733 int
734 syncache_expand(inc, th, sop, m)
735 struct in_conninfo *inc;
736 struct tcphdr *th;
737 struct socket **sop;
738 struct mbuf *m;
739 {
740 struct syncache *sc;
741 struct syncache_head *sch;
742 struct socket *so;
743
744 INP_INFO_WLOCK_ASSERT(&tcbinfo);
745
746 sc = syncache_lookup(inc, &sch);
747 if (sc == NULL) {
748 /*
749 * There is no syncache entry, so see if this ACK is
750 * a returning syncookie. To do this, first:
751 * A. See if this socket has had a syncache entry dropped in
752 * the past. We don't want to accept a bogus syncookie
753 * if we've never received a SYN.
754 * B. check that the syncookie is valid. If it is, then
755 * cobble up a fake syncache entry, and return.
756 */
757 if (!tcp_syncookies)
758 return (0);
759 sc = syncookie_lookup(inc, th, *sop);
760 if (sc == NULL)
761 return (0);
762 sch = NULL;
763 tcpstat.tcps_sc_recvcookie++;
764 }
765
766 /*
767 * If seg contains an ACK, but not for our SYN/ACK, send a RST.
768 */
769 if (th->th_ack != sc->sc_iss + 1) {
770 if (sch == NULL)
771 syncache_free(sc);
772 return (0);
773 }
774
775 so = syncache_socket(sc, *sop, m);
776 if (so == NULL) {
777 #if 0
778 resetandabort:
779 /* XXXjlemon check this - is this correct? */
780 (void) tcp_respond(NULL, m, m, th,
781 th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
782 #endif
783 m_freem(m); /* XXX only needed for above */
784 tcpstat.tcps_sc_aborted++;
785 } else
786 tcpstat.tcps_sc_completed++;
787
788 if (sch == NULL)
789 syncache_free(sc);
790 else
791 syncache_drop(sc, sch);
792 *sop = so;
793 return (1);
794 }
795
796 /*
797 * Given a LISTEN socket and an inbound SYN request, add
798 * this to the syn cache, and send back a segment:
799 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
800 * to the source.
801 *
802 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
803 * Doing so would require that we hold onto the data and deliver it
804 * to the application. However, if we are the target of a SYN-flood
805 * DoS attack, an attacker could send data which would eventually
806 * consume all available buffer space if it were ACKed. By not ACKing
807 * the data, we avoid this DoS scenario.
808 */
809 int
810 syncache_add(inc, to, th, sop, m)
811 struct in_conninfo *inc;
812 struct tcpopt *to;
813 struct tcphdr *th;
814 struct socket **sop;
815 struct mbuf *m;
816 {
817 struct tcpcb *tp;
818 struct socket *so;
819 struct syncache *sc = NULL;
820 struct syncache_head *sch;
821 struct mbuf *ipopts = NULL;
822 u_int32_t flowtmp;
823 int i, win;
824
825 INP_INFO_WLOCK_ASSERT(&tcbinfo);
826
827 so = *sop;
828 tp = sototcpcb(so);
829
830 /*
831 * Remember the IP options, if any.
832 */
833 #ifdef INET6
834 if (!inc->inc_isipv6)
835 #endif
836 ipopts = ip_srcroute(m);
837
838 /*
839 * See if we already have an entry for this connection.
840 * If we do, resend the SYN,ACK, and reset the retransmit timer.
841 *
842 * XXX
843 * should the syncache be re-initialized with the contents
844 * of the new SYN here (which may have different options?)
845 */
846 sc = syncache_lookup(inc, &sch);
847 if (sc != NULL) {
848 tcpstat.tcps_sc_dupsyn++;
849 if (ipopts) {
850 /*
851 * If we were remembering a previous source route,
852 * forget it and use the new one we've been given.
853 */
854 if (sc->sc_ipopts)
855 (void) m_free(sc->sc_ipopts);
856 sc->sc_ipopts = ipopts;
857 }
858 /*
859 * Update timestamp if present.
860 */
861 if (sc->sc_flags & SCF_TIMESTAMP)
862 sc->sc_tsrecent = to->to_tsval;
863 /*
864 * PCB may have changed, pick up new values.
865 */
866 sc->sc_tp = tp;
867 sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
868 #ifdef TCPDEBUG
869 if (syncache_respond(sc, m, so) == 0) {
870 #else
871 if (syncache_respond(sc, m) == 0) {
872 #endif
873 /* NB: guarded by INP_INFO_WLOCK(&tcbinfo) */
874 TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot],
875 sc, sc_timerq);
876 SYNCACHE_TIMEOUT(sc, sc->sc_rxtslot);
877 tcpstat.tcps_sndacks++;
878 tcpstat.tcps_sndtotal++;
879 }
880 *sop = NULL;
881 return (1);
882 }
883
884 sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
885 if (sc == NULL) {
886 /*
887 * The zone allocator couldn't provide more entries.
888 * Treat this as if the cache was full; drop the oldest
889 * entry and insert the new one.
890 */
891 /* NB: guarded by INP_INFO_WLOCK(&tcbinfo) */
892 for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
893 sc = TAILQ_FIRST(&tcp_syncache.timerq[i]);
894 if (sc != NULL)
895 break;
896 }
897 sc->sc_tp->ts_recent = ticks;
898 syncache_drop(sc, NULL);
899 tcpstat.tcps_sc_zonefail++;
900 sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
901 if (sc == NULL) {
902 if (ipopts)
903 (void) m_free(ipopts);
904 return (0);
905 }
906 }
907
908 /*
909 * Fill in the syncache values.
910 */
911 sc->sc_tp = tp;
912 sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
913 sc->sc_ipopts = ipopts;
914 sc->sc_inc.inc_fport = inc->inc_fport;
915 sc->sc_inc.inc_lport = inc->inc_lport;
916 #ifdef INET6
917 sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
918 if (inc->inc_isipv6) {
919 sc->sc_inc.inc6_faddr = inc->inc6_faddr;
920 sc->sc_inc.inc6_laddr = inc->inc6_laddr;
921 } else
922 #endif
923 {
924 sc->sc_inc.inc_faddr = inc->inc_faddr;
925 sc->sc_inc.inc_laddr = inc->inc_laddr;
926 }
927 sc->sc_irs = th->th_seq;
928 sc->sc_flags = 0;
929 sc->sc_peer_mss = to->to_flags & TOF_MSS ? to->to_mss : 0;
930 sc->sc_flowlabel = 0;
931 if (tcp_syncookies) {
932 sc->sc_iss = syncookie_generate(sc, &flowtmp);
933 #ifdef INET6
934 if (inc->inc_isipv6 &&
935 (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)) {
936 sc->sc_flowlabel = flowtmp & IPV6_FLOWLABEL_MASK;
937 }
938 #endif
939 } else {
940 sc->sc_iss = arc4random();
941 #ifdef INET6
942 if (inc->inc_isipv6 &&
943 (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)) {
944 sc->sc_flowlabel =
945 (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
946 }
947 #endif
948 }
949
950 /* Initial receive window: clip sbspace to [0 .. TCP_MAXWIN] */
951 win = sbspace(&so->so_rcv);
952 win = imax(win, 0);
953 win = imin(win, TCP_MAXWIN);
954 sc->sc_wnd = win;
955
956 if (tcp_do_rfc1323) {
957 /*
958 * A timestamp received in a SYN makes
959 * it ok to send timestamp requests and replies.
960 */
961 if (to->to_flags & TOF_TS) {
962 sc->sc_tsrecent = to->to_tsval;
963 sc->sc_flags |= SCF_TIMESTAMP;
964 }
965 if (to->to_flags & TOF_SCALE) {
966 int wscale = 0;
967
968 /* Compute proper scaling value from buffer space */
969 while (wscale < TCP_MAX_WINSHIFT &&
970 (TCP_MAXWIN << wscale) < so->so_rcv.sb_hiwat)
971 wscale++;
972 sc->sc_request_r_scale = wscale;
973 sc->sc_requested_s_scale = to->to_requested_s_scale;
974 sc->sc_flags |= SCF_WINSCALE;
975 }
976 }
977 if (tp->t_flags & TF_NOOPT)
978 sc->sc_flags = SCF_NOOPT;
979 #ifdef TCP_SIGNATURE
980 /*
981 * If listening socket requested TCP digests, and received SYN
982 * contains the option, flag this in the syncache so that
983 * syncache_respond() will do the right thing with the SYN+ACK.
984 * XXX Currently we always record the option by default and will
985 * attempt to use it in syncache_respond().
986 */
987 if (to->to_flags & TOF_SIGNATURE)
988 sc->sc_flags |= SCF_SIGNATURE;
989 #endif
990
991 if (to->to_flags & TOF_SACK)
992 sc->sc_flags |= SCF_SACK;
993
994 /*
995 * Do a standard 3-way handshake.
996 */
997 #ifdef TCPDEBUG
998 if (syncache_respond(sc, m, so) == 0) {
999 #else
1000 if (syncache_respond(sc, m) == 0) {
1001 #endif
1002 syncache_insert(sc, sch);
1003 tcpstat.tcps_sndacks++;
1004 tcpstat.tcps_sndtotal++;
1005 } else {
1006 syncache_free(sc);
1007 tcpstat.tcps_sc_dropped++;
1008 }
1009 *sop = NULL;
1010 return (1);
1011 }
1012
1013 #ifdef TCPDEBUG
1014 static int
1015 syncache_respond(sc, m, so)
1016 struct syncache *sc;
1017 struct mbuf *m;
1018 struct socket *so;
1019 #else
1020 static int
1021 syncache_respond(sc, m)
1022 struct syncache *sc;
1023 struct mbuf *m;
1024 #endif
1025 {
1026 u_int8_t *optp;
1027 int optlen, error;
1028 u_int16_t tlen, hlen, mssopt;
1029 struct ip *ip = NULL;
1030 struct tcphdr *th;
1031 struct inpcb *inp;
1032 #ifdef INET6
1033 struct ip6_hdr *ip6 = NULL;
1034 #endif
1035
1036 hlen =
1037 #ifdef INET6
1038 (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) :
1039 #endif
1040 sizeof(struct ip);
1041
1042 KASSERT((&sc->sc_inc) != NULL, ("syncache_respond with NULL in_conninfo pointer"));
1043
1044 /* Determine MSS we advertize to other end of connection */
1045 mssopt = tcp_mssopt(&sc->sc_inc);
1046
1047 /* Compute the size of the TCP options. */
1048 if (sc->sc_flags & SCF_NOOPT) {
1049 optlen = 0;
1050 } else {
1051 optlen = TCPOLEN_MAXSEG +
1052 ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) +
1053 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
1054 #ifdef TCP_SIGNATURE
1055 if (sc->sc_flags & SCF_SIGNATURE)
1056 optlen += TCPOLEN_SIGNATURE;
1057 #endif
1058 if (sc->sc_flags & SCF_SACK)
1059 optlen += TCPOLEN_SACK_PERMITTED;
1060 optlen = roundup2(optlen, 4);
1061 }
1062 tlen = hlen + sizeof(struct tcphdr) + optlen;
1063
1064 /*
1065 * XXX
1066 * assume that the entire packet will fit in a header mbuf
1067 */
1068 KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small"));
1069
1070 /*
1071 * XXX shouldn't this reuse the mbuf if possible ?
1072 * Create the IP+TCP header from scratch.
1073 */
1074 if (m)
1075 m_freem(m);
1076
1077 m = m_gethdr(M_DONTWAIT, MT_HEADER);
1078 if (m == NULL)
1079 return (ENOBUFS);
1080 m->m_data += max_linkhdr;
1081 m->m_len = tlen;
1082 m->m_pkthdr.len = tlen;
1083 m->m_pkthdr.rcvif = NULL;
1084 inp = sc->sc_tp->t_inpcb;
1085 INP_LOCK(inp);
1086 #ifdef MAC
1087 mac_create_mbuf_from_inpcb(inp, m);
1088 #endif
1089
1090 #ifdef INET6
1091 if (sc->sc_inc.inc_isipv6) {
1092 ip6 = mtod(m, struct ip6_hdr *);
1093 ip6->ip6_vfc = IPV6_VERSION;
1094 ip6->ip6_nxt = IPPROTO_TCP;
1095 ip6->ip6_src = sc->sc_inc.inc6_laddr;
1096 ip6->ip6_dst = sc->sc_inc.inc6_faddr;
1097 ip6->ip6_plen = htons(tlen - hlen);
1098 /* ip6_hlim is set after checksum */
1099 ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
1100 ip6->ip6_flow |= sc->sc_flowlabel;
1101
1102 th = (struct tcphdr *)(ip6 + 1);
1103 } else
1104 #endif
1105 {
1106 ip = mtod(m, struct ip *);
1107 ip->ip_v = IPVERSION;
1108 ip->ip_hl = sizeof(struct ip) >> 2;
1109 ip->ip_len = tlen;
1110 ip->ip_id = 0;
1111 ip->ip_off = 0;
1112 ip->ip_sum = 0;
1113 ip->ip_p = IPPROTO_TCP;
1114 ip->ip_src = sc->sc_inc.inc_laddr;
1115 ip->ip_dst = sc->sc_inc.inc_faddr;
1116 ip->ip_ttl = inp->inp_ip_ttl; /* XXX */
1117 ip->ip_tos = inp->inp_ip_tos; /* XXX */
1118
1119 /*
1120 * See if we should do MTU discovery. Route lookups are
1121 * expensive, so we will only unset the DF bit if:
1122 *
1123 * 1) path_mtu_discovery is disabled
1124 * 2) the SCF_UNREACH flag has been set
1125 */
1126 if (path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
1127 ip->ip_off |= IP_DF;
1128
1129 th = (struct tcphdr *)(ip + 1);
1130 }
1131 th->th_sport = sc->sc_inc.inc_lport;
1132 th->th_dport = sc->sc_inc.inc_fport;
1133
1134 th->th_seq = htonl(sc->sc_iss);
1135 th->th_ack = htonl(sc->sc_irs + 1);
1136 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
1137 th->th_x2 = 0;
1138 th->th_flags = TH_SYN|TH_ACK;
1139 th->th_win = htons(sc->sc_wnd);
1140 th->th_urp = 0;
1141
1142 /* Tack on the TCP options. */
1143 if (optlen != 0) {
1144 optp = (u_int8_t *)(th + 1);
1145 *optp++ = TCPOPT_MAXSEG;
1146 *optp++ = TCPOLEN_MAXSEG;
1147 *optp++ = (mssopt >> 8) & 0xff;
1148 *optp++ = mssopt & 0xff;
1149
1150 if (sc->sc_flags & SCF_WINSCALE) {
1151 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
1152 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
1153 sc->sc_request_r_scale);
1154 optp += 4;
1155 }
1156
1157 if (sc->sc_flags & SCF_TIMESTAMP) {
1158 u_int32_t *lp = (u_int32_t *)(optp);
1159
1160 /* Form timestamp option per appendix A of RFC 1323. */
1161 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
1162 *lp++ = htonl(ticks);
1163 *lp = htonl(sc->sc_tsrecent);
1164 optp += TCPOLEN_TSTAMP_APPA;
1165 }
1166
1167 #ifdef TCP_SIGNATURE
1168 /*
1169 * Handle TCP-MD5 passive opener response.
1170 */
1171 if (sc->sc_flags & SCF_SIGNATURE) {
1172 u_int8_t *bp = optp;
1173 int i;
1174
1175 *bp++ = TCPOPT_SIGNATURE;
1176 *bp++ = TCPOLEN_SIGNATURE;
1177 for (i = 0; i < TCP_SIGLEN; i++)
1178 *bp++ = 0;
1179 tcp_signature_compute(m, sizeof(struct ip), 0, optlen,
1180 optp + 2, IPSEC_DIR_OUTBOUND);
1181 optp += TCPOLEN_SIGNATURE;
1182 }
1183 #endif /* TCP_SIGNATURE */
1184
1185 if (sc->sc_flags & SCF_SACK) {
1186 *optp++ = TCPOPT_SACK_PERMITTED;
1187 *optp++ = TCPOLEN_SACK_PERMITTED;
1188 }
1189
1190 {
1191 /* Pad TCP options to a 4 byte boundary */
1192 int padlen = optlen - (optp - (u_int8_t *)(th + 1));
1193 while (padlen-- > 0)
1194 *optp++ = TCPOPT_EOL;
1195 }
1196 }
1197
1198 #ifdef INET6
1199 if (sc->sc_inc.inc_isipv6) {
1200 th->th_sum = 0;
1201 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
1202 ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
1203 error = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
1204 } else
1205 #endif
1206 {
1207 th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1208 htons(tlen - hlen + IPPROTO_TCP));
1209 m->m_pkthdr.csum_flags = CSUM_TCP;
1210 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1211 #ifdef TCPDEBUG
1212 /*
1213 * Trace.
1214 */
1215 if (so != NULL && so->so_options & SO_DEBUG) {
1216 struct tcpcb *tp = sototcpcb(so);
1217 tcp_trace(TA_OUTPUT, tp->t_state, tp,
1218 mtod(m, void *), th, 0);
1219 }
1220 #endif
1221 error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, inp);
1222 }
1223 INP_UNLOCK(inp);
1224 return (error);
1225 }
1226
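/*
 * cookie layers (one 32-bit word, most significant bit on the left; the
 * three layers below are XORed together to form the cookie):
 *
 *	|. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .|
 *	| peer iss                                                      |
 *	| MD5(laddr,faddr,secret,lport,fport)             |. . . . . . .|
 *	|                        0                        |(A)|         |
 * (A): peer mss index
 * The five low-order bits of the last layer hold the tcp_secret index.
 */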
1236
1237 /*
1238 * The values below are chosen to minimize the size of the tcp_secret
1239 * table, as well as providing roughly a 16 second lifetime for the cookie.
1240 */
1241
/*
 * Syncookie tuning knobs.
 *
 * SYNCOOKIE_WNDBITS is the number of low-order cookie bits used to index
 * the tcp_secret table.  SYNCOOKIE_TIMESHIFT is the left shift applied to
 * ticks before dividing by hz when the current table slot is selected.
 */
#define SYNCOOKIE_WNDBITS	5
#define SYNCOOKIE_TIMESHIFT	1

/* Number of secret slots, and the mask that extracts a slot index. */
#define SYNCOOKIE_NSECRETS	(1 << SYNCOOKIE_WNDBITS)
#define SYNCOOKIE_WNDMASK	(SYNCOOKIE_NSECRETS - 1)

/* Lifetime of a secret slot in ticks; evaluated against hz at run time. */
#define SYNCOOKIE_TIMEOUT \
	(hz * (1 << SYNCOOKIE_WNDBITS) / (1 << SYNCOOKIE_TIMESHIFT))

/* Cookie bits that carry data: two MSS-index bits above the slot index. */
#define SYNCOOKIE_DATAMASK	(SYNCOOKIE_WNDMASK | (3 << SYNCOOKIE_WNDBITS))
1250
1251 static struct {
1252 u_int32_t ts_secbits[4];
1253 u_int ts_expire;
1254 } tcp_secret[SYNCOOKIE_NSECRETS];
1255
1256 static int tcp_msstab[] = { 0, 536, 1460, 8960 };
1257
1258 static MD5_CTX syn_ctx;
1259
/*
 * Feed the raw bytes of object `v' into the shared syn_ctx MD5 state.
 *
 * `v' must be an lvalue.  The argument is parenthesized so that passing a
 * non-lvalue expression (for example `a + b') is a compile error rather
 * than silently taking the address of its first operand.
 */
#define MD5Add(v)	MD5Update(&syn_ctx, (u_char *)&(v), sizeof(v))
1261
/*
 * Fixed-size record hashed as the final MD5 input of a syncookie.
 * For IPv6 connections the two address words are zero and the full
 * addresses are hashed separately before this record.
 */
struct md5_add {
	u_int32_t	laddr;		/* local IPv4 address, or 0 */
	u_int32_t	faddr;		/* foreign IPv4 address, or 0 */
	u_int32_t	secbits[4];	/* copy of the tcp_secret slot words */
	u_int16_t	lport;		/* local port */
	u_int16_t	fport;		/* foreign port */
};

/* The record is hashed as raw bytes, so it must not contain padding. */
#if defined(CTASSERT)
CTASSERT(sizeof(struct md5_add) == 28);
#endif
1271
1272 /*
1273 * Consider the problem of a recreated (and retransmitted) cookie. If the
1274 * original SYN was accepted, the connection is established. The second
1275 * SYN is inflight, and if it arrives with an ISN that falls within the
1276 * receive window, the connection is killed.
1277 *
1278 * However, since cookies have other problems, this may not be worth
1279 * worrying about.
1280 */
1281
1282 static u_int32_t
1283 syncookie_generate(struct syncache *sc, u_int32_t *flowid)
1284 {
1285 u_int32_t md5_buffer[4];
1286 u_int32_t data;
1287 int idx, i;
1288 struct md5_add add;
1289
1290 /* NB: single threaded; could add INP_INFO_WLOCK_ASSERT(&tcbinfo) */
1291
1292 idx = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK;
1293 if (tcp_secret[idx].ts_expire < ticks) {
1294 for (i = 0; i < 4; i++)
1295 tcp_secret[idx].ts_secbits[i] = arc4random();
1296 tcp_secret[idx].ts_expire = ticks + SYNCOOKIE_TIMEOUT;
1297 }
1298 for (data = sizeof(tcp_msstab) / sizeof(int) - 1; data > 0; data--)
1299 if (tcp_msstab[data] <= sc->sc_peer_mss)
1300 break;
1301 data = (data << SYNCOOKIE_WNDBITS) | idx;
1302 data ^= sc->sc_irs; /* peer's iss */
1303 MD5Init(&syn_ctx);
1304 #ifdef INET6
1305 if (sc->sc_inc.inc_isipv6) {
1306 MD5Add(sc->sc_inc.inc6_laddr);
1307 MD5Add(sc->sc_inc.inc6_faddr);
1308 add.laddr = 0;
1309 add.faddr = 0;
1310 } else
1311 #endif
1312 {
1313 add.laddr = sc->sc_inc.inc_laddr.s_addr;
1314 add.faddr = sc->sc_inc.inc_faddr.s_addr;
1315 }
1316 add.lport = sc->sc_inc.inc_lport;
1317 add.fport = sc->sc_inc.inc_fport;
1318 add.secbits[0] = tcp_secret[idx].ts_secbits[0];
1319 add.secbits[1] = tcp_secret[idx].ts_secbits[1];
1320 add.secbits[2] = tcp_secret[idx].ts_secbits[2];
1321 add.secbits[3] = tcp_secret[idx].ts_secbits[3];
1322 MD5Add(add);
1323 MD5Final((u_char *)&md5_buffer, &syn_ctx);
1324 data ^= (md5_buffer[0] & ~SYNCOOKIE_WNDMASK);
1325 *flowid = md5_buffer[1];
1326 tcpstat.tcps_sc_sendcookie++;
1327 return (data);
1328 }
1329
1330 static struct syncache *
1331 syncookie_lookup(inc, th, so)
1332 struct in_conninfo *inc;
1333 struct tcphdr *th;
1334 struct socket *so;
1335 {
1336 u_int32_t md5_buffer[4];
1337 struct syncache *sc;
1338 u_int32_t data;
1339 int wnd, idx;
1340 struct md5_add add;
1341
1342 /* NB: single threaded; could add INP_INFO_WLOCK_ASSERT(&tcbinfo) */
1343
1344 data = (th->th_ack - 1) ^ (th->th_seq - 1); /* remove ISS */
1345 idx = data & SYNCOOKIE_WNDMASK;
1346 if (tcp_secret[idx].ts_expire < ticks ||
1347 sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks)
1348 return (NULL);
1349 MD5Init(&syn_ctx);
1350 #ifdef INET6
1351 if (inc->inc_isipv6) {
1352 MD5Add(inc->inc6_laddr);
1353 MD5Add(inc->inc6_faddr);
1354 add.laddr = 0;
1355 add.faddr = 0;
1356 } else
1357 #endif
1358 {
1359 add.laddr = inc->inc_laddr.s_addr;
1360 add.faddr = inc->inc_faddr.s_addr;
1361 }
1362 add.lport = inc->inc_lport;
1363 add.fport = inc->inc_fport;
1364 add.secbits[0] = tcp_secret[idx].ts_secbits[0];
1365 add.secbits[1] = tcp_secret[idx].ts_secbits[1];
1366 add.secbits[2] = tcp_secret[idx].ts_secbits[2];
1367 add.secbits[3] = tcp_secret[idx].ts_secbits[3];
1368 MD5Add(add);
1369 MD5Final((u_char *)&md5_buffer, &syn_ctx);
1370 data ^= md5_buffer[0];
1371 if ((data & ~SYNCOOKIE_DATAMASK) != 0)
1372 return (NULL);
1373 data = data >> SYNCOOKIE_WNDBITS;
1374
1375 sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
1376 if (sc == NULL)
1377 return (NULL);
1378 /*
1379 * Fill in the syncache values.
1380 * XXX duplicate code from syncache_add
1381 */
1382 sc->sc_ipopts = NULL;
1383 sc->sc_inc.inc_fport = inc->inc_fport;
1384 sc->sc_inc.inc_lport = inc->inc_lport;
1385 sc->sc_tp = sototcpcb(so);
1386 #ifdef INET6
1387 sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
1388 if (inc->inc_isipv6) {
1389 sc->sc_inc.inc6_faddr = inc->inc6_faddr;
1390 sc->sc_inc.inc6_laddr = inc->inc6_laddr;
1391 if (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)
1392 sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
1393 } else
1394 #endif
1395 {
1396 sc->sc_inc.inc_faddr = inc->inc_faddr;
1397 sc->sc_inc.inc_laddr = inc->inc_laddr;
1398 }
1399 sc->sc_irs = th->th_seq - 1;
1400 sc->sc_iss = th->th_ack - 1;
1401 wnd = sbspace(&so->so_rcv);
1402 wnd = imax(wnd, 0);
1403 wnd = imin(wnd, TCP_MAXWIN);
1404 sc->sc_wnd = wnd;
1405 sc->sc_flags = 0;
1406 sc->sc_rxtslot = 0;
1407 sc->sc_peer_mss = tcp_msstab[data];
1408 return (sc);
1409 }
Cache object: b307cfb83624ea122e5319cde48caca0
|