1 /*-
2 * Copyright (c) 2001 McAfee, Inc.
3 * All rights reserved.
4 *
5 * This software was developed for the FreeBSD Project by Jonathan Lemon
6 * and McAfee Research, the Security Research Division of McAfee, Inc. under
7 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
8 * DARPA CHATS research program.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 * $FreeBSD$
32 */
33
34 #include "opt_inet.h"
35 #include "opt_inet6.h"
36 #include "opt_ipsec.h"
37 #include "opt_mac.h"
38 #include "opt_tcpdebug.h"
39 #include "opt_tcp_sack.h"
40
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/sysctl.h>
45 #include <sys/malloc.h>
46 #include <sys/mac.h>
47 #include <sys/mbuf.h>
48 #include <sys/md5.h>
49 #include <sys/proc.h> /* for proc0 declaration */
50 #include <sys/random.h>
51 #include <sys/socket.h>
52 #include <sys/socketvar.h>
53
54 #include <net/if.h>
55 #include <net/route.h>
56
57 #include <netinet/in.h>
58 #include <netinet/in_systm.h>
59 #include <netinet/ip.h>
60 #include <netinet/in_var.h>
61 #include <netinet/in_pcb.h>
62 #include <netinet/ip_var.h>
63 #ifdef INET6
64 #include <netinet/ip6.h>
65 #include <netinet/icmp6.h>
66 #include <netinet6/nd6.h>
67 #include <netinet6/ip6_var.h>
68 #include <netinet6/in6_pcb.h>
69 #endif
70 #include <netinet/tcp.h>
71 #ifdef TCPDEBUG
72 #include <netinet/tcpip.h>
73 #endif
74 #include <netinet/tcp_fsm.h>
75 #include <netinet/tcp_seq.h>
76 #include <netinet/tcp_timer.h>
77 #include <netinet/tcp_var.h>
78 #ifdef TCPDEBUG
79 #include <netinet/tcp_debug.h>
80 #endif
81 #ifdef INET6
82 #include <netinet6/tcp6_var.h>
83 #endif
84
85 #ifdef IPSEC
86 #include <netinet6/ipsec.h>
87 #ifdef INET6
88 #include <netinet6/ipsec6.h>
89 #endif
90 #endif /*IPSEC*/
91
92 #ifdef FAST_IPSEC
93 #include <netipsec/ipsec.h>
94 #ifdef INET6
95 #include <netipsec/ipsec6.h>
96 #endif
97 #include <netipsec/key.h>
98 #endif /*FAST_IPSEC*/
99
100 #include <machine/in_cksum.h>
101 #include <vm/uma.h>
102
103 static int tcp_syncookies = 1;
104 SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_RW,
105 &tcp_syncookies, 0,
106 "Use TCP SYN cookies if the syncache overflows");
107
108 static void syncache_drop(struct syncache *, struct syncache_head *);
109 static void syncache_free(struct syncache *);
110 static void syncache_insert(struct syncache *, struct syncache_head *);
111 struct syncache *syncache_lookup(struct in_conninfo *, struct syncache_head **);
112 #ifdef TCPDEBUG
113 static int syncache_respond(struct syncache *, struct mbuf *, struct socket *);
114 #else
115 static int syncache_respond(struct syncache *, struct mbuf *);
116 #endif
117 static struct socket *syncache_socket(struct syncache *, struct socket *,
118 struct mbuf *m);
119 static void syncache_timer(void *);
120 static u_int32_t syncookie_generate(struct syncache *, u_int32_t *);
121 static struct syncache *syncookie_lookup(struct in_conninfo *,
122 struct tcphdr *, struct socket *);
123
124 /*
125 * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies.
126  * 3 retransmits correspond to a timeout of (1 + 2 + 4 + 8 == 15) seconds;
127  * the odds are that the user has given up attempting to connect by then.
128 */
129 #define SYNCACHE_MAXREXMTS 3
130
131 /* Arbitrary values */
132 #define TCP_SYNCACHE_HASHSIZE 512
133 #define TCP_SYNCACHE_BUCKETLIMIT 30
134
135 struct tcp_syncache {
136 struct syncache_head *hashbase;
137 uma_zone_t zone;
138 u_int hashsize;
139 u_int hashmask;
140 u_int bucket_limit;
141 u_int cache_count;
142 u_int cache_limit;
143 u_int rexmt_limit;
144 u_int hash_secret;
145 TAILQ_HEAD(, syncache) timerq[SYNCACHE_MAXREXMTS + 1];
146 struct callout tt_timerq[SYNCACHE_MAXREXMTS + 1];
147 };
148 static struct tcp_syncache tcp_syncache;
149
150 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache");
151
152 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
153 &tcp_syncache.bucket_limit, 0, "Per-bucket hash limit for syncache");
154
155 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
156 &tcp_syncache.cache_limit, 0, "Overall entry limit for syncache");
157
158 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_RD,
159 &tcp_syncache.cache_count, 0, "Current number of entries in syncache");
160
161 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
162 &tcp_syncache.hashsize, 0, "Size of TCP syncache hashtable");
163
164 SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_RW,
165 &tcp_syncache.rexmt_limit, 0, "Limit on SYN/ACK retransmissions");
166
167 static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
168
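/*
 * Hash the connection endpoints into a bucket index.  A per-boot
 * random value (hash_secret, seeded from arc4random() in
 * syncache_init()) is folded in, presumably so that a remote sender
 * cannot predict which bucket its SYNs will land in and force early
 * evictions from a single bucket.
 */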
169 #define SYNCACHE_HASH(inc, mask) \
170 ((tcp_syncache.hash_secret ^ \
171 (inc)->inc_faddr.s_addr ^ \
172 ((inc)->inc_faddr.s_addr >> 16) ^ \
173 (inc)->inc_fport ^ (inc)->inc_lport) & mask)
174
175 #define SYNCACHE_HASH6(inc, mask) \
176 ((tcp_syncache.hash_secret ^ \
177 (inc)->inc6_faddr.s6_addr32[0] ^ \
178 (inc)->inc6_faddr.s6_addr32[3] ^ \
179 (inc)->inc_fport ^ (inc)->inc_lport) & mask)
180
181 #define ENDPTS_EQ(a, b) ( \
182 (a)->ie_fport == (b)->ie_fport && \
183 (a)->ie_lport == (b)->ie_lport && \
184 (a)->ie_faddr.s_addr == (b)->ie_faddr.s_addr && \
185 (a)->ie_laddr.s_addr == (b)->ie_laddr.s_addr \
186 )
187
188 #define ENDPTS6_EQ(a, b) (memcmp(a, b, sizeof(*a)) == 0)
189
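/*
 * SYNCACHE_TIMEOUT below moves an entry onto the timer queue for the
 * given retransmit slot.  With the standard exponential tcp_backoff[]
 * table, slot 0 fires after TCPTV_RTOBASE ticks, slot 1 after twice
 * that, and so on, matching the 1 + 2 + 4 + 8 second schedule
 * described above for the default SYNCACHE_MAXREXMTS of 3.
 */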
190 #define SYNCACHE_TIMEOUT(sc, slot) do { \
191 sc->sc_rxtslot = (slot); \
192 sc->sc_rxttime = ticks + TCPTV_RTOBASE * tcp_backoff[(slot)]; \
193 TAILQ_INSERT_TAIL(&tcp_syncache.timerq[(slot)], sc, sc_timerq); \
194 if (!callout_active(&tcp_syncache.tt_timerq[(slot)])) \
195 callout_reset(&tcp_syncache.tt_timerq[(slot)], \
196 TCPTV_RTOBASE * tcp_backoff[(slot)], \
197 syncache_timer, (void *)((intptr_t)(slot))); \
198 } while (0)
199
200 static void
201 syncache_free(struct syncache *sc)
202 {
203 if (sc->sc_ipopts)
204 (void) m_free(sc->sc_ipopts);
205
206 uma_zfree(tcp_syncache.zone, sc);
207 }
208
209 void
210 syncache_init(void)
211 {
212 int i;
213
214 tcp_syncache.cache_count = 0;
215 tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
216 tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT;
217 tcp_syncache.cache_limit =
218 tcp_syncache.hashsize * tcp_syncache.bucket_limit;
219 tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS;
220 tcp_syncache.hash_secret = arc4random();
221
222 TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize",
223 &tcp_syncache.hashsize);
224 TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit",
225 &tcp_syncache.cache_limit);
226 TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit",
227 &tcp_syncache.bucket_limit);
228 if (!powerof2(tcp_syncache.hashsize) || tcp_syncache.hashsize == 0) {
229 printf("WARNING: syncache hash size is not a power of 2.\n");
230 tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE;
231 }
232 tcp_syncache.hashmask = tcp_syncache.hashsize - 1;
233
234 /* Allocate the hash table. */
235 MALLOC(tcp_syncache.hashbase, struct syncache_head *,
236 tcp_syncache.hashsize * sizeof(struct syncache_head),
237 M_SYNCACHE, M_WAITOK);
238
239 /* Initialize the hash buckets. */
240 for (i = 0; i < tcp_syncache.hashsize; i++) {
241 TAILQ_INIT(&tcp_syncache.hashbase[i].sch_bucket);
242 tcp_syncache.hashbase[i].sch_length = 0;
243 }
244
245 /* Initialize the timer queues. */
246 for (i = 0; i <= SYNCACHE_MAXREXMTS; i++) {
247 TAILQ_INIT(&tcp_syncache.timerq[i]);
248 callout_init(&tcp_syncache.tt_timerq[i], NET_CALLOUT_MPSAFE);
249 }
250
251 /*
252 * Allocate the syncache entries. Allow the zone to allocate one
253 * more entry than cache limit, so a new entry can bump out an
254 * older one.
255 */
256 tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache),
257 NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
258 uma_zone_set_max(tcp_syncache.zone, tcp_syncache.cache_limit);
259 tcp_syncache.cache_limit -= 1;
260 }
261
262 static void
263 syncache_insert(sc, sch)
264 struct syncache *sc;
265 struct syncache_head *sch;
266 {
267 struct syncache *sc2;
268 int i;
269
270 INP_INFO_WLOCK_ASSERT(&tcbinfo);
271
272 /*
273 * Make sure that we don't overflow the per-bucket
274 * limit or the total cache size limit.
275 */
276 if (sch->sch_length >= tcp_syncache.bucket_limit) {
277 /*
278 * The bucket is full, toss the oldest element.
279 */
280 sc2 = TAILQ_FIRST(&sch->sch_bucket);
281 sc2->sc_tp->ts_recent = ticks;
282 syncache_drop(sc2, sch);
283 tcpstat.tcps_sc_bucketoverflow++;
284 } else if (tcp_syncache.cache_count >= tcp_syncache.cache_limit) {
285 /*
286 * The cache is full. Toss the oldest entry in the
287 * entire cache. This is the front entry in the
288 * first non-empty timer queue with the largest
289 * timeout value.
290 */
291 for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
292 sc2 = TAILQ_FIRST(&tcp_syncache.timerq[i]);
293 if (sc2 != NULL)
294 break;
295 }
296 sc2->sc_tp->ts_recent = ticks;
297 syncache_drop(sc2, NULL);
298 tcpstat.tcps_sc_cacheoverflow++;
299 }
300
301 /* Initialize the entry's timer. */
302 SYNCACHE_TIMEOUT(sc, 0);
303
304 /* Put it into the bucket. */
305 TAILQ_INSERT_TAIL(&sch->sch_bucket, sc, sc_hash);
306 sch->sch_length++;
307 tcp_syncache.cache_count++;
308 tcpstat.tcps_sc_added++;
309 }
310
311 static void
312 syncache_drop(sc, sch)
313 struct syncache *sc;
314 struct syncache_head *sch;
315 {
316 INP_INFO_WLOCK_ASSERT(&tcbinfo);
317
318 if (sch == NULL) {
319 #ifdef INET6
320 if (sc->sc_inc.inc_isipv6) {
321 sch = &tcp_syncache.hashbase[
322 SYNCACHE_HASH6(&sc->sc_inc, tcp_syncache.hashmask)];
323 } else
324 #endif
325 {
326 sch = &tcp_syncache.hashbase[
327 SYNCACHE_HASH(&sc->sc_inc, tcp_syncache.hashmask)];
328 }
329 }
330
331 TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash);
332 sch->sch_length--;
333 tcp_syncache.cache_count--;
334
335 TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot], sc, sc_timerq);
336 if (TAILQ_EMPTY(&tcp_syncache.timerq[sc->sc_rxtslot]))
337 callout_stop(&tcp_syncache.tt_timerq[sc->sc_rxtslot]);
338
339 syncache_free(sc);
340 }
341
342 /*
343 * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
344 * If we have retransmitted an entry the maximum number of times, expire it.
345 */
346 static void
347 syncache_timer(xslot)
348 void *xslot;
349 {
350 intptr_t slot = (intptr_t)xslot;
351 struct syncache *sc, *nsc;
352 struct inpcb *inp;
353
354 INP_INFO_WLOCK(&tcbinfo);
355 if (callout_pending(&tcp_syncache.tt_timerq[slot]) ||
356 !callout_active(&tcp_syncache.tt_timerq[slot])) {
357 /* XXX can this happen? */
358 INP_INFO_WUNLOCK(&tcbinfo);
359 return;
360 }
361 callout_deactivate(&tcp_syncache.tt_timerq[slot]);
362
363 nsc = TAILQ_FIRST(&tcp_syncache.timerq[slot]);
364 while (nsc != NULL) {
365 if (ticks < nsc->sc_rxttime)
366 break;
367 sc = nsc;
368 inp = sc->sc_tp->t_inpcb;
369 if (slot == SYNCACHE_MAXREXMTS ||
370 slot >= tcp_syncache.rexmt_limit ||
371 inp == NULL || inp->inp_gencnt != sc->sc_inp_gencnt) {
372 nsc = TAILQ_NEXT(sc, sc_timerq);
373 syncache_drop(sc, NULL);
374 tcpstat.tcps_sc_stale++;
375 continue;
376 }
377 /*
378 * syncache_respond() may call back into the syncache to
379 * modify another entry, so do not obtain the next
380 * entry on the timer chain until it has completed.
381 */
382 #ifdef TCPDEBUG
383 (void) syncache_respond(sc, NULL, NULL);
384 #else
385 (void) syncache_respond(sc, NULL);
386 #endif
387 nsc = TAILQ_NEXT(sc, sc_timerq);
388 tcpstat.tcps_sc_retransmitted++;
389 TAILQ_REMOVE(&tcp_syncache.timerq[slot], sc, sc_timerq);
390 SYNCACHE_TIMEOUT(sc, slot + 1);
391 }
392 if (nsc != NULL)
393 callout_reset(&tcp_syncache.tt_timerq[slot],
394 nsc->sc_rxttime - ticks, syncache_timer, (void *)(slot));
395 INP_INFO_WUNLOCK(&tcbinfo);
396 }
397
398 /*
399 * Find an entry in the syncache.
400 */
401 struct syncache *
402 syncache_lookup(inc, schp)
403 struct in_conninfo *inc;
404 struct syncache_head **schp;
405 {
406 struct syncache *sc;
407 struct syncache_head *sch;
408
409 INP_INFO_WLOCK_ASSERT(&tcbinfo);
410
411 #ifdef INET6
412 if (inc->inc_isipv6) {
413 sch = &tcp_syncache.hashbase[
414 SYNCACHE_HASH6(inc, tcp_syncache.hashmask)];
415 *schp = sch;
416 TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
417 if (ENDPTS6_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
418 return (sc);
419 }
420 } else
421 #endif
422 {
423 sch = &tcp_syncache.hashbase[
424 SYNCACHE_HASH(inc, tcp_syncache.hashmask)];
425 *schp = sch;
426 TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) {
427 #ifdef INET6
428 if (sc->sc_inc.inc_isipv6)
429 continue;
430 #endif
431 if (ENDPTS_EQ(&inc->inc_ie, &sc->sc_inc.inc_ie))
432 return (sc);
433 }
434 }
435 return (NULL);
436 }
437
438 /*
439 * This function is called when we get a RST for a
440 * non-existent connection, so that we can see if the
441 * connection is in the syn cache. If it is, zap it.
442 */
443 void
444 syncache_chkrst(inc, th)
445 struct in_conninfo *inc;
446 struct tcphdr *th;
447 {
448 struct syncache *sc;
449 struct syncache_head *sch;
450
451 INP_INFO_WLOCK_ASSERT(&tcbinfo);
452
453 sc = syncache_lookup(inc, &sch);
454 if (sc == NULL)
455 return;
456 /*
457 * If the RST bit is set, check the sequence number to see
458 * if this is a valid reset segment.
459 * RFC 793 page 37:
460 * In all states except SYN-SENT, all reset (RST) segments
461 * are validated by checking their SEQ-fields. A reset is
462 * valid if its sequence number is in the window.
463 *
464 * The sequence number in the reset segment is normally an
465 * echo of our outgoing acknowledgement numbers, but some hosts
466 * send a reset with the sequence number at the rightmost edge
467 * of our receive window, and we have to handle this case.
468 */
469 if (SEQ_GEQ(th->th_seq, sc->sc_irs) &&
470 SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) {
471 syncache_drop(sc, sch);
472 tcpstat.tcps_sc_reset++;
473 }
474 }
475
476 void
477 syncache_badack(inc)
478 struct in_conninfo *inc;
479 {
480 struct syncache *sc;
481 struct syncache_head *sch;
482
483 INP_INFO_WLOCK_ASSERT(&tcbinfo);
484
485 sc = syncache_lookup(inc, &sch);
486 if (sc != NULL) {
487 syncache_drop(sc, sch);
488 tcpstat.tcps_sc_badack++;
489 }
490 }
491
492 void
493 syncache_unreach(inc, th)
494 struct in_conninfo *inc;
495 struct tcphdr *th;
496 {
497 struct syncache *sc;
498 struct syncache_head *sch;
499
500 INP_INFO_WLOCK_ASSERT(&tcbinfo);
501
502 sc = syncache_lookup(inc, &sch);
503 if (sc == NULL)
504 return;
505
506 /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
507 if (ntohl(th->th_seq) != sc->sc_iss)
508 return;
509
510 /*
511 * If we've retransmitted 3 times and this is our second error,
512 * we remove the entry. Otherwise, we allow it to continue on.
513 * This prevents us from incorrectly nuking an entry during a
514 * spurious network outage.
515 *
516 * See tcp_notify().
517 */
518 if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtslot < 3) {
519 sc->sc_flags |= SCF_UNREACH;
520 return;
521 }
522 syncache_drop(sc, sch);
523 tcpstat.tcps_sc_unreach++;
524 }
525
526 /*
527 * Build a new TCP socket structure from a syncache entry.
528 */
529 static struct socket *
530 syncache_socket(sc, lso, m)
531 struct syncache *sc;
532 struct socket *lso;
533 struct mbuf *m;
534 {
535 struct inpcb *inp = NULL;
536 struct socket *so;
537 struct tcpcb *tp;
538
539 NET_ASSERT_GIANT();
540 INP_INFO_WLOCK_ASSERT(&tcbinfo);
541
542 /*
543 * Ok, create the full blown connection, and set things up
544 * as they would have been set up if we had created the
545 * connection when the SYN arrived. If we can't create
546 * the connection, abort it.
547 */
548 so = sonewconn(lso, SS_ISCONNECTED);
549 if (so == NULL) {
550 /*
551 * Drop the connection; we will send a RST if the peer
552 * retransmits the ACK.
553 */
554 tcpstat.tcps_listendrop++;
555 goto abort2;
556 }
557 #ifdef MAC
558 SOCK_LOCK(so);
559 mac_set_socket_peer_from_mbuf(m, so);
560 SOCK_UNLOCK(so);
561 #endif
562
563 inp = sotoinpcb(so);
564 INP_LOCK(inp);
565
566 /*
567 * Insert new socket into hash list.
568 */
569 inp->inp_inc.inc_isipv6 = sc->sc_inc.inc_isipv6;
570 #ifdef INET6
571 if (sc->sc_inc.inc_isipv6) {
572 inp->in6p_laddr = sc->sc_inc.inc6_laddr;
573 } else {
574 inp->inp_vflag &= ~INP_IPV6;
575 inp->inp_vflag |= INP_IPV4;
576 #endif
577 inp->inp_laddr = sc->sc_inc.inc_laddr;
578 #ifdef INET6
579 }
580 #endif
581 inp->inp_lport = sc->sc_inc.inc_lport;
582 if (in_pcbinshash(inp) != 0) {
583 /*
584 * Undo the assignments above if we failed to
585 * put the PCB on the hash lists.
586 */
587 #ifdef INET6
588 if (sc->sc_inc.inc_isipv6)
589 inp->in6p_laddr = in6addr_any;
590 else
591 #endif
592 inp->inp_laddr.s_addr = INADDR_ANY;
593 inp->inp_lport = 0;
594 goto abort;
595 }
596 #ifdef IPSEC
597 /* copy old policy into new socket's */
598 if (ipsec_copy_pcbpolicy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
599 printf("syncache_expand: could not copy policy\n");
600 #endif
601 #ifdef FAST_IPSEC
602 /* copy old policy into new socket's */
603 if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp))
604 printf("syncache_expand: could not copy policy\n");
605 #endif
606 #ifdef INET6
607 if (sc->sc_inc.inc_isipv6) {
608 struct inpcb *oinp = sotoinpcb(lso);
609 struct in6_addr laddr6;
610 struct sockaddr_in6 sin6;
611 /*
612 * Inherit socket options from the listening socket.
613 * Note that in6p_inputopts is not (and should not be)
614 * copied, since it stores previously received options and is
615 * used to detect if each new option is different from the
616 * previous one and hence should be passed to a user.
617 * If we copied in6p_inputopts, a user would not be able to
618 * receive options just after calling the accept system call.
619 */
620 inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS;
621 if (oinp->in6p_outputopts)
622 inp->in6p_outputopts =
623 ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
624
625 sin6.sin6_family = AF_INET6;
626 sin6.sin6_len = sizeof(sin6);
627 sin6.sin6_addr = sc->sc_inc.inc6_faddr;
628 sin6.sin6_port = sc->sc_inc.inc_fport;
629 sin6.sin6_flowinfo = sin6.sin6_scope_id = 0;
630 laddr6 = inp->in6p_laddr;
631 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
632 inp->in6p_laddr = sc->sc_inc.inc6_laddr;
633 if (in6_pcbconnect(inp, (struct sockaddr *)&sin6,
634 thread0.td_ucred)) {
635 inp->in6p_laddr = laddr6;
636 goto abort;
637 }
638 /* Override flowlabel from in6_pcbconnect. */
639 inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
640 inp->in6p_flowinfo |= sc->sc_flowlabel;
641 } else
642 #endif
643 {
644 struct in_addr laddr;
645 struct sockaddr_in sin;
646
647 inp->inp_options = ip_srcroute(m);
648 if (inp->inp_options == NULL) {
649 inp->inp_options = sc->sc_ipopts;
650 sc->sc_ipopts = NULL;
651 }
652
653 sin.sin_family = AF_INET;
654 sin.sin_len = sizeof(sin);
655 sin.sin_addr = sc->sc_inc.inc_faddr;
656 sin.sin_port = sc->sc_inc.inc_fport;
657 bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero));
658 laddr = inp->inp_laddr;
659 if (inp->inp_laddr.s_addr == INADDR_ANY)
660 inp->inp_laddr = sc->sc_inc.inc_laddr;
661 if (in_pcbconnect(inp, (struct sockaddr *)&sin,
662 thread0.td_ucred)) {
663 inp->inp_laddr = laddr;
664 goto abort;
665 }
666 }
667
668 tp = intotcpcb(inp);
669 tp->t_state = TCPS_SYN_RECEIVED;
670 tp->iss = sc->sc_iss;
671 tp->irs = sc->sc_irs;
672 tcp_rcvseqinit(tp);
673 tcp_sendseqinit(tp);
674 tp->snd_wl1 = sc->sc_irs;
675 tp->rcv_up = sc->sc_irs + 1;
676 tp->rcv_wnd = sc->sc_wnd;
677 tp->rcv_adv += tp->rcv_wnd;
678
679 tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY);
680 if (sc->sc_flags & SCF_NOOPT)
681 tp->t_flags |= TF_NOOPT;
682 if (sc->sc_flags & SCF_WINSCALE) {
683 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
684 tp->requested_s_scale = sc->sc_requested_s_scale;
685 tp->request_r_scale = sc->sc_request_r_scale;
686 }
687 if (sc->sc_flags & SCF_TIMESTAMP) {
688 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
689 tp->ts_recent = sc->sc_tsrecent;
690 tp->ts_recent_age = ticks;
691 }
692 #ifdef TCP_SIGNATURE
693 if (sc->sc_flags & SCF_SIGNATURE)
694 tp->t_flags |= TF_SIGNATURE;
695 #endif
696 if (sc->sc_flags & SCF_SACK) {
697 tp->sack_enable = 1;
698 tp->t_flags |= TF_SACK_PERMIT;
699 }
700 /*
701 * Set up MSS and get cached values from tcp_hostcache.
702 * This might overwrite some of the defaults we just set.
703 */
704 tcp_mss(tp, sc->sc_peer_mss);
705
706 /*
707 * If the SYN,ACK was retransmitted, reset cwnd to 1 segment.
708 */
709 if (sc->sc_rxtslot != 0)
710 tp->snd_cwnd = tp->t_maxseg;
711 callout_reset(tp->tt_keep, tcp_keepinit, tcp_timer_keep, tp);
712
713 INP_UNLOCK(inp);
714
715 tcpstat.tcps_accepts++;
716 return (so);
717
718 abort:
719 INP_UNLOCK(inp);
720 abort2:
721 if (so != NULL)
722 (void) soabort(so);
723 return (NULL);
724 }
725
726 /*
727 * This function gets called when we receive an ACK for a
728 * socket in the LISTEN state. We look up the connection
729 * in the syncache, and if it's there, we pull it out of
730 * the cache and turn it into a full-blown connection in
731 * the SYN-RECEIVED state.
732 */
733 int
734 syncache_expand(inc, th, sop, m)
735 struct in_conninfo *inc;
736 struct tcphdr *th;
737 struct socket **sop;
738 struct mbuf *m;
739 {
740 struct syncache *sc;
741 struct syncache_head *sch;
742 struct socket *so;
743
744 INP_INFO_WLOCK_ASSERT(&tcbinfo);
745
746 sc = syncache_lookup(inc, &sch);
747 if (sc == NULL) {
748 /*
749 * There is no syncache entry, so see if this ACK is
750 * a returning syncookie. To do this, first:
751 * A. See if this socket has had a syncache entry dropped in
752 * the past. We don't want to accept a bogus syncookie
753 * if we've never received a SYN.
754 * B. Check that the syncookie is valid. If it is, then
755 * cobble up a fake syncache entry, and return.
756 */
757 if (!tcp_syncookies)
758 return (0);
759 sc = syncookie_lookup(inc, th, *sop);
760 if (sc == NULL)
761 return (0);
762 sch = NULL;
763 tcpstat.tcps_sc_recvcookie++;
764 }
765
766 /*
767 * If seg contains an ACK, but not for our SYN/ACK, send a RST.
768 */
769 if (th->th_ack != sc->sc_iss + 1) {
770 if (sch == NULL)
771 syncache_free(sc);
772 return (0);
773 }
774
775 so = syncache_socket(sc, *sop, m);
776 if (so == NULL) {
777 #if 0
778 resetandabort:
779 /* XXXjlemon check this - is this correct? */
780 (void) tcp_respond(NULL, m, m, th,
781 th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
782 #endif
783 m_freem(m); /* XXX only needed for above */
784 tcpstat.tcps_sc_aborted++;
785 } else
786 tcpstat.tcps_sc_completed++;
787
788 if (sch == NULL)
789 syncache_free(sc);
790 else
791 syncache_drop(sc, sch);
792 *sop = so;
793 return (1);
794 }
795
796 /*
797 * Given a LISTEN socket and an inbound SYN request, add
798 * this to the syn cache, and send back a segment:
799 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
800 * to the source.
801 *
802 * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
803 * Doing so would require that we hold onto the data and deliver it
804 * to the application. However, if we are the target of a SYN-flood
805 * DoS attack, an attacker could send data which would eventually
806 * consume all available buffer space if it were ACKed. By not ACKing
807 * the data, we avoid this DoS scenario.
808 */
809 int
810 syncache_add(inc, to, th, sop, m)
811 struct in_conninfo *inc;
812 struct tcpopt *to;
813 struct tcphdr *th;
814 struct socket **sop;
815 struct mbuf *m;
816 {
817 struct tcpcb *tp;
818 struct socket *so;
819 struct syncache *sc = NULL;
820 struct syncache_head *sch;
821 struct mbuf *ipopts = NULL;
822 u_int32_t flowtmp;
823 int i, win;
824
825 INP_INFO_WLOCK_ASSERT(&tcbinfo);
826
827 so = *sop;
828 tp = sototcpcb(so);
829
830 /*
831 * Remember the IP options, if any.
832 */
833 #ifdef INET6
834 if (!inc->inc_isipv6)
835 #endif
836 ipopts = ip_srcroute(m);
837
838 /*
839 * See if we already have an entry for this connection.
840 * If we do, resend the SYN,ACK, and reset the retransmit timer.
841 *
842 * XXX
843 * should the syncache entry be re-initialized with the contents
844 * of the new SYN here (which may have different options)?
845 */
846 sc = syncache_lookup(inc, &sch);
847 if (sc != NULL) {
848 tcpstat.tcps_sc_dupsyn++;
849 if (ipopts) {
850 /*
851 * If we were remembering a previous source route,
852 * forget it and use the new one we've been given.
853 */
854 if (sc->sc_ipopts)
855 (void) m_free(sc->sc_ipopts);
856 sc->sc_ipopts = ipopts;
857 }
858 /*
859 * Update timestamp if present.
860 */
861 if (sc->sc_flags & SCF_TIMESTAMP)
862 sc->sc_tsrecent = to->to_tsval;
863 /*
864 * PCB may have changed, pick up new values.
865 */
866 sc->sc_tp = tp;
867 sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
868 #ifdef TCPDEBUG
869 if (syncache_respond(sc, m, so) == 0) {
870 #else
871 if (syncache_respond(sc, m) == 0) {
872 #endif
873 /* NB: guarded by INP_INFO_WLOCK(&tcbinfo) */
874 TAILQ_REMOVE(&tcp_syncache.timerq[sc->sc_rxtslot],
875 sc, sc_timerq);
876 SYNCACHE_TIMEOUT(sc, sc->sc_rxtslot);
877 tcpstat.tcps_sndacks++;
878 tcpstat.tcps_sndtotal++;
879 }
880 *sop = NULL;
881 return (1);
882 }
883
884 sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
885 if (sc == NULL) {
886 /*
887 * The zone allocator couldn't provide more entries.
888 * Treat this as if the cache was full; drop the oldest
889 * entry and insert the new one.
890 */
891 /* NB: guarded by INP_INFO_WLOCK(&tcbinfo) */
892 for (i = SYNCACHE_MAXREXMTS; i >= 0; i--) {
893 sc = TAILQ_FIRST(&tcp_syncache.timerq[i]);
894 if (sc != NULL) {
895 sc->sc_tp->ts_recent = ticks;
896 syncache_drop(sc, NULL);
897 tcpstat.tcps_sc_zonefail++;
898 sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT |
899 M_ZERO);
900 break;
901 }
902 }
903 if (sc == NULL) {
904 if (ipopts)
905 (void) m_free(ipopts);
906 return (0);
907 }
908 }
909
910 /*
911 * Fill in the syncache values.
912 */
913 sc->sc_tp = tp;
914 sc->sc_inp_gencnt = tp->t_inpcb->inp_gencnt;
915 sc->sc_ipopts = ipopts;
916 sc->sc_inc.inc_fport = inc->inc_fport;
917 sc->sc_inc.inc_lport = inc->inc_lport;
918 #ifdef INET6
919 sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
920 if (inc->inc_isipv6) {
921 sc->sc_inc.inc6_faddr = inc->inc6_faddr;
922 sc->sc_inc.inc6_laddr = inc->inc6_laddr;
923 } else
924 #endif
925 {
926 sc->sc_inc.inc_faddr = inc->inc_faddr;
927 sc->sc_inc.inc_laddr = inc->inc_laddr;
928 }
929 sc->sc_irs = th->th_seq;
930 sc->sc_flags = 0;
931 sc->sc_peer_mss = to->to_flags & TOF_MSS ? to->to_mss : 0;
932 sc->sc_flowlabel = 0;
933 if (tcp_syncookies) {
934 sc->sc_iss = syncookie_generate(sc, &flowtmp);
935 #ifdef INET6
936 if (inc->inc_isipv6 &&
937 (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)) {
938 sc->sc_flowlabel = flowtmp & IPV6_FLOWLABEL_MASK;
939 }
940 #endif
941 } else {
942 sc->sc_iss = arc4random();
943 #ifdef INET6
944 if (inc->inc_isipv6 &&
945 (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)) {
946 sc->sc_flowlabel =
947 (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
948 }
949 #endif
950 }
951
952 /* Initial receive window: clip sbspace to [0 .. TCP_MAXWIN] */
953 win = sbspace(&so->so_rcv);
954 win = imax(win, 0);
955 win = imin(win, TCP_MAXWIN);
956 sc->sc_wnd = win;
957
958 if (tcp_do_rfc1323) {
959 /*
960 * A timestamp received in a SYN makes
961 * it ok to send timestamp requests and replies.
962 */
963 if (to->to_flags & TOF_TS) {
964 sc->sc_tsrecent = to->to_tsval;
965 sc->sc_flags |= SCF_TIMESTAMP;
966 }
967 if (to->to_flags & TOF_SCALE) {
968 int wscale = 0;
969
970 /* Compute proper scaling value from buffer space */
971 while (wscale < TCP_MAX_WINSHIFT &&
972 (TCP_MAXWIN << wscale) < so->so_rcv.sb_hiwat)
973 wscale++;
974 sc->sc_request_r_scale = wscale;
975 sc->sc_requested_s_scale = to->to_requested_s_scale;
976 sc->sc_flags |= SCF_WINSCALE;
977 }
978 }
979 if (tp->t_flags & TF_NOOPT)
980 sc->sc_flags = SCF_NOOPT;
981 #ifdef TCP_SIGNATURE
982 /*
983 * If listening socket requested TCP digests, and received SYN
984 * contains the option, flag this in the syncache so that
985 * syncache_respond() will do the right thing with the SYN+ACK.
986 * XXX Currently we always record the option by default and will
987 * attempt to use it in syncache_respond().
988 */
989 if (to->to_flags & TOF_SIGNATURE)
990 sc->sc_flags |= SCF_SIGNATURE;
991 #endif
992
993 if (to->to_flags & TOF_SACK)
994 sc->sc_flags |= SCF_SACK;
995
996 /*
997 * Do a standard 3-way handshake.
998 */
999 #ifdef TCPDEBUG
1000 if (syncache_respond(sc, m, so) == 0) {
1001 #else
1002 if (syncache_respond(sc, m) == 0) {
1003 #endif
1004 syncache_insert(sc, sch);
1005 tcpstat.tcps_sndacks++;
1006 tcpstat.tcps_sndtotal++;
1007 } else {
1008 syncache_free(sc);
1009 tcpstat.tcps_sc_dropped++;
1010 }
1011 *sop = NULL;
1012 return (1);
1013 }
1014
1015 #ifdef TCPDEBUG
1016 static int
1017 syncache_respond(sc, m, so)
1018 struct syncache *sc;
1019 struct mbuf *m;
1020 struct socket *so;
1021 #else
1022 static int
1023 syncache_respond(sc, m)
1024 struct syncache *sc;
1025 struct mbuf *m;
1026 #endif
1027 {
1028 u_int8_t *optp;
1029 int optlen, error;
1030 u_int16_t tlen, hlen, mssopt;
1031 struct ip *ip = NULL;
1032 struct tcphdr *th;
1033 struct inpcb *inp;
1034 #ifdef INET6
1035 struct ip6_hdr *ip6 = NULL;
1036 #endif
1037
1038 hlen =
1039 #ifdef INET6
1040 (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) :
1041 #endif
1042 sizeof(struct ip);
1043
1044 KASSERT(sc != NULL, ("syncache_respond with NULL syncache pointer"));
1045
1046 /* Determine MSS we advertise to the other end of the connection */
1047 mssopt = tcp_mssopt(&sc->sc_inc);
1048
1049 /* Compute the size of the TCP options. */
1050 if (sc->sc_flags & SCF_NOOPT) {
1051 optlen = 0;
1052 } else {
1053 optlen = TCPOLEN_MAXSEG +
1054 ((sc->sc_flags & SCF_WINSCALE) ? 4 : 0) +
1055 ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
1056 #ifdef TCP_SIGNATURE
1057 if (sc->sc_flags & SCF_SIGNATURE)
1058 optlen += TCPOLEN_SIGNATURE;
1059 #endif
1060 if (sc->sc_flags & SCF_SACK)
1061 optlen += TCPOLEN_SACK_PERMITTED;
1062 optlen = roundup2(optlen, 4);
1063 }
1064 tlen = hlen + sizeof(struct tcphdr) + optlen;
1065
1066 /*
1067 * XXX
1068 * assume that the entire packet will fit in a header mbuf
1069 */
1070 KASSERT(max_linkhdr + tlen <= MHLEN, ("syncache: mbuf too small"));
1071
1072 /*
1073 * XXX shouldn't this reuse the mbuf if possible?
1074 * Create the IP+TCP header from scratch.
1075 */
1076 if (m)
1077 m_freem(m);
1078
1079 m = m_gethdr(M_DONTWAIT, MT_HEADER);
1080 if (m == NULL)
1081 return (ENOBUFS);
1082 m->m_data += max_linkhdr;
1083 m->m_len = tlen;
1084 m->m_pkthdr.len = tlen;
1085 m->m_pkthdr.rcvif = NULL;
1086 inp = sc->sc_tp->t_inpcb;
1087 INP_LOCK(inp);
1088 #ifdef MAC
1089 mac_create_mbuf_from_inpcb(inp, m);
1090 #endif
1091
1092 #ifdef INET6
1093 if (sc->sc_inc.inc_isipv6) {
1094 ip6 = mtod(m, struct ip6_hdr *);
1095 ip6->ip6_vfc = IPV6_VERSION;
1096 ip6->ip6_nxt = IPPROTO_TCP;
1097 ip6->ip6_src = sc->sc_inc.inc6_laddr;
1098 ip6->ip6_dst = sc->sc_inc.inc6_faddr;
1099 ip6->ip6_plen = htons(tlen - hlen);
1100 /* ip6_hlim is set after checksum */
1101 ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
1102 ip6->ip6_flow |= sc->sc_flowlabel;
1103
1104 th = (struct tcphdr *)(ip6 + 1);
1105 } else
1106 #endif
1107 {
1108 ip = mtod(m, struct ip *);
1109 ip->ip_v = IPVERSION;
1110 ip->ip_hl = sizeof(struct ip) >> 2;
1111 ip->ip_len = tlen;
1112 ip->ip_id = 0;
1113 ip->ip_off = 0;
1114 ip->ip_sum = 0;
1115 ip->ip_p = IPPROTO_TCP;
1116 ip->ip_src = sc->sc_inc.inc_laddr;
1117 ip->ip_dst = sc->sc_inc.inc_faddr;
1118 ip->ip_ttl = inp->inp_ip_ttl; /* XXX */
1119 ip->ip_tos = inp->inp_ip_tos; /* XXX */
1120
1121 /*
1122 * See if we should do MTU discovery. Route lookups are
1123 * expensive, so we will only unset the DF bit if:
1124 *
1125 * 1) path_mtu_discovery is disabled
1126 * 2) the SCF_UNREACH flag has been set
1127 */
1128 if (path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0))
1129 ip->ip_off |= IP_DF;
1130
1131 th = (struct tcphdr *)(ip + 1);
1132 }
1133 th->th_sport = sc->sc_inc.inc_lport;
1134 th->th_dport = sc->sc_inc.inc_fport;
1135
1136 th->th_seq = htonl(sc->sc_iss);
1137 th->th_ack = htonl(sc->sc_irs + 1);
1138 th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
1139 th->th_x2 = 0;
1140 th->th_flags = TH_SYN|TH_ACK;
1141 th->th_win = htons(sc->sc_wnd);
1142 th->th_urp = 0;
1143
1144 /* Tack on the TCP options. */
1145 if (optlen != 0) {
1146 optp = (u_int8_t *)(th + 1);
1147 *optp++ = TCPOPT_MAXSEG;
1148 *optp++ = TCPOLEN_MAXSEG;
1149 *optp++ = (mssopt >> 8) & 0xff;
1150 *optp++ = mssopt & 0xff;
1151
1152 if (sc->sc_flags & SCF_WINSCALE) {
1153 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
1154 TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
1155 sc->sc_request_r_scale);
1156 optp += 4;
1157 }
1158
1159 if (sc->sc_flags & SCF_TIMESTAMP) {
1160 u_int32_t *lp = (u_int32_t *)(optp);
1161
1162 /* Form timestamp option per appendix A of RFC 1323. */
1163 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
1164 *lp++ = htonl(ticks);
1165 *lp = htonl(sc->sc_tsrecent);
1166 optp += TCPOLEN_TSTAMP_APPA;
1167 }
1168
1169 #ifdef TCP_SIGNATURE
1170 /*
1171 * Handle TCP-MD5 passive opener response.
1172 */
1173 if (sc->sc_flags & SCF_SIGNATURE) {
1174 u_int8_t *bp = optp;
1175 int i;
1176
1177 *bp++ = TCPOPT_SIGNATURE;
1178 *bp++ = TCPOLEN_SIGNATURE;
1179 for (i = 0; i < TCP_SIGLEN; i++)
1180 *bp++ = 0;
1181 tcp_signature_compute(m, sizeof(struct ip), 0, optlen,
1182 optp + 2, IPSEC_DIR_OUTBOUND);
1183 optp += TCPOLEN_SIGNATURE;
1184 }
1185 #endif /* TCP_SIGNATURE */
1186
1187 if (sc->sc_flags & SCF_SACK) {
1188 *optp++ = TCPOPT_SACK_PERMITTED;
1189 *optp++ = TCPOLEN_SACK_PERMITTED;
1190 }
1191
1192 {
1193 /* Pad TCP options to a 4 byte boundary */
1194 int padlen = optlen - (optp - (u_int8_t *)(th + 1));
1195 while (padlen-- > 0)
1196 *optp++ = TCPOPT_EOL;
1197 }
1198 }
1199
1200 #ifdef INET6
1201 if (sc->sc_inc.inc_isipv6) {
1202 th->th_sum = 0;
1203 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
1204 ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
1205 error = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
1206 } else
1207 #endif
1208 {
1209 th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
1210 htons(tlen - hlen + IPPROTO_TCP));
1211 m->m_pkthdr.csum_flags = CSUM_TCP;
1212 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
1213 #ifdef TCPDEBUG
1214 /*
1215 * Trace.
1216 */
1217 if (so != NULL && so->so_options & SO_DEBUG) {
1218 struct tcpcb *tp = sototcpcb(so);
1219 tcp_trace(TA_OUTPUT, tp->t_state, tp,
1220 mtod(m, void *), th, 0);
1221 }
1222 #endif
1223 error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, inp);
1224 }
1225 INP_UNLOCK(inp);
1226 return (error);
1227 }
1228
1229 /*
1230 * cookie layers:
1231 *
1232 * |. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .|
1233 * | peer iss |
1234 * | MD5(laddr,faddr,secret,lport,fport) |. . . . . . .|
1235 * | 0 |(A)| |
1236 * (A): peer mss index
1237 */
1238
1239 /*
1240 * The values below are chosen to minimize the size of the tcp_secret
1241 * table, as well as providing roughly a 16 second lifetime for the cookie.
1242 */
1243
1244 #define SYNCOOKIE_WNDBITS 5 /* exposed bits for window indexing */
1245 #define SYNCOOKIE_TIMESHIFT 1 /* scale ticks to window time units */
1246
1247 #define SYNCOOKIE_WNDMASK ((1 << SYNCOOKIE_WNDBITS) - 1)
1248 #define SYNCOOKIE_NSECRETS (1 << SYNCOOKIE_WNDBITS)
1249 #define SYNCOOKIE_TIMEOUT \
1250 (hz * (1 << SYNCOOKIE_WNDBITS) / (1 << SYNCOOKIE_TIMESHIFT))
1251 #define SYNCOOKIE_DATAMASK ((3 << SYNCOOKIE_WNDBITS) | SYNCOOKIE_WNDMASK)
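/*
 * With SYNCOOKIE_WNDBITS == 5 and SYNCOOKIE_TIMESHIFT == 1 there are
 * 32 rotating secrets, each valid for SYNCOOKIE_TIMEOUT == 16 * hz
 * ticks, i.e. the roughly 16 second cookie lifetime mentioned above.
 * SYNCOOKIE_DATAMASK covers the 7 data bits hidden in a cookie: the
 * low 5 bits select the secret and the next 2 bits index tcp_msstab[];
 * the remaining bits of the cookie are covered by the MD5 hash.
 */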
1252
1253 static struct {
1254 u_int32_t ts_secbits[4];
1255 u_int ts_expire;
1256 } tcp_secret[SYNCOOKIE_NSECRETS];
1257
1258 static int tcp_msstab[] = { 0, 536, 1460, 8960 };
1259
1260 static MD5_CTX syn_ctx;
1261
1262 #define MD5Add(v) MD5Update(&syn_ctx, (u_char *)&v, sizeof(v))
1263
1264 struct md5_add {
1265 u_int32_t laddr, faddr;
1266 u_int32_t secbits[4];
1267 u_int16_t lport, fport;
1268 };
1269
1270 #ifdef CTASSERT
1271 CTASSERT(sizeof(struct md5_add) == 28);
1272 #endif
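/*
 * The CTASSERT above checks that struct md5_add contains no compiler
 * padding (4 + 4 + 16 + 2 + 2 == 28 bytes), so the bytes fed to
 * MD5Add(add) are fully determined by the assigned fields.
 */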
1273
1274 /*
1275 * Consider the problem of a recreated (and retransmitted) cookie. If the
1276 * original SYN was accepted, the connection is established. The second
1277 * SYN is inflight, and if it arrives with an ISN that falls within the
1278 * receive window, the connection is killed.
1279 *
1280 * However, since cookies have other problems, this may not be worth
1281 * worrying about.
1282 */
1283
1284 static u_int32_t
1285 syncookie_generate(struct syncache *sc, u_int32_t *flowid)
1286 {
1287 u_int32_t md5_buffer[4];
1288 u_int32_t data;
1289 int idx, i;
1290 struct md5_add add;
1291
1292 /* NB: single threaded; could add INP_INFO_WLOCK_ASSERT(&tcbinfo) */
1293
1294 idx = ((ticks << SYNCOOKIE_TIMESHIFT) / hz) & SYNCOOKIE_WNDMASK;
1295 if (tcp_secret[idx].ts_expire < ticks) {
1296 for (i = 0; i < 4; i++)
1297 tcp_secret[idx].ts_secbits[i] = arc4random();
1298 tcp_secret[idx].ts_expire = ticks + SYNCOOKIE_TIMEOUT;
1299 }
1300 for (data = sizeof(tcp_msstab) / sizeof(int) - 1; data > 0; data--)
1301 if (tcp_msstab[data] <= sc->sc_peer_mss)
1302 break;
1303 data = (data << SYNCOOKIE_WNDBITS) | idx;
1304 data ^= sc->sc_irs; /* peer's iss */
1305 MD5Init(&syn_ctx);
1306 #ifdef INET6
1307 if (sc->sc_inc.inc_isipv6) {
1308 MD5Add(sc->sc_inc.inc6_laddr);
1309 MD5Add(sc->sc_inc.inc6_faddr);
1310 add.laddr = 0;
1311 add.faddr = 0;
1312 } else
1313 #endif
1314 {
1315 add.laddr = sc->sc_inc.inc_laddr.s_addr;
1316 add.faddr = sc->sc_inc.inc_faddr.s_addr;
1317 }
1318 add.lport = sc->sc_inc.inc_lport;
1319 add.fport = sc->sc_inc.inc_fport;
1320 add.secbits[0] = tcp_secret[idx].ts_secbits[0];
1321 add.secbits[1] = tcp_secret[idx].ts_secbits[1];
1322 add.secbits[2] = tcp_secret[idx].ts_secbits[2];
1323 add.secbits[3] = tcp_secret[idx].ts_secbits[3];
1324 MD5Add(add);
1325 MD5Final((u_char *)&md5_buffer, &syn_ctx);
1326 data ^= (md5_buffer[0] & ~SYNCOOKIE_WNDMASK);
1327 *flowid = md5_buffer[1];
1328 tcpstat.tcps_sc_sendcookie++;
1329 return (data);
1330 }
1331
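/*
 * Reverse the transform done by syncookie_generate(): XORing the
 * returned cookie (th_ack - 1) with the peer's ISS (th_seq - 1) strips
 * the ISS back out, the low SYNCOOKIE_WNDBITS select the secret, and
 * after the recomputed MD5 hash is removed nothing may remain outside
 * SYNCOOKIE_DATAMASK; the surviving bits index tcp_msstab[] to recover
 * the peer's advertised MSS.
 */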
1332 static struct syncache *
1333 syncookie_lookup(inc, th, so)
1334 struct in_conninfo *inc;
1335 struct tcphdr *th;
1336 struct socket *so;
1337 {
1338 u_int32_t md5_buffer[4];
1339 struct syncache *sc;
1340 u_int32_t data;
1341 int wnd, idx;
1342 struct md5_add add;
1343
1344 /* NB: single threaded; could add INP_INFO_WLOCK_ASSERT(&tcbinfo) */
1345
1346 data = (th->th_ack - 1) ^ (th->th_seq - 1); /* remove ISS */
1347 idx = data & SYNCOOKIE_WNDMASK;
1348 if (tcp_secret[idx].ts_expire < ticks ||
1349 sototcpcb(so)->ts_recent + SYNCOOKIE_TIMEOUT < ticks)
1350 return (NULL);
1351 MD5Init(&syn_ctx);
1352 #ifdef INET6
1353 if (inc->inc_isipv6) {
1354 MD5Add(inc->inc6_laddr);
1355 MD5Add(inc->inc6_faddr);
1356 add.laddr = 0;
1357 add.faddr = 0;
1358 } else
1359 #endif
1360 {
1361 add.laddr = inc->inc_laddr.s_addr;
1362 add.faddr = inc->inc_faddr.s_addr;
1363 }
1364 add.lport = inc->inc_lport;
1365 add.fport = inc->inc_fport;
1366 add.secbits[0] = tcp_secret[idx].ts_secbits[0];
1367 add.secbits[1] = tcp_secret[idx].ts_secbits[1];
1368 add.secbits[2] = tcp_secret[idx].ts_secbits[2];
1369 add.secbits[3] = tcp_secret[idx].ts_secbits[3];
1370 MD5Add(add);
1371 MD5Final((u_char *)&md5_buffer, &syn_ctx);
1372 data ^= md5_buffer[0];
1373 if ((data & ~SYNCOOKIE_DATAMASK) != 0)
1374 return (NULL);
1375 data = data >> SYNCOOKIE_WNDBITS;
1376
1377 sc = uma_zalloc(tcp_syncache.zone, M_NOWAIT | M_ZERO);
1378 if (sc == NULL)
1379 return (NULL);
1380 /*
1381 * Fill in the syncache values.
1382 * XXX duplicate code from syncache_add
1383 */
1384 sc->sc_ipopts = NULL;
1385 sc->sc_inc.inc_fport = inc->inc_fport;
1386 sc->sc_inc.inc_lport = inc->inc_lport;
1387 sc->sc_tp = sototcpcb(so);
1388 #ifdef INET6
1389 sc->sc_inc.inc_isipv6 = inc->inc_isipv6;
1390 if (inc->inc_isipv6) {
1391 sc->sc_inc.inc6_faddr = inc->inc6_faddr;
1392 sc->sc_inc.inc6_laddr = inc->inc6_laddr;
1393 if (sc->sc_tp->t_inpcb->in6p_flags & IN6P_AUTOFLOWLABEL)
1394 sc->sc_flowlabel = md5_buffer[1] & IPV6_FLOWLABEL_MASK;
1395 } else
1396 #endif
1397 {
1398 sc->sc_inc.inc_faddr = inc->inc_faddr;
1399 sc->sc_inc.inc_laddr = inc->inc_laddr;
1400 }
1401 sc->sc_irs = th->th_seq - 1;
1402 sc->sc_iss = th->th_ack - 1;
1403 wnd = sbspace(&so->so_rcv);
1404 wnd = imax(wnd, 0);
1405 wnd = imin(wnd, TCP_MAXWIN);
1406 sc->sc_wnd = wnd;
1407 sc->sc_flags = 0;
1408 sc->sc_rxtslot = 0;
1409 sc->sc_peer_mss = tcp_msstab[data];
1410 return (sc);
1411 }