1 /*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
34 * $FreeBSD: releng/5.1/sys/netinet/tcp_subr.c 127036 2004-03-15 20:02:07Z fjoe $
35 */
36
37 #include "opt_compat.h"
38 #include "opt_inet6.h"
39 #include "opt_ipsec.h"
40 #include "opt_mac.h"
41 #include "opt_tcpdebug.h"
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/callout.h>
46 #include <sys/kernel.h>
47 #include <sys/sysctl.h>
48 #include <sys/mac.h>
49 #include <sys/malloc.h>
50 #include <sys/mbuf.h>
51 #ifdef INET6
52 #include <sys/domain.h>
53 #endif
54 #include <sys/proc.h>
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <sys/protosw.h>
58 #include <sys/random.h>
59
60 #include <vm/uma.h>
61
62 #include <net/route.h>
63 #include <net/if.h>
64
65 #include <netinet/in.h>
66 #include <netinet/in_systm.h>
67 #include <netinet/ip.h>
68 #ifdef INET6
69 #include <netinet/ip6.h>
70 #endif
71 #include <netinet/in_pcb.h>
72 #ifdef INET6
73 #include <netinet6/in6_pcb.h>
74 #endif
75 #include <netinet/in_var.h>
76 #include <netinet/ip_var.h>
77 #ifdef INET6
78 #include <netinet6/ip6_var.h>
79 #endif
80 #include <netinet/tcp.h>
81 #include <netinet/tcp_fsm.h>
82 #include <netinet/tcp_seq.h>
83 #include <netinet/tcp_timer.h>
84 #include <netinet/tcp_var.h>
85 #ifdef INET6
86 #include <netinet6/tcp6_var.h>
87 #endif
88 #include <netinet/tcpip.h>
89 #ifdef TCPDEBUG
90 #include <netinet/tcp_debug.h>
91 #endif
92 #include <netinet6/ip6protosw.h>
93
94 #ifdef IPSEC
95 #include <netinet6/ipsec.h>
96 #ifdef INET6
97 #include <netinet6/ipsec6.h>
98 #endif
99 #endif /*IPSEC*/
100
101 #ifdef FAST_IPSEC
102 #include <netipsec/ipsec.h>
103 #ifdef INET6
104 #include <netipsec/ipsec6.h>
105 #endif
106 #define IPSEC
107 #endif /*FAST_IPSEC*/
108
109 #include <machine/in_cksum.h>
110 #include <sys/md5.h>
111
/* Default maximum segment size used when no better path information exists. */
int 	tcp_mssdflt = TCP_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
    &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");

#ifdef INET6
/* IPv6 counterpart of tcp_mssdflt (smaller to fit the larger IPv6 header). */
int	tcp_v6mssdflt = TCP6_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
	CTLFLAG_RW, &tcp_v6mssdflt , 0,
	"Default TCP Maximum Segment Size for IPv6");
#endif

#if 0
static int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
    &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
#endif

/* RFC 1323 (window scaling + timestamps) negotiation, on by default. */
int	tcp_do_rfc1323 = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
    &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");

/* RFC 1644 (T/TCP) negotiation, off by default. */
int	tcp_do_rfc1644 = 0;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
    &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");

/* Actual TCB hash size chosen at boot (read-only; set in tcp_init()). */
static int	tcp_tcbhashsize = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

static int	do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
    &tcbinfo.ipi_count, 0, "Number of active PCBs");

static int	icmp_may_rst = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
    "Certain ICMP unreachable messages may abort connections in SYN_SENT");

/* 0 disables periodic reseeding of the ISN secret (see tcp_new_isn()). */
static int	tcp_isn_reseed_interval = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");

/*
 * TCP bandwidth limiting sysctls.  Note that the default lower bound of
 * 1024 exists only for debugging.  A good production default would be
 * something like 6100.
 */
static int	tcp_inflight_enable = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");

static int	tcp_inflight_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");

static int	tcp_inflight_min = 6144;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");

static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");

/* Stabilization factor in units of 1/10 packet (20 == 2 packets). */
static int	tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
    &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
179
static void	tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
static void	tcp_discardcb(struct tcpcb *);

/*
 * Target size of TCP PCB hash tables. Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
 */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE	512
#endif

/*
 * XXX
 * Callouts should be moved into struct tcp directly.  They are currently
 * separate because the tcpcb structure is exported to userland for sysctl
 * parsing purposes, which do not know about callouts.
 */
struct tcpcb_mem {
	struct	tcpcb tcb;
	struct	callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep;
	struct	callout tcpcb_mem_2msl, tcpcb_mem_delack;
};

/* UMA zones for tcpcb_mem blocks and compressed time-wait state. */
static uma_zone_t tcpcb_zone;
static uma_zone_t tcptw_zone;
208
/*
 * Tcp initialization: set global timer defaults, create the TCB hash
 * tables and the UMA zones backing inpcbs, tcpcbs and time-wait blocks,
 * then initialize the timer, syncache and reassembly subsystems.
 * Called once at boot.
 */
void
tcp_init()
{
	int hashsize = TCBHASHSIZE;

	tcp_ccgen = 1;
	tcp_cleartaocache();

	/* Timer intervals in ticks, derived from the TCPTV_* constants. */
	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	tcp_msl = TCPTV_MSL;
	tcp_rexmit_min = TCPTV_MIN;
	tcp_rexmit_slop = TCPTV_CPU_VAR;

	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
	LIST_INIT(&tcb);
	tcbinfo.listhead = &tcb;
	/* Boot-time override via the kernel environment; must stay a
	 * power of two because hashinit() masks, not mods. */
	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512; /* safe default */
	}
	tcp_tcbhashsize = hashsize;
	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
					&tcbinfo.porthashmask);
	tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
#ifdef INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	/* A TCP/IP header must fit in a single mbuf header. */
	if (max_protohdr < TCP_MINPROTOHDR)
		max_protohdr = TCP_MINPROTOHDR;
	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
		panic("tcp_init");
#undef TCP_MINPROTOHDR
	/*
	 * These have to be type stable for the benefit of the timers.
	 */
	tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcpcb_zone, maxsockets);
	tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcptw_zone, maxsockets);
	tcp_timer_init();
	syncache_init();
	tcp_reass_init();
}
267
268 /*
269 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
270 * tcp_template used to store this data in mbufs, but we now recopy it out
271 * of the tcpcb each time to conserve mbufs.
272 */
273 void
274 tcpip_fillheaders(inp, ip_ptr, tcp_ptr)
275 struct inpcb *inp;
276 void *ip_ptr;
277 void *tcp_ptr;
278 {
279 struct tcphdr *th = (struct tcphdr *)tcp_ptr;
280
281 #ifdef INET6
282 if ((inp->inp_vflag & INP_IPV6) != 0) {
283 struct ip6_hdr *ip6;
284
285 ip6 = (struct ip6_hdr *)ip_ptr;
286 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
287 (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
288 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
289 (IPV6_VERSION & IPV6_VERSION_MASK);
290 ip6->ip6_nxt = IPPROTO_TCP;
291 ip6->ip6_plen = sizeof(struct tcphdr);
292 ip6->ip6_src = inp->in6p_laddr;
293 ip6->ip6_dst = inp->in6p_faddr;
294 } else
295 #endif
296 {
297 struct ip *ip;
298
299 ip = (struct ip *)ip_ptr;
300 ip->ip_v = IPVERSION;
301 ip->ip_hl = 5;
302 ip->ip_tos = inp->inp_ip_tos;
303 ip->ip_len = 0;
304 ip->ip_id = 0;
305 ip->ip_off = 0;
306 ip->ip_ttl = inp->inp_ip_ttl;
307 ip->ip_sum = 0;
308 ip->ip_p = IPPROTO_TCP;
309 ip->ip_src = inp->inp_laddr;
310 ip->ip_dst = inp->inp_faddr;
311 }
312 th->th_sport = inp->inp_lport;
313 th->th_dport = inp->inp_fport;
314 th->th_seq = 0;
315 th->th_ack = 0;
316 th->th_x2 = 0;
317 th->th_off = 5;
318 th->th_flags = 0;
319 th->th_win = 0;
320 th->th_urp = 0;
321 th->th_sum = 0; /* in_pseudo() is called later for ipv4 */
322 }
323
324 /*
325 * Create template to be used to send tcp packets on a connection.
326 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
327 * use for this function is in keepalives, which use tcp_respond.
328 */
329 struct tcptemp *
330 tcpip_maketemplate(inp)
331 struct inpcb *inp;
332 {
333 struct mbuf *m;
334 struct tcptemp *n;
335
336 m = m_get(M_DONTWAIT, MT_HEADER);
337 if (m == NULL)
338 return (0);
339 m->m_len = sizeof(struct tcptemp);
340 n = mtod(m, struct tcptemp *);
341
342 tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
343 return (n);
344 }
345
/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the segment ti,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
void
tcp_respond(tp, ipgen, th, m, ack, seq, flags)
	struct tcpcb *tp;
	void *ipgen;
	register struct tcphdr *th;
	register struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	register int tlen;
	int win = 0;
	struct route *ro = 0;
	struct route sro;
	struct ip *ip;
	struct tcphdr *nth;
#ifdef INET6
	struct route_in6 *ro6 = 0;
	struct route_in6 sro6;
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int ipflags = 0;

	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));

#ifdef INET6
	/* The version nibble occupies the same position in v4 and v6
	 * headers, so peeking through a struct ip is safe here. */
	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp) {
		if (!(flags & TH_RST)) {
			/* Advertise the current receive window, capped at
			 * the largest value representable after scaling. */
			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
				win = (long)TCP_MAXWIN << tp->rcv_scale;
		}
#ifdef INET6
		if (isipv6)
			ro6 = &tp->t_inpcb->in6p_route;
		else
#endif /* INET6 */
		ro = &tp->t_inpcb->inp_route;
	} else {
		/* No connection: use a zeroed route on the stack and
		 * release any cached rtentry before returning. */
#ifdef INET6
		if (isipv6) {
			ro6 = &sro6;
			bzero(ro6, sizeof *ro6);
		} else
#endif /* INET6 */
		{
			ro = &sro;
			bzero(ro, sizeof *ro);
		}
	}
	if (m == 0) {
		/*
		 * Keepalive case: allocate a header mbuf and copy the
		 * template IP and TCP headers into it.
		 */
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return;
		tlen = 0;
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
			ip = mtod(m, struct ip *);
			nth = (struct tcphdr *)(ip + 1);
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else {
		/*
		 * Reply case: reuse the received mbuf in place.  Trim any
		 * chained mbufs, rewind m_data to the IP header, and swap
		 * the address and port pairs so the packet goes back to
		 * its originator.
		 */
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
		tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
			nth = (struct tcphdr *)(ip + 1);
		}
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
	/* Rebuild the network header lengths/fields for the response. */
#ifdef INET6
	if (isipv6) {
		ip6->ip6_flow = 0;
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
						tlen));
		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	} else
#endif
	{
		tlen += sizeof (struct tcpiphdr);
		ip->ip_len = tlen;
		ip->ip_ttl = ip_defttl;
	}
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = (struct ifnet *) 0;
#ifdef MAC
	if (tp != NULL && tp->t_inpcb != NULL) {
		/*
		 * Packet is associated with a socket, so allow the
		 * label of the response to reflect the socket label.
		 */
		mac_create_mbuf_from_socket(tp->t_inpcb->inp_socket, m);
	} else {
		/*
		 * XXXMAC: This will need to call a mac function that
		 * modifies the mbuf label in place for TCP datagrams
		 * not associated with a PCB.
		 */
	}
#endif
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
	if (tp)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
#ifdef INET6
	if (isipv6) {
		/* in6_cksum() computes the full checksum; no hw offload. */
		nth->th_sum = 0;
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
					sizeof(struct ip6_hdr),
					tlen - sizeof(struct ip6_hdr));
		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
					       ro6 && ro6->ro_rt ?
					       ro6->ro_rt->rt_ifp :
					       NULL);
	} else
#endif /* INET6 */
	{
		/* IPv4 path: store the pseudo-header sum and defer the
		 * rest to checksum offload / ip_output(). */
		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	}
#ifdef TCPDEBUG
	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
	if (isipv6) {
		(void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL,
		    tp ? tp->t_inpcb : NULL);
		/* Drop any route cached on the stack before it goes away. */
		if (ro6 == &sro6 && ro6->ro_rt) {
			RTFREE(ro6->ro_rt);
			ro6->ro_rt = NULL;
		}
	} else
#endif /* INET6 */
	{
		(void) ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL);
		if (ro == &sro && ro->ro_rt) {
			RTFREE(ro->ro_rt);
			ro->ro_rt = NULL;
		}
	}
}
548
/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 *
 * Returns NULL if the zone allocation fails.
 */
struct tcpcb *
tcp_newtcpcb(inp)
	struct inpcb *inp;
{
	struct tcpcb_mem *tm;
	struct tcpcb *tp;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
	if (tm == NULL)
		return (NULL);
	tp = &tm->tcb;
	/* LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
	tp->t_maxseg = tp->t_maxopd =
#ifdef INET6
		isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
		tcp_mssdflt;

	/* Set up our timeouts; storage lives in the same tcpcb_mem block. */
	callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0);
	callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0);
	callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0);
	callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0);
	callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0);

	/* Request RFC 1323 / RFC 1644 options according to global policy. */
	if (tcp_do_rfc1323)
		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
	if (tcp_do_rfc1644)
		tp->t_flags |= TF_REQ_CC;
	tp->t_inpcb = inp;	/* XXX */
	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
	tp->t_rttmin = tcp_rexmit_min;
	tp->t_rxtcur = TCPTV_RTOBASE;
	/* Start the windows wide open; slow start and the inflight
	 * limiter will bring them down to sensible values. */
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->t_rcvtime = ticks;
	tp->t_bw_rtttime = ticks;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 */
	inp->inp_ip_ttl = ip_defttl;
	inp->inp_ppcb = (caddr_t)tp;
	return (tp);		/* XXX */
}
611
612 /*
613 * Drop a TCP connection, reporting
614 * the specified error. If connection is synchronized,
615 * then send a RST to peer.
616 */
617 struct tcpcb *
618 tcp_drop(tp, errno)
619 register struct tcpcb *tp;
620 int errno;
621 {
622 struct socket *so = tp->t_inpcb->inp_socket;
623
624 if (TCPS_HAVERCVDSYN(tp->t_state)) {
625 tp->t_state = TCPS_CLOSED;
626 (void) tcp_output(tp);
627 tcpstat.tcps_drops++;
628 } else
629 tcpstat.tcps_conndrops++;
630 if (errno == ETIMEDOUT && tp->t_softerror)
631 errno = tp->t_softerror;
632 so->so_error = errno;
633 return (tcp_close(tp));
634 }
635
/*
 * Tear down a tcpcb: stop all timers, opportunistically cache RTT/RTTVAR/
 * ssthresh metrics in the routing entry, flush the reassembly queue and
 * free the control block.  The inpcb is unhooked but left for the caller
 * (tcp_close()/time-wait code) to detach.
 */
static void
tcp_discardcb(tp)
	struct tcpcb *tp;
{
	struct tseg_qent *q;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
	struct rtentry *rt;
	int dosavessthresh;

	/*
	 * Make sure that all of our timers are stopped before we
	 * delete the PCB.
	 */
	callout_stop(tp->tt_rexmt);
	callout_stop(tp->tt_persist);
	callout_stop(tp->tt_keep);
	callout_stop(tp->tt_2msl);
	callout_stop(tp->tt_delack);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as the 16 samples.
	 * 16 samples is enough for the srtt filter to converge
	 * to within 5% of the correct value; fewer samples and
	 * we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (tp->t_rttupdated >= 16) {
		register u_long i = 0;
#ifdef INET6
		if (isipv6) {
			struct sockaddr_in6 *sin6;

			if ((rt = inp->in6p_route.ro_rt) == NULL)
				goto no_valid_rt;
			sin6 = (struct sockaddr_in6 *)rt_key(rt);
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				goto no_valid_rt;
		}
		else
#endif /* INET6 */
		if ((rt = inp->inp_route.ro_rt) == NULL ||
		    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
		    == INADDR_ANY)
			goto no_valid_rt;

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
			tcpstat.tcps_cachedrtt++;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
			tcpstat.tcps_cachedrttvar++;
		}
		/*
		 * The old comment here said:
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshold
		 * got below half the pipesize.  I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * But we want to save the ssthresh even if no pipesize is
		 * specified explicitly in the route, because such
		 * connections still have an implicit pipesize specified
		 * by the global tcp_sendspace.  In the absence of a reliable
		 * way to calculate the pipesize, it will have to do.
		 */
		i = tp->snd_ssthresh;
		if (rt->rt_rmx.rmx_sendpipe != 0)
			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
		else
			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
		    || dosavessthresh) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
			i *= (u_long)(tp->t_maxseg +
#ifdef INET6
				      (isipv6 ? sizeof (struct ip6_hdr) +
					       sizeof (struct tcphdr) :
#endif
				       sizeof (struct tcpiphdr)
#ifdef INET6
				       )
#endif
				      );
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
			tcpstat.tcps_cachedssthresh++;
		}
	}
    no_valid_rt:
	/* free the reassembly queue, if any */
	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		uma_zfree(tcp_reass_zone, q);
		tcp_reass_qsize--;
	}
	/* Unhook the tcpcb from the inpcb before freeing it. */
	inp->inp_ppcb = NULL;
	tp->t_inpcb = NULL;
	uma_zfree(tcpcb_zone, tp);
	soisdisconnected(so);
}
775
776 /*
777 * Close a TCP control block:
778 * discard all space held by the tcp
779 * discard internet protocol block
780 * wake up any sleepers
781 */
782 struct tcpcb *
783 tcp_close(tp)
784 struct tcpcb *tp;
785 {
786 struct inpcb *inp = tp->t_inpcb;
787 #ifdef INET6
788 struct socket *so = inp->inp_socket;
789 #endif
790
791 tcp_discardcb(tp);
792 #ifdef INET6
793 if (INP_CHECK_SOCKAF(so, AF_INET6))
794 in6_pcbdetach(inp);
795 else
796 #endif
797 in_pcbdetach(inp);
798 tcpstat.tcps_closed++;
799 return ((struct tcpcb *)0);
800 }
801
/*
 * Called by the mbuf subsystem when the system is running low on mbufs:
 * walk every TCP PCB and free its reassembly queue to reclaim memory.
 * Disabled by setting net.inet.tcp.do_tcpdrain = 0.
 */
void
tcp_drain()
{
	if (do_tcpdrain)
	{
		struct inpcb *inpb;
		struct tcpcb *tcpb;
		struct tseg_qent *te;

		/*
		 * Walk the tcpbs, if existing, and flush the reassembly queue,
		 * if there is one...
		 * XXX: The "Net/3" implementation doesn't imply that the TCP
		 * reassembly queue should be flushed, but in a situation
		 * where we're really low on mbufs, this is potentially
		 * useful.
		 */
		INP_INFO_RLOCK(&tcbinfo);
		LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
			/* Time-wait PCBs have no tcpcb/reassembly state. */
			if (inpb->inp_vflag & INP_TIMEWAIT)
				continue;
			INP_LOCK(inpb);
			if ((tcpb = intotcpcb(inpb))) {
				while ((te = LIST_FIRST(&tcpb->t_segq))
			            != NULL) {
					LIST_REMOVE(te, tqe_q);
					m_freem(te->tqe_m);
					uma_zfree(tcp_reass_zone, te);
					tcp_reass_qsize--;
				}
			}
			INP_UNLOCK(inpb);
		}
		INP_INFO_RUNLOCK(&tcbinfo);
	}
}
838
/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 *
 * Returns the inpcb if it is still valid for further notification,
 * or NULL if the connection was dropped here.
 */
static struct inpcb *
tcp_notify(inp, error)
	struct inpcb *inp;
	int error;
{
	/*
	 * NOTE(review): inp_ppcb is dereferenced unchecked.  For
	 * INP_TIMEWAIT PCBs inp_ppcb holds a struct tcptw, not a tcpcb
	 * (see tcp_pcblist), so callers are presumably expected never to
	 * pass one here — confirm at the notification call sites.
	 */
	struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;

	/*
	 * Ignore some errors if we are hooked up.
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now.  This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	     (error == EHOSTUNREACH || error == ENETUNREACH ||
	      error == EHOSTDOWN)) {
		/* Transient unreachability on an established connection. */
		return inp;
	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
	    tp->t_softerror) {
		/* Second error while still connecting: give up. */
		tcp_drop(tp, error);
		return (struct inpcb *)0;
	} else {
		/* Record as a soft error for a possible later report. */
		tp->t_softerror = error;
		return inp;
	}
#if 0
	wakeup( &so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
#endif
}
879
880 static int
881 tcp_pcblist(SYSCTL_HANDLER_ARGS)
882 {
883 int error, i, n, s;
884 struct inpcb *inp, **inp_list;
885 inp_gen_t gencnt;
886 struct xinpgen xig;
887
888 /*
889 * The process of preparing the TCB list is too time-consuming and
890 * resource-intensive to repeat twice on every request.
891 */
892 if (req->oldptr == 0) {
893 n = tcbinfo.ipi_count;
894 req->oldidx = 2 * (sizeof xig)
895 + (n + n/8) * sizeof(struct xtcpcb);
896 return 0;
897 }
898
899 if (req->newptr != 0)
900 return EPERM;
901
902 /*
903 * OK, now we're committed to doing something.
904 */
905 s = splnet();
906 INP_INFO_RLOCK(&tcbinfo);
907 gencnt = tcbinfo.ipi_gencnt;
908 n = tcbinfo.ipi_count;
909 INP_INFO_RUNLOCK(&tcbinfo);
910 splx(s);
911
912 sysctl_wire_old_buffer(req, 2 * (sizeof xig)
913 + n * sizeof(struct xtcpcb));
914
915 xig.xig_len = sizeof xig;
916 xig.xig_count = n;
917 xig.xig_gen = gencnt;
918 xig.xig_sogen = so_gencnt;
919 error = SYSCTL_OUT(req, &xig, sizeof xig);
920 if (error)
921 return error;
922
923 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
924 if (inp_list == 0)
925 return ENOMEM;
926
927 s = splnet();
928 INP_INFO_RLOCK(&tcbinfo);
929 for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
930 inp = LIST_NEXT(inp, inp_list)) {
931 INP_LOCK(inp);
932 if (inp->inp_gencnt <= gencnt) {
933 /*
934 * XXX: This use of cr_cansee(), introduced with
935 * TCP state changes, is not quite right, but for
936 * now, better than nothing.
937 */
938 if (inp->inp_vflag & INP_TIMEWAIT)
939 error = cr_cansee(req->td->td_ucred,
940 intotw(inp)->tw_cred);
941 else
942 error = cr_canseesocket(req->td->td_ucred,
943 inp->inp_socket);
944 if (error == 0)
945 inp_list[i++] = inp;
946 }
947 INP_UNLOCK(inp);
948 }
949 INP_INFO_RUNLOCK(&tcbinfo);
950 splx(s);
951 n = i;
952
953 error = 0;
954 for (i = 0; i < n; i++) {
955 inp = inp_list[i];
956 if (inp->inp_gencnt <= gencnt) {
957 struct xtcpcb xt;
958 caddr_t inp_ppcb;
959 xt.xt_len = sizeof xt;
960 /* XXX should avoid extra copy */
961 bcopy(inp, &xt.xt_inp, sizeof *inp);
962 inp_ppcb = inp->inp_ppcb;
963 if (inp_ppcb == NULL)
964 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
965 else if (inp->inp_vflag & INP_TIMEWAIT) {
966 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
967 xt.xt_tp.t_state = TCPS_TIME_WAIT;
968 } else
969 bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
970 if (inp->inp_socket)
971 sotoxsocket(inp->inp_socket, &xt.xt_socket);
972 else {
973 bzero(&xt.xt_socket, sizeof xt.xt_socket);
974 xt.xt_socket.xso_protocol = IPPROTO_TCP;
975 }
976 xt.xt_inp.inp_gencnt = inp->inp_gencnt;
977 error = SYSCTL_OUT(req, &xt, sizeof xt);
978 }
979 }
980 if (!error) {
981 /*
982 * Give the user an updated idea of our state.
983 * If the generation differs from what we told
984 * her before, she knows that something happened
985 * while we were processing this request, and it
986 * might be necessary to retry.
987 */
988 s = splnet();
989 INP_INFO_RLOCK(&tcbinfo);
990 xig.xig_gen = tcbinfo.ipi_gencnt;
991 xig.xig_sogen = so_gencnt;
992 xig.xig_count = tcbinfo.ipi_count;
993 INP_INFO_RUNLOCK(&tcbinfo);
994 splx(s);
995 error = SYSCTL_OUT(req, &xig, sizeof xig);
996 }
997 free(inp_list, M_TEMP);
998 return error;
999 }
1000
1001 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
1002 tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
1003
/*
 * Sysctl handler: return the credentials (struct xucred) of the socket
 * owning the TCP connection identified by a pair of struct sockaddr_in
 * supplied by userland — local endpoint in addrs[0], foreign endpoint
 * in addrs[1].  Requires superuser (may pierce prison visibility).
 */
static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in addrs[2];
	struct inpcb *inp;
	int error, s;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	/* Exact-match lookup: foreign addr/port first, then local. */
	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
	if (inp == NULL) {
		error = ENOENT;
		goto outunlocked;
	}
	INP_LOCK(inp);
	if (inp->inp_socket == NULL) {
		error = ENOENT;
		goto out;
	}
	/* Only expose credentials the caller is permitted to see. */
	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
	if (error)
		goto out;
	cru2x(inp->inp_socket->so_cred, &xuc);
out:
	INP_UNLOCK(inp);
outunlocked:
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	/* Copy out only after all locks are released. */
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}
1044
1045 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
1046 CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
1047 tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
1048
1049 #ifdef INET6
/*
 * IPv6 variant of tcp_getcred(): same contract, but the endpoints are
 * struct sockaddr_in6.  V4-mapped addresses are routed through the IPv4
 * PCB lookup; mixing mapped and unmapped endpoints is rejected.
 */
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in6 addrs[2];
	struct inpcb *inp;
	int error, s, mapped = 0;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	/* Both endpoints must agree on being v4-mapped or not. */
	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
			mapped = 1;
		else
			return (EINVAL);
	}
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	if (mapped == 1)
		/* The embedded IPv4 address lives in bytes 12..15. */
		inp = in_pcblookup_hash(&tcbinfo,
			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
			addrs[1].sin6_port,
			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
			addrs[0].sin6_port,
			0, NULL);
	else
		inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
				 addrs[1].sin6_port,
				 &addrs[0].sin6_addr, addrs[0].sin6_port,
				 0, NULL);
	if (inp == NULL) {
		error = ENOENT;
		goto outunlocked;
	}
	INP_LOCK(inp);
	if (inp->inp_socket == NULL) {
		error = ENOENT;
		goto out;
	}
	/* Only expose credentials the caller is permitted to see. */
	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
	if (error)
		goto out;
	cru2x(inp->inp_socket->so_cred, &xuc);
out:
	INP_UNLOCK(inp);
outunlocked:
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	/* Copy out only after all locks are released. */
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}
1106
/* Userland interface: net.inet6.tcp6.getcred, handled by tcp6_getcred(). */
SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
    tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
1110 #endif
1111
1112
/*
 * Protocol control input for TCP over IPv4: react to ICMP errors and
 * other PRC_* notifications handed up by the IP layer.
 *
 * cmd	PRC_* code describing the event.
 * sa	sockaddr_in naming the foreign host the error refers to.
 * vip	pointer to the quoted (offending) IP header, with the embedded
 *	TCP header following it, or NULL when no packet is quoted.
 */
void
tcp_ctlinput(cmd, sa, vip)
	int cmd;
	struct sockaddr *sa;
	void *vip;
{
	struct ip *ip = vip;
	struct tcphdr *th;
	struct in_addr faddr;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	tcp_seq icmp_seq;
	int s;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	/* Choose the per-pcb notification routine for this event. */
	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
	    cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
		notify = tcp_drop_syn_sent;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (PRC_IS_REDIRECT(cmd)) {
		/* Redirects affect the whole destination, not one pcb. */
		ip = 0;
		notify = in_rtchange;
	} else if (cmd == PRC_HOSTDEAD)
		ip = 0;
	else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
		return;
	if (ip) {
		s = splnet();
		/* The quoted TCP header follows the quoted IP header. */
		th = (struct tcphdr *)((caddr_t)ip
		    + (ip->ip_hl << 2));
		INP_INFO_WLOCK(&tcbinfo);
		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
		    ip->ip_src, th->th_sport, 0, NULL);
		if (inp != NULL) {
			INP_LOCK(inp);
			if (inp->inp_socket != NULL) {
				/*
				 * Byte-swap the quoted sequence number to
				 * host order (htonl performs the same swap
				 * as ntohl) and only react if it falls in
				 * the unacked send window — this resists
				 * blind/spoofed ICMP attacks.
				 */
				icmp_seq = htonl(th->th_seq);
				tp = intotcpcb(inp);
				if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
				    SEQ_LT(icmp_seq, tp->snd_max))
					inp = (*notify)(inp, inetctlerrmap[cmd]);
			}
			if (inp)	/* notify may have dropped the pcb */
				INP_UNLOCK(inp);
		} else {
			struct in_conninfo inc;

			/* No pcb: the error may refer to a syncache entry. */
			inc.inc_fport = th->th_dport;
			inc.inc_lport = th->th_sport;
			inc.inc_faddr = faddr;
			inc.inc_laddr = ip->ip_src;
#ifdef INET6
			inc.inc_isipv6 = 0;
#endif
			syncache_unreach(&inc, th);
		}
		INP_INFO_WUNLOCK(&tcbinfo);
		splx(s);
	} else
		/* No quoted packet: notify every pcb talking to faddr. */
		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
}
1181
1182 #ifdef INET6
/*
 * Protocol control input for TCP over IPv6: react to icmp6 errors and
 * other PRC_* notifications.
 *
 * cmd	PRC_* code describing the event.
 * sa	sockaddr_in6 naming the final destination the error refers to.
 * d	struct ip6ctlparam from the icmp6 layer, or NULL when no packet
 *	is quoted.
 */
void
tcp6_ctlinput(cmd, sa, d)
	int cmd;
	struct sockaddr *sa;
	void *d;
{
	struct tcphdr th;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	struct ip6ctlparam *ip6cp = NULL;
	const struct sockaddr_in6 *sa6_src = NULL;
	int off;
	struct tcp_portonly {
		u_int16_t th_sport;
		u_int16_t th_dport;
	} *thp;		/* used only for sizeof: just the port pair */

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	/* Choose the per-pcb notification routine for this event. */
	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (!PRC_IS_REDIRECT(cmd) &&
	    ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		sa6_src = ip6cp->ip6c_src;
	} else {
		m = NULL;
		ip6 = NULL;
		off = 0;	/* fool gcc */
		sa6_src = &sa6_any;
	}

	if (ip6) {
		struct in_conninfo inc;
		/*
		 * XXX: We assume that when IPV6 is non NULL,
		 * M and OFF are valid.
		 */

		/* check if we can safely examine src and dst ports */
		if (m->m_pkthdr.len < off + sizeof(*thp))
			return;

		bzero(&th, sizeof(th));
		/* Copy out only the port pair; the rest of th stays zero. */
		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

		in6_pcbnotify(&tcb, sa, th.th_dport,
		    (struct sockaddr *)ip6cp->ip6c_src,
		    th.th_sport, cmd, notify);

		/* Also let the syncache react to the error. */
		inc.inc_fport = th.th_dport;
		inc.inc_lport = th.th_sport;
		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
		inc.inc_isipv6 = 1;
		syncache_unreach(&inc, &th);
	} else
		/* No quoted packet: notify everything bound to sa6_src. */
		in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src,
		    0, cmd, notify);
}
1255 #endif /* INET6 */
1256
1257
1258 /*
1259 * Following is where TCP initial sequence number generation occurs.
1260 *
1261 * There are two places where we must use initial sequence numbers:
1262 * 1. In SYN-ACK packets.
1263 * 2. In SYN packets.
1264 *
1265 * All ISNs for SYN-ACK packets are generated by the syncache. See
1266 * tcp_syncache.c for details.
1267 *
1268 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
1269 * depends on this property. In addition, these ISNs should be
1270 * unguessable so as to prevent connection hijacking. To satisfy
1271 * the requirements of this situation, the algorithm outlined in
1272 * RFC 1948 is used to generate sequence numbers.
1273 *
1274 * Implementation details:
1275 *
1276 * Time is based off the system timer, and is corrected so that it
1277 * increases by one megabyte per second. This allows for proper
1278 * recycling on high speed LANs while still leaving over an hour
1279 * before rollover.
1280 *
1281 * net.inet.tcp.isn_reseed_interval controls the number of seconds
1282 * between seeding of isn_secret. This is normally set to zero,
1283 * as reseeding should not be necessary.
1284 *
1285 */
1286
/* Rate at which the time component of an ISN advances (RFC 1948). */
#define ISN_BYTES_PER_SECOND 1048576

/* RFC 1948 ISN generation state (file-scope, shared). */
u_char isn_secret[32];		/* secret hashed into every ISN */
int isn_last_reseed;		/* `ticks' value at last (re)seeding */
MD5_CTX isn_ctx;		/* scratch MD5 context; NOTE(review): shared
				 * without an explicit lock — presumably
				 * serialized by the callers; verify. */
1292
/*
 * Generate an initial sequence number for an outgoing SYN, per RFC 1948:
 * MD5(foreign port, local port, addresses, secret) plus a time component
 * that advances ISN_BYTES_PER_SECOND.  The hash makes ISNs unguessable
 * per-connection; the time component keeps them monotonic, which
 * TIME_WAIT recycling depends on (see block comment above).
 */
tcp_seq
tcp_new_isn(tp)
	struct tcpcb *tp;
{
	u_int32_t md5_buffer[4];
	tcp_seq new_isn;

	/* Seed if this is the first use, reseed if requested. */
	if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
	    (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
	    < (u_int)ticks))) {
		read_random(&isn_secret, sizeof(isn_secret));
		isn_last_reseed = ticks;
	}

	/* Compute the md5 hash and return the ISN. */
	MD5Init(&isn_ctx);
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
#ifdef INET6
	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
		    sizeof(struct in6_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
		    sizeof(struct in6_addr));
	} else
#endif
	{
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
		    sizeof(struct in_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
		    sizeof(struct in_addr));
	}
	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
	MD5Final((u_char *) &md5_buffer, &isn_ctx);
	new_isn = (tcp_seq) md5_buffer[0];
	/* Add the monotonic time component (~1MB of sequence per second). */
	new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
	return new_isn;
}
1332
1333 /*
1334 * When a source quench is received, close congestion window
1335 * to one segment. We will gradually open it again as we proceed.
1336 */
1337 struct inpcb *
1338 tcp_quench(inp, errno)
1339 struct inpcb *inp;
1340 int errno;
1341 {
1342 struct tcpcb *tp = intotcpcb(inp);
1343
1344 if (tp)
1345 tp->snd_cwnd = tp->t_maxseg;
1346 return (inp);
1347 }
1348
1349 /*
1350 * When a specific ICMP unreachable message is received and the
1351 * connection state is SYN-SENT, drop the connection. This behavior
1352 * is controlled by the icmp_may_rst sysctl.
1353 */
1354 struct inpcb *
1355 tcp_drop_syn_sent(inp, errno)
1356 struct inpcb *inp;
1357 int errno;
1358 {
1359 struct tcpcb *tp = intotcpcb(inp);
1360
1361 if (tp && tp->t_state == TCPS_SYN_SENT) {
1362 tcp_drop(tp, errno);
1363 return (struct inpcb *)0;
1364 }
1365 return inp;
1366 }
1367
1368 /*
1369 * When `need fragmentation' ICMP is received, update our idea of the MSS
1370 * based on the new value in the route. Also nudge TCP to send something,
1371 * since we know the packet we just sent was dropped.
1372 * This duplicates some code in the tcp_mss() function in tcp_input.c.
1373 */
1374 struct inpcb *
1375 tcp_mtudisc(inp, errno)
1376 struct inpcb *inp;
1377 int errno;
1378 {
1379 struct tcpcb *tp = intotcpcb(inp);
1380 struct rtentry *rt;
1381 struct rmxp_tao *taop;
1382 struct socket *so = inp->inp_socket;
1383 int offered;
1384 int mss;
1385 #ifdef INET6
1386 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
1387 #endif /* INET6 */
1388
1389 if (tp) {
1390 #ifdef INET6
1391 if (isipv6)
1392 rt = tcp_rtlookup6(&inp->inp_inc);
1393 else
1394 #endif /* INET6 */
1395 rt = tcp_rtlookup(&inp->inp_inc);
1396 if (!rt || !rt->rt_rmx.rmx_mtu) {
1397 tp->t_maxopd = tp->t_maxseg =
1398 #ifdef INET6
1399 isipv6 ? tcp_v6mssdflt :
1400 #endif /* INET6 */
1401 tcp_mssdflt;
1402 return inp;
1403 }
1404 taop = rmx_taop(rt->rt_rmx);
1405 offered = taop->tao_mssopt;
1406 mss = rt->rt_rmx.rmx_mtu -
1407 #ifdef INET6
1408 (isipv6 ?
1409 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
1410 #endif /* INET6 */
1411 sizeof(struct tcpiphdr)
1412 #ifdef INET6
1413 )
1414 #endif /* INET6 */
1415 ;
1416
1417 if (offered)
1418 mss = min(mss, offered);
1419 /*
1420 * XXX - The above conditional probably violates the TCP
1421 * spec. The problem is that, since we don't know the
1422 * other end's MSS, we are supposed to use a conservative
1423 * default. But, if we do that, then MTU discovery will
1424 * never actually take place, because the conservative
1425 * default is much less than the MTUs typically seen
1426 * on the Internet today. For the moment, we'll sweep
1427 * this under the carpet.
1428 *
1429 * The conservative default might not actually be a problem
1430 * if the only case this occurs is when sending an initial
1431 * SYN with options and data to a host we've never talked
1432 * to before. Then, they will reply with an MSS value which
1433 * will get recorded and the new parameters should get
1434 * recomputed. For Further Study.
1435 */
1436 if (tp->t_maxopd <= mss)
1437 return inp;
1438 tp->t_maxopd = mss;
1439
1440 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1441 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
1442 mss -= TCPOLEN_TSTAMP_APPA;
1443 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
1444 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
1445 mss -= TCPOLEN_CC_APPA;
1446 #if (MCLBYTES & (MCLBYTES - 1)) == 0
1447 if (mss > MCLBYTES)
1448 mss &= ~(MCLBYTES-1);
1449 #else
1450 if (mss > MCLBYTES)
1451 mss = mss / MCLBYTES * MCLBYTES;
1452 #endif
1453 if (so->so_snd.sb_hiwat < mss)
1454 mss = so->so_snd.sb_hiwat;
1455
1456 tp->t_maxseg = mss;
1457
1458 tcpstat.tcps_mturesent++;
1459 tp->t_rtttime = 0;
1460 tp->snd_nxt = tp->snd_una;
1461 tcp_output(tp);
1462 }
1463 return inp;
1464 }
1465
1466 /*
1467 * Look-up the routing entry to the peer of this inpcb. If no route
1468 * is found and it cannot be allocated, then return NULL. This routine
1469 * is called by TCP routines that access the rmx structure and by tcp_mss
1470 * to get the interface MTU.
1471 */
1472 struct rtentry *
1473 tcp_rtlookup(inc)
1474 struct in_conninfo *inc;
1475 {
1476 struct route *ro;
1477 struct rtentry *rt;
1478
1479 ro = &inc->inc_route;
1480 rt = ro->ro_rt;
1481 if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1482 /* No route yet, so try to acquire one */
1483 if (inc->inc_faddr.s_addr != INADDR_ANY) {
1484 ro->ro_dst.sa_family = AF_INET;
1485 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
1486 ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
1487 inc->inc_faddr;
1488 rtalloc(ro);
1489 rt = ro->ro_rt;
1490 }
1491 }
1492 return rt;
1493 }
1494
1495 #ifdef INET6
1496 struct rtentry *
1497 tcp_rtlookup6(inc)
1498 struct in_conninfo *inc;
1499 {
1500 struct route_in6 *ro6;
1501 struct rtentry *rt;
1502
1503 ro6 = &inc->inc6_route;
1504 rt = ro6->ro_rt;
1505 if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1506 /* No route yet, so try to acquire one */
1507 if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
1508 ro6->ro_dst.sin6_family = AF_INET6;
1509 ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
1510 ro6->ro_dst.sin6_addr = inc->inc6_faddr;
1511 rtalloc((struct route *)ro6);
1512 rt = ro6->ro_rt;
1513 }
1514 }
1515 return rt;
1516 }
1517 #endif /* INET6 */
1518
1519 #ifdef IPSEC
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
ipsec_hdrsiz_tcp(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp;
	struct mbuf *m;
	size_t hdrsiz;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct tcphdr *th;

	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
		return 0;
	/* Build a throwaway header-only mbuf for the size computation. */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (!m)
		return 0;	/* no mbuf available: report zero overhead */

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		/* Fill in a template IPv6+TCP header and ask IPsec. */
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		m->m_pkthdr.len = m->m_len =
			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		tcpip_fillheaders(inp, ip6, th);
		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	} else
#endif /* INET6 */
	{
		/* Fill in a template IPv4+TCP header and ask IPsec. */
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
		tcpip_fillheaders(inp, ip, th);
		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	}

	m_free(m);
	return hdrsiz;
}
1561 #endif /*IPSEC*/
1562
1563 /*
1564 * Return a pointer to the cached information about the remote host.
1565 * The cached information is stored in the protocol specific part of
1566 * the route metrics.
1567 */
1568 struct rmxp_tao *
1569 tcp_gettaocache(inc)
1570 struct in_conninfo *inc;
1571 {
1572 struct rtentry *rt;
1573
1574 #ifdef INET6
1575 if (inc->inc_isipv6)
1576 rt = tcp_rtlookup6(inc);
1577 else
1578 #endif /* INET6 */
1579 rt = tcp_rtlookup(inc);
1580
1581 /* Make sure this is a host route and is up. */
1582 if (rt == NULL ||
1583 (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
1584 return NULL;
1585
1586 return rmx_taop(rt->rt_rmx);
1587 }
1588
/*
 * Clear all the TAO cache entries, called from tcp_init.
 *
 * XXX
 * This routine is just an empty one, because we assume that the
 * routing tables are initialized at the same time as TCP, so there is
 * nothing in the cache left over.
 */
static void
tcp_cleartaocache()
{
}
1601
/*
 * Move a TCP connection into TIME_WAIT state, replacing the full tcpcb
 * with a compressed `struct tcptw' that retains just enough state to
 * ACK/RST from TIME_WAIT.
 * tcbinfo is unlocked.
 * inp is locked, and is unlocked before returning.
 */
void
tcp_twstart(tp)
	struct tcpcb *tp;
{
	struct tcptw *tw;
	struct inpcb *inp;
	int tw_time, acknow;
	struct socket *so;

	/*
	 * Allocate the compressed TIME_WAIT block; on shortage, recycle
	 * the oldest 2MSL entry, and failing that just close.
	 */
	tw = uma_zalloc(tcptw_zone, M_NOWAIT);
	if (tw == NULL) {
		tw = tcp_timer_2msl_tw(1);
		if (tw == NULL) {
			tcp_close(tp);
			return;
		}
	}
	inp = tp->t_inpcb;
	tw->tw_inpcb = inp;

	/*
	 * Recover last window size sent.
	 */
	tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;

	/*
	 * Set t_recent if timestamps are used on the connection.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
		tw->t_recent = tp->ts_recent;
	else
		tw->t_recent = 0;

	/* Snapshot the state needed to respond from TIME_WAIT. */
	tw->snd_nxt = tp->snd_nxt;
	tw->rcv_nxt = tp->rcv_nxt;
	tw->cc_recv = tp->cc_recv;
	tw->cc_send = tp->cc_send;
	tw->t_starttime = tp->t_starttime;
	tw->tw_time = 0;

	/* XXX
	 * If this code will
	 * be used for fin-wait-2 state also, then we may need
	 * a ts_recent from the last segment.
	 */
	/* Shorten TIME_WAIT [RFC-1644, p.28] */
	if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) {
		tw_time = tp->t_rxtcur * TCPTV_TWTRUNC;
		/* For T/TCP client, force ACK now. */
		acknow = 1;
	} else {
		tw_time = 2 * tcp_msl;
		acknow = tp->t_flags & TF_ACKNOW;
	}
	/* Discard the full tcpcb; the socket is detached after this. */
	tcp_discardcb(tp);
	so = inp->inp_socket;
	so->so_pcb = NULL;
	tw->tw_cred = crhold(so->so_cred);	/* keep cred for later responses */
	tw->tw_so_options = so->so_options;
	if (acknow)
		tcp_twrespond(tw, so, NULL, TH_ACK);
	sotryfree(so);
	inp->inp_socket = NULL;
	inp->inp_ppcb = (caddr_t)tw;	/* pcb now references the tw block */
	inp->inp_vflag |= INP_TIMEWAIT;
	tcp_timer_2msl_reset(tw, tw_time);
	INP_UNLOCK(inp);
}
1676
/*
 * Tear down a TIME_WAIT endpoint: stop its 2MSL timer and detach the
 * underlying inpcb.  If `reuse' is non-zero the tcptw block itself is
 * returned to the caller for immediate reuse; otherwise it is freed
 * and NULL is returned.
 */
struct tcptw *
tcp_twclose(struct tcptw *tw, int reuse)
{
	struct inpcb *inp;

	inp = tw->tw_inpcb;
	tw->tw_inpcb = NULL;
	tcp_timer_2msl_stop(tw);
	inp->inp_ppcb = NULL;	/* break the pcb -> tw link before detach */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO)
		in6_pcbdetach(inp);
	else
#endif
		in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	if (reuse)
		return (tw);
	uma_zfree(tcptw_zone, tw);
	return (NULL);
}
1698
/*
 * One of so and msrc must be non-NULL for use by the MAC Framework to
 * construct a label for any resulting packet.
 */
/*
 * Send a segment (a pure ACK or other control segment) on behalf of a
 * connection in TIME_WAIT, built entirely from the compressed tcptw
 * state.
 *
 * tw	TIME_WAIT block; tw->tw_inpcb must be valid.
 * so	socket for MAC labelling, or NULL.
 * msrc	source mbuf for MAC labelling when so is NULL.
 * flags TH_* flags to place in the segment.
 * Returns the error code from ip_output()/ip6_output().
 */
int
tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
    int flags)
{
	struct inpcb *inp = tw->tw_inpcb;
	struct tcphdr *th;
	struct mbuf *m;
	struct ip *ip = NULL;
	u_int8_t *optp;
	u_int hdrlen, optlen;
	int error;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6 = inp->inp_inc.inc_isipv6;
#endif

	KASSERT(so != NULL || msrc != NULL,
	    ("tcp_twrespond: so and msrc NULL"));

	m = m_gethdr(M_DONTWAIT, MT_HEADER);
	if (m == NULL)
		return (ENOBUFS);
	m->m_data += max_linkhdr;	/* leave room for the link header */

#ifdef MAC
	/* Label the mbuf from whichever source is available. */
	if (so != NULL)
		mac_create_mbuf_from_socket(so, m);
	else
		mac_create_mbuf_netlayer(msrc, m);
#endif

#ifdef INET6
	if (isipv6) {
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		tcpip_fillheaders(inp, ip6, th);
	} else
#endif
	{
		hdrlen = sizeof(struct tcpiphdr);
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		tcpip_fillheaders(inp, ip, th);
	}
	optp = (u_int8_t *)(th + 1);

	/*
	 * Send a timestamp and echo-reply if both our side and our peer
	 * have sent timestamps in our SYN's and this is not a RST.
	 */
	if (tw->t_recent && flags == TH_ACK) {
		u_int32_t *lp = (u_int32_t *)optp;

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tw->t_recent);
		optp += TCPOLEN_TSTAMP_APPA;
	}

	/*
	 * Send `CC-family' options if needed, and it's not a RST.
	 */
	if (tw->cc_recv != 0 && flags == TH_ACK) {
		u_int32_t *lp = (u_int32_t *)optp;

		*lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
		*lp   = htonl(tw->cc_send);
		optp += TCPOLEN_CC_APPA;
	}
	optlen = optp - (u_int8_t *)(th + 1);

	m->m_len = hdrlen + optlen;
	m->m_pkthdr.len = m->m_len;

	/* Headers plus options must fit in the single mbuf we allocated. */
	KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));

	th->th_seq = htonl(tw->snd_nxt);
	th->th_ack = htonl(tw->rcv_nxt);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = flags;
	th->th_win = htons(tw->last_win);

#ifdef INET6
	if (isipv6) {
		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
		    sizeof(struct tcphdr) + optlen);
		ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
		    inp->in6p_route.ro_rt->rt_ifp : NULL);
		error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
		    (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
	} else
#endif
	{
		/* Pseudo-header sum only; checksum offload fills the rest. */
		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		ip->ip_len = m->m_pkthdr.len;
		error = ip_output(m, inp->inp_options, &inp->inp_route,
		    (tw->tw_so_options & SO_DONTROUTE), NULL, inp);
	}
	if (flags & TH_ACK)
		tcpstat.tcps_sndacks++;
	else
		tcpstat.tcps_sndctrl++;
	tcpstat.tcps_sndtotal++;
	return (error);
}
1813
1814 /*
1815 * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
1816 *
1817 * This code attempts to calculate the bandwidth-delay product as a
1818 * means of determining the optimal window size to maximize bandwidth,
1819 * minimize RTT, and avoid the over-allocation of buffers on interfaces and
1820 * routers. This code also does a fairly good job keeping RTTs in check
1821 * across slow links like modems. We implement an algorithm which is very
1822 * similar (but not meant to be) TCP/Vegas. The code operates on the
1823 * transmitter side of a TCP connection and so only effects the transmit
1824 * side of the connection.
1825 *
1826 * BACKGROUND: TCP makes no provision for the management of buffer space
1827 * at the end points or at the intermediate routers and switches. A TCP
1828 * stream, whether using NewReno or not, will eventually buffer as
1829 * many packets as it is able and the only reason this typically works is
1830 * due to the fairly small default buffers made available for a connection
1831 * (typicaly 16K or 32K). As machines use larger windows and/or window
1832 * scaling it is now fairly easy for even a single TCP connection to blow-out
1833 * all available buffer space not only on the local interface, but on
1834 * intermediate routers and switches as well. NewReno makes a misguided
1835 * attempt to 'solve' this problem by waiting for an actual failure to occur,
1836 * then backing off, then steadily increasing the window again until another
1837 * failure occurs, ad-infinitum. This results in terrible oscillation that
1838 * is only made worse as network loads increase and the idea of intentionally
1839 * blowing out network buffers is, frankly, a terrible way to manage network
1840 * resources.
1841 *
1842 * It is far better to limit the transmit window prior to the failure
1843 * condition being achieved. There are two general ways to do this: First
1844 * you can 'scan' through different transmit window sizes and locate the
1845 * point where the RTT stops increasing, indicating that you have filled the
1846 * pipe, then scan backwards until you note that RTT stops decreasing, then
1847 * repeat ad-infinitum. This method works in principle but has severe
1848 * implementation issues due to RTT variances, timer granularity, and
1849 * instability in the algorithm which can lead to many false positives and
1850 * create oscillations as well as interact badly with other TCP streams
1851 * implementing the same algorithm.
1852 *
1853 * The second method is to limit the window to the bandwidth delay product
1854 * of the link. This is the method we implement. RTT variances and our
1855 * own manipulation of the congestion window, bwnd, can potentially
1856 * destabilize the algorithm. For this reason we have to stabilize the
1857 * elements used to calculate the window. We do this by using the minimum
1858 * observed RTT, the long term average of the observed bandwidth, and
1859 * by adding two segments worth of slop. It isn't perfect but it is able
1860 * to react to changing conditions and gives us a very stable basis on
1861 * which to extend the algorithm.
1862 */
/*
 * Update the transmit-side bandwidth-delay-product window limit
 * (snd_bwnd) for this connection; see the block comment above for the
 * rationale.  Called with the sequence number just acknowledged; keeps
 * a long-term smoothed bandwidth estimate in tp->snd_bandwidth.
 */
void
tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
{
	u_long bw;
	u_long bwnd;
	int save_ticks;

	/*
	 * If inflight_enable is disabled in the middle of a tcp connection,
	 * make sure snd_bwnd is effectively disabled.
	 */
	if (tcp_inflight_enable == 0) {
		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
		tp->snd_bandwidth = 0;
		return;
	}

	/*
	 * Figure out the bandwidth.  Due to the tick granularity this
	 * is a very rough number and it MUST be averaged over a fairly
	 * long period of time.  XXX we need to take into account a link
	 * that is not using all available bandwidth, but for now our
	 * slop will ramp us up if this case occurs and the bandwidth later
	 * increases.
	 *
	 * Note: if ticks rollover 'bw' may wind up negative.  We must
	 * effectively reset t_bw_rtttime for this case.
	 */
	save_ticks = ticks;
	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
		return;		/* need at least one tick between samples */

	/* Instantaneous bandwidth: acked bytes / elapsed seconds. */
	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
	    (save_ticks - tp->t_bw_rtttime);
	tp->t_bw_rtttime = save_ticks;
	tp->t_bw_rtseq = ack_seq;
	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
		return;		/* first sample or tick rollover: discard */
	/* Exponential moving average: 15/16 old estimate + 1/16 new. */
	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;

	tp->snd_bandwidth = bw;

	/*
	 * Calculate the semi-static bandwidth delay product, plus two maximal
	 * segments.  The additional slop puts us squarely in the sweet
	 * spot and also handles the bandwidth run-up case and stabilization.
	 * Without the slop we could be locking ourselves into a lower
	 * bandwidth.
	 *
	 * Situations Handled:
	 *	(1) Prevents over-queueing of packets on LANs, especially on
	 *	    high speed LANs, allowing larger TCP buffers to be
	 *	    specified, and also does a good job preventing
	 *	    over-queueing of packets over choke points like modems
	 *	    (at least for the transmit side).
	 *
	 *	(2) Is able to handle changing network loads (bandwidth
	 *	    drops so bwnd drops, bandwidth increases so bwnd
	 *	    increases).
	 *
	 *	(3) Theoretically should stabilize in the face of multiple
	 *	    connections implementing the same algorithm (this may need
	 *	    a little work).
	 *
	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
	 *	    be adjusted with a sysctl but typically only needs to be
	 *	    on very slow connections.  A value no smaller then 5
	 *	    should be used, but only reduce this default if you have
	 *	    no other choice.
	 */
#define	USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
#undef USERTT

	if (tcp_inflight_debug > 0) {
		static int ltime;
		/* Rate-limit debug output to tcp_inflight_debug lines/sec. */
		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
			ltime = ticks;
			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
			    tp,
			    bw,
			    tp->t_rttbest,
			    tp->t_srtt,
			    bwnd
			);
		}
	}
	/* Clamp to sysctl bounds and never below two maximal segments. */
	if ((long)bwnd < tcp_inflight_min)
		bwnd = tcp_inflight_min;
	if (bwnd > tcp_inflight_max)
		bwnd = tcp_inflight_max;
	if ((long)bwnd < tp->t_maxseg * 2)
		bwnd = tp->t_maxseg * 2;
	tp->snd_bwnd = bwnd;
}
1958
Cache object: 39abc1178411429076ce2cf5af8c716f
|