1 /*
2 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
3 * The Regents of the University of California. All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 * 3. All advertising materials mentioning features or use of this software
14 * must display the following acknowledgement:
15 * This product includes software developed by the University of
16 * California, Berkeley and its contributors.
17 * 4. Neither the name of the University nor the names of its contributors
18 * may be used to endorse or promote products derived from this software
19 * without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
34 * $FreeBSD: releng/5.1/sys/netinet/tcp_subr.c 127036 2004-03-15 20:02:07Z fjoe $
35 */
36
37 #include "opt_compat.h"
38 #include "opt_inet6.h"
39 #include "opt_ipsec.h"
40 #include "opt_mac.h"
41 #include "opt_tcpdebug.h"
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/callout.h>
46 #include <sys/kernel.h>
47 #include <sys/sysctl.h>
48 #include <sys/mac.h>
49 #include <sys/malloc.h>
50 #include <sys/mbuf.h>
51 #ifdef INET6
52 #include <sys/domain.h>
53 #endif
54 #include <sys/proc.h>
55 #include <sys/socket.h>
56 #include <sys/socketvar.h>
57 #include <sys/protosw.h>
58 #include <sys/random.h>
59
60 #include <vm/uma.h>
61
62 #include <net/route.h>
63 #include <net/if.h>
64
65 #include <netinet/in.h>
66 #include <netinet/in_systm.h>
67 #include <netinet/ip.h>
68 #ifdef INET6
69 #include <netinet/ip6.h>
70 #endif
71 #include <netinet/in_pcb.h>
72 #ifdef INET6
73 #include <netinet6/in6_pcb.h>
74 #endif
75 #include <netinet/in_var.h>
76 #include <netinet/ip_var.h>
77 #ifdef INET6
78 #include <netinet6/ip6_var.h>
79 #endif
80 #include <netinet/tcp.h>
81 #include <netinet/tcp_fsm.h>
82 #include <netinet/tcp_seq.h>
83 #include <netinet/tcp_timer.h>
84 #include <netinet/tcp_var.h>
85 #ifdef INET6
86 #include <netinet6/tcp6_var.h>
87 #endif
88 #include <netinet/tcpip.h>
89 #ifdef TCPDEBUG
90 #include <netinet/tcp_debug.h>
91 #endif
92 #include <netinet6/ip6protosw.h>
93
94 #ifdef IPSEC
95 #include <netinet6/ipsec.h>
96 #ifdef INET6
97 #include <netinet6/ipsec6.h>
98 #endif
99 #endif /*IPSEC*/
100
101 #ifdef FAST_IPSEC
102 #include <netipsec/ipsec.h>
103 #ifdef INET6
104 #include <netipsec/ipsec6.h>
105 #endif
106 #define IPSEC
107 #endif /*FAST_IPSEC*/
108
109 #include <machine/in_cksum.h>
110 #include <sys/md5.h>
111
/* Default maximum segment size used when no better path information exists. */
int 	tcp_mssdflt = TCP_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
    &tcp_mssdflt , 0, "Default TCP Maximum Segment Size");

#ifdef INET6
/* IPv6 counterpart of tcp_mssdflt (smaller to fit the larger IPv6 header). */
int	tcp_v6mssdflt = TCP6_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
	CTLFLAG_RW, &tcp_v6mssdflt , 0,
	"Default TCP Maximum Segment Size for IPv6");
#endif

#if 0
static int 	tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
    &tcp_rttdflt , 0, "Default maximum TCP Round Trip Time");
#endif

/* RFC 1323 (window scaling + timestamps) negotiation, on by default. */
int	tcp_do_rfc1323 = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
    &tcp_do_rfc1323 , 0, "Enable rfc1323 (high performance TCP) extensions");

/* RFC 1644 (T/TCP) negotiation, off by default. */
int	tcp_do_rfc1644 = 0;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
    &tcp_do_rfc1644 , 0, "Enable rfc1644 (TTCP) extensions");

/* Actual TCB hash size chosen at boot (read-only; set in tcp_init()). */
static int	tcp_tcbhashsize = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

static int	do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
    &tcbinfo.ipi_count, 0, "Number of active PCBs");

static int	icmp_may_rst = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
    "Certain ICMP unreachable messages may abort connections in SYN_SENT");

/* 0 disables periodic reseeding of the ISN secret (see tcp_new_isn()). */
static int	tcp_isn_reseed_interval = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");

/*
 * TCP bandwidth limiting sysctls.  Note that the default lower bound of
 * 1024 exists only for debugging.  A good production default would be
 * something like 6100.
 */
static int	tcp_inflight_enable = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");

static int	tcp_inflight_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");

static int	tcp_inflight_min = 6144;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");

static int	tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");

/* Stabilization factor in units of 1/10 packet (20 == 2 packets). */
static int	tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
    &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
179
static void	tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
static void	tcp_discardcb(struct tcpcb *);

/*
 * Target size of TCP PCB hash tables. Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize
 */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE	512
#endif

/*
 * XXX
 * Callouts should be moved into struct tcp directly.  They are currently
 * separate because the tcpcb structure is exported to userland for sysctl
 * parsing purposes, which do not know about callouts.
 */
struct tcpcb_mem {
	struct	tcpcb tcb;
	struct	callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep;
	struct	callout tcpcb_mem_2msl, tcpcb_mem_delack;
};

/* UMA zones for tcpcb_mem blocks and compressed time-wait state. */
static uma_zone_t tcpcb_zone;
static uma_zone_t tcptw_zone;
208
/*
 * Tcp initialization: set global timer defaults, create the TCB hash
 * tables and the UMA zones backing inpcbs, tcpcbs and time-wait blocks,
 * then initialize the timer, syncache and reassembly subsystems.
 * Called once at boot.
 */
void
tcp_init()
{
	int hashsize = TCBHASHSIZE;

	tcp_ccgen = 1;
	tcp_cleartaocache();

	/* Timer intervals in ticks, derived from the TCPTV_* constants. */
	tcp_delacktime = TCPTV_DELACK;
	tcp_keepinit = TCPTV_KEEP_INIT;
	tcp_keepidle = TCPTV_KEEP_IDLE;
	tcp_keepintvl = TCPTV_KEEPINTVL;
	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
	tcp_msl = TCPTV_MSL;
	tcp_rexmit_min = TCPTV_MIN;
	tcp_rexmit_slop = TCPTV_CPU_VAR;

	INP_INFO_LOCK_INIT(&tcbinfo, "tcp");
	LIST_INIT(&tcb);
	tcbinfo.listhead = &tcb;
	/* Boot-time override via the kernel environment; must stay a
	 * power of two because hashinit() masks, not mods. */
	TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
	if (!powerof2(hashsize)) {
		printf("WARNING: TCB hash size not a power of 2\n");
		hashsize = 512; /* safe default */
	}
	tcp_tcbhashsize = hashsize;
	tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
	tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
					&tcbinfo.porthashmask);
	tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
#ifdef INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
	/* A TCP/IP header must fit in a single mbuf header. */
	if (max_protohdr < TCP_MINPROTOHDR)
		max_protohdr = TCP_MINPROTOHDR;
	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
		panic("tcp_init");
#undef TCP_MINPROTOHDR
	/*
	 * These have to be type stable for the benefit of the timers.
	 */
	tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcpcb_zone, maxsockets);
	tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw),
	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
	uma_zone_set_max(tcptw_zone, maxsockets);
	tcp_timer_init();
	syncache_init();
	tcp_reass_init();
}
267
268 /*
269 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
270 * tcp_template used to store this data in mbufs, but we now recopy it out
271 * of the tcpcb each time to conserve mbufs.
272 */
273 void
274 tcpip_fillheaders(inp, ip_ptr, tcp_ptr)
275 struct inpcb *inp;
276 void *ip_ptr;
277 void *tcp_ptr;
278 {
279 struct tcphdr *th = (struct tcphdr *)tcp_ptr;
280
281 #ifdef INET6
282 if ((inp->inp_vflag & INP_IPV6) != 0) {
283 struct ip6_hdr *ip6;
284
285 ip6 = (struct ip6_hdr *)ip_ptr;
286 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
287 (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
288 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
289 (IPV6_VERSION & IPV6_VERSION_MASK);
290 ip6->ip6_nxt = IPPROTO_TCP;
291 ip6->ip6_plen = sizeof(struct tcphdr);
292 ip6->ip6_src = inp->in6p_laddr;
293 ip6->ip6_dst = inp->in6p_faddr;
294 } else
295 #endif
296 {
297 struct ip *ip;
298
299 ip = (struct ip *)ip_ptr;
300 ip->ip_v = IPVERSION;
301 ip->ip_hl = 5;
302 ip->ip_tos = inp->inp_ip_tos;
303 ip->ip_len = 0;
304 ip->ip_id = 0;
305 ip->ip_off = 0;
306 ip->ip_ttl = inp->inp_ip_ttl;
307 ip->ip_sum = 0;
308 ip->ip_p = IPPROTO_TCP;
309 ip->ip_src = inp->inp_laddr;
310 ip->ip_dst = inp->inp_faddr;
311 }
312 th->th_sport = inp->inp_lport;
313 th->th_dport = inp->inp_fport;
314 th->th_seq = 0;
315 th->th_ack = 0;
316 th->th_x2 = 0;
317 th->th_off = 5;
318 th->th_flags = 0;
319 th->th_win = 0;
320 th->th_urp = 0;
321 th->th_sum = 0; /* in_pseudo() is called later for ipv4 */
322 }
323
324 /*
325 * Create template to be used to send tcp packets on a connection.
326 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
327 * use for this function is in keepalives, which use tcp_respond.
328 */
329 struct tcptemp *
330 tcpip_maketemplate(inp)
331 struct inpcb *inp;
332 {
333 struct mbuf *m;
334 struct tcptemp *n;
335
336 m = m_get(M_DONTWAIT, MT_HEADER);
337 if (m == NULL)
338 return (0);
339 m->m_len = sizeof(struct tcptemp);
340 n = mtod(m, struct tcptemp *);
341
342 tcpip_fillheaders(inp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
343 return (n);
344 }
345
/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the segment ti,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
void
tcp_respond(tp, ipgen, th, m, ack, seq, flags)
	struct tcpcb *tp;
	void *ipgen;
	register struct tcphdr *th;
	register struct mbuf *m;
	tcp_seq ack, seq;
	int flags;
{
	register int tlen;
	int win = 0;
	struct route *ro = 0;
	struct route sro;
	struct ip *ip;
	struct tcphdr *nth;
#ifdef INET6
	struct route_in6 *ro6 = 0;
	struct route_in6 sro6;
	struct ip6_hdr *ip6;
	int isipv6;
#endif /* INET6 */
	int ipflags = 0;

	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));

#ifdef INET6
	/* The version nibble occupies the same position in v4 and v6
	 * headers, so peeking through a struct ip is safe here. */
	isipv6 = ((struct ip *)ipgen)->ip_v == 6;
	ip6 = ipgen;
#endif /* INET6 */
	ip = ipgen;

	if (tp) {
		if (!(flags & TH_RST)) {
			/* Advertise the current receive window, capped at
			 * the largest value representable after scaling. */
			win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
			if (win > (long)TCP_MAXWIN << tp->rcv_scale)
				win = (long)TCP_MAXWIN << tp->rcv_scale;
		}
#ifdef INET6
		if (isipv6)
			ro6 = &tp->t_inpcb->in6p_route;
		else
#endif /* INET6 */
		ro = &tp->t_inpcb->inp_route;
	} else {
		/* No connection: use a zeroed route on the stack and
		 * release any cached rtentry before returning. */
#ifdef INET6
		if (isipv6) {
			ro6 = &sro6;
			bzero(ro6, sizeof *ro6);
		} else
#endif /* INET6 */
		{
			ro = &sro;
			bzero(ro, sizeof *ro);
		}
	}
	if (m == 0) {
		/*
		 * Keepalive case: allocate a header mbuf and copy the
		 * template IP and TCP headers into it.
		 */
		m = m_gethdr(M_DONTWAIT, MT_HEADER);
		if (m == NULL)
			return;
		tlen = 0;
		m->m_data += max_linkhdr;
#ifdef INET6
		if (isipv6) {
			bcopy((caddr_t)ip6, mtod(m, caddr_t),
			      sizeof(struct ip6_hdr));
			ip6 = mtod(m, struct ip6_hdr *);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
			ip = mtod(m, struct ip *);
			nth = (struct tcphdr *)(ip + 1);
		}
		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
		flags = TH_ACK;
	} else {
		/*
		 * Reply case: reuse the received mbuf in place.  Trim any
		 * chained mbufs, rewind m_data to the IP header, and swap
		 * the address and port pairs so the packet goes back to
		 * its originator.
		 */
		m_freem(m->m_next);
		m->m_next = 0;
		m->m_data = (caddr_t)ipgen;
		/* m_len is set later */
		tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
		if (isipv6) {
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			nth = (struct tcphdr *)(ip6 + 1);
		} else
#endif /* INET6 */
		{
			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
			nth = (struct tcphdr *)(ip + 1);
		}
		if (th != nth) {
			/*
			 * this is usually a case when an extension header
			 * exists between the IPv6 header and the
			 * TCP header.
			 */
			nth->th_sport = th->th_sport;
			nth->th_dport = th->th_dport;
		}
		xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
	}
	/* Rebuild the network header lengths/fields for the response. */
#ifdef INET6
	if (isipv6) {
		ip6->ip6_flow = 0;
		ip6->ip6_vfc = IPV6_VERSION;
		ip6->ip6_nxt = IPPROTO_TCP;
		ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
						tlen));
		tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
	} else
#endif
	{
		tlen += sizeof (struct tcpiphdr);
		ip->ip_len = tlen;
		ip->ip_ttl = ip_defttl;
	}
	m->m_len = tlen;
	m->m_pkthdr.len = tlen;
	m->m_pkthdr.rcvif = (struct ifnet *) 0;
#ifdef MAC
	if (tp != NULL && tp->t_inpcb != NULL) {
		/*
		 * Packet is associated with a socket, so allow the
		 * label of the response to reflect the socket label.
		 */
		mac_create_mbuf_from_socket(tp->t_inpcb->inp_socket, m);
	} else {
		/*
		 * XXXMAC: This will need to call a mac function that
		 * modifies the mbuf label in place for TCP datagrams
		 * not associated with a PCB.
		 */
	}
#endif
	nth->th_seq = htonl(seq);
	nth->th_ack = htonl(ack);
	nth->th_x2 = 0;
	nth->th_off = sizeof (struct tcphdr) >> 2;
	nth->th_flags = flags;
	if (tp)
		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
	else
		nth->th_win = htons((u_short)win);
	nth->th_urp = 0;
#ifdef INET6
	if (isipv6) {
		/* in6_cksum() computes the full checksum; no hw offload. */
		nth->th_sum = 0;
		nth->th_sum = in6_cksum(m, IPPROTO_TCP,
					sizeof(struct ip6_hdr),
					tlen - sizeof(struct ip6_hdr));
		ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
					       ro6 && ro6->ro_rt ?
					       ro6->ro_rt->rt_ifp :
					       NULL);
	} else
#endif /* INET6 */
	{
		/* IPv4 path: store the pseudo-header sum and defer the
		 * rest to checksum offload / ip_output(). */
		nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
	}
#ifdef TCPDEBUG
	if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
	if (isipv6) {
		(void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL,
		    tp ? tp->t_inpcb : NULL);
		/* Drop any route cached on the stack before it goes away. */
		if (ro6 == &sro6 && ro6->ro_rt) {
			RTFREE(ro6->ro_rt);
			ro6->ro_rt = NULL;
		}
	} else
#endif /* INET6 */
	{
		(void) ip_output(m, NULL, ro, ipflags, NULL, tp ? tp->t_inpcb : NULL);
		if (ro == &sro && ro->ro_rt) {
			RTFREE(ro->ro_rt);
			ro->ro_rt = NULL;
		}
	}
}
548
/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 *
 * Returns NULL if the zone allocation fails.
 */
struct tcpcb *
tcp_newtcpcb(inp)
	struct inpcb *inp;
{
	struct tcpcb_mem *tm;
	struct tcpcb *tp;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

	tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
	if (tm == NULL)
		return (NULL);
	tp = &tm->tcb;
	/* LIST_INIT(&tp->t_segq); */	/* XXX covered by M_ZERO */
	tp->t_maxseg = tp->t_maxopd =
#ifdef INET6
		isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
		tcp_mssdflt;

	/* Set up our timeouts; storage lives in the same tcpcb_mem block. */
	callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0);
	callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0);
	callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0);
	callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0);
	callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0);

	/* Request RFC 1323 / RFC 1644 options according to global policy. */
	if (tcp_do_rfc1323)
		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
	if (tcp_do_rfc1644)
		tp->t_flags |= TF_REQ_CC;
	tp->t_inpcb = inp;	/* XXX */
	/*
	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
	 * reasonable initial retransmit time.
	 */
	tp->t_srtt = TCPTV_SRTTBASE;
	tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
	tp->t_rttmin = tcp_rexmit_min;
	tp->t_rxtcur = TCPTV_RTOBASE;
	/* Start the windows wide open; slow start and the inflight
	 * limiter will bring them down to sensible values. */
	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	tp->t_rcvtime = ticks;
	tp->t_bw_rtttime = ticks;
	/*
	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
	 * because the socket may be bound to an IPv6 wildcard address,
	 * which may match an IPv4-mapped IPv6 address.
	 */
	inp->inp_ip_ttl = ip_defttl;
	inp->inp_ppcb = (caddr_t)tp;
	return (tp);		/* XXX */
}
611
612 /*
613 * Drop a TCP connection, reporting
614 * the specified error. If connection is synchronized,
615 * then send a RST to peer.
616 */
617 struct tcpcb *
618 tcp_drop(tp, errno)
619 register struct tcpcb *tp;
620 int errno;
621 {
622 struct socket *so = tp->t_inpcb->inp_socket;
623
624 if (TCPS_HAVERCVDSYN(tp->t_state)) {
625 tp->t_state = TCPS_CLOSED;
626 (void) tcp_output(tp);
627 tcpstat.tcps_drops++;
628 } else
629 tcpstat.tcps_conndrops++;
630 if (errno == ETIMEDOUT && tp->t_softerror)
631 errno = tp->t_softerror;
632 so->so_error = errno;
633 return (tcp_close(tp));
634 }
635
/*
 * Tear down a tcpcb: stop all timers, opportunistically cache RTT/RTTVAR/
 * ssthresh metrics in the routing entry, flush the reassembly queue and
 * free the control block.  The inpcb is unhooked but left for the caller
 * (tcp_close()/time-wait code) to detach.
 */
static void
tcp_discardcb(tp)
	struct tcpcb *tp;
{
	struct tseg_qent *q;
	struct inpcb *inp = tp->t_inpcb;
	struct socket *so = inp->inp_socket;
#ifdef INET6
	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
	struct rtentry *rt;
	int dosavessthresh;

	/*
	 * Make sure that all of our timers are stopped before we
	 * delete the PCB.
	 */
	callout_stop(tp->tt_rexmt);
	callout_stop(tp->tt_persist);
	callout_stop(tp->tt_keep);
	callout_stop(tp->tt_2msl);
	callout_stop(tp->tt_delack);

	/*
	 * If we got enough samples through the srtt filter,
	 * save the rtt and rttvar in the routing entry.
	 * 'Enough' is arbitrarily defined as the 16 samples.
	 * 16 samples is enough for the srtt filter to converge
	 * to within 5% of the correct value; fewer samples and
	 * we could save a very bogus rtt.
	 *
	 * Don't update the default route's characteristics and don't
	 * update anything that the user "locked".
	 */
	if (tp->t_rttupdated >= 16) {
		register u_long i = 0;
#ifdef INET6
		if (isipv6) {
			struct sockaddr_in6 *sin6;

			if ((rt = inp->in6p_route.ro_rt) == NULL)
				goto no_valid_rt;
			sin6 = (struct sockaddr_in6 *)rt_key(rt);
			if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
				goto no_valid_rt;
		}
		else
#endif /* INET6 */
		if ((rt = inp->inp_route.ro_rt) == NULL ||
		    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
		    == INADDR_ANY)
			goto no_valid_rt;

		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			i = tp->t_srtt *
			    (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
			if (rt->rt_rmx.rmx_rtt && i)
				/*
				 * filter this update to half the old & half
				 * the new values, converting scale.
				 * See route.h and tcp_var.h for a
				 * description of the scaling constants.
				 */
				rt->rt_rmx.rmx_rtt =
				    (rt->rt_rmx.rmx_rtt + i) / 2;
			else
				rt->rt_rmx.rmx_rtt = i;
			tcpstat.tcps_cachedrtt++;
		}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			i = tp->t_rttvar *
			    (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
			if (rt->rt_rmx.rmx_rttvar && i)
				rt->rt_rmx.rmx_rttvar =
				    (rt->rt_rmx.rmx_rttvar + i) / 2;
			else
				rt->rt_rmx.rmx_rttvar = i;
			tcpstat.tcps_cachedrttvar++;
		}
		/*
		 * The old comment here said:
		 * update the pipelimit (ssthresh) if it has been updated
		 * already or if a pipesize was specified & the threshold
		 * got below half the pipesize.  I.e., wait for bad news
		 * before we start updating, then update on both good
		 * and bad news.
		 *
		 * But we want to save the ssthresh even if no pipesize is
		 * specified explicitly in the route, because such
		 * connections still have an implicit pipesize specified
		 * by the global tcp_sendspace.  In the absence of a reliable
		 * way to calculate the pipesize, it will have to do.
		 */
		i = tp->snd_ssthresh;
		if (rt->rt_rmx.rmx_sendpipe != 0)
			dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
		else
			dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		     i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
		    || dosavessthresh) {
			/*
			 * convert the limit from user data bytes to
			 * packets then to packet data bytes.
			 */
			i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
			if (i < 2)
				i = 2;
			i *= (u_long)(tp->t_maxseg +
#ifdef INET6
				      (isipv6 ? sizeof (struct ip6_hdr) +
					       sizeof (struct tcphdr) :
#endif
				       sizeof (struct tcpiphdr)
#ifdef INET6
				       )
#endif
				      );
			if (rt->rt_rmx.rmx_ssthresh)
				rt->rt_rmx.rmx_ssthresh =
				    (rt->rt_rmx.rmx_ssthresh + i) / 2;
			else
				rt->rt_rmx.rmx_ssthresh = i;
			tcpstat.tcps_cachedssthresh++;
		}
	}
    no_valid_rt:
	/* free the reassembly queue, if any */
	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		uma_zfree(tcp_reass_zone, q);
		tcp_reass_qsize--;
	}
	/* Unhook the tcpcb from the inpcb before freeing it. */
	inp->inp_ppcb = NULL;
	tp->t_inpcb = NULL;
	uma_zfree(tcpcb_zone, tp);
	soisdisconnected(so);
}
775
776 /*
777 * Close a TCP control block:
778 * discard all space held by the tcp
779 * discard internet protocol block
780 * wake up any sleepers
781 */
782 struct tcpcb *
783 tcp_close(tp)
784 struct tcpcb *tp;
785 {
786 struct inpcb *inp = tp->t_inpcb;
787 #ifdef INET6
788 struct socket *so = inp->inp_socket;
789 #endif
790
791 tcp_discardcb(tp);
792 #ifdef INET6
793 if (INP_CHECK_SOCKAF(so, AF_INET6))
794 in6_pcbdetach(inp);
795 else
796 #endif
797 in_pcbdetach(inp);
798 tcpstat.tcps_closed++;
799 return ((struct tcpcb *)0);
800 }
801
/*
 * Called by the mbuf subsystem when the system is running low on mbufs:
 * walk every TCP PCB and free its reassembly queue to reclaim memory.
 * Disabled by setting net.inet.tcp.do_tcpdrain = 0.
 */
void
tcp_drain()
{
	if (do_tcpdrain)
	{
		struct inpcb *inpb;
		struct tcpcb *tcpb;
		struct tseg_qent *te;

		/*
		 * Walk the tcpbs, if existing, and flush the reassembly queue,
		 * if there is one...
		 * XXX: The "Net/3" implementation doesn't imply that the TCP
		 * reassembly queue should be flushed, but in a situation
		 * where we're really low on mbufs, this is potentially
		 * useful.
		 */
		INP_INFO_RLOCK(&tcbinfo);
		LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
			/* Time-wait PCBs have no tcpcb/reassembly state. */
			if (inpb->inp_vflag & INP_TIMEWAIT)
				continue;
			INP_LOCK(inpb);
			if ((tcpb = intotcpcb(inpb))) {
				while ((te = LIST_FIRST(&tcpb->t_segq))
			            != NULL) {
					LIST_REMOVE(te, tqe_q);
					m_freem(te->tqe_m);
					uma_zfree(tcp_reass_zone, te);
					tcp_reass_qsize--;
				}
			}
			INP_UNLOCK(inpb);
		}
		INP_INFO_RUNLOCK(&tcbinfo);
	}
}
838
/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 *
 * Returns the inpcb if it is still valid for further notification,
 * or NULL if the connection was dropped here.
 */
static struct inpcb *
tcp_notify(inp, error)
	struct inpcb *inp;
	int error;
{
	/*
	 * NOTE(review): inp_ppcb is dereferenced unchecked.  For
	 * INP_TIMEWAIT PCBs inp_ppcb holds a struct tcptw, not a tcpcb
	 * (see tcp_pcblist), so callers are presumably expected never to
	 * pass one here — confirm at the notification call sites.
	 */
	struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;

	/*
	 * Ignore some errors if we are hooked up.
	 * If connection hasn't completed, has retransmitted several times,
	 * and receives a second error, give up now.  This is better
	 * than waiting a long time to establish a connection that
	 * can never complete.
	 */
	if (tp->t_state == TCPS_ESTABLISHED &&
	     (error == EHOSTUNREACH || error == ENETUNREACH ||
	      error == EHOSTDOWN)) {
		/* Transient unreachability on an established connection. */
		return inp;
	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
	    tp->t_softerror) {
		/* Second error while still connecting: give up. */
		tcp_drop(tp, error);
		return (struct inpcb *)0;
	} else {
		/* Record as a soft error for a possible later report. */
		tp->t_softerror = error;
		return inp;
	}
#if 0
	wakeup( &so->so_timeo);
	sorwakeup(so);
	sowwakeup(so);
#endif
}
879
880 static int
881 tcp_pcblist(SYSCTL_HANDLER_ARGS)
882 {
883 int error, i, n, s;
884 struct inpcb *inp, **inp_list;
885 inp_gen_t gencnt;
886 struct xinpgen xig;
887
888 /*
889 * The process of preparing the TCB list is too time-consuming and
890 * resource-intensive to repeat twice on every request.
891 */
892 if (req->oldptr == 0) {
893 n = tcbinfo.ipi_count;
894 req->oldidx = 2 * (sizeof xig)
895 + (n + n/8) * sizeof(struct xtcpcb);
896 return 0;
897 }
898
899 if (req->newptr != 0)
900 return EPERM;
901
902 /*
903 * OK, now we're committed to doing something.
904 */
905 s = splnet();
906 INP_INFO_RLOCK(&tcbinfo);
907 gencnt = tcbinfo.ipi_gencnt;
908 n = tcbinfo.ipi_count;
909 INP_INFO_RUNLOCK(&tcbinfo);
910 splx(s);
911
912 sysctl_wire_old_buffer(req, 2 * (sizeof xig)
913 + n * sizeof(struct xtcpcb));
914
915 xig.xig_len = sizeof xig;
916 xig.xig_count = n;
917 xig.xig_gen = gencnt;
918 xig.xig_sogen = so_gencnt;
919 error = SYSCTL_OUT(req, &xig, sizeof xig);
920 if (error)
921 return error;
922
923 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
924 if (inp_list == 0)
925 return ENOMEM;
926
927 s = splnet();
928 INP_INFO_RLOCK(&tcbinfo);
929 for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
930 inp = LIST_NEXT(inp, inp_list)) {
931 INP_LOCK(inp);
932 if (inp->inp_gencnt <= gencnt) {
933 /*
934 * XXX: This use of cr_cansee(), introduced with
935 * TCP state changes, is not quite right, but for
936 * now, better than nothing.
937 */
938 if (inp->inp_vflag & INP_TIMEWAIT)
939 error = cr_cansee(req->td->td_ucred,
940 intotw(inp)->tw_cred);
941 else
942 error = cr_canseesocket(req->td->td_ucred,
943 inp->inp_socket);
944 if (error == 0)
945 inp_list[i++] = inp;
946 }
947 INP_UNLOCK(inp);
948 }
949 INP_INFO_RUNLOCK(&tcbinfo);
950 splx(s);
951 n = i;
952
953 error = 0;
954 for (i = 0; i < n; i++) {
955 inp = inp_list[i];
956 if (inp->inp_gencnt <= gencnt) {
957 struct xtcpcb xt;
958 caddr_t inp_ppcb;
959 xt.xt_len = sizeof xt;
960 /* XXX should avoid extra copy */
961 bcopy(inp, &xt.xt_inp, sizeof *inp);
962 inp_ppcb = inp->inp_ppcb;
963 if (inp_ppcb == NULL)
964 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
965 else if (inp->inp_vflag & INP_TIMEWAIT) {
966 bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
967 xt.xt_tp.t_state = TCPS_TIME_WAIT;
968 } else
969 bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
970 if (inp->inp_socket)
971 sotoxsocket(inp->inp_socket, &xt.xt_socket);
972 else {
973 bzero(&xt.xt_socket, sizeof xt.xt_socket);
974 xt.xt_socket.xso_protocol = IPPROTO_TCP;
975 }
976 xt.xt_inp.inp_gencnt = inp->inp_gencnt;
977 error = SYSCTL_OUT(req, &xt, sizeof xt);
978 }
979 }
980 if (!error) {
981 /*
982 * Give the user an updated idea of our state.
983 * If the generation differs from what we told
984 * her before, she knows that something happened
985 * while we were processing this request, and it
986 * might be necessary to retry.
987 */
988 s = splnet();
989 INP_INFO_RLOCK(&tcbinfo);
990 xig.xig_gen = tcbinfo.ipi_gencnt;
991 xig.xig_sogen = so_gencnt;
992 xig.xig_count = tcbinfo.ipi_count;
993 INP_INFO_RUNLOCK(&tcbinfo);
994 splx(s);
995 error = SYSCTL_OUT(req, &xig, sizeof xig);
996 }
997 free(inp_list, M_TEMP);
998 return error;
999 }
1000
1001 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
1002 tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
1003
/*
 * Sysctl handler: return the credentials (struct xucred) of the socket
 * owning the TCP connection identified by a pair of struct sockaddr_in
 * supplied by userland — local endpoint in addrs[0], foreign endpoint
 * in addrs[1].  Requires superuser (may pierce prison visibility).
 */
static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in addrs[2];
	struct inpcb *inp;
	int error, s;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	/* Exact-match lookup: foreign addr/port first, then local. */
	inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
	    addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
	if (inp == NULL) {
		error = ENOENT;
		goto outunlocked;
	}
	INP_LOCK(inp);
	if (inp->inp_socket == NULL) {
		error = ENOENT;
		goto out;
	}
	/* Only expose credentials the caller is permitted to see. */
	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
	if (error)
		goto out;
	cru2x(inp->inp_socket->so_cred, &xuc);
out:
	INP_UNLOCK(inp);
outunlocked:
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	/* Copy out only after all locks are released. */
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}
1044
1045 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
1046 CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
1047 tcp_getcred, "S,xucred", "Get the xucred of a TCP connection");
1048
1049 #ifdef INET6
/*
 * IPv6 variant of tcp_getcred(): same contract, but the endpoints are
 * struct sockaddr_in6.  V4-mapped addresses are routed through the IPv4
 * PCB lookup; mixing mapped and unmapped endpoints is rejected.
 */
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
	struct xucred xuc;
	struct sockaddr_in6 addrs[2];
	struct inpcb *inp;
	int error, s, mapped = 0;

	error = suser_cred(req->td->td_ucred, PRISON_ROOT);
	if (error)
		return (error);
	error = SYSCTL_IN(req, addrs, sizeof(addrs));
	if (error)
		return (error);
	/* Both endpoints must agree on being v4-mapped or not. */
	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
			mapped = 1;
		else
			return (EINVAL);
	}
	s = splnet();
	INP_INFO_RLOCK(&tcbinfo);
	if (mapped == 1)
		/* The embedded IPv4 address lives in bytes 12..15. */
		inp = in_pcblookup_hash(&tcbinfo,
			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
			addrs[1].sin6_port,
			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
			addrs[0].sin6_port,
			0, NULL);
	else
		inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
				 addrs[1].sin6_port,
				 &addrs[0].sin6_addr, addrs[0].sin6_port,
				 0, NULL);
	if (inp == NULL) {
		error = ENOENT;
		goto outunlocked;
	}
	INP_LOCK(inp);
	if (inp->inp_socket == NULL) {
		error = ENOENT;
		goto out;
	}
	/* Only expose credentials the caller is permitted to see. */
	error = cr_canseesocket(req->td->td_ucred, inp->inp_socket);
	if (error)
		goto out;
	cru2x(inp->inp_socket->so_cred, &xuc);
out:
	INP_UNLOCK(inp);
outunlocked:
	INP_INFO_RUNLOCK(&tcbinfo);
	splx(s);
	/* Copy out only after all locks are released. */
	if (error == 0)
		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
	return (error);
}
1106
/* Userland interface: net.inet6.tcp6.getcred, handled by tcp6_getcred(). */
SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
    CTLTYPE_OPAQUE|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0,
    tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection");
1110 #endif
1111
1112
/*
 * Protocol control input for TCP over IPv4: react to ICMP errors and
 * other PRC_* notifications handed up by the IP layer.
 *
 * cmd	PRC_* code describing the event.
 * sa	sockaddr_in naming the foreign host the error refers to.
 * vip	pointer to the quoted (offending) IP header, with the embedded
 *	TCP header following it, or NULL when no packet is quoted.
 */
void
tcp_ctlinput(cmd, sa, vip)
	int cmd;
	struct sockaddr *sa;
	void *vip;
{
	struct ip *ip = vip;
	struct tcphdr *th;
	struct in_addr faddr;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	tcp_seq icmp_seq;
	int s;

	faddr = ((struct sockaddr_in *)sa)->sin_addr;
	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
		return;

	/* Choose the per-pcb notification routine for this event. */
	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
	    cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
		notify = tcp_drop_syn_sent;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (PRC_IS_REDIRECT(cmd)) {
		/* Redirects affect the whole destination, not one pcb. */
		ip = 0;
		notify = in_rtchange;
	} else if (cmd == PRC_HOSTDEAD)
		ip = 0;
	else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
		return;
	if (ip) {
		s = splnet();
		/* The quoted TCP header follows the quoted IP header. */
		th = (struct tcphdr *)((caddr_t)ip
		    + (ip->ip_hl << 2));
		INP_INFO_WLOCK(&tcbinfo);
		inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
		    ip->ip_src, th->th_sport, 0, NULL);
		if (inp != NULL) {
			INP_LOCK(inp);
			if (inp->inp_socket != NULL) {
				/*
				 * Byte-swap the quoted sequence number to
				 * host order (htonl performs the same swap
				 * as ntohl) and only react if it falls in
				 * the unacked send window — this resists
				 * blind/spoofed ICMP attacks.
				 */
				icmp_seq = htonl(th->th_seq);
				tp = intotcpcb(inp);
				if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
				    SEQ_LT(icmp_seq, tp->snd_max))
					inp = (*notify)(inp, inetctlerrmap[cmd]);
			}
			if (inp)	/* notify may have dropped the pcb */
				INP_UNLOCK(inp);
		} else {
			struct in_conninfo inc;

			/* No pcb: the error may refer to a syncache entry. */
			inc.inc_fport = th->th_dport;
			inc.inc_lport = th->th_sport;
			inc.inc_faddr = faddr;
			inc.inc_laddr = ip->ip_src;
#ifdef INET6
			inc.inc_isipv6 = 0;
#endif
			syncache_unreach(&inc, th);
		}
		INP_INFO_WUNLOCK(&tcbinfo);
		splx(s);
	} else
		/* No quoted packet: notify every pcb talking to faddr. */
		in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
}
1181
1182 #ifdef INET6
/*
 * Protocol control input for TCP over IPv6: react to icmp6 errors and
 * other PRC_* notifications.
 *
 * cmd	PRC_* code describing the event.
 * sa	sockaddr_in6 naming the final destination the error refers to.
 * d	struct ip6ctlparam from the icmp6 layer, or NULL when no packet
 *	is quoted.
 */
void
tcp6_ctlinput(cmd, sa, d)
	int cmd;
	struct sockaddr *sa;
	void *d;
{
	struct tcphdr th;
	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
	struct ip6_hdr *ip6;
	struct mbuf *m;
	struct ip6ctlparam *ip6cp = NULL;
	const struct sockaddr_in6 *sa6_src = NULL;
	int off;
	struct tcp_portonly {
		u_int16_t th_sport;
		u_int16_t th_dport;
	} *thp;		/* used only for sizeof: just the port pair */

	if (sa->sa_family != AF_INET6 ||
	    sa->sa_len != sizeof(struct sockaddr_in6))
		return;

	/* Choose the per-pcb notification routine for this event. */
	if (cmd == PRC_QUENCH)
		notify = tcp_quench;
	else if (cmd == PRC_MSGSIZE)
		notify = tcp_mtudisc;
	else if (!PRC_IS_REDIRECT(cmd) &&
	    ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
		return;

	/* if the parameter is from icmp6, decode it. */
	if (d != NULL) {
		ip6cp = (struct ip6ctlparam *)d;
		m = ip6cp->ip6c_m;
		ip6 = ip6cp->ip6c_ip6;
		off = ip6cp->ip6c_off;
		sa6_src = ip6cp->ip6c_src;
	} else {
		m = NULL;
		ip6 = NULL;
		off = 0;	/* fool gcc */
		sa6_src = &sa6_any;
	}

	if (ip6) {
		struct in_conninfo inc;
		/*
		 * XXX: We assume that when IPV6 is non NULL,
		 * M and OFF are valid.
		 */

		/* check if we can safely examine src and dst ports */
		if (m->m_pkthdr.len < off + sizeof(*thp))
			return;

		bzero(&th, sizeof(th));
		/* Copy out only the port pair; the rest of th stays zero. */
		m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

		in6_pcbnotify(&tcb, sa, th.th_dport,
		    (struct sockaddr *)ip6cp->ip6c_src,
		    th.th_sport, cmd, notify);

		/* Also let the syncache react to the error. */
		inc.inc_fport = th.th_dport;
		inc.inc_lport = th.th_sport;
		inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
		inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
		inc.inc_isipv6 = 1;
		syncache_unreach(&inc, &th);
	} else
		/* No quoted packet: notify everything bound to sa6_src. */
		in6_pcbnotify(&tcb, sa, 0, (const struct sockaddr *)sa6_src,
		    0, cmd, notify);
}
1255 #endif /* INET6 */
1256
1257
1258 /*
1259 * Following is where TCP initial sequence number generation occurs.
1260 *
1261 * There are two places where we must use initial sequence numbers:
1262 * 1. In SYN-ACK packets.
1263 * 2. In SYN packets.
1264 *
1265 * All ISNs for SYN-ACK packets are generated by the syncache. See
1266 * tcp_syncache.c for details.
1267 *
1268 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
1269 * depends on this property. In addition, these ISNs should be
1270 * unguessable so as to prevent connection hijacking. To satisfy
1271 * the requirements of this situation, the algorithm outlined in
1272 * RFC 1948 is used to generate sequence numbers.
1273 *
1274 * Implementation details:
1275 *
1276 * Time is based off the system timer, and is corrected so that it
1277 * increases by one megabyte per second. This allows for proper
1278 * recycling on high speed LANs while still leaving over an hour
1279 * before rollover.
1280 *
1281 * net.inet.tcp.isn_reseed_interval controls the number of seconds
1282 * between seeding of isn_secret. This is normally set to zero,
1283 * as reseeding should not be necessary.
1284 *
1285 */
1286
/* Rate at which the time component of an ISN advances (RFC 1948). */
#define ISN_BYTES_PER_SECOND 1048576

/* RFC 1948 ISN generation state (file-scope, shared). */
u_char isn_secret[32];		/* secret hashed into every ISN */
int isn_last_reseed;		/* `ticks' value at last (re)seeding */
MD5_CTX isn_ctx;		/* scratch MD5 context; NOTE(review): shared
				 * without an explicit lock — presumably
				 * serialized by the callers; verify. */
1292
/*
 * Generate an initial sequence number for an outgoing SYN, per RFC 1948:
 * MD5(foreign port, local port, addresses, secret) plus a time component
 * that advances ISN_BYTES_PER_SECOND.  The hash makes ISNs unguessable
 * per-connection; the time component keeps them monotonic, which
 * TIME_WAIT recycling depends on (see block comment above).
 */
tcp_seq
tcp_new_isn(tp)
	struct tcpcb *tp;
{
	u_int32_t md5_buffer[4];
	tcp_seq new_isn;

	/* Seed if this is the first use, reseed if requested. */
	if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
	    (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
	    < (u_int)ticks))) {
		read_random(&isn_secret, sizeof(isn_secret));
		isn_last_reseed = ticks;
	}

	/* Compute the md5 hash and return the ISN. */
	MD5Init(&isn_ctx);
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
#ifdef INET6
	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
		    sizeof(struct in6_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
		    sizeof(struct in6_addr));
	} else
#endif
	{
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
		    sizeof(struct in_addr));
		MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
		    sizeof(struct in_addr));
	}
	MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
	MD5Final((u_char *) &md5_buffer, &isn_ctx);
	new_isn = (tcp_seq) md5_buffer[0];
	/* Add the monotonic time component (~1MB of sequence per second). */
	new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
	return new_isn;
}
1332
1333 /*
1334 * When a source quench is received, close congestion window
1335 * to one segment. We will gradually open it again as we proceed.
1336 */
1337 struct inpcb *
1338 tcp_quench(inp, errno)
1339 struct inpcb *inp;
1340 int errno;
1341 {
1342 struct tcpcb *tp = intotcpcb(inp);
1343
1344 if (tp)
1345 tp->snd_cwnd = tp->t_maxseg;
1346 return (inp);
1347 }
1348
1349 /*
1350 * When a specific ICMP unreachable message is received and the
1351 * connection state is SYN-SENT, drop the connection. This behavior
1352 * is controlled by the icmp_may_rst sysctl.
1353 */
1354 struct inpcb *
1355 tcp_drop_syn_sent(inp, errno)
1356 struct inpcb *inp;
1357 int errno;
1358 {
1359 struct tcpcb *tp = intotcpcb(inp);
1360
1361 if (tp && tp->t_state == TCPS_SYN_SENT) {
1362 tcp_drop(tp, errno);
1363 return (struct inpcb *)0;
1364 }
1365 return inp;
1366 }
1367
1368 /*
1369 * When `need fragmentation' ICMP is received, update our idea of the MSS
1370 * based on the new value in the route. Also nudge TCP to send something,
1371 * since we know the packet we just sent was dropped.
1372 * This duplicates some code in the tcp_mss() function in tcp_input.c.
1373 */
1374 struct inpcb *
1375 tcp_mtudisc(inp, errno)
1376 struct inpcb *inp;
1377 int errno;
1378 {
1379 struct tcpcb *tp = intotcpcb(inp);
1380 struct rtentry *rt;
1381 struct rmxp_tao *taop;
1382 struct socket *so = inp->inp_socket;
1383 int offered;
1384 int mss;
1385 #ifdef INET6
1386 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
1387 #endif /* INET6 */
1388
1389 if (tp) {
1390 #ifdef INET6
1391 if (isipv6)
1392 rt = tcp_rtlookup6(&inp->inp_inc);
1393 else
1394 #endif /* INET6 */
1395 rt = tcp_rtlookup(&inp->inp_inc);
1396 if (!rt || !rt->rt_rmx.rmx_mtu) {
1397 tp->t_maxopd = tp->t_maxseg =
1398 #ifdef INET6
1399 isipv6 ? tcp_v6mssdflt :
1400 #endif /* INET6 */
1401 tcp_mssdflt;
1402 return inp;
1403 }
1404 taop = rmx_taop(rt->rt_rmx);
1405 offered = taop->tao_mssopt;
1406 mss = rt->rt_rmx.rmx_mtu -
1407 #ifdef INET6
1408 (isipv6 ?
1409 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
1410 #endif /* INET6 */
1411 sizeof(struct tcpiphdr)
1412 #ifdef INET6
1413 )
1414 #endif /* INET6 */
1415 ;
1416
1417 if (offered)
1418 mss = min(mss, offered);
1419 /*
1420 * XXX - The above conditional probably violates the TCP
1421 * spec. The problem is that, since we don't know the
1422 * other end's MSS, we are supposed to use a conservative
1423 * default. But, if we do that, then MTU discovery will
1424 * never actually take place, because the conservative
1425 * default is much less than the MTUs typically seen
1426 * on the Internet today. For the moment, we'll sweep
1427 * this under the carpet.
1428 *
1429 * The conservative default might not actually be a problem
1430 * if the only case this occurs is when sending an initial
1431 * SYN with options and data to a host we've never talked
1432 * to before. Then, they will reply with an MSS value which
1433 * will get recorded and the new parameters should get
1434 * recomputed. For Further Study.
1435 */
1436 if (tp->t_maxopd <= mss)
1437 return inp;
1438 tp->t_maxopd = mss;
1439
1440 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
1441 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
1442 mss -= TCPOLEN_TSTAMP_APPA;
1443 if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
1444 (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
1445 mss -= TCPOLEN_CC_APPA;
1446 #if (MCLBYTES & (MCLBYTES - 1)) == 0
1447 if (mss > MCLBYTES)
1448 mss &= ~(MCLBYTES-1);
1449 #else
1450 if (mss > MCLBYTES)
1451 mss = mss / MCLBYTES * MCLBYTES;
1452 #endif
1453 if (so->so_snd.sb_hiwat < mss)
1454 mss = so->so_snd.sb_hiwat;
1455
1456 tp->t_maxseg = mss;
1457
1458 tcpstat.tcps_mturesent++;
1459 tp->t_rtttime = 0;
1460 tp->snd_nxt = tp->snd_una;
1461 tcp_output(tp);
1462 }
1463 return inp;
1464 }
1465
1466 /*
1467 * Look-up the routing entry to the peer of this inpcb. If no route
1468 * is found and it cannot be allocated, then return NULL. This routine
1469 * is called by TCP routines that access the rmx structure and by tcp_mss
1470 * to get the interface MTU.
1471 */
1472 struct rtentry *
1473 tcp_rtlookup(inc)
1474 struct in_conninfo *inc;
1475 {
1476 struct route *ro;
1477 struct rtentry *rt;
1478
1479 ro = &inc->inc_route;
1480 rt = ro->ro_rt;
1481 if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1482 /* No route yet, so try to acquire one */
1483 if (inc->inc_faddr.s_addr != INADDR_ANY) {
1484 ro->ro_dst.sa_family = AF_INET;
1485 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
1486 ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
1487 inc->inc_faddr;
1488 rtalloc(ro);
1489 rt = ro->ro_rt;
1490 }
1491 }
1492 return rt;
1493 }
1494
1495 #ifdef INET6
1496 struct rtentry *
1497 tcp_rtlookup6(inc)
1498 struct in_conninfo *inc;
1499 {
1500 struct route_in6 *ro6;
1501 struct rtentry *rt;
1502
1503 ro6 = &inc->inc6_route;
1504 rt = ro6->ro_rt;
1505 if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
1506 /* No route yet, so try to acquire one */
1507 if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
1508 ro6->ro_dst.sin6_family = AF_INET6;
1509 ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
1510 ro6->ro_dst.sin6_addr = inc->inc6_faddr;
1511 rtalloc((struct route *)ro6);
1512 rt = ro6->ro_rt;
1513 }
1514 }
1515 return rt;
1516 }
1517 #endif /* INET6 */
1518
1519 #ifdef IPSEC
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
ipsec_hdrsiz_tcp(tp)
	struct tcpcb *tp;
{
	struct inpcb *inp;
	struct mbuf *m;
	size_t hdrsiz;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	struct tcphdr *th;

	if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
		return 0;
	/* Build a throwaway header-only mbuf for the size computation. */
	MGETHDR(m, M_DONTWAIT, MT_DATA);
	if (!m)
		return 0;	/* no mbuf available: report zero overhead */

#ifdef INET6
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		/* Fill in a template IPv6+TCP header and ask IPsec. */
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		m->m_pkthdr.len = m->m_len =
			sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		tcpip_fillheaders(inp, ip6, th);
		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	} else
#endif /* INET6 */
	{
		/* Fill in a template IPv4+TCP header and ask IPsec. */
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
		tcpip_fillheaders(inp, ip, th);
		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	}

	m_free(m);
	return hdrsiz;
}
1561 #endif /*IPSEC*/
1562
1563 /*
1564 * Return a pointer to the cached information about the remote host.
1565 * The cached information is stored in the protocol specific part of
1566 * the route metrics.
1567 */
1568 struct rmxp_tao *
1569 tcp_gettaocache(inc)
1570 struct in_conninfo *inc;
1571 {
1572 struct rtentry *rt;
1573
1574 #ifdef INET6
1575 if (inc->inc_isipv6)
1576 rt = tcp_rtlookup6(inc);
1577 else
1578 #endif /* INET6 */
1579 rt = tcp_rtlookup(inc);
1580
1581 /* Make sure this is a host route and is up. */
1582 if (rt == NULL ||
1583 (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
1584 return NULL;
1585
1586 return rmx_taop(rt->rt_rmx);
1587 }
1588
/*
 * Clear all the TAO cache entries, called from tcp_init.
 *
 * XXX
 * This routine is just an empty one, because we assume that the
 * routing tables are initialized at the same time as TCP, so there is
 * nothing in the cache left over.
 */
static void
tcp_cleartaocache()
{
}
1601
/*
 * Move a TCP connection into TIME_WAIT state, replacing the full tcpcb
 * with a compressed `struct tcptw' that retains just enough state to
 * ACK/RST from TIME_WAIT.
 * tcbinfo is unlocked.
 * inp is locked, and is unlocked before returning.
 */
void
tcp_twstart(tp)
	struct tcpcb *tp;
{
	struct tcptw *tw;
	struct inpcb *inp;
	int tw_time, acknow;
	struct socket *so;

	/*
	 * Allocate the compressed TIME_WAIT block; on shortage, recycle
	 * the oldest 2MSL entry, and failing that just close.
	 */
	tw = uma_zalloc(tcptw_zone, M_NOWAIT);
	if (tw == NULL) {
		tw = tcp_timer_2msl_tw(1);
		if (tw == NULL) {
			tcp_close(tp);
			return;
		}
	}
	inp = tp->t_inpcb;
	tw->tw_inpcb = inp;

	/*
	 * Recover last window size sent.
	 */
	tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;

	/*
	 * Set t_recent if timestamps are used on the connection.
	 */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
		tw->t_recent = tp->ts_recent;
	else
		tw->t_recent = 0;

	/* Snapshot the state needed to respond from TIME_WAIT. */
	tw->snd_nxt = tp->snd_nxt;
	tw->rcv_nxt = tp->rcv_nxt;
	tw->cc_recv = tp->cc_recv;
	tw->cc_send = tp->cc_send;
	tw->t_starttime = tp->t_starttime;
	tw->tw_time = 0;

	/* XXX
	 * If this code will
	 * be used for fin-wait-2 state also, then we may need
	 * a ts_recent from the last segment.
	 */
	/* Shorten TIME_WAIT [RFC-1644, p.28] */
	if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) {
		tw_time = tp->t_rxtcur * TCPTV_TWTRUNC;
		/* For T/TCP client, force ACK now. */
		acknow = 1;
	} else {
		tw_time = 2 * tcp_msl;
		acknow = tp->t_flags & TF_ACKNOW;
	}
	/* Discard the full tcpcb; the socket is detached after this. */
	tcp_discardcb(tp);
	so = inp->inp_socket;
	so->so_pcb = NULL;
	tw->tw_cred = crhold(so->so_cred);	/* keep cred for later responses */
	tw->tw_so_options = so->so_options;
	if (acknow)
		tcp_twrespond(tw, so, NULL, TH_ACK);
	sotryfree(so);
	inp->inp_socket = NULL;
	inp->inp_ppcb = (caddr_t)tw;	/* pcb now references the tw block */
	inp->inp_vflag |= INP_TIMEWAIT;
	tcp_timer_2msl_reset(tw, tw_time);
	INP_UNLOCK(inp);
}
1676
/*
 * Tear down a TIME_WAIT endpoint: stop its 2MSL timer and detach the
 * underlying inpcb.  If `reuse' is non-zero the tcptw block itself is
 * returned to the caller for immediate reuse; otherwise it is freed
 * and NULL is returned.
 */
struct tcptw *
tcp_twclose(struct tcptw *tw, int reuse)
{
	struct inpcb *inp;

	inp = tw->tw_inpcb;
	tw->tw_inpcb = NULL;
	tcp_timer_2msl_stop(tw);
	inp->inp_ppcb = NULL;	/* break the pcb -> tw link before detach */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO)
		in6_pcbdetach(inp);
	else
#endif
		in_pcbdetach(inp);
	tcpstat.tcps_closed++;
	if (reuse)
		return (tw);
	uma_zfree(tcptw_zone, tw);
	return (NULL);
}
1698
/*
 * One of so and msrc must be non-NULL for use by the MAC Framework to
 * construct a label for any resulting packet.
 */
/*
 * Send a segment (a pure ACK or other control segment) on behalf of a
 * connection in TIME_WAIT, built entirely from the compressed tcptw
 * state.
 *
 * tw	TIME_WAIT block; tw->tw_inpcb must be valid.
 * so	socket for MAC labelling, or NULL.
 * msrc	source mbuf for MAC labelling when so is NULL.
 * flags TH_* flags to place in the segment.
 * Returns the error code from ip_output()/ip6_output().
 */
int
tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
    int flags)
{
	struct inpcb *inp = tw->tw_inpcb;
	struct tcphdr *th;
	struct mbuf *m;
	struct ip *ip = NULL;
	u_int8_t *optp;
	u_int hdrlen, optlen;
	int error;
#ifdef INET6
	struct ip6_hdr *ip6 = NULL;
	int isipv6 = inp->inp_inc.inc_isipv6;
#endif

	KASSERT(so != NULL || msrc != NULL,
	    ("tcp_twrespond: so and msrc NULL"));

	m = m_gethdr(M_DONTWAIT, MT_HEADER);
	if (m == NULL)
		return (ENOBUFS);
	m->m_data += max_linkhdr;	/* leave room for the link header */

#ifdef MAC
	/* Label the mbuf from whichever source is available. */
	if (so != NULL)
		mac_create_mbuf_from_socket(so, m);
	else
		mac_create_mbuf_netlayer(msrc, m);
#endif

#ifdef INET6
	if (isipv6) {
		hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		ip6 = mtod(m, struct ip6_hdr *);
		th = (struct tcphdr *)(ip6 + 1);
		tcpip_fillheaders(inp, ip6, th);
	} else
#endif
	{
		hdrlen = sizeof(struct tcpiphdr);
		ip = mtod(m, struct ip *);
		th = (struct tcphdr *)(ip + 1);
		tcpip_fillheaders(inp, ip, th);
	}
	optp = (u_int8_t *)(th + 1);

	/*
	 * Send a timestamp and echo-reply if both our side and our peer
	 * have sent timestamps in our SYN's and this is not a RST.
	 */
	if (tw->t_recent && flags == TH_ACK) {
		u_int32_t *lp = (u_int32_t *)optp;

		/* Form timestamp option as shown in appendix A of RFC 1323. */
		*lp++ = htonl(TCPOPT_TSTAMP_HDR);
		*lp++ = htonl(ticks);
		*lp   = htonl(tw->t_recent);
		optp += TCPOLEN_TSTAMP_APPA;
	}

	/*
	 * Send `CC-family' options if needed, and it's not a RST.
	 */
	if (tw->cc_recv != 0 && flags == TH_ACK) {
		u_int32_t *lp = (u_int32_t *)optp;

		*lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
		*lp   = htonl(tw->cc_send);
		optp += TCPOLEN_CC_APPA;
	}
	optlen = optp - (u_int8_t *)(th + 1);

	m->m_len = hdrlen + optlen;
	m->m_pkthdr.len = m->m_len;

	/* Headers plus options must fit in the single mbuf we allocated. */
	KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));

	th->th_seq = htonl(tw->snd_nxt);
	th->th_ack = htonl(tw->rcv_nxt);
	th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
	th->th_flags = flags;
	th->th_win = htons(tw->last_win);

#ifdef INET6
	if (isipv6) {
		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
		    sizeof(struct tcphdr) + optlen);
		ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
		    inp->in6p_route.ro_rt->rt_ifp : NULL);
		error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
		    (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
	} else
#endif
	{
		/* Pseudo-header sum only; checksum offload fills the rest. */
		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
		    htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
		m->m_pkthdr.csum_flags = CSUM_TCP;
		m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
		ip->ip_len = m->m_pkthdr.len;
		error = ip_output(m, inp->inp_options, &inp->inp_route,
		    (tw->tw_so_options & SO_DONTROUTE), NULL, inp);
	}
	if (flags & TH_ACK)
		tcpstat.tcps_sndacks++;
	else
		tcpstat.tcps_sndctrl++;
	tcpstat.tcps_sndtotal++;
	return (error);
}
1813
1814 /*
1815 * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
1816 *
1817 * This code attempts to calculate the bandwidth-delay product as a
1818 * means of determining the optimal window size to maximize bandwidth,
1819 * minimize RTT, and avoid the over-allocation of buffers on interfaces and
1820 * routers. This code also does a fairly good job keeping RTTs in check
1821 * across slow links like modems. We implement an algorithm which is very
1822 * similar (but not meant to be) TCP/Vegas. The code operates on the
1823 * transmitter side of a TCP connection and so only effects the transmit
1824 * side of the connection.
1825 *
1826 * BACKGROUND: TCP makes no provision for the management of buffer space
1827 * at the end points or at the intermediate routers and switches. A TCP
1828 * stream, whether using NewReno or not, will eventually buffer as
1829 * many packets as it is able and the only reason this typically works is
1830 * due to the fairly small default buffers made available for a connection
1831 * (typicaly 16K or 32K). As machines use larger windows and/or window
1832 * scaling it is now fairly easy for even a single TCP connection to blow-out
1833 * all available buffer space not only on the local interface, but on
1834 * intermediate routers and switches as well. NewReno makes a misguided
1835 * attempt to 'solve' this problem by waiting for an actual failure to occur,
1836 * then backing off, then steadily increasing the window again until another
1837 * failure occurs, ad-infinitum. This results in terrible oscillation that
1838 * is only made worse as network loads increase and the idea of intentionally
1839 * blowing out network buffers is, frankly, a terrible way to manage network
1840 * resources.
1841 *
1842 * It is far better to limit the transmit window prior to the failure
1843 * condition being achieved. There are two general ways to do this: First
1844 * you can 'scan' through different transmit window sizes and locate the
1845 * point where the RTT stops increasing, indicating that you have filled the
1846 * pipe, then scan backwards until you note that RTT stops decreasing, then
1847 * repeat ad-infinitum. This method works in principle but has severe
1848 * implementation issues due to RTT variances, timer granularity, and
1849 * instability in the algorithm which can lead to many false positives and
1850 * create oscillations as well as interact badly with other TCP streams
1851 * implementing the same algorithm.
1852 *
1853 * The second method is to limit the window to the bandwidth delay product
1854 * of the link. This is the method we implement. RTT variances and our
1855 * own manipulation of the congestion window, bwnd, can potentially
1856 * destabilize the algorithm. For this reason we have to stabilize the
1857 * elements used to calculate the window. We do this by using the minimum
1858 * observed RTT, the long term average of the observed bandwidth, and
1859 * by adding two segments worth of slop. It isn't perfect but it is able
1860 * to react to changing conditions and gives us a very stable basis on
1861 * which to extend the algorithm.
1862 */
/*
 * Update the transmit-side bandwidth-delay-product window limit
 * (snd_bwnd) for this connection; see the block comment above for the
 * rationale.  Called with the sequence number just acknowledged; keeps
 * a long-term smoothed bandwidth estimate in tp->snd_bandwidth.
 */
void
tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
{
	u_long bw;
	u_long bwnd;
	int save_ticks;

	/*
	 * If inflight_enable is disabled in the middle of a tcp connection,
	 * make sure snd_bwnd is effectively disabled.
	 */
	if (tcp_inflight_enable == 0) {
		tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
		tp->snd_bandwidth = 0;
		return;
	}

	/*
	 * Figure out the bandwidth.  Due to the tick granularity this
	 * is a very rough number and it MUST be averaged over a fairly
	 * long period of time.  XXX we need to take into account a link
	 * that is not using all available bandwidth, but for now our
	 * slop will ramp us up if this case occurs and the bandwidth later
	 * increases.
	 *
	 * Note: if ticks rollover 'bw' may wind up negative.  We must
	 * effectively reset t_bw_rtttime for this case.
	 */
	save_ticks = ticks;
	if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
		return;		/* need at least one tick between samples */

	/* Instantaneous bandwidth: acked bytes / elapsed seconds. */
	bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
	    (save_ticks - tp->t_bw_rtttime);
	tp->t_bw_rtttime = save_ticks;
	tp->t_bw_rtseq = ack_seq;
	if (tp->t_bw_rtttime == 0 || (int)bw < 0)
		return;		/* first sample or tick rollover: discard */
	/* Exponential moving average: 15/16 old estimate + 1/16 new. */
	bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;

	tp->snd_bandwidth = bw;

	/*
	 * Calculate the semi-static bandwidth delay product, plus two maximal
	 * segments.  The additional slop puts us squarely in the sweet
	 * spot and also handles the bandwidth run-up case and stabilization.
	 * Without the slop we could be locking ourselves into a lower
	 * bandwidth.
	 *
	 * Situations Handled:
	 *	(1) Prevents over-queueing of packets on LANs, especially on
	 *	    high speed LANs, allowing larger TCP buffers to be
	 *	    specified, and also does a good job preventing
	 *	    over-queueing of packets over choke points like modems
	 *	    (at least for the transmit side).
	 *
	 *	(2) Is able to handle changing network loads (bandwidth
	 *	    drops so bwnd drops, bandwidth increases so bwnd
	 *	    increases).
	 *
	 *	(3) Theoretically should stabilize in the face of multiple
	 *	    connections implementing the same algorithm (this may need
	 *	    a little work).
	 *
	 *	(4) Stability value (defaults to 20 = 2 maximal packets) can
	 *	    be adjusted with a sysctl but typically only needs to be
	 *	    on very slow connections.  A value no smaller then 5
	 *	    should be used, but only reduce this default if you have
	 *	    no other choice.
	 */
#define	USERTT	((tp->t_srtt + tp->t_rttbest) / 2)
	bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) + tcp_inflight_stab * tp->t_maxseg / 10;
#undef USERTT

	if (tcp_inflight_debug > 0) {
		static int ltime;
		/* Rate-limit debug output to tcp_inflight_debug lines/sec. */
		if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
			ltime = ticks;
			printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
			    tp,
			    bw,
			    tp->t_rttbest,
			    tp->t_srtt,
			    bwnd
			);
		}
	}
	/* Clamp to sysctl bounds and never below two maximal segments. */
	if ((long)bwnd < tcp_inflight_min)
		bwnd = tcp_inflight_min;
	if (bwnd > tcp_inflight_max)
		bwnd = tcp_inflight_max;
	if ((long)bwnd < tp->t_maxseg * 2)
		bwnd = tp->t_maxseg * 2;
	tp->snd_bwnd = bwnd;
}
1958
Cache object: 39abc1178411429076ce2cf5af8c716f
|