/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
 * $FreeBSD$
 */

#include "opt_compat.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#ifdef INET6
#include <sys/domain.h>
#endif
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>

#include <vm/vm_zone.h>

#include <net/route.h>
#include <net/if.h>

#define _IP_VHL
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/in_pcb.h>
#ifdef INET6
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
#endif
#include <netinet/tcpip.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif
#include <netinet6/ip6protosw.h>

#ifdef IPSEC
#include <netinet6/ipsec.h>
#ifdef INET6
#include <netinet6/ipsec6.h>
#endif
#endif /*IPSEC*/

#ifdef FAST_IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/xform.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/key.h>
#define IPSEC
#endif /*FAST_IPSEC*/

#include <machine/in_cksum.h>
#include <sys/md5.h>

int tcp_mssdflt = TCP_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW,
    &tcp_mssdflt, 0, "Default TCP Maximum Segment Size");

#ifdef INET6
int tcp_v6mssdflt = TCP6_MSS;
SYSCTL_INT(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
    CTLFLAG_RW, &tcp_v6mssdflt, 0,
    "Default TCP Maximum Segment Size for IPv6");
#endif

#if 0
static int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
SYSCTL_INT(_net_inet_tcp, TCPCTL_RTTDFLT, rttdflt, CTLFLAG_RW,
    &tcp_rttdflt, 0, "Default maximum TCP Round Trip Time");
#endif

int tcp_do_rfc1323 = 1;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_RW,
    &tcp_do_rfc1323, 0, "Enable rfc1323 (high performance TCP) extensions");

int tcp_do_rfc1644 = 0;
SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644, CTLFLAG_RW,
    &tcp_do_rfc1644, 0, "Enable rfc1644 (TTCP) extensions");

static int tcp_tcbhashsize = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD,
    &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");

static int do_tcpdrain = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
    "Enable tcp_drain routine for extra help when low on mbufs");

SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD,
    &tcbinfo.ipi_count, 0, "Number of active PCBs");

static int icmp_may_rst = 1;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_RW, &icmp_may_rst, 0,
    "Certain ICMP unreachable messages may abort connections in SYN_SENT");

static int tcp_isn_reseed_interval = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
    &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
/*
 * TCP bandwidth limiting sysctls.  A lower bound as small as 1024 is
 * useful only for debugging; for production use, the compiled-in
 * default of 6144 (tcp_inflight_min below) is a reasonable value.
 */
static int tcp_inflight_enable = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_enable, CTLFLAG_RW,
    &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");

static int tcp_inflight_debug = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_debug, CTLFLAG_RW,
    &tcp_inflight_debug, 0, "Debug TCP inflight calculations");

static int tcp_inflight_min = 6144;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_min, CTLFLAG_RW,
    &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");

static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_max, CTLFLAG_RW,
    &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");

static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
    &tcp_inflight_stab, 0, "Slop in maximal packets / 10 (20 = 2 packets)");

static void	tcp_cleartaocache __P((void));
static void	tcp_notify __P((struct inpcb *, int));

/*
 * Target size of TCP PCB hash tables.  Must be a power of two.
 *
 * Note that this can be overridden by the kernel environment
 * variable net.inet.tcp.tcbhashsize.
 */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE	512
#endif
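
/*
 * For example (the value shown is illustrative only), the tunable can
 * be set at boot time from /boot/loader.conf, from which tcp_init()
 * fetches it via TUNABLE_INT_FETCH():
 *
 *	net.inet.tcp.tcbhashsize="2048"
 *
 * A value that is not a power of two is rejected at init time and
 * replaced by the safe default of 512.
 */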

/*
 * This is the actual shape of what we allocate using the zone
 * allocator.  Doing it this way allows us to protect both structures
 * using the same generation count, and also eliminates the overhead
 * of allocating tcpcbs separately.  By hiding the structure here,
 * we avoid changing most of the rest of the code (although it needs
 * to be changed, eventually, for greater efficiency).
 */
#define ALIGNMENT	32
#define ALIGNM1		(ALIGNMENT - 1)
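/*
 * The align[] member below uses the standard round-up idiom:
 * (x + ALIGNM1) & ~ALIGNM1 rounds x up to the next multiple of
 * ALIGNMENT.  As a worked example (the number is illustrative; the
 * real size depends on kernel options): if sizeof(struct inpcb) were
 * 200, then (200 + 31) & ~31 == 224, so the tcpcb following the
 * union begins on a 32-byte boundary.
 */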
struct inp_tp {
        union {
                struct inpcb inp;
                char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
        } inp_tp_u;
        struct tcpcb tcb;
        struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
        struct callout inp_tp_delack;
};
#undef ALIGNMENT
#undef ALIGNM1

/*
 * Tcp initialization
 */
void
tcp_init()
{
        int hashsize = TCBHASHSIZE;

        tcp_ccgen = 1;
        tcp_cleartaocache();

        tcp_delacktime = TCPTV_DELACK;
        tcp_keepinit = TCPTV_KEEP_INIT;
        tcp_keepidle = TCPTV_KEEP_IDLE;
        tcp_keepintvl = TCPTV_KEEPINTVL;
        tcp_maxpersistidle = TCPTV_KEEP_IDLE;
        tcp_msl = TCPTV_MSL;
        tcp_rexmit_min = TCPTV_MIN;
        tcp_rexmit_slop = TCPTV_CPU_VAR;

        LIST_INIT(&tcb);
        tcbinfo.listhead = &tcb;
        TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
        if (!powerof2(hashsize)) {
                printf("WARNING: TCB hash size not a power of 2\n");
                hashsize = 512; /* safe default */
        }
        tcp_tcbhashsize = hashsize;
        tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
        tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
            &tcbinfo.porthashmask);
        tcbinfo.ipi_zone = zinit("tcpcb", sizeof(struct inp_tp), maxsockets,
            ZONE_INTERRUPT, 0);

        tcp_reass_maxseg = nmbclusters / 16;
        TUNABLE_INT_FETCH("net.inet.tcp.reass.maxsegments",
            &tcp_reass_maxseg);

#ifdef INET6
#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
#else /* INET6 */
#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
#endif /* INET6 */
        if (max_protohdr < TCP_MINPROTOHDR)
                max_protohdr = TCP_MINPROTOHDR;
        if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
                panic("tcp_init");
#undef TCP_MINPROTOHDR

        syncache_init();
}

/*
 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
 * tcp_template used to store this data in mbufs, but we now recopy it out
 * of the tcpcb each time to conserve mbufs.
 */
void
tcp_fillheaders(tp, ip_ptr, tcp_ptr)
        struct tcpcb *tp;
        void *ip_ptr;
        void *tcp_ptr;
{
        struct inpcb *inp = tp->t_inpcb;
        struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;

#ifdef INET6
        if ((inp->inp_vflag & INP_IPV6) != 0) {
                struct ip6_hdr *ip6;

                ip6 = (struct ip6_hdr *)ip_ptr;
                ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
                    (inp->in6p_flowinfo & IPV6_FLOWINFO_MASK);
                ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
                    (IPV6_VERSION & IPV6_VERSION_MASK);
                ip6->ip6_nxt = IPPROTO_TCP;
                ip6->ip6_plen = sizeof(struct tcphdr);
                ip6->ip6_src = inp->in6p_laddr;
                ip6->ip6_dst = inp->in6p_faddr;
                tcp_hdr->th_sum = 0;
        } else
#endif
        {
                struct ip *ip = (struct ip *)ip_ptr;

                ip->ip_vhl = IP_VHL_BORING;
                ip->ip_tos = 0;
                ip->ip_len = 0;
                ip->ip_id = 0;
                ip->ip_off = 0;
                ip->ip_ttl = 0;
                ip->ip_sum = 0;
                ip->ip_p = IPPROTO_TCP;
                ip->ip_src = inp->inp_laddr;
                ip->ip_dst = inp->inp_faddr;
                tcp_hdr->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
                    htons(sizeof(struct tcphdr) + IPPROTO_TCP));
        }

        tcp_hdr->th_sport = inp->inp_lport;
        tcp_hdr->th_dport = inp->inp_fport;
        tcp_hdr->th_seq = 0;
        tcp_hdr->th_ack = 0;
        tcp_hdr->th_x2 = 0;
        tcp_hdr->th_off = 5;
        tcp_hdr->th_flags = 0;
        tcp_hdr->th_win = 0;
        tcp_hdr->th_urp = 0;
}

/*
 * Create template to be used to send tcp packets on a connection.
 * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
 * use for this function is in keepalives, which use tcp_respond.
 */
struct tcptemp *
tcp_maketemplate(tp)
        struct tcpcb *tp;
{
        struct mbuf *m;
        struct tcptemp *n;

        m = m_get(M_DONTWAIT, MT_HEADER);
        if (m == NULL)
                return (0);
        m->m_len = sizeof(struct tcptemp);
        n = mtod(m, struct tcptemp *);

        tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
        return (n);
}
/*
 * Send a single message to the TCP at address specified by
 * the given TCP/IP header.  If m == 0, then we make a copy
 * of the tcpiphdr at ti and send directly to the addressed host.
 * This is used to force keep alive messages out using the TCP
 * template for a connection.  If flags are given then we send
 * a message back to the TCP which originated the segment ti,
 * and discard the mbuf containing it and any other attached mbufs.
 *
 * In any case the ack and sequence number of the transmitted
 * segment are as specified by the parameters.
 *
 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
 */
void
tcp_respond(tp, ipgen, th, m, ack, seq, flags)
        struct tcpcb *tp;
        void *ipgen;
        register struct tcphdr *th;
        register struct mbuf *m;
        tcp_seq ack, seq;
        int flags;
{
        register int tlen;
        int win = 0;
        struct route *ro = 0;
        struct route sro;
        struct ip *ip;
        struct tcphdr *nth;
#ifdef INET6
        struct route_in6 *ro6 = 0;
        struct route_in6 sro6;
        struct ip6_hdr *ip6;
        int isipv6;
#endif /* INET6 */
        int ipflags = 0;

#ifdef INET6
        isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
        ip6 = ipgen;
#endif /* INET6 */
        ip = ipgen;

        if (tp) {
                if (!(flags & TH_RST)) {
                        win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
                        if (win > (long)TCP_MAXWIN << tp->rcv_scale)
                                win = (long)TCP_MAXWIN << tp->rcv_scale;
                }
#ifdef INET6
                if (isipv6)
                        ro6 = &tp->t_inpcb->in6p_route;
                else
#endif /* INET6 */
                ro = &tp->t_inpcb->inp_route;
        } else {
#ifdef INET6
                if (isipv6) {
                        ro6 = &sro6;
                        bzero(ro6, sizeof *ro6);
                } else
#endif /* INET6 */
                {
                        ro = &sro;
                        bzero(ro, sizeof *ro);
                }
        }
        if (m == 0) {
                m = m_gethdr(M_DONTWAIT, MT_HEADER);
                if (m == NULL)
                        return;
                tlen = 0;
                m->m_data += max_linkhdr;
#ifdef INET6
                if (isipv6) {
                        bcopy((caddr_t)ip6, mtod(m, caddr_t),
                            sizeof(struct ip6_hdr));
                        ip6 = mtod(m, struct ip6_hdr *);
                        nth = (struct tcphdr *)(ip6 + 1);
                } else
#endif /* INET6 */
                {
                        bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
                        ip = mtod(m, struct ip *);
                        nth = (struct tcphdr *)(ip + 1);
                }
                bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
                flags = TH_ACK;
        } else {
                m_freem(m->m_next);
                m->m_next = 0;
                m->m_data = (caddr_t)ipgen;
                /* m_len is set later */
                tlen = 0;
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
#ifdef INET6
                if (isipv6) {
                        xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
                        nth = (struct tcphdr *)(ip6 + 1);
                } else
#endif /* INET6 */
                {
                        xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
                        nth = (struct tcphdr *)(ip + 1);
                }
                if (th != nth) {
                        /*
                         * this is usually a case when an extension header
                         * exists between the IPv6 header and the
                         * TCP header.
                         */
                        nth->th_sport = th->th_sport;
                        nth->th_dport = th->th_dport;
                }
                xchg(nth->th_dport, nth->th_sport, n_short);
#undef xchg
        }
#ifdef INET6
        if (isipv6) {
                ip6->ip6_flow = 0;
                ip6->ip6_vfc = IPV6_VERSION;
                ip6->ip6_nxt = IPPROTO_TCP;
                ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
                    tlen));
                tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
        } else
#endif
        {
                tlen += sizeof (struct tcpiphdr);
                ip->ip_len = tlen;
                ip->ip_ttl = ip_defttl;
        }
        m->m_len = tlen;
        m->m_pkthdr.len = tlen;
        m->m_pkthdr.rcvif = (struct ifnet *) 0;
        nth->th_seq = htonl(seq);
        nth->th_ack = htonl(ack);
        nth->th_x2 = 0;
        nth->th_off = sizeof (struct tcphdr) >> 2;
        nth->th_flags = flags;
        if (tp)
                nth->th_win = htons((u_short) (win >> tp->rcv_scale));
        else
                nth->th_win = htons((u_short)win);
        nth->th_urp = 0;
#ifdef INET6
        if (isipv6) {
                nth->th_sum = 0;
                nth->th_sum = in6_cksum(m, IPPROTO_TCP,
                    sizeof(struct ip6_hdr),
                    tlen - sizeof(struct ip6_hdr));
                ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
                    ro6 && ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
        } else
#endif /* INET6 */
        {
                nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
                    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
                m->m_pkthdr.csum_flags = CSUM_TCP;
                m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
        }
#ifdef TCPDEBUG
        if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
                tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
        if (isipv6) {
                (void)ip6_output(m, NULL, ro6, ipflags, NULL, NULL,
                    tp ? tp->t_inpcb : NULL);
                if (ro6 == &sro6 && ro6->ro_rt) {
                        RTFREE(ro6->ro_rt);
                        ro6->ro_rt = NULL;
                }
        } else
#endif /* INET6 */
        {
                (void) ip_output(m, NULL, ro, ipflags, NULL,
                    tp ? tp->t_inpcb : NULL);
                if (ro == &sro && ro->ro_rt) {
                        RTFREE(ro->ro_rt);
                        ro->ro_rt = NULL;
                }
        }
}

/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.  The `inp' parameter must have
 * come from the zone allocator set up in tcp_init().
 */
struct tcpcb *
tcp_newtcpcb(inp)
        struct inpcb *inp;
{
        struct inp_tp *it;
        register struct tcpcb *tp;
#ifdef INET6
        int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

        it = (struct inp_tp *)inp;
        tp = &it->tcb;
        bzero((char *) tp, sizeof(struct tcpcb));
        LIST_INIT(&tp->t_segq);
        tp->t_maxseg = tp->t_maxopd =
#ifdef INET6
                isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
                tcp_mssdflt;

        /* Set up our timeouts. */
        callout_init(tp->tt_rexmt = &it->inp_tp_rexmt);
        callout_init(tp->tt_persist = &it->inp_tp_persist);
        callout_init(tp->tt_keep = &it->inp_tp_keep);
        callout_init(tp->tt_2msl = &it->inp_tp_2msl);
        callout_init(tp->tt_delack = &it->inp_tp_delack);

        if (tcp_do_rfc1323)
                tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
        if (tcp_do_rfc1644)
                tp->t_flags |= TF_REQ_CC;
        tp->t_inpcb = inp;      /* XXX */
        /*
         * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
         * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
         * reasonable initial retransmit time.
         */
        tp->t_srtt = TCPTV_SRTTBASE;
        tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
        tp->t_rttmin = tcp_rexmit_min;
        tp->t_rxtcur = TCPTV_RTOBASE;
        tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
        tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
        tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
        tp->t_rcvtime = ticks;
        tp->t_bw_rtttime = ticks;
        /*
         * IPv4 TTL initialization is necessary for an IPv6 socket as well,
         * because the socket may be bound to an IPv6 wildcard address,
         * which may match an IPv4-mapped IPv6 address.
         */
        inp->inp_ip_ttl = ip_defttl;
        inp->inp_ppcb = (caddr_t)tp;
        return (tp);            /* XXX */
}

/*
 * Drop a TCP connection, reporting
 * the specified error.  If connection is synchronized,
 * then send a RST to peer.
 */
struct tcpcb *
tcp_drop(tp, errno)
        register struct tcpcb *tp;
        int errno;
{
        struct socket *so = tp->t_inpcb->inp_socket;

        if (TCPS_HAVERCVDSYN(tp->t_state)) {
                tp->t_state = TCPS_CLOSED;
                (void) tcp_output(tp);
                tcpstat.tcps_drops++;
        } else
                tcpstat.tcps_conndrops++;
        if (errno == ETIMEDOUT && tp->t_softerror)
                errno = tp->t_softerror;
        so->so_error = errno;
        return (tcp_close(tp));
}

/*
 * Close a TCP control block:
 *	discard all space held by the tcp
 *	discard internet protocol block
 *	wake up any sleepers
 */
struct tcpcb *
tcp_close(tp)
        register struct tcpcb *tp;
{
        register struct tseg_qent *q;
        struct inpcb *inp = tp->t_inpcb;
        struct socket *so = inp->inp_socket;
#ifdef INET6
        int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
        register struct rtentry *rt;
        int dosavessthresh;

        /*
         * Make sure that all of our timers are stopped before we
         * delete the PCB.
         */
        callout_stop(tp->tt_rexmt);
        callout_stop(tp->tt_persist);
        callout_stop(tp->tt_keep);
        callout_stop(tp->tt_2msl);
        callout_stop(tp->tt_delack);

        /*
         * If we got enough samples through the srtt filter,
         * save the rtt and rttvar in the routing entry.
         * 'Enough' is arbitrarily defined as 16 samples.
         * 16 samples is enough for the srtt filter to converge
         * to within 5% of the correct value; fewer samples and
         * we could save a very bogus rtt.
         *
         * Don't update the default route's characteristics and don't
         * update anything that the user "locked".
         */
        if (tp->t_rttupdated >= 16) {
                register u_long i = 0;
#ifdef INET6
                if (isipv6) {
                        struct sockaddr_in6 *sin6;

                        if ((rt = inp->in6p_route.ro_rt) == NULL)
                                goto no_valid_rt;
                        sin6 = (struct sockaddr_in6 *)rt_key(rt);
                        if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
                                goto no_valid_rt;
                }
                else
#endif /* INET6 */
                if ((rt = inp->inp_route.ro_rt) == NULL ||
                    ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
                    == INADDR_ANY)
                        goto no_valid_rt;

                if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
                        i = tp->t_srtt *
                            (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
                        if (rt->rt_rmx.rmx_rtt && i)
                                /*
                                 * filter this update to half the old & half
                                 * the new values, converting scale.
                                 * See route.h and tcp_var.h for a
                                 * description of the scaling constants.
                                 */
                                rt->rt_rmx.rmx_rtt =
                                    (rt->rt_rmx.rmx_rtt + i) / 2;
                        else
                                rt->rt_rmx.rmx_rtt = i;
                        tcpstat.tcps_cachedrtt++;
                }
                if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
                        i = tp->t_rttvar *
                            (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
                        if (rt->rt_rmx.rmx_rttvar && i)
                                rt->rt_rmx.rmx_rttvar =
                                    (rt->rt_rmx.rmx_rttvar + i) / 2;
                        else
                                rt->rt_rmx.rmx_rttvar = i;
                        tcpstat.tcps_cachedrttvar++;
                }
                /*
                 * The old comment here said:
                 * update the pipelimit (ssthresh) if it has been updated
                 * already or if a pipesize was specified & the threshold
                 * got below half the pipesize.  I.e., wait for bad news
                 * before we start updating, then update on both good
                 * and bad news.
                 *
                 * But we want to save the ssthresh even if no pipesize is
                 * specified explicitly in the route, because such
                 * connections still have an implicit pipesize specified
                 * by the global tcp_sendspace.  In the absence of a reliable
                 * way to calculate the pipesize, it will have to do.
                 */
                i = tp->snd_ssthresh;
                if (rt->rt_rmx.rmx_sendpipe != 0)
                        dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
                else
                        dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
                if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
                    i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
                    || dosavessthresh) {
                        /*
                         * convert the limit from user data bytes to
                         * packets then to packet data bytes.
                         */
                        i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
                        if (i < 2)
                                i = 2;
                        i *= (u_long)(tp->t_maxseg +
#ifdef INET6
                            (isipv6 ? sizeof (struct ip6_hdr) +
                            sizeof (struct tcphdr) :
#endif
                            sizeof (struct tcpiphdr)
#ifdef INET6
                            )
#endif
                            );
                        if (rt->rt_rmx.rmx_ssthresh)
                                rt->rt_rmx.rmx_ssthresh =
                                    (rt->rt_rmx.rmx_ssthresh + i) / 2;
                        else
                                rt->rt_rmx.rmx_ssthresh = i;
                        tcpstat.tcps_cachedssthresh++;
                }
        }
no_valid_rt:
        /* free the reassembly queue, if any */
        while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
                LIST_REMOVE(q, tqe_q);
                m_freem(q->tqe_m);
                FREE(q, M_TSEGQ);
                tcp_reass_qsize--;
        }
        inp->inp_ppcb = NULL;
        soisdisconnected(so);
#ifdef INET6
        if (INP_CHECK_SOCKAF(so, AF_INET6))
                in6_pcbdetach(inp);
        else
#endif /* INET6 */
                in_pcbdetach(inp);
        tcpstat.tcps_closed++;
        return ((struct tcpcb *)0);
}

void
tcp_drain()
{
        if (do_tcpdrain) {
                struct inpcb *inpb;
                struct tcpcb *tcpb;
                struct tseg_qent *te;

                /*
                 * Walk the tcpbs, if existing, and flush the reassembly queue,
                 * if there is one...
                 * XXX: The "Net/3" implementation doesn't imply that the TCP
                 * reassembly queue should be flushed, but in a situation
                 * where we're really low on mbufs, this is potentially
                 * useful.
                 */
                LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
                        if ((tcpb = intotcpcb(inpb))) {
                                while ((te = LIST_FIRST(&tcpb->t_segq))
                                    != NULL) {
                                        LIST_REMOVE(te, tqe_q);
                                        m_freem(te->tqe_m);
                                        FREE(te, M_TSEGQ);
                                        tcp_reass_qsize--;
                                }
                        }
                }
        }
}

/*
 * Notify a tcp user of an asynchronous error;
 * store error as soft error, but wake up user
 * (for now, won't do anything until can select for soft error).
 *
 * Do not wake up user since there currently is no mechanism for
 * reporting soft errors (yet - a kqueue filter may be added).
 */
static void
tcp_notify(inp, error)
        struct inpcb *inp;
        int error;
{
        struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;

        /*
         * Ignore some errors if we are hooked up.
         * If connection hasn't completed, has retransmitted several times,
         * and receives a second error, give up now.  This is better
         * than waiting a long time to establish a connection that
         * can never complete.
         */
        if (tp->t_state == TCPS_ESTABLISHED &&
            (error == EHOSTUNREACH || error == ENETUNREACH ||
            error == EHOSTDOWN)) {
                return;
        } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
            tp->t_softerror)
                tcp_drop(tp, error);
        else
                tp->t_softerror = error;
#if 0
        wakeup((caddr_t) &so->so_timeo);
        sorwakeup(so);
        sowwakeup(so);
#endif
}

static int
tcp_pcblist(SYSCTL_HANDLER_ARGS)
{
        int error, i, n, s;
        struct inpcb *inp, **inp_list;
        inp_gen_t gencnt;
        struct xinpgen xig;

        /*
         * The process of preparing the TCB list is too time-consuming and
         * resource-intensive to repeat twice on every request.
         */
        if (req->oldptr == 0) {
                n = tcbinfo.ipi_count;
                req->oldidx = 2 * (sizeof xig)
                    + (n + n/8) * sizeof(struct xtcpcb);
                return 0;
        }

        if (req->newptr != 0)
                return EPERM;

        /*
         * OK, now we're committed to doing something.
         */
        s = splnet();
        gencnt = tcbinfo.ipi_gencnt;
        n = tcbinfo.ipi_count;
        splx(s);

        xig.xig_len = sizeof xig;
        xig.xig_count = n;
        xig.xig_gen = gencnt;
        xig.xig_sogen = so_gencnt;
        error = SYSCTL_OUT(req, &xig, sizeof xig);
        if (error)
                return error;

        inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK);
        if (inp_list == 0)
                return ENOMEM;

        s = splnet();
        for (inp = LIST_FIRST(tcbinfo.listhead), i = 0; inp && i < n;
            inp = LIST_NEXT(inp, inp_list)) {
                if (inp->inp_gencnt <= gencnt && !prison_xinpcb(req->p, inp))
                        inp_list[i++] = inp;
        }
        splx(s);
        n = i;

        error = 0;
        for (i = 0; i < n; i++) {
                inp = inp_list[i];
                if (inp->inp_gencnt <= gencnt) {
                        struct xtcpcb xt;
                        caddr_t inp_ppcb;

                        bzero(&xt, sizeof(xt));
                        xt.xt_len = sizeof xt;
                        /* XXX should avoid extra copy */
                        bcopy(inp, &xt.xt_inp, sizeof *inp);
                        inp_ppcb = inp->inp_ppcb;
                        if (inp_ppcb != NULL)
                                bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
                        else
                                bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
                        if (inp->inp_socket)
                                sotoxsocket(inp->inp_socket, &xt.xt_socket);
                        error = SYSCTL_OUT(req, &xt, sizeof xt);
                }
        }
        if (!error) {
                /*
                 * Give the user an updated idea of our state.
                 * If the generation differs from what we told
                 * her before, she knows that something happened
                 * while we were processing this request, and it
                 * might be necessary to retry.
                 */
                s = splnet();
                xig.xig_gen = tcbinfo.ipi_gencnt;
                xig.xig_sogen = so_gencnt;
                xig.xig_count = tcbinfo.ipi_count;
                splx(s);
                error = SYSCTL_OUT(req, &xig, sizeof xig);
        }
        free(inp_list, M_TEMP);
        return error;
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLFLAG_RD, 0, 0,
    tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
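
/*
 * Illustrative userland consumer of the pcblist sysctl (editor's
 * sketch, not part of this kernel file; the function name is made up).
 * The usual pattern is two calls to sysctlbyname(3): the first passes
 * a NULL buffer so the handler's req->oldptr == 0 path above returns
 * a size estimate (with slop for connections added in the meantime).
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <stdlib.h>

static void
dump_tcp_pcblist(void)
{
        void *buf;
        size_t len = 0;

        /* First call: size the buffer. */
        if (sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0) < 0)
                return;
        buf = malloc(len);
        if (buf == NULL)
                return;
        /* Second call: copy out the xinpgen header and xtcpcb records. */
        if (sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0) == 0)
                printf("fetched %zu bytes of TCP PCB data\n", len);
        free(buf);
}
#endif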

static int
tcp_getcred(SYSCTL_HANDLER_ARGS)
{
        struct sockaddr_in addrs[2];
        struct inpcb *inp;
        int error, s;

        error = suser(req->p);
        if (error)
                return (error);
        error = SYSCTL_IN(req, addrs, sizeof(addrs));
        if (error)
                return (error);
        s = splnet();
        inp = in_pcblookup_hash(&tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
            addrs[0].sin_addr, addrs[0].sin_port, 0, NULL);
        if (inp == NULL || inp->inp_socket == NULL) {
                error = ENOENT;
                goto out;
        }
        error = SYSCTL_OUT(req, inp->inp_socket->so_cred, sizeof(struct ucred));
out:
        splx(s);
        return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
    0, 0, tcp_getcred, "S,ucred", "Get the ucred of a TCP connection");

#ifdef INET6
static int
tcp6_getcred(SYSCTL_HANDLER_ARGS)
{
        struct sockaddr_in6 addrs[2];
        struct inpcb *inp;
        int error, s, mapped = 0;

        error = suser(req->p);
        if (error)
                return (error);
        error = SYSCTL_IN(req, addrs, sizeof(addrs));
        if (error)
                return (error);
        if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
                if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
                        mapped = 1;
                else
                        return (EINVAL);
        }
        s = splnet();
        if (mapped == 1)
                inp = in_pcblookup_hash(&tcbinfo,
                    *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
                    addrs[1].sin6_port,
                    *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
                    addrs[0].sin6_port,
                    0, NULL);
        else
                inp = in6_pcblookup_hash(&tcbinfo, &addrs[1].sin6_addr,
                    addrs[1].sin6_port,
                    &addrs[0].sin6_addr, addrs[0].sin6_port,
                    0, NULL);
        if (inp == NULL || inp->inp_socket == NULL) {
                error = ENOENT;
                goto out;
        }
        error = SYSCTL_OUT(req, inp->inp_socket->so_cred,
            sizeof(struct ucred));
out:
        splx(s);
        return (error);
}

SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW,
    0, 0,
    tcp6_getcred, "S,ucred", "Get the ucred of a TCP6 connection");
#endif


void
tcp_ctlinput(cmd, sa, vip)
        int cmd;
        struct sockaddr *sa;
        void *vip;
{
        struct ip *ip = vip;
        struct tcphdr *th;
        struct in_addr faddr;
        struct inpcb *inp;
        struct tcpcb *tp;
        void (*notify) __P((struct inpcb *, int)) = tcp_notify;
        tcp_seq icmp_seq;
        int s;

        faddr = ((struct sockaddr_in *)sa)->sin_addr;
        if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
                return;

        if (cmd == PRC_QUENCH)
                notify = tcp_quench;
        else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
            cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
                notify = tcp_drop_syn_sent;
        else if (cmd == PRC_MSGSIZE)
                notify = tcp_mtudisc;
        else if (PRC_IS_REDIRECT(cmd)) {
                ip = 0;
                notify = in_rtchange;
        } else if (cmd == PRC_HOSTDEAD)
                ip = 0;
        else if ((unsigned)cmd > PRC_NCMDS || inetctlerrmap[cmd] == 0)
                return;
        if (ip) {
                s = splnet();
                th = (struct tcphdr *)((caddr_t)ip
                    + (IP_VHL_HL(ip->ip_vhl) << 2));
                inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
                    ip->ip_src, th->th_sport, 0, NULL);
                if (inp != NULL && inp->inp_socket != NULL) {
                        icmp_seq = htonl(th->th_seq);
                        tp = intotcpcb(inp);
                        if (SEQ_GEQ(icmp_seq, tp->snd_una) &&
                            SEQ_LT(icmp_seq, tp->snd_max))
                                (*notify)(inp, inetctlerrmap[cmd]);
                } else {
                        struct in_conninfo inc;

                        inc.inc_fport = th->th_dport;
                        inc.inc_lport = th->th_sport;
                        inc.inc_faddr = faddr;
                        inc.inc_laddr = ip->ip_src;
#ifdef INET6
                        inc.inc_isipv6 = 0;
#endif
                        syncache_unreach(&inc, th);
                }
                splx(s);
        } else
                in_pcbnotifyall(&tcb, faddr, inetctlerrmap[cmd], notify);
}

#ifdef INET6
void
tcp6_ctlinput(cmd, sa, d)
        int cmd;
        struct sockaddr *sa;
        void *d;
{
        struct tcphdr th;
        void (*notify) __P((struct inpcb *, int)) = tcp_notify;
        struct ip6_hdr *ip6;
        struct mbuf *m;
        struct ip6ctlparam *ip6cp = NULL;
        const struct sockaddr_in6 *sa6_src = NULL;
        int off;
        struct tcp_portonly {
                u_int16_t th_sport;
                u_int16_t th_dport;
        } *thp;

        if (sa->sa_family != AF_INET6 ||
            sa->sa_len != sizeof(struct sockaddr_in6))
                return;

        if (cmd == PRC_QUENCH)
                notify = tcp_quench;
        else if (cmd == PRC_MSGSIZE)
                notify = tcp_mtudisc;
        else if (!PRC_IS_REDIRECT(cmd) &&
            ((unsigned)cmd > PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
                return;

        /* if the parameter is from icmp6, decode it. */
        if (d != NULL) {
                ip6cp = (struct ip6ctlparam *)d;
                m = ip6cp->ip6c_m;
                ip6 = ip6cp->ip6c_ip6;
                off = ip6cp->ip6c_off;
                sa6_src = ip6cp->ip6c_src;
        } else {
                m = NULL;
                ip6 = NULL;
                off = 0;        /* fool gcc */
                sa6_src = &sa6_any;
        }

        if (ip6) {
                struct in_conninfo inc;
                /*
                 * XXX: We assume that when IPV6 is non-NULL,
                 * M and OFF are valid.
                 */

                /* check if we can safely examine src and dst ports */
                if (m->m_pkthdr.len < off + sizeof(*thp))
                        return;

                bzero(&th, sizeof(th));
                m_copydata(m, off, sizeof(*thp), (caddr_t)&th);

                in6_pcbnotify(&tcb, sa, th.th_dport,
                    (struct sockaddr *)ip6cp->ip6c_src,
                    th.th_sport, cmd, notify);

                inc.inc_fport = th.th_dport;
                inc.inc_lport = th.th_sport;
                inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
                inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
                inc.inc_isipv6 = 1;
                syncache_unreach(&inc, &th);
        } else
                in6_pcbnotify(&tcb, sa, 0, (struct sockaddr *)sa6_src,
                    0, cmd, notify);
}
#endif /* INET6 */


/*
 * Following is where TCP initial sequence number generation occurs.
 *
 * There are two places where we must use initial sequence numbers:
 * 1.  In SYN-ACK packets.
 * 2.  In SYN packets.
 *
 * All ISNs for SYN-ACK packets are generated by the syncache.  See
 * tcp_syncache.c for details.
 *
 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
 * depends on this property.  In addition, these ISNs should be
 * unguessable so as to prevent connection hijacking.  To satisfy
 * the requirements of this situation, the algorithm outlined in
 * RFC 1948 is used to generate sequence numbers.
 *
 * Implementation details:
 *
 * Time is based off the system timer, and is corrected so that it
 * increases by one megabyte per second.  This allows for proper
 * recycling on high speed LANs while still leaving over an hour
 * before rollover.
 *
 * net.inet.tcp.isn_reseed_interval controls the number of seconds
 * between seeding of isn_secret.  This is normally set to zero,
 * as reseeding should not be necessary.
 *
 */

#define ISN_BYTES_PER_SECOND 1048576
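
/*
 * A quick sanity check of the numbers above: the ISN space is 32 bits
 * wide, so at 2^20 (1048576) bytes per second the time component wraps
 * after 2^32 / 2^20 = 4096 seconds, i.e. roughly 68 minutes, which is
 * indeed "over an hour before rollover".  Monotonicity holds because,
 * for a given connection 4-tuple, the MD5 term computed below is
 * constant between reseeds while the time term only increases; this is
 * also why isn_reseed_interval defaults to 0, since reseeding perturbs
 * the MD5 term and can momentarily break monotonicity.
 */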

u_char isn_secret[32];
int isn_last_reseed;
MD5_CTX isn_ctx;

tcp_seq
tcp_new_isn(tp)
        struct tcpcb *tp;
{
        u_int32_t md5_buffer[4];
        tcp_seq new_isn;

        /* Seed if this is the first use, reseed if requested. */
        if ((isn_last_reseed == 0) || ((tcp_isn_reseed_interval > 0) &&
            (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
            < (u_int)ticks))) {
                read_random_unlimited(&isn_secret, sizeof(isn_secret));
                isn_last_reseed = ticks;
        }

        /* Compute the md5 hash and return the ISN. */
        MD5Init(&isn_ctx);
        MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport, sizeof(u_short));
        MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport, sizeof(u_short));
#ifdef INET6
        if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
                MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
                    sizeof(struct in6_addr));
                MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
                    sizeof(struct in6_addr));
        } else
#endif
        {
                MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
                    sizeof(struct in_addr));
                MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
                    sizeof(struct in_addr));
        }
        MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
        MD5Final((u_char *) &md5_buffer, &isn_ctx);
        new_isn = (tcp_seq) md5_buffer[0];
        new_isn += ticks * (ISN_BYTES_PER_SECOND / hz);
        return new_isn;
}

/*
 * When a source quench is received, close congestion window
 * to one segment.  We will gradually open it again as we proceed.
 */
void
tcp_quench(inp, errno)
        struct inpcb *inp;
        int errno;
{
        struct tcpcb *tp = intotcpcb(inp);

        if (tp)
                tp->snd_cwnd = tp->t_maxseg;
}

/*
 * When a specific ICMP unreachable message is received and the
 * connection state is SYN-SENT, drop the connection.  This behavior
 * is controlled by the icmp_may_rst sysctl.
 */
void
tcp_drop_syn_sent(inp, errno)
        struct inpcb *inp;
        int errno;
{
        struct tcpcb *tp = intotcpcb(inp);

        if (tp && tp->t_state == TCPS_SYN_SENT)
                tcp_drop(tp, errno);
}

/*
 * When `need fragmentation' ICMP is received, update our idea of the MSS
 * based on the new value in the route.  Also nudge TCP to send something,
 * since we know the packet we just sent was dropped.
 * This duplicates some code in the tcp_mss() function in tcp_input.c.
 */
void
tcp_mtudisc(inp, errno)
        struct inpcb *inp;
        int errno;
{
        struct tcpcb *tp = intotcpcb(inp);
        struct rtentry *rt;
        struct rmxp_tao *taop;
        struct socket *so = inp->inp_socket;
        int offered;
        int mss;
#ifdef INET6
        int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */

        if (tp) {
#ifdef INET6
                if (isipv6)
                        rt = tcp_rtlookup6(&inp->inp_inc);
                else
#endif /* INET6 */
                rt = tcp_rtlookup(&inp->inp_inc);
                if (!rt || !rt->rt_rmx.rmx_mtu) {
                        tp->t_maxopd = tp->t_maxseg =
#ifdef INET6
                                isipv6 ? tcp_v6mssdflt :
#endif /* INET6 */
                                tcp_mssdflt;
                        return;
                }
                taop = rmx_taop(rt->rt_rmx);
                offered = taop->tao_mssopt;
                mss = rt->rt_rmx.rmx_mtu -
#ifdef INET6
                    (isipv6 ?
                     sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
#endif /* INET6 */
                     sizeof(struct tcpiphdr)
#ifdef INET6
                    )
#endif /* INET6 */
                    ;

                if (offered)
                        mss = min(mss, offered);
                /*
                 * XXX - The above conditional probably violates the TCP
                 * spec.  The problem is that, since we don't know the
                 * other end's MSS, we are supposed to use a conservative
                 * default.  But, if we do that, then MTU discovery will
                 * never actually take place, because the conservative
                 * default is much less than the MTUs typically seen
                 * on the Internet today.  For the moment, we'll sweep
                 * this under the carpet.
                 *
                 * The conservative default might not actually be a problem
                 * if the only case this occurs is when sending an initial
                 * SYN with options and data to a host we've never talked
                 * to before.  Then, they will reply with an MSS value which
                 * will get recorded and the new parameters should get
                 * recomputed.  For Further Study.
                 */
                if (tp->t_maxopd <= mss)
                        return;
                tp->t_maxopd = mss;

                if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
                    (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
                        mss -= TCPOLEN_TSTAMP_APPA;
                if ((tp->t_flags & (TF_REQ_CC|TF_NOOPT)) == TF_REQ_CC &&
                    (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)
                        mss -= TCPOLEN_CC_APPA;
#if (MCLBYTES & (MCLBYTES - 1)) == 0
                if (mss > MCLBYTES)
                        mss &= ~(MCLBYTES-1);
#else
                if (mss > MCLBYTES)
                        mss = mss / MCLBYTES * MCLBYTES;
#endif
                if (so->so_snd.sb_hiwat < mss)
                        mss = so->so_snd.sb_hiwat;

                tp->t_maxseg = mss;

                tcpstat.tcps_mturesent++;
                tp->t_rtttime = 0;
                tp->snd_nxt = tp->snd_una;
                tcp_output(tp);
        }
}

/*
 * Look-up the routing entry to the peer of this inpcb.  If no route
 * is found and it cannot be allocated, then return NULL.  This routine
 * is called by TCP routines that access the rmx structure and by tcp_mss
 * to get the interface MTU.
 */
struct rtentry *
tcp_rtlookup(inc)
        struct in_conninfo *inc;
{
        struct route *ro;
        struct rtentry *rt;

        ro = &inc->inc_route;
        rt = ro->ro_rt;
        if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
                /* No route yet, so try to acquire one */
                if (inc->inc_faddr.s_addr != INADDR_ANY) {
                        ro->ro_dst.sa_family = AF_INET;
                        ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
                        ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
                            inc->inc_faddr;
                        rtalloc(ro);
                        rt = ro->ro_rt;
                }
        }
        return rt;
}

#ifdef INET6
struct rtentry *
tcp_rtlookup6(inc)
        struct in_conninfo *inc;
{
        struct route_in6 *ro6;
        struct rtentry *rt;

        ro6 = &inc->inc6_route;
        rt = ro6->ro_rt;
        if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
                /* No route yet, so try to acquire one */
                if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
                        ro6->ro_dst.sin6_family = AF_INET6;
                        ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
                        ro6->ro_dst.sin6_addr = inc->inc6_faddr;
                        rtalloc((struct route *)ro6);
                        rt = ro6->ro_rt;
                }
        }
        return rt;
}
#endif /* INET6 */

#ifdef IPSEC
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
ipsec_hdrsiz_tcp(tp)
        struct tcpcb *tp;
{
        struct inpcb *inp;
        struct mbuf *m;
        size_t hdrsiz;
        struct ip *ip;
#ifdef INET6
        struct ip6_hdr *ip6;
#endif /* INET6 */
        struct tcphdr *th;

        if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
                return 0;
        MGETHDR(m, M_DONTWAIT, MT_DATA);
        if (!m)
                return 0;

#ifdef INET6
        if ((inp->inp_vflag & INP_IPV6) != 0) {
                ip6 = mtod(m, struct ip6_hdr *);
                th = (struct tcphdr *)(ip6 + 1);
                m->m_pkthdr.len = m->m_len =
                    sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
                tcp_fillheaders(tp, ip6, th);
                hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
        } else
#endif /* INET6 */
        {
                ip = mtod(m, struct ip *);
                th = (struct tcphdr *)(ip + 1);
                m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
                tcp_fillheaders(tp, ip, th);
                hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
        }

        m_free(m);
        return hdrsiz;
}
#endif /*IPSEC*/

/*
 * Return a pointer to the cached information about the remote host.
 * The cached information is stored in the protocol specific part of
 * the route metrics.
 */
struct rmxp_tao *
tcp_gettaocache(inc)
        struct in_conninfo *inc;
{
        struct rtentry *rt;

#ifdef INET6
        if (inc->inc_isipv6)
                rt = tcp_rtlookup6(inc);
        else
#endif /* INET6 */
        rt = tcp_rtlookup(inc);

        /* Make sure this is a host route and is up. */
        if (rt == NULL ||
            (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
                return NULL;

        return rmx_taop(rt->rt_rmx);
}

/*
 * Clear all the TAO cache entries, called from tcp_init.
 *
 * XXX
 * This routine is just an empty one, because we assume that the
 * routing tables are initialized at the same time as TCP, so there
 * is nothing left over in the cache.
 */
static void
tcp_cleartaocache()
{
}

/*
 * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
 *
 * This code attempts to calculate the bandwidth-delay product as a
 * means of determining the optimal window size to maximize bandwidth,
 * minimize RTT, and avoid the over-allocation of buffers on interfaces and
 * routers.  This code also does a fairly good job keeping RTTs in check
 * across slow links like modems.  We implement an algorithm which is very
 * similar to (though not meant to be) TCP/Vegas.  The code operates on the
 * transmitter side of a TCP connection and so only affects the transmit
 * side of the connection.
 *
 * BACKGROUND: TCP makes no provision for the management of buffer space
 * at the end points or at the intermediate routers and switches.  A TCP
 * stream, whether using NewReno or not, will eventually buffer as
 * many packets as it is able and the only reason this typically works is
 * due to the fairly small default buffers made available for a connection
 * (typically 16K or 32K).  As machines use larger windows and/or window
 * scaling it is now fairly easy for even a single TCP connection to blow-out
 * all available buffer space not only on the local interface, but on
 * intermediate routers and switches as well.  NewReno makes a misguided
 * attempt to 'solve' this problem by waiting for an actual failure to occur,
 * then backing off, then steadily increasing the window again until another
 * failure occurs, ad-infinitum.  This results in terrible oscillation that
 * is only made worse as network loads increase and the idea of intentionally
 * blowing out network buffers is, frankly, a terrible way to manage network
 * resources.
 *
 * It is far better to limit the transmit window prior to the failure
 * condition being achieved.  There are two general ways to do this:  First
 * you can 'scan' through different transmit window sizes and locate the
 * point where the RTT stops increasing, indicating that you have filled the
 * pipe, then scan backwards until you note that RTT stops decreasing, then
 * repeat ad-infinitum.  This method works in principle but has severe
 * implementation issues due to RTT variances, timer granularity, and
 * instability in the algorithm which can lead to many false positives and
 * create oscillations as well as interact badly with other TCP streams
 * implementing the same algorithm.
 *
 * The second method is to limit the window to the bandwidth delay product
 * of the link.  This is the method we implement.  RTT variances and our
 * own manipulation of the congestion window, bwnd, can potentially
 * destabilize the algorithm.  For this reason we have to stabilize the
 * elements used to calculate the window.  We do this by using the minimum
 * observed RTT, the long term average of the observed bandwidth, and
 * by adding two segments worth of slop.  It isn't perfect but it is able
 * to react to changing conditions and gives us a very stable basis on
 * which to extend the algorithm.
 */
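
/*
 * Worked example with illustrative numbers: on a path with a long-term
 * average bandwidth of 1,250,000 bytes/sec (a 10 Mbit/s link) and a
 * best-observed RTT of 40 ms, the bandwidth delay product is
 * 1250000 * 0.040 = 50000 bytes.  With 1460-byte segments and the
 * default inflight_stab of 20 (two maximal packets of slop), the
 * resulting bwnd would be roughly 50000 + 2 * 1460 = 52920 bytes.
 */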
void
tcp_xmit_bandwidth_limit(struct tcpcb *tp, tcp_seq ack_seq)
{
        u_long bw;
        u_long bwnd;
        int save_ticks;

        /*
         * If inflight_enable is disabled in the middle of a tcp connection,
         * make sure snd_bwnd is effectively disabled.
         */
        if (tcp_inflight_enable == 0) {
                tp->snd_bwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
                tp->snd_bandwidth = 0;
                return;
        }

        /*
         * Figure out the bandwidth.  Due to the tick granularity this
         * is a very rough number and it MUST be averaged over a fairly
         * long period of time.  XXX we need to take into account a link
         * that is not using all available bandwidth, but for now our
         * slop will ramp us up if this case occurs and the bandwidth later
         * increases.
         *
         * Note: if ticks rolls over 'bw' may wind up negative.  We must
         * effectively reset t_bw_rtttime for this case.
         */
        save_ticks = ticks;
        if ((u_int)(save_ticks - tp->t_bw_rtttime) < 1)
                return;

        bw = (int64_t)(ack_seq - tp->t_bw_rtseq) * hz /
            (save_ticks - tp->t_bw_rtttime);
        tp->t_bw_rtttime = save_ticks;
        tp->t_bw_rtseq = ack_seq;
        if (tp->t_bw_rtttime == 0 || (int)bw < 0)
                return;
        bw = ((int64_t)tp->snd_bandwidth * 15 + bw) >> 4;
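        /* The line above is an exponential moving average with gain 1/16. */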

        tp->snd_bandwidth = bw;

        /*
         * Calculate the semi-static bandwidth delay product, plus two maximal
         * segments.  The additional slop puts us squarely in the sweet
         * spot and also handles the bandwidth run-up case.  Without the
         * slop we could be locking ourselves into a lower bandwidth.
         *
         * Situations Handled:
         *	(1) Prevents over-queueing of packets on LANs, especially on
         *	    high speed LANs, allowing larger TCP buffers to be
         *	    specified, and also does a good job preventing
         *	    over-queueing of packets over choke points like modems
         *	    (at least for the transmit side).
         *
         *	(2) Is able to handle changing network loads (bandwidth
         *	    drops so bwnd drops, bandwidth increases so bwnd
         *	    increases).
         *
         *	(3) Theoretically should stabilize in the face of multiple
         *	    connections implementing the same algorithm (this may
         *	    need a little work).
         *
         *	(4) Stability value (defaults to 20 = 2 maximal packets) can
         *	    be adjusted with a sysctl but typically only needs to be
         *	    adjusted on very slow connections.  A value no smaller
         *	    than 5 should be used, but only reduce this default if
         *	    you have no other choice.
         */
#define USERTT  ((tp->t_srtt + tp->t_rttbest) / 2)
        bwnd = (int64_t)bw * USERTT / (hz << TCP_RTT_SHIFT) +
            tcp_inflight_stab * (int)tp->t_maxseg / 10;
#undef USERTT

        if (tcp_inflight_debug > 0) {
                static int ltime;
                if ((u_int)(ticks - ltime) >= hz / tcp_inflight_debug) {
                        ltime = ticks;
                        printf("%p bw %ld rttbest %d srtt %d bwnd %ld\n",
                            tp, bw, tp->t_rttbest, tp->t_srtt, bwnd);
                }
        }
        if ((long)bwnd < tcp_inflight_min)
                bwnd = tcp_inflight_min;
        if (bwnd > tcp_inflight_max)
                bwnd = tcp_inflight_max;
        if ((long)bwnd < tp->t_maxseg * 2)
                bwnd = tp->t_maxseg * 2;
        tp->snd_bwnd = bwnd;
}
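
/*
 * Note: this function is driven by arriving ACKs; in this version of
 * the stack it is expected to be called from the ACK-processing path
 * in tcp_input() with th_ack as ack_seq.  The tcp_inflight_enable
 * check at the top makes it a no-op when the feature is disabled.
 */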

#ifdef TCP_SIGNATURE
/*
 * Callback function invoked by m_apply() to digest TCP segment data
 * contained within an mbuf chain.
 */
static int
tcp_signature_apply(void *fstate, void *data, u_int len)
{

        MD5Update(fstate, (u_char *)data, len);
        return (0);
}

/*
 * Compute TCP-MD5 hash of a TCPv4 segment. (RFC2385)
 *
 * Parameters:
 * m		pointer to head of mbuf chain
 * off0		offset to TCP header within the mbuf chain
 * len		length of TCP segment data, excluding options
 * optlen	length of TCP segment options
 * buf		pointer to storage for computed MD5 digest
 * direction	direction of flow (IPSEC_DIR_INBOUND or OUTBOUND)
 *
 * We do this over ip, tcphdr, segment data, and the key in the SADB.
 * When called from tcp_input(), we can be sure that th_sum has been
 * zeroed out and verified already.
 *
 * This function is for IPv4 use only.  Calling this function with an
 * IPv6 packet in the mbuf chain will yield undefined results.
 *
 * Return 0 if successful, otherwise return -1.
 *
 * XXX The key is retrieved from the system's PF_KEY SADB, by keying a
 * search with the destination IP address, and a 'magic SPI' of 0x1000.
 * Another branch of this code exists which uses the SPD to specify
 * per-application flows, but it is unstable.
 */
int
tcp_signature_compute(struct mbuf *m, int off0, int len, int optlen,
    u_char *buf, u_int direction)
{
        union sockaddr_union dst;
        struct ippseudo ippseudo;
        MD5_CTX ctx;
        int doff;
        struct ip *ip;
        struct ipovly *ipovly;
        struct secasvar *sav;
        struct tcphdr *th;
        u_short savecsum;

        KASSERT(m != NULL, ("NULL mbuf chain"));
        KASSERT(buf != NULL, ("NULL signature pointer"));

        /* Extract the destination from the IP header in the mbuf. */
        ip = mtod(m, struct ip *);
        bzero(&dst, sizeof(union sockaddr_union));
        dst.sa.sa_len = sizeof(struct sockaddr_in);
        dst.sa.sa_family = AF_INET;
        dst.sin.sin_addr = (direction == IPSEC_DIR_INBOUND) ?
            ip->ip_src : ip->ip_dst;

        /* Look up an SADB entry which matches the address of the peer. */
        sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
        if (sav == NULL) {
                printf("%s: SADB lookup failed for %s\n", __func__,
                    inet_ntoa(dst.sin.sin_addr));
                return (EINVAL);
        }

        MD5Init(&ctx);
        ipovly = (struct ipovly *)ip;
        th = (struct tcphdr *)((u_char *)ip + off0);
        doff = off0 + sizeof(struct tcphdr) + optlen;

        /*
         * Step 1: Update MD5 hash with IP pseudo-header.
         *
         * XXX The ippseudo header MUST be digested in network byte order,
         * or else we'll fail the regression test.  Assume all fields we've
         * been doing arithmetic on have been in host byte order.
         * XXX One cannot depend on ipovly->ih_len here.  When called from
         * tcp_output(), the underlying ip_len member has not yet been set.
         */
        ippseudo.ippseudo_src = ipovly->ih_src;
        ippseudo.ippseudo_dst = ipovly->ih_dst;
        ippseudo.ippseudo_pad = 0;
        ippseudo.ippseudo_p = IPPROTO_TCP;
        ippseudo.ippseudo_len = htons(len + sizeof(struct tcphdr) + optlen);
        MD5Update(&ctx, (char *)&ippseudo, sizeof(struct ippseudo));

        /*
         * Step 2: Update MD5 hash with TCP header, excluding options.
         * The TCP checksum must be set to zero.
         */
        savecsum = th->th_sum;
        th->th_sum = 0;
        MD5Update(&ctx, (char *)th, sizeof(struct tcphdr));
        th->th_sum = savecsum;

        /*
         * Step 3: Update MD5 hash with TCP segment data.
         *         Use m_apply() to avoid an early m_pullup().
         */
        if (len > 0)
                m_apply(m, doff, len, tcp_signature_apply, &ctx);

        /*
         * Step 4: Update MD5 hash with shared secret.
         */
        MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
        MD5Final(buf, &ctx);

        key_sa_recordxfer(sav, m);
        KEY_FREESAV(&sav);
        return (0);
}
#endif /* TCP_SIGNATURE */