The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_subr.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: tcp_subr.c,v 1.187.2.3 2005/05/06 08:39:52 tron Exp $  */
    2 
    3 /*
    4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  * 3. Neither the name of the project nor the names of its contributors
   16  *    may be used to endorse or promote products derived from this software
   17  *    without specific prior written permission.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
   20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
   23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   29  * SUCH DAMAGE.
   30  */
   31 
   32 /*-
   33  * Copyright (c) 1997, 1998, 2000, 2001 The NetBSD Foundation, Inc.
   34  * All rights reserved.
   35  *
   36  * This code is derived from software contributed to The NetBSD Foundation
   37  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
   38  * Facility, NASA Ames Research Center.
   39  *
   40  * Redistribution and use in source and binary forms, with or without
   41  * modification, are permitted provided that the following conditions
   42  * are met:
   43  * 1. Redistributions of source code must retain the above copyright
   44  *    notice, this list of conditions and the following disclaimer.
   45  * 2. Redistributions in binary form must reproduce the above copyright
   46  *    notice, this list of conditions and the following disclaimer in the
   47  *    documentation and/or other materials provided with the distribution.
   48  * 3. All advertising materials mentioning features or use of this software
   49  *    must display the following acknowledgement:
   50  *      This product includes software developed by the NetBSD
   51  *      Foundation, Inc. and its contributors.
   52  * 4. Neither the name of The NetBSD Foundation nor the names of its
   53  *    contributors may be used to endorse or promote products derived
   54  *    from this software without specific prior written permission.
   55  *
   56  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   57  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   58  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   59  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   60  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   61  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   62  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   63  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   64  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   65  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   66  * POSSIBILITY OF SUCH DAMAGE.
   67  */
   68 
   69 /*
   70  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
   71  *      The Regents of the University of California.  All rights reserved.
   72  *
   73  * Redistribution and use in source and binary forms, with or without
   74  * modification, are permitted provided that the following conditions
   75  * are met:
   76  * 1. Redistributions of source code must retain the above copyright
   77  *    notice, this list of conditions and the following disclaimer.
   78  * 2. Redistributions in binary form must reproduce the above copyright
   79  *    notice, this list of conditions and the following disclaimer in the
   80  *    documentation and/or other materials provided with the distribution.
   81  * 3. Neither the name of the University nor the names of its contributors
   82  *    may be used to endorse or promote products derived from this software
   83  *    without specific prior written permission.
   84  *
   85  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   86  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   87  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   88  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   89  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   90  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   91  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   92  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   93  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   94  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   95  * SUCH DAMAGE.
   96  *
   97  *      @(#)tcp_subr.c  8.2 (Berkeley) 5/24/95
   98  */
   99 
  100 #include <sys/cdefs.h>
  101 __KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.187.2.3 2005/05/06 08:39:52 tron Exp $");
  102 
  103 #include "opt_inet.h"
  104 #include "opt_ipsec.h"
  105 #include "opt_tcp_compat_42.h"
  106 #include "opt_inet_csum.h"
  107 #include "opt_mbuftrace.h"
  108 #include "rnd.h"
  109 
  110 #include <sys/param.h>
  111 #include <sys/proc.h>
  112 #include <sys/systm.h>
  113 #include <sys/malloc.h>
  114 #include <sys/mbuf.h>
  115 #include <sys/socket.h>
  116 #include <sys/socketvar.h>
  117 #include <sys/protosw.h>
  118 #include <sys/errno.h>
  119 #include <sys/kernel.h>
  120 #include <sys/pool.h>
  121 #if NRND > 0
  122 #include <sys/md5.h>
  123 #include <sys/rnd.h>
  124 #endif
  125 
  126 #include <net/route.h>
  127 #include <net/if.h>
  128 
  129 #include <netinet/in.h>
  130 #include <netinet/in_systm.h>
  131 #include <netinet/ip.h>
  132 #include <netinet/in_pcb.h>
  133 #include <netinet/ip_var.h>
  134 #include <netinet/ip_icmp.h>
  135 
  136 #ifdef INET6
  137 #ifndef INET
  138 #include <netinet/in.h>
  139 #endif
  140 #include <netinet/ip6.h>
  141 #include <netinet6/in6_pcb.h>
  142 #include <netinet6/ip6_var.h>
  143 #include <netinet6/in6_var.h>
  144 #include <netinet6/ip6protosw.h>
  145 #include <netinet/icmp6.h>
  146 #include <netinet6/nd6.h>
  147 #endif
  148 
  149 #include <netinet/tcp.h>
  150 #include <netinet/tcp_fsm.h>
  151 #include <netinet/tcp_seq.h>
  152 #include <netinet/tcp_timer.h>
  153 #include <netinet/tcp_var.h>
  154 #include <netinet/tcpip.h>
  155 
  156 #ifdef IPSEC
  157 #include <netinet6/ipsec.h>
  158 #include <netkey/key.h>
  159 #endif /*IPSEC*/
  160 
  161 #ifdef FAST_IPSEC
  162 #include <netipsec/ipsec.h>
  163 #include <netipsec/xform.h>
  164 #ifdef INET6
  165 #include <netipsec/ipsec6.h>
  166 #endif
  167  #include <netipsec/key.h>
  168 #endif  /* FAST_IPSEC*/
  169 
  170 
      /*
       * File-scope TCP state and run-time tunables.  Everything here has
       * external linkage (no `static').
       */
  171 struct  inpcbtable tcbtable;    /* head of queue of active tcpcb's */
  172 struct  tcpstat tcpstat;        /* tcp statistics */
  173 u_int32_t tcp_now;              /* for RFC 1323 timestamps */
  174 
  175 /* patchable/settable parameters for tcp */
  176 int     tcp_mssdflt = TCP_MSS;  /* initialized from TCP_MSS */
  177 int     tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ; /* TCPTV_SRTTDFLT divided by PR_SLOWHZ */
  178 int     tcp_do_rfc1323 = 1;     /* window scaling / timestamps (obsolete) */
  179 #if NRND > 0
  180 int     tcp_do_rfc1948 = 0;     /* ISS by cryptographic hash */
  181 #endif
  182 int     tcp_do_sack = 1;        /* selective acknowledgement */
  183 int     tcp_do_win_scale = 1;   /* RFC1323 window scaling */
  184 int     tcp_do_timestamps = 1;  /* RFC1323 timestamps */
  185 int     tcp_do_newreno = 1;     /* Use the New Reno algorithms */
  186 int     tcp_ack_on_push = 0;    /* set to enable immediate ACK-on-PUSH */
      /*
       * Both initial-window defaults are only defined here when the build
       * has not already supplied TCP_INIT_WIN / TCP_INIT_WIN_LOCAL.
       */
  187 #ifndef TCP_INIT_WIN
  188 #define TCP_INIT_WIN    0       /* initial slow start window */
  189 #endif
  190 #ifndef TCP_INIT_WIN_LOCAL
  191 #define TCP_INIT_WIN_LOCAL 4    /* initial slow start window for local nets */
  192 #endif
  193 int     tcp_init_win = TCP_INIT_WIN;
  194 int     tcp_init_win_local = TCP_INIT_WIN_LOCAL;
  195 int     tcp_mss_ifmtu = 0;
      /*
       * tcp_compat_42 defaults to 1 only when TCP_COMPAT_42 is defined.
       * When it is nonzero, tcp_respond() starts tlen at 1 instead of 0
       * for template-based sends.
       */
  196 #ifdef TCP_COMPAT_42
  197 int     tcp_compat_42 = 1;
  198 #else
  199 int     tcp_compat_42 = 0;
  200 #endif
  201 int     tcp_rst_ppslim = 100;   /* 100pps */
  202 int     tcp_ackdrop_ppslim = 100;       /* 100pps */
      /*
       * NOTE(review): the three SACK variables below look like a
       * per-connection hole limit, a global hole limit and a running global
       * count -- confirm against the SACK code, which is not visible here.
       */
  203 int     tcp_sack_tp_maxholes = 32;
  204 int     tcp_sack_globalmaxholes = 1024;
  205 int     tcp_sack_globalholes = 0;
  206 
  207 
  208 /* tcb hash */
      /* TCBHASHSIZE may be supplied by the build; 128 is the fallback. */
  209 #ifndef TCBHASHSIZE
  210 #define TCBHASHSIZE     128
  211 #endif
  212 int     tcbhashsize = TCBHASHSIZE;      /* handed to in_pcbinit() by tcp_init() */
  213 
  214 /* syn hash parameters */
  215 #define TCP_SYN_HASH_SIZE       293
  216 #define TCP_SYN_BUCKET_SIZE     35
  217 int     tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
  218 int     tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;    /* 293 * 35 = 10255 */
  219 int     tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;   /* 3 * 35 = 105 */
      /*
       * The table itself is dimensioned by the compile-time constant, not by
       * the tcp_syn_cache_size variable above.
       */
  220 struct  syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
  221 
      /* Prototypes for routines whose definitions are not in this part of the file. */
  222 int     tcp_freeq(struct tcpcb *);
  223 
      /*
       * Per-address-family callbacks; tcp_init() registers them through
       * icmp_mtudisc_callback_register() / icmp6_mtudisc_callback_register().
       */
  224 #ifdef INET
  225 void    tcp_mtudisc_callback(struct in_addr);
  226 #endif
  227 #ifdef INET6
  228 void    tcp6_mtudisc_callback(struct in6_addr *);
  229 #endif
  230 
  231 void    tcp_mtudisc(struct inpcb *, int);
  232 #ifdef INET6
  233 void    tcp6_mtudisc(struct in6pcb *, int);
  234 #endif
  235 
      /*
       * NOTE(review): presumably declares the allocation pool for struct
       * tcpcb (item size sizeof(struct tcpcb), label "tcpcbpl") -- confirm
       * the remaining POOL_INIT arguments against the pool documentation.
       */
  236 POOL_INIT(tcpcb_pool, sizeof(struct tcpcb), 0, 0, 0, "tcpcbpl", NULL);
  237 
      /*
       * Checksum event counters, compiled in only when TCP_CSUM_COUNTERS is
       * defined.  Each counter is defined with EVCNT_INITIALIZER and then
       * attached with EVCNT_ATTACH_STATIC; all four use the "tcp" string and
       * a NULL second argument.
       */
  238 #ifdef TCP_CSUM_COUNTERS
  239 #include <sys/device.h>
  240 
  241 struct evcnt tcp_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  242     NULL, "tcp", "hwcsum bad");
  243 struct evcnt tcp_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  244     NULL, "tcp", "hwcsum ok");
  245 struct evcnt tcp_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  246     NULL, "tcp", "hwcsum data");
  247 struct evcnt tcp_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  248     NULL, "tcp", "swcsum");
  249 
  250 EVCNT_ATTACH_STATIC(tcp_hwcsum_bad);
  251 EVCNT_ATTACH_STATIC(tcp_hwcsum_ok);
  252 EVCNT_ATTACH_STATIC(tcp_hwcsum_data);
  253 EVCNT_ATTACH_STATIC(tcp_swcsum);
  254 #endif /* TCP_CSUM_COUNTERS */
  255 
  256 
      /*
       * Output-path event counters, compiled in only when
       * TCP_OUTPUT_COUNTERS is defined.  Six counters are defined and each
       * one is attached with EVCNT_ATTACH_STATIC below.
       */
  257 #ifdef TCP_OUTPUT_COUNTERS
  258 #include <sys/device.h>
  259 
  260 struct evcnt tcp_output_bigheader = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  261     NULL, "tcp", "output big header");
  262 struct evcnt tcp_output_predict_hit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  263     NULL, "tcp", "output predict hit");
  264 struct evcnt tcp_output_predict_miss = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  265     NULL, "tcp", "output predict miss");
  266 struct evcnt tcp_output_copysmall = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  267     NULL, "tcp", "output copy small");
  268 struct evcnt tcp_output_copybig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  269     NULL, "tcp", "output copy big");
  270 struct evcnt tcp_output_refbig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  271     NULL, "tcp", "output reference big");
  272 
  273 EVCNT_ATTACH_STATIC(tcp_output_bigheader);
  274 EVCNT_ATTACH_STATIC(tcp_output_predict_hit);
  275 EVCNT_ATTACH_STATIC(tcp_output_predict_miss);
  276 EVCNT_ATTACH_STATIC(tcp_output_copysmall);
  277 EVCNT_ATTACH_STATIC(tcp_output_copybig);
  278 EVCNT_ATTACH_STATIC(tcp_output_refbig);
  279 
  280 #endif /* TCP_OUTPUT_COUNTERS */
  281 
      /*
       * Reassembly event counters, compiled in only when TCP_REASS_COUNTERS
       * is defined.  tcp_reass_ ("calls") is the only one created with a
       * NULL second argument; every other counter passes &tcp_reass_ there
       * (presumably as its parent counter -- confirm against evcnt docs).
       */
  282 #ifdef TCP_REASS_COUNTERS
  283 #include <sys/device.h>
  284 
  285 struct evcnt tcp_reass_ = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  286     NULL, "tcp_reass", "calls");
  287 struct evcnt tcp_reass_empty = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  288     &tcp_reass_, "tcp_reass", "insert into empty queue");
      /*
       * Per-iteration-count buckets.  Slots 1..7 are labelled with their own
       * index; slot 0 is the overflow bucket labelled ">7 iterations".
       */
  289 struct evcnt tcp_reass_iteration[8] = {
  290     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", ">7 iterations"),
  291     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "1 iteration"),
  292     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "2 iterations"),
  293     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "3 iterations"),
  294     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "4 iterations"),
  295     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "5 iterations"),
  296     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "6 iterations"),
  297     EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "7 iterations"),
  298 };
  299 struct evcnt tcp_reass_prependfirst = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  300     &tcp_reass_, "tcp_reass", "prepend to first");
  301 struct evcnt tcp_reass_prepend = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  302     &tcp_reass_, "tcp_reass", "prepend");
  303 struct evcnt tcp_reass_insert = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  304     &tcp_reass_, "tcp_reass", "insert");
  305 struct evcnt tcp_reass_inserttail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  306     &tcp_reass_, "tcp_reass", "insert at tail");
  307 struct evcnt tcp_reass_append = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  308     &tcp_reass_, "tcp_reass", "append");
  309 struct evcnt tcp_reass_appendtail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  310     &tcp_reass_, "tcp_reass", "append to tail fragment");
  311 struct evcnt tcp_reass_overlaptail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  312     &tcp_reass_, "tcp_reass", "overlap at end");
  313 struct evcnt tcp_reass_overlapfront = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  314     &tcp_reass_, "tcp_reass", "overlap at start");
  315 struct evcnt tcp_reass_segdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  316     &tcp_reass_, "tcp_reass", "duplicate segment");
  317 struct evcnt tcp_reass_fragdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
  318     &tcp_reass_, "tcp_reass", "duplicate fragment");
  319 
      /* Array elements are attached one by one via the two-argument macro. */
  320 EVCNT_ATTACH_STATIC(tcp_reass_);
  321 EVCNT_ATTACH_STATIC(tcp_reass_empty);
  322 EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 0);
  323 EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 1);
  324 EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 2);
  325 EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 3);
  326 EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 4);
  327 EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 5);
  328 EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 6);
  329 EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 7);
  330 EVCNT_ATTACH_STATIC(tcp_reass_prependfirst);
  331 EVCNT_ATTACH_STATIC(tcp_reass_prepend);
  332 EVCNT_ATTACH_STATIC(tcp_reass_insert);
  333 EVCNT_ATTACH_STATIC(tcp_reass_inserttail);
  334 EVCNT_ATTACH_STATIC(tcp_reass_append);
  335 EVCNT_ATTACH_STATIC(tcp_reass_appendtail);
  336 EVCNT_ATTACH_STATIC(tcp_reass_overlaptail);
  337 EVCNT_ATTACH_STATIC(tcp_reass_overlapfront);
  338 EVCNT_ATTACH_STATIC(tcp_reass_segdup);
  339 EVCNT_ATTACH_STATIC(tcp_reass_fragdup);
  340 
  341 #endif /* TCP_REASS_COUNTERS */
  342 
      /*
       * mbuf owner records, defined only under MBUFTRACE.  tcp_init() attaches
       * all three with MOWNER_ATTACH; tcp_template() claims its mbuf for
       * tcp_mowner and tcp_respond() claims its mbufs for tcp_tx_mowner.
       */
  343 #ifdef MBUFTRACE
  344 struct mowner tcp_mowner = { "tcp" };
  345 struct mowner tcp_rx_mowner = { "tcp", "rx" };
  346 struct mowner tcp_tx_mowner = { "tcp", "tx" };
  347 #endif
  348 
  349 /*
  350  * Tcp initialization
       *
       * Takes no arguments and returns nothing.  In order, it: builds the
       * tcpcb template, initializes the PCB table `tcbtable', raises
       * max_protohdr so it covers the largest IP header plus a TCP header,
       * registers the mtudisc callbacks with ICMP (and ICMPv6 under INET6),
       * initializes timer state and the syn cache, and attaches the three
       * mbuf owner records.  Panics with "tcp_init" if max_linkhdr plus that
       * header length exceeds MHLEN.
  351  */
  352 void
  353 tcp_init(void)
  354 {
  355         int hlen;
  356 
  357         /* Initialize the TCPCB template. */
  358         tcp_tcpcb_template();
  359 
              /* tcbhashsize is passed for both trailing arguments. */
  360         in_pcbinit(&tcbtable, tcbhashsize, tcbhashsize);
  361 
              /*
               * hlen = larger of the IPv4 and IPv6 header sizes, plus the
               * fixed TCP header.
               */
  362         hlen = sizeof(struct ip) + sizeof(struct tcphdr);
  363 #ifdef INET6
  364         if (sizeof(struct ip) < sizeof(struct ip6_hdr))
  365                 hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
  366 #endif
              /* Only ever grow the global; never shrink it. */
  367         if (max_protohdr < hlen)
  368                 max_protohdr = hlen;
              /* Link-level plus protocol headers must fit within MHLEN. */
  369         if (max_linkhdr + hlen > MHLEN)
  370                 panic("tcp_init");
  371 
  372 #ifdef INET
  373         icmp_mtudisc_callback_register(tcp_mtudisc_callback);
  374 #endif
  375 #ifdef INET6
  376         icmp6_mtudisc_callback_register(tcp6_mtudisc_callback);
  377 #endif
  378 
  379         /* Initialize timer state. */
  380         tcp_timer_init();
  381 
  382         /* Initialize the compressed state engine. */
  383         syn_cache_init();
  384 
              /* Attach the tx, rx and generic mbuf owner records. */
  385         MOWNER_ATTACH(&tcp_tx_mowner);
  386         MOWNER_ATTACH(&tcp_rx_mowner);
  387         MOWNER_ATTACH(&tcp_mowner);
  388 }
  389 
  390 /*
  391  * Create template to be used to send tcp packets on a connection.
  392  * Call after host entry created, allocates an mbuf and fills
  393  * in a skeletal tcp/ip header, minimizing the amount of work
  394  * necessary when the connection is used.
       *
       * tp: connection whose t_family, t_inpcb / t_in6pcb and t_template
       *     are consulted.
       *
       * Returns the template mbuf, or NULL when the address family is not
       * handled, no suitable PCB is attached to tp, or mbuf allocation fails.
       *
       * An existing tp->t_template is reused when its m_len already equals
       * the required header length; otherwise it is freed and
       * tp->t_template is set to NULL.  A freshly allocated mbuf is only
       * returned -- this function never stores it back into tp->t_template
       * (presumably the caller does; verify against callers).
  395  */
  396 struct mbuf *
  397 tcp_template(struct tcpcb *tp)
  398 {
  399         struct inpcb *inp = tp->t_inpcb;
  400 #ifdef INET6
  401         struct in6pcb *in6p = tp->t_in6pcb;
  402 #endif
  403         struct tcphdr *n;
  404         struct mbuf *m;
  405         int hlen;
  406 
              /*
               * Choose the IP header length for the family and check that a
               * usable PCB is present; every path that does not `break'
               * returns NULL.
               */
  407         switch (tp->t_family) {
  408         case AF_INET:
  409                 hlen = sizeof(struct ip);
  410                 if (inp)
  411                         break;
  412 #ifdef INET6
  413                 if (in6p) {
  414                         /* mapped addr case */
  415                         if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_laddr)
  416                          && IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr))
  417                                 break;
  418                 }
  419 #endif
  420                 return NULL;    /*EINVAL*/
  421 #ifdef INET6
  422         case AF_INET6:
  423                 hlen = sizeof(struct ip6_hdr);
  424                 if (in6p) {
  425                         /* more sanity check? */
  426                         break;
  427                 }
  428                 return NULL;    /*EINVAL*/
  429 #endif
  430         default:
  431                 hlen = 0;       /*pacify gcc*/
  432                 return NULL;    /*EAFNOSUPPORT*/
  433         }
  434 #ifdef DIAGNOSTIC
  435         if (hlen + sizeof(struct tcphdr) > MCLBYTES)
  436                 panic("mclbytes too small for t_template");
  437 #endif
  438         m = tp->t_template;
              /* Keep the cached template if its length is already right. */
  439         if (m && m->m_len == hlen + sizeof(struct tcphdr))
  440                 ;
  441         else {
  442                 if (m)
  443                         m_freem(m);
                      /* Forget the stale template before allocating a new one. */
  444                 m = tp->t_template = NULL;
  445                 MGETHDR(m, M_DONTWAIT, MT_HEADER);
                      /*
                       * Headers larger than MHLEN need a cluster; M_EXT still
                       * clear after MCLGET is treated as allocation failure.
                       */
  446                 if (m && hlen + sizeof(struct tcphdr) > MHLEN) {
  447                         MCLGET(m, M_DONTWAIT);
  448                         if ((m->m_flags & M_EXT) == 0) {
  449                                 m_free(m);
  450                                 m = NULL;
  451                         }
  452                 }
  453                 if (m == NULL)
  454                         return NULL;
  455                 MCLAIM(m, &tcp_mowner);
  456                 m->m_pkthdr.len = m->m_len = hlen + sizeof(struct tcphdr);
  457         }
  458 
              /* Zero the whole IP + TCP header area, reused or new. */
  459         bzero(mtod(m, caddr_t), m->m_len);
  460 
              /* The TCP header starts right after the IP header. */
  461         n = (struct tcphdr *)(mtod(m, caddr_t) + hlen);
  462 
  463         switch (tp->t_family) {
  464         case AF_INET:
  465             {
  466                 struct ipovly *ipov;
  467                 mtod(m, struct ip *)->ip_v = 4;
                      /* ip_hl is the header length in 32-bit words. */
  468                 mtod(m, struct ip *)->ip_hl = hlen >> 2;
  469                 ipov = mtod(m, struct ipovly *);
  470                 ipov->ih_pr = IPPROTO_TCP;
  471                 ipov->ih_len = htons(sizeof(struct tcphdr));
  472                 if (inp) {
  473                         ipov->ih_src = inp->inp_laddr;
  474                         ipov->ih_dst = inp->inp_faddr;
  475                 }
  476 #ifdef INET6
  477                 else if (in6p) {
  478                         /* mapped addr case */
                              /* the IPv4 address is the last 32-bit word */
  479                         bcopy(&in6p->in6p_laddr.s6_addr32[3], &ipov->ih_src,
  480                                 sizeof(ipov->ih_src));
  481                         bcopy(&in6p->in6p_faddr.s6_addr32[3], &ipov->ih_dst,
  482                                 sizeof(ipov->ih_dst));
  483                 }
  484 #endif
  485                 /*
  486                  * Compute the pseudo-header portion of the checksum
  487                  * now.  We incrementally add in the TCP option and
  488                  * payload lengths later, and then compute the TCP
  489                  * checksum right before the packet is sent off onto
  490                  * the wire.
  491                  */
  492                 n->th_sum = in_cksum_phdr(ipov->ih_src.s_addr,
  493                     ipov->ih_dst.s_addr,
  494                     htons(sizeof(struct tcphdr) + IPPROTO_TCP));
  495                 break;
  496             }
  497 #ifdef INET6
  498         case AF_INET6:
  499             {
  500                 struct ip6_hdr *ip6;
  501                 mtod(m, struct ip *)->ip_v = 6;
  502                 ip6 = mtod(m, struct ip6_hdr *);
  503                 ip6->ip6_nxt = IPPROTO_TCP;
  504                 ip6->ip6_plen = htons(sizeof(struct tcphdr));
  505                 ip6->ip6_src = in6p->in6p_laddr;
  506                 ip6->ip6_dst = in6p->in6p_faddr;
  507                 ip6->ip6_flow = in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK;
                      /* Optionally replace the flow label with a random one. */
  508                 if (ip6_auto_flowlabel) {
  509                         ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
  510                         ip6->ip6_flow |=
  511                             (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
  512                 }
                      /*
                       * Re-assert the version bits after ip6_flow was written
                       * (ip6_vfc presumably overlays part of ip6_flow --
                       * confirm against ip6.h).
                       */
  513                 ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
  514                 ip6->ip6_vfc |= IPV6_VERSION;
  515 
  516                 /*
  517                  * Compute the pseudo-header portion of the checksum
  518                  * now.  We incrementally add in the TCP option and
  519                  * payload lengths later, and then compute the TCP
  520                  * checksum right before the packet is sent off onto
  521                  * the wire.
  522                  */
  523                 n->th_sum = in6_cksum_phdr(&in6p->in6p_laddr,
  524                     &in6p->in6p_faddr, htonl(sizeof(struct tcphdr)),
  525                     htonl(IPPROTO_TCP));
  526                 break;
  527             }
  528 #endif
  529         }
              /* Ports are copied from whichever PCB is attached. */
  530         if (inp) {
  531                 n->th_sport = inp->inp_lport;
  532                 n->th_dport = inp->inp_fport;
  533         }
  534 #ifdef INET6
  535         else if (in6p) {
  536                 n->th_sport = in6p->in6p_lport;
  537                 n->th_dport = in6p->in6p_fport;
  538         }
  539 #endif
              /*
               * The zero stores below repeat what the bzero above already
               * did; th_off is the only non-zero field.  It is counted in
               * 32-bit words (tcp_respond() converts it with << 2), so 5
               * means a 20-byte header.
               */
  540         n->th_seq = 0;
  541         n->th_ack = 0;
  542         n->th_x2 = 0;
  543         n->th_off = 5;
  544         n->th_flags = 0;
  545         n->th_win = 0;
  546         n->th_urp = 0;
  547         return (m);
  548 }
  549 
  550 /*
  551  * Send a single message to the TCP at address specified by
  552  * the given TCP/IP header.  If m == 0, then we make a copy
  553  * of the tcpiphdr at ti and send directly to the addressed host.
  554  * This is used to force keep alive messages out using the TCP
  555  * template for a connection tp->t_template.  If flags are given
  556  * then we send a message back to the TCP which originated the
  557  * segment ti, and discard the mbuf containing it and any other
  558  * attached mbufs.
  559  *
  560  * In any case the ack and sequence number of the transmitted
  561  * segment are as specified by the parameters.
  562  */
  563 int
  564 tcp_respond(struct tcpcb *tp, struct mbuf *template, struct mbuf *m,
  565     struct tcphdr *th0, tcp_seq ack, tcp_seq seq, int flags)
  566 {
  567         struct route *ro;
  568         int error, tlen, win = 0;
  569         int hlen;
  570         struct ip *ip;
  571 #ifdef INET6
  572         struct ip6_hdr *ip6;
  573 #endif
  574         int family;     /* family on packet, not inpcb/in6pcb! */
  575         struct tcphdr *th;
  576         struct socket *so;
  577 
  578         if (tp != NULL && (flags & TH_RST) == 0) {
  579 #ifdef DIAGNOSTIC
  580                 if (tp->t_inpcb && tp->t_in6pcb)
  581                         panic("tcp_respond: both t_inpcb and t_in6pcb are set");
  582 #endif
  583 #ifdef INET
  584                 if (tp->t_inpcb)
  585                         win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
  586 #endif
  587 #ifdef INET6
  588                 if (tp->t_in6pcb)
  589                         win = sbspace(&tp->t_in6pcb->in6p_socket->so_rcv);
  590 #endif
  591         }
  592 
  593         th = NULL;      /* Quell uninitialized warning */
  594         ip = NULL;
  595 #ifdef INET6
  596         ip6 = NULL;
  597 #endif
  598         if (m == 0) {
  599                 if (!template)
  600                         return EINVAL;
  601 
  602                 /* get family information from template */
  603                 switch (mtod(template, struct ip *)->ip_v) {
  604                 case 4:
  605                         family = AF_INET;
  606                         hlen = sizeof(struct ip);
  607                         break;
  608 #ifdef INET6
  609                 case 6:
  610                         family = AF_INET6;
  611                         hlen = sizeof(struct ip6_hdr);
  612                         break;
  613 #endif
  614                 default:
  615                         return EAFNOSUPPORT;
  616                 }
  617 
  618                 MGETHDR(m, M_DONTWAIT, MT_HEADER);
  619                 if (m) {
  620                         MCLAIM(m, &tcp_tx_mowner);
  621                         MCLGET(m, M_DONTWAIT);
  622                         if ((m->m_flags & M_EXT) == 0) {
  623                                 m_free(m);
  624                                 m = NULL;
  625                         }
  626                 }
  627                 if (m == NULL)
  628                         return (ENOBUFS);
  629 
  630                 if (tcp_compat_42)
  631                         tlen = 1;
  632                 else
  633                         tlen = 0;
  634 
  635                 m->m_data += max_linkhdr;
  636                 bcopy(mtod(template, caddr_t), mtod(m, caddr_t),
  637                         template->m_len);
  638                 switch (family) {
  639                 case AF_INET:
  640                         ip = mtod(m, struct ip *);
  641                         th = (struct tcphdr *)(ip + 1);
  642                         break;
  643 #ifdef INET6
  644                 case AF_INET6:
  645                         ip6 = mtod(m, struct ip6_hdr *);
  646                         th = (struct tcphdr *)(ip6 + 1);
  647                         break;
  648 #endif
  649 #if 0
  650                 default:
  651                         /* noone will visit here */
  652                         m_freem(m);
  653                         return EAFNOSUPPORT;
  654 #endif
  655                 }
  656                 flags = TH_ACK;
  657         } else {
  658 
  659                 if ((m->m_flags & M_PKTHDR) == 0) {
  660 #if 0
  661                         printf("non PKTHDR to tcp_respond\n");
  662 #endif
  663                         m_freem(m);
  664                         return EINVAL;
  665                 }
  666 #ifdef DIAGNOSTIC
  667                 if (!th0)
  668                         panic("th0 == NULL in tcp_respond");
  669 #endif
  670 
  671                 /* get family information from m */
  672                 switch (mtod(m, struct ip *)->ip_v) {
  673                 case 4:
  674                         family = AF_INET;
  675                         hlen = sizeof(struct ip);
  676                         ip = mtod(m, struct ip *);
  677                         break;
  678 #ifdef INET6
  679                 case 6:
  680                         family = AF_INET6;
  681                         hlen = sizeof(struct ip6_hdr);
  682                         ip6 = mtod(m, struct ip6_hdr *);
  683                         break;
  684 #endif
  685                 default:
  686                         m_freem(m);
  687                         return EAFNOSUPPORT;
  688                 }
  689                 /* clear h/w csum flags inherited from rx packet */
  690                 m->m_pkthdr.csum_flags = 0;
  691 
  692                 if ((flags & TH_SYN) == 0 || sizeof(*th0) > (th0->th_off << 2))
  693                         tlen = sizeof(*th0);
  694                 else
  695                         tlen = th0->th_off << 2;
  696 
  697                 if (m->m_len > hlen + tlen && (m->m_flags & M_EXT) == 0 &&
  698                     mtod(m, caddr_t) + hlen == (caddr_t)th0) {
  699                         m->m_len = hlen + tlen;
  700                         m_freem(m->m_next);
  701                         m->m_next = NULL;
  702                 } else {
  703                         struct mbuf *n;
  704 
  705 #ifdef DIAGNOSTIC
  706                         if (max_linkhdr + hlen + tlen > MCLBYTES) {
  707                                 m_freem(m);
  708                                 return EMSGSIZE;
  709                         }
  710 #endif
  711                         MGETHDR(n, M_DONTWAIT, MT_HEADER);
  712                         if (n && max_linkhdr + hlen + tlen > MHLEN) {
  713                                 MCLGET(n, M_DONTWAIT);
  714                                 if ((n->m_flags & M_EXT) == 0) {
  715                                         m_freem(n);
  716                                         n = NULL;
  717                                 }
  718                         }
  719                         if (!n) {
  720                                 m_freem(m);
  721                                 return ENOBUFS;
  722                         }
  723 
  724                         MCLAIM(n, &tcp_tx_mowner);
  725                         n->m_data += max_linkhdr;
  726                         n->m_len = hlen + tlen;
  727                         m_copyback(n, 0, hlen, mtod(m, caddr_t));
  728                         m_copyback(n, hlen, tlen, (caddr_t)th0);
  729 
  730                         m_freem(m);
  731                         m = n;
  732                         n = NULL;
  733                 }
  734 
  735 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
  736                 switch (family) {
  737                 case AF_INET:
  738                         ip = mtod(m, struct ip *);
  739                         th = (struct tcphdr *)(ip + 1);
  740                         ip->ip_p = IPPROTO_TCP;
  741                         xchg(ip->ip_dst, ip->ip_src, struct in_addr);
  742                         ip->ip_p = IPPROTO_TCP;
  743                         break;
  744 #ifdef INET6
  745                 case AF_INET6:
  746                         ip6 = mtod(m, struct ip6_hdr *);
  747                         th = (struct tcphdr *)(ip6 + 1);
  748                         ip6->ip6_nxt = IPPROTO_TCP;
  749                         xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
  750                         ip6->ip6_nxt = IPPROTO_TCP;
  751                         break;
  752 #endif
  753 #if 0
  754                 default:
  755                         /* noone will visit here */
  756                         m_freem(m);
  757                         return EAFNOSUPPORT;
  758 #endif
  759                 }
  760                 xchg(th->th_dport, th->th_sport, u_int16_t);
  761 #undef xchg
  762                 tlen = 0;       /*be friendly with the following code*/
  763         }
  764         th->th_seq = htonl(seq);
  765         th->th_ack = htonl(ack);
  766         th->th_x2 = 0;
  767         if ((flags & TH_SYN) == 0) {
  768                 if (tp)
  769                         win >>= tp->rcv_scale;
  770                 if (win > TCP_MAXWIN)
  771                         win = TCP_MAXWIN;
  772                 th->th_win = htons((u_int16_t)win);
  773                 th->th_off = sizeof (struct tcphdr) >> 2;
  774                 tlen += sizeof(*th);
  775         } else
  776                 tlen += th->th_off << 2;
  777         m->m_len = hlen + tlen;
  778         m->m_pkthdr.len = hlen + tlen;
  779         m->m_pkthdr.rcvif = (struct ifnet *) 0;
  780         th->th_flags = flags;
  781         th->th_urp = 0;
  782 
  783         switch (family) {
  784 #ifdef INET
  785         case AF_INET:
  786             {
  787                 struct ipovly *ipov = (struct ipovly *)ip;
  788                 bzero(ipov->ih_x1, sizeof ipov->ih_x1);
  789                 ipov->ih_len = htons((u_int16_t)tlen);
  790 
  791                 th->th_sum = 0;
  792                 th->th_sum = in_cksum(m, hlen + tlen);
  793                 ip->ip_len = htons(hlen + tlen);
  794                 ip->ip_ttl = ip_defttl;
  795                 break;
  796             }
  797 #endif
  798 #ifdef INET6
  799         case AF_INET6:
  800             {
  801                 th->th_sum = 0;
  802                 th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
  803                                 tlen);
  804                 ip6->ip6_plen = htons(tlen);
  805                 if (tp && tp->t_in6pcb) {
  806                         struct ifnet *oifp;
  807                         ro = (struct route *)&tp->t_in6pcb->in6p_route;
  808                         oifp = ro->ro_rt ? ro->ro_rt->rt_ifp : NULL;
  809                         ip6->ip6_hlim = in6_selecthlim(tp->t_in6pcb, oifp);
  810                 } else
  811                         ip6->ip6_hlim = ip6_defhlim;
  812                 ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK;
  813                 if (ip6_auto_flowlabel) {
  814                         ip6->ip6_flow |=
  815                             (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
  816                 }
  817                 break;
  818             }
  819 #endif
  820         }
  821 
  822         if (tp && tp->t_inpcb)
  823                 so = tp->t_inpcb->inp_socket;
  824 #ifdef INET6
  825         else if (tp && tp->t_in6pcb)
  826                 so = tp->t_in6pcb->in6p_socket;
  827 #endif
  828         else
  829                 so = NULL;
  830 
  831         if (tp != NULL && tp->t_inpcb != NULL) {
  832                 ro = &tp->t_inpcb->inp_route;
  833 #ifdef DIAGNOSTIC
  834                 if (family != AF_INET)
  835                         panic("tcp_respond: address family mismatch");
  836                 if (!in_hosteq(ip->ip_dst, tp->t_inpcb->inp_faddr)) {
  837                         panic("tcp_respond: ip_dst %x != inp_faddr %x",
  838                             ntohl(ip->ip_dst.s_addr),
  839                             ntohl(tp->t_inpcb->inp_faddr.s_addr));
  840                 }
  841 #endif
  842         }
  843 #ifdef INET6
  844         else if (tp != NULL && tp->t_in6pcb != NULL) {
  845                 ro = (struct route *)&tp->t_in6pcb->in6p_route;
  846 #ifdef DIAGNOSTIC
  847                 if (family == AF_INET) {
  848                         if (!IN6_IS_ADDR_V4MAPPED(&tp->t_in6pcb->in6p_faddr))
  849                                 panic("tcp_respond: not mapped addr");
  850                         if (bcmp(&ip->ip_dst,
  851                             &tp->t_in6pcb->in6p_faddr.s6_addr32[3],
  852                             sizeof(ip->ip_dst)) != 0) {
  853                                 panic("tcp_respond: ip_dst != in6p_faddr");
  854                         }
  855                 } else if (family == AF_INET6) {
  856                         if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
  857                             &tp->t_in6pcb->in6p_faddr))
  858                                 panic("tcp_respond: ip6_dst != in6p_faddr");
  859                 } else
  860                         panic("tcp_respond: address family mismatch");
  861 #endif
  862         }
  863 #endif
  864         else
  865                 ro = NULL;
  866 
  867         switch (family) {
  868 #ifdef INET
  869         case AF_INET:
  870                 error = ip_output(m, NULL, ro,
  871                     (tp && tp->t_mtudisc ? IP_MTUDISC : 0),
  872                     (struct ip_moptions *)0, so);
  873                 break;
  874 #endif
  875 #ifdef INET6
  876         case AF_INET6:
  877                 error = ip6_output(m, NULL, (struct route_in6 *)ro, 0,
  878                     (struct ip6_moptions *)0, so, NULL);
  879                 break;
  880 #endif
  881         default:
  882                 error = EAFNOSUPPORT;
  883                 break;
  884         }
  885 
  886         return (error);
  887 }
  888 
  889 /*
  890  * Template TCPCB.  Rather than zeroing a new TCPCB and initializing
  891  * a bunch of members individually, we maintain this template for the
  892  * static and mostly-static components of the TCPCB, and copy it into
  893  * the new TCPCB instead.
  894  */
  895 static struct tcpcb tcpcb_template = {
  896         /*
  897          * If TCP_NTIMERS ever changes, we'll need to update this
  898          * initializer.
  899          */
  900         .t_timer = {
  901                 CALLOUT_INITIALIZER,
  902                 CALLOUT_INITIALIZER,
  903                 CALLOUT_INITIALIZER,
  904                 CALLOUT_INITIALIZER,
  905         },
  906         .t_delack_ch = CALLOUT_INITIALIZER,
  907 
  908         .t_srtt = TCPTV_SRTTBASE,
  909         .t_rttmin = TCPTV_MIN,
  910 
  911         .snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT,
  912         .snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT,
  913         .snd_numholes = 0,
  914 
  915         .t_partialacks = -1,
  916 };
  917 
  918 /*
  919  * Updates the TCPCB template whenever a parameter that would affect
  920  * the template is changed.
  921  */
  922 void
  923 tcp_tcpcb_template(void)
  924 {
  925         struct tcpcb *tp = &tcpcb_template;
  926         int flags;
  927 
  928         tp->t_peermss = tcp_mssdflt;
  929         tp->t_ourmss = tcp_mssdflt;
  930         tp->t_segsz = tcp_mssdflt;
  931 
  932         flags = 0;
  933         if (tcp_do_rfc1323 && tcp_do_win_scale)
  934                 flags |= TF_REQ_SCALE;
  935         if (tcp_do_rfc1323 && tcp_do_timestamps)
  936                 flags |= TF_REQ_TSTMP;
  937         tp->t_flags = flags;
  938 
  939         /*
  940          * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
  941          * rtt estimate.  Set rttvar so that srtt + 2 * rttvar gives
  942          * reasonable initial retransmit time.
  943          */
  944         tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1);
  945         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
  946             TCPTV_MIN, TCPTV_REXMTMAX);
  947 }
  948 
  949 /*
  950  * Create a new TCP control block, making an
  951  * empty reassembly queue and hooking it to the argument
  952  * protocol control block.
  953  */
  954 /* family selects inpcb, or in6pcb */
  955 struct tcpcb *
  956 tcp_newtcpcb(int family, void *aux)
  957 {
  958         struct tcpcb *tp;
  959         int i;
  960 
  961         /* XXX Consider using a pool_cache for speed. */
  962         tp = pool_get(&tcpcb_pool, PR_NOWAIT);
  963         if (tp == NULL)
  964                 return (NULL);
  965         memcpy(tp, &tcpcb_template, sizeof(*tp));
  966         TAILQ_INIT(&tp->segq);
  967         TAILQ_INIT(&tp->timeq);
  968         tp->t_family = family;          /* may be overridden later on */
  969         TAILQ_INIT(&tp->snd_holes);
  970         LIST_INIT(&tp->t_sc);           /* XXX can template this */
  971 
  972         /* Don't sweat this loop; hopefully the compiler will unroll it. */
  973         for (i = 0; i < TCPT_NTIMERS; i++)
  974                 TCP_TIMER_INIT(tp, i);
  975 
  976         switch (family) {
  977         case AF_INET:
  978             {
  979                 struct inpcb *inp = (struct inpcb *)aux;
  980 
  981                 inp->inp_ip.ip_ttl = ip_defttl;
  982                 inp->inp_ppcb = (caddr_t)tp;
  983 
  984                 tp->t_inpcb = inp;
  985                 tp->t_mtudisc = ip_mtudisc;
  986                 break;
  987             }
  988 #ifdef INET6
  989         case AF_INET6:
  990             {
  991                 struct in6pcb *in6p = (struct in6pcb *)aux;
  992 
  993                 in6p->in6p_ip6.ip6_hlim = in6_selecthlim(in6p,
  994                         in6p->in6p_route.ro_rt ? in6p->in6p_route.ro_rt->rt_ifp
  995                                                : NULL);
  996                 in6p->in6p_ppcb = (caddr_t)tp;
  997 
  998                 tp->t_in6pcb = in6p;
  999                 /* for IPv6, always try to run path MTU discovery */
 1000                 tp->t_mtudisc = 1;
 1001                 break;
 1002             }
 1003 #endif /* INET6 */
 1004         default:
 1005                 pool_put(&tcpcb_pool, tp);
 1006                 return (NULL);
 1007         }
 1008 
 1009         /*
 1010          * Initialize our timebase.  When we send timestamps, we take
 1011          * the delta from tcp_now -- this means each connection always
 1012          * gets a timebase of 0, which makes it, among other things,
 1013          * more difficult to determine how long a system has been up,
 1014          * and thus how many TCP sequence increments have occurred.
 1015          */
 1016         tp->ts_timebase = tcp_now;
 1017 
 1018         return (tp);
 1019 }
 1020 
 1021 /*
 1022  * Drop a TCP connection, reporting
 1023  * the specified error.  If connection is synchronized,
 1024  * then send a RST to peer.
 1025  */
 1026 struct tcpcb *
 1027 tcp_drop(struct tcpcb *tp, int errno)
 1028 {
 1029         struct socket *so = NULL;
 1030 
 1031 #ifdef DIAGNOSTIC
 1032         if (tp->t_inpcb && tp->t_in6pcb)
 1033                 panic("tcp_drop: both t_inpcb and t_in6pcb are set");
 1034 #endif
 1035 #ifdef INET
 1036         if (tp->t_inpcb)
 1037                 so = tp->t_inpcb->inp_socket;
 1038 #endif
 1039 #ifdef INET6
 1040         if (tp->t_in6pcb)
 1041                 so = tp->t_in6pcb->in6p_socket;
 1042 #endif
 1043         if (!so)
 1044                 return NULL;
 1045 
 1046         if (TCPS_HAVERCVDSYN(tp->t_state)) {
 1047                 tp->t_state = TCPS_CLOSED;
 1048                 (void) tcp_output(tp);
 1049                 tcpstat.tcps_drops++;
 1050         } else
 1051                 tcpstat.tcps_conndrops++;
 1052         if (errno == ETIMEDOUT && tp->t_softerror)
 1053                 errno = tp->t_softerror;
 1054         so->so_error = errno;
 1055         return (tcp_close(tp));
 1056 }
 1057 
 1058 /*
 1059  * Return whether this tcpcb is marked as dead, indicating
 1060  * to the calling timer function that no further action should
 1061  * be taken, as we are about to release this tcpcb.  The release
 1062  * of the storage will be done if this is the last timer running.
 1063  *
 1064  * This should be called from the callout handler function after
 1065  * callout_ack() is done, so that the number of invoking timer
 1066  * functions is 0.
 1067  */
 1068 int
 1069 tcp_isdead(struct tcpcb *tp)
 1070 {
 1071         int dead = (tp->t_flags & TF_DEAD);
 1072 
 1073         if (__predict_false(dead)) {
 1074                 if (tcp_timers_invoking(tp) > 0)
 1075                                 /* not quite there yet -- count separately? */
 1076                         return dead;
 1077                 tcpstat.tcps_delayed_free++;
 1078                 pool_put(&tcpcb_pool, tp);
 1079         }
 1080         return dead;
 1081 }
 1082 
 1083 /*
 1084  * Close a TCP control block:
 1085  *      discard all space held by the tcp
 1086  *      discard internet protocol block
 1087  *      wake up any sleepers
 1088  */
 1089 struct tcpcb *
 1090 tcp_close(struct tcpcb *tp)
 1091 {
 1092         struct inpcb *inp;
 1093 #ifdef INET6
 1094         struct in6pcb *in6p;
 1095 #endif
 1096         struct socket *so;
 1097 #ifdef RTV_RTT
 1098         struct rtentry *rt;
 1099 #endif
 1100         struct route *ro;
 1101 
 1102         inp = tp->t_inpcb;
 1103 #ifdef INET6
 1104         in6p = tp->t_in6pcb;
 1105 #endif
 1106         so = NULL;
 1107         ro = NULL;
 1108         if (inp) {
 1109                 so = inp->inp_socket;
 1110                 ro = &inp->inp_route;
 1111         }
 1112 #ifdef INET6
 1113         else if (in6p) {
 1114                 so = in6p->in6p_socket;
 1115                 ro = (struct route *)&in6p->in6p_route;
 1116         }
 1117 #endif
 1118 
 1119 #ifdef RTV_RTT
 1120         /*
 1121          * If we sent enough data to get some meaningful characteristics,
 1122          * save them in the routing entry.  'Enough' is arbitrarily
 1123          * defined as the sendpipesize (default 4K) * 16.  This would
 1124          * give us 16 rtt samples assuming we only get one sample per
 1125          * window (the usual case on a long haul net).  16 samples is
 1126          * enough for the srtt filter to converge to within 5% of the correct
 1127          * value; fewer samples and we could save a very bogus rtt.
 1128          *
 1129          * Don't update the default route's characteristics and don't
 1130          * update anything that the user "locked".
 1131          */
 1132         if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
 1133             ro && (rt = ro->ro_rt) &&
 1134             !in_nullhost(satosin(rt_key(rt))->sin_addr)) {
 1135                 u_long i = 0;
 1136 
 1137                 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
 1138                         i = tp->t_srtt *
 1139                             ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
 1140                         if (rt->rt_rmx.rmx_rtt && i)
 1141                                 /*
 1142                                  * filter this update to half the old & half
 1143                                  * the new values, converting scale.
 1144                                  * See route.h and tcp_var.h for a
 1145                                  * description of the scaling constants.
 1146                                  */
 1147                                 rt->rt_rmx.rmx_rtt =
 1148                                     (rt->rt_rmx.rmx_rtt + i) / 2;
 1149                         else
 1150                                 rt->rt_rmx.rmx_rtt = i;
 1151                 }
 1152                 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
 1153                         i = tp->t_rttvar *
 1154                             ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2));
 1155                         if (rt->rt_rmx.rmx_rttvar && i)
 1156                                 rt->rt_rmx.rmx_rttvar =
 1157                                     (rt->rt_rmx.rmx_rttvar + i) / 2;
 1158                         else
 1159                                 rt->rt_rmx.rmx_rttvar = i;
 1160                 }
 1161                 /*
 1162                  * update the pipelimit (ssthresh) if it has been updated
 1163                  * already or if a pipesize was specified & the threshhold
 1164                  * got below half the pipesize.  I.e., wait for bad news
 1165                  * before we start updating, then update on both good
 1166                  * and bad news.
 1167                  */
 1168                 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
 1169                     (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) ||
 1170                     i < (rt->rt_rmx.rmx_sendpipe / 2)) {
 1171                         /*
 1172                          * convert the limit from user data bytes to
 1173                          * packets then to packet data bytes.
 1174                          */
 1175                         i = (i + tp->t_segsz / 2) / tp->t_segsz;
 1176                         if (i < 2)
 1177                                 i = 2;
 1178                         i *= (u_long)(tp->t_segsz + sizeof (struct tcpiphdr));
 1179                         if (rt->rt_rmx.rmx_ssthresh)
 1180                                 rt->rt_rmx.rmx_ssthresh =
 1181                                     (rt->rt_rmx.rmx_ssthresh + i) / 2;
 1182                         else
 1183                                 rt->rt_rmx.rmx_ssthresh = i;
 1184                 }
 1185         }
 1186 #endif /* RTV_RTT */
 1187         /* free the reassembly queue, if any */
 1188         TCP_REASS_LOCK(tp);
 1189         (void) tcp_freeq(tp);
 1190         TCP_REASS_UNLOCK(tp);
 1191 
 1192         /* free the SACK holes list. */
 1193         tcp_free_sackholes(tp);
 1194 
 1195         tcp_canceltimers(tp);
 1196         TCP_CLEAR_DELACK(tp);
 1197         syn_cache_cleanup(tp);
 1198 
 1199         if (tp->t_template) {
 1200                 m_free(tp->t_template);
 1201                 tp->t_template = NULL;
 1202         }
 1203         if (tcp_timers_invoking(tp))
 1204                 tp->t_flags |= TF_DEAD;
 1205         else
 1206                 pool_put(&tcpcb_pool, tp);
 1207 
 1208         if (inp) {
 1209                 inp->inp_ppcb = 0;
 1210                 soisdisconnected(so);
 1211                 in_pcbdetach(inp);
 1212         }
 1213 #ifdef INET6
 1214         else if (in6p) {
 1215                 in6p->in6p_ppcb = 0;
 1216                 soisdisconnected(so);
 1217                 in6_pcbdetach(in6p);
 1218         }
 1219 #endif
 1220         tcpstat.tcps_closed++;
 1221         return ((struct tcpcb *)0);
 1222 }
 1223 
 1224 int
 1225 tcp_freeq(tp)
 1226         struct tcpcb *tp;
 1227 {
 1228         struct ipqent *qe;
 1229         int rv = 0;
 1230 #ifdef TCPREASS_DEBUG
 1231         int i = 0;
 1232 #endif
 1233 
 1234         TCP_REASS_LOCK_CHECK(tp);
 1235 
 1236         while ((qe = TAILQ_FIRST(&tp->segq)) != NULL) {
 1237 #ifdef TCPREASS_DEBUG
 1238                 printf("tcp_freeq[%p,%d]: %u:%u(%u) 0x%02x\n",
 1239                         tp, i++, qe->ipqe_seq, qe->ipqe_seq + qe->ipqe_len,
 1240                         qe->ipqe_len, qe->ipqe_flags & (TH_SYN|TH_FIN|TH_RST));
 1241 #endif
 1242                 TAILQ_REMOVE(&tp->segq, qe, ipqe_q);
 1243                 TAILQ_REMOVE(&tp->timeq, qe, ipqe_timeq);
 1244                 m_freem(qe->ipqe_m);
 1245                 tcpipqent_free(qe);
 1246                 rv = 1;
 1247         }
 1248         tp->t_segqlen = 0;
 1249         KASSERT(TAILQ_EMPTY(&tp->timeq));
 1250         return (rv);
 1251 }
 1252 
 1253 /*
 1254  * Protocol drain routine.  Called when memory is in short supply.
 1255  */
 1256 void
 1257 tcp_drain(void)
 1258 {
 1259         struct inpcb_hdr *inph;
 1260         struct tcpcb *tp;
 1261 
 1262         /*
 1263          * Free the sequence queue of all TCP connections.
 1264          */
 1265         CIRCLEQ_FOREACH(inph, &tcbtable.inpt_queue, inph_queue) {
 1266                 switch (inph->inph_af) {
 1267                 case AF_INET:
 1268                         tp = intotcpcb((struct inpcb *)inph);
 1269                         break;
 1270 #ifdef INET6
 1271                 case AF_INET6:
 1272                         tp = in6totcpcb((struct in6pcb *)inph);
 1273                         break;
 1274 #endif
 1275                 default:
 1276                         tp = NULL;
 1277                         break;
 1278                 }
 1279                 if (tp != NULL) {
 1280                         /*
 1281                          * We may be called from a device's interrupt
 1282                          * context.  If the tcpcb is already busy,
 1283                          * just bail out now.
 1284                          */
 1285                         if (tcp_reass_lock_try(tp) == 0)
 1286                                 continue;
 1287                         if (tcp_freeq(tp))
 1288                                 tcpstat.tcps_connsdrained++;
 1289                         TCP_REASS_UNLOCK(tp);
 1290                 }
 1291         }
 1292 }
 1293 
 1294 /*
 1295  * Notify a tcp user of an asynchronous error;
 1296  * store error as soft error, but wake up user
 1297  * (for now, won't do anything until can select for soft error).
 1298  */
 1299 void
 1300 tcp_notify(struct inpcb *inp, int error)
 1301 {
 1302         struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
 1303         struct socket *so = inp->inp_socket;
 1304 
 1305         /*
 1306          * Ignore some errors if we are hooked up.
 1307          * If connection hasn't completed, has retransmitted several times,
 1308          * and receives a second error, give up now.  This is better
 1309          * than waiting a long time to establish a connection that
 1310          * can never complete.
 1311          */
 1312         if (tp->t_state == TCPS_ESTABLISHED &&
 1313              (error == EHOSTUNREACH || error == ENETUNREACH ||
 1314               error == EHOSTDOWN)) {
 1315                 return;
 1316         } else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
 1317             tp->t_rxtshift > 3 && tp->t_softerror)
 1318                 so->so_error = error;
 1319         else
 1320                 tp->t_softerror = error;
 1321         wakeup((caddr_t) &so->so_timeo);
 1322         sorwakeup(so);
 1323         sowwakeup(so);
 1324 }
 1325 
 1326 #ifdef INET6
 1327 void
 1328 tcp6_notify(struct in6pcb *in6p, int error)
 1329 {
 1330         struct tcpcb *tp = (struct tcpcb *)in6p->in6p_ppcb;
 1331         struct socket *so = in6p->in6p_socket;
 1332 
 1333         /*
 1334          * Ignore some errors if we are hooked up.
 1335          * If connection hasn't completed, has retransmitted several times,
 1336          * and receives a second error, give up now.  This is better
 1337          * than waiting a long time to establish a connection that
 1338          * can never complete.
 1339          */
 1340         if (tp->t_state == TCPS_ESTABLISHED &&
 1341              (error == EHOSTUNREACH || error == ENETUNREACH ||
 1342               error == EHOSTDOWN)) {
 1343                 return;
 1344         } else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
 1345             tp->t_rxtshift > 3 && tp->t_softerror)
 1346                 so->so_error = error;
 1347         else
 1348                 tp->t_softerror = error;
 1349         wakeup((caddr_t) &so->so_timeo);
 1350         sorwakeup(so);
 1351         sowwakeup(so);
 1352 }
 1353 #endif
 1354 
 1355 #ifdef INET6
 1356 void
 1357 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 1358 {
 1359         struct tcphdr th;
 1360         void (*notify)(struct in6pcb *, int) = tcp6_notify;
 1361         int nmatch;
 1362         struct ip6_hdr *ip6;
 1363         const struct sockaddr_in6 *sa6_src = NULL;
 1364         struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)sa;
 1365         struct mbuf *m;
 1366         int off;
 1367 
 1368         if (sa->sa_family != AF_INET6 ||
 1369             sa->sa_len != sizeof(struct sockaddr_in6))
 1370                 return;
 1371         if ((unsigned)cmd >= PRC_NCMDS)
 1372                 return;
 1373         else if (cmd == PRC_QUENCH) {
 1374                 /* XXX there's no PRC_QUENCH in IPv6 */
 1375                 notify = tcp6_quench;
 1376         } else if (PRC_IS_REDIRECT(cmd))
 1377                 notify = in6_rtchange, d = NULL;
 1378         else if (cmd == PRC_MSGSIZE)
 1379                 ; /* special code is present, see below */
 1380         else if (cmd == PRC_HOSTDEAD)
 1381                 d = NULL;
 1382         else if (inet6ctlerrmap[cmd] == 0)
 1383                 return;
 1384 
 1385         /* if the parameter is from icmp6, decode it. */
 1386         if (d != NULL) {
 1387                 struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d;
 1388                 m = ip6cp->ip6c_m;
 1389                 ip6 = ip6cp->ip6c_ip6;
 1390                 off = ip6cp->ip6c_off;
 1391                 sa6_src = ip6cp->ip6c_src;
 1392         } else {
 1393                 m = NULL;
 1394                 ip6 = NULL;
 1395                 sa6_src = &sa6_any;
 1396                 off = 0;
 1397         }
 1398 
 1399         if (ip6) {
 1400                 /*
 1401                  * XXX: We assume that when ip6 is non NULL,
 1402                  * M and OFF are valid.
 1403                  */
 1404 
 1405                 /* check if we can safely examine src and dst ports */
 1406                 if (m->m_pkthdr.len < off + sizeof(th)) {
 1407                         if (cmd == PRC_MSGSIZE)
 1408                                 icmp6_mtudisc_update((struct ip6ctlparam *)d, 0);
 1409                         return;
 1410                 }
 1411 
 1412                 bzero(&th, sizeof(th));
 1413                 m_copydata(m, off, sizeof(th), (caddr_t)&th);
 1414 
 1415                 if (cmd == PRC_MSGSIZE) {
 1416                         int valid = 0;
 1417 
 1418                         /*
 1419                          * Check to see if we have a valid TCP connection
 1420                          * corresponding to the address in the ICMPv6 message
 1421                          * payload.
 1422                          */
 1423                         if (in6_pcblookup_connect(&tcbtable, &sa6->sin6_addr,
 1424                             th.th_dport, (struct in6_addr *)&sa6_src->sin6_addr,
 1425                             th.th_sport, 0))
 1426                                 valid++;
 1427 
 1428                         /*
 1429                          * Depending on the value of "valid" and routing table
 1430                          * size (mtudisc_{hi,lo}wat), we will:
 1431                          * - recalcurate the new MTU and create the
 1432                          *   corresponding routing entry, or
 1433                          * - ignore the MTU change notification.
 1434                          */
 1435                         icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
 1436 
 1437                         /*
 1438                          * no need to call in6_pcbnotify, it should have been
 1439                          * called via callback if necessary
 1440                          */
 1441                         return;
 1442                 }
 1443 
 1444                 nmatch = in6_pcbnotify(&tcbtable, sa, th.th_dport,
 1445                     (struct sockaddr *)sa6_src, th.th_sport, cmd, NULL, notify);
 1446                 if (nmatch == 0 && syn_cache_count &&
 1447                     (inet6ctlerrmap[cmd] == EHOSTUNREACH ||
 1448                      inet6ctlerrmap[cmd] == ENETUNREACH ||
 1449                      inet6ctlerrmap[cmd] == EHOSTDOWN))
 1450                         syn_cache_unreach((struct sockaddr *)sa6_src,
 1451                                           sa, &th);
 1452         } else {
 1453                 (void) in6_pcbnotify(&tcbtable, sa, 0,
 1454                     (struct sockaddr *)sa6_src, 0, cmd, NULL, notify);
 1455         }
 1456 }
 1457 #endif
 1458 
 1459 #ifdef INET
 1460 /* assumes that ip header and tcp header are contiguous on mbuf */
 1461 void *
 1462 tcp_ctlinput(int cmd, struct sockaddr *sa, void *v)
 1463 {
 1464         struct ip *ip = v;
 1465         struct tcphdr *th;
 1466         struct icmp *icp;
 1467         extern const int inetctlerrmap[];
 1468         void (*notify)(struct inpcb *, int) = tcp_notify;
 1469         int errno;
 1470         int nmatch;
 1471 #ifdef INET6
 1472         struct in6_addr src6, dst6;
 1473 #endif
 1474 
 1475         if (sa->sa_family != AF_INET ||
 1476             sa->sa_len != sizeof(struct sockaddr_in))
 1477                 return NULL;
 1478         if ((unsigned)cmd >= PRC_NCMDS)
 1479                 return NULL;
 1480         errno = inetctlerrmap[cmd];
 1481         if (cmd == PRC_QUENCH)
 1482                 notify = tcp_quench;
 1483         else if (PRC_IS_REDIRECT(cmd))
 1484                 notify = in_rtchange, ip = 0;
 1485         else if (cmd == PRC_MSGSIZE && ip && ip->ip_v == 4) {
 1486                 /*
 1487                  * Check to see if we have a valid TCP connection
 1488                  * corresponding to the address in the ICMP message
 1489                  * payload.
 1490                  *
 1491                  * Boundary check is made in icmp_input(), with ICMP_ADVLENMIN.
 1492                  */
 1493                 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 1494 #ifdef INET6
 1495                 memset(&src6, 0, sizeof(src6));
 1496                 memset(&dst6, 0, sizeof(dst6));
 1497                 src6.s6_addr16[5] = dst6.s6_addr16[5] = 0xffff;
 1498                 memcpy(&src6.s6_addr32[3], &ip->ip_src, sizeof(struct in_addr));
 1499                 memcpy(&dst6.s6_addr32[3], &ip->ip_dst, sizeof(struct in_addr));
 1500 #endif
 1501                 if (in_pcblookup_connect(&tcbtable, ip->ip_dst, th->th_dport,
 1502                     ip->ip_src, th->th_sport) != NULL)
 1503                         ;
 1504 #ifdef INET6
 1505                 else if (in6_pcblookup_connect(&tcbtable, &dst6,
 1506                     th->th_dport, &src6, th->th_sport, 0) != NULL)
 1507                         ;
 1508 #endif
 1509                 else
 1510                         return NULL;
 1511 
 1512                 /*
 1513                  * Now that we've validated that we are actually communicating
 1514                  * with the host indicated in the ICMP message, locate the
 1515                  * ICMP header, recalculate the new MTU, and create the
 1516                  * corresponding routing entry.
 1517                  */
 1518                 icp = (struct icmp *)((caddr_t)ip -
 1519                     offsetof(struct icmp, icmp_ip));
 1520                 icmp_mtudisc(icp, ip->ip_dst);
 1521 
 1522                 return NULL;
 1523         } else if (cmd == PRC_HOSTDEAD)
 1524                 ip = 0;
 1525         else if (errno == 0)
 1526                 return NULL;
 1527         if (ip && ip->ip_v == 4 && sa->sa_family == AF_INET) {
 1528                 th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 1529                 nmatch = in_pcbnotify(&tcbtable, satosin(sa)->sin_addr,
 1530                     th->th_dport, ip->ip_src, th->th_sport, errno, notify);
 1531                 if (nmatch == 0 && syn_cache_count &&
 1532                     (inetctlerrmap[cmd] == EHOSTUNREACH ||
 1533                     inetctlerrmap[cmd] == ENETUNREACH ||
 1534                     inetctlerrmap[cmd] == EHOSTDOWN)) {
 1535                         struct sockaddr_in sin;
 1536                         bzero(&sin, sizeof(sin));
 1537                         sin.sin_len = sizeof(sin);
 1538                         sin.sin_family = AF_INET;
 1539                         sin.sin_port = th->th_sport;
 1540                         sin.sin_addr = ip->ip_src;
 1541                         syn_cache_unreach((struct sockaddr *)&sin, sa, th);
 1542                 }
 1543 
 1544                 /* XXX mapped address case */
 1545         } else
 1546                 in_pcbnotifyall(&tcbtable, satosin(sa)->sin_addr, errno,
 1547                     notify);
 1548         return NULL;
 1549 }
 1550 
 1551 /*
 1552  * When a source quench is received, we are being notified of congestion.
 1553  * Close the congestion window down to the Loss Window (one segment).
 1554  * We will gradually open it again as we proceed.
 1555  */
 1556 void
 1557 tcp_quench(struct inpcb *inp, int errno)
 1558 {
 1559         struct tcpcb *tp = intotcpcb(inp);
 1560 
 1561         if (tp)
 1562                 tp->snd_cwnd = tp->t_segsz;
 1563 }
 1564 #endif
 1565 
 1566 #ifdef INET6
 1567 void
 1568 tcp6_quench(struct in6pcb *in6p, int errno)
 1569 {
 1570         struct tcpcb *tp = in6totcpcb(in6p);
 1571 
 1572         if (tp)
 1573                 tp->snd_cwnd = tp->t_segsz;
 1574 }
 1575 #endif
 1576 
 1577 #ifdef INET
 1578 /*
 1579  * Path MTU Discovery handlers.
 1580  */
 1581 void
 1582 tcp_mtudisc_callback(struct in_addr faddr)
 1583 {
 1584 #ifdef INET6
 1585         struct in6_addr in6;
 1586 #endif
 1587 
 1588         in_pcbnotifyall(&tcbtable, faddr, EMSGSIZE, tcp_mtudisc);
 1589 #ifdef INET6
 1590         memset(&in6, 0, sizeof(in6));
 1591         in6.s6_addr16[5] = 0xffff;
 1592         memcpy(&in6.s6_addr32[3], &faddr, sizeof(struct in_addr));
 1593         tcp6_mtudisc_callback(&in6);
 1594 #endif
 1595 }
 1596 
 1597 /*
 1598  * On receipt of path MTU corrections, flush old route and replace it
 1599  * with the new one.  Retransmit all unacknowledged packets, to ensure
 1600  * that all packets will be received.
 1601  */
 1602 void
 1603 tcp_mtudisc(struct inpcb *inp, int errno)
 1604 {
 1605         struct tcpcb *tp = intotcpcb(inp);
 1606         struct rtentry *rt = in_pcbrtentry(inp);
 1607 
 1608         if (tp != 0) {
 1609                 if (rt != 0) {
 1610                         /*
 1611                          * If this was not a host route, remove and realloc.
 1612                          */
 1613                         if ((rt->rt_flags & RTF_HOST) == 0) {
 1614                                 in_rtchange(inp, errno);
 1615                                 if ((rt = in_pcbrtentry(inp)) == 0)
 1616                                         return;
 1617                         }
 1618 
 1619                         /*
 1620                          * Slow start out of the error condition.  We
 1621                          * use the MTU because we know it's smaller
 1622                          * than the previously transmitted segment.
 1623                          *
 1624                          * Note: This is more conservative than the
 1625                          * suggestion in draft-floyd-incr-init-win-03.
 1626                          */
 1627                         if (rt->rt_rmx.rmx_mtu != 0)
 1628                                 tp->snd_cwnd =
 1629                                     TCP_INITIAL_WINDOW(tcp_init_win,
 1630                                     rt->rt_rmx.rmx_mtu);
 1631                 }
 1632 
 1633                 /*
 1634                  * Resend unacknowledged packets.
 1635                  */
 1636                 tp->snd_nxt = tp->snd_una;
 1637                 tcp_output(tp);
 1638         }
 1639 }
 1640 #endif
 1641 
 1642 #ifdef INET6
 1643 /*
 1644  * Path MTU Discovery handlers.
 1645  */
 1646 void
 1647 tcp6_mtudisc_callback(struct in6_addr *faddr)
 1648 {
 1649         struct sockaddr_in6 sin6;
 1650 
 1651         bzero(&sin6, sizeof(sin6));
 1652         sin6.sin6_family = AF_INET6;
 1653         sin6.sin6_len = sizeof(struct sockaddr_in6);
 1654         sin6.sin6_addr = *faddr;
 1655         (void) in6_pcbnotify(&tcbtable, (struct sockaddr *)&sin6, 0,
 1656             (struct sockaddr *)&sa6_any, 0, PRC_MSGSIZE, NULL, tcp6_mtudisc);
 1657 }
 1658 
 1659 void
 1660 tcp6_mtudisc(struct in6pcb *in6p, int errno)
 1661 {
 1662         struct tcpcb *tp = in6totcpcb(in6p);
 1663         struct rtentry *rt = in6_pcbrtentry(in6p);
 1664 
 1665         if (tp != 0) {
 1666                 if (rt != 0) {
 1667                         /*
 1668                          * If this was not a host route, remove and realloc.
 1669                          */
 1670                         if ((rt->rt_flags & RTF_HOST) == 0) {
 1671                                 in6_rtchange(in6p, errno);
 1672                                 if ((rt = in6_pcbrtentry(in6p)) == 0)
 1673                                         return;
 1674                         }
 1675 
 1676                         /*
 1677                          * Slow start out of the error condition.  We
 1678                          * use the MTU because we know it's smaller
 1679                          * than the previously transmitted segment.
 1680                          *
 1681                          * Note: This is more conservative than the
 1682                          * suggestion in draft-floyd-incr-init-win-03.
 1683                          */
 1684                         if (rt->rt_rmx.rmx_mtu != 0)
 1685                                 tp->snd_cwnd =
 1686                                     TCP_INITIAL_WINDOW(tcp_init_win,
 1687                                     rt->rt_rmx.rmx_mtu);
 1688                 }
 1689 
 1690                 /*
 1691                  * Resend unacknowledged packets.
 1692                  */
 1693                 tp->snd_nxt = tp->snd_una;
 1694                 tcp_output(tp);
 1695         }
 1696 }
 1697 #endif /* INET6 */
 1698 
 1699 /*
 1700  * Compute the MSS to advertise to the peer.  Called only during
 1701  * the 3-way handshake.  If we are the server (peer initiated
 1702  * connection), we are called with a pointer to the interface
 1703  * on which the SYN packet arrived.  If we are the client (we
 1704  * initiated connection), we are called with a pointer to the
 1705  * interface out which this connection should go.
 1706  *
 1707  * NOTE: Do not subtract IP option/extension header size nor IPsec
 1708  * header size from MSS advertisement.  MSS option must hold the maximum
 1709  * segment size we can accept, so it must always be:
 1710  *       max(if mtu) - ip header - tcp header
 1711  */
 1712 u_long
 1713 tcp_mss_to_advertise(const struct ifnet *ifp, int af)
 1714 {
 1715         extern u_long in_maxmtu;
 1716         u_long mss = 0;
 1717         u_long hdrsiz;
 1718 
 1719         /*
 1720          * In order to avoid defeating path MTU discovery on the peer,
 1721          * we advertise the max MTU of all attached networks as our MSS,
 1722          * per RFC 1191, section 3.1.
 1723          *
 1724          * We provide the option to advertise just the MTU of
 1725          * the interface on which we hope this connection will
 1726          * be receiving.  If we are responding to a SYN, we
 1727          * will have a pretty good idea about this, but when
 1728          * initiating a connection there is a bit more doubt.
 1729          *
 1730          * We also need to ensure that loopback has a large enough
 1731          * MSS, as the loopback MTU is never included in in_maxmtu.
 1732          */
 1733 
 1734         if (ifp != NULL)
 1735                 switch (af) {
 1736                 case AF_INET:
 1737                         mss = ifp->if_mtu;
 1738                         break;
 1739 #ifdef INET6
 1740                 case AF_INET6:
 1741                         mss = IN6_LINKMTU(ifp);
 1742                         break;
 1743 #endif
 1744                 }
 1745 
 1746         if (tcp_mss_ifmtu == 0)
 1747                 switch (af) {
 1748                 case AF_INET:
 1749                         mss = max(in_maxmtu, mss);
 1750                         break;
 1751 #ifdef INET6
 1752                 case AF_INET6:
 1753                         mss = max(in6_maxmtu, mss);
 1754                         break;
 1755 #endif
 1756                 }
 1757 
 1758         switch (af) {
 1759         case AF_INET:
 1760                 hdrsiz = sizeof(struct ip);
 1761                 break;
 1762 #ifdef INET6
 1763         case AF_INET6:
 1764                 hdrsiz = sizeof(struct ip6_hdr);
 1765                 break;
 1766 #endif
 1767         default:
 1768                 hdrsiz = 0;
 1769                 break;
 1770         }
 1771         hdrsiz += sizeof(struct tcphdr);
 1772         if (mss > hdrsiz)
 1773                 mss -= hdrsiz;
 1774 
 1775         mss = max(tcp_mssdflt, mss);
 1776         return (mss);
 1777 }
 1778 
 1779 /*
 1780  * Set connection variables based on the peer's advertised MSS.
 1781  * We are passed the TCPCB for the actual connection.  If we
 1782  * are the server, we are called by the compressed state engine
 1783  * when the 3-way handshake is complete.  If we are the client,
 1784  * we are called when we receive the SYN,ACK from the server.
 1785  *
 1786  * NOTE: Our advertised MSS value must be initialized in the TCPCB
 1787  * before this routine is called!
 1788  */
 1789 void
 1790 tcp_mss_from_peer(struct tcpcb *tp, int offer)
 1791 {
 1792         struct socket *so;
 1793 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
 1794         struct rtentry *rt;
 1795 #endif
 1796         u_long bufsize;
 1797         int mss;
 1798 
 1799 #ifdef DIAGNOSTIC
 1800         if (tp->t_inpcb && tp->t_in6pcb)
 1801                 panic("tcp_mss_from_peer: both t_inpcb and t_in6pcb are set");
 1802 #endif
 1803         so = NULL;
 1804         rt = NULL;
 1805 #ifdef INET
 1806         if (tp->t_inpcb) {
 1807                 so = tp->t_inpcb->inp_socket;
 1808 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
 1809                 rt = in_pcbrtentry(tp->t_inpcb);
 1810 #endif
 1811         }
 1812 #endif
 1813 #ifdef INET6
 1814         if (tp->t_in6pcb) {
 1815                 so = tp->t_in6pcb->in6p_socket;
 1816 #if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
 1817                 rt = in6_pcbrtentry(tp->t_in6pcb);
 1818 #endif
 1819         }
 1820 #endif
 1821 
 1822         /*
 1823          * As per RFC1122, use the default MSS value, unless they
 1824          * sent us an offer.  Do not accept offers less than 256 bytes.
 1825          */
 1826         mss = tcp_mssdflt;
 1827         if (offer)
 1828                 mss = offer;
 1829         mss = max(mss, 256);            /* sanity */
 1830         tp->t_peermss = mss;
 1831         mss -= tcp_optlen(tp);
 1832 #ifdef INET
 1833         if (tp->t_inpcb)
 1834                 mss -= ip_optlen(tp->t_inpcb);
 1835 #endif
 1836 #ifdef INET6
 1837         if (tp->t_in6pcb)
 1838                 mss -= ip6_optlen(tp->t_in6pcb);
 1839 #endif
 1840 
 1841         /*
 1842          * If there's a pipesize, change the socket buffer to that size.
 1843          * Make the socket buffer an integral number of MSS units.  If
 1844          * the MSS is larger than the socket buffer, artificially decrease
 1845          * the MSS.
 1846          */
 1847 #ifdef RTV_SPIPE
 1848         if (rt != NULL && rt->rt_rmx.rmx_sendpipe != 0)
 1849                 bufsize = rt->rt_rmx.rmx_sendpipe;
 1850         else
 1851 #endif
 1852                 bufsize = so->so_snd.sb_hiwat;
 1853         if (bufsize < mss)
 1854                 mss = bufsize;
 1855         else {
 1856                 bufsize = roundup(bufsize, mss);
 1857                 if (bufsize > sb_max)
 1858                         bufsize = sb_max;
 1859                 (void) sbreserve(&so->so_snd, bufsize, so);
 1860         }
 1861         tp->t_segsz = mss;
 1862 
 1863 #ifdef RTV_SSTHRESH
 1864         if (rt != NULL && rt->rt_rmx.rmx_ssthresh) {
 1865                 /*
 1866                  * There's some sort of gateway or interface buffer
 1867                  * limit on the path.  Use this to set the slow
 1868                  * start threshold, but set the threshold to no less
 1869                  * than 2 * MSS.
 1870                  */
 1871                 tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
 1872         }
 1873 #endif
 1874 }
 1875 
 1876 /*
 1877  * Processing necessary when a TCP connection is established.
 1878  */
 1879 void
 1880 tcp_established(struct tcpcb *tp)
 1881 {
 1882         struct socket *so;
 1883 #ifdef RTV_RPIPE
 1884         struct rtentry *rt;
 1885 #endif
 1886         u_long bufsize;
 1887 
 1888 #ifdef DIAGNOSTIC
 1889         if (tp->t_inpcb && tp->t_in6pcb)
 1890                 panic("tcp_established: both t_inpcb and t_in6pcb are set");
 1891 #endif
 1892         so = NULL;
 1893         rt = NULL;
 1894 #ifdef INET
 1895         if (tp->t_inpcb) {
 1896                 so = tp->t_inpcb->inp_socket;
 1897 #if defined(RTV_RPIPE)
 1898                 rt = in_pcbrtentry(tp->t_inpcb);
 1899 #endif
 1900         }
 1901 #endif
 1902 #ifdef INET6
 1903         if (tp->t_in6pcb) {
 1904                 so = tp->t_in6pcb->in6p_socket;
 1905 #if defined(RTV_RPIPE)
 1906                 rt = in6_pcbrtentry(tp->t_in6pcb);
 1907 #endif
 1908         }
 1909 #endif
 1910 
 1911         tp->t_state = TCPS_ESTABLISHED;
 1912         TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
 1913 
 1914 #ifdef RTV_RPIPE
 1915         if (rt != NULL && rt->rt_rmx.rmx_recvpipe != 0)
 1916                 bufsize = rt->rt_rmx.rmx_recvpipe;
 1917         else
 1918 #endif
 1919                 bufsize = so->so_rcv.sb_hiwat;
 1920         if (bufsize > tp->t_ourmss) {
 1921                 bufsize = roundup(bufsize, tp->t_ourmss);
 1922                 if (bufsize > sb_max)
 1923                         bufsize = sb_max;
 1924                 (void) sbreserve(&so->so_rcv, bufsize, so);
 1925         }
 1926 }
 1927 
 1928 /*
 1929  * Check if there's an initial rtt or rttvar.  Convert from the
 1930  * route-table units to scaled multiples of the slow timeout timer.
 1931  * Called only during the 3-way handshake.
 1932  */
 1933 void
 1934 tcp_rmx_rtt(struct tcpcb *tp)
 1935 {
 1936 #ifdef RTV_RTT
 1937         struct rtentry *rt = NULL;
 1938         int rtt;
 1939 
 1940 #ifdef DIAGNOSTIC
 1941         if (tp->t_inpcb && tp->t_in6pcb)
 1942                 panic("tcp_rmx_rtt: both t_inpcb and t_in6pcb are set");
 1943 #endif
 1944 #ifdef INET
 1945         if (tp->t_inpcb)
 1946                 rt = in_pcbrtentry(tp->t_inpcb);
 1947 #endif
 1948 #ifdef INET6
 1949         if (tp->t_in6pcb)
 1950                 rt = in6_pcbrtentry(tp->t_in6pcb);
 1951 #endif
 1952         if (rt == NULL)
 1953                 return;
 1954 
 1955         if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
 1956                 /*
 1957                  * XXX The lock bit for MTU indicates that the value
 1958                  * is also a minimum value; this is subject to time.
 1959                  */
 1960                 if (rt->rt_rmx.rmx_locks & RTV_RTT)
 1961                         TCPT_RANGESET(tp->t_rttmin,
 1962                             rtt / (RTM_RTTUNIT / PR_SLOWHZ),
 1963                             TCPTV_MIN, TCPTV_REXMTMAX);
 1964                 tp->t_srtt = rtt /
 1965                     ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
 1966                 if (rt->rt_rmx.rmx_rttvar) {
 1967                         tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
 1968                             ((RTM_RTTUNIT / PR_SLOWHZ) >>
 1969                                 (TCP_RTTVAR_SHIFT + 2));
 1970                 } else {
 1971                         /* Default variation is +- 1 rtt */
 1972                         tp->t_rttvar =
 1973                             tp->t_srtt >> (TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT);
 1974                 }
 1975                 TCPT_RANGESET(tp->t_rxtcur,
 1976                     ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2),
 1977                     tp->t_rttmin, TCPTV_REXMTMAX);
 1978         }
 1979 #endif
 1980 }
 1981 
 1982 tcp_seq  tcp_iss_seq = 0;       /* tcp initial seq # */
 1983 #if NRND > 0
 1984 u_int8_t tcp_iss_secret[16];    /* 128 bits; should be plenty */
 1985 #endif
 1986 
 1987 /*
 1988  * Get a new sequence value given a tcp control block
 1989  */
 1990 tcp_seq
 1991 tcp_new_iss(struct tcpcb *tp, tcp_seq addin)
 1992 {
 1993 
 1994 #ifdef INET
 1995         if (tp->t_inpcb != NULL) {
 1996                 return (tcp_new_iss1(&tp->t_inpcb->inp_laddr,
 1997                     &tp->t_inpcb->inp_faddr, tp->t_inpcb->inp_lport,
 1998                     tp->t_inpcb->inp_fport, sizeof(tp->t_inpcb->inp_laddr),
 1999                     addin));
 2000         }
 2001 #endif
 2002 #ifdef INET6
 2003         if (tp->t_in6pcb != NULL) {
 2004                 return (tcp_new_iss1(&tp->t_in6pcb->in6p_laddr,
 2005                     &tp->t_in6pcb->in6p_faddr, tp->t_in6pcb->in6p_lport,
 2006                     tp->t_in6pcb->in6p_fport, sizeof(tp->t_in6pcb->in6p_laddr),
 2007                     addin));
 2008         }
 2009 #endif
 2010         /* Not possible. */
 2011         panic("tcp_new_iss");
 2012 }
 2013 
 2014 /*
 2015  * This routine actually generates a new TCP initial sequence number.
 2016  */
 2017 tcp_seq
 2018 tcp_new_iss1(void *laddr, void *faddr, u_int16_t lport, u_int16_t fport,
 2019     size_t addrsz, tcp_seq addin)
 2020 {
 2021         tcp_seq tcp_iss;
 2022 
 2023 #if NRND > 0
 2024         static int beenhere;
 2025 
 2026         /*
 2027          * If we haven't been here before, initialize our cryptographic
 2028          * hash secret.
 2029          */
 2030         if (beenhere == 0) {
 2031                 rnd_extract_data(tcp_iss_secret, sizeof(tcp_iss_secret),
 2032                     RND_EXTRACT_ANY);
 2033                 beenhere = 1;
 2034         }
 2035 
 2036         if (tcp_do_rfc1948) {
 2037                 MD5_CTX ctx;
 2038                 u_int8_t hash[16];      /* XXX MD5 knowledge */
 2039 
 2040                 /*
 2041                  * Compute the base value of the ISS.  It is a hash
 2042                  * of (saddr, sport, daddr, dport, secret).
 2043                  */
 2044                 MD5Init(&ctx);
 2045 
 2046                 MD5Update(&ctx, (u_char *) laddr, addrsz);
 2047                 MD5Update(&ctx, (u_char *) &lport, sizeof(lport));
 2048 
 2049                 MD5Update(&ctx, (u_char *) faddr, addrsz);
 2050                 MD5Update(&ctx, (u_char *) &fport, sizeof(fport));
 2051 
 2052                 MD5Update(&ctx, tcp_iss_secret, sizeof(tcp_iss_secret));
 2053 
 2054                 MD5Final(hash, &ctx);
 2055 
 2056                 memcpy(&tcp_iss, hash, sizeof(tcp_iss));
 2057 
 2058                 /*
 2059                  * Now increment our "timer", and add it in to
 2060                  * the computed value.
 2061                  *
 2062                  * XXX Use `addin'?
 2063                  * XXX TCP_ISSINCR too large to use?
 2064                  */
 2065                 tcp_iss_seq += TCP_ISSINCR;
 2066 #ifdef TCPISS_DEBUG
 2067                 printf("ISS hash 0x%08x, ", tcp_iss);
 2068 #endif
 2069                 tcp_iss += tcp_iss_seq + addin;
 2070 #ifdef TCPISS_DEBUG
 2071                 printf("new ISS 0x%08x\n", tcp_iss);
 2072 #endif
 2073         } else
 2074 #endif /* NRND > 0 */
 2075         {
 2076                 /*
 2077                  * Randomize.
 2078                  */
 2079 #if NRND > 0
 2080                 rnd_extract_data(&tcp_iss, sizeof(tcp_iss), RND_EXTRACT_ANY);
 2081 #else
 2082                 tcp_iss = arc4random();
 2083 #endif
 2084 
 2085                 /*
 2086                  * If we were asked to add some amount to a known value,
 2087                  * we will take a random value obtained above, mask off
 2088                  * the upper bits, and add in the known value.  We also
 2089                  * add in a constant to ensure that we are at least a
 2090                  * certain distance from the original value.
 2091                  *
 2092                  * This is used when an old connection is in timed wait
 2093                  * and we have a new one coming in, for instance.
 2094                  */
 2095                 if (addin != 0) {
 2096 #ifdef TCPISS_DEBUG
 2097                         printf("Random %08x, ", tcp_iss);
 2098 #endif
 2099                         tcp_iss &= TCP_ISS_RANDOM_MASK;
 2100                         tcp_iss += addin + TCP_ISSINCR;
 2101 #ifdef TCPISS_DEBUG
 2102                         printf("Old ISS %08x, ISS %08x\n", addin, tcp_iss);
 2103 #endif
 2104                 } else {
 2105                         tcp_iss &= TCP_ISS_RANDOM_MASK;
 2106                         tcp_iss += tcp_iss_seq;
 2107                         tcp_iss_seq += TCP_ISSINCR;
 2108 #ifdef TCPISS_DEBUG
 2109                         printf("ISS %08x\n", tcp_iss);
 2110 #endif
 2111                 }
 2112         }
 2113 
 2114         if (tcp_compat_42) {
 2115                 /*
 2116                  * Limit it to the positive range for really old TCP
 2117                  * implementations.
 2118                  * Just AND off the top bit instead of checking if
 2119                  * is set first - saves a branch 50% of the time.
 2120                  */
 2121                 tcp_iss &= 0x7fffffff;          /* XXX */
 2122         }
 2123 
 2124         return (tcp_iss);
 2125 }
 2126 
 2127 #if defined(IPSEC) || defined(FAST_IPSEC)
 2128 /* compute ESP/AH header size for TCP, including outer IP header. */
 2129 size_t
 2130 ipsec4_hdrsiz_tcp(struct tcpcb *tp)
 2131 {
 2132         struct inpcb *inp;
 2133         size_t hdrsiz;
 2134 
 2135         /* XXX mapped addr case (tp->t_in6pcb) */
 2136         if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
 2137                 return 0;
 2138         switch (tp->t_family) {
 2139         case AF_INET:
 2140                 /* XXX: should use currect direction. */
 2141                 hdrsiz = ipsec4_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp);
 2142                 break;
 2143         default:
 2144                 hdrsiz = 0;
 2145                 break;
 2146         }
 2147 
 2148         return hdrsiz;
 2149 }
 2150 
 2151 #ifdef INET6
 2152 size_t
 2153 ipsec6_hdrsiz_tcp(struct tcpcb *tp)
 2154 {
 2155         struct in6pcb *in6p;
 2156         size_t hdrsiz;
 2157 
 2158         if (!tp || !tp->t_template || !(in6p = tp->t_in6pcb))
 2159                 return 0;
 2160         switch (tp->t_family) {
 2161         case AF_INET6:
 2162                 /* XXX: should use currect direction. */
 2163                 hdrsiz = ipsec6_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, in6p);
 2164                 break;
 2165         case AF_INET:
 2166                 /* mapped address case - tricky */
 2167         default:
 2168                 hdrsiz = 0;
 2169                 break;
 2170         }
 2171 
 2172         return hdrsiz;
 2173 }
 2174 #endif
 2175 #endif /*IPSEC*/
 2176 
 2177 /*
 2178  * Determine the length of the TCP options for this connection.
 2179  *
 2180  * XXX:  What do we do for SACK, when we add that?  Just reserve
 2181  *       all of the space?  Otherwise we can't exactly be incrementing
 2182  *       cwnd by an amount that varies depending on the amount we last
 2183  *       had to SACK!
 2184  */
 2185 
 2186 u_int
 2187 tcp_optlen(struct tcpcb *tp)
 2188 {
 2189         u_int optlen;
 2190 
 2191         optlen = 0;
 2192         if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
 2193             (TF_REQ_TSTMP | TF_RCVD_TSTMP))
 2194                 optlen += TCPOLEN_TSTAMP_APPA;
 2195 
 2196 #ifdef TCP_SIGNATURE
 2197 #if defined(INET6) && defined(FAST_IPSEC)
 2198         if (tp->t_family == AF_INET)
 2199 #endif
 2200         if (tp->t_flags & TF_SIGNATURE)
 2201                 optlen += TCPOLEN_SIGNATURE + 2;
 2202 #endif /* TCP_SIGNATURE */
 2203 
 2204         return optlen;
 2205 }

Cache object: 208342ac8b5dcf971609caa1bb1d3f1a


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.