The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_input.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
    3  *      The Regents of the University of California.  All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  * 4. Neither the name of the University nor the names of its contributors
   14  *    may be used to endorse or promote products derived from this software
   15  *    without specific prior written permission.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   27  * SUCH DAMAGE.
   28  *
   29  *      @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
   30  */
   31 
   32 #include <sys/cdefs.h>
   33 __FBSDID("$FreeBSD: releng/8.2/sys/netinet/tcp_input.c 236953 2012-06-12 12:10:10Z bz $");
   34 
   35 #include "opt_ipfw.h"           /* for ipfw_fwd */
   36 #include "opt_inet.h"
   37 #include "opt_inet6.h"
   38 #include "opt_ipsec.h"
   39 #include "opt_tcpdebug.h"
   40 
   41 #include <sys/param.h>
   42 #include <sys/kernel.h>
   43 #include <sys/malloc.h>
   44 #include <sys/mbuf.h>
   45 #include <sys/proc.h>           /* for proc0 declaration */
   46 #include <sys/protosw.h>
   47 #include <sys/signalvar.h>
   48 #include <sys/socket.h>
   49 #include <sys/socketvar.h>
   50 #include <sys/sysctl.h>
   51 #include <sys/syslog.h>
   52 #include <sys/systm.h>
   53 
   54 #include <machine/cpu.h>        /* before tcp_seq.h, for tcp_random18() */
   55 
   56 #include <vm/uma.h>
   57 
   58 #include <net/if.h>
   59 #include <net/route.h>
   60 #include <net/vnet.h>
   61 
   62 #define TCPSTATES               /* for logging */
   63 
   64 #include <netinet/in.h>
   65 #include <netinet/in_pcb.h>
   66 #include <netinet/in_systm.h>
   67 #include <netinet/in_var.h>
   68 #include <netinet/ip.h>
   69 #include <netinet/ip_icmp.h>    /* required for icmp_var.h */
   70 #include <netinet/icmp_var.h>   /* for ICMP_BANDLIM */
   71 #include <netinet/ip_var.h>
   72 #include <netinet/ip_options.h>
   73 #include <netinet/ip6.h>
   74 #include <netinet/icmp6.h>
   75 #include <netinet6/in6_pcb.h>
   76 #include <netinet6/ip6_var.h>
   77 #include <netinet6/nd6.h>
   78 #include <netinet/tcp.h>
   79 #include <netinet/tcp_fsm.h>
   80 #include <netinet/tcp_seq.h>
   81 #include <netinet/tcp_timer.h>
   82 #include <netinet/tcp_var.h>
   83 #include <netinet6/tcp6_var.h>
   84 #include <netinet/tcpip.h>
   85 #include <netinet/tcp_syncache.h>
   86 #ifdef TCPDEBUG
   87 #include <netinet/tcp_debug.h>
   88 #endif /* TCPDEBUG */
   89 
   90 #ifdef IPSEC
   91 #include <netipsec/ipsec.h>
   92 #include <netipsec/ipsec6.h>
   93 #endif /*IPSEC*/
   94 
   95 #include <machine/in_cksum.h>
   96 
   97 #include <security/mac/mac_framework.h>
   98 
   99 static const int tcprexmtthresh = 3;	/* NOTE(review): presumably the dup-ACK rexmt threshold -- confirm */
  100 
      /*
       * TCP statistics block, published read/write (CTLFLAG_RW) as a
       * struct under the _net_inet_tcp sysctl node.
       */
  101 VNET_DEFINE(struct tcpstat, tcpstat);
  102 SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
  103     &VNET_NAME(tcpstat), tcpstat,
  104     "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
  105 
      /*
       * Logging of segments for which tcp_input() finds no pcb:
       * 0 logs nothing, 1 logs only segments with SYN set, 2 logs all
       * of them.
       */
  106 int tcp_log_in_vain = 0;
  107 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, CTLFLAG_RW,
  108     &tcp_log_in_vain, 0,
  109     "Log all incoming TCP segments to closed ports");
  110 
      /*
       * Suppression of the RST that tcp_input() otherwise sends when no
       * pcb matches: 1 silently drops only segments with SYN set, 2
       * silently drops every such segment.
       */
  111 VNET_DEFINE(int, blackhole) = 0;
  112 #define V_blackhole             VNET(blackhole)
  113 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, blackhole, CTLFLAG_RW,
  114     &VNET_NAME(blackhole), 0,
  115     "Do not send RST on segments to closed ports");
  116 
      /* Delayed-ACK switch; read by the DELAY_ACK() macro in this file. */
  117 VNET_DEFINE(int, tcp_delack_enabled) = 1;
  118 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, delayed_ack, CTLFLAG_RW,
  119     &VNET_NAME(tcp_delack_enabled), 0,
  120     "Delay ACK to try and piggyback it onto a data packet");
  121 
      /* Off by default; the consumer of this knob is outside this view. */
  122 VNET_DEFINE(int, drop_synfin) = 0;
  123 #define V_drop_synfin           VNET(drop_synfin)
  124 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, drop_synfin, CTLFLAG_RW,
  125     &VNET_NAME(drop_synfin), 0,
  126     "Drop TCP packets with SYN+FIN set");
  127 
      /*
       * Switches for optional RFC behaviours (3042, 3390, 3465), each
       * initialized to 1; the description strings name the feature.
       */
  128 VNET_DEFINE(int, tcp_do_rfc3042) = 1;
  129 #define V_tcp_do_rfc3042        VNET(tcp_do_rfc3042)
  130 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
  131     &VNET_NAME(tcp_do_rfc3042), 0,
  132     "Enable RFC 3042 (Limited Transmit)");
  133 
  134 VNET_DEFINE(int, tcp_do_rfc3390) = 1;
  135 #define V_tcp_do_rfc3390        VNET(tcp_do_rfc3390)
  136 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
  137     &VNET_NAME(tcp_do_rfc3390), 0,
  138     "Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
  139 
  140 VNET_DEFINE(int, tcp_do_rfc3465) = 1;
  141 #define V_tcp_do_rfc3465        VNET(tcp_do_rfc3465)
  142 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
  143     &VNET_NAME(tcp_do_rfc3465), 0,
  144     "Enable RFC 3465 (Appropriate Byte Counting)");
  145 
      /*
       * NOTE(review): this is the only sysctl here whose argument after
       * the pointer is 2 rather than 0.  Presumably that argument is
       * ignored when a pointer is supplied and the effective default is
       * the initializer below -- confirm against sysctl(9).
       */
  146 VNET_DEFINE(int, tcp_abc_l_var) = 2;
  147 #define V_tcp_abc_l_var         VNET(tcp_abc_l_var)
  148 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
  149     &VNET_NAME(tcp_abc_l_var), 2,
  150     "Cap the max cwnd increment during slow-start to this number of segments");
  151 
      /*
       * ECN knobs are grouped in their own sub-node, "ecn", created here
       * under _net_inet_tcp.  ECN support is initialized to 0.
       */
  152 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn, CTLFLAG_RW, 0, "TCP ECN");
  153 
  154 VNET_DEFINE(int, tcp_do_ecn) = 0;
  155 SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_RW,
  156     &VNET_NAME(tcp_do_ecn), 0,
  157     "TCP ECN support");
  158 
  159 VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
  160 SYSCTL_VNET_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_RW,
  161     &VNET_NAME(tcp_ecn_maxretries), 0,
  162     "Max retries before giving up on ECN");
  163 
  164 VNET_DEFINE(int, tcp_insecure_rst) = 0;
  165 #define V_tcp_insecure_rst      VNET(tcp_insecure_rst)
  166 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, insecure_rst, CTLFLAG_RW,
  167     &VNET_NAME(tcp_insecure_rst), 0,
  168     "Follow the old (insecure) criteria for accepting RST packets");
  169 
      /*
       * Receive-buffer auto-sizing: initialized to enabled, with a step
       * of 16*1024 and a maximum of 256*1024.
       */
  170 VNET_DEFINE(int, tcp_do_autorcvbuf) = 1;
  171 #define V_tcp_do_autorcvbuf     VNET(tcp_do_autorcvbuf)
  172 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
  173     &VNET_NAME(tcp_do_autorcvbuf), 0,
  174     "Enable automatic receive buffer sizing");
  175 
  176 VNET_DEFINE(int, tcp_autorcvbuf_inc) = 16*1024;
  177 #define V_tcp_autorcvbuf_inc    VNET(tcp_autorcvbuf_inc)
  178 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
  179     &VNET_NAME(tcp_autorcvbuf_inc), 0,
  180     "Incrementor step size of automatic receive buffer");
  181 
  182 VNET_DEFINE(int, tcp_autorcvbuf_max) = 256*1024;
  183 #define V_tcp_autorcvbuf_max    VNET(tcp_autorcvbuf_max)
  184 SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
  185     &VNET_NAME(tcp_autorcvbuf_max), 0,
  186     "Max size of automatic receive buffer");
  187 
      /*
       * When this is 0, tcp_input() always takes the tcbinfo write lock
       * instead of starting out with a read lock.  Unlike most knobs
       * above it is a plain global, not declared with VNET_DEFINE.
       */
  188 int     tcp_read_locking = 1;
  189 SYSCTL_INT(_net_inet_tcp, OID_AUTO, read_locking, CTLFLAG_RW,
  190     &tcp_read_locking, 0, "Enable read locking strategy");
  191 
      /*
       * tcbinfo is the object tcp_input() locks with INP_INFO_RLOCK/WLOCK
       * and hands to the pcb lookup functions; tcb6 is an alias for tcb.
       */
  192 VNET_DEFINE(struct inpcbhead, tcb);
  193 #define tcb6    tcb  /* for KAME src sync over BSD*'s */
  194 VNET_DEFINE(struct inpcbinfo, tcbinfo);
  195 
  /*
   * Prototypes for the static (file-local) helpers of the TCP input path.
   */
  196 static void      tcp_dooptions(struct tcpopt *, u_char *, int, int);
  197 static void      tcp_do_segment(struct mbuf *, struct tcphdr *,
  198                      struct socket *, struct tcpcb *, int, int, uint8_t,
  199                      int);
  200 static void      tcp_dropwithreset(struct mbuf *, struct tcphdr *,
  201                      struct tcpcb *, int, int);
  202 static void      tcp_pulloutofband(struct socket *,
  203                      struct tcphdr *, struct mbuf *, int);
  204 static void      tcp_xmit_timer(struct tcpcb *, int);
  205 static void      tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
      /*
       * NOTE(review): "static void inline" is valid C, but placing
       * 'inline' after the return type is unconventional and some
       * compilers warn about it; "static inline void" is the usual order.
       */
  206 static void inline
  207                  tcp_congestion_exp(struct tcpcb *);
  208 
  208 
  209 /*
  210  * Kernel module interface for updating tcpstat.  The argument is an index
  211  * into tcpstat treated as an array of u_long.  While this encodes the
  212  * general layout of tcpstat into the caller, it doesn't encode its location,
  213  * so that future changes to add, for example, per-CPU stats support won't
  214  * cause binary compatibility problems for kernel modules.
  215  */
  216 void
  217 kmod_tcpstat_inc(int statnum)
  218 {
  219 
  220         (*((u_long *)&V_tcpstat + statnum))++;
  221 }
  222 
  223 static void inline
  224 tcp_congestion_exp(struct tcpcb *tp)
  225 {
  226         u_int win;
  227         
  228         win = min(tp->snd_wnd, tp->snd_cwnd) /
  229             2 / tp->t_maxseg;
  230         if (win < 2)
  231                 win = 2;
  232         tp->snd_ssthresh = win * tp->t_maxseg;
  233         ENTER_FASTRECOVERY(tp);
  234         tp->snd_recover = tp->snd_max;
  235         if (tp->t_flags & TF_ECN_PERMIT)
  236                 tp->t_flags |= TF_ECN_SND_CWR;
  237 }
  238 
  /*
   * Upper-layer hint for IPv6 Neighbor Discovery / Neighbor Unreachability
   * Detection.  With INET6, ND6_HINT(tp) calls nd6_nud_hint(NULL, NULL, 0)
   * when tp is non-NULL, has an inpcb, and that inpcb has INP_IPV6 set in
   * inp_vflag.  Without INET6 the macro expands to nothing.
   */
  #ifndef INET6
  #define ND6_HINT(tp)
  #else
  #define ND6_HINT(tp)                                                    \
  do {                                                                    \
          if ((tp) != NULL && (tp)->t_inpcb != NULL &&                    \
              ((tp)->t_inpcb->inp_vflag & INP_IPV6))                      \
                  nd6_nud_hint(NULL, NULL, 0);                            \
  } while (0)
  #endif
  250 
  /*
   * Indicate whether this ack should be delayed.  We can delay the ack if
   *      - there is no delayed ack timer in progress and
   *      - our last ack wasn't a 0-sized window (we never want to delay
   *        the ack that opens up a 0-sized window) and
   *      - either delayed acks are enabled or this is a half-synchronized
   *        T/TCP connection (TF_NEEDSYN is set).
   *
   * The argument is parenthesized at every use so that any pointer-valued
   * expression (for example &cb) can be passed, not only a plain
   * identifier.  It is evaluated more than once, so it must not have side
   * effects.
   */
  #define DELAY_ACK(tp)                                                   \
          ((!tcp_timer_active((tp), TT_DELACK) &&                         \
              ((tp)->t_flags & TF_RXWIN0SENT) == 0) &&                    \
              (V_tcp_delack_enabled || ((tp)->t_flags & TF_NEEDSYN)))
  263 
  264 /*
  265  * TCP input handling is split into multiple parts:
  266  *   tcp6_input is a thin wrapper around tcp_input for the extended
  267  *      ip6_protox[] call format in ip6_input
  268  *   tcp_input handles primary segment validation, inpcb lookup and
  269  *      SYN processing on listen sockets
  270  *   tcp_do_segment processes the ACK and text of the segment for
  271  *      establishing, established and closing connections
  272  */
  #ifdef INET6
  /*
   * IPv6 entry point for TCP.  After IP6_EXTHDR_CHECK() on a TCP header's
   * worth of data at *offp, looks up the destination interface address of
   * the packet.  If that address is flagged IN6_IFF_ANYCAST, the segment is
   * answered with an ICMPv6 destination-unreachable (address) error whose
   * pointer is the offset of ip6_dst within the IPv6 header; otherwise the
   * packet is passed to tcp_input().  Any address returned by
   * ip6_getdstifaddr() is handed to ifa_free() before either path.
   *
   * Always returns IPPROTO_DONE.  'proto' is not used here.
   */
  int
  tcp6_input(struct mbuf **mp, int *offp, int proto)
  {
          struct mbuf *m = *mp;
          struct in6_ifaddr *dst_ia;

          IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), IPPROTO_DONE);

          /*
           * draft-itojun-ipv6-tcp-to-anycast
           * (is there a better place for this check?)
           */
          dst_ia = ip6_getdstifaddr(m);
          if (dst_ia != NULL) {
                  int to_anycast;

                  /* Sample the flag first; dst_ia is not touched after ifa_free(). */
                  to_anycast = (dst_ia->ia6_flags & IN6_IFF_ANYCAST) != 0;
                  ifa_free(&dst_ia->ia_ifa);

                  if (to_anycast) {
                          struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);

                          icmp6_error(m, ICMP6_DST_UNREACH,
                              ICMP6_DST_UNREACH_ADDR,
                              (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
                          return (IPPROTO_DONE);
                  }
          }

          tcp_input(m, *offp);
          return (IPPROTO_DONE);
  }
  #endif
  303 
  304 void
  305 tcp_input(struct mbuf *m, int off0)
  306 {
  307         struct tcphdr *th;
  308         struct ip *ip = NULL;
  309         struct ipovly *ipov;
  310         struct inpcb *inp = NULL;
  311         struct tcpcb *tp = NULL;
  312         struct socket *so = NULL;
  313         u_char *optp = NULL;
  314         int optlen = 0;
  315         int len, tlen, off;
  316         int drop_hdrlen;
  317         int thflags;
  318         int rstreason = 0;      /* For badport_bandlim accounting purposes */
  319         uint8_t iptos;
  320 #ifdef IPFIREWALL_FORWARD
  321         struct m_tag *fwd_tag;
  322 #endif
  323 #ifdef INET6
  324         struct ip6_hdr *ip6 = NULL;
  325         int isipv6;
  326 #else
  327         const void *ip6 = NULL;
  328         const int isipv6 = 0;
  329 #endif
  330         struct tcpopt to;               /* options in this segment */
  331         char *s = NULL;                 /* address and port logging */
  332         int ti_locked;
  333 #define TI_UNLOCKED     1
  334 #define TI_RLOCKED      2
  335 #define TI_WLOCKED      3
  336 
  337 #ifdef TCPDEBUG
  338         /*
  339          * The size of tcp_saveipgen must be the size of the max ip header,
  340          * now IPv6.
  341          */
  342         u_char tcp_saveipgen[IP6_HDR_LEN];
  343         struct tcphdr tcp_savetcp;
  344         short ostate = 0;
  345 #endif
  346 
  347 #ifdef INET6
  348         isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
  349 #endif
  350 
  351         to.to_flags = 0;
  352         TCPSTAT_INC(tcps_rcvtotal);
  353 
  354         if (isipv6) {
  355 #ifdef INET6
  356                 /* IP6_EXTHDR_CHECK() is already done at tcp6_input(). */
  357                 ip6 = mtod(m, struct ip6_hdr *);
  358                 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
  359                 if (in6_cksum(m, IPPROTO_TCP, off0, tlen)) {
  360                         TCPSTAT_INC(tcps_rcvbadsum);
  361                         goto drop;
  362                 }
  363                 th = (struct tcphdr *)((caddr_t)ip6 + off0);
  364 
  365                 /*
  366                  * Be proactive about unspecified IPv6 address in source.
  367                  * As we use all-zero to indicate unbound/unconnected pcb,
  368                  * unspecified IPv6 address can be used to confuse us.
  369                  *
  370                  * Note that packets with unspecified IPv6 destination are
  371                  * already dropped in ip6_input.
  372                  */
  373                 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
  374                         /* XXX stat */
  375                         goto drop;
  376                 }
  377 #else
  378                 th = NULL;              /* XXX: Avoid compiler warning. */
  379 #endif
  380         } else {
  381                 /*
  382                  * Get IP and TCP header together in first mbuf.
  383                  * Note: IP leaves IP header in first mbuf.
  384                  */
  385                 if (off0 > sizeof (struct ip)) {
  386                         ip_stripoptions(m, (struct mbuf *)0);
  387                         off0 = sizeof(struct ip);
  388                 }
  389                 if (m->m_len < sizeof (struct tcpiphdr)) {
  390                         if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
  391                             == NULL) {
  392                                 TCPSTAT_INC(tcps_rcvshort);
  393                                 return;
  394                         }
  395                 }
  396                 ip = mtod(m, struct ip *);
  397                 ipov = (struct ipovly *)ip;
  398                 th = (struct tcphdr *)((caddr_t)ip + off0);
  399                 tlen = ip->ip_len;
  400 
  401                 if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
  402                         if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
  403                                 th->th_sum = m->m_pkthdr.csum_data;
  404                         else
  405                                 th->th_sum = in_pseudo(ip->ip_src.s_addr,
  406                                                 ip->ip_dst.s_addr,
  407                                                 htonl(m->m_pkthdr.csum_data +
  408                                                         ip->ip_len +
  409                                                         IPPROTO_TCP));
  410                         th->th_sum ^= 0xffff;
  411 #ifdef TCPDEBUG
  412                         ipov->ih_len = (u_short)tlen;
  413                         ipov->ih_len = htons(ipov->ih_len);
  414 #endif
  415                 } else {
  416                         /*
  417                          * Checksum extended TCP header and data.
  418                          */
  419                         len = sizeof (struct ip) + tlen;
  420                         bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
  421                         ipov->ih_len = (u_short)tlen;
  422                         ipov->ih_len = htons(ipov->ih_len);
  423                         th->th_sum = in_cksum(m, len);
  424                 }
  425                 if (th->th_sum) {
  426                         TCPSTAT_INC(tcps_rcvbadsum);
  427                         goto drop;
  428                 }
  429                 /* Re-initialization for later version check */
  430                 ip->ip_v = IPVERSION;
  431         }
  432 
  433 #ifdef INET6
  434         if (isipv6)
  435                 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
  436         else
  437 #endif
  438                 iptos = ip->ip_tos;
  439 
  440         /*
  441          * Check that TCP offset makes sense,
  442          * pull out TCP options and adjust length.              XXX
  443          */
  444         off = th->th_off << 2;
  445         if (off < sizeof (struct tcphdr) || off > tlen) {
  446                 TCPSTAT_INC(tcps_rcvbadoff);
  447                 goto drop;
  448         }
  449         tlen -= off;    /* tlen is used instead of ti->ti_len */
  450         if (off > sizeof (struct tcphdr)) {
  451                 if (isipv6) {
  452 #ifdef INET6
  453                         IP6_EXTHDR_CHECK(m, off0, off, );
  454                         ip6 = mtod(m, struct ip6_hdr *);
  455                         th = (struct tcphdr *)((caddr_t)ip6 + off0);
  456 #endif
  457                 } else {
  458                         if (m->m_len < sizeof(struct ip) + off) {
  459                                 if ((m = m_pullup(m, sizeof (struct ip) + off))
  460                                     == NULL) {
  461                                         TCPSTAT_INC(tcps_rcvshort);
  462                                         return;
  463                                 }
  464                                 ip = mtod(m, struct ip *);
  465                                 ipov = (struct ipovly *)ip;
  466                                 th = (struct tcphdr *)((caddr_t)ip + off0);
  467                         }
  468                 }
  469                 optlen = off - sizeof (struct tcphdr);
  470                 optp = (u_char *)(th + 1);
  471         }
  472         thflags = th->th_flags;
  473 
  474         /*
  475          * Convert TCP protocol specific fields to host format.
  476          */
  477         th->th_seq = ntohl(th->th_seq);
  478         th->th_ack = ntohl(th->th_ack);
  479         th->th_win = ntohs(th->th_win);
  480         th->th_urp = ntohs(th->th_urp);
  481 
  482         /*
  483          * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options.
  484          */
  485         drop_hdrlen = off0 + off;
  486 
  487         /*
  488          * Locate pcb for segment, which requires a lock on tcbinfo.
  489          * Optimistically acquire a global read lock rather than a write lock
  490          * unless header flags necessarily imply a state change.  There are
  491          * two cases where we might discover later we need a write lock
  492          * despite the flags: ACKs moving a connection out of the syncache,
  493          * and ACKs for a connection in TIMEWAIT.
  494          */
  495         if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
  496             tcp_read_locking == 0) {
  497                 INP_INFO_WLOCK(&V_tcbinfo);
  498                 ti_locked = TI_WLOCKED;
  499         } else {
  500                 INP_INFO_RLOCK(&V_tcbinfo);
  501                 ti_locked = TI_RLOCKED;
  502         }
  503 
  504 findpcb:
  505 #ifdef INVARIANTS
  506         if (ti_locked == TI_RLOCKED)
  507                 INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
  508         else if (ti_locked == TI_WLOCKED)
  509                 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  510         else
  511                 panic("%s: findpcb ti_locked %d\n", __func__, ti_locked);
  512 #endif
  513 
  514 #ifdef IPFIREWALL_FORWARD
  515         /*
  516          * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
  517          */
  518         fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
  519 
  520         if (fwd_tag != NULL && isipv6 == 0) {   /* IPv6 support is not yet */
  521                 struct sockaddr_in *next_hop;
  522 
  523                 next_hop = (struct sockaddr_in *)(fwd_tag+1);
  524                 /*
  525                  * Transparently forwarded. Pretend to be the destination.
  526                  * already got one like this?
  527                  */
  528                 inp = in_pcblookup_hash(&V_tcbinfo,
  529                                         ip->ip_src, th->th_sport,
  530                                         ip->ip_dst, th->th_dport,
  531                                         0, m->m_pkthdr.rcvif);
  532                 if (!inp) {
  533                         /* It's new.  Try to find the ambushing socket. */
  534                         inp = in_pcblookup_hash(&V_tcbinfo,
  535                                                 ip->ip_src, th->th_sport,
  536                                                 next_hop->sin_addr,
  537                                                 next_hop->sin_port ?
  538                                                     ntohs(next_hop->sin_port) :
  539                                                     th->th_dport,
  540                                                 INPLOOKUP_WILDCARD,
  541                                                 m->m_pkthdr.rcvif);
  542                 }
  543                 /* Remove the tag from the packet.  We don't need it anymore. */
  544                 m_tag_delete(m, fwd_tag);
  545         } else
  546 #endif /* IPFIREWALL_FORWARD */
  547         {
  548                 if (isipv6) {
  549 #ifdef INET6
  550                         inp = in6_pcblookup_hash(&V_tcbinfo,
  551                                                  &ip6->ip6_src, th->th_sport,
  552                                                  &ip6->ip6_dst, th->th_dport,
  553                                                  INPLOOKUP_WILDCARD,
  554                                                  m->m_pkthdr.rcvif);
  555 #endif
  556                 } else
  557                         inp = in_pcblookup_hash(&V_tcbinfo,
  558                                                 ip->ip_src, th->th_sport,
  559                                                 ip->ip_dst, th->th_dport,
  560                                                 INPLOOKUP_WILDCARD,
  561                                                 m->m_pkthdr.rcvif);
  562         }
  563 
  564         /*
  565          * If the INPCB does not exist then all data in the incoming
  566          * segment is discarded and an appropriate RST is sent back.
  567          * XXX MRT Send RST using which routing table?
  568          */
  569         if (inp == NULL) {
  570                 /*
  571                  * Log communication attempts to ports that are not
  572                  * in use.
  573                  */
  574                 if ((tcp_log_in_vain == 1 && (thflags & TH_SYN)) ||
  575                     tcp_log_in_vain == 2) {
  576                         if ((s = tcp_log_vain(NULL, th, (void *)ip, ip6)))
  577                                 log(LOG_INFO, "%s; %s: Connection attempt "
  578                                     "to closed port\n", s, __func__);
  579                 }
  580                 /*
  581                  * When blackholing do not respond with a RST but
  582                  * completely ignore the segment and drop it.
  583                  */
  584                 if ((V_blackhole == 1 && (thflags & TH_SYN)) ||
  585                     V_blackhole == 2)
  586                         goto dropunlock;
  587 
  588                 rstreason = BANDLIM_RST_CLOSEDPORT;
  589                 goto dropwithreset;
  590         }
  591         INP_WLOCK(inp);
  592         if (!(inp->inp_flags & INP_HW_FLOWID)
  593             && (m->m_flags & M_FLOWID)
  594             && ((inp->inp_socket == NULL)
  595                 || !(inp->inp_socket->so_options & SO_ACCEPTCONN))) {
  596                 inp->inp_flags |= INP_HW_FLOWID;
  597                 inp->inp_flags &= ~INP_SW_FLOWID;
  598                 inp->inp_flowid = m->m_pkthdr.flowid;
  599         }
  600 #ifdef IPSEC
  601 #ifdef INET6
  602         if (isipv6 && ipsec6_in_reject(m, inp)) {
  603                 V_ipsec6stat.in_polvio++;
  604                 goto dropunlock;
  605         } else
  606 #endif /* INET6 */
  607         if (ipsec4_in_reject(m, inp) != 0) {
  608                 V_ipsec4stat.in_polvio++;
  609                 goto dropunlock;
  610         }
  611 #endif /* IPSEC */
  612 
  613         /*
  614          * Check the minimum TTL for socket.
  615          */
  616         if (inp->inp_ip_minttl != 0) {
  617 #ifdef INET6
  618                 if (isipv6 && inp->inp_ip_minttl > ip6->ip6_hlim)
  619                         goto dropunlock;
  620                 else
  621 #endif
  622                 if (inp->inp_ip_minttl > ip->ip_ttl)
  623                         goto dropunlock;
  624         }
  625 
  626         /*
  627          * A previous connection in TIMEWAIT state is supposed to catch stray
  628          * or duplicate segments arriving late.  If this segment was a
  629          * legitimate new connection attempt the old INPCB gets removed and
  630          * we can try again to find a listening socket.
  631          *
  632          * At this point, due to earlier optimism, we may hold a read lock on
  633          * the inpcbinfo, rather than a write lock.  If so, we need to
  634          * upgrade, or if that fails, acquire a reference on the inpcb, drop
  635          * all locks, acquire a global write lock, and then re-acquire the
  636          * inpcb lock.  We may at that point discover that another thread has
  637          * tried to free the inpcb, in which case we need to loop back and
  638          * try to find a new inpcb to deliver to.
  639          */
  640 relocked:
  641         if (inp->inp_flags & INP_TIMEWAIT) {
  642                 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
  643                     ("%s: INP_TIMEWAIT ti_locked %d", __func__, ti_locked));
  644 
  645                 if (ti_locked == TI_RLOCKED) {
  646                         if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) {
  647                                 in_pcbref(inp);
  648                                 INP_WUNLOCK(inp);
  649                                 INP_INFO_RUNLOCK(&V_tcbinfo);
  650                                 INP_INFO_WLOCK(&V_tcbinfo);
  651                                 ti_locked = TI_WLOCKED;
  652                                 INP_WLOCK(inp);
  653                                 if (in_pcbrele(inp)) {
  654                                         inp = NULL;
  655                                         goto findpcb;
  656                                 }
  657                         } else
  658                                 ti_locked = TI_WLOCKED;
  659                 }
  660                 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  661 
  662                 if (thflags & TH_SYN)
  663                         tcp_dooptions(&to, optp, optlen, TO_SYN);
  664                 /*
  665                  * NB: tcp_twcheck unlocks the INP and frees the mbuf.
  666                  */
  667                 if (tcp_twcheck(inp, &to, th, m, tlen))
  668                         goto findpcb;
  669                 INP_INFO_WUNLOCK(&V_tcbinfo);
  670                 return;
  671         }
  672         /*
  673          * The TCPCB may no longer exist if the connection is winding
  674          * down or it is in the CLOSED state.  Either way we drop the
  675          * segment and send an appropriate response.
  676          */
  677         tp = intotcpcb(inp);
  678         if (tp == NULL || tp->t_state == TCPS_CLOSED) {
  679                 rstreason = BANDLIM_RST_CLOSEDPORT;
  680                 goto dropwithreset;
  681         }
  682 
  683         /*
  684          * We've identified a valid inpcb, but it could be that we need an
  685          * inpcbinfo write lock and have only a read lock.  In this case,
  686          * attempt to upgrade/relock using the same strategy as the TIMEWAIT
  687          * case above.  If we relock, we have to jump back to 'relocked' as
  688          * the connection might now be in TIMEWAIT.
  689          */
  690         if (tp->t_state != TCPS_ESTABLISHED ||
  691             (thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
  692             tcp_read_locking == 0) {
  693                 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
  694                     ("%s: upgrade check ti_locked %d", __func__, ti_locked));
  695 
  696                 if (ti_locked == TI_RLOCKED) {
  697                         if (INP_INFO_TRY_UPGRADE(&V_tcbinfo) == 0) {
  698                                 in_pcbref(inp);
  699                                 INP_WUNLOCK(inp);
  700                                 INP_INFO_RUNLOCK(&V_tcbinfo);
  701                                 INP_INFO_WLOCK(&V_tcbinfo);
  702                                 ti_locked = TI_WLOCKED;
  703                                 INP_WLOCK(inp);
  704                                 if (in_pcbrele(inp)) {
  705                                         inp = NULL;
  706                                         goto findpcb;
  707                                 }
  708                                 goto relocked;
  709                         } else
  710                                 ti_locked = TI_WLOCKED;
  711                 }
  712                 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
  713         }
  714 
  715 #ifdef MAC
  716         INP_WLOCK_ASSERT(inp);
  717         if (mac_inpcb_check_deliver(inp, m))
  718                 goto dropunlock;
  719 #endif
  720         so = inp->inp_socket;
  721         KASSERT(so != NULL, ("%s: so == NULL", __func__));
  722 #ifdef TCPDEBUG
  723         if (so->so_options & SO_DEBUG) {
  724                 ostate = tp->t_state;
  725                 if (isipv6) {
  726 #ifdef INET6
  727                         bcopy((char *)ip6, (char *)tcp_saveipgen, sizeof(*ip6));
  728 #endif
  729                 } else
  730                         bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
  731                 tcp_savetcp = *th;
  732         }
  733 #endif
  734         /*
  735          * When the socket is accepting connections (the INPCB is in LISTEN
  736          * state) we look into the SYN cache if this is a new connection
  737          * attempt or the completion of a previous one.
  738          */
  739         if (so->so_options & SO_ACCEPTCONN) {
  740                 struct in_conninfo inc;
  741 
  742                 KASSERT(tp->t_state == TCPS_LISTEN, ("%s: so accepting but "
  743                     "tp not listening", __func__));
  744 
  745                 bzero(&inc, sizeof(inc));
  746 #ifdef INET6
  747                 if (isipv6) {
  748                         inc.inc_flags |= INC_ISIPV6;
  749                         inc.inc6_faddr = ip6->ip6_src;
  750                         inc.inc6_laddr = ip6->ip6_dst;
  751                 } else
  752 #endif
  753                 {
  754                         inc.inc_faddr = ip->ip_src;
  755                         inc.inc_laddr = ip->ip_dst;
  756                 }
  757                 inc.inc_fport = th->th_sport;
  758                 inc.inc_lport = th->th_dport;
  759                 inc.inc_fibnum = so->so_fibnum;
  760 
  761                 /*
  762                  * Check for an existing connection attempt in syncache if
  763                  * the flag is only ACK.  A successful lookup creates a new
  764                  * socket appended to the listen queue in SYN_RECEIVED state.
  765                  */
  766                 if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
  767                         /*
  768                          * Parse the TCP options here because
  769                          * syncookies need access to the reflected
  770                          * timestamp.
  771                          */
  772                         tcp_dooptions(&to, optp, optlen, 0);
  773                         /*
  774                          * NB: syncache_expand() doesn't unlock
  775                          * inp and tcpinfo locks.
  776                          */
  777                         if (!syncache_expand(&inc, &to, th, &so, m)) {
  778                                 /*
  779                                  * No syncache entry or ACK was not
  780                                  * for our SYN/ACK.  Send a RST.
  781                                  * NB: syncache did its own logging
  782                                  * of the failure cause.
  783                                  */
  784                                 rstreason = BANDLIM_RST_OPENPORT;
  785                                 goto dropwithreset;
  786                         }
  787                         if (so == NULL) {
  788                                 /*
  789                                  * We completed the 3-way handshake
  790                                  * but could not allocate a socket
  791                                  * either due to memory shortage,
  792                                  * listen queue length limits or
  793                                  * global socket limits.  Send RST
  794                                  * or wait and have the remote end
  795                                  * retransmit the ACK for another
  796                                  * try.
  797                                  */
  798                                 if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  799                                         log(LOG_DEBUG, "%s; %s: Listen socket: "
  800                                             "Socket allocation failed due to "
  801                                             "limits or memory shortage, %s\n",
  802                                             s, __func__,
  803                                             V_tcp_sc_rst_sock_fail ?
  804                                             "sending RST" : "try again");
  805                                 if (V_tcp_sc_rst_sock_fail) {
  806                                         rstreason = BANDLIM_UNLIMITED;
  807                                         goto dropwithreset;
  808                                 } else
  809                                         goto dropunlock;
  810                         }
  811                         /*
  812                          * Socket is created in state SYN_RECEIVED.
  813                          * Unlock the listen socket, lock the newly
  814                          * created socket and update the tp variable.
  815                          */
  816                         INP_WUNLOCK(inp);       /* listen socket */
  817                         inp = sotoinpcb(so);
  818                         INP_WLOCK(inp);         /* new connection */
  819                         tp = intotcpcb(inp);
  820                         KASSERT(tp->t_state == TCPS_SYN_RECEIVED,
  821                             ("%s: ", __func__));
  822                         /*
  823                          * Process the segment and the data it
  824                          * contains.  tcp_do_segment() consumes
  825                          * the mbuf chain and unlocks the inpcb.
  826                          */
  827                         tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
  828                             iptos, ti_locked);
  829                         INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
  830                         return;
  831                 }
  832                 /*
  833                  * Segment flag validation for new connection attempts:
  834                  *
  835                  * Our (SYN|ACK) response was rejected.
  836                  * Check with syncache and remove entry to prevent
  837                  * retransmits.
  838                  *
  839                  * NB: syncache_chkrst does its own logging of failure
  840                  * causes.
  841                  */
  842                 if (thflags & TH_RST) {
  843                         syncache_chkrst(&inc, th);
  844                         goto dropunlock;
  845                 }
  846                 /*
  847                  * We can't do anything without SYN.
  848                  */
  849                 if ((thflags & TH_SYN) == 0) {
  850                         if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  851                                 log(LOG_DEBUG, "%s; %s: Listen socket: "
  852                                     "SYN is missing, segment ignored\n",
  853                                     s, __func__);
  854                         TCPSTAT_INC(tcps_badsyn);
  855                         goto dropunlock;
  856                 }
  857                 /*
  858                  * (SYN|ACK) is bogus on a listen socket.
  859                  */
  860                 if (thflags & TH_ACK) {
  861                         if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  862                                 log(LOG_DEBUG, "%s; %s: Listen socket: "
  863                                     "SYN|ACK invalid, segment rejected\n",
  864                                     s, __func__);
  865                         syncache_badack(&inc);  /* XXX: Not needed! */
  866                         TCPSTAT_INC(tcps_badsyn);
  867                         rstreason = BANDLIM_RST_OPENPORT;
  868                         goto dropwithreset;
  869                 }
  870                 /*
  871                  * If the drop_synfin option is enabled, drop all
  872                  * segments with both the SYN and FIN bits set.
  873                  * This prevents e.g. nmap from identifying the
  874                  * TCP/IP stack.
  875                  * XXX: Poor reasoning.  nmap has other methods
  876                  * and is constantly refining its stack detection
  877                  * strategies.
  878                  * XXX: This is a violation of the TCP specification
  879                  * and was used by RFC1644.
  880                  */
  881                 if ((thflags & TH_FIN) && V_drop_synfin) {
  882                         if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  883                                 log(LOG_DEBUG, "%s; %s: Listen socket: "
  884                                     "SYN|FIN segment ignored (based on "
  885                                     "sysctl setting)\n", s, __func__);
  886                         TCPSTAT_INC(tcps_badsyn);
  887                         goto dropunlock;
  888                 }
  889                 /*
  890                  * Segment's flags are (SYN) or (SYN|FIN).
  891                  *
  892                  * TH_PUSH, TH_URG, TH_ECE, TH_CWR are ignored
  893                  * as they do not affect the state of the TCP FSM.
  894                  * The data pointed to by TH_URG and th_urp is ignored.
  895                  */
  896                 KASSERT((thflags & (TH_RST|TH_ACK)) == 0,
  897                     ("%s: Listen socket: TH_RST or TH_ACK set", __func__));
  898                 KASSERT(thflags & (TH_SYN),
  899                     ("%s: Listen socket: TH_SYN not set", __func__));
  900 #ifdef INET6
  901                 /*
  902                  * If deprecated address is forbidden,
  903                  * we do not accept SYN to deprecated interface
  904                  * address to prevent any new inbound connection from
  905                  * getting established.
  906                  * When we do not accept the SYN, we send a TCP RST
  907                  * with the deprecated source address (instead of dropping
  908                  * the SYN).  We accept this compromise because it is much
  909                  * better for the peer if we send a RST, and the RST will
  910                  * be the final packet of the exchange.
  911                  *
  912                  * If we do not forbid deprecated addresses, we accept
  913                  * the SYN packet.  RFC2462 does not suggest dropping
  914                  * SYN in this case.
  915                  * Our reading of RFC2462 5.5.4 is as follows:
  916                  * 1. use of deprecated addr with existing
  917                  *    communication is okay - "SHOULD continue to be
  918                  *    used"
  919                  * 2. use of it with new communication:
  920                  *   (2a) "SHOULD NOT be used if alternate address
  921                  *        with sufficient scope is available"
  922                  *   (2b) nothing mentioned otherwise.
  923                  * Here we fall into (2b) case as we have no choice in
  924                  * our source address selection - we must obey the peer.
  925                  *
  926                  * The wording in RFC2462 is confusing, and there are
  927                  * multiple descriptions of deprecated address
  928                  * handling - worse, they are not exactly the same.
  929                  * I believe 5.5.4 is the best one, so we follow 5.5.4.
  930                  */
  931                 if (isipv6 && !V_ip6_use_deprecated) {
  932                         struct in6_ifaddr *ia6;
  933 
  934                         ia6 = ip6_getdstifaddr(m);
  935                         if (ia6 != NULL &&
  936                             (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
  937                                 ifa_free(&ia6->ia_ifa);
  938                                 if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  939                                     log(LOG_DEBUG, "%s; %s: Listen socket: "
  940                                         "Connection attempt to deprecated "
  941                                         "IPv6 address rejected\n",
  942                                         s, __func__);
  943                                 rstreason = BANDLIM_RST_OPENPORT;
  944                                 goto dropwithreset;
  945                         }
  946                         if (ia6)
  947                                 ifa_free(&ia6->ia_ifa);
  948                 }
  949 #endif
  950                 /*
  951                  * Basic sanity checks on incoming SYN requests:
  952                  *   Don't respond if the destination is a link layer
  953                  *      broadcast according to RFC1122 4.2.3.10, p. 104.
  954                  *   If it is from this socket it must be forged.
  955                  *   Don't respond if the source or destination is a
  956                  *      global or subnet broad- or multicast address.
  957                  *   Note that it is quite possible to receive unicast
  958                  *      link-layer packets with a broadcast IP address. Use
  959                  *      in_broadcast() to find them.
  960                  */
  961                 if (m->m_flags & (M_BCAST|M_MCAST)) {
  962                         if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  963                             log(LOG_DEBUG, "%s; %s: Listen socket: "
  964                                 "Connection attempt from broad- or multicast "
  965                                 "link layer address ignored\n", s, __func__);
  966                         goto dropunlock;
  967                 }
  968                 if (isipv6) {
  969 #ifdef INET6
  970                         if (th->th_dport == th->th_sport &&
  971                             IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &ip6->ip6_src)) {
  972                                 if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  973                                     log(LOG_DEBUG, "%s; %s: Listen socket: "
  974                                         "Connection attempt to/from self "
  975                                         "ignored\n", s, __func__);
  976                                 goto dropunlock;
  977                         }
  978                         if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
  979                             IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
  980                                 if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  981                                     log(LOG_DEBUG, "%s; %s: Listen socket: "
  982                                         "Connection attempt from/to multicast "
  983                                         "address ignored\n", s, __func__);
  984                                 goto dropunlock;
  985                         }
  986 #endif
  987                 } else {
  988                         if (th->th_dport == th->th_sport &&
  989                             ip->ip_dst.s_addr == ip->ip_src.s_addr) {
  990                                 if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
  991                                     log(LOG_DEBUG, "%s; %s: Listen socket: "
  992                                         "Connection attempt from/to self "
  993                                         "ignored\n", s, __func__);
  994                                 goto dropunlock;
  995                         }
  996                         if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
  997                             IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
  998                             ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
  999                             in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
 1000                                 if ((s = tcp_log_addrs(&inc, th, NULL, NULL)))
 1001                                     log(LOG_DEBUG, "%s; %s: Listen socket: "
 1002                                         "Connection attempt from/to broad- "
 1003                                         "or multicast address ignored\n",
 1004                                         s, __func__);
 1005                                 goto dropunlock;
 1006                         }
 1007                 }
 1008                 /*
 1009                  * SYN appears to be valid.  Create compressed TCP state
 1010                  * for syncache.
 1011                  */
 1012 #ifdef TCPDEBUG
 1013                 if (so->so_options & SO_DEBUG)
 1014                         tcp_trace(TA_INPUT, ostate, tp,
 1015                             (void *)tcp_saveipgen, &tcp_savetcp, 0);
 1016 #endif
 1017                 tcp_dooptions(&to, optp, optlen, TO_SYN);
 1018                 syncache_add(&inc, &to, th, inp, &so, m);
 1019                 /*
 1020                  * Entry added to syncache and mbuf consumed.
 1021                  * Everything already unlocked by syncache_add().
 1022                  */
 1023                 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 1024                 return;
 1025         }
 1026 
 1027         /*
 1028          * Segment belongs to a connection in SYN_SENT, ESTABLISHED or later
 1029          * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
 1030          * the inpcb, and unlocks pcbinfo.
 1031          */
 1032         tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
 1033         INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 1034         return;
 1035 
 1036 dropwithreset:
 1037         if (ti_locked == TI_RLOCKED)
 1038                 INP_INFO_RUNLOCK(&V_tcbinfo);
 1039         else if (ti_locked == TI_WLOCKED)
 1040                 INP_INFO_WUNLOCK(&V_tcbinfo);
 1041         else
 1042                 panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
 1043         ti_locked = TI_UNLOCKED;
 1044 
 1045         if (inp != NULL) {
 1046                 tcp_dropwithreset(m, th, tp, tlen, rstreason);
 1047                 INP_WUNLOCK(inp);
 1048         } else
 1049                 tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 1050         m = NULL;       /* mbuf chain got consumed. */
 1051         goto drop;
 1052 
 1053 dropunlock:
 1054         if (ti_locked == TI_RLOCKED)
 1055                 INP_INFO_RUNLOCK(&V_tcbinfo);
 1056         else if (ti_locked == TI_WLOCKED)
 1057                 INP_INFO_WUNLOCK(&V_tcbinfo);
 1058         else
 1059                 panic("%s: dropunlock ti_locked %d", __func__, ti_locked);
 1060         ti_locked = TI_UNLOCKED;
 1061 
 1062         if (inp != NULL)
 1063                 INP_WUNLOCK(inp);
 1064 
 1065 drop:
 1066         INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 1067         if (s != NULL)
 1068                 free(s, M_TCPLOG);
 1069         if (m != NULL)
 1070                 m_freem(m);
 1071 }
 1072 
 1073 static void
 1074 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 1075     struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
 1076     int ti_locked)
 1077 {
 1078         int thflags, acked, ourfinisacked, needoutput = 0;
 1079         int rstreason, todrop, win;
 1080         u_long tiwin;
 1081         struct tcpopt to;
 1082 
 1083 #ifdef TCPDEBUG
 1084         /*
 1085          * The size of tcp_saveipgen must be the size of the max ip header,
 1086          * now IPv6.
 1087          */
 1088         u_char tcp_saveipgen[IP6_HDR_LEN];
 1089         struct tcphdr tcp_savetcp;
 1090         short ostate = 0;
 1091 #endif
 1092         thflags = th->th_flags;
 1093 
 1094         /*
 1095          * If this is either a state-changing packet or current state isn't
 1096          * established, we require a write lock on tcbinfo.  Otherwise, we
 1097          * allow either a read lock or a write lock, as we may have acquired
 1098          * a write lock due to a race.
 1099          *
 1100          * Require a global write lock for SYN/FIN/RST segments or
 1101          * non-established connections; otherwise accept either a read or
 1102          * write lock, as we may have conservatively acquired a write lock in
 1103          * certain cases in tcp_input() (is this still true?).  Currently we
 1104          * will never enter with no lock, so we try to drop it quickly in the
 1105          * common pure ack/pure data cases.
 1106          */
 1107         if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
 1108             tp->t_state != TCPS_ESTABLISHED) {
 1109                 KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for "
 1110                     "SYN/FIN/RST/!EST", __func__, ti_locked));
 1111                 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 1112         } else {
 1113 #ifdef INVARIANTS
 1114                 if (ti_locked == TI_RLOCKED)
 1115                         INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 1116                 else if (ti_locked == TI_WLOCKED)
 1117                         INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 1118                 else
 1119                         panic("%s: ti_locked %d for EST", __func__,
 1120                             ti_locked);
 1121 #endif
 1122         }
 1123         INP_WLOCK_ASSERT(tp->t_inpcb);
 1124         KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 1125             __func__));
 1126         KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
 1127             __func__));
 1128 
 1129         /*
 1130          * Segment received on connection.
 1131          * Reset idle time and keep-alive timer.
 1132          * XXX: This should be done after segment
 1133          * validation to ignore broken/spoofed segs.
 1134          */
 1135         tp->t_rcvtime = ticks;
 1136         if (TCPS_HAVEESTABLISHED(tp->t_state))
 1137                 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
 1138 
 1139         /*
 1140          * Unscale the window into a 32-bit value.
 1141          * For the SYN_SENT state the scale is zero.
 1142          */
 1143         tiwin = th->th_win << tp->snd_scale;
 1144 
 1145         /*
 1146          * TCP ECN processing.
 1147          */
 1148         if (tp->t_flags & TF_ECN_PERMIT) {
 1149                 if (thflags & TH_CWR)
 1150                         tp->t_flags &= ~TF_ECN_SND_ECE;
 1151                 switch (iptos & IPTOS_ECN_MASK) {
 1152                 case IPTOS_ECN_CE:
 1153                         tp->t_flags |= TF_ECN_SND_ECE;
 1154                         TCPSTAT_INC(tcps_ecn_ce);
 1155                         break;
 1156                 case IPTOS_ECN_ECT0:
 1157                         TCPSTAT_INC(tcps_ecn_ect0);
 1158                         break;
 1159                 case IPTOS_ECN_ECT1:
 1160                         TCPSTAT_INC(tcps_ecn_ect1);
 1161                         break;
 1162                 }
 1163                 /*
 1164                  * Congestion experienced.
 1165                  * Ignore if we are already trying to recover.
 1166                  */
 1167                 if ((thflags & TH_ECE) &&
 1168                     SEQ_LEQ(th->th_ack, tp->snd_recover)) {
 1169                         TCPSTAT_INC(tcps_ecn_rcwnd);
 1170                         tcp_congestion_exp(tp);
 1171                 }
 1172         }
 1173 
 1174         /*
 1175          * Parse options on any incoming segment.
 1176          */
 1177         tcp_dooptions(&to, (u_char *)(th + 1),
 1178             (th->th_off << 2) - sizeof(struct tcphdr),
 1179             (thflags & TH_SYN) ? TO_SYN : 0);
 1180 
 1181         /*
 1182          * If echoed timestamp is later than the current time,
 1183          * fall back to non RFC1323 RTT calculation.  Normalize
 1184          * timestamp if syncookies were used when this connection
 1185          * was established.
 1186          */
 1187         if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
 1188                 to.to_tsecr -= tp->ts_offset;
 1189                 if (TSTMP_GT(to.to_tsecr, ticks))
 1190                         to.to_tsecr = 0;
 1191         }
 1192 
 1193         /*
 1194          * Process options only when we get SYN/ACK back. The SYN case
 1195          * for incoming connections is handled in tcp_syncache.
 1196          * According to RFC1323 the window field in a SYN (i.e., a <SYN>
 1197          * or <SYN,ACK>) segment itself is never scaled.
 1198          * XXX this is traditional behavior, may need to be cleaned up.
 1199          */
 1200         if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
 1201                 if ((to.to_flags & TOF_SCALE) &&
 1202                     (tp->t_flags & TF_REQ_SCALE)) {
 1203                         tp->t_flags |= TF_RCVD_SCALE;
 1204                         tp->snd_scale = to.to_wscale;
 1205                 }
 1206                 /*
 1207                  * Initial send window.  It will be updated with
 1208                  * the next incoming segment to the scaled value.
 1209                  */
 1210                 tp->snd_wnd = th->th_win;
 1211                 if (to.to_flags & TOF_TS) {
 1212                         tp->t_flags |= TF_RCVD_TSTMP;
 1213                         tp->ts_recent = to.to_tsval;
 1214                         tp->ts_recent_age = ticks;
 1215                 }
 1216                 if (to.to_flags & TOF_MSS)
 1217                         tcp_mss(tp, to.to_mss);
 1218                 if ((tp->t_flags & TF_SACK_PERMIT) &&
 1219                     (to.to_flags & TOF_SACKPERM) == 0)
 1220                         tp->t_flags &= ~TF_SACK_PERMIT;
 1221         }
 1222 
 1223         /*
 1224          * Header prediction: check for the two common cases
 1225          * of a uni-directional data xfer.  If the packet has
 1226          * no control flags, is in-sequence, the window didn't
 1227          * change and we're not retransmitting, it's a
 1228          * candidate.  If the length is zero and the ack moved
 1229          * forward, we're the sender side of the xfer.  Just
 1230          * free the data acked & wake any higher level process
 1231          * that was blocked waiting for space.  If the length
 1232          * is non-zero and the ack didn't move, we're the
 1233          * receiver side.  If we're getting packets in-order
 1234          * (the reassembly queue is empty), add the data to
 1235          * the socket buffer and note that we need a delayed ack.
 1236          * Make sure that the hidden state-flags are also off.
 1237          * Since we check for TCPS_ESTABLISHED first, it can only
 1238          * be TH_NEEDSYN.
 1239          */
 1240         if (tp->t_state == TCPS_ESTABLISHED &&
 1241             th->th_seq == tp->rcv_nxt &&
 1242             (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
 1243             tp->snd_nxt == tp->snd_max &&
 1244             tiwin && tiwin == tp->snd_wnd && 
 1245             ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
 1246             LIST_EMPTY(&tp->t_segq) &&
 1247             ((to.to_flags & TOF_TS) == 0 ||
 1248              TSTMP_GEQ(to.to_tsval, tp->ts_recent)) ) {
 1249 
 1250                 /*
 1251                  * If last ACK falls within this segment's sequence numbers,
 1252                  * record the timestamp.
 1253                  * NOTE that the test is modified according to the latest
 1254                  * proposal of the tcplw@cray.com list (Braden 1993/04/26).
 1255                  */
 1256                 if ((to.to_flags & TOF_TS) != 0 &&
 1257                     SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 1258                         tp->ts_recent_age = ticks;
 1259                         tp->ts_recent = to.to_tsval;
 1260                 }
 1261 
 1262                 if (tlen == 0) {
 1263                         if (SEQ_GT(th->th_ack, tp->snd_una) &&
 1264                             SEQ_LEQ(th->th_ack, tp->snd_max) &&
 1265                             tp->snd_cwnd >= tp->snd_wnd &&
 1266                             ((!V_tcp_do_newreno &&
 1267                               !(tp->t_flags & TF_SACK_PERMIT) &&
 1268                               tp->t_dupacks < tcprexmtthresh) ||
 1269                              ((V_tcp_do_newreno ||
 1270                                (tp->t_flags & TF_SACK_PERMIT)) &&
 1271                               !IN_FASTRECOVERY(tp) &&
 1272                               (to.to_flags & TOF_SACK) == 0 &&
 1273                               TAILQ_EMPTY(&tp->snd_holes)))) {
 1274                                 /*
 1275                                  * This is a pure ack for outstanding data.
 1276                                  */
 1277                                 if (ti_locked == TI_RLOCKED)
 1278                                         INP_INFO_RUNLOCK(&V_tcbinfo);
 1279                                 else if (ti_locked == TI_WLOCKED)
 1280                                         INP_INFO_WUNLOCK(&V_tcbinfo);
 1281                                 else
 1282                                         panic("%s: ti_locked %d on pure ACK",
 1283                                             __func__, ti_locked);
 1284                                 ti_locked = TI_UNLOCKED;
 1285 
 1286                                 TCPSTAT_INC(tcps_predack);
 1287 
 1288                                 /*
 1289                                  * "bad retransmit" recovery.
 1290                                  */
 1291                                 if (tp->t_rxtshift == 1 &&
 1292                                     (int)(ticks - tp->t_badrxtwin) < 0) {
 1293                                         TCPSTAT_INC(tcps_sndrexmitbad);
 1294                                         tp->snd_cwnd = tp->snd_cwnd_prev;
 1295                                         tp->snd_ssthresh =
 1296                                             tp->snd_ssthresh_prev;
 1297                                         tp->snd_recover = tp->snd_recover_prev;
 1298                                         if (tp->t_flags & TF_WASFRECOVERY)
 1299                                             ENTER_FASTRECOVERY(tp);
 1300                                         tp->snd_nxt = tp->snd_max;
 1301                                         tp->t_badrxtwin = 0;
 1302                                 }
 1303 
 1304                                 /*
 1305                                  * Recalculate the transmit timer / rtt.
 1306                                  *
 1307                                  * Some boxes send broken timestamp replies
 1308                                  * during the SYN+ACK phase, ignore
 1309                                  * timestamps of 0 or we could calculate a
 1310                                  * huge RTT and blow up the retransmit timer.
 1311                                  */
 1312                                 if ((to.to_flags & TOF_TS) != 0 &&
 1313                                     to.to_tsecr) {
 1314                                         if (!tp->t_rttlow ||
 1315                                             tp->t_rttlow > ticks - to.to_tsecr)
 1316                                                 tp->t_rttlow = ticks - to.to_tsecr;
 1317                                         tcp_xmit_timer(tp,
 1318                                             ticks - to.to_tsecr + 1);
 1319                                 } else if (tp->t_rtttime &&
 1320                                     SEQ_GT(th->th_ack, tp->t_rtseq)) {
 1321                                         if (!tp->t_rttlow ||
 1322                                             tp->t_rttlow > ticks - tp->t_rtttime)
 1323                                                 tp->t_rttlow = ticks - tp->t_rtttime;
 1324                                         tcp_xmit_timer(tp,
 1325                                                         ticks - tp->t_rtttime);
 1326                                 }
 1327                                 tcp_xmit_bandwidth_limit(tp, th->th_ack);
 1328                                 acked = th->th_ack - tp->snd_una;
 1329                                 TCPSTAT_INC(tcps_rcvackpack);
 1330                                 TCPSTAT_ADD(tcps_rcvackbyte, acked);
 1331                                 sbdrop(&so->so_snd, acked);
 1332                                 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
 1333                                     SEQ_LEQ(th->th_ack, tp->snd_recover))
 1334                                         tp->snd_recover = th->th_ack - 1;
 1335                                 tp->snd_una = th->th_ack;
 1336                                 /*
 1337                                  * Pull snd_wl2 up to prevent seq wrap relative
 1338                                  * to th_ack.
 1339                                  */
 1340                                 tp->snd_wl2 = th->th_ack;
 1341                                 tp->t_dupacks = 0;
 1342                                 m_freem(m);
 1343                                 ND6_HINT(tp); /* Some progress has been made. */
 1344 
 1345                                 /*
 1346                                  * If all outstanding data are acked, stop
 1347                                  * retransmit timer, otherwise restart timer
 1348                                  * using current (possibly backed-off) value.
 1349                                  * If process is waiting for space,
 1350                                  * wakeup/selwakeup/signal.  If data
 1351                                  * are ready to send, let tcp_output
 1352                                  * decide between more output or persist.
 1353                                  */
 1354 #ifdef TCPDEBUG
 1355                                 if (so->so_options & SO_DEBUG)
 1356                                         tcp_trace(TA_INPUT, ostate, tp,
 1357                                             (void *)tcp_saveipgen,
 1358                                             &tcp_savetcp, 0);
 1359 #endif
 1360                                 if (tp->snd_una == tp->snd_max)
 1361                                         tcp_timer_activate(tp, TT_REXMT, 0);
 1362                                 else if (!tcp_timer_active(tp, TT_PERSIST))
 1363                                         tcp_timer_activate(tp, TT_REXMT,
 1364                                                       tp->t_rxtcur);
 1365                                 sowwakeup(so);
 1366                                 if (so->so_snd.sb_cc)
 1367                                         (void) tcp_output(tp);
 1368                                 goto check_delack;
 1369                         }
 1370                 } else if (th->th_ack == tp->snd_una &&
 1371                     tlen <= sbspace(&so->so_rcv)) {
 1372                         int newsize = 0;        /* automatic sockbuf scaling */
 1373 
 1374                         /*
 1375                          * This is a pure, in-sequence data packet with
 1376                          * nothing on the reassembly queue and we have enough
 1377                          * buffer space to take it.
 1378                          */
 1379                         if (ti_locked == TI_RLOCKED)
 1380                                 INP_INFO_RUNLOCK(&V_tcbinfo);
 1381                         else if (ti_locked == TI_WLOCKED)
 1382                                 INP_INFO_WUNLOCK(&V_tcbinfo);
 1383                         else
 1384                                 panic("%s: ti_locked %d on pure data "
 1385                                     "segment", __func__, ti_locked);
 1386                         ti_locked = TI_UNLOCKED;
 1387 
 1388                         /* Clean receiver SACK report if present */
 1389                         if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
 1390                                 tcp_clean_sackreport(tp);
 1391                         TCPSTAT_INC(tcps_preddat);
 1392                         tp->rcv_nxt += tlen;
 1393                         /*
 1394                          * Pull snd_wl1 up to prevent seq wrap relative to
 1395                          * th_seq.
 1396                          */
 1397                         tp->snd_wl1 = th->th_seq;
 1398                         /*
 1399                          * Pull rcv_up up to prevent seq wrap relative to
 1400                          * rcv_nxt.
 1401                          */
 1402                         tp->rcv_up = tp->rcv_nxt;
 1403                         TCPSTAT_INC(tcps_rcvpack);
 1404                         TCPSTAT_ADD(tcps_rcvbyte, tlen);
 1405                         ND6_HINT(tp);   /* Some progress has been made */
 1406 #ifdef TCPDEBUG
 1407                         if (so->so_options & SO_DEBUG)
 1408                                 tcp_trace(TA_INPUT, ostate, tp,
 1409                                     (void *)tcp_saveipgen, &tcp_savetcp, 0);
 1410 #endif
 1411                 /*
 1412                  * Automatic sizing of receive socket buffer.  Often the send
 1413                  * buffer size is not optimally adjusted to the actual network
 1414                  * conditions at hand (delay bandwidth product).  Setting the
 1415                  * buffer size too small limits throughput on links with high
 1416                  * bandwidth and high delay (eg. trans-continental/oceanic links).
 1417                  *
 1418                  * On the receive side the socket buffer memory is only rarely
 1419                  * used to any significant extent.  This allows us to be much
 1420                  * more aggressive in scaling the receive socket buffer.  For
 1421                  * the case that the buffer space is actually used to a large
 1422                  * extent and we run out of kernel memory we can simply drop
 1423                  * the new segments; TCP on the sender will just retransmit it
 1424                  * later.  Setting the buffer size too big may only consume too
 1425                  * much kernel memory if the application doesn't read() from
 1426                  * the socket or packet loss or reordering makes use of the
 1427                  * reassembly queue.
 1428                  *
 1429                  * The criteria to step up the receive buffer one notch are:
 1430                  *  1. the number of bytes received during the time it takes
 1431                  *     one timestamp to be reflected back to us (the RTT);
 1432                  *  2. received bytes per RTT is within seven eighths of the
 1433                  *     current socket buffer size;
 1434                  *  3. receive buffer size has not hit maximal automatic size;
 1435                  *
 1436                  * This algorithm does one step per RTT at most and only if
 1437                  * we receive a bulk stream w/o packet losses or reorderings.
 1438                  * Shrinking the buffer during idle times is not necessary as
 1439                  * it doesn't consume any memory when idle.
 1440                  *
 1441                  * TODO: Only step up if the application is actually serving
 1442                  * the buffer to better manage the socket buffer resources.
 1443                  */
 1444                         if (V_tcp_do_autorcvbuf &&
 1445                             to.to_tsecr &&
 1446                             (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
 1447                                 if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
 1448                                     to.to_tsecr - tp->rfbuf_ts < hz) {
 1449                                         if (tp->rfbuf_cnt >
 1450                                             (so->so_rcv.sb_hiwat / 8 * 7) &&
 1451                                             so->so_rcv.sb_hiwat <
 1452                                             V_tcp_autorcvbuf_max) {
 1453                                                 newsize =
 1454                                                     min(so->so_rcv.sb_hiwat +
 1455                                                     V_tcp_autorcvbuf_inc,
 1456                                                     V_tcp_autorcvbuf_max);
 1457                                         }
 1458                                         /* Start over with next RTT. */
 1459                                         tp->rfbuf_ts = 0;
 1460                                         tp->rfbuf_cnt = 0;
 1461                                 } else
 1462                                         tp->rfbuf_cnt += tlen;  /* add up */
 1463                         }
 1464 
 1465                         /* Add data to socket buffer. */
 1466                         SOCKBUF_LOCK(&so->so_rcv);
 1467                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 1468                                 m_freem(m);
 1469                         } else {
 1470                                 /*
 1471                                  * Set new socket buffer size.
 1472                                  * Give up when limit is reached.
 1473                                  */
 1474                                 if (newsize)
 1475                                         if (!sbreserve_locked(&so->so_rcv,
 1476                                             newsize, so, NULL))
 1477                                                 so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 1478                                 m_adj(m, drop_hdrlen);  /* delayed header drop */
 1479                                 sbappendstream_locked(&so->so_rcv, m);
 1480                         }
 1481                         /* NB: sorwakeup_locked() does an implicit unlock. */
 1482                         sorwakeup_locked(so);
 1483                         if (DELAY_ACK(tp)) {
 1484                                 tp->t_flags |= TF_DELACK;
 1485                         } else {
 1486                                 tp->t_flags |= TF_ACKNOW;
 1487                                 tcp_output(tp);
 1488                         }
 1489                         goto check_delack;
 1490                 }
 1491         }
 1492 
 1493         /*
 1494          * Calculate amount of space in receive window,
 1495          * and then do TCP input processing.
 1496          * Receive window is amount of space in rcv queue,
 1497          * but not less than advertised window.
 1498          */
 1499         win = sbspace(&so->so_rcv);
 1500         if (win < 0)
 1501                 win = 0;
 1502         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 1503 
 1504         /* Reset receive buffer auto scaling when not in bulk receive mode. */
 1505         tp->rfbuf_ts = 0;
 1506         tp->rfbuf_cnt = 0;
 1507 
 1508         switch (tp->t_state) {
 1509 
 1510         /*
 1511          * If the state is SYN_RECEIVED:
 1512          *      if seg contains an ACK, but not for our SYN/ACK, send a RST.
 1513          */
 1514         case TCPS_SYN_RECEIVED:
 1515                 if ((thflags & TH_ACK) &&
 1516                     (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 1517                      SEQ_GT(th->th_ack, tp->snd_max))) {
 1518                                 rstreason = BANDLIM_RST_OPENPORT;
 1519                                 goto dropwithreset;
 1520                 }
 1521                 break;
 1522 
 1523         /*
 1524          * If the state is SYN_SENT:
 1525          *      if seg contains an ACK, but not for our SYN, drop the input.
 1526          *      if seg contains a RST, then drop the connection.
 1527          *      if seg does not contain SYN, then drop it.
 1528          * Otherwise this is an acceptable SYN segment
 1529          *      initialize tp->rcv_nxt and tp->irs
 1530          *      if seg contains ack then advance tp->snd_una
 1531          *      if seg contains an ECE and ECN support is enabled, the stream
 1532          *          is ECN capable.
 1533          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 1534          *      arrange for segment to be acked (eventually)
 1535          *      continue processing rest of data/controls, beginning with URG
 1536          */
 1537         case TCPS_SYN_SENT:
 1538                 if ((thflags & TH_ACK) &&
 1539                     (SEQ_LEQ(th->th_ack, tp->iss) ||
 1540                      SEQ_GT(th->th_ack, tp->snd_max))) {
 1541                         rstreason = BANDLIM_UNLIMITED;
 1542                         goto dropwithreset;
 1543                 }
 1544                 if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST))
 1545                         tp = tcp_drop(tp, ECONNREFUSED);
 1546                 if (thflags & TH_RST)
 1547                         goto drop;
 1548                 if (!(thflags & TH_SYN))
 1549                         goto drop;
 1550 
 1551                 tp->irs = th->th_seq;
 1552                 tcp_rcvseqinit(tp);
 1553                 if (thflags & TH_ACK) {
 1554                         TCPSTAT_INC(tcps_connects);
 1555                         soisconnected(so);
 1556 #ifdef MAC
 1557                         mac_socketpeer_set_from_mbuf(m, so);
 1558 #endif
 1559                         /* Do window scaling on this connection? */
 1560                         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 1561                                 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 1562                                 tp->rcv_scale = tp->request_r_scale;
 1563                         }
 1564                         tp->rcv_adv += tp->rcv_wnd;
 1565                         tp->snd_una++;          /* SYN is acked */
 1566                         /*
 1567                          * If there's data, delay ACK; if there's also a FIN
 1568                          * ACKNOW will be turned on later.
 1569                          */
 1570                         if (DELAY_ACK(tp) && tlen != 0)
 1571                                 tcp_timer_activate(tp, TT_DELACK,
 1572                                     tcp_delacktime);
 1573                         else
 1574                                 tp->t_flags |= TF_ACKNOW;
 1575 
 1576                         if ((thflags & TH_ECE) && V_tcp_do_ecn) {
 1577                                 tp->t_flags |= TF_ECN_PERMIT;
 1578                                 TCPSTAT_INC(tcps_ecn_shs);
 1579                         }
 1580                         
 1581                         /*
 1582                          * Received <SYN,ACK> in SYN_SENT[*] state.
 1583                          * Transitions:
 1584                          *      SYN_SENT  --> ESTABLISHED
 1585                          *      SYN_SENT* --> FIN_WAIT_1
 1586                          */
 1587                         tp->t_starttime = ticks;
 1588                         if (tp->t_flags & TF_NEEDFIN) {
 1589                                 tp->t_state = TCPS_FIN_WAIT_1;
 1590                                 tp->t_flags &= ~TF_NEEDFIN;
 1591                                 thflags &= ~TH_SYN;
 1592                         } else {
 1593                                 tp->t_state = TCPS_ESTABLISHED;
 1594                                 tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
 1595                         }
 1596                 } else {
 1597                         /*
 1598                          * Received initial SYN in SYN-SENT[*] state =>
 1599                          * simultaneous open.  If segment contains CC option
 1600                          * and there is a cached CC, apply TAO test.
 1601                          * If it succeeds, connection is half-synchronized.
 1602                          * Otherwise, do 3-way handshake:
 1603                          *        SYN-SENT -> SYN-RECEIVED
 1604                          *        SYN-SENT* -> SYN-RECEIVED*
 1605                          * If there was no CC option, clear cached CC value.
 1606                          */
 1607                         tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
 1608                         tcp_timer_activate(tp, TT_REXMT, 0);
 1609                         tp->t_state = TCPS_SYN_RECEIVED;
 1610                 }
 1611 
 1612                 KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: "
 1613                     "ti_locked %d", __func__, ti_locked));
 1614                 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 1615                 INP_WLOCK_ASSERT(tp->t_inpcb);
 1616 
 1617                 /*
 1618                  * Advance th->th_seq to correspond to first data byte.
 1619                  * If data, trim to stay within window,
 1620                  * dropping FIN if necessary.
 1621                  */
 1622                 th->th_seq++;
 1623                 if (tlen > tp->rcv_wnd) {
 1624                         todrop = tlen - tp->rcv_wnd;
 1625                         m_adj(m, -todrop);
 1626                         tlen = tp->rcv_wnd;
 1627                         thflags &= ~TH_FIN;
 1628                         TCPSTAT_INC(tcps_rcvpackafterwin);
 1629                         TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 1630                 }
 1631                 tp->snd_wl1 = th->th_seq - 1;
 1632                 tp->rcv_up = th->th_seq;
 1633                 /*
 1634                  * Client side of transaction: already sent SYN and data.
 1635                  * If the remote host used T/TCP to validate the SYN,
 1636                  * our data will be ACK'd; if so, enter normal data segment
 1637                  * processing in the middle of step 5, ack processing.
 1638                  * Otherwise, goto step 6.
 1639                  */
 1640                 if (thflags & TH_ACK)
 1641                         goto process_ACK;
 1642 
 1643                 goto step6;
 1644 
 1645         /*
 1646          * If the state is LAST_ACK or CLOSING or TIME_WAIT:
 1647          *      do normal processing.
 1648          *
 1649          * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
 1650          */
 1651         case TCPS_LAST_ACK:
 1652         case TCPS_CLOSING:
 1653                 break;  /* continue normal processing */
 1654         }
 1655 
 1656         /*
 1657          * States other than LISTEN or SYN_SENT.
 1658          * First check the RST flag and sequence number since reset segments
 1659          * are exempt from the timestamp and connection count tests.  This
 1660          * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
 1661          * below which allowed reset segments in half the sequence space
 1662          * to fall through and be processed (which gives forged reset
 1663          * segments with a random sequence number a 50 percent chance of
 1664          * killing a connection).
 1665          * Then check timestamp, if present.
 1666          * Then check the connection count, if present.
 1667          * Then check that at least some bytes of segment are within
 1668          * receive window.  If segment begins before rcv_nxt,
 1669          * drop leading data (and SYN); if nothing left, just ack.
 1670          *
 1671          *
 1672          * If the RST bit is set, check the sequence number to see
 1673          * if this is a valid reset segment.
 1674          * RFC 793 page 37:
 1675          *   In all states except SYN-SENT, all reset (RST) segments
 1676          *   are validated by checking their SEQ-fields.  A reset is
 1677          *   valid if its sequence number is in the window.
 1678          * Note: this does not take into account delayed ACKs, so
 1679          *   we should test against last_ack_sent instead of rcv_nxt.
 1680          *   The sequence number in the reset segment is normally an
 1681          *   echo of our outgoing acknowledgement numbers, but some hosts
 1682          *   send a reset with the sequence number at the rightmost edge
 1683          *   of our receive window, and we have to handle this case.
 1684          * Note 2: Paul Watson's paper "Slipping in the Window" has shown
 1685          *   that brute force RST attacks are possible.  To combat this,
 1686          *   we use a much stricter check while in the ESTABLISHED state,
 1687          *   only accepting RSTs where the sequence number is equal to
 1688          *   last_ack_sent.  In all other states (the states in which a
 1689          *   RST is more likely), the more permissive check is used.
 1690          * If we have multiple segments in flight, the initial reset
 1691          * segment sequence numbers will be to the left of last_ack_sent,
 1692          * but they will eventually catch up.
 1693          * In any case, it never made sense to trim reset segments to
 1694          * fit the receive window since RFC 1122 says:
 1695          *   4.2.2.12  RST Segment: RFC-793 Section 3.4
 1696          *
 1697          *    A TCP SHOULD allow a received RST segment to include data.
 1698          *
 1699          *    DISCUSSION
 1700          *         It has been suggested that a RST segment could contain
 1701          *         ASCII text that encoded and explained the cause of the
 1702          *         RST.  No standard has yet been established for such
 1703          *         data.
 1704          *
 1705          * If the reset segment passes the sequence number test examine
 1706          * the state:
 1707          *    SYN_RECEIVED STATE:
 1708          *      If passive open, return to LISTEN state.
 1709          *      If active open, inform user that connection was refused.
 1710          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
 1711          *      Inform user that connection was reset, and close tcb.
 1712          *    CLOSING, LAST_ACK STATES:
 1713          *      Close the tcb.
 1714          *    TIME_WAIT STATE:
 1715          *      Drop the segment - see Stevens, vol. 2, p. 964 and
 1716          *      RFC 1337.
 1717          */
 1718         if (thflags & TH_RST) {
 1719                 if (SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
 1720                     SEQ_LEQ(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
 1721                         switch (tp->t_state) {
 1722 
 1723                         case TCPS_SYN_RECEIVED:
 1724                                 so->so_error = ECONNREFUSED;
 1725                                 goto close;
 1726 
 1727                         case TCPS_ESTABLISHED:
 1728                                 if (V_tcp_insecure_rst == 0 &&
 1729                                     !(SEQ_GEQ(th->th_seq, tp->rcv_nxt - 1) &&
 1730                                     SEQ_LEQ(th->th_seq, tp->rcv_nxt + 1)) &&
 1731                                     !(SEQ_GEQ(th->th_seq, tp->last_ack_sent - 1) &&
 1732                                     SEQ_LEQ(th->th_seq, tp->last_ack_sent + 1))) {
 1733                                         TCPSTAT_INC(tcps_badrst);
 1734                                         goto drop;
 1735                                 }
 1736                                 /* FALLTHROUGH */
 1737                         case TCPS_FIN_WAIT_1:
 1738                         case TCPS_FIN_WAIT_2:
 1739                         case TCPS_CLOSE_WAIT:
 1740                                 so->so_error = ECONNRESET;
 1741                         close:
 1742                                 KASSERT(ti_locked == TI_WLOCKED,
 1743                                     ("tcp_do_segment: TH_RST 1 ti_locked %d",
 1744                                     ti_locked));
 1745                                 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 1746 
 1747                                 tp->t_state = TCPS_CLOSED;
 1748                                 TCPSTAT_INC(tcps_drops);
 1749                                 tp = tcp_close(tp);
 1750                                 break;
 1751 
 1752                         case TCPS_CLOSING:
 1753                         case TCPS_LAST_ACK:
 1754                                 KASSERT(ti_locked == TI_WLOCKED,
 1755                                     ("tcp_do_segment: TH_RST 2 ti_locked %d",
 1756                                     ti_locked));
 1757                                 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 1758 
 1759                                 tp = tcp_close(tp);
 1760                                 break;
 1761                         }
 1762                 }
 1763                 goto drop;
 1764         }
 1765 
 1766         /*
 1767          * RFC 1323 PAWS: If we have a timestamp reply on this segment
 1768          * and it's less than ts_recent, drop it.
 1769          */
 1770         if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
 1771             TSTMP_LT(to.to_tsval, tp->ts_recent)) {
 1772 
 1773                 /* Check to see if ts_recent is over 24 days old.  */
 1774                 if (ticks - tp->ts_recent_age > TCP_PAWS_IDLE) {
 1775                         /*
 1776                          * Invalidate ts_recent.  If this segment updates
 1777                          * ts_recent, the age will be reset later and ts_recent
 1778                          * will get a valid value.  If it does not, setting
 1779                          * ts_recent to zero will at least satisfy the
 1780                          * requirement that zero be placed in the timestamp
 1781                          * echo reply when ts_recent isn't valid.  The
 1782                          * age isn't reset until we get a valid ts_recent
 1783                          * because we don't want out-of-order segments to be
 1784                          * dropped when ts_recent is old.
 1785                          */
 1786                         tp->ts_recent = 0;
 1787                 } else {
 1788                         TCPSTAT_INC(tcps_rcvduppack);
 1789                         TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
 1790                         TCPSTAT_INC(tcps_pawsdrop);
 1791                         if (tlen)
 1792                                 goto dropafterack;
 1793                         goto drop;
 1794                 }
 1795         }
 1796 
 1797         /*
 1798          * In the SYN-RECEIVED state, validate that the packet belongs to
 1799          * this connection before trimming the data to fit the receive
 1800          * window.  Check the sequence number versus IRS since we know
 1801          * the sequence numbers haven't wrapped.  This is a partial fix
 1802          * for the "LAND" DoS attack.
 1803          */
 1804         if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
 1805                 rstreason = BANDLIM_RST_OPENPORT;
 1806                 goto dropwithreset;
 1807         }
 1808 
 1809         todrop = tp->rcv_nxt - th->th_seq;
 1810         if (todrop > 0) {
 1811                 /*
 1812                  * If this is a duplicate SYN for our current connection,
 1813                  * advance over it and pretend it's not a SYN.
 1814                  */
 1815                 if (thflags & TH_SYN && th->th_seq == tp->irs) {
 1816                         thflags &= ~TH_SYN;
 1817                         th->th_seq++;
 1818                         if (th->th_urp > 1)
 1819                                 th->th_urp--;
 1820                         else
 1821                                 thflags &= ~TH_URG;
 1822                         todrop--;
 1823                 }
 1824                 /*
 1825                  * Following if statement from Stevens, vol. 2, p. 960.
 1826                  */
 1827                 if (todrop > tlen
 1828                     || (todrop == tlen && (thflags & TH_FIN) == 0)) {
 1829                         /*
 1830                          * Any valid FIN must be to the left of the window.
 1831                          * At this point the FIN must be a duplicate or out
 1832                          * of sequence; drop it.
 1833                          */
 1834                         thflags &= ~TH_FIN;
 1835 
 1836                         /*
 1837                          * Send an ACK to resynchronize and drop any data.
 1838                          * But keep on processing for RST or ACK.
 1839                          */
 1840                         tp->t_flags |= TF_ACKNOW;
 1841                         todrop = tlen;
 1842                         TCPSTAT_INC(tcps_rcvduppack);
 1843                         TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
 1844                 } else {
 1845                         TCPSTAT_INC(tcps_rcvpartduppack);
 1846                         TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
 1847                 }
 1848                 drop_hdrlen += todrop;  /* drop from the top afterwards */
 1849                 th->th_seq += todrop;
 1850                 tlen -= todrop;
 1851                 if (th->th_urp > todrop)
 1852                         th->th_urp -= todrop;
 1853                 else {
 1854                         thflags &= ~TH_URG;
 1855                         th->th_urp = 0;
 1856                 }
 1857         }
 1858 
 1859         /*
 1860          * If new data are received on a connection after the
 1861          * user processes are gone, then RST the other end.
 1862          */
 1863         if ((so->so_state & SS_NOFDREF) &&
 1864             tp->t_state > TCPS_CLOSE_WAIT && tlen) {
 1865                 char *s;
 1866 
 1867                 KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && "
 1868                     "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked));
 1869                 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 1870 
 1871                 if ((s = tcp_log_addrs(&tp->t_inpcb->inp_inc, th, NULL, NULL))) {
 1872                         log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data after socket "
 1873                             "was closed, sending RST and removing tcpcb\n",
 1874                             s, __func__, tcpstates[tp->t_state], tlen);
 1875                         free(s, M_TCPLOG);
 1876                 }
 1877                 tp = tcp_close(tp);
 1878                 TCPSTAT_INC(tcps_rcvafterclose);
 1879                 rstreason = BANDLIM_UNLIMITED;
 1880                 goto dropwithreset;
 1881         }
 1882 
 1883         /*
 1884          * If segment ends after window, drop trailing data
 1885          * (and PUSH and FIN); if nothing left, just ACK.
 1886          */
 1887         todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
 1888         if (todrop > 0) {
 1889                 TCPSTAT_INC(tcps_rcvpackafterwin);
 1890                 if (todrop >= tlen) {
 1891                         TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
 1892                         /*
 1893                          * If window is closed can only take segments at
 1894                          * window edge, and have to drop data and PUSH from
 1895                          * incoming segments.  Continue processing, but
 1896                          * remember to ack.  Otherwise, drop segment
 1897                          * and ack.
 1898                          */
 1899                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 1900                                 tp->t_flags |= TF_ACKNOW;
 1901                                 TCPSTAT_INC(tcps_rcvwinprobe);
 1902                         } else
 1903                                 goto dropafterack;
 1904                 } else
 1905                         TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
 1906                 m_adj(m, -todrop);
 1907                 tlen -= todrop;
 1908                 thflags &= ~(TH_PUSH|TH_FIN);
 1909         }
 1910 
 1911         /*
 1912          * If last ACK falls within this segment's sequence numbers,
 1913          * record its timestamp.
 1914          * NOTE: 
 1915          * 1) That the test incorporates suggestions from the latest
 1916          *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
 1917          * 2) That updating only on newer timestamps interferes with
 1918          *    our earlier PAWS tests, so this check should be solely
 1919          *    predicated on the sequence space of this segment.
 1920          * 3) That we modify the segment boundary check to be 
 1921          *        Last.ACK.Sent <= SEG.SEQ + SEG.Len  
 1922          *    instead of RFC1323's
 1923          *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
 1924          *    This modified check allows us to overcome RFC1323's
 1925          *    limitations as described in Stevens TCP/IP Illustrated
 1926          *    Vol. 2 p.869. In such cases, we can still calculate the
 1927          *    RTT correctly when RCV.NXT == Last.ACK.Sent.
 1928          */
 1929         if ((to.to_flags & TOF_TS) != 0 &&
 1930             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 1931             SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 1932                 ((thflags & (TH_SYN|TH_FIN)) != 0))) {
 1933                 tp->ts_recent_age = ticks;
 1934                 tp->ts_recent = to.to_tsval;
 1935         }
 1936 
 1937         /*
 1938          * If a SYN is in the window, then this is an
 1939          * error and we send an RST and drop the connection.
 1940          */
 1941         if (thflags & TH_SYN) {
 1942                 KASSERT(ti_locked == TI_WLOCKED,
 1943                     ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked));
 1944                 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 1945 
 1946                 tp = tcp_drop(tp, ECONNRESET);
 1947                 rstreason = BANDLIM_UNLIMITED;
 1948                 goto drop;
 1949         }
 1950 
 1951         /*
 1952          * If the ACK bit is off:  if in SYN-RECEIVED state or SENDSYN
 1953          * flag is on (half-synchronized state), then queue data for
 1954          * later processing; else drop segment and return.
 1955          */
 1956         if ((thflags & TH_ACK) == 0) {
 1957                 if (tp->t_state == TCPS_SYN_RECEIVED ||
 1958                     (tp->t_flags & TF_NEEDSYN))
 1959                         goto step6;
 1960                 else if (tp->t_flags & TF_ACKNOW)
 1961                         goto dropafterack;
 1962                 else
 1963                         goto drop;
 1964         }
 1965 
 1966         /*
 1967          * Ack processing.
 1968          */
 1969         switch (tp->t_state) {
 1970 
 1971         /*
 1972          * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
 1973          * ESTABLISHED state and continue processing.
 1974          * The ACK was checked above.
 1975          */
 1976         case TCPS_SYN_RECEIVED:
 1977 
 1978                 TCPSTAT_INC(tcps_connects);
 1979                 soisconnected(so);
 1980                 /* Do window scaling? */
 1981                 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 1982                         (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 1983                         tp->rcv_scale = tp->request_r_scale;
 1984                         tp->snd_wnd = tiwin;
 1985                 }
 1986                 /*
 1987                  * Make transitions:
 1988                  *      SYN-RECEIVED  -> ESTABLISHED
 1989                  *      SYN-RECEIVED* -> FIN-WAIT-1
 1990                  */
 1991                 tp->t_starttime = ticks;
 1992                 if (tp->t_flags & TF_NEEDFIN) {
 1993                         tp->t_state = TCPS_FIN_WAIT_1;
 1994                         tp->t_flags &= ~TF_NEEDFIN;
 1995                 } else {
 1996                         tp->t_state = TCPS_ESTABLISHED;
 1997                         tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
 1998                 }
 1999                 /*
 2000                  * If segment contains data or ACK, will call tcp_reass()
 2001                  * later; if not, do so now to pass queued data to user.
 2002                  */
 2003                 if (tlen == 0 && (thflags & TH_FIN) == 0)
 2004                         (void) tcp_reass(tp, (struct tcphdr *)0, 0,
 2005                             (struct mbuf *)0);
 2006                 tp->snd_wl1 = th->th_seq - 1;
 2007                 /* FALLTHROUGH */
 2008 
 2009         /*
 2010          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
 2011          * ACKs.  If the ack is in the range
 2012          *      tp->snd_una < th->th_ack <= tp->snd_max
 2013          * then advance tp->snd_una to th->th_ack and drop
 2014          * data from the retransmission queue.  If this ACK reflects
 2015          * more up to date window information we update our window information.
 2016          */
 2017         case TCPS_ESTABLISHED:
 2018         case TCPS_FIN_WAIT_1:
 2019         case TCPS_FIN_WAIT_2:
 2020         case TCPS_CLOSE_WAIT:
 2021         case TCPS_CLOSING:
 2022         case TCPS_LAST_ACK:
 2023                 if (SEQ_GT(th->th_ack, tp->snd_max)) {
 2024                         TCPSTAT_INC(tcps_rcvacktoomuch);
 2025                         goto dropafterack;
 2026                 }
 2027                 if ((tp->t_flags & TF_SACK_PERMIT) &&
 2028                     ((to.to_flags & TOF_SACK) ||
 2029                      !TAILQ_EMPTY(&tp->snd_holes)))
 2030                         tcp_sack_doack(tp, &to, th->th_ack);
 2031                 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 2032                         if (tlen == 0 && tiwin == tp->snd_wnd) {
 2033                                 TCPSTAT_INC(tcps_rcvdupack);
 2034                                 /*
 2035                                  * If we have outstanding data (other than
 2036                                  * a window probe), this is a completely
 2037                                  * duplicate ack (ie, window info didn't
 2038                                  * change), the ack is the biggest we've
 2039                                  * seen and we've seen exactly our rexmt
 2040                                  * threshold of them, assume a packet
 2041                                  * has been dropped and retransmit it.
 2042                                  * Kludge snd_nxt & the congestion
 2043                                  * window so we send only this one
 2044                                  * packet.
 2045                                  *
 2046                                  * We know we're losing at the current
 2047                                  * window size so do congestion avoidance
 2048                                  * (set ssthresh to half the current window
 2049                                  * and pull our congestion window back to
 2050                                  * the new ssthresh).
 2051                                  *
 2052                                  * Dup acks mean that packets have left the
 2053                                  * network (they're now cached at the receiver)
 2054                                  * so bump cwnd by the amount in the receiver
 2055                                  * to keep a constant cwnd packets in the
 2056                                  * network.
 2057                                  *
 2058                                  * When using TCP ECN, notify the peer that
 2059                                  * we reduced the cwnd.
 2060                                  */
 2061                                 if (!tcp_timer_active(tp, TT_REXMT) ||
 2062                                     th->th_ack != tp->snd_una)
 2063                                         tp->t_dupacks = 0;
 2064                                 else if (++tp->t_dupacks > tcprexmtthresh ||
 2065                                     ((V_tcp_do_newreno ||
 2066                                       (tp->t_flags & TF_SACK_PERMIT)) &&
 2067                                      IN_FASTRECOVERY(tp))) {
 2068                                         if ((tp->t_flags & TF_SACK_PERMIT) &&
 2069                                             IN_FASTRECOVERY(tp)) {
 2070                                                 int awnd;
 2071                                                 
 2072                                                 /*
 2073                                                  * Compute the amount of data in flight first.
 2074                                                  * We can inject new data into the pipe iff 
 2075                                                  * we have less than 1/2 the original window's  
 2076                                                  * worth of data in flight.
 2077                                                  */
 2078                                                 awnd = (tp->snd_nxt - tp->snd_fack) +
 2079                                                         tp->sackhint.sack_bytes_rexmit;
 2080                                                 if (awnd < tp->snd_ssthresh) {
 2081                                                         tp->snd_cwnd += tp->t_maxseg;
 2082                                                         if (tp->snd_cwnd > tp->snd_ssthresh)
 2083                                                                 tp->snd_cwnd = tp->snd_ssthresh;
 2084                                                 }
 2085                                         } else
 2086                                                 tp->snd_cwnd += tp->t_maxseg;
 2087                                         (void) tcp_output(tp);
 2088                                         goto drop;
 2089                                 } else if (tp->t_dupacks == tcprexmtthresh) {
 2090                                         tcp_seq onxt = tp->snd_nxt;
 2091 
 2092                                         /*
 2093                                          * If we're doing sack, check to
 2094                                          * see if we're already in sack
 2095                                          * recovery. If we're not doing sack,
 2096                                          * check to see if we're in newreno
 2097                                          * recovery.
 2098                                          */
 2099                                         if (tp->t_flags & TF_SACK_PERMIT) {
 2100                                                 if (IN_FASTRECOVERY(tp)) {
 2101                                                         tp->t_dupacks = 0;
 2102                                                         break;
 2103                                                 }
 2104                                         } else if (V_tcp_do_newreno ||
 2105                                             V_tcp_do_ecn) {
 2106                                                 if (SEQ_LEQ(th->th_ack,
 2107                                                     tp->snd_recover)) {
 2108                                                         tp->t_dupacks = 0;
 2109                                                         break;
 2110                                                 }
 2111                                         }
 2112                                         tcp_congestion_exp(tp);
 2113                                         tcp_timer_activate(tp, TT_REXMT, 0);
 2114                                         tp->t_rtttime = 0;
 2115                                         if (tp->t_flags & TF_SACK_PERMIT) {
 2116                                                 TCPSTAT_INC(
 2117                                                     tcps_sack_recovery_episode);
 2118                                                 tp->sack_newdata = tp->snd_nxt;
 2119                                                 tp->snd_cwnd = tp->t_maxseg;
 2120                                                 (void) tcp_output(tp);
 2121                                                 goto drop;
 2122                                         }
 2123                                         tp->snd_nxt = th->th_ack;
 2124                                         tp->snd_cwnd = tp->t_maxseg;
 2125                                         (void) tcp_output(tp);
 2126                                         KASSERT(tp->snd_limited <= 2,
 2127                                             ("%s: tp->snd_limited too big",
 2128                                             __func__));
 2129                                         tp->snd_cwnd = tp->snd_ssthresh +
 2130                                              tp->t_maxseg *
 2131                                              (tp->t_dupacks - tp->snd_limited);
 2132                                         if (SEQ_GT(onxt, tp->snd_nxt))
 2133                                                 tp->snd_nxt = onxt;
 2134                                         goto drop;
 2135                                 } else if (V_tcp_do_rfc3042) {
 2136                                         u_long oldcwnd = tp->snd_cwnd;
 2137                                         tcp_seq oldsndmax = tp->snd_max;
 2138                                         u_int sent;
 2139 
 2140                                         KASSERT(tp->t_dupacks == 1 ||
 2141                                             tp->t_dupacks == 2,
 2142                                             ("%s: dupacks not 1 or 2",
 2143                                             __func__));
 2144                                         if (tp->t_dupacks == 1)
 2145                                                 tp->snd_limited = 0;
 2146                                         tp->snd_cwnd =
 2147                                             (tp->snd_nxt - tp->snd_una) +
 2148                                             (tp->t_dupacks - tp->snd_limited) *
 2149                                             tp->t_maxseg;
 2150                                         (void) tcp_output(tp);
 2151                                         sent = tp->snd_max - oldsndmax;
 2152                                         if (sent > tp->t_maxseg) {
 2153                                                 KASSERT((tp->t_dupacks == 2 &&
 2154                                                     tp->snd_limited == 0) ||
 2155                                                    (sent == tp->t_maxseg + 1 &&
 2156                                                     tp->t_flags & TF_SENTFIN),
 2157                                                     ("%s: sent too much",
 2158                                                     __func__));
 2159                                                 tp->snd_limited = 2;
 2160                                         } else if (sent > 0)
 2161                                                 ++tp->snd_limited;
 2162                                         tp->snd_cwnd = oldcwnd;
 2163                                         goto drop;
 2164                                 }
 2165                         } else
 2166                                 tp->t_dupacks = 0;
 2167                         break;
 2168                 }
 2169 
 2170                 KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
 2171                     ("%s: th_ack <= snd_una", __func__));
 2172 
 2173                 /*
 2174                  * If the congestion window was inflated to account
 2175                  * for the other side's cached packets, retract it.
 2176                  */
 2177                 if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) {
 2178                         if (IN_FASTRECOVERY(tp)) {
 2179                                 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 2180                                         if (tp->t_flags & TF_SACK_PERMIT)
 2181                                                 tcp_sack_partialack(tp, th);
 2182                                         else
 2183                                                 tcp_newreno_partial_ack(tp, th);
 2184                                 } else {
 2185                                         /*
 2186                                          * Out of fast recovery.
 2187                                          * Window inflation should have left us
 2188                                          * with approximately snd_ssthresh
 2189                                          * outstanding data.
 2190                                          * But in case we would be inclined to
 2191                                          * send a burst, better to do it via
 2192                                          * the slow start mechanism.
 2193                                          */
 2194                                         if (SEQ_GT(th->th_ack +
 2195                                                         tp->snd_ssthresh,
 2196                                                    tp->snd_max))
 2197                                                 tp->snd_cwnd = tp->snd_max -
 2198                                                                 th->th_ack +
 2199                                                                 tp->t_maxseg;
 2200                                         else
 2201                                                 tp->snd_cwnd = tp->snd_ssthresh;
 2202                                 }
 2203                         }
 2204                 } else {
 2205                         if (tp->t_dupacks >= tcprexmtthresh &&
 2206                             tp->snd_cwnd > tp->snd_ssthresh)
 2207                                 tp->snd_cwnd = tp->snd_ssthresh;
 2208                 }
 2209                 tp->t_dupacks = 0;
 2210                 /*
 2211                  * If we reach this point, ACK is not a duplicate,
 2212                  *     i.e., it ACKs something we sent.
 2213                  */
 2214                 if (tp->t_flags & TF_NEEDSYN) {
 2215                         /*
 2216                          * T/TCP: Connection was half-synchronized, and our
 2217                          * SYN has been ACK'd (so connection is now fully
 2218                          * synchronized).  Go to non-starred state,
 2219                          * increment snd_una for ACK of SYN, and check if
 2220                          * we can do window scaling.
 2221                          */
 2222                         tp->t_flags &= ~TF_NEEDSYN;
 2223                         tp->snd_una++;
 2224                         /* Do window scaling? */
 2225                         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 2226                                 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 2227                                 tp->rcv_scale = tp->request_r_scale;
 2228                                 /* Send window already scaled. */
 2229                         }
 2230                 }
 2231 
 2232 process_ACK:
 2233                 INP_INFO_LOCK_ASSERT(&V_tcbinfo);
 2234                 KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
 2235                     ("tcp_input: process_ACK ti_locked %d", ti_locked));
 2236                 INP_WLOCK_ASSERT(tp->t_inpcb);
 2237 
 2238                 acked = th->th_ack - tp->snd_una;
 2239                 TCPSTAT_INC(tcps_rcvackpack);
 2240                 TCPSTAT_ADD(tcps_rcvackbyte, acked);
 2241 
 2242                 /*
 2243                  * If we just performed our first retransmit, and the ACK
 2244                  * arrives within our recovery window, then it was a mistake
 2245                  * to do the retransmit in the first place.  Recover our
 2246                  * original cwnd and ssthresh, and proceed to transmit where
 2247                  * we left off.
 2248                  */
 2249                 if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) {
 2250                         TCPSTAT_INC(tcps_sndrexmitbad);
 2251                         tp->snd_cwnd = tp->snd_cwnd_prev;
 2252                         tp->snd_ssthresh = tp->snd_ssthresh_prev;
 2253                         tp->snd_recover = tp->snd_recover_prev;
 2254                         if (tp->t_flags & TF_WASFRECOVERY)
 2255                                 ENTER_FASTRECOVERY(tp);
 2256                         tp->snd_nxt = tp->snd_max;
 2257                         tp->t_badrxtwin = 0;    /* XXX probably not required */
 2258                 }
 2259 
 2260                 /*
 2261                  * If we have a timestamp reply, update smoothed
 2262                  * round trip time.  If no timestamp is present but
 2263                  * transmit timer is running and timed sequence
 2264                  * number was acked, update smoothed round trip time.
 2265                  * Since we now have an rtt measurement, cancel the
 2266                  * timer backoff (cf., Phil Karn's retransmit alg.).
 2267                  * Recompute the initial retransmit timer.
 2268                  *
 2269                  * Some boxes send broken timestamp replies
 2270                  * during the SYN+ACK phase, ignore
 2271                  * timestamps of 0 or we could calculate a
 2272                  * huge RTT and blow up the retransmit timer.
 2273                  */
 2274                 if ((to.to_flags & TOF_TS) != 0 &&
 2275                     to.to_tsecr) {
 2276                         if (!tp->t_rttlow || tp->t_rttlow > ticks - to.to_tsecr)
 2277                                 tp->t_rttlow = ticks - to.to_tsecr;
 2278                         tcp_xmit_timer(tp, ticks - to.to_tsecr + 1);
 2279                 } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
 2280                         if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
 2281                                 tp->t_rttlow = ticks - tp->t_rtttime;
 2282                         tcp_xmit_timer(tp, ticks - tp->t_rtttime);
 2283                 }
 2284                 tcp_xmit_bandwidth_limit(tp, th->th_ack);
 2285 
 2286                 /*
 2287                  * If all outstanding data is acked, stop retransmit
 2288                  * timer and remember to restart (more output or persist).
 2289                  * If there is more data to be acked, restart retransmit
 2290                  * timer, using current (possibly backed-off) value.
 2291                  */
 2292                 if (th->th_ack == tp->snd_max) {
 2293                         tcp_timer_activate(tp, TT_REXMT, 0);
 2294                         needoutput = 1;
 2295                 } else if (!tcp_timer_active(tp, TT_PERSIST))
 2296                         tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 2297 
 2298                 /*
 2299                  * If no data (only SYN) was ACK'd,
 2300                  *    skip rest of ACK processing.
 2301                  */
 2302                 if (acked == 0)
 2303                         goto step6;
 2304 
 2305                 /*
 2306                  * When new data is acked, open the congestion window.
 2307                  * Method depends on which congestion control state we're
 2308                  * in (slow start or cong avoid) and if ABC (RFC 3465) is
 2309                  * enabled.
 2310                  *
 2311                  * slow start: cwnd <= ssthresh
 2312                  * cong avoid: cwnd > ssthresh
 2313                  *
 2314                  * slow start and ABC (RFC 3465):
 2315                  *   Grow cwnd exponentially by the amount of data
 2316                  *   ACKed capping the max increment per ACK to
 2317                  *   (abc_l_var * maxseg) bytes.
 2318                  *
 2319                  * slow start without ABC (RFC 2581):
 2320                  *   Grow cwnd exponentially by maxseg per ACK.
 2321                  *
 2322                  * cong avoid and ABC (RFC 3465):
 2323                  *   Grow cwnd linearly by maxseg per RTT for each
 2324                  *   cwnd worth of ACKed data.
 2325                  *
 2326                  * cong avoid without ABC (RFC 2581):
 2327                  *   Grow cwnd linearly by approximately maxseg per RTT using
 2328                  *   maxseg^2 / cwnd per ACK as the increment.
 2329                  *   If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
 2330                  *   avoid capping cwnd.
 2331                  */
 2332                 if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) ||
 2333                     !IN_FASTRECOVERY(tp)) {
 2334                         u_int cw = tp->snd_cwnd;
 2335                         u_int incr = tp->t_maxseg;
 2336                         /* In congestion avoidance? */
 2337                         if (cw > tp->snd_ssthresh) {
 2338                                 if (V_tcp_do_rfc3465) {
 2339                                         tp->t_bytes_acked += acked;
 2340                                         if (tp->t_bytes_acked >= tp->snd_cwnd)
 2341                                                 tp->t_bytes_acked -= cw;
 2342                                         else
 2343                                                 incr = 0;
 2344                                 }
 2345                                 else
 2346                                         incr = max((incr * incr / cw), 1);
 2347                         /*
 2348                          * In slow-start with ABC enabled and no RTO in sight?
 2349                          * (Must not use abc_l_var > 1 if slow starting after an
 2350                          * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt ==
 2351                          * snd_max check is sufficient to handle this).
 2352                          */
 2353                         } else if (V_tcp_do_rfc3465 &&
 2354                             tp->snd_nxt == tp->snd_max)
 2355                                 incr = min(acked,
 2356                                     V_tcp_abc_l_var * tp->t_maxseg);
 2357                         /* ABC is on by default, so (incr == 0) frequently. */
 2358                         if (incr > 0)
 2359                                 tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
 2360                 }
 2361                 SOCKBUF_LOCK(&so->so_snd);
 2362                 if (acked > so->so_snd.sb_cc) {
 2363                         tp->snd_wnd -= so->so_snd.sb_cc;
 2364                         sbdrop_locked(&so->so_snd, (int)so->so_snd.sb_cc);
 2365                         ourfinisacked = 1;
 2366                 } else {
 2367                         sbdrop_locked(&so->so_snd, acked);
 2368                         tp->snd_wnd -= acked;
 2369                         ourfinisacked = 0;
 2370                 }
 2371                 /* NB: sowwakeup_locked() does an implicit unlock. */
 2372                 sowwakeup_locked(so);
 2373                 /* Detect una wraparound. */
 2374                 if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
 2375                     !IN_FASTRECOVERY(tp) &&
 2376                     SEQ_GT(tp->snd_una, tp->snd_recover) &&
 2377                     SEQ_LEQ(th->th_ack, tp->snd_recover))
 2378                         tp->snd_recover = th->th_ack - 1;
 2379                 if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
 2380                     IN_FASTRECOVERY(tp) &&
 2381                     SEQ_GEQ(th->th_ack, tp->snd_recover)) {
 2382                         EXIT_FASTRECOVERY(tp);
 2383                         tp->t_bytes_acked = 0;
 2384                 }
 2385                 tp->snd_una = th->th_ack;
 2386                 if (tp->t_flags & TF_SACK_PERMIT) {
 2387                         if (SEQ_GT(tp->snd_una, tp->snd_recover))
 2388                                 tp->snd_recover = tp->snd_una;
 2389                 }
 2390                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 2391                         tp->snd_nxt = tp->snd_una;
 2392 
 2393                 switch (tp->t_state) {
 2394 
 2395                 /*
 2396                  * In FIN_WAIT_1 STATE in addition to the processing
 2397                  * for the ESTABLISHED state if our FIN is now acknowledged
 2398                  * then enter FIN_WAIT_2.
 2399                  */
 2400                 case TCPS_FIN_WAIT_1:
 2401                         if (ourfinisacked) {
 2402                                 /*
 2403                                  * If we can't receive any more
 2404                                  * data, then closing user can proceed.
 2405                                  * Starting the timer is contrary to the
 2406                                  * specification, but if we don't get a FIN
 2407                                  * we'll hang forever.
 2408                                  *
 2409                                  * XXXjl:
 2410                                  * we should release the tp also, and use a
 2411                                  * compressed state.
 2412                                  */
 2413                                 if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 2414                                         int timeout;
 2415 
 2416                                         soisdisconnected(so);
 2417                                         timeout = (tcp_fast_finwait2_recycle) ? 
 2418                                                 tcp_finwait2_timeout : tcp_maxidle;
 2419                                         tcp_timer_activate(tp, TT_2MSL, timeout);
 2420                                 }
 2421                                 tp->t_state = TCPS_FIN_WAIT_2;
 2422                         }
 2423                         break;
 2424 
 2425                 /*
 2426                  * In CLOSING STATE in addition to the processing for
 2427                  * the ESTABLISHED state if the ACK acknowledges our FIN
 2428                  * then enter the TIME-WAIT state, otherwise ignore
 2429                  * the segment.
 2430                  */
 2431                 case TCPS_CLOSING:
 2432                         if (ourfinisacked) {
 2433                                 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 2434                                 tcp_twstart(tp);
 2435                                 INP_INFO_WUNLOCK(&V_tcbinfo);
 2436                                 m_freem(m);
 2437                                 return;
 2438                         }
 2439                         break;
 2440 
 2441                 /*
 2442                  * In LAST_ACK, we may still be waiting for data to drain
 2443                  * and/or to be acked, as well as for the ack of our FIN.
 2444                  * If our FIN is now acknowledged, delete the TCB,
 2445                  * enter the closed state and return.
 2446                  */
 2447                 case TCPS_LAST_ACK:
 2448                         if (ourfinisacked) {
 2449                                 INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 2450                                 tp = tcp_close(tp);
 2451                                 goto drop;
 2452                         }
 2453                         break;
 2454                 }
 2455         }
 2456 
 2457 step6:
 2458         INP_INFO_LOCK_ASSERT(&V_tcbinfo);
 2459         KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
 2460             ("tcp_do_segment: step6 ti_locked %d", ti_locked));
 2461         INP_WLOCK_ASSERT(tp->t_inpcb);
 2462 
 2463         /*
 2464          * Update window information.
 2465          * Don't look at window if no ACK: TAC's send garbage on first SYN.
 2466          */
 2467         if ((thflags & TH_ACK) &&
 2468             (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 2469             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 2470              (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 2471                 /* keep track of pure window updates */
 2472                 if (tlen == 0 &&
 2473                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 2474                         TCPSTAT_INC(tcps_rcvwinupd);
 2475                 tp->snd_wnd = tiwin;
 2476                 tp->snd_wl1 = th->th_seq;
 2477                 tp->snd_wl2 = th->th_ack;
 2478                 if (tp->snd_wnd > tp->max_sndwnd)
 2479                         tp->max_sndwnd = tp->snd_wnd;
 2480                 needoutput = 1;
 2481         }
 2482 
 2483         /*
 2484          * Process segments with URG.
 2485          */
 2486         if ((thflags & TH_URG) && th->th_urp &&
 2487             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 2488                 /*
 2489                  * This is a kludge, but if we receive and accept
 2490                  * random urgent pointers, we'll crash in
 2491                  * soreceive.  It's hard to imagine someone
 2492                  * actually wanting to send this much urgent data.
 2493                  */
 2494                 SOCKBUF_LOCK(&so->so_rcv);
 2495                 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
 2496                         th->th_urp = 0;                 /* XXX */
 2497                         thflags &= ~TH_URG;             /* XXX */
 2498                         SOCKBUF_UNLOCK(&so->so_rcv);    /* XXX */
 2499                         goto dodata;                    /* XXX */
 2500                 }
 2501                 /*
 2502                  * If this segment advances the known urgent pointer,
 2503                  * then mark the data stream.  This should not happen
 2504                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
 2505                  * a FIN has been received from the remote side.
 2506                  * In these states we ignore the URG.
 2507                  *
 2508                  * According to RFC961 (Assigned Protocols),
 2509                  * the urgent pointer points to the last octet
 2510                  * of urgent data.  We continue, however,
 2511                  * to consider it to indicate the first octet
 2512                  * of data past the urgent section as the original
 2513                  * spec states (in one of two places).
 2514                  */
 2515                 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 2516                         tp->rcv_up = th->th_seq + th->th_urp;
 2517                         so->so_oobmark = so->so_rcv.sb_cc +
 2518                             (tp->rcv_up - tp->rcv_nxt) - 1;
 2519                         if (so->so_oobmark == 0)
 2520                                 so->so_rcv.sb_state |= SBS_RCVATMARK;
 2521                         sohasoutofband(so);
 2522                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 2523                 }
 2524                 SOCKBUF_UNLOCK(&so->so_rcv);
 2525                 /*
 2526                  * Remove out of band data so it doesn't get presented to the user.
 2527                  * This can happen independent of advancing the URG pointer,
 2528                  * but if two URG's are pending at once, some out-of-band
 2529                  * data may creep in... ick.
 2530                  */
 2531                 if (th->th_urp <= (u_long)tlen &&
 2532                     !(so->so_options & SO_OOBINLINE)) {
 2533                         /* hdr drop is delayed */
 2534                         tcp_pulloutofband(so, th, m, drop_hdrlen);
 2535                 }
 2536         } else {
 2537                 /*
 2538                  * If no out of band data is expected,
 2539                  * pull receive urgent pointer along
 2540                  * with the receive window.
 2541                  */
 2542                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 2543                         tp->rcv_up = tp->rcv_nxt;
 2544         }
 2545 dodata:                                                 /* XXX */
 2546         INP_INFO_LOCK_ASSERT(&V_tcbinfo);
 2547         KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
 2548             ("tcp_do_segment: dodata ti_locked %d", ti_locked));
 2549         INP_WLOCK_ASSERT(tp->t_inpcb);
 2550 
 2551         /*
 2552          * Process the segment text, merging it into the TCP sequencing queue,
 2553          * and arranging for acknowledgment of receipt if necessary.
 2554          * This process logically involves adjusting tp->rcv_wnd as data
 2555          * is presented to the user (this happens in tcp_usrreq.c,
 2556          * case PRU_RCVD).  If a FIN has already been received on this
 2557          * connection then we just ignore the text.
 2558          */
 2559         if ((tlen || (thflags & TH_FIN)) &&
 2560             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 2561                 tcp_seq save_start = th->th_seq;
 2562                 m_adj(m, drop_hdrlen);  /* delayed header drop */
 2563                 /*
 2564                  * Insert segment which includes th into TCP reassembly queue
 2565                  * with control block tp.  Set thflags to whether reassembly now
 2566                  * includes a segment with FIN.  This handles the common case
 2567                  * inline (segment is the next to be received on an established
 2568                  * connection, and the queue is empty), avoiding linkage into
 2569                  * and removal from the queue and repetition of various
 2570                  * conversions.
 2571                  * Set DELACK for segments received in order, but ack
 2572                  * immediately when segments are out of order (so
 2573                  * fast retransmit can work).
 2574                  */
 2575                 if (th->th_seq == tp->rcv_nxt &&
 2576                     LIST_EMPTY(&tp->t_segq) &&
 2577                     TCPS_HAVEESTABLISHED(tp->t_state)) {
 2578                         if (DELAY_ACK(tp))
 2579                                 tp->t_flags |= TF_DELACK;
 2580                         else
 2581                                 tp->t_flags |= TF_ACKNOW;
 2582                         tp->rcv_nxt += tlen;
 2583                         thflags = th->th_flags & TH_FIN;
 2584                         TCPSTAT_INC(tcps_rcvpack);
 2585                         TCPSTAT_ADD(tcps_rcvbyte, tlen);
 2586                         ND6_HINT(tp);
 2587                         SOCKBUF_LOCK(&so->so_rcv);
 2588                         if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 2589                                 m_freem(m);
 2590                         else
 2591                                 sbappendstream_locked(&so->so_rcv, m);
 2592                         /* NB: sorwakeup_locked() does an implicit unlock. */
 2593                         sorwakeup_locked(so);
 2594                 } else {
 2595                         /*
 2596                          * XXX: Due to the header drop above "th" is
 2597                          * theoretically invalid by now.  Fortunately
 2598                          * m_adj() doesn't actually free any mbufs
 2599                          * when trimming from the head.
 2600                          */
 2601                         thflags = tcp_reass(tp, th, &tlen, m);
 2602                         tp->t_flags |= TF_ACKNOW;
 2603                 }
 2604                 if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT))
 2605                         tcp_update_sack_list(tp, save_start, save_start + tlen);
 2606 #if 0
 2607                 /*
 2608                  * Note the amount of data that peer has sent into
 2609                  * our window, in order to estimate the sender's
 2610                  * buffer size.
 2611                  * XXX: Unused.
 2612                  */
 2613                 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
 2614 #endif
 2615         } else {
 2616                 m_freem(m);
 2617                 thflags &= ~TH_FIN;
 2618         }
 2619 
 2620         /*
 2621          * If FIN is received ACK the FIN and let the user know
 2622          * that the connection is closing.
 2623          */
 2624         if (thflags & TH_FIN) {
 2625                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 2626                         socantrcvmore(so);
 2627                         /*
 2628                          * If connection is half-synchronized
 2629                          * (ie NEEDSYN flag on) then delay ACK,
 2630                          * so it may be piggybacked when SYN is sent.
 2631                          * Otherwise, since we received a FIN then no
 2632                          * more input can be expected, send ACK now.
 2633                          */
 2634                         if (tp->t_flags & TF_NEEDSYN)
 2635                                 tp->t_flags |= TF_DELACK;
 2636                         else
 2637                                 tp->t_flags |= TF_ACKNOW;
 2638                         tp->rcv_nxt++;
 2639                 }
 2640                 switch (tp->t_state) {
 2641 
 2642                 /*
 2643                  * In SYN_RECEIVED and ESTABLISHED STATES
 2644                  * enter the CLOSE_WAIT state.
 2645                  */
 2646                 case TCPS_SYN_RECEIVED:
 2647                         tp->t_starttime = ticks;
 2648                         /* FALLTHROUGH */
 2649                 case TCPS_ESTABLISHED:
 2650                         tp->t_state = TCPS_CLOSE_WAIT;
 2651                         break;
 2652 
 2653                 /*
 2654                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
 2655                  * enter the CLOSING state.
 2656                  */
 2657                 case TCPS_FIN_WAIT_1:
 2658                         tp->t_state = TCPS_CLOSING;
 2659                         break;
 2660 
 2661                 /*
 2662                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
 2663                  * starting the time-wait timer, turning off the other
 2664                  * standard timers.
 2665                  */
 2666                 case TCPS_FIN_WAIT_2:
 2667                         INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 2668                         KASSERT(ti_locked == TI_WLOCKED, ("%s: dodata "
 2669                             "TCP_FIN_WAIT_2 ti_locked: %d", __func__,
 2670                             ti_locked));
 2671 
 2672                         tcp_twstart(tp);
 2673                         INP_INFO_WUNLOCK(&V_tcbinfo);
 2674                         return;
 2675                 }
 2676         }
 2677         if (ti_locked == TI_RLOCKED)
 2678                 INP_INFO_RUNLOCK(&V_tcbinfo);
 2679         else if (ti_locked == TI_WLOCKED)
 2680                 INP_INFO_WUNLOCK(&V_tcbinfo);
 2681         else
 2682                 panic("%s: dodata epilogue ti_locked %d", __func__,
 2683                     ti_locked);
 2684         ti_locked = TI_UNLOCKED;
 2685 
 2686 #ifdef TCPDEBUG
 2687         if (so->so_options & SO_DEBUG)
 2688                 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
 2689                           &tcp_savetcp, 0);
 2690 #endif
 2691 
 2692         /*
 2693          * Return any desired output.
 2694          */
 2695         if (needoutput || (tp->t_flags & TF_ACKNOW))
 2696                 (void) tcp_output(tp);
 2697 
 2698 check_delack:
 2699         KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
 2700             __func__, ti_locked));
 2701         INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 2702         INP_WLOCK_ASSERT(tp->t_inpcb);
 2703 
 2704         if (tp->t_flags & TF_DELACK) {
 2705                 tp->t_flags &= ~TF_DELACK;
 2706                 tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
 2707         }
 2708         INP_WUNLOCK(tp->t_inpcb);
 2709         return;
 2710 
 2711 dropafterack:
 2712         KASSERT(ti_locked == TI_RLOCKED || ti_locked == TI_WLOCKED,
 2713             ("tcp_do_segment: dropafterack ti_locked %d", ti_locked));
 2714 
 2715         /*
 2716          * Generate an ACK dropping incoming segment if it occupies
 2717          * sequence space, where the ACK reflects our state.
 2718          *
 2719          * We can now skip the test for the RST flag since all
 2720          * paths to this code happen after packets containing
 2721          * RST have been dropped.
 2722          *
 2723          * In the SYN-RECEIVED state, don't send an ACK unless the
 2724          * segment we received passes the SYN-RECEIVED ACK test.
 2725          * If it fails send a RST.  This breaks the loop in the
 2726          * "LAND" DoS attack, and also prevents an ACK storm
 2727          * between two listening ports that have been sent forged
 2728          * SYN segments, each with the source address of the other.
 2729          */
 2730         if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
 2731             (SEQ_GT(tp->snd_una, th->th_ack) ||
 2732              SEQ_GT(th->th_ack, tp->snd_max)) ) {
 2733                 rstreason = BANDLIM_RST_OPENPORT;
 2734                 goto dropwithreset;
 2735         }
 2736 #ifdef TCPDEBUG
 2737         if (so->so_options & SO_DEBUG)
 2738                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
 2739                           &tcp_savetcp, 0);
 2740 #endif
 2741         if (ti_locked == TI_RLOCKED)
 2742                 INP_INFO_RUNLOCK(&V_tcbinfo);
 2743         else if (ti_locked == TI_WLOCKED)
 2744                 INP_INFO_WUNLOCK(&V_tcbinfo);
 2745         else
 2746                 panic("%s: dropafterack epilogue ti_locked %d", __func__,
 2747                     ti_locked);
 2748         ti_locked = TI_UNLOCKED;
 2749 
 2750         tp->t_flags |= TF_ACKNOW;
 2751         (void) tcp_output(tp);
 2752         INP_WUNLOCK(tp->t_inpcb);
 2753         m_freem(m);
 2754         return;
 2755 
 2756 dropwithreset:
 2757         if (ti_locked == TI_RLOCKED)
 2758                 INP_INFO_RUNLOCK(&V_tcbinfo);
 2759         else if (ti_locked == TI_WLOCKED)
 2760                 INP_INFO_WUNLOCK(&V_tcbinfo);
 2761         else
 2762                 panic("%s: dropwithreset ti_locked %d", __func__, ti_locked);
 2763         ti_locked = TI_UNLOCKED;
 2764 
 2765         if (tp != NULL) {
 2766                 tcp_dropwithreset(m, th, tp, tlen, rstreason);
 2767                 INP_WUNLOCK(tp->t_inpcb);
 2768         } else
 2769                 tcp_dropwithreset(m, th, NULL, tlen, rstreason);
 2770         return;
 2771 
 2772 drop:
 2773         if (ti_locked == TI_RLOCKED)
 2774                 INP_INFO_RUNLOCK(&V_tcbinfo);
 2775         else if (ti_locked == TI_WLOCKED)
 2776                 INP_INFO_WUNLOCK(&V_tcbinfo);
 2777 #ifdef INVARIANTS
 2778         else
 2779                 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
 2780 #endif
 2781         ti_locked = TI_UNLOCKED;
 2782 
 2783         /*
 2784          * Drop space held by incoming segment and return.
 2785          */
 2786 #ifdef TCPDEBUG
 2787         if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
 2788                 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
 2789                           &tcp_savetcp, 0);
 2790 #endif
 2791         if (tp != NULL)
 2792                 INP_WUNLOCK(tp->t_inpcb);
 2793         m_freem(m);
 2794 }
 2795 
 2796 /*
 2797  * Issue RST and make ACK acceptable to originator of segment.
 2798  * The mbuf must still include the original packet header.
 2799  * tp may be NULL.
 2800  */
 2801 static void
 2802 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
 2803     int tlen, int rstreason)
 2804 {
 2805         struct ip *ip;
 2806 #ifdef INET6
 2807         struct ip6_hdr *ip6;
 2808 #endif
 2809 
 2810         if (tp != NULL) {
 2811                 INP_WLOCK_ASSERT(tp->t_inpcb);
 2812         }
 2813 
 2814         /* Don't bother if destination was broadcast/multicast. */
 2815         if ((th->th_flags & TH_RST) || m->m_flags & (M_BCAST|M_MCAST))
 2816                 goto drop;
 2817 #ifdef INET6
 2818         if (mtod(m, struct ip *)->ip_v == 6) {
 2819                 ip6 = mtod(m, struct ip6_hdr *);
 2820                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
 2821                     IN6_IS_ADDR_MULTICAST(&ip6->ip6_src))
 2822                         goto drop;
 2823                 /* IPv6 anycast check is done at tcp6_input() */
 2824         } else
 2825 #endif
 2826         {
 2827                 ip = mtod(m, struct ip *);
 2828                 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 2829                     IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
 2830                     ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
 2831                     in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 2832                         goto drop;
 2833         }
 2834 
 2835         /* Perform bandwidth limiting. */
 2836         if (badport_bandlim(rstreason) < 0)
 2837                 goto drop;
 2838 
 2839         /* tcp_respond consumes the mbuf chain. */
 2840         if (th->th_flags & TH_ACK) {
 2841                 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0,
 2842                     th->th_ack, TH_RST);
 2843         } else {
 2844                 if (th->th_flags & TH_SYN)
 2845                         tlen++;
 2846                 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen,
 2847                     (tcp_seq)0, TH_RST|TH_ACK);
 2848         }
 2849         return;
 2850 drop:
 2851         m_freem(m);
 2852 }
 2853 
 2854 /*
 2855  * Parse TCP options and place in tcpopt.
 2856  */
 2857 static void
 2858 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
 2859 {
 2860         int opt, optlen;
 2861 
 2862         to->to_flags = 0;
 2863         for (; cnt > 0; cnt -= optlen, cp += optlen) {
 2864                 opt = cp[0];
 2865                 if (opt == TCPOPT_EOL)
 2866                         break;
 2867                 if (opt == TCPOPT_NOP)
 2868                         optlen = 1;
 2869                 else {
 2870                         if (cnt < 2)
 2871                                 break;
 2872                         optlen = cp[1];
 2873                         if (optlen < 2 || optlen > cnt)
 2874                                 break;
 2875                 }
 2876                 switch (opt) {
 2877                 case TCPOPT_MAXSEG:
 2878                         if (optlen != TCPOLEN_MAXSEG)
 2879                                 continue;
 2880                         if (!(flags & TO_SYN))
 2881                                 continue;
 2882                         to->to_flags |= TOF_MSS;
 2883                         bcopy((char *)cp + 2,
 2884                             (char *)&to->to_mss, sizeof(to->to_mss));
 2885                         to->to_mss = ntohs(to->to_mss);
 2886                         break;
 2887                 case TCPOPT_WINDOW:
 2888                         if (optlen != TCPOLEN_WINDOW)
 2889                                 continue;
 2890                         if (!(flags & TO_SYN))
 2891                                 continue;
 2892                         to->to_flags |= TOF_SCALE;
 2893                         to->to_wscale = min(cp[2], TCP_MAX_WINSHIFT);
 2894                         break;
 2895                 case TCPOPT_TIMESTAMP:
 2896                         if (optlen != TCPOLEN_TIMESTAMP)
 2897                                 continue;
 2898                         to->to_flags |= TOF_TS;
 2899                         bcopy((char *)cp + 2,
 2900                             (char *)&to->to_tsval, sizeof(to->to_tsval));
 2901                         to->to_tsval = ntohl(to->to_tsval);
 2902                         bcopy((char *)cp + 6,
 2903                             (char *)&to->to_tsecr, sizeof(to->to_tsecr));
 2904                         to->to_tsecr = ntohl(to->to_tsecr);
 2905                         break;
 2906 #ifdef TCP_SIGNATURE
 2907                 /*
 2908                  * XXX In order to reply to a host which has set the
 2909                  * TCP_SIGNATURE option in its initial SYN, we have to
 2910                  * record the fact that the option was observed here
 2911                  * for the syncache code to perform the correct response.
 2912                  */
 2913                 case TCPOPT_SIGNATURE:
 2914                         if (optlen != TCPOLEN_SIGNATURE)
 2915                                 continue;
 2916                         to->to_flags |= TOF_SIGNATURE;
 2917                         to->to_signature = cp + 2;
 2918                         break;
 2919 #endif
 2920                 case TCPOPT_SACK_PERMITTED:
 2921                         if (optlen != TCPOLEN_SACK_PERMITTED)
 2922                                 continue;
 2923                         if (!(flags & TO_SYN))
 2924                                 continue;
 2925                         if (!V_tcp_do_sack)
 2926                                 continue;
 2927                         to->to_flags |= TOF_SACKPERM;
 2928                         break;
 2929                 case TCPOPT_SACK:
 2930                         if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
 2931                                 continue;
 2932                         if (flags & TO_SYN)
 2933                                 continue;
 2934                         to->to_flags |= TOF_SACK;
 2935                         to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
 2936                         to->to_sacks = cp + 2;
 2937                         TCPSTAT_INC(tcps_sack_rcv_blocks);
 2938                         break;
 2939                 default:
 2940                         continue;
 2941                 }
 2942         }
 2943 }
 2944 
 2945 /*
 2946  * Pull out of band byte out of a segment so
 2947  * it doesn't appear in the user's data queue.
 2948  * It is still reflected in the segment length for
 2949  * sequencing purposes.
 2950  */
 2951 static void
 2952 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
 2953     int off)
 2954 {
 2955         int cnt = off + th->th_urp - 1;
 2956 
 2957         while (cnt >= 0) {
 2958                 if (m->m_len > cnt) {
 2959                         char *cp = mtod(m, caddr_t) + cnt;
 2960                         struct tcpcb *tp = sototcpcb(so);
 2961 
 2962                         INP_WLOCK_ASSERT(tp->t_inpcb);
 2963 
 2964                         tp->t_iobc = *cp;
 2965                         tp->t_oobflags |= TCPOOB_HAVEDATA;
 2966                         bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
 2967                         m->m_len--;
 2968                         if (m->m_flags & M_PKTHDR)
 2969                                 m->m_pkthdr.len--;
 2970                         return;
 2971                 }
 2972                 cnt -= m->m_len;
 2973                 m = m->m_next;
 2974                 if (m == NULL)
 2975                         break;
 2976         }
 2977         panic("tcp_pulloutofband");
 2978 }
 2979 
 2980 /*
 2981  * Collect new round-trip time estimate
 2982  * and update averages and current timeout.
 2983  */
 2984 static void
 2985 tcp_xmit_timer(struct tcpcb *tp, int rtt)
 2986 {
 2987         int delta;
 2988 
 2989         INP_WLOCK_ASSERT(tp->t_inpcb);
 2990 
 2991         TCPSTAT_INC(tcps_rttupdated);
 2992         tp->t_rttupdated++;
 2993         if (tp->t_srtt != 0) {
 2994                 /*
 2995                  * srtt is stored as fixed point with 5 bits after the
 2996                  * binary point (i.e., scaled by 8).  The following magic
 2997                  * is equivalent to the smoothing algorithm in rfc793 with
 2998                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
 2999                  * point).  Adjust rtt to origin 0.
 3000                  */
 3001                 delta = ((rtt - 1) << TCP_DELTA_SHIFT)
 3002                         - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
 3003 
 3004                 if ((tp->t_srtt += delta) <= 0)
 3005                         tp->t_srtt = 1;
 3006 
 3007                 /*
 3008                  * We accumulate a smoothed rtt variance (actually, a
 3009                  * smoothed mean difference), then set the retransmit
 3010                  * timer to smoothed rtt + 4 times the smoothed variance.
 3011                  * rttvar is stored as fixed point with 4 bits after the
 3012                  * binary point (scaled by 16).  The following is
 3013                  * equivalent to rfc793 smoothing with an alpha of .75
 3014                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
 3015                  * rfc793's wired-in beta.
 3016                  */
 3017                 if (delta < 0)
 3018                         delta = -delta;
 3019                 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
 3020                 if ((tp->t_rttvar += delta) <= 0)
 3021                         tp->t_rttvar = 1;
 3022                 if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
 3023                     tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 3024         } else {
 3025                 /*
 3026                  * No rtt measurement yet - use the unsmoothed rtt.
 3027                  * Set the variance to half the rtt (so our first
 3028                  * retransmit happens at 3*rtt).
 3029                  */
 3030                 tp->t_srtt = rtt << TCP_RTT_SHIFT;
 3031                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
 3032                 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
 3033         }
 3034         tp->t_rtttime = 0;
 3035         tp->t_rxtshift = 0;
 3036 
 3037         /*
 3038          * the retransmit should happen at rtt + 4 * rttvar.
 3039          * Because of the way we do the smoothing, srtt and rttvar
 3040          * will each average +1/2 tick of bias.  When we compute
 3041          * the retransmit timer, we want 1/2 tick of rounding and
 3042          * 1 extra tick because of +-1/2 tick uncertainty in the
 3043          * firing of the timer.  The bias will give us exactly the
 3044          * 1.5 tick we need.  But, because the bias is
 3045          * statistical, we have to test that we don't drop below
 3046          * the minimum feasible timer (which is 2 ticks).
 3047          */
 3048         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 3049                       max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
 3050 
 3051         /*
 3052          * We received an ack for a packet that wasn't retransmitted;
 3053          * it is probably safe to discard any error indications we've
 3054          * received recently.  This isn't quite right, but close enough
 3055          * for now (a route might have failed after we sent a segment,
 3056          * and the return path might not be symmetrical).
 3057          */
 3058         tp->t_softerror = 0;
 3059 }
 3060 
 3061 /*
 3062  * Determine a reasonable value for maxseg size.
 3063  * If the route is known, check route for mtu.
 3064  * If none, use an mss that can be handled on the outgoing
 3065  * interface without forcing IP to fragment; if bigger than
 3066  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 3067  * to utilize large mbufs.  If no route is found, route has no mtu,
 3068  * or the destination isn't local, use a default, hopefully conservative
 3069  * size (usually 512 or the default IP max size, but no more than the mtu
 3070  * of the interface), as we can't discover anything about intervening
 3071  * gateways or networks.  We also initialize the congestion/slow start
 3072  * window to be a single segment if the destination isn't local.
 3073  * While looking at the routing entry, we also initialize other path-dependent
 3074  * parameters from pre-set or cached values in the routing entry.
 3075  *
 3076  * Also take into account the space needed for options that we
 3077  * send regularly.  Make maxseg shorter by that amount to assure
 3078  * that we can send maxseg amount of data even when the options
 3079  * are present.  Store the upper limit of the length of options plus
 3080  * data in maxopd.
 3081  *
 3082  * In case of T/TCP, we call this routine during implicit connection
 3083  * setup as well (offer = -1), to initialize maxseg from the cached
 3084  * MSS of our peer.
 3085  *
 3086  * NOTE that this routine is only called when we process an incoming
 3087  * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
 3088  */
 3089 void
 3090 tcp_mss_update(struct tcpcb *tp, int offer,
 3091     struct hc_metrics_lite *metricptr, int *mtuflags)
 3092 {
 3093         int mss;
 3094         u_long maxmtu;
 3095         struct inpcb *inp = tp->t_inpcb;
 3096         struct hc_metrics_lite metrics;
 3097         int origoffer = offer;
 3098 #ifdef INET6
 3099         int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
 3100         size_t min_protoh = isipv6 ?
 3101                             sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
 3102                             sizeof (struct tcpiphdr);
 3103 #else
 3104         const size_t min_protoh = sizeof(struct tcpiphdr);
 3105 #endif
 3106 
 3107         INP_WLOCK_ASSERT(tp->t_inpcb);
 3108 
 3109         /* Initialize. */
 3110 #ifdef INET6
 3111         if (isipv6) {
 3112                 maxmtu = tcp_maxmtu6(&inp->inp_inc, mtuflags);
 3113                 tp->t_maxopd = tp->t_maxseg = V_tcp_v6mssdflt;
 3114         } else
 3115 #endif
 3116         {
 3117                 maxmtu = tcp_maxmtu(&inp->inp_inc, mtuflags);
 3118                 tp->t_maxopd = tp->t_maxseg = V_tcp_mssdflt;
 3119         }
 3120 
 3121         /*
 3122          * No route to sender, stay with default mss and return.
 3123          */
 3124         if (maxmtu == 0) {
 3125                 /*
 3126                  * In case we return early we need to initialize metrics
 3127                  * to a defined state as tcp_hc_get() would do for us
 3128                  * if there was no cache hit.
 3129                  */
 3130                 if (metricptr != NULL)
 3131                         bzero(metricptr, sizeof(struct hc_metrics_lite));
 3132                 return;
 3133         }
 3134 
 3135         /* What have we got? */
 3136         switch (offer) {
 3137                 case 0:
 3138                         /*
 3139                          * Offer == 0 means that there was no MSS on the SYN
 3140                          * segment, in this case we use tcp_mssdflt as
 3141                          * already assigned to t_maxopd above.
 3142                          */
 3143                         offer = tp->t_maxopd;
 3144                         break;
 3145 
 3146                 case -1:
 3147                         /*
 3148                          * Offer == -1 means that we didn't receive SYN yet.
 3149                          */
 3150                         /* FALLTHROUGH */
 3151 
 3152                 default:
 3153                         /*
 3154                          * Prevent DoS attack with too small MSS. Round up
 3155                          * to at least minmss.
 3156                          */
 3157                         offer = max(offer, V_tcp_minmss);
 3158         }
 3159 
 3160         /*
 3161          * rmx information is now retrieved from tcp_hostcache.
 3162          */
 3163         tcp_hc_get(&inp->inp_inc, &metrics);
 3164         if (metricptr != NULL)
 3165                 bcopy(&metrics, metricptr, sizeof(struct hc_metrics_lite));
 3166 
 3167         /*
 3168          * If there's a discovered mtu int tcp hostcache, use it
 3169          * else, use the link mtu.
 3170          */
 3171         if (metrics.rmx_mtu)
 3172                 mss = min(metrics.rmx_mtu, maxmtu) - min_protoh;
 3173         else {
 3174 #ifdef INET6
 3175                 if (isipv6) {
 3176                         mss = maxmtu - min_protoh;
 3177                         if (!V_path_mtu_discovery &&
 3178                             !in6_localaddr(&inp->in6p_faddr))
 3179                                 mss = min(mss, V_tcp_v6mssdflt);
 3180                 } else
 3181 #endif
 3182                 {
 3183                         mss = maxmtu - min_protoh;
 3184                         if (!V_path_mtu_discovery &&
 3185                             !in_localaddr(inp->inp_faddr))
 3186                                 mss = min(mss, V_tcp_mssdflt);
 3187                 }
 3188                 /*
 3189                  * XXX - The above conditional (mss = maxmtu - min_protoh)
 3190                  * probably violates the TCP spec.
 3191                  * The problem is that, since we don't know the
 3192                  * other end's MSS, we are supposed to use a conservative
 3193                  * default.  But, if we do that, then MTU discovery will
 3194                  * never actually take place, because the conservative
 3195                  * default is much less than the MTUs typically seen
 3196                  * on the Internet today.  For the moment, we'll sweep
 3197                  * this under the carpet.
 3198                  *
 3199                  * The conservative default might not actually be a problem
 3200                  * if the only case this occurs is when sending an initial
 3201                  * SYN with options and data to a host we've never talked
 3202                  * to before.  Then, they will reply with an MSS value which
 3203                  * will get recorded and the new parameters should get
 3204                  * recomputed.  For Further Study.
 3205                  */
 3206         }
 3207         mss = min(mss, offer);
 3208 
 3209         /*
 3210          * Sanity check: make sure that maxopd will be large
 3211          * enough to allow some data on segments even if the
 3212          * all the option space is used (40bytes).  Otherwise
 3213          * funny things may happen in tcp_output.
 3214          */
 3215         mss = max(mss, 64);
 3216 
 3217         /*
 3218          * maxopd stores the maximum length of data AND options
 3219          * in a segment; maxseg is the amount of data in a normal
 3220          * segment.  We need to store this value (maxopd) apart
 3221          * from maxseg, because now every segment carries options
 3222          * and thus we normally have somewhat less data in segments.
 3223          */
 3224         tp->t_maxopd = mss;
 3225 
 3226         /*
 3227          * origoffer==-1 indicates that no segments were received yet.
 3228          * In this case we just guess.
 3229          */
 3230         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 3231             (origoffer == -1 ||
 3232              (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP))
 3233                 mss -= TCPOLEN_TSTAMP_APPA;
 3234 
 3235 #if     (MCLBYTES & (MCLBYTES - 1)) == 0
 3236         if (mss > MCLBYTES)
 3237                 mss &= ~(MCLBYTES-1);
 3238 #else
 3239         if (mss > MCLBYTES)
 3240                 mss = mss / MCLBYTES * MCLBYTES;
 3241 #endif
 3242         tp->t_maxseg = mss;
 3243 }
 3244 
 3245 void
 3246 tcp_mss(struct tcpcb *tp, int offer)
 3247 {
 3248         int rtt, mss;
 3249         u_long bufsize;
 3250         struct inpcb *inp;
 3251         struct socket *so;
 3252         struct hc_metrics_lite metrics;
 3253         int mtuflags = 0;
 3254 #ifdef INET6
 3255         int isipv6;
 3256 #endif
 3257         KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
 3258         
 3259         tcp_mss_update(tp, offer, &metrics, &mtuflags);
 3260 
 3261         mss = tp->t_maxseg;
 3262         inp = tp->t_inpcb;
 3263 #ifdef INET6
 3264         isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
 3265 #endif
 3266 
 3267         /*
 3268          * If there's a pipesize, change the socket buffer to that size,
 3269          * don't change if sb_hiwat is different than default (then it
 3270          * has been changed on purpose with setsockopt).
 3271          * Make the socket buffers an integral number of mss units;
 3272          * if the mss is larger than the socket buffer, decrease the mss.
 3273          */
 3274         so = inp->inp_socket;
 3275         SOCKBUF_LOCK(&so->so_snd);
 3276         if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
 3277                 bufsize = metrics.rmx_sendpipe;
 3278         else
 3279                 bufsize = so->so_snd.sb_hiwat;
 3280         if (bufsize < mss)
 3281                 mss = bufsize;
 3282         else {
 3283                 bufsize = roundup(bufsize, mss);
 3284                 if (bufsize > sb_max)
 3285                         bufsize = sb_max;
 3286                 if (bufsize > so->so_snd.sb_hiwat)
 3287                         (void)sbreserve_locked(&so->so_snd, bufsize, so, NULL);
 3288         }
 3289         SOCKBUF_UNLOCK(&so->so_snd);
 3290         tp->t_maxseg = mss;
 3291 
 3292         SOCKBUF_LOCK(&so->so_rcv);
 3293         if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
 3294                 bufsize = metrics.rmx_recvpipe;
 3295         else
 3296                 bufsize = so->so_rcv.sb_hiwat;
 3297         if (bufsize > mss) {
 3298                 bufsize = roundup(bufsize, mss);
 3299                 if (bufsize > sb_max)
 3300                         bufsize = sb_max;
 3301                 if (bufsize > so->so_rcv.sb_hiwat)
 3302                         (void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
 3303         }
 3304         SOCKBUF_UNLOCK(&so->so_rcv);
 3305         /*
 3306          * While we're here, check the others too.
 3307          */
 3308         if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
 3309                 tp->t_srtt = rtt;
 3310                 tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
 3311                 TCPSTAT_INC(tcps_usedrtt);
 3312                 if (metrics.rmx_rttvar) {
 3313                         tp->t_rttvar = metrics.rmx_rttvar;
 3314                         TCPSTAT_INC(tcps_usedrttvar);
 3315                 } else {
 3316                         /* default variation is +- 1 rtt */
 3317                         tp->t_rttvar =
 3318                             tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
 3319                 }
 3320                 TCPT_RANGESET(tp->t_rxtcur,
 3321                               ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
 3322                               tp->t_rttmin, TCPTV_REXMTMAX);
 3323         }
 3324         if (metrics.rmx_ssthresh) {
 3325                 /*
 3326                  * There's some sort of gateway or interface
 3327                  * buffer limit on the path.  Use this to set
 3328                  * the slow start threshhold, but set the
 3329                  * threshold to no less than 2*mss.
 3330                  */
 3331                 tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
 3332                 TCPSTAT_INC(tcps_usedssthresh);
 3333         }
 3334         if (metrics.rmx_bandwidth)
 3335                 tp->snd_bandwidth = metrics.rmx_bandwidth;
 3336 
 3337         /*
 3338          * Set the slow-start flight size depending on whether this
 3339          * is a local network or not.
 3340          *
 3341          * Extend this so we cache the cwnd too and retrieve it here.
 3342          * Make cwnd even bigger than RFC3390 suggests but only if we
 3343          * have previous experience with the remote host. Be careful
 3344          * not make cwnd bigger than remote receive window or our own
 3345          * send socket buffer. Maybe put some additional upper bound
 3346          * on the retrieved cwnd. Should do incremental updates to
 3347          * hostcache when cwnd collapses so next connection doesn't
 3348          * overloads the path again.
 3349          *
 3350          * XXXAO: Initializing the CWND from the hostcache is broken
 3351          * and in its current form not RFC conformant.  It is disabled
 3352          * until fixed or removed entirely.
 3353          *
 3354          * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
 3355          * We currently check only in syncache_socket for that.
 3356          */
 3357 /* #define TCP_METRICS_CWND */
 3358 #ifdef TCP_METRICS_CWND
 3359         if (metrics.rmx_cwnd)
 3360                 tp->snd_cwnd = max(mss,
 3361                                 min(metrics.rmx_cwnd / 2,
 3362                                  min(tp->snd_wnd, so->so_snd.sb_hiwat)));
 3363         else
 3364 #endif
 3365         if (V_tcp_do_rfc3390)
 3366                 tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
 3367 #ifdef INET6
 3368         else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
 3369                  (!isipv6 && in_localaddr(inp->inp_faddr)))
 3370 #else
 3371         else if (in_localaddr(inp->inp_faddr))
 3372 #endif
 3373                 tp->snd_cwnd = mss * V_ss_fltsz_local;
 3374         else
 3375                 tp->snd_cwnd = mss * V_ss_fltsz;
 3376 
 3377         /* Check the interface for TSO capabilities. */
 3378         if (mtuflags & CSUM_TSO)
 3379                 tp->t_flags |= TF_TSO;
 3380 }
 3381 
 3382 /*
 3383  * Determine the MSS option to send on an outgoing SYN.
 3384  */
 3385 int
 3386 tcp_mssopt(struct in_conninfo *inc)
 3387 {
 3388         int mss = 0;
 3389         u_long maxmtu = 0;
 3390         u_long thcmtu = 0;
 3391         size_t min_protoh;
 3392 
 3393         KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
 3394 
 3395 #ifdef INET6
 3396         if (inc->inc_flags & INC_ISIPV6) {
 3397                 mss = V_tcp_v6mssdflt;
 3398                 maxmtu = tcp_maxmtu6(inc, NULL);
 3399                 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
 3400                 min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 3401         } else
 3402 #endif
 3403         {
 3404                 mss = V_tcp_mssdflt;
 3405                 maxmtu = tcp_maxmtu(inc, NULL);
 3406                 thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
 3407                 min_protoh = sizeof(struct tcpiphdr);
 3408         }
 3409         if (maxmtu && thcmtu)
 3410                 mss = min(maxmtu, thcmtu) - min_protoh;
 3411         else if (maxmtu || thcmtu)
 3412                 mss = max(maxmtu, thcmtu) - min_protoh;
 3413 
 3414         return (mss);
 3415 }
 3416 
 3417 
 3418 /*
 3419  * On a partial ack arrives, force the retransmission of the
 3420  * next unacknowledged segment.  Do not clear tp->t_dupacks.
 3421  * By setting snd_nxt to ti_ack, this forces retransmission timer to
 3422  * be started again.
 3423  */
 3424 static void
 3425 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
 3426 {
 3427         tcp_seq onxt = tp->snd_nxt;
 3428         u_long  ocwnd = tp->snd_cwnd;
 3429 
 3430         INP_WLOCK_ASSERT(tp->t_inpcb);
 3431 
 3432         tcp_timer_activate(tp, TT_REXMT, 0);
 3433         tp->t_rtttime = 0;
 3434         tp->snd_nxt = th->th_ack;
 3435         /*
 3436          * Set snd_cwnd to one segment beyond acknowledged offset.
 3437          * (tp->snd_una has not yet been updated when this function is called.)
 3438          */
 3439         tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
 3440         tp->t_flags |= TF_ACKNOW;
 3441         (void) tcp_output(tp);
 3442         tp->snd_cwnd = ocwnd;
 3443         if (SEQ_GT(onxt, tp->snd_nxt))
 3444                 tp->snd_nxt = onxt;
 3445         /*
 3446          * Partial window deflation.  Relies on fact that tp->snd_una
 3447          * not updated yet.
 3448          */
 3449         if (tp->snd_cwnd > th->th_ack - tp->snd_una)
 3450                 tp->snd_cwnd -= th->th_ack - tp->snd_una;
 3451         else
 3452                 tp->snd_cwnd = 0;
 3453         tp->snd_cwnd += tp->t_maxseg;
 3454 }

Cache object: 230012dd0eabb45a3909f0634864df11


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.