The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_input.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: tcp_input.c,v 1.190.2.8 2005/04/22 06:58:40 tron Exp $ */
    2 
    3 /*
    4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  * 3. Neither the name of the project nor the names of its contributors
   16  *    may be used to endorse or promote products derived from this software
   17  *    without specific prior written permission.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
   20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
   23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   29  * SUCH DAMAGE.
   30  */
   31 
   32 /*
   33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
   34  *
   35  * NRL grants permission for redistribution and use in source and binary
   36  * forms, with or without modification, of the software and documentation
   37  * created at NRL provided that the following conditions are met:
   38  *
   39  * 1. Redistributions of source code must retain the above copyright
   40  *    notice, this list of conditions and the following disclaimer.
   41  * 2. Redistributions in binary form must reproduce the above copyright
   42  *    notice, this list of conditions and the following disclaimer in the
   43  *    documentation and/or other materials provided with the distribution.
   44  * 3. All advertising materials mentioning features or use of this software
   45  *    must display the following acknowledgements:
   46  *      This product includes software developed by the University of
   47  *      California, Berkeley and its contributors.
   48  *      This product includes software developed at the Information
   49  *      Technology Division, US Naval Research Laboratory.
   50  * 4. Neither the name of the NRL nor the names of its contributors
   51  *    may be used to endorse or promote products derived from this software
   52  *    without specific prior written permission.
   53  *
   54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
   55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
   58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   65  *
   66  * The views and conclusions contained in the software and documentation
   67  * are those of the authors and should not be interpreted as representing
   68  * official policies, either expressed or implied, of the US Naval
   69  * Research Laboratory (NRL).
   70  */
   71 
   72 /*-
   73  * Copyright (c) 1997, 1998, 1999, 2001 The NetBSD Foundation, Inc.
   74  * All rights reserved.
   75  *
   76  * This code is derived from software contributed to The NetBSD Foundation
   77  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
   78  * Facility, NASA Ames Research Center.
   79  *
   80  * Redistribution and use in source and binary forms, with or without
   81  * modification, are permitted provided that the following conditions
   82  * are met:
   83  * 1. Redistributions of source code must retain the above copyright
   84  *    notice, this list of conditions and the following disclaimer.
   85  * 2. Redistributions in binary form must reproduce the above copyright
   86  *    notice, this list of conditions and the following disclaimer in the
   87  *    documentation and/or other materials provided with the distribution.
   88  * 3. All advertising materials mentioning features or use of this software
   89  *    must display the following acknowledgement:
   90  *      This product includes software developed by the NetBSD
   91  *      Foundation, Inc. and its contributors.
   92  * 4. Neither the name of The NetBSD Foundation nor the names of its
   93  *    contributors may be used to endorse or promote products derived
   94  *    from this software without specific prior written permission.
   95  *
   96  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   97  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   98  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   99  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  100  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  101  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  102  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  103  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  104  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  105  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  106  * POSSIBILITY OF SUCH DAMAGE.
  107  */
  108 
  109 /*
  110  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  111  *      The Regents of the University of California.  All rights reserved.
  112  *
  113  * Redistribution and use in source and binary forms, with or without
  114  * modification, are permitted provided that the following conditions
  115  * are met:
  116  * 1. Redistributions of source code must retain the above copyright
  117  *    notice, this list of conditions and the following disclaimer.
  118  * 2. Redistributions in binary form must reproduce the above copyright
  119  *    notice, this list of conditions and the following disclaimer in the
  120  *    documentation and/or other materials provided with the distribution.
  121  * 3. Neither the name of the University nor the names of its contributors
  122  *    may be used to endorse or promote products derived from this software
  123  *    without specific prior written permission.
  124  *
  125  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  126  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  127  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  128  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  129  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  130  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  131  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  132  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  133  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  134  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  135  * SUCH DAMAGE.
  136  *
  137  *      @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
  138  */
  139 
  140 /*
  141  *      TODO list for SYN cache stuff:
  142  *
  143  *      Find room for a "state" field, which is needed to keep a
  144  *      compressed state for TIME_WAIT TCBs.  It's been noted already
  145  *      that this is fairly important for very high-volume web and
  146  *      mail servers, which use a large number of short-lived
  147  *      connections.
  148  */
  149 
  150 #include <sys/cdefs.h>
  151 __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.190.2.8 2005/04/22 06:58:40 tron Exp $");
  152 
  153 #include "opt_inet.h"
  154 #include "opt_ipsec.h"
  155 #include "opt_inet_csum.h"
  156 #include "opt_tcp_debug.h"
  157 
  158 #include <sys/param.h>
  159 #include <sys/systm.h>
  160 #include <sys/malloc.h>
  161 #include <sys/mbuf.h>
  162 #include <sys/protosw.h>
  163 #include <sys/socket.h>
  164 #include <sys/socketvar.h>
  165 #include <sys/errno.h>
  166 #include <sys/syslog.h>
  167 #include <sys/pool.h>
  168 #include <sys/domain.h>
  169 #include <sys/kernel.h>
  170 
  171 #include <net/if.h>
  172 #include <net/route.h>
  173 #include <net/if_types.h>
  174 
  175 #include <netinet/in.h>
  176 #include <netinet/in_systm.h>
  177 #include <netinet/ip.h>
  178 #include <netinet/in_pcb.h>
  179 #include <netinet/in_var.h>
  180 #include <netinet/ip_var.h>
  181 
  182 #ifdef INET6
  183 #ifndef INET
  184 #include <netinet/in.h>
  185 #endif
  186 #include <netinet/ip6.h>
  187 #include <netinet6/ip6_var.h>
  188 #include <netinet6/in6_pcb.h>
  189 #include <netinet6/ip6_var.h>
  190 #include <netinet6/in6_var.h>
  191 #include <netinet/icmp6.h>
  192 #include <netinet6/nd6.h>
  193 #endif
  194 
  195 #ifndef INET6
  196 /* always need ip6.h for IP6_EXTHDR_GET */
  197 #include <netinet/ip6.h>
  198 #endif
  199 
  200 #include <netinet/tcp.h>
  201 #include <netinet/tcp_fsm.h>
  202 #include <netinet/tcp_seq.h>
  203 #include <netinet/tcp_timer.h>
  204 #include <netinet/tcp_var.h>
  205 #include <netinet/tcpip.h>
  206 #include <netinet/tcp_debug.h>
  207 
  208 #include <machine/stdarg.h>
  209 
  210 #ifdef IPSEC
  211 #include <netinet6/ipsec.h>
  212 #include <netkey/key.h>
  213 #endif /*IPSEC*/
  214 #ifdef INET6
  215 #include "faith.h"
  216 #if defined(NFAITH) && NFAITH > 0
  217 #include <net/if_faith.h>
  218 #endif
  219 #endif  /* INET6 */
  220 
  221 #ifdef FAST_IPSEC
  222 #include <netipsec/ipsec.h>
  223 #include <netipsec/ipsec_var.h>                 /* XXX ipsecstat namespace */
  224 #include <netipsec/key.h>
  225 #ifdef INET6
  226 #include <netipsec/ipsec6.h>
  227 #endif
  228 #endif  /* FAST_IPSEC*/
  229 
  230 
  231 int     tcprexmtthresh = 3;     /* NOTE(review): by name, a retransmit threshold; not used in this part of the file -- confirm */
  232 int     tcp_log_refused;        /* NOTE(review): presumably gates the tcp4/tcp6_log_refused() logging below -- confirm */
  233 
      /*
       * Counter / timestamp pairs.  NOTE(review): by their names these look
       * like packets-per-second rate-limit state for RST generation and for
       * dropped ACKs; their users are outside this part of the file -- confirm.
       */
  234 static int tcp_rst_ppslim_count = 0;
  235 static struct timeval tcp_rst_ppslim_last;
  236 static int tcp_ackdrop_ppslim_count = 0;
  237 static struct timeval tcp_ackdrop_ppslim_last;
  238 
  239 #define TCP_PAWS_IDLE   (24 * 24 * 60 * 60 * PR_SLOWHZ) /* seconds in 24 days, multiplied by PR_SLOWHZ */
  240 
  241 /* Modulo comparisons of timestamps: test the sign of the int-cast difference (a)-(b). */
  242 #define TSTMP_LT(a,b)   ((int)((a)-(b)) < 0)
  243 #define TSTMP_GEQ(a,b)  ((int)((a)-(b)) >= 0)
  244 
  245 /*
  246  * Neighbor Discovery, Neighbor Unreachability Detection upper-layer hint.
       *
       * ND6_HINT(tp) calls nd6_nud_hint() on the route cached in the IPv6 PCB
       * of tp, provided that tp is non-NULL, has an IPv6 PCB (t_in6pcb), has
       * t_family == AF_INET6, and that PCB's in6p_route.ro_rt is set.  When
       * INET6 is not configured the macro expands to nothing.
       *
       * The argument is parenthesized at every use so that any pointer
       * expression can be passed.  It is evaluated more than once, so it must
       * not have side effects.
  247  */
  248 #ifdef INET6
  249 #define ND6_HINT(tp) \
  250 do { \
  251         if ((tp) && (tp)->t_in6pcb && (tp)->t_family == AF_INET6 && \
  252             (tp)->t_in6pcb->in6p_route.ro_rt) { \
  253                 nd6_nud_hint((tp)->t_in6pcb->in6p_route.ro_rt, NULL, 0); \
  254         } \
  255 } while (/*CONSTCOND*/ 0)
  256 #else
  257 #define ND6_HINT(tp)
  258 #endif
  259 
  260 /*
  261  * Macro to compute ACK transmission behavior.  Delay the ACK unless
  262  * we have already delayed an ACK (must send an ACK every two segments).
  263  * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
  264  * option is enabled.
       *
       * tp is the connection whose t_flags are tested and updated; th is the
       * TCP header whose th_flags are tested for TH_PUSH.  If TF_DELACK is
       * already set, or tcp_ack_on_push is nonzero and the segment carries
       * TH_PUSH, TF_ACKNOW is set; otherwise TCP_SET_DELACK(tp) is invoked.
       * Both arguments are parenthesized at every use; tp is evaluated more
       * than once, so it must not have side effects.
  265  */
  266 #define TCP_SETUP_ACK(tp, th) \
  267 do { \
  268         if ((tp)->t_flags & TF_DELACK || \
  269             (tcp_ack_on_push && (th)->th_flags & TH_PUSH)) \
  270                 (tp)->t_flags |= TF_ACKNOW; \
  271         else \
  272                 TCP_SET_DELACK(tp); \
  273 } while (/*CONSTCOND*/ 0)
  274 
  275 /*
  276  * Convert TCP protocol fields to host order for easier processing.
       *
       * Applies NTOHL to th_seq and th_ack and NTOHS to th_win and th_urp of
       * the header pointed to by th.  The macros are used as bare statements,
       * so they presumably rewrite each field in place -- NOTE(review): confirm
       * against the NTOHL/NTOHS definitions.
  277  */
  278 #define TCP_FIELDS_TO_HOST(th)                                          \
  279 do {                                                                    \
  280         NTOHL((th)->th_seq);                                            \
  281         NTOHL((th)->th_ack);                                            \
  282         NTOHS((th)->th_win);                                            \
  283         NTOHS((th)->th_urp);                                            \
  284 } while (/*CONSTCOND*/ 0)
  285 
  286 /*
  287  * Convert TCP protocol fields back to network order.
       *
       * Applies HTONL to th_seq and th_ack and HTONS to th_win and th_urp of
       * the header pointed to by th, i.e. the same four fields that
       * TCP_FIELDS_TO_HOST converts in the other direction.
  288  */
  289 #define TCP_FIELDS_TO_NET(th)                                           \
  290 do {                                                                    \
  291         HTONL((th)->th_seq);                                            \
  292         HTONL((th)->th_ack);                                            \
  293         HTONS((th)->th_win);                                            \
  294         HTONS((th)->th_urp);                                            \
  295 } while (/*CONSTCOND*/ 0)
  296 
  297 #ifdef TCP_CSUM_COUNTERS
      /*
       * Optional checksum event counters.  The evcnt objects are only declared
       * here (extern); TCP_CSUM_COUNTER_INCR(ev) takes a pointer to one of them
       * and increments its ev_count.  Without TCP_CSUM_COUNTERS the macro
       * expands to nothing, so call sites need no #ifdef of their own.
       */
  298 #include <sys/device.h>
  299 
  300 extern struct evcnt tcp_hwcsum_ok;
  301 extern struct evcnt tcp_hwcsum_bad;
  302 extern struct evcnt tcp_hwcsum_data;
  303 extern struct evcnt tcp_swcsum;
  304 
  305 #define TCP_CSUM_COUNTER_INCR(ev)       (ev)->ev_count++
  306 
  307 #else
  308 
  309 #define TCP_CSUM_COUNTER_INCR(ev)       /* nothing */
  310 
  311 #endif /* TCP_CSUM_COUNTERS */
  312 
  313 #ifdef TCP_REASS_COUNTERS
      /*
       * Optional event counters for tcp_reass().  The evcnt objects are only
       * declared here (extern).  tcp_reass() bumps one counter per code path it
       * takes; tcp_reass_iteration[] is indexed by the number of queue entries
       * it walked (1..7), with slot 0 used when more than 7 were walked.
       * Without TCP_REASS_COUNTERS the increment macro expands to nothing.
       */
  314 #include <sys/device.h>
  315 
  316 extern struct evcnt tcp_reass_;
  317 extern struct evcnt tcp_reass_empty;
  318 extern struct evcnt tcp_reass_iteration[8];
  319 extern struct evcnt tcp_reass_prependfirst;
  320 extern struct evcnt tcp_reass_prepend;
  321 extern struct evcnt tcp_reass_insert;
  322 extern struct evcnt tcp_reass_inserttail;
  323 extern struct evcnt tcp_reass_append;
  324 extern struct evcnt tcp_reass_appendtail;
  325 extern struct evcnt tcp_reass_overlaptail;
  326 extern struct evcnt tcp_reass_overlapfront;
  327 extern struct evcnt tcp_reass_segdup;
  328 extern struct evcnt tcp_reass_fragdup;
  329 
  330 #define TCP_REASS_COUNTER_INCR(ev)      (ev)->ev_count++
  331 
  332 #else
  333 
  334 #define TCP_REASS_COUNTER_INCR(ev)      /* nothing */
  335 
  336 #endif /* TCP_REASS_COUNTERS */
  337 
  338 #ifdef INET
      /* Logs an IPv4 connection attempt via log(LOG_INFO, ...); defined later in this file. */
  339 static void tcp4_log_refused __P((const struct ip *, const struct tcphdr *));
  340 #endif
  341 #ifdef INET6
      /* IPv6 counterpart of tcp4_log_refused(); defined later in this file. */
  342 static void tcp6_log_refused
  343     __P((const struct ip6_hdr *, const struct tcphdr *));
  344 #endif
  345 
      /* Pool that tcp_reass() draws its struct ipqent queue entries from (pool_get/pool_put). */
  346 struct pool tcpipqent_pool;
  347 
      /*
       * tcp_reass: queue an out-of-order TCP segment on tp->segq, merging it
       * with any queued entries it touches or overlaps, and then hand the
       * head entry to the socket if it starts exactly at tp->rcv_nxt.
       *
       * tp    connection whose segq (sequence-ordered) and timeq queues are
       *       updated.
       * th    TCP header of the segment; th_seq and th_flags are read.  When
       *       th is 0 nothing is queued and only the "present" step runs.
       * m     mbuf chain with the segment data.  It ends up linked into a
       *       queue entry, concatenated with queued data, or freed here.
       *       It is not touched when th is 0.
       * tlen  pointer to the segment's data length; only read.
       *
       * Returns 0 when nothing was delivered to the socket; otherwise the
       * TH_FIN bit of the flags stored in the delivered entry (which may
       * itself be 0).
       */
  348 int
  349 tcp_reass(tp, th, m, tlen)
  350         struct tcpcb *tp;
  351         struct tcphdr *th;
  352         struct mbuf *m;
  353         int *tlen;
  354 {
  355         struct ipqent *p, *q, *nq, *tiqe = NULL;
  356         struct socket *so = NULL;
  357         int pkt_flags;
  358         tcp_seq pkt_seq;
  359         unsigned pkt_len;
  360         u_long rcvpartdupbyte = 0;
  361         u_long rcvoobyte;
  362 #ifdef TCP_REASS_COUNTERS
  363         u_int count = 0;
  364 #endif
  365 
              /*
               * NOTE(review): so stays NULL if neither PCB pointer is set, and
               * the delivery code at "present" dereferences it unconditionally.
               * Presumably callers guarantee one of them is set -- confirm.
               */
  366         if (tp->t_inpcb)
  367                 so = tp->t_inpcb->inp_socket;
  368 #ifdef INET6
  369         else if (tp->t_in6pcb)
  370                 so = tp->t_in6pcb->in6p_socket;
  371 #endif
  372 
  373         TCP_REASS_LOCK_CHECK(tp);
  374 
  375         /*
  376          * Called with th == 0 after becoming established to
  377          * force pre-ESTABLISHED data up to the user socket.
  378          */
  379         if (th == 0)
  380                 goto present;
  381 
  382         rcvoobyte = *tlen;
  383         /*
  384          * Copy these to local variables because the tcpiphdr
  385          * gets munged while we are collapsing mbufs.
  386          */
  387         pkt_seq = th->th_seq;
  388         pkt_len = *tlen;
  389         pkt_flags = th->th_flags;
  390 
  391         TCP_REASS_COUNTER_INCR(&tcp_reass_);
  392 
  393         if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) {
  394                 /*
  395                  * When we miss a packet, the vast majority of time we get
  396                  * packets that follow it in order.  So optimize for that.
  397                  */
  398                 if (pkt_seq == p->ipqe_seq + p->ipqe_len) {
  399                         p->ipqe_len += pkt_len;
  400                         p->ipqe_flags |= pkt_flags;
  401                         m_cat(p->ipqe_m, m);
  402                         m = NULL;
  403                         tiqe = p;
                              /* p stays on segq; only its timeq position is refreshed. */
  404                         TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq);
  405                         TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail);
  406                         goto skip_replacement;
  407                 }
  408                 /*
  409                  * While we're here, if the pkt is completely beyond
  410                  * anything we have, just insert it at the tail.
  411                  */
  412                 if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) {
  413                         TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail);
  414                         goto insert_it;
  415                 }
  416         }
  417 
  418         q = TAILQ_FIRST(&tp->segq);
  419 
  420         if (q != NULL) {
  421                 /*
  422                  * If this segment immediately precedes the first out-of-order
  423                  * block, simply slap the segment in front of it and (mostly)
  424                  * skip the complicated logic.
  425                  */
  426                 if (pkt_seq + pkt_len == q->ipqe_seq) {
  427                         q->ipqe_seq = pkt_seq;
  428                         q->ipqe_len += pkt_len;
  429                         q->ipqe_flags |= pkt_flags;
  430                         m_cat(m, q->ipqe_m);
  431                         q->ipqe_m = m;
  432                         tiqe = q;
                              /* q stays on segq; only its timeq position is refreshed. */
  433                         TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
  434                         TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst);
  435                         goto skip_replacement;
  436                 }
  437         } else {
  438                 TCP_REASS_COUNTER_INCR(&tcp_reass_empty);
  439         }
  440 
  441         /*
  442          * Find a segment which begins after this one does.
               * nq is fetched before q is examined because q may be unlinked
               * (free_ipqe) during the iteration.
  443          */
  444         for (p = NULL; q != NULL; q = nq) {
  445                 nq = TAILQ_NEXT(q, ipqe_q);
  446 #ifdef TCP_REASS_COUNTERS
  447                 count++;
  448 #endif
  449                 /*
  450                  * If the received segment is just right after this
  451                  * fragment, merge the two together and then check
  452                  * for further overlaps.
  453                  */
  454                 if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
  455 #ifdef TCPREASS_DEBUG
  456                         printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
  457                                tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
  458                                q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len);
  459 #endif
  460                         pkt_len += q->ipqe_len;
  461                         pkt_flags |= q->ipqe_flags;
  462                         pkt_seq = q->ipqe_seq;
  463                         m_cat(q->ipqe_m, m);
  464                         m = q->ipqe_m;
  465                         TCP_REASS_COUNTER_INCR(&tcp_reass_append);
  466                         goto free_ipqe;
  467                 }
  468                 /*
  469                  * If the received segment is completely past this
  470                  * fragment, we need to go to the next fragment.
  471                  */
  472                 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
  473                         p = q;
  474                         continue;
  475                 }
  476                 /*
  477                  * If the fragment is past the received segment,
  478                  * it (or any following) can't be concatenated.
  479                  */
  480                 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) {
  481                         TCP_REASS_COUNTER_INCR(&tcp_reass_insert);
  482                         break;
  483                 }
  484 
  485                 /*
  486                  * We've received all the data in this segment before.
  487                  * Mark it as a duplicate and return.
  488                  */
  489                 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
  490                     SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
  491                         tcpstat.tcps_rcvduppack++;
  492                         tcpstat.tcps_rcvdupbyte += pkt_len;
  493                         m_freem(m);
                              /*
                               * An entry saved for reuse earlier in this loop
                               * was already unlinked from both queues, so it
                               * has to go back to the pool here.
                               */
  494                         if (tiqe != NULL)
  495                                 pool_put(&tcpipqent_pool, tiqe);
  496                         TCP_REASS_COUNTER_INCR(&tcp_reass_segdup);
  497                         return (0);
  498                 }
  499                 /*
  500                  * Received segment completely overlaps this fragment
  501                  * so we drop the fragment (this keeps the temporal
  502                  * ordering of segments correct).
  503                  */
  504                 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
  505                     SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
  506                         rcvpartdupbyte += q->ipqe_len;
  507                         m_freem(q->ipqe_m);
  508                         TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup);
  509                         goto free_ipqe;
  510                 }
  511                 /*
  512                  * RX'ed segment extends past the end of the
  513                  * fragment.  Drop the overlapping bytes.  Then
  514                  * merge the fragment and segment then treat as
  515                  * a longer received packet.
  516                  */
  517                 if (SEQ_LT(q->ipqe_seq, pkt_seq) &&
  518                     SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq))  {
  519                         int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
  520 #ifdef TCPREASS_DEBUG
  521                         printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
  522                                tp, overlap,
  523                                pkt_seq, pkt_seq + pkt_len, pkt_len);
  524 #endif
                              /* Positive length: trim from the front of m. */
  525                         m_adj(m, overlap);
  526                         rcvpartdupbyte += overlap;
  527                         m_cat(q->ipqe_m, m);
  528                         m = q->ipqe_m;
  529                         pkt_seq = q->ipqe_seq;
  530                         pkt_len += q->ipqe_len - overlap;
  531                         rcvoobyte -= overlap;
  532                         TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail);
  533                         goto free_ipqe;
  534                 }
  535                 /*
  536                  * RX'ed segment extends past the front of the
  537                  * fragment.  Drop the overlapping bytes on the
  538                  * received packet.  The packet will then be
  539                  * concatenated with this fragment a bit later.
  540                  */
  541                 if (SEQ_GT(q->ipqe_seq, pkt_seq) &&
  542                     SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len))  {
  543                         int overlap = pkt_seq + pkt_len - q->ipqe_seq;
  544 #ifdef TCPREASS_DEBUG
  545                         printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
  546                                tp, overlap,
  547                                pkt_seq, pkt_seq + pkt_len, pkt_len);
  548 #endif
                              /* Negative length: trim from the tail of m. */
  549                         m_adj(m, -overlap);
  550                         pkt_len -= overlap;
  551                         rcvpartdupbyte += overlap;
  552                         TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront);
  553                         rcvoobyte -= overlap;
  554                 }
  555                 /*
  556                  * If the received segment immediately precedes this
  557                  * fragment then tack the fragment onto this segment
  558                  * and reinsert the data.
  559                  */
  560                 if (q->ipqe_seq == pkt_seq + pkt_len) {
  561 #ifdef TCPREASS_DEBUG
  562                         printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
  563                                tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len,
  564                                pkt_seq, pkt_seq + pkt_len, pkt_len);
  565 #endif
  566                         pkt_len += q->ipqe_len;
  567                         pkt_flags |= q->ipqe_flags;
  568                         m_cat(m, q->ipqe_m);
  569                         TAILQ_REMOVE(&tp->segq, q, ipqe_q);
  570                         TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
  571                         if (tiqe == NULL)
  572                                 tiqe = q;
  573                         else
  574                                 pool_put(&tcpipqent_pool, q);
  575                         TCP_REASS_COUNTER_INCR(&tcp_reass_prepend);
  576                         break;
  577                 }
  578                 /*
  579                  * If the fragment is before the segment, remember it.
  580                  * When this loop is terminated, p will contain the
  581                  * pointer to fragment that is right before the received
  582                  * segment.
  583                  */
  584                 if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
  585                         p = q;
  586 
  587                 continue;
  588 
  589                 /*
  590                  * This is a common operation, reached only by goto from
  591                  * the cases above (the continue keeps normal flow out).
  592                  * The first entry freed is kept in tiqe for reuse, which
                       * saves a pool_put/pool_get pair in most instances.
                       */
  593           free_ipqe:
  594                 TAILQ_REMOVE(&tp->segq, q, ipqe_q);
  595                 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
  596                 if (tiqe == NULL)
  597                         tiqe = q;
  598                 else
  599                         pool_put(&tcpipqent_pool, q);
  600         }
  601 
  602 #ifdef TCP_REASS_COUNTERS
              /* Slot 0 records walks of more than 7 entries; slots 1..7 exact lengths. */
  603         if (count > 7)
  604                 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]);
  605         else if (count > 0)
  606                 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]);
  607 #endif
  608 
  609     insert_it:
  610 
  611         /*
  612          * Allocate a new queue entry since the received segment did not
  613          * collapse onto any other out-of-order block; thus we are allocating
  614          * a new block.  If it had collapsed, tiqe would not be NULL and
  615          * we would be reusing it.
  616          * XXX If we can't, just drop the packet.  XXX
  617          */
  618         if (tiqe == NULL) {
  619                 tiqe = pool_get(&tcpipqent_pool, PR_NOWAIT);
  620                 if (tiqe == NULL) {
  621                         tcpstat.tcps_rcvmemdrop++;
  622                         m_freem(m);
  623                         return (0);
  624                 }
  625         }
  626 
  627         /*
  628          * Update the counters.
  629          */
  630         tcpstat.tcps_rcvoopack++;
  631         tcpstat.tcps_rcvoobyte += rcvoobyte;
  632         if (rcvpartdupbyte) {
  633             tcpstat.tcps_rcvpartduppack++;
  634             tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte;
  635         }
  636 
  637         /*
  638          * Insert the new fragment queue entry into both queues.
               * On segq it goes after p, the last entry found to precede the
               * segment, or at the head when there is none.
  639          */
  640         tiqe->ipqe_m = m;
  641         tiqe->ipqe_seq = pkt_seq;
  642         tiqe->ipqe_len = pkt_len;
  643         tiqe->ipqe_flags = pkt_flags;
  644         if (p == NULL) {
  645                 TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
  646 #ifdef TCPREASS_DEBUG
  647                 if (tiqe->ipqe_seq != tp->rcv_nxt)
  648                         printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
  649                                tp, pkt_seq, pkt_seq + pkt_len, pkt_len);
  650 #endif
  651         } else {
  652                 TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q);
  653 #ifdef TCPREASS_DEBUG
  654                 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
  655                        tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
  656                        p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len);
  657 #endif
  658         }
  659 
  660 skip_replacement:
  661 
              /*
               * Entries arriving from the two fast paths are still linked on
               * segq and were only taken off timeq, so only timeq needs the
               * insert.  The entry just touched always goes to the head.
               */
  662         TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);
  663 
  664 present:
  665         /*
  666          * Present data to user, advancing rcv_nxt through
  667          * completed sequence space.  Only the head entry of segq is
               * considered, and only if it starts exactly at rcv_nxt.
  668          */
  669         if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
  670                 return (0);
  671         q = TAILQ_FIRST(&tp->segq);
  672         if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
  673                 return (0);
              /* In SYN_RECEIVED an entry that carries data is held back. */
  674         if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
  675                 return (0);
  676 
  677         tp->rcv_nxt += q->ipqe_len;
  678         pkt_flags = q->ipqe_flags & TH_FIN;
  679         ND6_HINT(tp);
  680 
  681         TAILQ_REMOVE(&tp->segq, q, ipqe_q);
  682         TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
              /* Data is discarded rather than queued once the socket cannot receive more. */
  683         if (so->so_state & SS_CANTRCVMORE)
  684                 m_freem(q->ipqe_m);
  685         else
  686                 sbappendstream(&so->so_rcv, q->ipqe_m);
  687         pool_put(&tcpipqent_pool, q);
  688         sorwakeup(so);
  689         return (pkt_flags);
  690 }
  691 
  692 #ifdef INET6
      /*
       * tcp6_input: IPv6 entry point that hands a packet to tcp_input().
       *
       * mp     pointer to the incoming mbuf chain pointer; only *mp is read.
       * offp   pointer to an offset; *offp is passed on to tcp_input().
       * proto  passed on to tcp_input() unchanged.
       *
       * A packet whose first mbuf is flagged M_ANYCAST6 is not given to
       * tcp_input().  Instead icmp6_error() is called with ICMP6_DST_UNREACH /
       * ICMP6_DST_UNREACH_ADDR and the byte offset of ip6_dst within the IPv6
       * header.  Always returns IPPROTO_DONE.
       *
       * NOTE(review): m is never freed or used again here after
       * icmp6_error()/tcp_input(), so those calls presumably take ownership
       * of the chain -- confirm.
       */
  693 int
  694 tcp6_input(mp, offp, proto)
  695         struct mbuf **mp;
  696         int *offp, proto;
  697 {
  698         struct mbuf *m = *mp;
  699 
  700         /*
  701          * draft-itojun-ipv6-tcp-to-anycast
  702          * better place to put this in?
  703          */
  704         if (m->m_flags & M_ANYCAST6) {
  705                 struct ip6_hdr *ip6;
                      /* Make the whole IPv6 header contiguous before mtod(). */
  706                 if (m->m_len < sizeof(struct ip6_hdr)) {
  707                         if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
  708                                 tcpstat.tcps_rcvshort++;
  709                                 return IPPROTO_DONE;
  710                         }
  711                 }
  712                 ip6 = mtod(m, struct ip6_hdr *);
  713                 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
  714                     (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
  715                 return IPPROTO_DONE;
  716         }
  717 
  718         tcp_input(m, *offp, proto);
  719         return IPPROTO_DONE;
  720 }
  721 #endif
  722 
  723 #ifdef INET
  724 static void
  725 tcp4_log_refused(ip, th)
  726         const struct ip *ip;
  727         const struct tcphdr *th;
  728 {
  729         char src[4*sizeof "123"];
  730         char dst[4*sizeof "123"];
  731 
  732         if (ip) {
  733                 strlcpy(src, inet_ntoa(ip->ip_src), sizeof(src));
  734                 strlcpy(dst, inet_ntoa(ip->ip_dst), sizeof(dst));
  735         }
  736         else {
  737                 strlcpy(src, "(unknown)", sizeof(src));
  738                 strlcpy(dst, "(unknown)", sizeof(dst));
  739         }
  740         log(LOG_INFO,
  741             "Connection attempt to TCP %s:%d from %s:%d\n",
  742             dst, ntohs(th->th_dport),
  743             src, ntohs(th->th_sport));
  744 }
  745 #endif
  746 
  747 #ifdef INET6
  748 static void
  749 tcp6_log_refused(ip6, th)
  750         const struct ip6_hdr *ip6;
  751         const struct tcphdr *th;
  752 {
  753         char src[INET6_ADDRSTRLEN];
  754         char dst[INET6_ADDRSTRLEN];
  755 
  756         if (ip6) {
  757                 strlcpy(src, ip6_sprintf(&ip6->ip6_src), sizeof(src));
  758                 strlcpy(dst, ip6_sprintf(&ip6->ip6_dst), sizeof(dst));
  759         }
  760         else {
  761                 strlcpy(src, "(unknown v6)", sizeof(src));
  762                 strlcpy(dst, "(unknown v6)", sizeof(dst));
  763         }
  764         log(LOG_INFO,
  765             "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
  766             dst, ntohs(th->th_dport),
  767             src, ntohs(th->th_sport));
  768 }
  769 #endif
  770 
  771 /*
  772  * TCP input routine, follows pages 65-76 of the
  773  * protocol specification dated September, 1981 very closely.
  774  */
  775 void
  776 #if __STDC__
  777 tcp_input(struct mbuf *m, ...)
  778 #else
  779 tcp_input(m, va_alist)
  780         struct mbuf *m;
  781 #endif
  782 {
  783         struct tcphdr *th;
  784         struct ip *ip;
  785         struct inpcb *inp;
  786 #ifdef INET6
  787         struct ip6_hdr *ip6;
  788         struct in6pcb *in6p;
  789 #endif
  790         u_int8_t *optp = NULL;
  791         int optlen = 0;
  792         int len, tlen, toff, hdroptlen = 0;
  793         struct tcpcb *tp = 0;
  794         int tiflags;
  795         struct socket *so = NULL;
  796         int todrop, acked, ourfinisacked, needoutput = 0;
  797 #ifdef TCP_DEBUG
  798         short ostate = 0;
  799 #endif
  800         int iss = 0;
  801         u_long tiwin;
  802         struct tcp_opt_info opti;
  803         int off, iphlen;
  804         va_list ap;
  805         int af;         /* af on the wire */
  806         struct mbuf *tcp_saveti = NULL;
  807 
  808         MCLAIM(m, &tcp_rx_mowner);
  809         va_start(ap, m);
  810         toff = va_arg(ap, int);
  811         (void)va_arg(ap, int);          /* ignore value, advance ap */
  812         va_end(ap);
  813 
  814         tcpstat.tcps_rcvtotal++;
  815 
  816         bzero(&opti, sizeof(opti));
  817         opti.ts_present = 0;
  818         opti.maxseg = 0;
  819 
  820         /*
  821          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
  822          *
  823          * TCP is, by definition, unicast, so we reject all
  824          * multicast outright.
  825          *
  826          * Note, there are additional src/dst address checks in
  827          * the AF-specific code below.
  828          */
  829         if (m->m_flags & (M_BCAST|M_MCAST)) {
  830                 /* XXX stat */
  831                 goto drop;
  832         }
  833 #ifdef INET6
  834         if (m->m_flags & M_ANYCAST6) {
  835                 /* XXX stat */
  836                 goto drop;
  837         }
  838 #endif
  839 
  840         /*
  841          * Get IP and TCP header together in first mbuf.
  842          * Note: IP leaves IP header in first mbuf.
  843          */
  844         ip = mtod(m, struct ip *);
  845 #ifdef INET6
  846         ip6 = NULL;
  847 #endif
  848         switch (ip->ip_v) {
  849 #ifdef INET
  850         case 4:
  851                 af = AF_INET;
  852                 iphlen = sizeof(struct ip);
  853                 ip = mtod(m, struct ip *);
  854                 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
  855                         sizeof(struct tcphdr));
  856                 if (th == NULL) {
  857                         tcpstat.tcps_rcvshort++;
  858                         return;
  859                 }
  860                 /* We do the checksum after PCB lookup... */
  861                 len = ntohs(ip->ip_len);
  862                 tlen = len - toff;
  863                 break;
  864 #endif
  865 #ifdef INET6
  866         case 6:
  867                 ip = NULL;
  868                 iphlen = sizeof(struct ip6_hdr);
  869                 af = AF_INET6;
  870                 ip6 = mtod(m, struct ip6_hdr *);
  871                 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
  872                         sizeof(struct tcphdr));
  873                 if (th == NULL) {
  874                         tcpstat.tcps_rcvshort++;
  875                         return;
  876                 }
  877 
  878                 /* Be proactive about malicious use of IPv4 mapped address */
  879                 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
  880                     IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
  881                         /* XXX stat */
  882                         goto drop;
  883                 }
  884 
  885                 /*
  886                  * Be proactive about unspecified IPv6 address in source.
  887                  * As we use all-zero to indicate unbounded/unconnected pcb,
  888                  * unspecified IPv6 address can be used to confuse us.
  889                  *
  890                  * Note that packets with unspecified IPv6 destination is
  891                  * already dropped in ip6_input.
  892                  */
  893                 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
  894                         /* XXX stat */
  895                         goto drop;
  896                 }
  897 
  898                 /*
  899                  * Make sure destination address is not multicast.
  900                  * Source address checked in ip6_input().
  901                  */
  902                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
  903                         /* XXX stat */
  904                         goto drop;
  905                 }
  906 
  907                 /* We do the checksum after PCB lookup... */
  908                 len = m->m_pkthdr.len;
  909                 tlen = len - toff;
  910                 break;
  911 #endif
  912         default:
  913                 m_freem(m);
  914                 return;
  915         }
  916 
  917         KASSERT(TCP_HDR_ALIGNED_P(th));
  918 
  919         /*
  920          * Check that TCP offset makes sense,
  921          * pull out TCP options and adjust length.              XXX
  922          */
  923         off = th->th_off << 2;
  924         if (off < sizeof (struct tcphdr) || off > tlen) {
  925                 tcpstat.tcps_rcvbadoff++;
  926                 goto drop;
  927         }
  928         tlen -= off;
  929 
  930         /*
  931          * tcp_input() has been modified to use tlen to mean the TCP data
  932          * length throughout the function.  Other functions can use
  933          * m->m_pkthdr.len as the basis for calculating the TCP data length.
  934          * rja
  935          */
  936 
  937         if (off > sizeof (struct tcphdr)) {
  938                 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off);
  939                 if (th == NULL) {
  940                         tcpstat.tcps_rcvshort++;
  941                         return;
  942                 }
  943                 /*
  944                  * NOTE: ip/ip6 will not be affected by m_pulldown()
  945                  * (as they're before toff) and we don't need to update those.
  946                  */
  947                 KASSERT(TCP_HDR_ALIGNED_P(th));
  948                 optlen = off - sizeof (struct tcphdr);
  949                 optp = ((u_int8_t *)th) + sizeof(struct tcphdr);
  950                 /*
  951                  * Do quick retrieval of timestamp options ("options
  952                  * prediction?").  If timestamp is the only option and it's
  953                  * formatted as recommended in RFC 1323 appendix A, we
  954                  * quickly get the values now and not bother calling
  955                  * tcp_dooptions(), etc.
  956                  */
  957                 if ((optlen == TCPOLEN_TSTAMP_APPA ||
  958                      (optlen > TCPOLEN_TSTAMP_APPA &&
  959                         optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
  960                      *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
  961                      (th->th_flags & TH_SYN) == 0) {
  962                         opti.ts_present = 1;
  963                         opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
  964                         opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
  965                         optp = NULL;    /* we've parsed the options */
  966                 }
  967         }
  968         tiflags = th->th_flags;
  969 
  970         /*
  971          * Locate pcb for segment.
  972          */
  973 findpcb:
  974         inp = NULL;
  975 #ifdef INET6
  976         in6p = NULL;
  977 #endif
  978         switch (af) {
  979 #ifdef INET
  980         case AF_INET:
  981                 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport,
  982                     ip->ip_dst, th->th_dport);
  983                 if (inp == 0) {
  984                         ++tcpstat.tcps_pcbhashmiss;
  985                         inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport);
  986                 }
  987 #ifdef INET6
  988                 if (inp == 0) {
  989                         struct in6_addr s, d;
  990 
  991                         /* mapped addr case */
  992                         bzero(&s, sizeof(s));
  993                         s.s6_addr16[5] = htons(0xffff);
  994                         bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src));
  995                         bzero(&d, sizeof(d));
  996                         d.s6_addr16[5] = htons(0xffff);
  997                         bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst));
  998                         in6p = in6_pcblookup_connect(&tcbtable, &s,
  999                             th->th_sport, &d, th->th_dport, 0);
 1000                         if (in6p == 0) {
 1001                                 ++tcpstat.tcps_pcbhashmiss;
 1002                                 in6p = in6_pcblookup_bind(&tcbtable, &d,
 1003                                     th->th_dport, 0);
 1004                         }
 1005                 }
 1006 #endif
 1007 #ifndef INET6
 1008                 if (inp == 0)
 1009 #else
 1010                 if (inp == 0 && in6p == 0)
 1011 #endif
 1012                 {
 1013                         ++tcpstat.tcps_noport;
 1014                         if (tcp_log_refused &&
 1015                             (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
 1016                                 tcp4_log_refused(ip, th);
 1017                         }
 1018                         TCP_FIELDS_TO_HOST(th);
 1019                         goto dropwithreset_ratelim;
 1020                 }
 1021 #if defined(IPSEC) || defined(FAST_IPSEC)
 1022                 if (inp && (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0 &&
 1023                     ipsec4_in_reject(m, inp)) {
 1024                         ipsecstat.in_polvio++;
 1025                         goto drop;
 1026                 }
 1027 #ifdef INET6
 1028                 else if (in6p &&
 1029                     (in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 &&
 1030                     ipsec4_in_reject_so(m, in6p->in6p_socket)) {
 1031                         ipsecstat.in_polvio++;
 1032                         goto drop;
 1033                 }
 1034 #endif
 1035 #endif /*IPSEC*/
 1036                 break;
 1037 #endif /*INET*/
 1038 #ifdef INET6
 1039         case AF_INET6:
 1040             {
 1041                 int faith;
 1042 
 1043 #if defined(NFAITH) && NFAITH > 0
 1044                 faith = faithprefix(&ip6->ip6_dst);
 1045 #else
 1046                 faith = 0;
 1047 #endif
 1048                 in6p = in6_pcblookup_connect(&tcbtable, &ip6->ip6_src,
 1049                     th->th_sport, &ip6->ip6_dst, th->th_dport, faith);
 1050                 if (in6p == NULL) {
 1051                         ++tcpstat.tcps_pcbhashmiss;
 1052                         in6p = in6_pcblookup_bind(&tcbtable, &ip6->ip6_dst,
 1053                                 th->th_dport, faith);
 1054                 }
 1055                 if (in6p == NULL) {
 1056                         ++tcpstat.tcps_noport;
 1057                         if (tcp_log_refused &&
 1058                             (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
 1059                                 tcp6_log_refused(ip6, th);
 1060                         }
 1061                         TCP_FIELDS_TO_HOST(th);
 1062                         goto dropwithreset_ratelim;
 1063                 }
 1064 #if defined(IPSEC) || defined(FAST_IPSEC)
 1065                 if ((in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 &&
 1066                     ipsec6_in_reject(m, in6p)) {
 1067                         ipsec6stat.in_polvio++;
 1068                         goto drop;
 1069                 }
 1070 #endif /*IPSEC*/
 1071                 break;
 1072             }
 1073 #endif
 1074         }
 1075 
 1076         /*
 1077          * If the state is CLOSED (i.e., TCB does not exist) then
 1078          * all data in the incoming segment is discarded.
 1079          * If the TCB exists but is in CLOSED state, it is embryonic,
 1080          * but should either do a listen or a connect soon.
 1081          */
 1082         tp = NULL;
 1083         so = NULL;
 1084         if (inp) {
 1085                 tp = intotcpcb(inp);
 1086                 so = inp->inp_socket;
 1087         }
 1088 #ifdef INET6
 1089         else if (in6p) {
 1090                 tp = in6totcpcb(in6p);
 1091                 so = in6p->in6p_socket;
 1092         }
 1093 #endif
 1094         if (tp == 0) {
 1095                 TCP_FIELDS_TO_HOST(th);
 1096                 goto dropwithreset_ratelim;
 1097         }
 1098         if (tp->t_state == TCPS_CLOSED)
 1099                 goto drop;
 1100 
 1101         /*
 1102          * Checksum extended TCP header and data.
 1103          */
 1104         switch (af) {
 1105 #ifdef INET
 1106         case AF_INET:
 1107                 switch (m->m_pkthdr.csum_flags &
 1108                         ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv4) |
 1109                          M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
 1110                 case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD:
 1111                         TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad);
 1112                         goto badcsum;
 1113 
 1114                 case M_CSUM_TCPv4|M_CSUM_DATA: {
 1115                         u_int32_t hw_csum = m->m_pkthdr.csum_data;
 1116                         TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data);
 1117                         if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) {
 1118                                 hw_csum = in_cksum_phdr(ip->ip_src.s_addr,
 1119                                     ip->ip_dst.s_addr,
 1120                                     htons(hw_csum + tlen + off + IPPROTO_TCP));
 1121                         }
 1122                         if ((hw_csum ^ 0xffff) != 0)
 1123                                 goto badcsum;
 1124                         break;
 1125                 }
 1126 
 1127                 case M_CSUM_TCPv4:
 1128                         /* Checksum was okay. */
 1129                         TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok);
 1130                         break;
 1131 
 1132                 default:
 1133                         /* Must compute it ourselves. */
 1134                         TCP_CSUM_COUNTER_INCR(&tcp_swcsum);
 1135                         if (in4_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0)
 1136                                 goto badcsum;
 1137                         break;
 1138                 }
 1139                 break;
 1140 #endif /* INET4 */
 1141 
 1142 #ifdef INET6
 1143         case AF_INET6:
 1144                 if (in6_cksum(m, IPPROTO_TCP, toff, tlen + off) != 0)
 1145                         goto badcsum;
 1146                 break;
 1147 #endif /* INET6 */
 1148         }
 1149 
 1150         TCP_FIELDS_TO_HOST(th);
 1151 
 1152         /* Unscale the window into a 32-bit value. */
 1153         if ((tiflags & TH_SYN) == 0)
 1154                 tiwin = th->th_win << tp->snd_scale;
 1155         else
 1156                 tiwin = th->th_win;
 1157 
 1158 #ifdef INET6
 1159         /* save packet options if user wanted */
 1160         if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) {
 1161                 if (in6p->in6p_options) {
 1162                         m_freem(in6p->in6p_options);
 1163                         in6p->in6p_options = 0;
 1164                 }
 1165                 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m);
 1166         }
 1167 #endif
 1168 
 1169         if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
 1170                 union syn_cache_sa src;
 1171                 union syn_cache_sa dst;
 1172 
 1173                 bzero(&src, sizeof(src));
 1174                 bzero(&dst, sizeof(dst));
 1175                 switch (af) {
 1176 #ifdef INET
 1177                 case AF_INET:
 1178                         src.sin.sin_len = sizeof(struct sockaddr_in);
 1179                         src.sin.sin_family = AF_INET;
 1180                         src.sin.sin_addr = ip->ip_src;
 1181                         src.sin.sin_port = th->th_sport;
 1182 
 1183                         dst.sin.sin_len = sizeof(struct sockaddr_in);
 1184                         dst.sin.sin_family = AF_INET;
 1185                         dst.sin.sin_addr = ip->ip_dst;
 1186                         dst.sin.sin_port = th->th_dport;
 1187                         break;
 1188 #endif
 1189 #ifdef INET6
 1190                 case AF_INET6:
 1191                         src.sin6.sin6_len = sizeof(struct sockaddr_in6);
 1192                         src.sin6.sin6_family = AF_INET6;
 1193                         src.sin6.sin6_addr = ip6->ip6_src;
 1194                         src.sin6.sin6_port = th->th_sport;
 1195 
 1196                         dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
 1197                         dst.sin6.sin6_family = AF_INET6;
 1198                         dst.sin6.sin6_addr = ip6->ip6_dst;
 1199                         dst.sin6.sin6_port = th->th_dport;
 1200                         break;
 1201 #endif /* INET6 */
 1202                 default:
 1203                         goto badsyn;    /*sanity*/
 1204                 }
 1205 
 1206                 if (so->so_options & SO_DEBUG) {
 1207 #ifdef TCP_DEBUG
 1208                         ostate = tp->t_state;
 1209 #endif
 1210 
 1211                         tcp_saveti = NULL;
 1212                         if (iphlen + sizeof(struct tcphdr) > MHLEN)
 1213                                 goto nosave;
 1214 
 1215                         if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) {
 1216                                 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT);
 1217                                 if (!tcp_saveti)
 1218                                         goto nosave;
 1219                         } else {
 1220                                 MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER);
 1221                                 if (!tcp_saveti)
 1222                                         goto nosave;
 1223                                 MCLAIM(m, &tcp_mowner);
 1224                                 tcp_saveti->m_len = iphlen;
 1225                                 m_copydata(m, 0, iphlen,
 1226                                     mtod(tcp_saveti, caddr_t));
 1227                         }
 1228 
 1229                         if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) {
 1230                                 m_freem(tcp_saveti);
 1231                                 tcp_saveti = NULL;
 1232                         } else {
 1233                                 tcp_saveti->m_len += sizeof(struct tcphdr);
 1234                                 bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen,
 1235                                     sizeof(struct tcphdr));
 1236                         }
 1237         nosave:;
 1238                 }
 1239                 if (so->so_options & SO_ACCEPTCONN) {
 1240                         if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
 1241                                 if (tiflags & TH_RST) {
 1242                                         syn_cache_reset(&src.sa, &dst.sa, th);
 1243                                 } else if ((tiflags & (TH_ACK|TH_SYN)) ==
 1244                                     (TH_ACK|TH_SYN)) {
 1245                                         /*
 1246                                          * Received a SYN,ACK.  This should
 1247                                          * never happen while we are in
 1248                                          * LISTEN.  Send an RST.
 1249                                          */
 1250                                         goto badsyn;
 1251                                 } else if (tiflags & TH_ACK) {
 1252                                         so = syn_cache_get(&src.sa, &dst.sa,
 1253                                                 th, toff, tlen, so, m);
 1254                                         if (so == NULL) {
 1255                                                 /*
 1256                                                  * We don't have a SYN for
 1257                                                  * this ACK; send an RST.
 1258                                                  */
 1259                                                 goto badsyn;
 1260                                         } else if (so ==
 1261                                             (struct socket *)(-1)) {
 1262                                                 /*
 1263                                                  * We were unable to create
 1264                                                  * the connection.  If the
 1265                                                  * 3-way handshake was
 1266                                                  * completed, and RST has
 1267                                                  * been sent to the peer.
 1268                                                  * Since the mbuf might be
 1269                                                  * in use for the reply,
 1270                                                  * do not free it.
 1271                                                  */
 1272                                                 m = NULL;
 1273                                         } else {
 1274                                                 /*
 1275                                                  * We have created a
 1276                                                  * full-blown connection.
 1277                                                  */
 1278                                                 tp = NULL;
 1279                                                 inp = NULL;
 1280 #ifdef INET6
 1281                                                 in6p = NULL;
 1282 #endif
 1283                                                 switch (so->so_proto->pr_domain->dom_family) {
 1284 #ifdef INET
 1285                                                 case AF_INET:
 1286                                                         inp = sotoinpcb(so);
 1287                                                         tp = intotcpcb(inp);
 1288                                                         break;
 1289 #endif
 1290 #ifdef INET6
 1291                                                 case AF_INET6:
 1292                                                         in6p = sotoin6pcb(so);
 1293                                                         tp = in6totcpcb(in6p);
 1294                                                         break;
 1295 #endif
 1296                                                 }
 1297                                                 if (tp == NULL)
 1298                                                         goto badsyn;    /*XXX*/
 1299                                                 tiwin <<= tp->snd_scale;
 1300                                                 goto after_listen;
 1301                                         }
 1302                                 } else {
 1303                                         /*
 1304                                          * None of RST, SYN or ACK was set.
 1305                                          * This is an invalid packet for a
 1306                                          * TCB in LISTEN state.  Send a RST.
 1307                                          */
 1308                                         goto badsyn;
 1309                                 }
 1310                         } else {
 1311                                 /*
 1312                                  * Received a SYN.
 1313                                  */
 1314 
 1315 #ifdef INET6
 1316                                 /*
 1317                                  * If deprecated address is forbidden, we do
 1318                                  * not accept SYN to deprecated interface
 1319                                  * address to prevent any new inbound
 1320                                  * connection from getting established.
 1321                                  * When we do not accept SYN, we send a TCP
 1322                                  * RST, with deprecated source address (instead
 1323                                  * of dropping it).  We compromise it as it is
 1324                                  * much better for peer to send a RST, and
 1325                                  * RST will be the final packet for the
 1326                                  * exchange.
 1327                                  *
 1328                                  * If we do not forbid deprecated addresses, we
 1329                                  * accept the SYN packet.  RFC2462 does not
 1330                                  * suggest dropping SYN in this case.
 1331                                  * If we decipher RFC2462 5.5.4, it says like
 1332                                  * this:
 1333                                  * 1. use of deprecated addr with existing
 1334                                  *    communication is okay - "SHOULD continue
 1335                                  *    to be used"
 1336                                  * 2. use of it with new communication:
 1337                                  *   (2a) "SHOULD NOT be used if alternate
 1338                                  *        address with sufficient scope is
 1339                                  *        available"
 1340                                  *   (2b) nothing mentioned otherwise. 
 1341                                  * Here we fall into (2b) case as we have no
 1342                                  * choice in our source address selection - we
 1343                                  * must obey the peer.
 1344                                  *
 1345                                  * The wording in RFC2462 is confusing, and
 1346                                  * there are multiple description text for
 1347                                  * deprecated address handling - worse, they
 1348                                  * are not exactly the same.  I believe 5.5.4
 1349                                  * is the best one, so we follow 5.5.4.
 1350                                  */
 1351                                 if (af == AF_INET6 && !ip6_use_deprecated) {
 1352                                         struct in6_ifaddr *ia6;
 1353                                         if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
 1354                                             &ip6->ip6_dst)) &&
 1355                                             (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
 1356                                                 tp = NULL;
 1357                                                 goto dropwithreset;
 1358                                         }
 1359                                 }
 1360 #endif
 1361 
 1362 #ifdef IPSEC
 1363                                 switch (af) {
 1364                                 case AF_INET:
 1365                                         if (ipsec4_in_reject_so(m, so)) {
 1366                                                 ipsecstat.in_polvio++;
 1367                                                 tp = NULL;
 1368                                                 goto dropwithreset;
 1369                                         }
 1370                                         break;
 1371 #ifdef INET6
 1372                                 case AF_INET6:
 1373                                         if (ipsec6_in_reject_so(m, so)) {
 1374                                                 ipsec6stat.in_polvio++;
 1375                                                 tp = NULL;
 1376                                                 goto dropwithreset;
 1377                                         }
 1378                                         break;
 1379 #endif
 1380                                 }
 1381 #endif
 1382 
 1383                                 /*
 1384                                  * LISTEN socket received a SYN
 1385                                  * from itself?  This can't possibly
 1386                                  * be valid; drop the packet.
 1387                                  */
 1388                                 if (th->th_sport == th->th_dport) {
 1389                                         int i;
 1390 
 1391                                         switch (af) {
 1392 #ifdef INET
 1393                                         case AF_INET:
 1394                                                 i = in_hosteq(ip->ip_src, ip->ip_dst);
 1395                                                 break;
 1396 #endif
 1397 #ifdef INET6
 1398                                         case AF_INET6:
 1399                                                 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst);
 1400                                                 break;
 1401 #endif
 1402                                         default:
 1403                                                 i = 1;
 1404                                         }
 1405                                         if (i) {
 1406                                                 tcpstat.tcps_badsyn++;
 1407                                                 goto drop;
 1408                                         }
 1409                                 }
 1410 
 1411                                 /*
 1412                                  * SYN looks ok; create compressed TCP
 1413                                  * state for it.
 1414                                  */
 1415                                 if (so->so_qlen <= so->so_qlimit &&
 1416                                     syn_cache_add(&src.sa, &dst.sa, th, tlen,
 1417                                                 so, m, optp, optlen, &opti))
 1418                                         m = NULL;
 1419                         }
 1420                         goto drop;
 1421                 }
 1422         }
 1423 
 1424 after_listen:
 1425 #ifdef DIAGNOSTIC
 1426         /*
 1427          * Should not happen now that all embryonic connections
 1428          * are handled with compressed state.
 1429          */
 1430         if (tp->t_state == TCPS_LISTEN)
 1431                 panic("tcp_input: TCPS_LISTEN");
 1432 #endif
 1433 
 1434         /*
 1435          * Segment received on connection.
 1436          * Reset idle time and keep-alive timer.
 1437          */
 1438         tp->t_rcvtime = tcp_now;
 1439         if (TCPS_HAVEESTABLISHED(tp->t_state))
 1440                 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
 1441 
 1442         /*
 1443          * Process options.
 1444          */
 1445         if (optp)
 1446                 tcp_dooptions(tp, optp, optlen, th, &opti);
 1447 
 1448         if (opti.ts_present && opti.ts_ecr) {
 1449                 u_int32_t now;
 1450 
 1451                 /*
 1452                  * Calculate the RTT from the returned time stamp and the
 1453                  * connection's time base.  If the time stamp is later than
 1454                  * the current time, fall back to non-1323 RTT calculation.
 1455                  */
 1456                 now = TCP_TIMESTAMP(tp);
 1457                 if (SEQ_GEQ(now, opti.ts_ecr))
 1458                         opti.ts_ecr = now - opti.ts_ecr + 1;
 1459                 else
 1460                         opti.ts_ecr = 0;
 1461         }
 1462 
 1463         /*
 1464          * Header prediction: check for the two common cases
 1465          * of a uni-directional data xfer.  If the packet has
 1466          * no control flags, is in-sequence, the window didn't
 1467          * change and we're not retransmitting, it's a
 1468          * candidate.  If the length is zero and the ack moved
 1469          * forward, we're the sender side of the xfer.  Just
 1470          * free the data acked & wake any higher level process
 1471          * that was blocked waiting for space.  If the length
 1472          * is non-zero and the ack didn't move, we're the
 1473          * receiver side.  If we're getting packets in-order
 1474          * (the reassembly queue is empty), add the data to
 1475          * the socket buffer and note that we need a delayed ack.
 1476          */
 1477         if (tp->t_state == TCPS_ESTABLISHED &&
 1478             (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
 1479             (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
 1480             th->th_seq == tp->rcv_nxt &&
 1481             tiwin && tiwin == tp->snd_wnd &&
 1482             tp->snd_nxt == tp->snd_max) {
 1483 
 1484                 /*
 1485                  * If last ACK falls within this segment's sequence numbers,
 1486                  *  record the timestamp.
 1487                  */
 1488                 if (opti.ts_present &&
 1489                     SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 1490                     SEQ_LT(tp->last_ack_sent, th->th_seq + tlen)) {
 1491                         tp->ts_recent_age = TCP_TIMESTAMP(tp);
 1492                         tp->ts_recent = opti.ts_val;
 1493                 }
 1494 
 1495                 if (tlen == 0) {
 1496                         if (SEQ_GT(th->th_ack, tp->snd_una) &&
 1497                             SEQ_LEQ(th->th_ack, tp->snd_max) &&
 1498                             tp->snd_cwnd >= tp->snd_wnd &&
 1499                             tp->t_dupacks < tcprexmtthresh) {
 1500                                 /*
 1501                                  * this is a pure ack for outstanding data.
 1502                                  */
 1503                                 ++tcpstat.tcps_predack;
 1504                                 if (opti.ts_present && opti.ts_ecr)
 1505                                         tcp_xmit_timer(tp, opti.ts_ecr);
 1506                                 else if (tp->t_rtttime &&
 1507                                     SEQ_GT(th->th_ack, tp->t_rtseq))
 1508                                         tcp_xmit_timer(tp,
 1509                                         tcp_now - tp->t_rtttime);
 1510                                 acked = th->th_ack - tp->snd_una;
 1511                                 tcpstat.tcps_rcvackpack++;
 1512                                 tcpstat.tcps_rcvackbyte += acked;
 1513                                 ND6_HINT(tp);
 1514 
 1515                                 if (acked > (tp->t_lastoff - tp->t_inoff))
 1516                                         tp->t_lastm = NULL;
 1517                                 sbdrop(&so->so_snd, acked);
 1518                                 tp->t_lastoff -= acked;
 1519 
 1520                                 /*
 1521                                  * We want snd_recover to track snd_una to
 1522                                  * avoid sequence wraparound problems for
 1523                                  * very large transfers.
 1524                                  */
 1525                                 tp->snd_una = tp->snd_recover = th->th_ack;
 1526                                 m_freem(m);
 1527 
 1528                                 /*
 1529                                  * If all outstanding data are acked, stop
 1530                                  * retransmit timer, otherwise restart timer
 1531                                  * using current (possibly backed-off) value.
 1532                                  * If process is waiting for space,
 1533                                  * wakeup/selwakeup/signal.  If data
 1534                                  * are ready to send, let tcp_output
 1535                                  * decide between more output or persist.
 1536                                  */
 1537                                 if (tp->snd_una == tp->snd_max)
 1538                                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1539                                 else if (TCP_TIMER_ISARMED(tp,
 1540                                     TCPT_PERSIST) == 0)
 1541                                         TCP_TIMER_ARM(tp, TCPT_REXMT,
 1542                                             tp->t_rxtcur);
 1543 
 1544                                 sowwakeup(so);
 1545                                 if (so->so_snd.sb_cc)
 1546                                         (void) tcp_output(tp);
 1547                                 if (tcp_saveti)
 1548                                         m_freem(tcp_saveti);
 1549                                 return;
 1550                         }
 1551                 } else if (th->th_ack == tp->snd_una &&
 1552                     TAILQ_FIRST(&tp->segq) == NULL &&
 1553                     tlen <= sbspace(&so->so_rcv)) {
 1554                         /*
 1555                          * this is a pure, in-sequence data packet
 1556                          * with nothing on the reassembly queue and
 1557                          * we have enough buffer space to take it.
 1558                          */
 1559                         ++tcpstat.tcps_preddat;
 1560                         tp->rcv_nxt += tlen;
 1561                         tcpstat.tcps_rcvpack++;
 1562                         tcpstat.tcps_rcvbyte += tlen;
 1563                         ND6_HINT(tp);
 1564                         /*
 1565                          * Drop TCP, IP headers and TCP options then add data
 1566                          * to socket buffer.
 1567                          */
 1568                         if (so->so_state & SS_CANTRCVMORE)
 1569                                 m_freem(m);
 1570                         else {
 1571                                 m_adj(m, toff + off);
 1572                                 sbappendstream(&so->so_rcv, m);
 1573                         }
 1574                         sorwakeup(so);
 1575                         TCP_SETUP_ACK(tp, th);
 1576                         if (tp->t_flags & TF_ACKNOW)
 1577                                 (void) tcp_output(tp);
 1578                         if (tcp_saveti)
 1579                                 m_freem(tcp_saveti);
 1580                         return;
 1581                 }
 1582         }
 1583 
 1584         /*
 1585          * Compute mbuf offset to TCP data segment.
 1586          */
 1587         hdroptlen = toff + off;
 1588 
 1589         /*
 1590          * Calculate amount of space in receive window,
 1591          * and then do TCP input processing.
 1592          * Receive window is amount of space in rcv queue,
 1593          * but not less than advertised window.
 1594          */
 1595         { int win;
 1596 
 1597         win = sbspace(&so->so_rcv);
 1598         if (win < 0)
 1599                 win = 0;
 1600         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 1601         }
 1602 
 1603         switch (tp->t_state) {
 1604         case TCPS_LISTEN:
 1605                 /*
 1606                  * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
 1607                  */
 1608                 if (m->m_flags & (M_BCAST|M_MCAST))
 1609                         goto drop;
 1610                 switch (af) {
 1611 #ifdef INET6
 1612                 case AF_INET6:
 1613                         if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
 1614                                 goto drop;
 1615                         break;
 1616 #endif /* INET6 */
 1617                 case AF_INET:
 1618                         if (IN_MULTICAST(ip->ip_dst.s_addr) ||
 1619                             in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 1620                                 goto drop;
 1621                         break;
 1622                 }
 1623                 break;
 1624 
 1625         /*
 1626          * If the state is SYN_SENT:
 1627          *      if seg contains an ACK, but not for our SYN, drop the input.
 1628          *      if seg contains a RST, then drop the connection.
 1629          *      if seg does not contain SYN, then drop it.
 1630          * Otherwise this is an acceptable SYN segment
 1631          *      initialize tp->rcv_nxt and tp->irs
 1632          *      if seg contains ack then advance tp->snd_una
 1633          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 1634          *      arrange for segment to be acked (eventually)
 1635          *      continue processing rest of data/controls, beginning with URG
 1636          */
 1637         case TCPS_SYN_SENT:
 1638                 if ((tiflags & TH_ACK) &&
 1639                     (SEQ_LEQ(th->th_ack, tp->iss) ||
 1640                      SEQ_GT(th->th_ack, tp->snd_max)))
 1641                         goto dropwithreset;
 1642                 if (tiflags & TH_RST) {
 1643                         if (tiflags & TH_ACK)
 1644                                 tp = tcp_drop(tp, ECONNREFUSED);
 1645                         goto drop;
 1646                 }
 1647                 if ((tiflags & TH_SYN) == 0)
 1648                         goto drop;
 1649                 if (tiflags & TH_ACK) {
 1650                         tp->snd_una = tp->snd_recover = th->th_ack;
 1651                         if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 1652                                 tp->snd_nxt = tp->snd_una;
 1653                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1654                 }
 1655                 tp->irs = th->th_seq;
 1656                 tcp_rcvseqinit(tp);
 1657                 tp->t_flags |= TF_ACKNOW;
 1658                 tcp_mss_from_peer(tp, opti.maxseg);
 1659 
 1660                 /*
 1661                  * Initialize the initial congestion window.  If we
 1662                  * had to retransmit the SYN, we must initialize cwnd
 1663                  * to 1 segment (i.e. the Loss Window).
 1664                  */
 1665                 if (tp->t_flags & TF_SYN_REXMT)
 1666                         tp->snd_cwnd = tp->t_peermss;
 1667                 else {
 1668                         int ss = tcp_init_win;
 1669 #ifdef INET
 1670                         if (inp != NULL && in_localaddr(inp->inp_faddr))
 1671                                 ss = tcp_init_win_local;
 1672 #endif
 1673 #ifdef INET6
 1674                         if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
 1675                                 ss = tcp_init_win_local;
 1676 #endif
 1677                         tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
 1678                 }
 1679 
 1680                 tcp_rmx_rtt(tp);
 1681                 if (tiflags & TH_ACK) {
 1682                         tcpstat.tcps_connects++;
 1683                         soisconnected(so);
 1684                         tcp_established(tp);
 1685                         /* Do window scaling on this connection? */
 1686                         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 1687                             (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 1688                                 tp->snd_scale = tp->requested_s_scale;
 1689                                 tp->rcv_scale = tp->request_r_scale;
 1690                         }
 1691                         TCP_REASS_LOCK(tp);
 1692                         (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
 1693                         TCP_REASS_UNLOCK(tp);
 1694                         /*
 1695                          * if we didn't have to retransmit the SYN,
 1696                          * use its rtt as our initial srtt & rtt var.
 1697                          */
 1698                         if (tp->t_rtttime)
 1699                                 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
 1700                 } else
 1701                         tp->t_state = TCPS_SYN_RECEIVED;
 1702 
 1703                 /*
 1704                  * Advance th->th_seq to correspond to first data byte.
 1705                  * If data, trim to stay within window,
 1706                  * dropping FIN if necessary.
 1707                  */
 1708                 th->th_seq++;
 1709                 if (tlen > tp->rcv_wnd) {
 1710                         todrop = tlen - tp->rcv_wnd;
 1711                         m_adj(m, -todrop);
 1712                         tlen = tp->rcv_wnd;
 1713                         tiflags &= ~TH_FIN;
 1714                         tcpstat.tcps_rcvpackafterwin++;
 1715                         tcpstat.tcps_rcvbyteafterwin += todrop;
 1716                 }
 1717                 tp->snd_wl1 = th->th_seq - 1;
 1718                 tp->rcv_up = th->th_seq;
 1719                 goto step6;
 1720 
 1721         /*
 1722          * If the state is SYN_RECEIVED:
 1723          *      If seg contains an ACK, but not for our SYN, drop the input
 1724          *      and generate an RST.  See page 36, rfc793
 1725          */
 1726         case TCPS_SYN_RECEIVED:
 1727                 if ((tiflags & TH_ACK) &&
 1728                     (SEQ_LEQ(th->th_ack, tp->iss) ||
 1729                      SEQ_GT(th->th_ack, tp->snd_max)))
 1730                         goto dropwithreset;
 1731                 break;
 1732         }
 1733 
 1734         /*
 1735          * States other than LISTEN or SYN_SENT.
 1736          * First check timestamp, if present.
 1737          * Then check that at least some bytes of segment are within
 1738          * receive window.  If segment begins before rcv_nxt,
 1739          * drop leading data (and SYN); if nothing left, just ack.
 1740          *
 1741          * RFC 1323 PAWS: If we have a timestamp reply on this segment
 1742          * and it's less than ts_recent, drop it.
 1743          */
 1744         if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
 1745             TSTMP_LT(opti.ts_val, tp->ts_recent)) {
 1746 
 1747                 /* Check to see if ts_recent is over 24 days old.  */
 1748                 if ((int)(TCP_TIMESTAMP(tp) - tp->ts_recent_age) >
 1749                     TCP_PAWS_IDLE) {
 1750                         /*
 1751                          * Invalidate ts_recent.  If this segment updates
 1752                          * ts_recent, the age will be reset later and ts_recent
 1753                          * will get a valid value.  If it does not, setting
 1754                          * ts_recent to zero will at least satisfy the
 1755                          * requirement that zero be placed in the timestamp
 1756                          * echo reply when ts_recent isn't valid.  The
 1757                          * age isn't reset until we get a valid ts_recent
 1758                          * because we don't want out-of-order segments to be
 1759                          * dropped when ts_recent is old.
 1760                          */
 1761                         tp->ts_recent = 0;
 1762                 } else {
 1763                         tcpstat.tcps_rcvduppack++;
 1764                         tcpstat.tcps_rcvdupbyte += tlen;
 1765                         tcpstat.tcps_pawsdrop++;
 1766                         goto dropafterack;
 1767                 }
 1768         }
 1769 
 1770         todrop = tp->rcv_nxt - th->th_seq;
 1771         if (todrop > 0) {
 1772                 if (tiflags & TH_SYN) {
 1773                         tiflags &= ~TH_SYN;
 1774                         th->th_seq++;
 1775                         if (th->th_urp > 1)
 1776                                 th->th_urp--;
 1777                         else {
 1778                                 tiflags &= ~TH_URG;
 1779                                 th->th_urp = 0;
 1780                         }
 1781                         todrop--;
 1782                 }
 1783                 if (todrop > tlen ||
 1784                     (todrop == tlen && (tiflags & TH_FIN) == 0)) {
 1785                         /*
 1786                          * Any valid FIN or RST must be to the left of the
 1787                          * window.  At this point the FIN or RST must be a
 1788                          * duplicate or out of sequence; drop it.
 1789                          */
 1790                         if (tiflags & TH_RST)
 1791                                 goto drop;
 1792                         tiflags &= ~(TH_FIN|TH_RST);
 1793                         /*
 1794                          * Send an ACK to resynchronize and drop any data.
 1795                          * But keep on processing for RST or ACK.
 1796                          */
 1797                         tp->t_flags |= TF_ACKNOW;
 1798                         todrop = tlen;
 1799                         tcpstat.tcps_rcvdupbyte += todrop;
 1800                         tcpstat.tcps_rcvduppack++;
 1801                 } else if ((tiflags & TH_RST) &&
 1802                            th->th_seq != tp->last_ack_sent) {
 1803                         /*
 1804                          * Test for reset before adjusting the sequence
 1805                          * number for overlapping data.
 1806                          */
 1807                         goto dropafterack_ratelim;
 1808                 } else {
 1809                         tcpstat.tcps_rcvpartduppack++;
 1810                         tcpstat.tcps_rcvpartdupbyte += todrop;
 1811                 }
 1812                 hdroptlen += todrop;    /*drop from head afterwards*/
 1813                 th->th_seq += todrop;
 1814                 tlen -= todrop;
 1815                 if (th->th_urp > todrop)
 1816                         th->th_urp -= todrop;
 1817                 else {
 1818                         tiflags &= ~TH_URG;
 1819                         th->th_urp = 0;
 1820                 }
 1821         }
 1822 
 1823         /*
 1824          * If new data are received on a connection after the
 1825          * user processes are gone, then RST the other end.
 1826          */
 1827         if ((so->so_state & SS_NOFDREF) &&
 1828             tp->t_state > TCPS_CLOSE_WAIT && tlen) {
 1829                 tp = tcp_close(tp);
 1830                 tcpstat.tcps_rcvafterclose++;
 1831                 goto dropwithreset;
 1832         }
 1833 
 1834         /*
 1835          * If segment ends after window, drop trailing data
 1836          * (and PUSH and FIN); if nothing left, just ACK.
 1837          */
 1838         todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
 1839         if (todrop > 0) {
 1840                 tcpstat.tcps_rcvpackafterwin++;
 1841                 if (todrop >= tlen) {
 1842                         /*
 1843                          * The segment actually starts after the window.
 1844                          * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen
 1845                          * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0
 1846                          * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd
 1847                          */
 1848                         tcpstat.tcps_rcvbyteafterwin += tlen;
 1849                         /*
 1850                          * If a new connection request is received
 1851                          * while in TIME_WAIT, drop the old connection
 1852                          * and start over if the sequence numbers
 1853                          * are above the previous ones.
 1854                          *
 1855                          * NOTE: We will checksum the packet again, and
 1856                          * so we need to put the header fields back into
 1857                          * network order!
 1858                          * XXX This kind of sucks, but we don't expect
 1859                          * XXX this to happen very often, so maybe it
 1860                          * XXX doesn't matter so much.
 1861                          */
 1862                         if (tiflags & TH_SYN &&
 1863                             tp->t_state == TCPS_TIME_WAIT &&
 1864                             SEQ_GT(th->th_seq, tp->rcv_nxt)) {
 1865                                 iss = tcp_new_iss(tp, tp->snd_nxt);
 1866                                 tp = tcp_close(tp);
 1867                                 TCP_FIELDS_TO_NET(th);
 1868                                 goto findpcb;
 1869                         }
 1870                         /*
 1871                          * If window is closed can only take segments at
 1872                          * window edge, and have to drop data and PUSH from
 1873                          * incoming segments.  Continue processing, but
 1874                          * remember to ack.  Otherwise, drop segment
 1875                          * and (if not RST) ack.
 1876                          */
 1877                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 1878                                 tp->t_flags |= TF_ACKNOW;
 1879                                 tcpstat.tcps_rcvwinprobe++;
 1880                         } else
 1881                                 goto dropafterack;
 1882                 } else
 1883                         tcpstat.tcps_rcvbyteafterwin += todrop;
 1884                 m_adj(m, -todrop);
 1885                 tlen -= todrop;
 1886                 tiflags &= ~(TH_PUSH|TH_FIN);
 1887         }
 1888 
 1889         /*
 1890          * If last ACK falls within this segment's sequence numbers,
 1891          * and the timestamp is newer, record it.
 1892          */
 1893         if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
 1894             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 1895             SEQ_LT(tp->last_ack_sent, th->th_seq + tlen +
 1896                    ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
 1897                 tp->ts_recent_age = TCP_TIMESTAMP(tp);
 1898                 tp->ts_recent = opti.ts_val;
 1899         }
 1900 
 1901         /*
 1902          * If the RST bit is set examine the state:
 1903          *    SYN_RECEIVED STATE:
 1904          *      If passive open, return to LISTEN state.
 1905          *      If active open, inform user that connection was refused.
 1906          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
 1907          *      Inform user that connection was reset, and close tcb.
 1908          *    CLOSING, LAST_ACK, TIME_WAIT STATES
 1909          *      Close the tcb.
 1910          */
 1911         if (tiflags & TH_RST) {
 1912                 if (th->th_seq != tp->last_ack_sent)
 1913                         goto dropafterack_ratelim;
 1914 
 1915                 switch (tp->t_state) {
 1916                 case TCPS_SYN_RECEIVED:
 1917                         so->so_error = ECONNREFUSED;
 1918                         goto close;
 1919 
 1920                 case TCPS_ESTABLISHED:
 1921                 case TCPS_FIN_WAIT_1:
 1922                 case TCPS_FIN_WAIT_2:
 1923                 case TCPS_CLOSE_WAIT:
 1924                         so->so_error = ECONNRESET;
 1925                 close:
 1926                         tp->t_state = TCPS_CLOSED;
 1927                         tcpstat.tcps_drops++;
 1928                         tp = tcp_close(tp);
 1929                         goto drop;
 1930 
 1931                 case TCPS_CLOSING:
 1932                 case TCPS_LAST_ACK:
 1933                 case TCPS_TIME_WAIT:
 1934                         tp = tcp_close(tp);
 1935                         goto drop;
 1936                 }
 1937         }
 1938 
 1939         /*
 1940          * Since we've covered the SYN-SENT and SYN-RECEIVED states above
 1941          * we must be in a synchronized state.  RFC793 states (under RST
 1942          * generation) that any unacceptable segment (an out-of-order SYN
 1943          * qualifies) received in a synchronized state must elicit only an
 1944          * empty acknowledgment segment ... and the connection remains in
 1945          * the same state.
 1946          */
 1947         if (tiflags & TH_SYN) {
 1948                 if (tp->rcv_nxt == th->th_seq) {
 1949                         tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack - 1,
 1950                             TH_ACK);
 1951                         if (tcp_saveti)
 1952                                 m_freem(tcp_saveti);
 1953                         return;
 1954                 }
 1955                         
 1956                 goto dropafterack_ratelim;
 1957         }
 1958 
 1959         /*
 1960          * If the ACK bit is off we drop the segment and return.
 1961          */
 1962         if ((tiflags & TH_ACK) == 0) {
 1963                 if (tp->t_flags & TF_ACKNOW)
 1964                         goto dropafterack;
 1965                 else
 1966                         goto drop;
 1967         }
 1968 
 1969         /*
 1970          * Ack processing.
 1971          */
 1972         switch (tp->t_state) {
 1973 
 1974         /*
 1975          * In SYN_RECEIVED state if the ack ACKs our SYN then enter
 1976          * ESTABLISHED state and continue processing, otherwise
 1977          * send an RST.
 1978          */
 1979         case TCPS_SYN_RECEIVED:
 1980                 if (SEQ_GT(tp->snd_una, th->th_ack) ||
 1981                     SEQ_GT(th->th_ack, tp->snd_max))
 1982                         goto dropwithreset;
 1983                 tcpstat.tcps_connects++;
 1984                 soisconnected(so);
 1985                 tcp_established(tp);
 1986                 /* Do window scaling? */
 1987                 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 1988                     (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 1989                         tp->snd_scale = tp->requested_s_scale;
 1990                         tp->rcv_scale = tp->request_r_scale;
 1991                 }
 1992                 TCP_REASS_LOCK(tp);
 1993                 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
 1994                 TCP_REASS_UNLOCK(tp);
 1995                 tp->snd_wl1 = th->th_seq - 1;
 1996                 /* fall into ... */
 1997 
 1998         /*
 1999          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
 2000          * ACKs.  If the ack is in the range
 2001          *      tp->snd_una < th->th_ack <= tp->snd_max
 2002          * then advance tp->snd_una to th->th_ack and drop
 2003          * data from the retransmission queue.  If this ACK reflects
 2004          * more up to date window information we update our window information.
 2005          */
 2006         case TCPS_ESTABLISHED:
 2007         case TCPS_FIN_WAIT_1:
 2008         case TCPS_FIN_WAIT_2:
 2009         case TCPS_CLOSE_WAIT:
 2010         case TCPS_CLOSING:
 2011         case TCPS_LAST_ACK:
 2012         case TCPS_TIME_WAIT:
 2013 
 2014                 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 2015                         if (tlen == 0 && tiwin == tp->snd_wnd) {
 2016                                 tcpstat.tcps_rcvdupack++;
 2017                                 /*
 2018                                  * If we have outstanding data (other than
 2019                                  * a window probe), this is a completely
 2020                                  * duplicate ack (ie, window info didn't
 2021                                  * change), the ack is the biggest we've
 2022                                  * seen and we've seen exactly our rexmt
 2023                                  * threshold of them, assume a packet
 2024                                  * has been dropped and retransmit it.
 2025                                  * Kludge snd_nxt & the congestion
 2026                                  * window so we send only this one
 2027                                  * packet.
 2028                                  *
 2029                                  * We know we're losing at the current
 2030                                  * window size so do congestion avoidance
 2031                                  * (set ssthresh to half the current window
 2032                                  * and pull our congestion window back to
 2033                                  * the new ssthresh).
 2034                                  *
 2035                                  * Dup acks mean that packets have left the
 2036                                  * network (they're now cached at the receiver)
 2037                                  * so bump cwnd by the amount in the receiver
 2038                                  * to keep a constant cwnd packets in the
 2039                                  * network.
 2040                                  */
 2041                                 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
 2042                                     th->th_ack != tp->snd_una)
 2043                                         tp->t_dupacks = 0;
 2044                                 else if (++tp->t_dupacks == tcprexmtthresh) {
 2045                                         tcp_seq onxt = tp->snd_nxt;
 2046                                         u_int win =
 2047                                             min(tp->snd_wnd, tp->snd_cwnd) /
 2048                                             2 / tp->t_segsz;
 2049                                         if (tcp_do_newreno && SEQ_LT(th->th_ack,
 2050                                             tp->snd_recover)) {
 2051                                                 /*
 2052                                                  * False fast retransmit after
 2053                                                  * timeout.  Do not cut window.
 2054                                                  */
 2055                                                 tp->snd_cwnd += tp->t_segsz;
 2056                                                 tp->t_dupacks = 0;
 2057                                                 (void) tcp_output(tp);
 2058                                                 goto drop;
 2059                                         }
 2060 
 2061                                         if (win < 2)
 2062                                                 win = 2;
 2063                                         tp->snd_ssthresh = win * tp->t_segsz;
 2064                                         tp->snd_recover = tp->snd_max;
 2065                                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 2066                                         tp->t_rtttime = 0;
 2067                                         tp->snd_nxt = th->th_ack;
 2068                                         tp->snd_cwnd = tp->t_segsz;
 2069                                         (void) tcp_output(tp);
 2070                                         tp->snd_cwnd = tp->snd_ssthresh +
 2071                                                tp->t_segsz * tp->t_dupacks;
 2072                                         if (SEQ_GT(onxt, tp->snd_nxt))
 2073                                                 tp->snd_nxt = onxt;
 2074                                         goto drop;
 2075                                 } else if (tp->t_dupacks > tcprexmtthresh) {
 2076                                         tp->snd_cwnd += tp->t_segsz;
 2077                                         (void) tcp_output(tp);
 2078                                         goto drop;
 2079                                 }
 2080                         } else if (tlen) {
 2081                                 tp->t_dupacks = 0;      /*XXX*/
 2082                                 /* drop very old ACKs unless th_seq matches */
 2083                                 if (th->th_seq != tp->rcv_nxt &&
 2084                                     SEQ_LT(th->th_ack,
 2085                                     tp->snd_una - tp->max_sndwnd)) {
 2086                                         goto drop;
 2087                                 }
 2088                                 break;
 2089                         } else
 2090                                 tp->t_dupacks = 0;
 2091                         break;
 2092                 }
 2093                 /*
 2094                  * If the congestion window was inflated to account
 2095                  * for the other side's cached packets, retract it.
 2096                  */
 2097                 if (tcp_do_newreno == 0) {
 2098                         if (tp->t_dupacks >= tcprexmtthresh &&
 2099                             tp->snd_cwnd > tp->snd_ssthresh)
 2100                                 tp->snd_cwnd = tp->snd_ssthresh;
 2101                         tp->t_dupacks = 0;
 2102                 } else if (tp->t_dupacks >= tcprexmtthresh &&
 2103                            tcp_newreno(tp, th) == 0) {
 2104                         tp->snd_cwnd = tp->snd_ssthresh;
 2105                         /*
 2106                          * Window inflation should have left us with approx.
 2107                          * snd_ssthresh outstanding data.  But in case we
 2108                          * would be inclined to send a burst, better to do
 2109                          * it via the slow start mechanism.
 2110                          */
 2111                         if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
 2112                                 tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
 2113                                     + tp->t_segsz;
 2114                         tp->t_dupacks = 0;
 2115                 }
 2116                 if (SEQ_GT(th->th_ack, tp->snd_max)) {
 2117                         tcpstat.tcps_rcvacktoomuch++;
 2118                         goto dropafterack;
 2119                 }
 2120                 acked = th->th_ack - tp->snd_una;
 2121                 tcpstat.tcps_rcvackpack++;
 2122                 tcpstat.tcps_rcvackbyte += acked;
 2123 
 2124                 /*
 2125                  * If we have a timestamp reply, update smoothed
 2126                  * round trip time.  If no timestamp is present but
 2127                  * transmit timer is running and timed sequence
 2128                  * number was acked, update smoothed round trip time.
 2129                  * Since we now have an rtt measurement, cancel the
 2130                  * timer backoff (cf., Phil Karn's retransmit alg.).
 2131                  * Recompute the initial retransmit timer.
 2132                  */
 2133                 if (opti.ts_present && opti.ts_ecr)
 2134                         tcp_xmit_timer(tp, opti.ts_ecr);
 2135                 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
 2136                         tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
 2137 
 2138                 /*
 2139                  * If all outstanding data is acked, stop retransmit
 2140                  * timer and remember to restart (more output or persist).
 2141                  * If there is more data to be acked, restart retransmit
 2142                  * timer, using current (possibly backed-off) value.
 2143                  */
 2144                 if (th->th_ack == tp->snd_max) {
 2145                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 2146                         needoutput = 1;
 2147                 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
 2148                         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 2149                 /*
 2150                  * When new data is acked, open the congestion window.
 2151                  * If the window gives us less than ssthresh packets
 2152                  * in flight, open exponentially (segsz per packet).
 2153                  * Otherwise open linearly: segsz per window
 2154                  * (segsz^2 / cwnd per packet), plus a constant
 2155                  * fraction of a packet (segsz/8) to help larger windows
 2156                  * open quickly enough.
 2157                  */
 2158                 {
 2159                 u_int cw = tp->snd_cwnd;
 2160                 u_int incr = tp->t_segsz;
 2161 
 2162                 if (cw > tp->snd_ssthresh)
 2163                         incr = incr * incr / cw;
 2164                 if (tcp_do_newreno == 0 || SEQ_GEQ(th->th_ack, tp->snd_recover))
 2165                         tp->snd_cwnd = min(cw + incr,
 2166                             TCP_MAXWIN << tp->snd_scale);
 2167                 }
 2168                 ND6_HINT(tp);
 2169                 if (acked > so->so_snd.sb_cc) {
 2170                         tp->snd_wnd -= so->so_snd.sb_cc;
 2171                         sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
 2172                         ourfinisacked = 1;
 2173                 } else {
 2174                         if (acked > (tp->t_lastoff - tp->t_inoff))
 2175                                 tp->t_lastm = NULL;
 2176                         sbdrop(&so->so_snd, acked);
 2177                         tp->t_lastoff -= acked;
 2178                         tp->snd_wnd -= acked;
 2179                         ourfinisacked = 0;
 2180                 }
 2181                 sowwakeup(so);
 2182                 /*
 2183                  * We want snd_recover to track snd_una to
 2184                  * avoid sequence wraparound problems for
 2185                  * very large transfers.
 2186                  */
 2187                 tp->snd_una = tp->snd_recover = th->th_ack;
 2188                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 2189                         tp->snd_nxt = tp->snd_una;
 2190 
 2191                 switch (tp->t_state) {
 2192 
 2193                 /*
 2194                  * In FIN_WAIT_1 STATE in addition to the processing
 2195                  * for the ESTABLISHED state if our FIN is now acknowledged
 2196                  * then enter FIN_WAIT_2.
 2197                  */
 2198                 case TCPS_FIN_WAIT_1:
 2199                         if (ourfinisacked) {
 2200                                 /*
 2201                                  * If we can't receive any more
 2202                                  * data, then closing user can proceed.
 2203                                  * Starting the timer is contrary to the
 2204                                  * specification, but if we don't get a FIN
 2205                                  * we'll hang forever.
 2206                                  */
 2207                                 if (so->so_state & SS_CANTRCVMORE) {
 2208                                         soisdisconnected(so);
 2209                                         if (tcp_maxidle > 0)
 2210                                                 TCP_TIMER_ARM(tp, TCPT_2MSL,
 2211                                                     tcp_maxidle);
 2212                                 }
 2213                                 tp->t_state = TCPS_FIN_WAIT_2;
 2214                         }
 2215                         break;
 2216 
 2217                 /*
 2218                  * In CLOSING STATE in addition to the processing for
 2219                  * the ESTABLISHED state if the ACK acknowledges our FIN
 2220                  * then enter the TIME-WAIT state, otherwise ignore
 2221                  * the segment.
 2222                  */
 2223                 case TCPS_CLOSING:
 2224                         if (ourfinisacked) {
 2225                                 tp->t_state = TCPS_TIME_WAIT;
 2226                                 tcp_canceltimers(tp);
 2227                                 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 2228                                 soisdisconnected(so);
 2229                         }
 2230                         break;
 2231 
 2232                 /*
 2233                  * In LAST_ACK, we may still be waiting for data to drain
 2234                  * and/or to be acked, as well as for the ack of our FIN.
 2235                  * If our FIN is now acknowledged, delete the TCB,
 2236                  * enter the closed state and return.
 2237                  */
 2238                 case TCPS_LAST_ACK:
 2239                         if (ourfinisacked) {
 2240                                 tp = tcp_close(tp);
 2241                                 goto drop;
 2242                         }
 2243                         break;
 2244 
 2245                 /*
 2246                  * In TIME_WAIT state the only thing that should arrive
 2247                  * is a retransmission of the remote FIN.  Acknowledge
 2248                  * it and restart the finack timer.
 2249                  */
 2250                 case TCPS_TIME_WAIT:
 2251                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 2252                         goto dropafterack;
 2253                 }
 2254         }
 2255 
 2256 step6:
 2257         /*
 2258          * Update window information.
 2259          * Don't look at window if no ACK: TAC's send garbage on first SYN.
 2260          */
 2261         if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 2262             (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) ||
 2263             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))) {
 2264                 /* keep track of pure window updates */
 2265                 if (tlen == 0 &&
 2266                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 2267                         tcpstat.tcps_rcvwinupd++;
 2268                 tp->snd_wnd = tiwin;
 2269                 tp->snd_wl1 = th->th_seq;
 2270                 tp->snd_wl2 = th->th_ack;
 2271                 if (tp->snd_wnd > tp->max_sndwnd)
 2272                         tp->max_sndwnd = tp->snd_wnd;
 2273                 needoutput = 1;
 2274         }
 2275 
 2276         /*
 2277          * Process segments with URG.
 2278          */
 2279         if ((tiflags & TH_URG) && th->th_urp &&
 2280             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 2281                 /*
 2282                  * This is a kludge, but if we receive and accept
 2283                  * random urgent pointers, we'll crash in
 2284                  * soreceive.  It's hard to imagine someone
 2285                  * actually wanting to send this much urgent data.
 2286                  */
 2287                 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
 2288                         th->th_urp = 0;                 /* XXX */
 2289                         tiflags &= ~TH_URG;             /* XXX */
 2290                         goto dodata;                    /* XXX */
 2291                 }
 2292                 /*
 2293                  * If this segment advances the known urgent pointer,
 2294                  * then mark the data stream.  This should not happen
 2295                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
 2296                  * a FIN has been received from the remote side.
 2297                  * In these states we ignore the URG.
 2298                  *
 2299                  * According to RFC961 (Assigned Protocols),
 2300                  * the urgent pointer points to the last octet
 2301                  * of urgent data.  We continue, however,
 2302                  * to consider it to indicate the first octet
 2303                  * of data past the urgent section as the original
 2304                  * spec states (in one of two places).
 2305                  */
 2306                 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 2307                         tp->rcv_up = th->th_seq + th->th_urp;
 2308                         so->so_oobmark = so->so_rcv.sb_cc +
 2309                             (tp->rcv_up - tp->rcv_nxt) - 1;
 2310                         if (so->so_oobmark == 0)
 2311                                 so->so_state |= SS_RCVATMARK;
 2312                         sohasoutofband(so);
 2313                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 2314                 }
 2315                 /*
 2316                  * Remove out of band data so doesn't get presented to user.
 2317                  * This can happen independent of advancing the URG pointer,
 2318                  * but if two URG's are pending at once, some out-of-band
 2319                  * data may creep in... ick.
 2320                  */
 2321                 if (th->th_urp <= (u_int16_t) tlen
 2322 #ifdef SO_OOBINLINE
 2323                      && (so->so_options & SO_OOBINLINE) == 0
 2324 #endif
 2325                      )
 2326                         tcp_pulloutofband(so, th, m, hdroptlen);
 2327         } else
 2328                 /*
 2329                  * If no out of band data is expected,
 2330                  * pull receive urgent pointer along
 2331                  * with the receive window.
 2332                  */
 2333                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 2334                         tp->rcv_up = tp->rcv_nxt;
 2335 dodata:                                                 /* XXX */
 2336 
 2337         /*
 2338          * Process the segment text, merging it into the TCP sequencing queue,
 2339          * and arranging for acknowledgement of receipt if necessary.
 2340          * This process logically involves adjusting tp->rcv_wnd as data
 2341          * is presented to the user (this happens in tcp_usrreq.c,
 2342          * case PRU_RCVD).  If a FIN has already been received on this
 2343          * connection then we just ignore the text.
 2344          */
 2345         if ((tlen || (tiflags & TH_FIN)) &&
 2346             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 2347                 /*
 2348                  * Insert segment ti into reassembly queue of tcp with
 2349                  * control block tp.  Return TH_FIN if reassembly now includes
 2350                  * a segment with FIN.  The macro form does the common case
 2351                  * inline (segment is the next to be received on an
 2352                  * established connection, and the queue is empty),
 2353                  * avoiding linkage into and removal from the queue and
 2354                  * repetition of various conversions.
 2355                  * Set DELACK for segments received in order, but ack
 2356                  * immediately when segments are out of order
 2357                  * (so fast retransmit can work).
 2358                  */
 2359                 /* NOTE: this was TCP_REASS() macro, but used only once */
 2360                 TCP_REASS_LOCK(tp);
 2361                 if (th->th_seq == tp->rcv_nxt &&
 2362                     TAILQ_FIRST(&tp->segq) == NULL &&
 2363                     tp->t_state == TCPS_ESTABLISHED) {
 2364                         TCP_SETUP_ACK(tp, th);
 2365                         tp->rcv_nxt += tlen;
 2366                         tiflags = th->th_flags & TH_FIN;
 2367                         tcpstat.tcps_rcvpack++;
 2368                         tcpstat.tcps_rcvbyte += tlen;
 2369                         ND6_HINT(tp);
 2370                         if (so->so_state & SS_CANTRCVMORE)
 2371                                 m_freem(m);
 2372                         else {
 2373                                 m_adj(m, hdroptlen);
 2374                                 sbappendstream(&(so)->so_rcv, m);
 2375                         }
 2376                         sorwakeup(so);
 2377                 } else {
 2378                         m_adj(m, hdroptlen);
 2379                         tiflags = tcp_reass(tp, th, m, &tlen);
 2380                         tp->t_flags |= TF_ACKNOW;
 2381                 }
 2382                 TCP_REASS_UNLOCK(tp);
 2383 
 2384                 /*
 2385                  * Note the amount of data that peer has sent into
 2386                  * our window, in order to estimate the sender's
 2387                  * buffer size.
 2388                  */
 2389                 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
 2390         } else {
 2391                 m_freem(m);
 2392                 m = NULL;
 2393                 tiflags &= ~TH_FIN;
 2394         }
 2395 
 2396         /*
 2397          * If FIN is received ACK the FIN and let the user know
 2398          * that the connection is closing.  Ignore a FIN received before
 2399          * the connection is fully established.
 2400          */
 2401         if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
 2402                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 2403                         socantrcvmore(so);
 2404                         tp->t_flags |= TF_ACKNOW;
 2405                         tp->rcv_nxt++;
 2406                 }
 2407                 switch (tp->t_state) {
 2408 
 2409                 /*
 2410                  * In ESTABLISHED STATE enter the CLOSE_WAIT state.
 2411                  */
 2412                 case TCPS_ESTABLISHED:
 2413                         tp->t_state = TCPS_CLOSE_WAIT;
 2414                         break;
 2415 
 2416                 /*
 2417                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
 2418                  * enter the CLOSING state.
 2419                  */
 2420                 case TCPS_FIN_WAIT_1:
 2421                         tp->t_state = TCPS_CLOSING;
 2422                         break;
 2423 
 2424                 /*
 2425                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
 2426                  * starting the time-wait timer, turning off the other
 2427                  * standard timers.
 2428                  */
 2429                 case TCPS_FIN_WAIT_2:
 2430                         tp->t_state = TCPS_TIME_WAIT;
 2431                         tcp_canceltimers(tp);
 2432                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 2433                         soisdisconnected(so);
 2434                         break;
 2435 
 2436                 /*
 2437                  * In TIME_WAIT state restart the 2 MSL time_wait timer.
 2438                  */
 2439                 case TCPS_TIME_WAIT:
 2440                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 2441                         break;
 2442                 }
 2443         }
 2444 #ifdef TCP_DEBUG
 2445         if (so->so_options & SO_DEBUG)
 2446                 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0);
 2447 #endif
 2448 
 2449         /*
 2450          * Return any desired output.
 2451          */
 2452         if (needoutput || (tp->t_flags & TF_ACKNOW))
 2453                 (void) tcp_output(tp);
 2454         if (tcp_saveti)
 2455                 m_freem(tcp_saveti);
 2456         return;
 2457 
 2458 badsyn:
 2459         /*
 2460          * Received a bad SYN.  Increment counters and dropwithreset.
 2461          */
 2462         tcpstat.tcps_badsyn++;
 2463         tp = NULL;
 2464         goto dropwithreset;
 2465 
 2466 dropafterack:
 2467         /*
 2468          * Generate an ACK dropping incoming segment if it occupies
 2469          * sequence space, where the ACK reflects our state.
 2470          */
 2471         if (tiflags & TH_RST)
 2472                 goto drop;
 2473         goto dropafterack2;
 2474 
 2475 dropafterack_ratelim:
 2476         /*
 2477          * We may want to rate-limit ACKs against SYN/RST attack.
 2478          */
 2479         if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
 2480             tcp_ackdrop_ppslim) == 0) {
 2481                 /* XXX stat */
 2482                 goto drop;
 2483         }
 2484         /* ...fall into dropafterack2... */
 2485 
 2486 dropafterack2:
 2487         m_freem(m);
 2488         tp->t_flags |= TF_ACKNOW;
 2489         (void) tcp_output(tp);
 2490         if (tcp_saveti)
 2491                 m_freem(tcp_saveti);
 2492         return;
 2493 
 2494 dropwithreset_ratelim:
 2495         /*
 2496          * We may want to rate-limit RSTs in certain situations,
 2497          * particularly if we are sending an RST in response to
 2498          * an attempt to connect to or otherwise communicate with
 2499          * a port for which we have no socket.
 2500          */
 2501         if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
 2502             tcp_rst_ppslim) == 0) {
 2503                 /* XXX stat */
 2504                 goto drop;
 2505         }
 2506         /* ...fall into dropwithreset... */
 2507 
 2508 dropwithreset:
 2509         /*
 2510          * Generate a RST, dropping incoming segment.
 2511          * Make ACK acceptable to originator of segment.
 2512          */
 2513         if (tiflags & TH_RST)
 2514                 goto drop;
 2515 
 2516         switch (af) {
 2517 #ifdef INET6
 2518         case AF_INET6:
 2519                 /* For following calls to tcp_respond */
 2520                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
 2521                         goto drop;
 2522                 break;
 2523 #endif /* INET6 */
 2524         case AF_INET:
 2525                 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
 2526                     in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 2527                         goto drop;
 2528         }
 2529 
 2530         if (tiflags & TH_ACK)
 2531                 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
 2532         else {
 2533                 if (tiflags & TH_SYN)
 2534                         tlen++;
 2535                 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0,
 2536                     TH_RST|TH_ACK);
 2537         }
 2538         if (tcp_saveti)
 2539                 m_freem(tcp_saveti);
 2540         return;
 2541 
 2542 badcsum:
 2543         tcpstat.tcps_rcvbadsum++;
 2544 drop:
 2545         /*
 2546          * Drop space held by incoming segment and return.
 2547          */
 2548         if (tp) {
 2549                 if (tp->t_inpcb)
 2550                         so = tp->t_inpcb->inp_socket;
 2551 #ifdef INET6
 2552                 else if (tp->t_in6pcb)
 2553                         so = tp->t_in6pcb->in6p_socket;
 2554 #endif
 2555                 else
 2556                         so = NULL;
 2557 #ifdef TCP_DEBUG
 2558                 if (so && (so->so_options & SO_DEBUG) != 0)
 2559                         tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0);
 2560 #endif
 2561         }
 2562         if (tcp_saveti)
 2563                 m_freem(tcp_saveti);
 2564         m_freem(m);
 2565         return;
 2566 }
 2567 
 2568 void
 2569 tcp_dooptions(tp, cp, cnt, th, oi)
 2570         struct tcpcb *tp;
 2571         u_char *cp;
 2572         int cnt;
 2573         struct tcphdr *th;
 2574         struct tcp_opt_info *oi;
 2575 {
 2576         u_int16_t mss;
 2577         int opt, optlen;
 2578 
 2579         for (; cnt > 0; cnt -= optlen, cp += optlen) {
 2580                 opt = cp[0];
 2581                 if (opt == TCPOPT_EOL)
 2582                         break;
 2583                 if (opt == TCPOPT_NOP)
 2584                         optlen = 1;
 2585                 else {
 2586                         if (cnt < 2)
 2587                                 break;
 2588                         optlen = cp[1];
 2589                         if (optlen < 2 || optlen > cnt)
 2590                                 break;
 2591                 }
 2592                 switch (opt) {
 2593 
 2594                 default:
 2595                         continue;
 2596 
 2597                 case TCPOPT_MAXSEG:
 2598                         if (optlen != TCPOLEN_MAXSEG)
 2599                                 continue;
 2600                         if (!(th->th_flags & TH_SYN))
 2601                                 continue;
 2602                         bcopy(cp + 2, &mss, sizeof(mss));
 2603                         oi->maxseg = ntohs(mss);
 2604                         break;
 2605 
 2606                 case TCPOPT_WINDOW:
 2607                         if (optlen != TCPOLEN_WINDOW)
 2608                                 continue;
 2609                         if (!(th->th_flags & TH_SYN))
 2610                                 continue;
 2611                         tp->t_flags |= TF_RCVD_SCALE;
 2612                         tp->requested_s_scale = cp[2];
 2613                         if (tp->requested_s_scale > TCP_MAX_WINSHIFT) {
 2614 #if 0   /*XXX*/
 2615                                 char *p;
 2616 
 2617                                 if (ip)
 2618                                         p = ntohl(ip->ip_src);
 2619 #ifdef INET6
 2620                                 else if (ip6)
 2621                                         p = ip6_sprintf(&ip6->ip6_src);
 2622 #endif
 2623                                 else
 2624                                         p = "(unknown)";
 2625                                 log(LOG_ERR, "TCP: invalid wscale %d from %s, "
 2626                                     "assuming %d\n",
 2627                                     tp->requested_s_scale, p,
 2628                                     TCP_MAX_WINSHIFT);
 2629 #else
 2630                                 log(LOG_ERR, "TCP: invalid wscale %d, "
 2631                                     "assuming %d\n",
 2632                                     tp->requested_s_scale,
 2633                                     TCP_MAX_WINSHIFT);
 2634 #endif
 2635                                 tp->requested_s_scale = TCP_MAX_WINSHIFT;
 2636                         }
 2637                         break;
 2638 
 2639                 case TCPOPT_TIMESTAMP:
 2640                         if (optlen != TCPOLEN_TIMESTAMP)
 2641                                 continue;
 2642                         oi->ts_present = 1;
 2643                         bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
 2644                         NTOHL(oi->ts_val);
 2645                         bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
 2646                         NTOHL(oi->ts_ecr);
 2647 
 2648                         /*
 2649                          * A timestamp received in a SYN makes
 2650                          * it ok to send timestamp requests and replies.
 2651                          */
 2652                         if (th->th_flags & TH_SYN) {
 2653                                 tp->t_flags |= TF_RCVD_TSTMP;
 2654                                 tp->ts_recent = oi->ts_val;
 2655                                 tp->ts_recent_age = TCP_TIMESTAMP(tp);
 2656                         }
 2657                         break;
 2658                 case TCPOPT_SACK_PERMITTED:
 2659                         if (optlen != TCPOLEN_SACK_PERMITTED)
 2660                                 continue;
 2661                         if (!(th->th_flags & TH_SYN))
 2662                                 continue;
 2663                         tp->t_flags &= ~TF_CANT_TXSACK;
 2664                         break;
 2665 
 2666                 case TCPOPT_SACK:
 2667                         if (tp->t_flags & TF_IGNR_RXSACK)
 2668                                 continue;
 2669                         if (optlen % 8 != 2 || optlen < 10)
 2670                                 continue;
 2671                         cp += 2;
 2672                         optlen -= 2;
 2673                         for (; optlen > 0; cp -= 8, optlen -= 8) {
 2674                                 tcp_seq lwe, rwe;
 2675                                 bcopy((char *)cp, (char *) &lwe, sizeof(lwe));
 2676                                 NTOHL(lwe);
 2677                                 bcopy((char *)cp, (char *) &rwe, sizeof(rwe));
 2678                                 NTOHL(rwe);
 2679                                 /* tcp_mark_sacked(tp, lwe, rwe); */
 2680                         }
 2681                         break;
 2682                 }
 2683         }
 2684 }
 2685 
 2686 /*
 2687  * Pull out of band byte out of a segment so
 2688  * it doesn't appear in the user's data queue.
 2689  * It is still reflected in the segment length for
 2690  * sequencing purposes.
 2691  */
 2692 void
 2693 tcp_pulloutofband(so, th, m, off)
 2694         struct socket *so;
 2695         struct tcphdr *th;
 2696         struct mbuf *m;
 2697         int off;
 2698 {
 2699         int cnt = off + th->th_urp - 1;
 2700 
 2701         while (cnt >= 0) {
 2702                 if (m->m_len > cnt) {
 2703                         char *cp = mtod(m, caddr_t) + cnt;
 2704                         struct tcpcb *tp = sototcpcb(so);
 2705 
 2706                         tp->t_iobc = *cp;
 2707                         tp->t_oobflags |= TCPOOB_HAVEDATA;
 2708                         bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
 2709                         m->m_len--;
 2710                         return;
 2711                 }
 2712                 cnt -= m->m_len;
 2713                 m = m->m_next;
 2714                 if (m == 0)
 2715                         break;
 2716         }
 2717         panic("tcp_pulloutofband");
 2718 }
 2719 
 2720 /*
 2721  * Collect new round-trip time estimate
 2722  * and update averages and current timeout.
 2723  */
 2724 void
 2725 tcp_xmit_timer(tp, rtt)
 2726         struct tcpcb *tp;
 2727         uint32_t rtt;
 2728 {
 2729         int32_t delta;
 2730 
 2731         tcpstat.tcps_rttupdated++;
 2732         if (tp->t_srtt != 0) {
 2733                 /*
 2734                  * srtt is stored as fixed point with 3 bits after the
 2735                  * binary point (i.e., scaled by 8).  The following magic
 2736                  * is equivalent to the smoothing algorithm in rfc793 with
 2737                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
 2738                  * point).  Adjust rtt to origin 0.
 2739                  */
 2740                 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
 2741                 if ((tp->t_srtt += delta) <= 0)
 2742                         tp->t_srtt = 1 << 2;
 2743                 /*
 2744                  * We accumulate a smoothed rtt variance (actually, a
 2745                  * smoothed mean difference), then set the retransmit
 2746                  * timer to smoothed rtt + 4 times the smoothed variance.
 2747                  * rttvar is stored as fixed point with 2 bits after the
 2748                  * binary point (scaled by 4).  The following is
 2749                  * equivalent to rfc793 smoothing with an alpha of .75
 2750                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
 2751                  * rfc793's wired-in beta.
 2752                  */
 2753                 if (delta < 0)
 2754                         delta = -delta;
 2755                 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
 2756                 if ((tp->t_rttvar += delta) <= 0)
 2757                         tp->t_rttvar = 1 << 2;
 2758         } else {
 2759                 /*
 2760                  * No rtt measurement yet - use the unsmoothed rtt.
 2761                  * Set the variance to half the rtt (so our first
 2762                  * retransmit happens at 3*rtt).
 2763                  */
 2764                 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
 2765                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
 2766         }
 2767         tp->t_rtttime = 0;
 2768         tp->t_rxtshift = 0;
 2769 
 2770         /*
 2771          * the retransmit should happen at rtt + 4 * rttvar.
 2772          * Because of the way we do the smoothing, srtt and rttvar
 2773          * will each average +1/2 tick of bias.  When we compute
 2774          * the retransmit timer, we want 1/2 tick of rounding and
 2775          * 1 extra tick because of +-1/2 tick uncertainty in the
 2776          * firing of the timer.  The bias will give us exactly the
 2777          * 1.5 tick we need.  But, because the bias is
 2778          * statistical, we have to test that we don't drop below
 2779          * the minimum feasible timer (which is 2 ticks).
 2780          */
 2781         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 2782             max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
 2783 
 2784         /*
 2785          * We received an ack for a packet that wasn't retransmitted;
 2786          * it is probably safe to discard any error indications we've
 2787          * received recently.  This isn't quite right, but close enough
 2788          * for now (a route might have failed after we sent a segment,
 2789          * and the return path might not be symmetrical).
 2790          */
 2791         tp->t_softerror = 0;
 2792 }
 2793 
 2794 /*
 2795  * Checks for partial ack.  If partial ack arrives, force the retransmission
 2796  * of the next unacknowledged segment, do not clear tp->t_dupacks, and return
 2797  * 1.  By setting snd_nxt to th_ack, this forces retransmission timer to
 2798  * be started again.  If the ack advances at least to tp->snd_recover, return 0.
 2799  */
 2800 int
 2801 tcp_newreno(tp, th)
 2802         struct tcpcb *tp;
 2803         struct tcphdr *th;
 2804 {
 2805         tcp_seq onxt = tp->snd_nxt;
 2806         u_long ocwnd = tp->snd_cwnd;
 2807 
 2808         if (SEQ_LT(th->th_ack, tp->snd_recover)) {
 2809                 /*
 2810                  * snd_una has not yet been updated and the socket's send
 2811                  * buffer has not yet drained off the ACK'd data, so we
 2812                  * have to leave snd_una as it was to get the correct data
 2813                  * offset in tcp_output().
 2814                  */
 2815                 TCP_TIMER_DISARM(tp, TCPT_REXMT);
 2816                 tp->t_rtttime = 0;
 2817                 tp->snd_nxt = th->th_ack;
 2818                 /*
 2819                  * Set snd_cwnd to one segment beyond ACK'd offset.  snd_una
 2820                  * is not yet updated when we're called.
 2821                  */
 2822                 tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
 2823                 (void) tcp_output(tp);
 2824                 tp->snd_cwnd = ocwnd;
 2825                 if (SEQ_GT(onxt, tp->snd_nxt))
 2826                         tp->snd_nxt = onxt;
 2827                 /*
 2828                  * Partial window deflation.  Relies on fact that tp->snd_una
 2829                  * not updated yet.
 2830                  */
 2831                 tp->snd_cwnd -= (th->th_ack - tp->snd_una - tp->t_segsz);
 2832                 return 1;
 2833         }
 2834         return 0;
 2835 }
 2836 
 2837 
 2838 /*
 2839  * TCP compressed state engine.  Currently used to hold compressed
 2840  * state for SYN_RECEIVED.
 2841  */
 2842 
 2843 u_long  syn_cache_count;
 2844 u_int32_t syn_hash1, syn_hash2;
 2845 
 2846 #define SYN_HASH(sa, sp, dp) \
 2847         ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
 2848                                      ((u_int32_t)(sp)))^syn_hash2)))
 2849 #ifndef INET6
 2850 #define SYN_HASHALL(hash, src, dst) \
 2851 do {                                                                    \
 2852         hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr,       \
 2853                 ((struct sockaddr_in *)(src))->sin_port,                \
 2854                 ((struct sockaddr_in *)(dst))->sin_port);               \
 2855 } while (/*CONSTCOND*/ 0)
 2856 #else
 2857 #define SYN_HASH6(sa, sp, dp) \
 2858         ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
 2859           (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
 2860          & 0x7fffffff)
 2861 
 2862 #define SYN_HASHALL(hash, src, dst) \
 2863 do {                                                                    \
 2864         switch ((src)->sa_family) {                                     \
 2865         case AF_INET:                                                   \
 2866                 hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
 2867                         ((struct sockaddr_in *)(src))->sin_port,        \
 2868                         ((struct sockaddr_in *)(dst))->sin_port);       \
 2869                 break;                                                  \
 2870         case AF_INET6:                                                  \
 2871                 hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \
 2872                         ((struct sockaddr_in6 *)(src))->sin6_port,      \
 2873                         ((struct sockaddr_in6 *)(dst))->sin6_port);     \
 2874                 break;                                                  \
 2875         default:                                                        \
 2876                 hash = 0;                                               \
 2877         }                                                               \
 2878 } while (/*CONSTCOND*/0)
 2879 #endif /* INET6 */
 2880 
 2881 #define SYN_CACHE_RM(sc)                                                \
 2882 do {                                                                    \
 2883         TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket,     \
 2884             (sc), sc_bucketq);                                          \
 2885         (sc)->sc_tp = NULL;                                             \
 2886         LIST_REMOVE((sc), sc_tpq);                                      \
 2887         tcp_syn_cache[(sc)->sc_bucketidx].sch_length--;                 \
 2888         callout_stop(&(sc)->sc_timer);                                  \
 2889         syn_cache_count--;                                              \
 2890 } while (/*CONSTCOND*/0)
 2891 
 2892 #define SYN_CACHE_PUT(sc)                                               \
 2893 do {                                                                    \
 2894         if ((sc)->sc_ipopts)                                            \
 2895                 (void) m_free((sc)->sc_ipopts);                         \
 2896         if ((sc)->sc_route4.ro_rt != NULL)                              \
 2897                 RTFREE((sc)->sc_route4.ro_rt);                          \
 2898         if (callout_invoking(&(sc)->sc_timer))                          \
 2899                 (sc)->sc_flags |= SCF_DEAD;                             \
 2900         else                                                            \
 2901                 pool_put(&syn_cache_pool, (sc));                        \
 2902 } while (/*CONSTCOND*/0)
 2903 
 2904 struct pool syn_cache_pool;
 2905 
 2906 /*
 2907  * We don't estimate RTT with SYNs, so each packet starts with the default
 2908  * RTT and each timer step has a fixed timeout value.
 2909  */
 2910 #define SYN_CACHE_TIMER_ARM(sc)                                         \
 2911 do {                                                                    \
 2912         TCPT_RANGESET((sc)->sc_rxtcur,                                  \
 2913             TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
 2914             TCPTV_REXMTMAX);                                            \
 2915         callout_reset(&(sc)->sc_timer,                                  \
 2916             (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc)); \
 2917 } while (/*CONSTCOND*/0)
 2918 
 2919 #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)
 2920 
 2921 void
 2922 syn_cache_init()
 2923 {
 2924         int i;
 2925 
 2926         /* Initialize the hash buckets. */
 2927         for (i = 0; i < tcp_syn_cache_size; i++)
 2928                 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
 2929 
 2930         /* Initialize the syn cache pool. */
 2931         pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
 2932             "synpl", NULL);
 2933 }
 2934 
 2935 void
 2936 syn_cache_insert(sc, tp)
 2937         struct syn_cache *sc;
 2938         struct tcpcb *tp;
 2939 {
 2940         struct syn_cache_head *scp;
 2941         struct syn_cache *sc2;
 2942         int s;
 2943 
 2944         /*
 2945          * If there are no entries in the hash table, reinitialize
 2946          * the hash secrets.
 2947          */
 2948         if (syn_cache_count == 0) {
 2949                 syn_hash1 = arc4random();
 2950                 syn_hash2 = arc4random();
 2951         }
 2952 
 2953         SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
 2954         sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
 2955         scp = &tcp_syn_cache[sc->sc_bucketidx];
 2956 
 2957         /*
 2958          * Make sure that we don't overflow the per-bucket
 2959          * limit or the total cache size limit.
 2960          */
 2961         s = splsoftnet();
 2962         if (scp->sch_length >= tcp_syn_bucket_limit) {
 2963                 tcpstat.tcps_sc_bucketoverflow++;
 2964                 /*
 2965                  * The bucket is full.  Toss the oldest element in the
 2966                  * bucket.  This will be the first entry in the bucket.
 2967                  */
 2968                 sc2 = TAILQ_FIRST(&scp->sch_bucket);
 2969 #ifdef DIAGNOSTIC
 2970                 /*
 2971                  * This should never happen; we should always find an
 2972                  * entry in our bucket.
 2973                  */
 2974                 if (sc2 == NULL)
 2975                         panic("syn_cache_insert: bucketoverflow: impossible");
 2976 #endif
 2977                 SYN_CACHE_RM(sc2);
 2978                 SYN_CACHE_PUT(sc2);
 2979         } else if (syn_cache_count >= tcp_syn_cache_limit) {
 2980                 struct syn_cache_head *scp2, *sce;
 2981 
 2982                 tcpstat.tcps_sc_overflowed++;
 2983                 /*
 2984                  * The cache is full.  Toss the oldest entry in the
 2985                  * first non-empty bucket we can find.
 2986                  *
 2987                  * XXX We would really like to toss the oldest
 2988                  * entry in the cache, but we hope that this
 2989                  * condition doesn't happen very often.
 2990                  */
 2991                 scp2 = scp;
 2992                 if (TAILQ_EMPTY(&scp2->sch_bucket)) {
 2993                         sce = &tcp_syn_cache[tcp_syn_cache_size];
 2994                         for (++scp2; scp2 != scp; scp2++) {
 2995                                 if (scp2 >= sce)
 2996                                         scp2 = &tcp_syn_cache[0];
 2997                                 if (! TAILQ_EMPTY(&scp2->sch_bucket))
 2998                                         break;
 2999                         }
 3000 #ifdef DIAGNOSTIC
 3001                         /*
 3002                          * This should never happen; we should always find a
 3003                          * non-empty bucket.
 3004                          */
 3005                         if (scp2 == scp)
 3006                                 panic("syn_cache_insert: cacheoverflow: "
 3007                                     "impossible");
 3008 #endif
 3009                 }
 3010                 sc2 = TAILQ_FIRST(&scp2->sch_bucket);
 3011                 SYN_CACHE_RM(sc2);
 3012                 SYN_CACHE_PUT(sc2);
 3013         }
 3014 
 3015         /*
 3016          * Initialize the entry's timer.
 3017          */
 3018         sc->sc_rxttot = 0;
 3019         sc->sc_rxtshift = 0;
 3020         SYN_CACHE_TIMER_ARM(sc);
 3021 
 3022         /* Link it from tcpcb entry */
 3023         LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
 3024 
 3025         /* Put it into the bucket. */
 3026         TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
 3027         scp->sch_length++;
 3028         syn_cache_count++;
 3029 
 3030         tcpstat.tcps_sc_added++;
 3031         splx(s);
 3032 }
 3033 
 3034 /*
 3035  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 3036  * If we have retransmitted an entry the maximum number of times, expire
 3037  * that entry.
 3038  */
 3039 void
 3040 syn_cache_timer(void *arg)
 3041 {
 3042         struct syn_cache *sc = arg;
 3043         int s;
 3044 
 3045         s = splsoftnet();
 3046         callout_ack(&sc->sc_timer);
 3047 
 3048         if (__predict_false(sc->sc_flags & SCF_DEAD)) {
 3049                 tcpstat.tcps_sc_delayed_free++;
 3050                 pool_put(&syn_cache_pool, sc);
 3051                 splx(s);
 3052                 return;
 3053         }
 3054 
 3055         if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
 3056                 /* Drop it -- too many retransmissions. */
 3057                 goto dropit;
 3058         }
 3059 
 3060         /*
 3061          * Compute the total amount of time this entry has
 3062          * been on a queue.  If this entry has been on longer
 3063          * than the keep alive timer would allow, expire it.
 3064          */
 3065         sc->sc_rxttot += sc->sc_rxtcur;
 3066         if (sc->sc_rxttot >= TCPTV_KEEP_INIT)
 3067                 goto dropit;
 3068 
 3069         tcpstat.tcps_sc_retransmitted++;
 3070         (void) syn_cache_respond(sc, NULL);
 3071 
 3072         /* Advance the timer back-off. */
 3073         sc->sc_rxtshift++;
 3074         SYN_CACHE_TIMER_ARM(sc);
 3075 
 3076         splx(s);
 3077         return;
 3078 
 3079  dropit:
 3080         tcpstat.tcps_sc_timed_out++;
 3081         SYN_CACHE_RM(sc);
 3082         SYN_CACHE_PUT(sc);
 3083         splx(s);
 3084 }
 3085 
 3086 /*
 3087  * Remove syn cache created by the specified tcb entry,
 3088  * because this does not make sense to keep them
 3089  * (if there's no tcb entry, syn cache entry will never be used)
 3090  */
 3091 void
 3092 syn_cache_cleanup(tp)
 3093         struct tcpcb *tp;
 3094 {
 3095         struct syn_cache *sc, *nsc;
 3096         int s;
 3097 
 3098         s = splsoftnet();
 3099 
 3100         for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
 3101                 nsc = LIST_NEXT(sc, sc_tpq);
 3102 
 3103 #ifdef DIAGNOSTIC
 3104                 if (sc->sc_tp != tp)
 3105                         panic("invalid sc_tp in syn_cache_cleanup");
 3106 #endif
 3107                 SYN_CACHE_RM(sc);
 3108                 SYN_CACHE_PUT(sc);
 3109         }
 3110         /* just for safety */
 3111         LIST_INIT(&tp->t_sc);
 3112 
 3113         splx(s);
 3114 }
 3115 
 3116 /*
 3117  * Find an entry in the syn cache.
 3118  */
 3119 struct syn_cache *
 3120 syn_cache_lookup(src, dst, headp)
 3121         struct sockaddr *src;
 3122         struct sockaddr *dst;
 3123         struct syn_cache_head **headp;
 3124 {
 3125         struct syn_cache *sc;
 3126         struct syn_cache_head *scp;
 3127         u_int32_t hash;
 3128         int s;
 3129 
 3130         SYN_HASHALL(hash, src, dst);
 3131 
 3132         scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
 3133         *headp = scp;
 3134         s = splsoftnet();
 3135         for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
 3136              sc = TAILQ_NEXT(sc, sc_bucketq)) {
 3137                 if (sc->sc_hash != hash)
 3138                         continue;
 3139                 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
 3140                     !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
 3141                         splx(s);
 3142                         return (sc);
 3143                 }
 3144         }
 3145         splx(s);
 3146         return (NULL);
 3147 }
 3148 
 3149 /*
 3150  * This function gets called when we receive an ACK for a
 3151  * socket in the LISTEN state.  We look up the connection
 3152  * in the syn cache, and if its there, we pull it out of
 3153  * the cache and turn it into a full-blown connection in
 3154  * the SYN-RECEIVED state.
 3155  *
 3156  * The return values may not be immediately obvious, and their effects
 3157  * can be subtle, so here they are:
 3158  *
 3159  *      NULL    SYN was not found in cache; caller should drop the
 3160  *              packet and send an RST.
 3161  *
 3162  *      -1      We were unable to create the new connection, and are
 3163  *              aborting it.  An ACK,RST is being sent to the peer
 3164  *              (unless we got screwey sequence numbners; see below),
 3165  *              because the 3-way handshake has been completed.  Caller
 3166  *              should not free the mbuf, since we may be using it.  If
 3167  *              we are not, we will free it.
 3168  *
 3169  *      Otherwise, the return value is a pointer to the new socket
 3170  *      associated with the connection.
 3171  */
 3172 struct socket *
 3173 syn_cache_get(src, dst, th, hlen, tlen, so, m)
 3174         struct sockaddr *src;
 3175         struct sockaddr *dst;
 3176         struct tcphdr *th;
 3177         unsigned int hlen, tlen;
 3178         struct socket *so;
 3179         struct mbuf *m;
 3180 {
 3181         struct syn_cache *sc;
 3182         struct syn_cache_head *scp;
 3183         struct inpcb *inp = NULL;
 3184 #ifdef INET6
 3185         struct in6pcb *in6p = NULL;
 3186 #endif
 3187         struct tcpcb *tp = 0;
 3188         struct mbuf *am;
 3189         int s;
 3190         struct socket *oso;
 3191 
 3192         s = splsoftnet();
 3193         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
 3194                 splx(s);
 3195                 return (NULL);
 3196         }
 3197 
 3198         /*
 3199          * Verify the sequence and ack numbers.  Try getting the correct
 3200          * response again.
 3201          */
 3202         if ((th->th_ack != sc->sc_iss + 1) ||
 3203             SEQ_LEQ(th->th_seq, sc->sc_irs) ||
 3204             SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
 3205                 (void) syn_cache_respond(sc, m);
 3206                 splx(s);
 3207                 return ((struct socket *)(-1));
 3208         }
 3209 
 3210         /* Remove this cache entry */
 3211         SYN_CACHE_RM(sc);
 3212         splx(s);
 3213 
 3214         /*
 3215          * Ok, create the full blown connection, and set things up
 3216          * as they would have been set up if we had created the
 3217          * connection when the SYN arrived.  If we can't create
 3218          * the connection, abort it.
 3219          */
 3220         /*
 3221          * inp still has the OLD in_pcb stuff, set the
 3222          * v6-related flags on the new guy, too.   This is
 3223          * done particularly for the case where an AF_INET6
 3224          * socket is bound only to a port, and a v4 connection
 3225          * comes in on that port.
 3226          * we also copy the flowinfo from the original pcb
 3227          * to the new one.
 3228          */
 3229         oso = so;
 3230         so = sonewconn(so, SS_ISCONNECTED);
 3231         if (so == NULL)
 3232                 goto resetandabort;
 3233 
 3234         switch (so->so_proto->pr_domain->dom_family) {
 3235 #ifdef INET
 3236         case AF_INET:
 3237                 inp = sotoinpcb(so);
 3238                 break;
 3239 #endif
 3240 #ifdef INET6
 3241         case AF_INET6:
 3242                 in6p = sotoin6pcb(so);
 3243                 break;
 3244 #endif
 3245         }
 3246         switch (src->sa_family) {
 3247 #ifdef INET
 3248         case AF_INET:
 3249                 if (inp) {
 3250                         inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
 3251                         inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
 3252                         inp->inp_options = ip_srcroute();
 3253                         in_pcbstate(inp, INP_BOUND);
 3254                         if (inp->inp_options == NULL) {
 3255                                 inp->inp_options = sc->sc_ipopts;
 3256                                 sc->sc_ipopts = NULL;
 3257                         }
 3258                 }
 3259 #ifdef INET6
 3260                 else if (in6p) {
 3261                         /* IPv4 packet to AF_INET6 socket */
 3262                         bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr));
 3263                         in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
 3264                         bcopy(&((struct sockaddr_in *)dst)->sin_addr,
 3265                                 &in6p->in6p_laddr.s6_addr32[3],
 3266                                 sizeof(((struct sockaddr_in *)dst)->sin_addr));
 3267                         in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
 3268                         in6totcpcb(in6p)->t_family = AF_INET;
 3269                         if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
 3270                                 in6p->in6p_flags |= IN6P_IPV6_V6ONLY;
 3271                         else
 3272                                 in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
 3273                         in6_pcbstate(in6p, IN6P_BOUND);
 3274                 }
 3275 #endif
 3276                 break;
 3277 #endif
 3278 #ifdef INET6
 3279         case AF_INET6:
 3280                 if (in6p) {
 3281                         in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
 3282                         in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
 3283                         in6_pcbstate(in6p, IN6P_BOUND);
 3284                 }
 3285                 break;
 3286 #endif
 3287         }
 3288 #ifdef INET6
 3289         if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
 3290                 struct in6pcb *oin6p = sotoin6pcb(oso);
 3291                 /* inherit socket options from the listening socket */
 3292                 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
 3293                 if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
 3294                         m_freem(in6p->in6p_options);
 3295                         in6p->in6p_options = 0;
 3296                 }
 3297                 ip6_savecontrol(in6p, &in6p->in6p_options,
 3298                         mtod(m, struct ip6_hdr *), m);
 3299         }
 3300 #endif
 3301 
 3302 #if defined(IPSEC) || defined(FAST_IPSEC)
 3303         /*
 3304          * we make a copy of policy, instead of sharing the policy,
 3305          * for better behavior in terms of SA lookup and dead SA removal.
 3306          */
 3307         if (inp) {
 3308                 /* copy old policy into new socket's */
 3309                 if (ipsec_copy_pcbpolicy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
 3310                         printf("tcp_input: could not copy policy\n");
 3311         }
 3312 #ifdef INET6
 3313         else if (in6p) {
 3314                 /* copy old policy into new socket's */
 3315                 if (ipsec_copy_pcbpolicy(sotoin6pcb(oso)->in6p_sp,
 3316                     in6p->in6p_sp))
 3317                         printf("tcp_input: could not copy policy\n");
 3318         }
 3319 #endif
 3320 #endif
 3321 
 3322         /*
 3323          * Give the new socket our cached route reference.
 3324          */
 3325         if (inp)
 3326                 inp->inp_route = sc->sc_route4;         /* struct assignment */
 3327 #ifdef INET6
 3328         else
 3329                 in6p->in6p_route = sc->sc_route6;
 3330 #endif
 3331         sc->sc_route4.ro_rt = NULL;
 3332 
 3333         am = m_get(M_DONTWAIT, MT_SONAME);      /* XXX */
 3334         if (am == NULL)
 3335                 goto resetandabort;
 3336         MCLAIM(am, &tcp_mowner);
 3337         am->m_len = src->sa_len;
 3338         bcopy(src, mtod(am, caddr_t), src->sa_len);
 3339         if (inp) {
 3340                 if (in_pcbconnect(inp, am)) {
 3341                         (void) m_free(am);
 3342                         goto resetandabort;
 3343                 }
 3344         }
 3345 #ifdef INET6
 3346         else if (in6p) {
 3347                 if (src->sa_family == AF_INET) {
 3348                         /* IPv4 packet to AF_INET6 socket */
 3349                         struct sockaddr_in6 *sin6;
 3350                         sin6 = mtod(am, struct sockaddr_in6 *);
 3351                         am->m_len = sizeof(*sin6);
 3352                         bzero(sin6, sizeof(*sin6));
 3353                         sin6->sin6_family = AF_INET6;
 3354                         sin6->sin6_len = sizeof(*sin6);
 3355                         sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port;
 3356                         sin6->sin6_addr.s6_addr16[5] = htons(0xffff);
 3357                         bcopy(&((struct sockaddr_in *)src)->sin_addr,
 3358                                 &sin6->sin6_addr.s6_addr32[3],
 3359                                 sizeof(sin6->sin6_addr.s6_addr32[3]));
 3360                 }
 3361                 if (in6_pcbconnect(in6p, am)) {
 3362                         (void) m_free(am);
 3363                         goto resetandabort;
 3364                 }
 3365         }
 3366 #endif
 3367         else {
 3368                 (void) m_free(am);
 3369                 goto resetandabort;
 3370         }
 3371         (void) m_free(am);
 3372 
 3373         if (inp)
 3374                 tp = intotcpcb(inp);
 3375 #ifdef INET6
 3376         else if (in6p)
 3377                 tp = in6totcpcb(in6p);
 3378 #endif
 3379         else
 3380                 tp = NULL;
 3381         tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
 3382         if (sc->sc_request_r_scale != 15) {
 3383                 tp->requested_s_scale = sc->sc_requested_s_scale;
 3384                 tp->request_r_scale = sc->sc_request_r_scale;
 3385                 tp->snd_scale = sc->sc_requested_s_scale;
 3386                 tp->rcv_scale = sc->sc_request_r_scale;
 3387                 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
 3388         }
 3389         if (sc->sc_flags & SCF_TIMESTAMP)
 3390                 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
 3391         tp->ts_timebase = sc->sc_timebase;
 3392 
 3393         tp->t_template = tcp_template(tp);
 3394         if (tp->t_template == 0) {
 3395                 tp = tcp_drop(tp, ENOBUFS);     /* destroys socket */
 3396                 so = NULL;
 3397                 m_freem(m);
 3398                 goto abort;
 3399         }
 3400 
 3401         tp->iss = sc->sc_iss;
 3402         tp->irs = sc->sc_irs;
 3403         tcp_sendseqinit(tp);
 3404         tcp_rcvseqinit(tp);
 3405         tp->t_state = TCPS_SYN_RECEIVED;
 3406         TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
 3407         tcpstat.tcps_accepts++;
 3408 
 3409         /* Initialize tp->t_ourmss before we deal with the peer's! */
 3410         tp->t_ourmss = sc->sc_ourmaxseg;
 3411         tcp_mss_from_peer(tp, sc->sc_peermaxseg);
 3412 
 3413         /*
 3414          * Initialize the initial congestion window.  If we
 3415          * had to retransmit the SYN,ACK, we must initialize cwnd
 3416          * to 1 segment (i.e. the Loss Window).
 3417          */
 3418         if (sc->sc_rxtshift)
 3419                 tp->snd_cwnd = tp->t_peermss;
 3420         else {
 3421                 int ss = tcp_init_win;
 3422 #ifdef INET
 3423                 if (inp != NULL && in_localaddr(inp->inp_faddr))
 3424                         ss = tcp_init_win_local;
 3425 #endif
 3426 #ifdef INET6
 3427                 if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
 3428                         ss = tcp_init_win_local;
 3429 #endif
 3430                 tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
 3431         }
 3432 
 3433         tcp_rmx_rtt(tp);
 3434         tp->snd_wl1 = sc->sc_irs;
 3435         tp->rcv_up = sc->sc_irs + 1;
 3436 
 3437         /*
 3438          * This is what whould have happened in tcp_output() when
 3439          * the SYN,ACK was sent.
 3440          */
 3441         tp->snd_up = tp->snd_una;
 3442         tp->snd_max = tp->snd_nxt = tp->iss+1;
 3443         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 3444         if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
 3445                 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
 3446         tp->last_ack_sent = tp->rcv_nxt;
 3447 
 3448         tcpstat.tcps_sc_completed++;
 3449         SYN_CACHE_PUT(sc);
 3450         return (so);
 3451 
 3452 resetandabort:
 3453         (void) tcp_respond(NULL, m, m, th,
 3454                            th->th_seq + tlen, (tcp_seq)0, TH_RST|TH_ACK);
 3455 abort:
 3456         if (so != NULL)
 3457                 (void) soabort(so);
 3458         SYN_CACHE_PUT(sc);
 3459         tcpstat.tcps_sc_aborted++;
 3460         return ((struct socket *)(-1));
 3461 }
 3462 
 3463 /*
 3464  * This function is called when we get a RST for a
 3465  * non-existent connection, so that we can see if the
 3466  * connection is in the syn cache.  If it is, zap it.
 3467  */
 3468 
 3469 void
 3470 syn_cache_reset(src, dst, th)
 3471         struct sockaddr *src;
 3472         struct sockaddr *dst;
 3473         struct tcphdr *th;
 3474 {
 3475         struct syn_cache *sc;
 3476         struct syn_cache_head *scp;
 3477         int s = splsoftnet();
 3478 
 3479         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
 3480                 splx(s);
 3481                 return;
 3482         }
 3483         if (SEQ_LT(th->th_seq, sc->sc_irs) ||
 3484             SEQ_GT(th->th_seq, sc->sc_irs+1)) {
 3485                 splx(s);
 3486                 return;
 3487         }
 3488         SYN_CACHE_RM(sc);
 3489         splx(s);
 3490         tcpstat.tcps_sc_reset++;
 3491         SYN_CACHE_PUT(sc);
 3492 }
 3493 
 3494 void
 3495 syn_cache_unreach(src, dst, th)
 3496         struct sockaddr *src;
 3497         struct sockaddr *dst;
 3498         struct tcphdr *th;
 3499 {
 3500         struct syn_cache *sc;
 3501         struct syn_cache_head *scp;
 3502         int s;
 3503 
 3504         s = splsoftnet();
 3505         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
 3506                 splx(s);
 3507                 return;
 3508         }
 3509         /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
 3510         if (ntohl (th->th_seq) != sc->sc_iss) {
 3511                 splx(s);
 3512                 return;
 3513         }
 3514 
 3515         /*
 3516          * If we've retransmitted 3 times and this is our second error,
 3517          * we remove the entry.  Otherwise, we allow it to continue on.
 3518          * This prevents us from incorrectly nuking an entry during a
 3519          * spurious network outage.
 3520          *
 3521          * See tcp_notify().
 3522          */
 3523         if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
 3524                 sc->sc_flags |= SCF_UNREACH;
 3525                 splx(s);
 3526                 return;
 3527         }
 3528 
 3529         SYN_CACHE_RM(sc);
 3530         splx(s);
 3531         tcpstat.tcps_sc_unreach++;
 3532         SYN_CACHE_PUT(sc);
 3533 }
 3534 
 3535 /*
 3536  * Given a LISTEN socket and an inbound SYN request, add
 3537  * this to the syn cache, and send back a segment:
 3538  *      <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 3539  * to the source.
 3540  *
 3541  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 3542  * Doing so would require that we hold onto the data and deliver it
 3543  * to the application.  However, if we are the target of a SYN-flood
 3544  * DoS attack, an attacker could send data which would eventually
 3545  * consume all available buffer space if it were ACKed.  By not ACKing
 3546  * the data, we avoid this DoS scenario.
 3547  */
 3548 
 3549 int
 3550 syn_cache_add(src, dst, th, hlen, so, m, optp, optlen, oi)
 3551         struct sockaddr *src;
 3552         struct sockaddr *dst;
 3553         struct tcphdr *th;
 3554         unsigned int hlen;
 3555         struct socket *so;
 3556         struct mbuf *m;
 3557         u_char *optp;
 3558         int optlen;
 3559         struct tcp_opt_info *oi;
 3560 {
 3561         struct tcpcb tb, *tp;
 3562         long win;
 3563         struct syn_cache *sc;
 3564         struct syn_cache_head *scp;
 3565         struct mbuf *ipopts;
 3566 
 3567         tp = sototcpcb(so);
 3568 
 3569         /*
 3570          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
 3571          *
 3572          * Note this check is performed in tcp_input() very early on.
 3573          */
 3574 
 3575         /*
 3576          * Initialize some local state.
 3577          */
 3578         win = sbspace(&so->so_rcv);
 3579         if (win > TCP_MAXWIN)
 3580                 win = TCP_MAXWIN;
 3581 
 3582         switch (src->sa_family) {
 3583 #ifdef INET
 3584         case AF_INET:
 3585                 /*
 3586                  * Remember the IP options, if any.
 3587                  */
 3588                 ipopts = ip_srcroute();
 3589                 break;
 3590 #endif
 3591         default:
 3592                 ipopts = NULL;
 3593         }
 3594 
 3595         if (optp) {
 3596                 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
 3597                 tcp_dooptions(&tb, optp, optlen, th, oi);
 3598         } else
 3599                 tb.t_flags = 0;
 3600 
 3601         /*
 3602          * See if we already have an entry for this connection.
 3603          * If we do, resend the SYN,ACK.  We do not count this
 3604          * as a retransmission (XXX though maybe we should).
 3605          */
 3606         if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
 3607                 tcpstat.tcps_sc_dupesyn++;
 3608                 if (ipopts) {
 3609                         /*
 3610                          * If we were remembering a previous source route,
 3611                          * forget it and use the new one we've been given.
 3612                          */
 3613                         if (sc->sc_ipopts)
 3614                                 (void) m_free(sc->sc_ipopts);
 3615                         sc->sc_ipopts = ipopts;
 3616                 }
 3617                 sc->sc_timestamp = tb.ts_recent;
 3618                 if (syn_cache_respond(sc, m) == 0) {
 3619                         tcpstat.tcps_sndacks++;
 3620                         tcpstat.tcps_sndtotal++;
 3621                 }
 3622                 return (1);
 3623         }
 3624 
 3625         sc = pool_get(&syn_cache_pool, PR_NOWAIT);
 3626         if (sc == NULL) {
 3627                 if (ipopts)
 3628                         (void) m_free(ipopts);
 3629                 return (0);
 3630         }
 3631 
 3632         /*
 3633          * Fill in the cache, and put the necessary IP and TCP
 3634          * options into the reply.
 3635          */
 3636         bzero(sc, sizeof(struct syn_cache));
 3637         callout_init(&sc->sc_timer);
 3638         bcopy(src, &sc->sc_src, src->sa_len);
 3639         bcopy(dst, &sc->sc_dst, dst->sa_len);
 3640         sc->sc_flags = 0;
 3641         sc->sc_ipopts = ipopts;
 3642         sc->sc_irs = th->th_seq;
 3643         switch (src->sa_family) {
 3644 #ifdef INET
 3645         case AF_INET:
 3646             {
 3647                 struct sockaddr_in *srcin = (void *) src;
 3648                 struct sockaddr_in *dstin = (void *) dst;
 3649 
 3650                 sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
 3651                     &srcin->sin_addr, dstin->sin_port,
 3652                     srcin->sin_port, sizeof(dstin->sin_addr), 0);
 3653                 break;
 3654             }
 3655 #endif /* INET */
 3656 #ifdef INET6
 3657         case AF_INET6:
 3658             {
 3659                 struct sockaddr_in6 *srcin6 = (void *) src;
 3660                 struct sockaddr_in6 *dstin6 = (void *) dst;
 3661 
 3662                 sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
 3663                     &srcin6->sin6_addr, dstin6->sin6_port,
 3664                     srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0);
 3665                 break;
 3666             }
 3667 #endif /* INET6 */
 3668         }
 3669         sc->sc_peermaxseg = oi->maxseg;
 3670         sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
 3671                                                 m->m_pkthdr.rcvif : NULL,
 3672                                                 sc->sc_src.sa.sa_family);
 3673         sc->sc_win = win;
 3674         sc->sc_timebase = tcp_now;      /* see tcp_newtcpcb() */
 3675         sc->sc_timestamp = tb.ts_recent;
 3676         if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
 3677             (TF_REQ_TSTMP|TF_RCVD_TSTMP))
 3678                 sc->sc_flags |= SCF_TIMESTAMP;
 3679         if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 3680             (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 3681                 sc->sc_requested_s_scale = tb.requested_s_scale;
 3682                 sc->sc_request_r_scale = 0;
 3683                 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
 3684                     TCP_MAXWIN << sc->sc_request_r_scale <
 3685                     so->so_rcv.sb_hiwat)
 3686                         sc->sc_request_r_scale++;
 3687         } else {
 3688                 sc->sc_requested_s_scale = 15;
 3689                 sc->sc_request_r_scale = 15;
 3690         }
 3691         sc->sc_tp = tp;
 3692         if (syn_cache_respond(sc, m) == 0) {
 3693                 syn_cache_insert(sc, tp);
 3694                 tcpstat.tcps_sndacks++;
 3695                 tcpstat.tcps_sndtotal++;
 3696         } else {
 3697                 SYN_CACHE_PUT(sc);
 3698                 tcpstat.tcps_sc_dropped++;
 3699         }
 3700         return (1);
 3701 }
 3702 
 3703 int
 3704 syn_cache_respond(sc, m)
 3705         struct syn_cache *sc;
 3706         struct mbuf *m;
 3707 {
 3708         struct route *ro;
 3709         u_int8_t *optp;
 3710         int optlen, error;
 3711         u_int16_t tlen;
 3712         struct ip *ip = NULL;
 3713 #ifdef INET6
 3714         struct ip6_hdr *ip6 = NULL;
 3715 #endif
 3716         struct tcpcb *tp;
 3717         struct tcphdr *th;
 3718         u_int hlen;
 3719         struct socket *so;
 3720 
 3721         switch (sc->sc_src.sa.sa_family) {
 3722         case AF_INET:
 3723                 hlen = sizeof(struct ip);
 3724                 ro = &sc->sc_route4;
 3725                 break;
 3726 #ifdef INET6
 3727         case AF_INET6:
 3728                 hlen = sizeof(struct ip6_hdr);
 3729                 ro = (struct route *)&sc->sc_route6;
 3730                 break;
 3731 #endif
 3732         default:
 3733                 if (m)
 3734                         m_freem(m);
 3735                 return (EAFNOSUPPORT);
 3736         }
 3737 
 3738         /* Compute the size of the TCP options. */
 3739         optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
 3740             ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
 3741 
 3742         tlen = hlen + sizeof(struct tcphdr) + optlen;
 3743 
 3744         /*
 3745          * Create the IP+TCP header from scratch.
 3746          */
 3747         if (m)
 3748                 m_freem(m);
 3749 #ifdef DIAGNOSTIC
 3750         if (max_linkhdr + tlen > MCLBYTES)
 3751                 return (ENOBUFS);
 3752 #endif
 3753         MGETHDR(m, M_DONTWAIT, MT_DATA);
 3754         if (m && tlen > MHLEN) {
 3755                 MCLGET(m, M_DONTWAIT);
 3756                 if ((m->m_flags & M_EXT) == 0) {
 3757                         m_freem(m);
 3758                         m = NULL;
 3759                 }
 3760         }
 3761         if (m == NULL)
 3762                 return (ENOBUFS);
 3763         MCLAIM(m, &tcp_tx_mowner);
 3764 
 3765         /* Fixup the mbuf. */
 3766         m->m_data += max_linkhdr;
 3767         m->m_len = m->m_pkthdr.len = tlen;
 3768         if (sc->sc_tp) {
 3769                 tp = sc->sc_tp;
 3770                 if (tp->t_inpcb)
 3771                         so = tp->t_inpcb->inp_socket;
 3772 #ifdef INET6
 3773                 else if (tp->t_in6pcb)
 3774                         so = tp->t_in6pcb->in6p_socket;
 3775 #endif
 3776                 else
 3777                         so = NULL;
 3778         } else
 3779                 so = NULL;
 3780         m->m_pkthdr.rcvif = NULL;
 3781         memset(mtod(m, u_char *), 0, tlen);
 3782 
 3783         switch (sc->sc_src.sa.sa_family) {
 3784         case AF_INET:
 3785                 ip = mtod(m, struct ip *);
 3786                 ip->ip_dst = sc->sc_src.sin.sin_addr;
 3787                 ip->ip_src = sc->sc_dst.sin.sin_addr;
 3788                 ip->ip_p = IPPROTO_TCP;
 3789                 th = (struct tcphdr *)(ip + 1);
 3790                 th->th_dport = sc->sc_src.sin.sin_port;
 3791                 th->th_sport = sc->sc_dst.sin.sin_port;
 3792                 break;
 3793 #ifdef INET6
 3794         case AF_INET6:
 3795                 ip6 = mtod(m, struct ip6_hdr *);
 3796                 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
 3797                 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
 3798                 ip6->ip6_nxt = IPPROTO_TCP;
 3799                 /* ip6_plen will be updated in ip6_output() */
 3800                 th = (struct tcphdr *)(ip6 + 1);
 3801                 th->th_dport = sc->sc_src.sin6.sin6_port;
 3802                 th->th_sport = sc->sc_dst.sin6.sin6_port;
 3803                 break;
 3804 #endif
 3805         default:
 3806                 th = NULL;
 3807         }
 3808 
 3809         th->th_seq = htonl(sc->sc_iss);
 3810         th->th_ack = htonl(sc->sc_irs + 1);
 3811         th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 3812         th->th_flags = TH_SYN|TH_ACK;
 3813         th->th_win = htons(sc->sc_win);
 3814         /* th_sum already 0 */
 3815         /* th_urp already 0 */
 3816 
 3817         /* Tack on the TCP options. */
 3818         optp = (u_int8_t *)(th + 1);
 3819         *optp++ = TCPOPT_MAXSEG;
 3820         *optp++ = 4;
 3821         *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
 3822         *optp++ = sc->sc_ourmaxseg & 0xff;
 3823 
 3824         if (sc->sc_request_r_scale != 15) {
 3825                 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
 3826                     TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
 3827                     sc->sc_request_r_scale);
 3828                 optp += 4;
 3829         }
 3830 
 3831         if (sc->sc_flags & SCF_TIMESTAMP) {
 3832                 u_int32_t *lp = (u_int32_t *)(optp);
 3833                 /* Form timestamp option as shown in appendix A of RFC 1323. */
 3834                 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
 3835                 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
 3836                 *lp   = htonl(sc->sc_timestamp);
 3837                 optp += TCPOLEN_TSTAMP_APPA;
 3838         }
 3839 
 3840         /* Compute the packet's checksum. */
 3841         switch (sc->sc_src.sa.sa_family) {
 3842         case AF_INET:
 3843                 ip->ip_len = htons(tlen - hlen);
 3844                 th->th_sum = 0;
 3845                 th->th_sum = in_cksum(m, tlen);
 3846                 break;
 3847 #ifdef INET6
 3848         case AF_INET6:
 3849                 ip6->ip6_plen = htons(tlen - hlen);
 3850                 th->th_sum = 0;
 3851                 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
 3852                 break;
 3853 #endif
 3854         }
 3855 
 3856         /*
 3857          * Fill in some straggling IP bits.  Note the stack expects
 3858          * ip_len to be in host order, for convenience.
 3859          */
 3860         switch (sc->sc_src.sa.sa_family) {
 3861 #ifdef INET
 3862         case AF_INET:
 3863                 ip->ip_len = htons(tlen);
 3864                 ip->ip_ttl = ip_defttl;
 3865                 /* XXX tos? */
 3866                 break;
 3867 #endif
 3868 #ifdef INET6
 3869         case AF_INET6:
 3870                 ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 3871                 ip6->ip6_vfc |= IPV6_VERSION;
 3872                 ip6->ip6_plen = htons(tlen - hlen);
 3873                 /* ip6_hlim will be initialized afterwards */
 3874                 /* XXX flowlabel? */
 3875                 break;
 3876 #endif
 3877         }
 3878 
 3879         /* XXX use IPsec policy on listening socket, on SYN ACK */
 3880         tp = sc->sc_tp;
 3881 
 3882         switch (sc->sc_src.sa.sa_family) {
 3883 #ifdef INET
 3884         case AF_INET:
 3885                 error = ip_output(m, sc->sc_ipopts, ro,
 3886                     (ip_mtudisc ? IP_MTUDISC : 0), 
 3887                     (struct ip_moptions *)NULL, so);
 3888                 break;
 3889 #endif
 3890 #ifdef INET6
 3891         case AF_INET6:
 3892                 ip6->ip6_hlim = in6_selecthlim(NULL,
 3893                                 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
 3894 
 3895                 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
 3896                         (struct ip6_moptions *)0, so, NULL);
 3897                 break;
 3898 #endif
 3899         default:
 3900                 error = EAFNOSUPPORT;
 3901                 break;
 3902         }
 3903         return (error);
 3904 }

Cache object: af055db0041501d360d9bc8f147d9824


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.