tcp_input.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*      $NetBSD: tcp_input.c,v 1.255.2.2 2007/05/25 07:12:00 pavel Exp $        */
    2 
    3 /*
    4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  * 3. Neither the name of the project nor the names of its contributors
   16  *    may be used to endorse or promote products derived from this software
   17  *    without specific prior written permission.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
   20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
   23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   29  * SUCH DAMAGE.
   30  */
   31 
   32 /*
   33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
   34  *
   35  * NRL grants permission for redistribution and use in source and binary
   36  * forms, with or without modification, of the software and documentation
   37  * created at NRL provided that the following conditions are met:
   38  *
   39  * 1. Redistributions of source code must retain the above copyright
   40  *    notice, this list of conditions and the following disclaimer.
   41  * 2. Redistributions in binary form must reproduce the above copyright
   42  *    notice, this list of conditions and the following disclaimer in the
   43  *    documentation and/or other materials provided with the distribution.
   44  * 3. All advertising materials mentioning features or use of this software
   45  *    must display the following acknowledgements:
   46  *      This product includes software developed by the University of
   47  *      California, Berkeley and its contributors.
   48  *      This product includes software developed at the Information
   49  *      Technology Division, US Naval Research Laboratory.
   50  * 4. Neither the name of the NRL nor the names of its contributors
   51  *    may be used to endorse or promote products derived from this software
   52  *    without specific prior written permission.
   53  *
   54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
   55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
   58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   65  *
   66  * The views and conclusions contained in the software and documentation
   67  * are those of the authors and should not be interpreted as representing
   68  * official policies, either expressed or implied, of the US Naval
   69  * Research Laboratory (NRL).
   70  */
   71 
   72 /*-
   73  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
   74  * All rights reserved.
   75  *
   76  * This code is derived from software contributed to The NetBSD Foundation
   77  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
   78  * Facility, NASA Ames Research Center.
   79  * This code is derived from software contributed to The NetBSD Foundation
   80  * by Charles M. Hannum.
   81  * This code is derived from software contributed to The NetBSD Foundation
   82  * by Rui Paulo.
   83  *
   84  * Redistribution and use in source and binary forms, with or without
   85  * modification, are permitted provided that the following conditions
   86  * are met:
   87  * 1. Redistributions of source code must retain the above copyright
   88  *    notice, this list of conditions and the following disclaimer.
   89  * 2. Redistributions in binary form must reproduce the above copyright
   90  *    notice, this list of conditions and the following disclaimer in the
   91  *    documentation and/or other materials provided with the distribution.
   92  * 3. All advertising materials mentioning features or use of this software
   93  *    must display the following acknowledgement:
   94  *      This product includes software developed by the NetBSD
   95  *      Foundation, Inc. and its contributors.
   96  * 4. Neither the name of The NetBSD Foundation nor the names of its
   97  *    contributors may be used to endorse or promote products derived
   98  *    from this software without specific prior written permission.
   99  *
  100  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  101  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  102  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  103  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  104  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  105  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  106  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  107  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  108  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  109  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  110  * POSSIBILITY OF SUCH DAMAGE.
  111  */
  112 
  113 /*
  114  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  115  *      The Regents of the University of California.  All rights reserved.
  116  *
  117  * Redistribution and use in source and binary forms, with or without
  118  * modification, are permitted provided that the following conditions
  119  * are met:
  120  * 1. Redistributions of source code must retain the above copyright
  121  *    notice, this list of conditions and the following disclaimer.
  122  * 2. Redistributions in binary form must reproduce the above copyright
  123  *    notice, this list of conditions and the following disclaimer in the
  124  *    documentation and/or other materials provided with the distribution.
  125  * 3. Neither the name of the University nor the names of its contributors
  126  *    may be used to endorse or promote products derived from this software
  127  *    without specific prior written permission.
  128  *
  129  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  130  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  131  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  132  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  133  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  134  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  135  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  136  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  137  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  138  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  139  * SUCH DAMAGE.
  140  *
  141  *      @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
  142  */
  143 
  144 /*
  145  *      TODO list for SYN cache stuff:
  146  *
  147  *      Find room for a "state" field, which is needed to keep a
  148  *      compressed state for TIME_WAIT TCBs.  It's been noted already
  149  *      that this is fairly important for very high-volume web and
  150  *      mail servers, which use a large number of short-lived
  151  *      connections.
  152  */
  153 
  154 #include <sys/cdefs.h>
  155 __KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.255.2.2 2007/05/25 07:12:00 pavel Exp $");
  156 
  157 #include "opt_inet.h"
  158 #include "opt_ipsec.h"
  159 #include "opt_inet_csum.h"
  160 #include "opt_tcp_debug.h"
  161 
  162 #include <sys/param.h>
  163 #include <sys/systm.h>
  164 #include <sys/malloc.h>
  165 #include <sys/mbuf.h>
  166 #include <sys/protosw.h>
  167 #include <sys/socket.h>
  168 #include <sys/socketvar.h>
  169 #include <sys/errno.h>
  170 #include <sys/syslog.h>
  171 #include <sys/pool.h>
  172 #include <sys/domain.h>
  173 #include <sys/kernel.h>
  174 #ifdef TCP_SIGNATURE
  175 #include <sys/md5.h>
  176 #endif
  177 
  178 #include <net/if.h>
  179 #include <net/route.h>
  180 #include <net/if_types.h>
  181 
  182 #include <netinet/in.h>
  183 #include <netinet/in_systm.h>
  184 #include <netinet/ip.h>
  185 #include <netinet/in_pcb.h>
  186 #include <netinet/in_var.h>
  187 #include <netinet/ip_var.h>
  188 #include <netinet/in_offload.h>
  189 
  190 #ifdef INET6
  191 #ifndef INET
  192 #include <netinet/in.h>
  193 #endif
  194 #include <netinet/ip6.h>
  195 #include <netinet6/ip6_var.h>
  196 #include <netinet6/in6_pcb.h>
  197 #include <netinet6/ip6_var.h>
  198 #include <netinet6/in6_var.h>
  199 #include <netinet/icmp6.h>
  200 #include <netinet6/nd6.h>
  201 #ifdef TCP_SIGNATURE
  202 #include <netinet6/scope6_var.h>
  203 #endif
  204 #endif
  205 
  206 #ifndef INET6
  207 /* always need ip6.h for IP6_EXTHDR_GET */
  208 #include <netinet/ip6.h>
  209 #endif
  210 
  211 #include <netinet/tcp.h>
  212 #include <netinet/tcp_fsm.h>
  213 #include <netinet/tcp_seq.h>
  214 #include <netinet/tcp_timer.h>
  215 #include <netinet/tcp_var.h>
  216 #include <netinet/tcpip.h>
  217 #include <netinet/tcp_congctl.h>
  218 #include <netinet/tcp_debug.h>
  219 
  220 #include <machine/stdarg.h>
  221 
  222 #ifdef IPSEC
  223 #include <netinet6/ipsec.h>
  224 #include <netkey/key.h>
  225 #endif /*IPSEC*/
  226 #ifdef INET6
  227 #include "faith.h"
  228 #if defined(NFAITH) && NFAITH > 0
  229 #include <net/if_faith.h>
  230 #endif
  231 #endif  /* INET6 */
  232 
  233 #ifdef FAST_IPSEC
  234 #include <netipsec/ipsec.h>
  235 #include <netipsec/ipsec_var.h>                 /* XXX ipsecstat namespace */
  236 #include <netipsec/key.h>
  237 #ifdef INET6
  238 #include <netipsec/ipsec6.h>
  239 #endif
  240 #endif  /* FAST_IPSEC*/
  241
      /*
       * Global tunables.  NOTE(review): the names suggest the duplicate-ACK
       * retransmit threshold and a "log refused connections" switch --
       * confirm at the use sites.
       */
  242 int     tcprexmtthresh = 3;
  243 int     tcp_log_refused;
  244
      /*
       * Counter / last-timestamp pairs.  NOTE(review): presumably state for
       * packets-per-second limiting of RSTs and dropped ACKs -- confirm.
       */
  245 static int tcp_rst_ppslim_count = 0;
  246 static struct timeval tcp_rst_ppslim_last;
  247 static int tcp_ackdrop_ppslim_count = 0;
  248 static struct timeval tcp_ackdrop_ppslim_last;
  249
      /*
       * 24 * 24 * 60 * 60 is the number of seconds in 24 days; the product is
       * scaled by PR_SLOWHZ (presumably slow-timeout ticks per second --
       * confirm).
       */
  250 #define TCP_PAWS_IDLE   (24U * 24 * 60 * 60 * PR_SLOWHZ)
  251
  252 /*
      * for modulo comparisons of timestamps: the difference is computed
      * first and then interpreted as a signed int (assumes 32-bit unsigned
      * operands -- confirm at the call sites)
      */
  253 #define TSTMP_LT(a,b)   ((int)((a)-(b)) < 0)
  254 #define TSTMP_GEQ(a,b)  ((int)((a)-(b)) >= 0)
  255 
  256 /*
  257  * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
       *
       * With INET6, calls nd6_nud_hint() on the route cached in tp's in6pcb,
       * but only when tp is non-NULL, has an in6pcb, has t_family AF_INET6
       * and has a cached route.  Without INET6 the macro expands to nothing.
       * tp is expanded more than once, so pass an expression without side
       * effects.
  258  */
  259 #ifdef INET6
  260 #define ND6_HINT(tp) \
  261 do { \
  262         if ((tp) && (tp)->t_in6pcb && (tp)->t_family == AF_INET6 && \
  263             (tp)->t_in6pcb->in6p_route.ro_rt) { \
  264                 nd6_nud_hint((tp)->t_in6pcb->in6p_route.ro_rt, NULL, 0); \
  265         } \
  266 } while (/*CONSTCOND*/ 0)
  267 #else
  268 #define ND6_HINT(tp)
  269 #endif
  270 
  271 /*
  272  * Macro to compute ACK transmission behavior.  Delay the ACK unless
  273  * we have already delayed an ACK (must send an ACK every two segments).
  274  * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
  275  * option is enabled.
       *
       * Sets TF_ACKNOW in tp->t_flags for the immediate case, otherwise
       * invokes TCP_SET_DELACK(tp).  tp is expanded more than once, so pass
       * an expression without side effects.
  276  */
  277 #define TCP_SETUP_ACK(tp, th) \
  278 do { \
  279         if (((tp)->t_flags & TF_DELACK) || \
  280             (tcp_ack_on_push && ((th)->th_flags & TH_PUSH))) \
  281                 (tp)->t_flags |= TF_ACKNOW; \
  282         else \
  283                 TCP_SET_DELACK(tp); \
  284 } while (/*CONSTCOND*/ 0)
  285
      /*
       * ICMP_CHECK(tp, th, acked): bookkeeping on the t_pmtud_* fields of tp.
       * Clears TF_PMTUD_PEND in tp->t_flags when th->th_ack is beyond
       * tp->t_pmtud_th_seq (SEQ_GT), and raises tp->t_pmtud_mss_acked to
       * acked when acked is larger.  tp and acked are expanded more than
       * once, so pass expressions without side effects.
       */
  286 #define ICMP_CHECK(tp, th, acked) \
  287 do { \
  288         /* \
  289          * If we had a pending ICMP message that \
  290          * refers to data that have just been  \
  291          * acknowledged, disregard the recorded ICMP \
  292          * message. \
  293          */ \
  294         if (((tp)->t_flags & TF_PMTUD_PEND) && \
  295             SEQ_GT((th)->th_ack, (tp)->t_pmtud_th_seq)) \
  296                 (tp)->t_flags &= ~TF_PMTUD_PEND; \
  297 \
  298         /* \
  299          * Keep track of the largest chunk of data \
  300          * acknowledged since last PMTU update \
  301          */ \
  302         if ((tp)->t_pmtud_mss_acked < (acked)) \
  303                 (tp)->t_pmtud_mss_acked = (acked); \
  304 } while (/*CONSTCOND*/ 0)
  305 
  306 /*
  307  * Convert TCP protocol fields to host order for easier processing.
       * Applies NTOHL/NTOHS to th_seq, th_ack, th_win and th_urp of th.
       * (th) is expanded four times, so it should be free of side effects.
  308  */
  309 #define TCP_FIELDS_TO_HOST(th)                                          \
  310 do {                                                                    \
  311         NTOHL((th)->th_seq);                                            \
  312         NTOHL((th)->th_ack);                                            \
  313         NTOHS((th)->th_win);                                            \
  314         NTOHS((th)->th_urp);                                            \
  315 } while (/*CONSTCOND*/ 0)
  316
  317 /*
  318  * ... and reverse the above.
       * Applies HTONL/HTONS to the same four fields; (th) is again expanded
       * four times.
  319  */
  320 #define TCP_FIELDS_TO_NET(th)                                           \
  321 do {                                                                    \
  322         HTONL((th)->th_seq);                                            \
  323         HTONL((th)->th_ack);                                            \
  324         HTONS((th)->th_win);                                            \
  325         HTONS((th)->th_urp);                                            \
  326 } while (/*CONSTCOND*/ 0)
  327
      /*
       * Optional checksum event counters.  With TCP_CSUM_COUNTERS defined,
       * TCP_CSUM_COUNTER_INCR(ev) increments ev->ev_count on one of the
       * externally defined counters below; otherwise it expands to nothing.
       */
  328 #ifdef TCP_CSUM_COUNTERS
  329 #include <sys/device.h>
  330
  331 #if defined(INET)
  332 extern struct evcnt tcp_hwcsum_ok;
  333 extern struct evcnt tcp_hwcsum_bad;
  334 extern struct evcnt tcp_hwcsum_data;
  335 extern struct evcnt tcp_swcsum;
  336 #endif /* defined(INET) */
  337 #if defined(INET6)
  338 extern struct evcnt tcp6_hwcsum_ok;
  339 extern struct evcnt tcp6_hwcsum_bad;
  340 extern struct evcnt tcp6_hwcsum_data;
  341 extern struct evcnt tcp6_swcsum;
  342 #endif /* defined(INET6) */
  343
  344 #define TCP_CSUM_COUNTER_INCR(ev)       (ev)->ev_count++
  345
  346 #else
  347
  348 #define TCP_CSUM_COUNTER_INCR(ev)       /* nothing */
  349
  350 #endif /* TCP_CSUM_COUNTERS */
  351
      /*
       * Optional event counters for tcp_reass().  With TCP_REASS_COUNTERS
       * defined, TCP_REASS_COUNTER_INCR(ev) increments ev->ev_count;
       * otherwise it expands to nothing.  tcp_reass() indexes
       * tcp_reass_iteration[] by its loop count (1..7) and uses slot 0 for
       * counts above 7.
       */
  352 #ifdef TCP_REASS_COUNTERS
  353 #include <sys/device.h>
  354
  355 extern struct evcnt tcp_reass_;
  356 extern struct evcnt tcp_reass_empty;
  357 extern struct evcnt tcp_reass_iteration[8];
  358 extern struct evcnt tcp_reass_prependfirst;
  359 extern struct evcnt tcp_reass_prepend;
  360 extern struct evcnt tcp_reass_insert;
  361 extern struct evcnt tcp_reass_inserttail;
  362 extern struct evcnt tcp_reass_append;
  363 extern struct evcnt tcp_reass_appendtail;
  364 extern struct evcnt tcp_reass_overlaptail;
  365 extern struct evcnt tcp_reass_overlapfront;
  366 extern struct evcnt tcp_reass_segdup;
  367 extern struct evcnt tcp_reass_fragdup;
  368
  369 #define TCP_REASS_COUNTER_INCR(ev)      (ev)->ev_count++
  370
  371 #else
  372
  373 #define TCP_REASS_COUNTER_INCR(ev)      /* nothing */
  374
  375 #endif /* TCP_REASS_COUNTERS */
  376
      /* Forward declarations of file-local helpers. */
  377 static int tcp_dooptions(struct tcpcb *, const u_char *, int,
  378     struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *);
  379
  380 #ifdef INET
  381 static void tcp4_log_refused(const struct ip *, const struct tcphdr *);
  382 #endif
  383 #ifdef INET6
  384 static void tcp6_log_refused(const struct ip6_hdr *, const struct tcphdr *);
  385 #endif
  386
      /*
       * Advance x to the last element of its chain by following m_next.
       * x must be a modifiable lvalue holding a non-NULL pointer, and is
       * expanded several times.
       */
  387 #define TRAVERSE(x) while ((x)->m_next) (x) = (x)->m_next
  388
      /* Pool of struct ipqent, used by tcpipqent_alloc()/tcpipqent_free(). */
  389 static POOL_INIT(tcpipqent_pool, sizeof(struct ipqent), 0, 0, 0, "tcpipqepl",
  390     NULL);
  391
      /*
       * Get one struct ipqent from tcpipqent_pool.  The pool_get() uses
       * PR_NOWAIT and is bracketed by splvm()/splx().  The result may be
       * NULL; tcp_reass() checks for that and drops the segment.
       */
  392 struct ipqent *
  393 tcpipqent_alloc(void)
  394 {
  395         struct ipqent *ipqe;
  396         int s;
  397
  398         s = splvm();
  399         ipqe = pool_get(&tcpipqent_pool, PR_NOWAIT);
  400         splx(s);
  401
  402         return ipqe;
  403 }
  404
      /*
       * Give a struct ipqent back to tcpipqent_pool.
       */
  405 void
  406 tcpipqent_free(struct ipqent *ipqe)
  407 {
  408         const int ospl = splvm();
  409
  410         /* pool_put() runs between splvm() and the matching splx(). */
  411         pool_put(&tcpipqent_pool, ipqe);
  412         splx(ospl);
  413 }
  414 
  415 int
  416 tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
  417 {
  418         struct ipqent *p, *q, *nq, *tiqe = NULL;
  419         struct socket *so = NULL;
  420         int pkt_flags;
  421         tcp_seq pkt_seq;
  422         unsigned pkt_len;
  423         u_long rcvpartdupbyte = 0;
  424         u_long rcvoobyte;
  425 #ifdef TCP_REASS_COUNTERS
  426         u_int count = 0;
  427 #endif
  428 
  429         if (tp->t_inpcb)
  430                 so = tp->t_inpcb->inp_socket;
  431 #ifdef INET6
  432         else if (tp->t_in6pcb)
  433                 so = tp->t_in6pcb->in6p_socket;
  434 #endif
  435 
  436         TCP_REASS_LOCK_CHECK(tp);
  437 
  438         /*
  439          * Call with th==0 after becoming established to
  440          * force pre-ESTABLISHED data up to user socket.
  441          */
  442         if (th == 0)
  443                 goto present;
  444 
  445         rcvoobyte = *tlen;
  446         /*
  447          * Copy these to local variables because the tcpiphdr
  448          * gets munged while we are collapsing mbufs.
  449          */
  450         pkt_seq = th->th_seq;
  451         pkt_len = *tlen;
  452         pkt_flags = th->th_flags;
  453 
  454         TCP_REASS_COUNTER_INCR(&tcp_reass_);
  455 
  456         if ((p = TAILQ_LAST(&tp->segq, ipqehead)) != NULL) {
  457                 /*
  458                  * When we miss a packet, the vast majority of time we get
  459                  * packets that follow it in order.  So optimize for that.
  460                  */
  461                 if (pkt_seq == p->ipqe_seq + p->ipqe_len) {
  462                         p->ipqe_len += pkt_len;
  463                         p->ipqe_flags |= pkt_flags;
  464                         m_cat(p->ipre_mlast, m);
  465                         TRAVERSE(p->ipre_mlast);
  466                         m = NULL;
  467                         tiqe = p;
  468                         TAILQ_REMOVE(&tp->timeq, p, ipqe_timeq);
  469                         TCP_REASS_COUNTER_INCR(&tcp_reass_appendtail);
  470                         goto skip_replacement;
  471                 }
  472                 /*
  473                  * While we're here, if the pkt is completely beyond
  474                  * anything we have, just insert it at the tail.
  475                  */
  476                 if (SEQ_GT(pkt_seq, p->ipqe_seq + p->ipqe_len)) {
  477                         TCP_REASS_COUNTER_INCR(&tcp_reass_inserttail);
  478                         goto insert_it;
  479                 }
  480         }
  481 
  482         q = TAILQ_FIRST(&tp->segq);
  483 
  484         if (q != NULL) {
  485                 /*
  486                  * If this segment immediately precedes the first out-of-order
  487                  * block, simply slap the segment in front of it and (mostly)
  488                  * skip the complicated logic.
  489                  */
  490                 if (pkt_seq + pkt_len == q->ipqe_seq) {
  491                         q->ipqe_seq = pkt_seq;
  492                         q->ipqe_len += pkt_len;
  493                         q->ipqe_flags |= pkt_flags;
  494                         m_cat(m, q->ipqe_m);
  495                         q->ipqe_m = m;
  496                         q->ipre_mlast = m; /* last mbuf may have changed */
  497                         TRAVERSE(q->ipre_mlast);
  498                         tiqe = q;
  499                         TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
  500                         TCP_REASS_COUNTER_INCR(&tcp_reass_prependfirst);
  501                         goto skip_replacement;
  502                 }
  503         } else {
  504                 TCP_REASS_COUNTER_INCR(&tcp_reass_empty);
  505         }
  506 
  507         /*
  508          * Find a segment which begins after this one does.
  509          */
  510         for (p = NULL; q != NULL; q = nq) {
  511                 nq = TAILQ_NEXT(q, ipqe_q);
  512 #ifdef TCP_REASS_COUNTERS
  513                 count++;
  514 #endif
  515                 /*
  516                  * If the received segment is just right after this
  517                  * fragment, merge the two together and then check
  518                  * for further overlaps.
  519                  */
  520                 if (q->ipqe_seq + q->ipqe_len == pkt_seq) {
  521 #ifdef TCPREASS_DEBUG
  522                         printf("tcp_reass[%p]: concat %u:%u(%u) to %u:%u(%u)\n",
  523                                tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
  524                                q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len);
  525 #endif
  526                         pkt_len += q->ipqe_len;
  527                         pkt_flags |= q->ipqe_flags;
  528                         pkt_seq = q->ipqe_seq;
  529                         m_cat(q->ipre_mlast, m);
  530                         TRAVERSE(q->ipre_mlast);
  531                         m = q->ipqe_m;
  532                         TCP_REASS_COUNTER_INCR(&tcp_reass_append);
  533                         goto free_ipqe;
  534                 }
  535                 /*
  536                  * If the received segment is completely past this
  537                  * fragment, we need to go the next fragment.
  538                  */
  539                 if (SEQ_LT(q->ipqe_seq + q->ipqe_len, pkt_seq)) {
  540                         p = q;
  541                         continue;
  542                 }
  543                 /*
  544                  * If the fragment is past the received segment,
  545                  * it (or any following) can't be concatenated.
  546                  */
  547                 if (SEQ_GT(q->ipqe_seq, pkt_seq + pkt_len)) {
  548                         TCP_REASS_COUNTER_INCR(&tcp_reass_insert);
  549                         break;
  550                 }
  551 
  552                 /*
  553                  * We've received all the data in this segment before.
  554                  * mark it as a duplicate and return.
  555                  */
  556                 if (SEQ_LEQ(q->ipqe_seq, pkt_seq) &&
  557                     SEQ_GEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
  558                         tcpstat.tcps_rcvduppack++;
  559                         tcpstat.tcps_rcvdupbyte += pkt_len;
  560                         tcp_new_dsack(tp, pkt_seq, pkt_len);
  561                         m_freem(m);
  562                         if (tiqe != NULL) {
  563                                 tcpipqent_free(tiqe);
  564                         }
  565                         TCP_REASS_COUNTER_INCR(&tcp_reass_segdup);
  566                         return (0);
  567                 }
  568                 /*
  569                  * Received segment completely overlaps this fragment
  570                  * so we drop the fragment (this keeps the temporal
  571                  * ordering of segments correct).
  572                  */
  573                 if (SEQ_GEQ(q->ipqe_seq, pkt_seq) &&
  574                     SEQ_LEQ(q->ipqe_seq + q->ipqe_len, pkt_seq + pkt_len)) {
  575                         rcvpartdupbyte += q->ipqe_len;
  576                         m_freem(q->ipqe_m);
  577                         TCP_REASS_COUNTER_INCR(&tcp_reass_fragdup);
  578                         goto free_ipqe;
  579                 }
  580                 /*
  581                  * RX'ed segment extends past the end of the
  582                  * fragment.  Drop the overlapping bytes.  Then
  583                  * merge the fragment and segment then treat as
  584                  * a longer received packet.
  585                  */
  586                 if (SEQ_LT(q->ipqe_seq, pkt_seq) &&
  587                     SEQ_GT(q->ipqe_seq + q->ipqe_len, pkt_seq))  {
  588                         int overlap = q->ipqe_seq + q->ipqe_len - pkt_seq;
  589 #ifdef TCPREASS_DEBUG
  590                         printf("tcp_reass[%p]: trim starting %d bytes of %u:%u(%u)\n",
  591                                tp, overlap,
  592                                pkt_seq, pkt_seq + pkt_len, pkt_len);
  593 #endif
  594                         m_adj(m, overlap);
  595                         rcvpartdupbyte += overlap;
  596                         m_cat(q->ipre_mlast, m);
  597                         TRAVERSE(q->ipre_mlast);
  598                         m = q->ipqe_m;
  599                         pkt_seq = q->ipqe_seq;
  600                         pkt_len += q->ipqe_len - overlap;
  601                         rcvoobyte -= overlap;
  602                         TCP_REASS_COUNTER_INCR(&tcp_reass_overlaptail);
  603                         goto free_ipqe;
  604                 }
  605                 /*
  606                  * RX'ed segment extends past the front of the
  607                  * fragment.  Drop the overlapping bytes on the
  608                  * received packet.  The packet will then be
  609                  * concatenated with this fragment a bit later.
  610                  */
  611                 if (SEQ_GT(q->ipqe_seq, pkt_seq) &&
  612                     SEQ_LT(q->ipqe_seq, pkt_seq + pkt_len))  {
  613                         int overlap = pkt_seq + pkt_len - q->ipqe_seq;
  614 #ifdef TCPREASS_DEBUG
  615                         printf("tcp_reass[%p]: trim trailing %d bytes of %u:%u(%u)\n",
  616                                tp, overlap,
  617                                pkt_seq, pkt_seq + pkt_len, pkt_len);
  618 #endif
  619                         m_adj(m, -overlap);
  620                         pkt_len -= overlap;
  621                         rcvpartdupbyte += overlap;
  622                         TCP_REASS_COUNTER_INCR(&tcp_reass_overlapfront);
  623                         rcvoobyte -= overlap;
  624                 }
  625                 /*
  626                  * If the received segment immediately precedes this
  627                  * fragment then tack the fragment onto this segment
  628                  * and reinsert the data.
  629                  */
  630                 if (q->ipqe_seq == pkt_seq + pkt_len) {
  631 #ifdef TCPREASS_DEBUG
  632                         printf("tcp_reass[%p]: append %u:%u(%u) to %u:%u(%u)\n",
  633                                tp, q->ipqe_seq, q->ipqe_seq + q->ipqe_len, q->ipqe_len,
  634                                pkt_seq, pkt_seq + pkt_len, pkt_len);
  635 #endif
  636                         pkt_len += q->ipqe_len;
  637                         pkt_flags |= q->ipqe_flags;
  638                         m_cat(m, q->ipqe_m);
  639                         TAILQ_REMOVE(&tp->segq, q, ipqe_q);
  640                         TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
  641                         tp->t_segqlen--;
  642                         KASSERT(tp->t_segqlen >= 0);
  643                         KASSERT(tp->t_segqlen != 0 ||
  644                             (TAILQ_EMPTY(&tp->segq) &&
  645                             TAILQ_EMPTY(&tp->timeq)));
  646                         if (tiqe == NULL) {
  647                                 tiqe = q;
  648                         } else {
  649                                 tcpipqent_free(q);
  650                         }
  651                         TCP_REASS_COUNTER_INCR(&tcp_reass_prepend);
  652                         break;
  653                 }
  654                 /*
  655                  * If the fragment is before the segment, remember it.
  656                  * When this loop is terminated, p will contain the
  657                  * pointer to fragment that is right before the received
  658                  * segment.
  659                  */
  660                 if (SEQ_LEQ(q->ipqe_seq, pkt_seq))
  661                         p = q;
  662 
  663                 continue;
  664 
  665                 /*
  666                  * This is a common operation.  It also will allow
  667                  * to save doing a malloc/free in most instances.
  668                  */
  669           free_ipqe:
  670                 TAILQ_REMOVE(&tp->segq, q, ipqe_q);
  671                 TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
  672                 tp->t_segqlen--;
  673                 KASSERT(tp->t_segqlen >= 0);
  674                 KASSERT(tp->t_segqlen != 0 ||
  675                     (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
  676                 if (tiqe == NULL) {
  677                         tiqe = q;
  678                 } else {
  679                         tcpipqent_free(q);
  680                 }
  681         }
  682 
  683 #ifdef TCP_REASS_COUNTERS
  684         if (count > 7)
  685                 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[0]);
  686         else if (count > 0)
  687                 TCP_REASS_COUNTER_INCR(&tcp_reass_iteration[count]);
  688 #endif
  689 
  690     insert_it:
  691 
  692         /*
  693          * Allocate a new queue entry since the received segment did not
  694          * collapse onto any other out-of-order block; thus we are allocating
  695          * a new block.  If it had collapsed, tiqe would not be NULL and
  696          * we would be reusing it.
  697          * XXX If we can't, just drop the packet.  XXX
  698          */
  699         if (tiqe == NULL) {
  700                 tiqe = tcpipqent_alloc();
  701                 if (tiqe == NULL) {
  702                         tcpstat.tcps_rcvmemdrop++;
  703                         m_freem(m);
  704                         return (0);
  705                 }
  706         }
  707 
  708         /*
  709          * Update the counters.
  710          */
  711         tcpstat.tcps_rcvoopack++;
  712         tcpstat.tcps_rcvoobyte += rcvoobyte;
  713         if (rcvpartdupbyte) {
  714             tcpstat.tcps_rcvpartduppack++;
  715             tcpstat.tcps_rcvpartdupbyte += rcvpartdupbyte;
  716         }
  717 
  718         /*
  719          * Insert the new fragment queue entry into both queues.
  720          */
  721         tiqe->ipqe_m = m;
  722         tiqe->ipre_mlast = m;
  723         tiqe->ipqe_seq = pkt_seq;
  724         tiqe->ipqe_len = pkt_len;
  725         tiqe->ipqe_flags = pkt_flags;
  726         if (p == NULL) {
  727                 TAILQ_INSERT_HEAD(&tp->segq, tiqe, ipqe_q);
  728 #ifdef TCPREASS_DEBUG
  729                 if (tiqe->ipqe_seq != tp->rcv_nxt)
  730                         printf("tcp_reass[%p]: insert %u:%u(%u) at front\n",
  731                                tp, pkt_seq, pkt_seq + pkt_len, pkt_len);
  732 #endif
  733         } else {
  734                 TAILQ_INSERT_AFTER(&tp->segq, p, tiqe, ipqe_q);
  735 #ifdef TCPREASS_DEBUG
  736                 printf("tcp_reass[%p]: insert %u:%u(%u) after %u:%u(%u)\n",
  737                        tp, pkt_seq, pkt_seq + pkt_len, pkt_len,
  738                        p->ipqe_seq, p->ipqe_seq + p->ipqe_len, p->ipqe_len);
  739 #endif
  740         }
  741         tp->t_segqlen++;
  742 
  743 skip_replacement:
  744 
  745         TAILQ_INSERT_HEAD(&tp->timeq, tiqe, ipqe_timeq);
  746 
  747 present:
  748         /*
  749          * Present data to user, advancing rcv_nxt through
  750          * completed sequence space.
  751          */
  752         if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
  753                 return (0);
  754         q = TAILQ_FIRST(&tp->segq);
  755         if (q == NULL || q->ipqe_seq != tp->rcv_nxt)
  756                 return (0);
  757         if (tp->t_state == TCPS_SYN_RECEIVED && q->ipqe_len)
  758                 return (0);
  759 
  760         tp->rcv_nxt += q->ipqe_len;
  761         pkt_flags = q->ipqe_flags & TH_FIN;
  762         ND6_HINT(tp);
  763 
  764         TAILQ_REMOVE(&tp->segq, q, ipqe_q);
  765         TAILQ_REMOVE(&tp->timeq, q, ipqe_timeq);
  766         tp->t_segqlen--;
  767         KASSERT(tp->t_segqlen >= 0);
  768         KASSERT(tp->t_segqlen != 0 ||
  769             (TAILQ_EMPTY(&tp->segq) && TAILQ_EMPTY(&tp->timeq)));
  770         if (so->so_state & SS_CANTRCVMORE)
  771                 m_freem(q->ipqe_m);
  772         else
  773                 sbappendstream(&so->so_rcv, q->ipqe_m);
  774         tcpipqent_free(q);
  775         sorwakeup(so);
  776         return (pkt_flags);
  777 }
  778 
  779 #ifdef INET6
  780 int
  781 tcp6_input(struct mbuf **mp, int *offp, int proto)
  782 {
  783         struct mbuf *m = *mp;
  784 
  785         /*
  786          * draft-itojun-ipv6-tcp-to-anycast
  787          * better place to put this in?
  788          */
  789         if (m->m_flags & M_ANYCAST6) {
  790                 struct ip6_hdr *ip6;
  791                 if (m->m_len < sizeof(struct ip6_hdr)) {
  792                         if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
  793                                 tcpstat.tcps_rcvshort++;
  794                                 return IPPROTO_DONE;
  795                         }
  796                 }
  797                 ip6 = mtod(m, struct ip6_hdr *);
  798                 icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_ADDR,
  799                     (caddr_t)&ip6->ip6_dst - (caddr_t)ip6);
  800                 return IPPROTO_DONE;
  801         }
  802 
  803         tcp_input(m, *offp, proto);
  804         return IPPROTO_DONE;
  805 }
  806 #endif
  807 
  808 #ifdef INET
  809 static void
  810 tcp4_log_refused(const struct ip *ip, const struct tcphdr *th)
  811 {
  812         char src[4*sizeof "123"];
  813         char dst[4*sizeof "123"];
  814 
  815         if (ip) {
  816                 strlcpy(src, inet_ntoa(ip->ip_src), sizeof(src));
  817                 strlcpy(dst, inet_ntoa(ip->ip_dst), sizeof(dst));
  818         }
  819         else {
  820                 strlcpy(src, "(unknown)", sizeof(src));
  821                 strlcpy(dst, "(unknown)", sizeof(dst));
  822         }
  823         log(LOG_INFO,
  824             "Connection attempt to TCP %s:%d from %s:%d\n",
  825             dst, ntohs(th->th_dport),
  826             src, ntohs(th->th_sport));
  827 }
  828 #endif
  829 
  830 #ifdef INET6
  831 static void
  832 tcp6_log_refused(const struct ip6_hdr *ip6, const struct tcphdr *th)
  833 {
  834         char src[INET6_ADDRSTRLEN];
  835         char dst[INET6_ADDRSTRLEN];
  836 
  837         if (ip6) {
  838                 strlcpy(src, ip6_sprintf(&ip6->ip6_src), sizeof(src));
  839                 strlcpy(dst, ip6_sprintf(&ip6->ip6_dst), sizeof(dst));
  840         }
  841         else {
  842                 strlcpy(src, "(unknown v6)", sizeof(src));
  843                 strlcpy(dst, "(unknown v6)", sizeof(dst));
  844         }
  845         log(LOG_INFO,
  846             "Connection attempt to TCP [%s]:%d from [%s]:%d\n",
  847             dst, ntohs(th->th_dport),
  848             src, ntohs(th->th_sport));
  849 }
  850 #endif
  851 
  852 /*
  853  * Checksum extended TCP header and data.
  854  */
  855 int
  856 tcp_input_checksum(int af, struct mbuf *m, const struct tcphdr *th,
  857     int toff, int off, int tlen)
  858 {
  859 
  860         /*
  861          * XXX it's better to record and check if this mbuf is
  862          * already checked.
  863          */
  864 
  865         switch (af) {
  866 #ifdef INET
  867         case AF_INET:
  868                 switch (m->m_pkthdr.csum_flags &
  869                         ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv4) |
  870                          M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
  871                 case M_CSUM_TCPv4|M_CSUM_TCP_UDP_BAD:
  872                         TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_bad);
  873                         goto badcsum;
  874 
  875                 case M_CSUM_TCPv4|M_CSUM_DATA: {
  876                         u_int32_t hw_csum = m->m_pkthdr.csum_data;
  877 
  878                         TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_data);
  879                         if (m->m_pkthdr.csum_flags & M_CSUM_NO_PSEUDOHDR) {
  880                                 const struct ip *ip =
  881                                     mtod(m, const struct ip *);
  882 
  883                                 hw_csum = in_cksum_phdr(ip->ip_src.s_addr,
  884                                     ip->ip_dst.s_addr,
  885                                     htons(hw_csum + tlen + off + IPPROTO_TCP));
  886                         }
  887                         if ((hw_csum ^ 0xffff) != 0)
  888                                 goto badcsum;
  889                         break;
  890                 }
  891 
  892                 case M_CSUM_TCPv4:
  893                         /* Checksum was okay. */
  894                         TCP_CSUM_COUNTER_INCR(&tcp_hwcsum_ok);
  895                         break;
  896 
  897                 default:
  898                         /*
  899                          * Must compute it ourselves.  Maybe skip checksum
  900                          * on loopback interfaces.
  901                          */
  902                         if (__predict_true(!(m->m_pkthdr.rcvif->if_flags &
  903                                              IFF_LOOPBACK) ||
  904                                            tcp_do_loopback_cksum)) {
  905                                 TCP_CSUM_COUNTER_INCR(&tcp_swcsum);
  906                                 if (in4_cksum(m, IPPROTO_TCP, toff,
  907                                               tlen + off) != 0)
  908                                         goto badcsum;
  909                         }
  910                         break;
  911                 }
  912                 break;
  913 #endif /* INET4 */
  914 
  915 #ifdef INET6
  916         case AF_INET6:
  917                 switch (m->m_pkthdr.csum_flags &
  918                         ((m->m_pkthdr.rcvif->if_csum_flags_rx & M_CSUM_TCPv6) |
  919                          M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
  920                 case M_CSUM_TCPv6|M_CSUM_TCP_UDP_BAD:
  921                         TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_bad);
  922                         goto badcsum;
  923 
  924 #if 0 /* notyet */
  925                 case M_CSUM_TCPv6|M_CSUM_DATA:
  926 #endif
  927 
  928                 case M_CSUM_TCPv6:
  929                         /* Checksum was okay. */
  930                         TCP_CSUM_COUNTER_INCR(&tcp6_hwcsum_ok);
  931                         break;
  932 
  933                 default:
  934                         /*
  935                          * Must compute it ourselves.  Maybe skip checksum
  936                          * on loopback interfaces.
  937                          */
  938                         if (__predict_true((m->m_flags & M_LOOP) == 0 ||
  939                             tcp_do_loopback_cksum)) {
  940                                 TCP_CSUM_COUNTER_INCR(&tcp6_swcsum);
  941                                 if (in6_cksum(m, IPPROTO_TCP, toff,
  942                                     tlen + off) != 0)
  943                                         goto badcsum;
  944                         }
  945                 }
  946                 break;
  947 #endif /* INET6 */
  948         }
  949 
  950         return 0;
  951 
  952 badcsum:
  953         tcpstat.tcps_rcvbadsum++;
  954         return -1;
  955 }
  956 
  957 /*
  958  * TCP input routine, follows pages 65-76 of RFC 793 very closely.
  959  */
  960 void
  961 tcp_input(struct mbuf *m, ...)
  962 {
  963         struct tcphdr *th;
  964         struct ip *ip;
  965         struct inpcb *inp;
  966 #ifdef INET6
  967         struct ip6_hdr *ip6;
  968         struct in6pcb *in6p;
  969 #endif
  970         u_int8_t *optp = NULL;
  971         int optlen = 0;
  972         int len, tlen, toff, hdroptlen = 0;
  973         struct tcpcb *tp = 0;
  974         int tiflags;
  975         struct socket *so = NULL;
  976         int todrop, dupseg, acked, ourfinisacked, needoutput = 0;
  977 #ifdef TCP_DEBUG
  978         short ostate = 0;
  979 #endif
  980         u_long tiwin;
  981         struct tcp_opt_info opti;
  982         int off, iphlen;
  983         va_list ap;
  984         int af;         /* af on the wire */
  985         struct mbuf *tcp_saveti = NULL;
  986         uint32_t ts_rtt;
  987         uint8_t iptos;
  988 
  989         MCLAIM(m, &tcp_rx_mowner);
  990         va_start(ap, m);
  991         toff = va_arg(ap, int);
  992         (void)va_arg(ap, int);          /* ignore value, advance ap */
  993         va_end(ap);
  994 
  995         tcpstat.tcps_rcvtotal++;
  996 
  997         bzero(&opti, sizeof(opti));
  998         opti.ts_present = 0;
  999         opti.maxseg = 0;
 1000 
 1001         /*
 1002          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
 1003          *
 1004          * TCP is, by definition, unicast, so we reject all
 1005          * multicast outright.
 1006          *
 1007          * Note, there are additional src/dst address checks in
 1008          * the AF-specific code below.
 1009          */
 1010         if (m->m_flags & (M_BCAST|M_MCAST)) {
 1011                 /* XXX stat */
 1012                 goto drop;
 1013         }
 1014 #ifdef INET6
 1015         if (m->m_flags & M_ANYCAST6) {
 1016                 /* XXX stat */
 1017                 goto drop;
 1018         }
 1019 #endif
 1020 
 1021         /*
 1022          * Get IP and TCP header.
 1023          * Note: IP leaves IP header in first mbuf.
 1024          */
 1025         ip = mtod(m, struct ip *);
 1026 #ifdef INET6
 1027         ip6 = NULL;
 1028 #endif
 1029         switch (ip->ip_v) {
 1030 #ifdef INET
 1031         case 4:
 1032                 af = AF_INET;
 1033                 iphlen = sizeof(struct ip);
 1034                 ip = mtod(m, struct ip *);
 1035                 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
 1036                         sizeof(struct tcphdr));
 1037                 if (th == NULL) {
 1038                         tcpstat.tcps_rcvshort++;
 1039                         return;
 1040                 }
 1041                 /* We do the checksum after PCB lookup... */
 1042                 len = ntohs(ip->ip_len);
 1043                 tlen = len - toff;
 1044                 iptos = ip->ip_tos;
 1045                 break;
 1046 #endif
 1047 #ifdef INET6
 1048         case 6:
 1049                 ip = NULL;
 1050                 iphlen = sizeof(struct ip6_hdr);
 1051                 af = AF_INET6;
 1052                 ip6 = mtod(m, struct ip6_hdr *);
 1053                 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff,
 1054                         sizeof(struct tcphdr));
 1055                 if (th == NULL) {
 1056                         tcpstat.tcps_rcvshort++;
 1057                         return;
 1058                 }
 1059 
 1060                 /* Be proactive about malicious use of IPv4 mapped address */
 1061                 if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
 1062                     IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
 1063                         /* XXX stat */
 1064                         goto drop;
 1065                 }
 1066 
 1067                 /*
 1068                  * Be proactive about unspecified IPv6 address in source.
 1069                  * As we use all-zero to indicate unbounded/unconnected pcb,
 1070                  * unspecified IPv6 address can be used to confuse us.
 1071                  *
 1072                  * Note that packets with unspecified IPv6 destination is
 1073                  * already dropped in ip6_input.
 1074                  */
 1075                 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
 1076                         /* XXX stat */
 1077                         goto drop;
 1078                 }
 1079 
 1080                 /*
 1081                  * Make sure destination address is not multicast.
 1082                  * Source address checked in ip6_input().
 1083                  */
 1084                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 1085                         /* XXX stat */
 1086                         goto drop;
 1087                 }
 1088 
 1089                 /* We do the checksum after PCB lookup... */
 1090                 len = m->m_pkthdr.len;
 1091                 tlen = len - toff;
 1092                 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
 1093                 break;
 1094 #endif
 1095         default:
 1096                 m_freem(m);
 1097                 return;
 1098         }
 1099 
 1100         KASSERT(TCP_HDR_ALIGNED_P(th));
 1101 
 1102         /*
 1103          * Check that TCP offset makes sense,
 1104          * pull out TCP options and adjust length.              XXX
 1105          */
 1106         off = th->th_off << 2;
 1107         if (off < sizeof (struct tcphdr) || off > tlen) {
 1108                 tcpstat.tcps_rcvbadoff++;
 1109                 goto drop;
 1110         }
 1111         tlen -= off;
 1112 
 1113         /*
 1114          * tcp_input() has been modified to use tlen to mean the TCP data
 1115          * length throughout the function.  Other functions can use
 1116          * m->m_pkthdr.len as the basis for calculating the TCP data length.
 1117          * rja
 1118          */
 1119 
 1120         if (off > sizeof (struct tcphdr)) {
 1121                 IP6_EXTHDR_GET(th, struct tcphdr *, m, toff, off);
 1122                 if (th == NULL) {
 1123                         tcpstat.tcps_rcvshort++;
 1124                         return;
 1125                 }
 1126                 /*
 1127                  * NOTE: ip/ip6 will not be affected by m_pulldown()
 1128                  * (as they're before toff) and we don't need to update those.
 1129                  */
 1130                 KASSERT(TCP_HDR_ALIGNED_P(th));
 1131                 optlen = off - sizeof (struct tcphdr);
 1132                 optp = ((u_int8_t *)th) + sizeof(struct tcphdr);
 1133                 /*
 1134                  * Do quick retrieval of timestamp options ("options
 1135                  * prediction?").  If timestamp is the only option and it's
 1136                  * formatted as recommended in RFC 1323 appendix A, we
 1137                  * quickly get the values now and not bother calling
 1138                  * tcp_dooptions(), etc.
 1139                  */
 1140                 if ((optlen == TCPOLEN_TSTAMP_APPA ||
 1141                      (optlen > TCPOLEN_TSTAMP_APPA &&
 1142                         optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
 1143                      *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
 1144                      (th->th_flags & TH_SYN) == 0) {
 1145                         opti.ts_present = 1;
 1146                         opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
 1147                         opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
 1148                         optp = NULL;    /* we've parsed the options */
 1149                 }
 1150         }
 1151         tiflags = th->th_flags;
 1152 
 1153         /*
 1154          * Locate pcb for segment.
 1155          */
 1156 findpcb:
 1157         inp = NULL;
 1158 #ifdef INET6
 1159         in6p = NULL;
 1160 #endif
 1161         switch (af) {
 1162 #ifdef INET
 1163         case AF_INET:
 1164                 inp = in_pcblookup_connect(&tcbtable, ip->ip_src, th->th_sport,
 1165                     ip->ip_dst, th->th_dport);
 1166                 if (inp == 0) {
 1167                         ++tcpstat.tcps_pcbhashmiss;
 1168                         inp = in_pcblookup_bind(&tcbtable, ip->ip_dst, th->th_dport);
 1169                 }
 1170 #ifdef INET6
 1171                 if (inp == 0) {
 1172                         struct in6_addr s, d;
 1173 
 1174                         /* mapped addr case */
 1175                         bzero(&s, sizeof(s));
 1176                         s.s6_addr16[5] = htons(0xffff);
 1177                         bcopy(&ip->ip_src, &s.s6_addr32[3], sizeof(ip->ip_src));
 1178                         bzero(&d, sizeof(d));
 1179                         d.s6_addr16[5] = htons(0xffff);
 1180                         bcopy(&ip->ip_dst, &d.s6_addr32[3], sizeof(ip->ip_dst));
 1181                         in6p = in6_pcblookup_connect(&tcbtable, &s,
 1182                             th->th_sport, &d, th->th_dport, 0);
 1183                         if (in6p == 0) {
 1184                                 ++tcpstat.tcps_pcbhashmiss;
 1185                                 in6p = in6_pcblookup_bind(&tcbtable, &d,
 1186                                     th->th_dport, 0);
 1187                         }
 1188                 }
 1189 #endif
 1190 #ifndef INET6
 1191                 if (inp == 0)
 1192 #else
 1193                 if (inp == 0 && in6p == 0)
 1194 #endif
 1195                 {
 1196                         ++tcpstat.tcps_noport;
 1197                         if (tcp_log_refused &&
 1198                             (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
 1199                                 tcp4_log_refused(ip, th);
 1200                         }
 1201                         TCP_FIELDS_TO_HOST(th);
 1202                         goto dropwithreset_ratelim;
 1203                 }
 1204 #if defined(IPSEC) || defined(FAST_IPSEC)
 1205                 if (inp && (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0 &&
 1206                     ipsec4_in_reject(m, inp)) {
 1207                         ipsecstat.in_polvio++;
 1208                         goto drop;
 1209                 }
 1210 #ifdef INET6
 1211                 else if (in6p &&
 1212                     (in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 &&
 1213                     ipsec6_in_reject_so(m, in6p->in6p_socket)) {
 1214                         ipsecstat.in_polvio++;
 1215                         goto drop;
 1216                 }
 1217 #endif
 1218 #endif /*IPSEC*/
 1219                 break;
 1220 #endif /*INET*/
 1221 #ifdef INET6
 1222         case AF_INET6:
 1223             {
 1224                 int faith;
 1225 
 1226 #if defined(NFAITH) && NFAITH > 0
 1227                 faith = faithprefix(&ip6->ip6_dst);
 1228 #else
 1229                 faith = 0;
 1230 #endif
 1231                 in6p = in6_pcblookup_connect(&tcbtable, &ip6->ip6_src,
 1232                     th->th_sport, &ip6->ip6_dst, th->th_dport, faith);
 1233                 if (in6p == NULL) {
 1234                         ++tcpstat.tcps_pcbhashmiss;
 1235                         in6p = in6_pcblookup_bind(&tcbtable, &ip6->ip6_dst,
 1236                                 th->th_dport, faith);
 1237                 }
 1238                 if (in6p == NULL) {
 1239                         ++tcpstat.tcps_noport;
 1240                         if (tcp_log_refused &&
 1241                             (tiflags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN) {
 1242                                 tcp6_log_refused(ip6, th);
 1243                         }
 1244                         TCP_FIELDS_TO_HOST(th);
 1245                         goto dropwithreset_ratelim;
 1246                 }
 1247 #if defined(IPSEC) || defined(FAST_IPSEC)
 1248                 if ((in6p->in6p_socket->so_options & SO_ACCEPTCONN) == 0 &&
 1249                     ipsec6_in_reject(m, in6p)) {
 1250                         ipsec6stat.in_polvio++;
 1251                         goto drop;
 1252                 }
 1253 #endif /*IPSEC*/
 1254                 break;
 1255             }
 1256 #endif
 1257         }
 1258 
 1259         /*
 1260          * If the state is CLOSED (i.e., TCB does not exist) then
 1261          * all data in the incoming segment is discarded.
 1262          * If the TCB exists but is in CLOSED state, it is embryonic,
 1263          * but should either do a listen or a connect soon.
 1264          */
 1265         tp = NULL;
 1266         so = NULL;
 1267         if (inp) {
 1268                 tp = intotcpcb(inp);
 1269                 so = inp->inp_socket;
 1270         }
 1271 #ifdef INET6
 1272         else if (in6p) {
 1273                 tp = in6totcpcb(in6p);
 1274                 so = in6p->in6p_socket;
 1275         }
 1276 #endif
 1277         if (tp == 0) {
 1278                 TCP_FIELDS_TO_HOST(th);
 1279                 goto dropwithreset_ratelim;
 1280         }
 1281         if (tp->t_state == TCPS_CLOSED)
 1282                 goto drop;
 1283 
 1284         /*
 1285          * Checksum extended TCP header and data.
 1286          */
 1287         if (tcp_input_checksum(af, m, th, toff, off, tlen))
 1288                 goto badcsum;
 1289 
 1290         TCP_FIELDS_TO_HOST(th);
 1291 
 1292         /* Unscale the window into a 32-bit value. */
 1293         if ((tiflags & TH_SYN) == 0)
 1294                 tiwin = th->th_win << tp->snd_scale;
 1295         else
 1296                 tiwin = th->th_win;
 1297 
 1298 #ifdef INET6
 1299         /* save packet options if user wanted */
 1300         if (in6p && (in6p->in6p_flags & IN6P_CONTROLOPTS)) {
 1301                 if (in6p->in6p_options) {
 1302                         m_freem(in6p->in6p_options);
 1303                         in6p->in6p_options = 0;
 1304                 }
 1305                 KASSERT(ip6 != NULL);
 1306                 ip6_savecontrol(in6p, &in6p->in6p_options, ip6, m);
 1307         }
 1308 #endif
 1309 
 1310         if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
 1311                 union syn_cache_sa src;
 1312                 union syn_cache_sa dst;
 1313 
 1314                 bzero(&src, sizeof(src));
 1315                 bzero(&dst, sizeof(dst));
 1316                 switch (af) {
 1317 #ifdef INET
 1318                 case AF_INET:
 1319                         src.sin.sin_len = sizeof(struct sockaddr_in);
 1320                         src.sin.sin_family = AF_INET;
 1321                         src.sin.sin_addr = ip->ip_src;
 1322                         src.sin.sin_port = th->th_sport;
 1323 
 1324                         dst.sin.sin_len = sizeof(struct sockaddr_in);
 1325                         dst.sin.sin_family = AF_INET;
 1326                         dst.sin.sin_addr = ip->ip_dst;
 1327                         dst.sin.sin_port = th->th_dport;
 1328                         break;
 1329 #endif
 1330 #ifdef INET6
 1331                 case AF_INET6:
 1332                         src.sin6.sin6_len = sizeof(struct sockaddr_in6);
 1333                         src.sin6.sin6_family = AF_INET6;
 1334                         src.sin6.sin6_addr = ip6->ip6_src;
 1335                         src.sin6.sin6_port = th->th_sport;
 1336 
 1337                         dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
 1338                         dst.sin6.sin6_family = AF_INET6;
 1339                         dst.sin6.sin6_addr = ip6->ip6_dst;
 1340                         dst.sin6.sin6_port = th->th_dport;
 1341                         break;
 1342 #endif /* INET6 */
 1343                 default:
 1344                         goto badsyn;    /*sanity*/
 1345                 }
 1346 
 1347                 if (so->so_options & SO_DEBUG) {
 1348 #ifdef TCP_DEBUG
 1349                         ostate = tp->t_state;
 1350 #endif
 1351 
 1352                         tcp_saveti = NULL;
 1353                         if (iphlen + sizeof(struct tcphdr) > MHLEN)
 1354                                 goto nosave;
 1355 
 1356                         if (m->m_len > iphlen && (m->m_flags & M_EXT) == 0) {
 1357                                 tcp_saveti = m_copym(m, 0, iphlen, M_DONTWAIT);
 1358                                 if (!tcp_saveti)
 1359                                         goto nosave;
 1360                         } else {
 1361                                 MGETHDR(tcp_saveti, M_DONTWAIT, MT_HEADER);
 1362                                 if (!tcp_saveti)
 1363                                         goto nosave;
 1364                                 MCLAIM(m, &tcp_mowner);
 1365                                 tcp_saveti->m_len = iphlen;
 1366                                 m_copydata(m, 0, iphlen,
 1367                                     mtod(tcp_saveti, caddr_t));
 1368                         }
 1369 
 1370                         if (M_TRAILINGSPACE(tcp_saveti) < sizeof(struct tcphdr)) {
 1371                                 m_freem(tcp_saveti);
 1372                                 tcp_saveti = NULL;
 1373                         } else {
 1374                                 tcp_saveti->m_len += sizeof(struct tcphdr);
 1375                                 bcopy(th, mtod(tcp_saveti, caddr_t) + iphlen,
 1376                                     sizeof(struct tcphdr));
 1377                         }
 1378         nosave:;
 1379                 }
 1380                 if (so->so_options & SO_ACCEPTCONN) {
 1381                         if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
 1382                                 if (tiflags & TH_RST) {
 1383                                         syn_cache_reset(&src.sa, &dst.sa, th);
 1384                                 } else if ((tiflags & (TH_ACK|TH_SYN)) ==
 1385                                     (TH_ACK|TH_SYN)) {
 1386                                         /*
 1387                                          * Received a SYN,ACK.  This should
 1388                                          * never happen while we are in
 1389                                          * LISTEN.  Send an RST.
 1390                                          */
 1391                                         goto badsyn;
 1392                                 } else if (tiflags & TH_ACK) {
 1393                                         so = syn_cache_get(&src.sa, &dst.sa,
 1394                                                 th, toff, tlen, so, m);
 1395                                         if (so == NULL) {
 1396                                                 /*
 1397                                                  * We don't have a SYN for
 1398                                                  * this ACK; send an RST.
 1399                                                  */
 1400                                                 goto badsyn;
 1401                                         } else if (so ==
 1402                                             (struct socket *)(-1)) {
 1403                                                 /*
 1404                                                  * We were unable to create
 1405                                                  * the connection.  If the
 1406                                                  * 3-way handshake was
 1407                                                  * completed, and RST has
 1408                                                  * been sent to the peer.
 1409                                                  * Since the mbuf might be
 1410                                                  * in use for the reply,
 1411                                                  * do not free it.
 1412                                                  */
 1413                                                 m = NULL;
 1414                                         } else {
 1415                                                 /*
 1416                                                  * We have created a
 1417                                                  * full-blown connection.
 1418                                                  */
 1419                                                 tp = NULL;
 1420                                                 inp = NULL;
 1421 #ifdef INET6
 1422                                                 in6p = NULL;
 1423 #endif
 1424                                                 switch (so->so_proto->pr_domain->dom_family) {
 1425 #ifdef INET
 1426                                                 case AF_INET:
 1427                                                         inp = sotoinpcb(so);
 1428                                                         tp = intotcpcb(inp);
 1429                                                         break;
 1430 #endif
 1431 #ifdef INET6
 1432                                                 case AF_INET6:
 1433                                                         in6p = sotoin6pcb(so);
 1434                                                         tp = in6totcpcb(in6p);
 1435                                                         break;
 1436 #endif
 1437                                                 }
 1438                                                 if (tp == NULL)
 1439                                                         goto badsyn;    /*XXX*/
 1440                                                 tiwin <<= tp->snd_scale;
 1441                                                 goto after_listen;
 1442                                         }
 1443                                 } else {
 1444                                         /*
 1445                                          * None of RST, SYN or ACK was set.
 1446                                          * This is an invalid packet for a
 1447                                          * TCB in LISTEN state.  Send a RST.
 1448                                          */
 1449                                         goto badsyn;
 1450                                 }
 1451                         } else {
 1452                                 /*
 1453                                  * Received a SYN.
 1454                                  *
 1455                                  * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
 1456                                  */
 1457                                 if (m->m_flags & (M_BCAST|M_MCAST))
 1458                                         goto drop;
 1459 
 1460                                 switch (af) {
 1461 #ifdef INET6
 1462                                 case AF_INET6:
 1463                                         if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
 1464                                                 goto drop;
 1465                                         break;
 1466 #endif /* INET6 */
 1467                                 case AF_INET:
 1468                                         if (IN_MULTICAST(ip->ip_dst.s_addr) ||
 1469                                             in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 1470                                                 goto drop;
 1471                                 break;
 1472                                 }
 1473 
 1474 #ifdef INET6
 1475                                 /*
 1476                                  * If deprecated address is forbidden, we do
 1477                                  * not accept SYN to deprecated interface
 1478                                  * address to prevent any new inbound
 1479                                  * connection from getting established.
 1480                                  * When we do not accept SYN, we send a TCP
 1481                                  * RST, with deprecated source address (instead
 1482                                  * of dropping it).  We compromise it as it is
 1483                                  * much better for peer to send a RST, and
 1484                                  * RST will be the final packet for the
 1485                                  * exchange.
 1486                                  *
 1487                                  * If we do not forbid deprecated addresses, we
 1488                                  * accept the SYN packet.  RFC2462 does not
 1489                                  * suggest dropping SYN in this case.
 1490                                  * If we decipher RFC2462 5.5.4, it says like
 1491                                  * this:
 1492                                  * 1. use of deprecated addr with existing
 1493                                  *    communication is okay - "SHOULD continue
 1494                                  *    to be used"
 1495                                  * 2. use of it with new communication:
 1496                                  *   (2a) "SHOULD NOT be used if alternate
 1497                                  *        address with sufficient scope is
 1498                                  *        available"
 1499                                  *   (2b) nothing mentioned otherwise.
 1500                                  * Here we fall into (2b) case as we have no
 1501                                  * choice in our source address selection - we
 1502                                  * must obey the peer.
 1503                                  *
 1504                                  * The wording in RFC2462 is confusing, and
 1505                                  * there are multiple descriptions of
 1506                                  * deprecated address handling - worse, they
 1507                                  * are not exactly the same.  I believe 5.5.4
 1508                                  * is the best one, so we follow 5.5.4.
 1509                                  */
 1510                                 if (af == AF_INET6 && !ip6_use_deprecated) {
 1511                                         struct in6_ifaddr *ia6;
 1512                                         if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
 1513                                             &ip6->ip6_dst)) &&
 1514                                             (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
 1515                                                 tp = NULL;
 1516                                                 goto dropwithreset;
 1517                                         }
 1518                                 }
 1519 #endif
 1520 
 1521 #if defined(IPSEC) || defined(FAST_IPSEC)
 1522                                 switch (af) {
 1523 #ifdef INET
 1524                                 case AF_INET:
 1525                                         if (ipsec4_in_reject_so(m, so)) {
 1526                                                 ipsecstat.in_polvio++;
 1527                                                 tp = NULL;
 1528                                                 goto dropwithreset;
 1529                                         }
 1530                                         break;
 1531 #endif
 1532 #ifdef INET6
 1533                                 case AF_INET6:
 1534                                         if (ipsec6_in_reject_so(m, so)) {
 1535                                                 ipsec6stat.in_polvio++;
 1536                                                 tp = NULL;
 1537                                                 goto dropwithreset;
 1538                                         }
 1539                                         break;
 1540 #endif /*INET6*/
 1541                                 }
 1542 #endif /*IPSEC*/
 1543 
 1544                                 /*
 1545                                  * LISTEN socket received a SYN
 1546                                  * from itself?  This can't possibly
 1547                                  * be valid; drop the packet.
 1548                                  */
 1549                                 if (th->th_sport == th->th_dport) {
 1550                                         int i;
 1551 
 1552                                         switch (af) {
 1553 #ifdef INET
 1554                                         case AF_INET:
 1555                                                 i = in_hosteq(ip->ip_src, ip->ip_dst);
 1556                                                 break;
 1557 #endif
 1558 #ifdef INET6
 1559                                         case AF_INET6:
 1560                                                 i = IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &ip6->ip6_dst);
 1561                                                 break;
 1562 #endif
 1563                                         default:
 1564                                                 i = 1;
 1565                                         }
 1566                                         if (i) {
 1567                                                 tcpstat.tcps_badsyn++;
 1568                                                 goto drop;
 1569                                         }
 1570                                 }
 1571 
 1572                                 /*
 1573                                  * SYN looks ok; create compressed TCP
 1574                                  * state for it.
 1575                                  */
 1576                                 if (so->so_qlen <= so->so_qlimit &&
 1577                                     syn_cache_add(&src.sa, &dst.sa, th, tlen,
 1578                                                 so, m, optp, optlen, &opti))
 1579                                         m = NULL;
 1580                         }
 1581                         goto drop;
 1582                 }
 1583         }
 1584 
 1585 after_listen:
 1586 #ifdef DIAGNOSTIC
 1587         /*
 1588          * Should not happen now that all embryonic connections
 1589          * are handled with compressed state.
 1590          */
 1591         if (tp->t_state == TCPS_LISTEN)
 1592                 panic("tcp_input: TCPS_LISTEN");
 1593 #endif
 1594 
 1595         /*
 1596          * Segment received on connection.
 1597          * Reset idle time and keep-alive timer.
 1598          */
 1599         tp->t_rcvtime = tcp_now;
 1600         if (TCPS_HAVEESTABLISHED(tp->t_state))
 1601                 TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
 1602 
 1603         /*
 1604          * Process options.
 1605          */
 1606 #ifdef TCP_SIGNATURE
 1607         if (optp || (tp->t_flags & TF_SIGNATURE))
 1608 #else
 1609         if (optp)
 1610 #endif
 1611                 if (tcp_dooptions(tp, optp, optlen, th, m, toff, &opti) < 0)
 1612                         goto drop;
 1613 
 1614         if (TCP_SACK_ENABLED(tp)) {
 1615                 tcp_del_sackholes(tp, th);
 1616         }
 1617 
 1618         if (TCP_ECN_ALLOWED(tp)) {
 1619                 switch (iptos & IPTOS_ECN_MASK) {
 1620                 case IPTOS_ECN_CE:
 1621                         tp->t_flags |= TF_ECN_SND_ECE;
 1622                         tcpstat.tcps_ecn_ce++;
 1623                         break;
 1624                 case IPTOS_ECN_ECT0:
 1625                         tcpstat.tcps_ecn_ect++;
 1626                         break;
 1627                 case IPTOS_ECN_ECT1:
 1628                         /* XXX: ignore for now -- rpaulo */
 1629                         break;
 1630                 }
 1631 
 1632                 if (tiflags & TH_CWR)
 1633                         tp->t_flags &= ~TF_ECN_SND_ECE;
 1634 
 1635                 /*
 1636                  * Congestion experienced.
 1637                  * Ignore if we are already trying to recover.
 1638                  */
 1639                 if ((tiflags & TH_ECE) && SEQ_GEQ(tp->snd_una, tp->snd_recover))
 1640                         tp->t_congctl->cong_exp(tp);
 1641         }
 1642 
 1643         if (opti.ts_present && opti.ts_ecr) {
 1644                 /*
 1645                  * Calculate the RTT from the returned time stamp and the
 1646                  * connection's time base.  If the time stamp is later than
 1647                  * the current time, or is extremely old, fall back to non-1323
 1648                  * RTT calculation.  Since ts_ecr is unsigned, we can test both
 1649                  * at the same time.
 1650                  */
 1651                 ts_rtt = TCP_TIMESTAMP(tp) - opti.ts_ecr + 1;
 1652                 if (ts_rtt > TCP_PAWS_IDLE)
 1653                         ts_rtt = 0;
 1654         } else {
 1655                 ts_rtt = 0;
 1656         }
 1657 
 1658         /*
 1659          * Header prediction: check for the two common cases
 1660          * of a uni-directional data xfer.  If the packet has
 1661          * no control flags, is in-sequence, the window didn't
 1662          * change and we're not retransmitting, it's a
 1663          * candidate.  If the length is zero and the ack moved
 1664          * forward, we're the sender side of the xfer.  Just
 1665          * free the data acked & wake any higher level process
 1666          * that was blocked waiting for space.  If the length
 1667          * is non-zero and the ack didn't move, we're the
 1668          * receiver side.  If we're getting packets in-order
 1669          * (the reassembly queue is empty), add the data to
 1670          * the socket buffer and note that we need a delayed ack.
 1671          */
 1672         if (tp->t_state == TCPS_ESTABLISHED &&
 1673             (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK))
 1674                 == TH_ACK &&
 1675             (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
 1676             th->th_seq == tp->rcv_nxt &&
 1677             tiwin && tiwin == tp->snd_wnd &&
 1678             tp->snd_nxt == tp->snd_max) {
 1679 
 1680                 /*
 1681                  * If last ACK falls within this segment's sequence numbers,
 1682                  *  record the timestamp.
 1683                  * NOTE: 
 1684                  * 1) That the test incorporates suggestions from the latest
 1685                  *    proposal of the tcplw@cray.com list (Braden 1993/04/26).
 1686                  * 2) That updating only on newer timestamps interferes with
 1687                  *    our earlier PAWS tests, so this check should be solely
 1688                  *    predicated on the sequence space of this segment.
 1689                  * 3) That we modify the segment boundary check to be 
 1690                  *        Last.ACK.Sent <= SEG.SEQ + SEG.Len  
 1691                  *    instead of RFC1323's
 1692                  *        Last.ACK.Sent < SEG.SEQ + SEG.Len,
 1693                  *    This modified check allows us to overcome RFC1323's
 1694                  *    limitations as described in Stevens TCP/IP Illustrated
 1695                  *    Vol. 2 p.869. In such cases, we can still calculate the
 1696                  *    RTT correctly when RCV.NXT == Last.ACK.Sent.
 1697                  */
 1698                 if (opti.ts_present &&
 1699                     SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 1700                     SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
 1701                     ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
 1702                         tp->ts_recent_age = tcp_now;
 1703                         tp->ts_recent = opti.ts_val;
 1704                 }
 1705 
 1706                 if (tlen == 0) {
 1707                         /* Ack prediction. */
 1708                         if (SEQ_GT(th->th_ack, tp->snd_una) &&
 1709                             SEQ_LEQ(th->th_ack, tp->snd_max) &&
 1710                             tp->snd_cwnd >= tp->snd_wnd &&
 1711                             tp->t_partialacks < 0) {
 1712                                 /*
 1713                                  * this is a pure ack for outstanding data.
 1714                                  */
 1715                                 ++tcpstat.tcps_predack;
 1716                                 if (ts_rtt)
 1717                                         tcp_xmit_timer(tp, ts_rtt);
 1718                                 else if (tp->t_rtttime &&
 1719                                     SEQ_GT(th->th_ack, tp->t_rtseq))
 1720                                         tcp_xmit_timer(tp,
 1721                                           tcp_now - tp->t_rtttime);
 1722                                 acked = th->th_ack - tp->snd_una;
 1723                                 tcpstat.tcps_rcvackpack++;
 1724                                 tcpstat.tcps_rcvackbyte += acked;
 1725                                 ND6_HINT(tp);
 1726 
 1727                                 if (acked > (tp->t_lastoff - tp->t_inoff))
 1728                                         tp->t_lastm = NULL;
 1729                                 sbdrop(&so->so_snd, acked);
 1730                                 tp->t_lastoff -= acked;
 1731 
 1732                                 ICMP_CHECK(tp, th, acked);
 1733 
 1734                                 tp->snd_una = th->th_ack;
 1735                                 tp->snd_fack = tp->snd_una;
 1736                                 if (SEQ_LT(tp->snd_high, tp->snd_una))
 1737                                         tp->snd_high = tp->snd_una;
 1738                                 m_freem(m);
 1739 
 1740                                 /*
 1741                                  * If all outstanding data are acked, stop
 1742                                  * retransmit timer, otherwise restart timer
 1743                                  * using current (possibly backed-off) value.
 1744                                  * If process is waiting for space,
 1745                                  * wakeup/selwakeup/signal.  If data
 1746                                  * are ready to send, let tcp_output
 1747                                  * decide between more output or persist.
 1748                                  */
 1749                                 if (tp->snd_una == tp->snd_max)
 1750                                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1751                                 else if (TCP_TIMER_ISARMED(tp,
 1752                                     TCPT_PERSIST) == 0)
 1753                                         TCP_TIMER_ARM(tp, TCPT_REXMT,
 1754                                             tp->t_rxtcur);
 1755 
 1756                                 sowwakeup(so);
 1757                                 if (so->so_snd.sb_cc)
 1758                                         (void) tcp_output(tp);
 1759                                 if (tcp_saveti)
 1760                                         m_freem(tcp_saveti);
 1761                                 return;
 1762                         }
 1763                 } else if (th->th_ack == tp->snd_una &&
 1764                     TAILQ_FIRST(&tp->segq) == NULL &&
 1765                     tlen <= sbspace(&so->so_rcv)) {
 1766                         /*
 1767                          * this is a pure, in-sequence data packet
 1768                          * with nothing on the reassembly queue and
 1769                          * we have enough buffer space to take it.
 1770                          */
 1771                         ++tcpstat.tcps_preddat;
 1772                         tp->rcv_nxt += tlen;
 1773                         tcpstat.tcps_rcvpack++;
 1774                         tcpstat.tcps_rcvbyte += tlen;
 1775                         ND6_HINT(tp);
 1776                         /*
 1777                          * Drop TCP, IP headers and TCP options then add data
 1778                          * to socket buffer.
 1779                          */
 1780                         if (so->so_state & SS_CANTRCVMORE)
 1781                                 m_freem(m);
 1782                         else {
 1783                                 m_adj(m, toff + off);
 1784                                 sbappendstream(&so->so_rcv, m);
 1785                         }
 1786                         sorwakeup(so);
 1787                         TCP_SETUP_ACK(tp, th);
 1788                         if (tp->t_flags & TF_ACKNOW)
 1789                                 (void) tcp_output(tp);
 1790                         if (tcp_saveti)
 1791                                 m_freem(tcp_saveti);
 1792                         return;
 1793                 }
 1794         }
 1795 
 1796         /*
 1797          * Compute mbuf offset to TCP data segment.
 1798          */
 1799         hdroptlen = toff + off;
 1800 
 1801         /*
 1802          * Calculate amount of space in receive window,
 1803          * and then do TCP input processing.
 1804          * Receive window is amount of space in rcv queue,
 1805          * but not less than advertised window.
 1806          */
 1807         { int win;
 1808 
 1809         win = sbspace(&so->so_rcv);
 1810         if (win < 0)
 1811                 win = 0;
 1812         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 1813         }
 1814 
 1815         switch (tp->t_state) {
 1816         /*
 1817          * If the state is SYN_SENT:
 1818          *      if seg contains an ACK, but not for our SYN, drop the input.
 1819          *      if seg contains a RST, then drop the connection.
 1820          *      if seg does not contain SYN, then drop it.
 1821          * Otherwise this is an acceptable SYN segment
 1822          *      initialize tp->rcv_nxt and tp->irs
 1823          *      if seg contains ack then advance tp->snd_una
 1824          *      if seg contains an ECE and ECN support is enabled, the stream
 1825          *          is ECN capable.
 1826          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 1827          *      arrange for segment to be acked (eventually)
 1828          *      continue processing rest of data/controls, beginning with URG
 1829          */
 1830         case TCPS_SYN_SENT:
 1831                 if ((tiflags & TH_ACK) &&
 1832                     (SEQ_LEQ(th->th_ack, tp->iss) ||
 1833                      SEQ_GT(th->th_ack, tp->snd_max)))
 1834                         goto dropwithreset;
 1835                 if (tiflags & TH_RST) {
 1836                         if (tiflags & TH_ACK)
 1837                                 tp = tcp_drop(tp, ECONNREFUSED);
 1838                         goto drop;
 1839                 }
 1840                 if ((tiflags & TH_SYN) == 0)
 1841                         goto drop;
 1842                 if (tiflags & TH_ACK) {
 1843                         tp->snd_una = th->th_ack;
 1844                         if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 1845                                 tp->snd_nxt = tp->snd_una;
 1846                         if (SEQ_LT(tp->snd_high, tp->snd_una))
 1847                                 tp->snd_high = tp->snd_una;
 1848                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1849 
 1850                         if ((tiflags & TH_ECE) && tcp_do_ecn) {
 1851                                 tp->t_flags |= TF_ECN_PERMIT;
 1852                                 tcpstat.tcps_ecn_shs++;
 1853                         }
 1854 
 1855                 }
 1856                 tp->irs = th->th_seq;
 1857                 tcp_rcvseqinit(tp);
 1858                 tp->t_flags |= TF_ACKNOW;
 1859                 tcp_mss_from_peer(tp, opti.maxseg);
 1860 
 1861                 /*
 1862                  * Initialize the initial congestion window.  If we
 1863                  * had to retransmit the SYN, we must initialize cwnd
 1864                  * to 1 segment (i.e. the Loss Window).
 1865                  */
 1866                 if (tp->t_flags & TF_SYN_REXMT)
 1867                         tp->snd_cwnd = tp->t_peermss;
 1868                 else {
 1869                         int ss = tcp_init_win;
 1870 #ifdef INET
 1871                         if (inp != NULL && in_localaddr(inp->inp_faddr))
 1872                                 ss = tcp_init_win_local;
 1873 #endif
 1874 #ifdef INET6
 1875                         if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
 1876                                 ss = tcp_init_win_local;
 1877 #endif
 1878                         tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
 1879                 }
 1880 
 1881                 tcp_rmx_rtt(tp);
 1882                 if (tiflags & TH_ACK) {
 1883                         tcpstat.tcps_connects++;
 1884                         soisconnected(so);
 1885                         tcp_established(tp);
 1886                         /* Do window scaling on this connection? */
 1887                         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 1888                             (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 1889                                 tp->snd_scale = tp->requested_s_scale;
 1890                                 tp->rcv_scale = tp->request_r_scale;
 1891                         }
 1892                         TCP_REASS_LOCK(tp);
 1893                         (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
 1894                         TCP_REASS_UNLOCK(tp);
 1895                         /*
 1896                          * if we didn't have to retransmit the SYN,
 1897                          * use its rtt as our initial srtt & rtt var.
 1898                          */
 1899                         if (tp->t_rtttime)
 1900                                 tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
 1901                 } else
 1902                         tp->t_state = TCPS_SYN_RECEIVED;
 1903 
 1904                 /*
 1905                  * Advance th->th_seq to correspond to first data byte.
 1906                  * If data, trim to stay within window,
 1907                  * dropping FIN if necessary.
 1908                  */
 1909                 th->th_seq++;
 1910                 if (tlen > tp->rcv_wnd) {
 1911                         todrop = tlen - tp->rcv_wnd;
 1912                         m_adj(m, -todrop);
 1913                         tlen = tp->rcv_wnd;
 1914                         tiflags &= ~TH_FIN;
 1915                         tcpstat.tcps_rcvpackafterwin++;
 1916                         tcpstat.tcps_rcvbyteafterwin += todrop;
 1917                 }
 1918                 tp->snd_wl1 = th->th_seq - 1;
 1919                 tp->rcv_up = th->th_seq;
 1920                 goto step6;
 1921 
 1922         /*
 1923          * If the state is SYN_RECEIVED:
 1924          *      If seg contains an ACK, but not for our SYN, drop the input
 1925          *      and generate an RST.  See page 36, rfc793
 1926          */
 1927         case TCPS_SYN_RECEIVED:
 1928                 if ((tiflags & TH_ACK) &&
 1929                     (SEQ_LEQ(th->th_ack, tp->iss) ||
 1930                      SEQ_GT(th->th_ack, tp->snd_max)))
 1931                         goto dropwithreset;
 1932                 break;
 1933         }
 1934 
 1935         /*
 1936          * States other than LISTEN or SYN_SENT.
 1937          * First check timestamp, if present.
 1938          * Then check that at least some bytes of segment are within
 1939          * receive window.  If segment begins before rcv_nxt,
 1940          * drop leading data (and SYN); if nothing left, just ack.
 1941          *
 1942          * RFC 1323 PAWS: If we have a timestamp reply on this segment
 1943          * and it's less than ts_recent, drop it.
 1944          */
 1945         if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
 1946             TSTMP_LT(opti.ts_val, tp->ts_recent)) {
 1947 
 1948                 /* Check to see if ts_recent is over 24 days old.  */
 1949                 if (tcp_now - tp->ts_recent_age > TCP_PAWS_IDLE) {
 1950                         /*
 1951                          * Invalidate ts_recent.  If this segment updates
 1952                          * ts_recent, the age will be reset later and ts_recent
 1953                          * will get a valid value.  If it does not, setting
 1954                          * ts_recent to zero will at least satisfy the
 1955                          * requirement that zero be placed in the timestamp
 1956                          * echo reply when ts_recent isn't valid.  The
 1957                          * age isn't reset until we get a valid ts_recent
 1958                          * because we don't want out-of-order segments to be
 1959                          * dropped when ts_recent is old.
 1960                          */
 1961                         tp->ts_recent = 0;
 1962                 } else {
 1963                         tcpstat.tcps_rcvduppack++;
 1964                         tcpstat.tcps_rcvdupbyte += tlen;
 1965                         tcpstat.tcps_pawsdrop++;
 1966                         tcp_new_dsack(tp, th->th_seq, tlen);
 1967                         goto dropafterack;
 1968                 }
 1969         }
 1970 
 1971         todrop = tp->rcv_nxt - th->th_seq;
 1972         dupseg = FALSE;
 1973         if (todrop > 0) {
 1974                 if (tiflags & TH_SYN) {
 1975                         tiflags &= ~TH_SYN;
 1976                         th->th_seq++;
 1977                         if (th->th_urp > 1)
 1978                                 th->th_urp--;
 1979                         else {
 1980                                 tiflags &= ~TH_URG;
 1981                                 th->th_urp = 0;
 1982                         }
 1983                         todrop--;
 1984                 }
 1985                 if (todrop > tlen ||
 1986                     (todrop == tlen && (tiflags & TH_FIN) == 0)) {
 1987                         /*
 1988                          * Any valid FIN or RST must be to the left of the
 1989                          * window.  At this point the FIN or RST must be a
 1990                          * duplicate or out of sequence; drop it.
 1991                          */
 1992                         if (tiflags & TH_RST)
 1993                                 goto drop;
 1994                         tiflags &= ~(TH_FIN|TH_RST);
 1995                         /*
 1996                          * Send an ACK to resynchronize and drop any data.
 1997                          * But keep on processing for RST or ACK.
 1998                          */
 1999                         tp->t_flags |= TF_ACKNOW;
 2000                         todrop = tlen;
 2001                         dupseg = TRUE;
 2002                         tcpstat.tcps_rcvdupbyte += todrop;
 2003                         tcpstat.tcps_rcvduppack++;
 2004                 } else if ((tiflags & TH_RST) &&
 2005                            th->th_seq != tp->last_ack_sent) {
 2006                         /*
 2007                          * Test for reset before adjusting the sequence
 2008                          * number for overlapping data.
 2009                          */
 2010                         goto dropafterack_ratelim;
 2011                 } else {
 2012                         tcpstat.tcps_rcvpartduppack++;
 2013                         tcpstat.tcps_rcvpartdupbyte += todrop;
 2014                 }
 2015                 tcp_new_dsack(tp, th->th_seq, todrop);
 2016                 hdroptlen += todrop;    /*drop from head afterwards*/
 2017                 th->th_seq += todrop;
 2018                 tlen -= todrop;
 2019                 if (th->th_urp > todrop)
 2020                         th->th_urp -= todrop;
 2021                 else {
 2022                         tiflags &= ~TH_URG;
 2023                         th->th_urp = 0;
 2024                 }
 2025         }
 2026 
 2027         /*
 2028          * If new data are received on a connection after the
 2029          * user processes are gone, then RST the other end.
 2030          */
 2031         if ((so->so_state & SS_NOFDREF) &&
 2032             tp->t_state > TCPS_CLOSE_WAIT && tlen) {
 2033                 tp = tcp_close(tp);
 2034                 tcpstat.tcps_rcvafterclose++;
 2035                 goto dropwithreset;
 2036         }
 2037 
 2038         /*
 2039          * If segment ends after window, drop trailing data
 2040          * (and PUSH and FIN); if nothing left, just ACK.
 2041          */
 2042         todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
 2043         if (todrop > 0) {
 2044                 tcpstat.tcps_rcvpackafterwin++;
 2045                 if (todrop >= tlen) {
 2046                         /*
 2047                          * The segment actually starts after the window.
 2048                          * th->th_seq + tlen - tp->rcv_nxt - tp->rcv_wnd >= tlen
 2049                          * th->th_seq - tp->rcv_nxt - tp->rcv_wnd >= 0
 2050                          * th->th_seq >= tp->rcv_nxt + tp->rcv_wnd
 2051                          */
 2052                         tcpstat.tcps_rcvbyteafterwin += tlen;
 2053                         /*
 2054                          * If a new connection request is received
 2055                          * while in TIME_WAIT, drop the old connection
 2056                          * and start over if the sequence numbers
 2057                          * are above the previous ones.
 2058                          *
 2059                          * NOTE: We will checksum the packet again, and
 2060                          * so we need to put the header fields back into
 2061                          * network order!
 2062                          * XXX This kind of sucks, but we don't expect
 2063                          * XXX this to happen very often, so maybe it
 2064                          * XXX doesn't matter so much.
 2065                          */
 2066                         if (tiflags & TH_SYN &&
 2067                             tp->t_state == TCPS_TIME_WAIT &&
 2068                             SEQ_GT(th->th_seq, tp->rcv_nxt)) {
 2069                                 tp = tcp_close(tp);
 2070                                 TCP_FIELDS_TO_NET(th);
 2071                                 goto findpcb;
 2072                         }
 2073                         /*
 2074                          * If window is closed can only take segments at
 2075                          * window edge, and have to drop data and PUSH from
 2076                          * incoming segments.  Continue processing, but
 2077                          * remember to ack.  Otherwise, drop segment
 2078                          * and (if not RST) ack.
 2079                          */
 2080                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 2081                                 tp->t_flags |= TF_ACKNOW;
 2082                                 tcpstat.tcps_rcvwinprobe++;
 2083                         } else
 2084                                 goto dropafterack;
 2085                 } else
 2086                         tcpstat.tcps_rcvbyteafterwin += todrop;
 2087                 m_adj(m, -todrop);
 2088                 tlen -= todrop;
 2089                 tiflags &= ~(TH_PUSH|TH_FIN);
 2090         }
 2091 
 2092         /*
 2093          * If last ACK falls within this segment's sequence numbers,
 2094          * and the timestamp is newer, record it.
 2095          */
 2096         if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
 2097             SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
 2098             SEQ_LT(tp->last_ack_sent, th->th_seq + tlen +
 2099                    ((tiflags & (TH_SYN|TH_FIN)) != 0))) {
 2100                 tp->ts_recent_age = tcp_now;
 2101                 tp->ts_recent = opti.ts_val;
 2102         }
 2103 
 2104         /*
 2105          * If the RST bit is set examine the state:
 2106          *    SYN_RECEIVED STATE:
 2107          *      If passive open, return to LISTEN state.
 2108          *      If active open, inform user that connection was refused.
 2109          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
 2110          *      Inform user that connection was reset, and close tcb.
 2111          *    CLOSING, LAST_ACK, TIME_WAIT STATES
 2112          *      Close the tcb.
 2113          */
 2114         if (tiflags & TH_RST) {
 2115                 if (th->th_seq != tp->last_ack_sent)
 2116                         goto dropafterack_ratelim;
 2117 
 2118                 switch (tp->t_state) {
 2119                 case TCPS_SYN_RECEIVED:
 2120                         so->so_error = ECONNREFUSED;
 2121                         goto close;
 2122 
 2123                 case TCPS_ESTABLISHED:
 2124                 case TCPS_FIN_WAIT_1:
 2125                 case TCPS_FIN_WAIT_2:
 2126                 case TCPS_CLOSE_WAIT:
 2127                         so->so_error = ECONNRESET;
 2128                 close:
 2129                         tp->t_state = TCPS_CLOSED;
 2130                         tcpstat.tcps_drops++;
 2131                         tp = tcp_close(tp);
 2132                         goto drop;
 2133 
 2134                 case TCPS_CLOSING:
 2135                 case TCPS_LAST_ACK:
 2136                 case TCPS_TIME_WAIT:
 2137                         tp = tcp_close(tp);
 2138                         goto drop;
 2139                 }
 2140         }
 2141 
 2142         /*
 2143          * Since we've covered the SYN-SENT and SYN-RECEIVED states above
 2144          * we must be in a synchronized state.  RFC793 states (under RST
 2145          * generation) that any unacceptable segment (an out-of-order SYN
 2146          * qualifies) received in a synchronized state must elicit only an
 2147          * empty acknowledgment segment ... and the connection remains in
 2148          * the same state.
 2149          */
 2150         if (tiflags & TH_SYN) {
 2151                 if (tp->rcv_nxt == th->th_seq) {
 2152                         tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack - 1,
 2153                             TH_ACK);
 2154                         if (tcp_saveti)
 2155                                 m_freem(tcp_saveti);
 2156                         return;
 2157                 }
 2158 
 2159                 goto dropafterack_ratelim;
 2160         }
 2161 
 2162         /*
 2163          * If the ACK bit is off we drop the segment and return.
 2164          */
 2165         if ((tiflags & TH_ACK) == 0) {
 2166                 if (tp->t_flags & TF_ACKNOW)
 2167                         goto dropafterack;
 2168                 else
 2169                         goto drop;
 2170         }
 2171 
 2172         /*
 2173          * Ack processing.
 2174          */
 2175         switch (tp->t_state) {
 2176 
 2177         /*
 2178          * In SYN_RECEIVED state if the ack ACKs our SYN then enter
 2179          * ESTABLISHED state and continue processing, otherwise
 2180          * send an RST.
 2181          */
 2182         case TCPS_SYN_RECEIVED:
 2183                 if (SEQ_GT(tp->snd_una, th->th_ack) ||
 2184                     SEQ_GT(th->th_ack, tp->snd_max))
 2185                         goto dropwithreset;
 2186                 tcpstat.tcps_connects++;
 2187                 soisconnected(so);
 2188                 tcp_established(tp);
 2189                 /* Do window scaling? */
 2190                 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 2191                     (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 2192                         tp->snd_scale = tp->requested_s_scale;
 2193                         tp->rcv_scale = tp->request_r_scale;
 2194                 }
 2195                 TCP_REASS_LOCK(tp);
 2196                 (void) tcp_reass(tp, NULL, (struct mbuf *)0, &tlen);
 2197                 TCP_REASS_UNLOCK(tp);
 2198                 tp->snd_wl1 = th->th_seq - 1;
 2199                 /* fall into ... */
 2200 
 2201         /*
 2202          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
 2203          * ACKs.  If the ack is in the range
 2204          *      tp->snd_una < th->th_ack <= tp->snd_max
 2205          * then advance tp->snd_una to th->th_ack and drop
 2206          * data from the retransmission queue.  If this ACK reflects
 2207          * more up to date window information we update our window information.
 2208          */
 2209         case TCPS_ESTABLISHED:
 2210         case TCPS_FIN_WAIT_1:
 2211         case TCPS_FIN_WAIT_2:
 2212         case TCPS_CLOSE_WAIT:
 2213         case TCPS_CLOSING:
 2214         case TCPS_LAST_ACK:
 2215         case TCPS_TIME_WAIT:
 2216 
 2217                 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 2218                         if (tlen == 0 && !dupseg && tiwin == tp->snd_wnd) {
 2219                                 tcpstat.tcps_rcvdupack++;
 2220                                 /*
 2221                                  * If we have outstanding data (other than
 2222                                  * a window probe), this is a completely
 2223                                  * duplicate ack (ie, window info didn't
 2224                                  * change), the ack is the biggest we've
 2225                                  * seen and we've seen exactly our rexmt
 2226                                  * threshold of them, assume a packet
 2227                                  * has been dropped and retransmit it.
 2228                                  * Kludge snd_nxt & the congestion
 2229                                  * window so we send only this one
 2230                                  * packet.
 2231                                  */
 2232                                 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 ||
 2233                                     th->th_ack != tp->snd_una)
 2234                                         tp->t_dupacks = 0;
 2235                                 else if (tp->t_partialacks < 0 &&
 2236                                          ((!TCP_SACK_ENABLED(tp) &&
 2237                                          ++tp->t_dupacks == tcprexmtthresh) ||
 2238                                          TCP_FACK_FASTRECOV(tp))) {
 2239                                         /*
 2240                                          * Do the fast retransmit, and adjust
 2241                                          * congestion control parameters.
 2242                                          */
 2243                                         if (tp->t_congctl->fast_retransmit(tp, th)) {
 2244                                                 /* False fast retransmit */
 2245                                                 break;
 2246                                         } else
 2247                                                 goto drop;
 2248                                 } else if (tp->t_dupacks > tcprexmtthresh) {
 2249                                         tp->snd_cwnd += tp->t_segsz;
 2250                                         (void) tcp_output(tp);
 2251                                         goto drop;
 2252                                 }
 2253                         } else {
 2254                                 /*
 2255                                  * If the ack appears to be very old, only
 2256                                  * allow data that is in-sequence.  This
 2257                                  * makes it somewhat more difficult to insert
 2258                                  * forged data by guessing sequence numbers.
 2259                                  * Send an ack to try to update the send
 2260                                  * sequence number on the other side.
 2261                                  */
 2262                                 if (tlen && th->th_seq != tp->rcv_nxt &&
 2263                                     SEQ_LT(th->th_ack,
 2264                                     tp->snd_una - tp->max_sndwnd))
 2265                                         goto dropafterack;
 2266                         }
 2267                         break;
 2268                 }
 2269                 /*
 2270                  * If the congestion window was inflated to account
 2271                  * for the other side's cached packets, retract it.
 2272                  */
 2273                 /* XXX: make SACK have his own congestion control
 2274                  * struct -- rpaulo */
 2275                 if (TCP_SACK_ENABLED(tp))
 2276                         tcp_sack_newack(tp, th);
 2277                 else
 2278                         tp->t_congctl->fast_retransmit_newack(tp, th);
 2279                 if (SEQ_GT(th->th_ack, tp->snd_max)) {
 2280                         tcpstat.tcps_rcvacktoomuch++;
 2281                         goto dropafterack;
 2282                 }
 2283                 acked = th->th_ack - tp->snd_una;
 2284                 tcpstat.tcps_rcvackpack++;
 2285                 tcpstat.tcps_rcvackbyte += acked;
 2286 
 2287                 /*
 2288                  * If we have a timestamp reply, update smoothed
 2289                  * round trip time.  If no timestamp is present but
 2290                  * transmit timer is running and timed sequence
 2291                  * number was acked, update smoothed round trip time.
 2292                  * Since we now have an rtt measurement, cancel the
 2293                  * timer backoff (cf., Phil Karn's retransmit alg.).
 2294                  * Recompute the initial retransmit timer.
 2295                  */
 2296                 if (ts_rtt)
 2297                         tcp_xmit_timer(tp, ts_rtt);
 2298                 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
 2299                         tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
 2300 
 2301                 /*
 2302                  * If all outstanding data is acked, stop retransmit
 2303                  * timer and remember to restart (more output or persist).
 2304                  * If there is more data to be acked, restart retransmit
 2305                  * timer, using current (possibly backed-off) value.
 2306                  */
 2307                 if (th->th_ack == tp->snd_max) {
 2308                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 2309                         needoutput = 1;
 2310                 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
 2311                         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 2312 
 2313                 /*
 2314                  * New data has been acked, adjust the congestion window.
 2315                  */
 2316                 tp->t_congctl->newack(tp, th);
 2317 
 2318                 ND6_HINT(tp);
 2319                 if (acked > so->so_snd.sb_cc) {
 2320                         tp->snd_wnd -= so->so_snd.sb_cc;
 2321                         sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
 2322                         ourfinisacked = 1;
 2323                 } else {
 2324                         if (acked > (tp->t_lastoff - tp->t_inoff))
 2325                                 tp->t_lastm = NULL;
 2326                         sbdrop(&so->so_snd, acked);
 2327                         tp->t_lastoff -= acked;
 2328                         tp->snd_wnd -= acked;
 2329                         ourfinisacked = 0;
 2330                 }
 2331                 sowwakeup(so);
 2332 
 2333                 ICMP_CHECK(tp, th, acked);
 2334 
 2335                 tp->snd_una = th->th_ack;
 2336                 if (SEQ_GT(tp->snd_una, tp->snd_fack))
 2337                         tp->snd_fack = tp->snd_una;
 2338                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 2339                         tp->snd_nxt = tp->snd_una;
 2340                 if (SEQ_LT(tp->snd_high, tp->snd_una))
 2341                         tp->snd_high = tp->snd_una;
 2342 
 2343                 switch (tp->t_state) {
 2344 
 2345                 /*
 2346                  * In FIN_WAIT_1 STATE in addition to the processing
 2347                  * for the ESTABLISHED state if our FIN is now acknowledged
 2348                  * then enter FIN_WAIT_2.
 2349                  */
 2350                 case TCPS_FIN_WAIT_1:
 2351                         if (ourfinisacked) {
 2352                                 /*
 2353                                  * If we can't receive any more
 2354                                  * data, then closing user can proceed.
 2355                                  * Starting the timer is contrary to the
 2356                                  * specification, but if we don't get a FIN
 2357                                  * we'll hang forever.
 2358                                  */
 2359                                 if (so->so_state & SS_CANTRCVMORE) {
 2360                                         soisdisconnected(so);
 2361                                         if (tcp_maxidle > 0)
 2362                                                 TCP_TIMER_ARM(tp, TCPT_2MSL,
 2363                                                     tcp_maxidle);
 2364                                 }
 2365                                 tp->t_state = TCPS_FIN_WAIT_2;
 2366                         }
 2367                         break;
 2368 
 2369                 /*
 2370                  * In CLOSING STATE in addition to the processing for
 2371                  * the ESTABLISHED state if the ACK acknowledges our FIN
 2372                  * then enter the TIME-WAIT state, otherwise ignore
 2373                  * the segment.
 2374                  */
 2375                 case TCPS_CLOSING:
 2376                         if (ourfinisacked) {
 2377                                 tp->t_state = TCPS_TIME_WAIT;
 2378                                 tcp_canceltimers(tp);
 2379                                 TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 2380                                 soisdisconnected(so);
 2381                         }
 2382                         break;
 2383 
 2384                 /*
 2385                  * In LAST_ACK, we may still be waiting for data to drain
 2386                  * and/or to be acked, as well as for the ack of our FIN.
 2387                  * If our FIN is now acknowledged, delete the TCB,
 2388                  * enter the closed state and return.
 2389                  */
 2390                 case TCPS_LAST_ACK:
 2391                         if (ourfinisacked) {
 2392                                 tp = tcp_close(tp);
 2393                                 goto drop;
 2394                         }
 2395                         break;
 2396 
 2397                 /*
 2398                  * In TIME_WAIT state the only thing that should arrive
 2399                  * is a retransmission of the remote FIN.  Acknowledge
 2400                  * it and restart the finack timer.
 2401                  */
 2402                 case TCPS_TIME_WAIT:
 2403                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 2404                         goto dropafterack;
 2405                 }
 2406         }
 2407 
 2408 step6:
 2409         /*
 2410          * Update window information.
 2411          * Don't look at window if no ACK: TAC's send garbage on first SYN.
 2412          */
 2413         if ((tiflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) ||
 2414             (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 2415             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 2416                 /* keep track of pure window updates */
 2417                 if (tlen == 0 &&
 2418                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 2419                         tcpstat.tcps_rcvwinupd++;
 2420                 tp->snd_wnd = tiwin;
 2421                 tp->snd_wl1 = th->th_seq;
 2422                 tp->snd_wl2 = th->th_ack;
 2423                 if (tp->snd_wnd > tp->max_sndwnd)
 2424                         tp->max_sndwnd = tp->snd_wnd;
 2425                 needoutput = 1;
 2426         }
 2427 
 2428         /*
 2429          * Process segments with URG.
 2430          */
 2431         if ((tiflags & TH_URG) && th->th_urp &&
 2432             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 2433                 /*
 2434                  * This is a kludge, but if we receive and accept
 2435                  * random urgent pointers, we'll crash in
 2436                  * soreceive.  It's hard to imagine someone
 2437                  * actually wanting to send this much urgent data.
 2438                  */
 2439                 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
 2440                         th->th_urp = 0;                 /* XXX */
 2441                         tiflags &= ~TH_URG;             /* XXX */
 2442                         goto dodata;                    /* XXX */
 2443                 }
 2444                 /*
 2445                  * If this segment advances the known urgent pointer,
 2446                  * then mark the data stream.  This should not happen
 2447                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
 2448                  * a FIN has been received from the remote side.
 2449                  * In these states we ignore the URG.
 2450                  *
 2451                  * According to RFC961 (Assigned Protocols),
 2452                  * the urgent pointer points to the last octet
 2453                  * of urgent data.  We continue, however,
 2454                  * to consider it to indicate the first octet
 2455                  * of data past the urgent section as the original
 2456                  * spec states (in one of two places).
 2457                  */
 2458                 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 2459                         tp->rcv_up = th->th_seq + th->th_urp;
 2460                         so->so_oobmark = so->so_rcv.sb_cc +
 2461                             (tp->rcv_up - tp->rcv_nxt) - 1;
 2462                         if (so->so_oobmark == 0)
 2463                                 so->so_state |= SS_RCVATMARK;
 2464                         sohasoutofband(so);
 2465                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 2466                 }
 2467                 /*
 2468                  * Remove out of band data so doesn't get presented to user.
 2469                  * This can happen independent of advancing the URG pointer,
 2470                  * but if two URG's are pending at once, some out-of-band
 2471                  * data may creep in... ick.
 2472                  */
 2473                 if (th->th_urp <= (u_int16_t) tlen
 2474 #ifdef SO_OOBINLINE
 2475                      && (so->so_options & SO_OOBINLINE) == 0
 2476 #endif
 2477                      )
 2478                         tcp_pulloutofband(so, th, m, hdroptlen);
 2479         } else
 2480                 /*
 2481                  * If no out of band data is expected,
 2482                  * pull receive urgent pointer along
 2483                  * with the receive window.
 2484                  */
 2485                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 2486                         tp->rcv_up = tp->rcv_nxt;
 2487 dodata:                                                 /* XXX */
 2488 
 2489         /*
 2490          * Process the segment text, merging it into the TCP sequencing queue,
 2491          * and arranging for acknowledgement of receipt if necessary.
 2492          * This process logically involves adjusting tp->rcv_wnd as data
 2493          * is presented to the user (this happens in tcp_usrreq.c,
 2494          * case PRU_RCVD).  If a FIN has already been received on this
 2495          * connection then we just ignore the text.
 2496          */
 2497         if ((tlen || (tiflags & TH_FIN)) &&
 2498             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 2499                 /*
 2500                  * Insert segment ti into reassembly queue of tcp with
 2501                  * control block tp.  Return TH_FIN if reassembly now includes
 2502                  * a segment with FIN.  The macro form does the common case
 2503                  * inline (segment is the next to be received on an
 2504                  * established connection, and the queue is empty),
 2505                  * avoiding linkage into and removal from the queue and
 2506                  * repetition of various conversions.
 2507                  * Set DELACK for segments received in order, but ack
 2508                  * immediately when segments are out of order
 2509                  * (so fast retransmit can work).
 2510                  */
 2511                 /* NOTE: this was TCP_REASS() macro, but used only once */
 2512                 TCP_REASS_LOCK(tp);
 2513                 if (th->th_seq == tp->rcv_nxt &&
 2514                     TAILQ_FIRST(&tp->segq) == NULL &&
 2515                     tp->t_state == TCPS_ESTABLISHED) {
 2516                         TCP_SETUP_ACK(tp, th);
 2517                         tp->rcv_nxt += tlen;
 2518                         tiflags = th->th_flags & TH_FIN;
 2519                         tcpstat.tcps_rcvpack++;
 2520                         tcpstat.tcps_rcvbyte += tlen;
 2521                         ND6_HINT(tp);
 2522                         if (so->so_state & SS_CANTRCVMORE)
 2523                                 m_freem(m);
 2524                         else {
 2525                                 m_adj(m, hdroptlen);
 2526                                 sbappendstream(&(so)->so_rcv, m);
 2527                         }
 2528                         sorwakeup(so);
 2529                 } else {
 2530                         m_adj(m, hdroptlen);
 2531                         tiflags = tcp_reass(tp, th, m, &tlen);
 2532                         tp->t_flags |= TF_ACKNOW;
 2533                 }
 2534                 TCP_REASS_UNLOCK(tp);
 2535 
 2536                 /*
 2537                  * Note the amount of data that peer has sent into
 2538                  * our window, in order to estimate the sender's
 2539                  * buffer size.
 2540                  */
 2541                 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
 2542         } else {
 2543                 m_freem(m);
 2544                 m = NULL;
 2545                 tiflags &= ~TH_FIN;
 2546         }
 2547 
 2548         /*
 2549          * If FIN is received ACK the FIN and let the user know
 2550          * that the connection is closing.  Ignore a FIN received before
 2551          * the connection is fully established.
 2552          */
 2553         if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
 2554                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 2555                         socantrcvmore(so);
 2556                         tp->t_flags |= TF_ACKNOW;
 2557                         tp->rcv_nxt++;
 2558                 }
 2559                 switch (tp->t_state) {
 2560 
 2561                 /*
 2562                  * In ESTABLISHED STATE enter the CLOSE_WAIT state.
 2563                  */
 2564                 case TCPS_ESTABLISHED:
 2565                         tp->t_state = TCPS_CLOSE_WAIT;
 2566                         break;
 2567 
 2568                 /*
 2569                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
 2570                  * enter the CLOSING state.
 2571                  */
 2572                 case TCPS_FIN_WAIT_1:
 2573                         tp->t_state = TCPS_CLOSING;
 2574                         break;
 2575 
 2576                 /*
 2577                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
 2578                  * starting the time-wait timer, turning off the other
 2579                  * standard timers.
 2580                  */
 2581                 case TCPS_FIN_WAIT_2:
 2582                         tp->t_state = TCPS_TIME_WAIT;
 2583                         tcp_canceltimers(tp);
 2584                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 2585                         soisdisconnected(so);
 2586                         break;
 2587 
 2588                 /*
 2589                  * In TIME_WAIT state restart the 2 MSL time_wait timer.
 2590                  */
 2591                 case TCPS_TIME_WAIT:
 2592                         TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
 2593                         break;
 2594                 }
 2595         }
 2596 #ifdef TCP_DEBUG
 2597         if (so->so_options & SO_DEBUG)
 2598                 tcp_trace(TA_INPUT, ostate, tp, tcp_saveti, 0);
 2599 #endif
 2600 
 2601         /*
 2602          * Return any desired output.
 2603          */
 2604         if (needoutput || (tp->t_flags & TF_ACKNOW)) {
 2605                 (void) tcp_output(tp);
 2606         }
 2607         if (tcp_saveti)
 2608                 m_freem(tcp_saveti);
 2609         return;
 2610 
 2611 badsyn:
 2612         /*
 2613          * Received a bad SYN.  Increment counters and dropwithreset.
 2614          */
 2615         tcpstat.tcps_badsyn++;
 2616         tp = NULL;
 2617         goto dropwithreset;
 2618 
 2619 dropafterack:
 2620         /*
 2621          * Generate an ACK dropping incoming segment if it occupies
 2622          * sequence space, where the ACK reflects our state.
 2623          */
 2624         if (tiflags & TH_RST)
 2625                 goto drop;
 2626         goto dropafterack2;
 2627 
 2628 dropafterack_ratelim:
 2629         /*
 2630          * We may want to rate-limit ACKs against SYN/RST attack.
 2631          */
 2632         if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
 2633             tcp_ackdrop_ppslim) == 0) {
 2634                 /* XXX stat */
 2635                 goto drop;
 2636         }
 2637         /* ...fall into dropafterack2... */
 2638 
 2639 dropafterack2:
 2640         m_freem(m);
 2641         tp->t_flags |= TF_ACKNOW;
 2642         (void) tcp_output(tp);
 2643         if (tcp_saveti)
 2644                 m_freem(tcp_saveti);
 2645         return;
 2646 
 2647 dropwithreset_ratelim:
 2648         /*
 2649          * We may want to rate-limit RSTs in certain situations,
 2650          * particularly if we are sending an RST in response to
 2651          * an attempt to connect to or otherwise communicate with
 2652          * a port for which we have no socket.
 2653          */
 2654         if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
 2655             tcp_rst_ppslim) == 0) {
 2656                 /* XXX stat */
 2657                 goto drop;
 2658         }
 2659         /* ...fall into dropwithreset... */
 2660 
 2661 dropwithreset:
 2662         /*
 2663          * Generate a RST, dropping incoming segment.
 2664          * Make ACK acceptable to originator of segment.
 2665          */
 2666         if (tiflags & TH_RST)
 2667                 goto drop;
 2668 
 2669         switch (af) {
 2670 #ifdef INET6
 2671         case AF_INET6:
 2672                 /* For following calls to tcp_respond */
 2673                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
 2674                         goto drop;
 2675                 break;
 2676 #endif /* INET6 */
 2677         case AF_INET:
 2678                 if (IN_MULTICAST(ip->ip_dst.s_addr) ||
 2679                     in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 2680                         goto drop;
 2681         }
 2682 
 2683         if (tiflags & TH_ACK)
 2684                 (void)tcp_respond(tp, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
 2685         else {
 2686                 if (tiflags & TH_SYN)
 2687                         tlen++;
 2688                 (void)tcp_respond(tp, m, m, th, th->th_seq + tlen, (tcp_seq)0,
 2689                     TH_RST|TH_ACK);
 2690         }
 2691         if (tcp_saveti)
 2692                 m_freem(tcp_saveti);
 2693         return;
 2694 
 2695 badcsum:
 2696 drop:
 2697         /*
 2698          * Drop space held by incoming segment and return.
 2699          */
 2700         if (tp) {
 2701                 if (tp->t_inpcb)
 2702                         so = tp->t_inpcb->inp_socket;
 2703 #ifdef INET6
 2704                 else if (tp->t_in6pcb)
 2705                         so = tp->t_in6pcb->in6p_socket;
 2706 #endif
 2707                 else
 2708                         so = NULL;
 2709 #ifdef TCP_DEBUG
 2710                 if (so && (so->so_options & SO_DEBUG) != 0)
 2711                         tcp_trace(TA_DROP, ostate, tp, tcp_saveti, 0);
 2712 #endif
 2713         }
 2714         if (tcp_saveti)
 2715                 m_freem(tcp_saveti);
 2716         m_freem(m);
 2717         return;
 2718 }
 2719 
 2720 #ifdef TCP_SIGNATURE
 2721 int
 2722 tcp_signature_apply(void *fstate, caddr_t data, u_int len)
 2723 {
 2724 
 2725         MD5Update(fstate, (u_char *)data, len);
 2726         return (0);
 2727 }
 2728 
 2729 struct secasvar *
 2730 tcp_signature_getsav(struct mbuf *m, struct tcphdr *th)
 2731 {
 2732         struct secasvar *sav;
 2733 #ifdef FAST_IPSEC
 2734         union sockaddr_union dst;
 2735 #endif
 2736         struct ip *ip;
 2737         struct ip6_hdr *ip6;
 2738 
 2739         ip = mtod(m, struct ip *);
 2740         switch (ip->ip_v) {
 2741         case 4:
 2742                 ip = mtod(m, struct ip *);
 2743                 ip6 = NULL;
 2744                 break;
 2745         case 6:
 2746                 ip = NULL;
 2747                 ip6 = mtod(m, struct ip6_hdr *);
 2748                 break;
 2749         default:
 2750                 return (NULL);
 2751         }
 2752 
 2753 #ifdef FAST_IPSEC
 2754         /* Extract the destination from the IP header in the mbuf. */
 2755         bzero(&dst, sizeof(union sockaddr_union));
 2756         if (ip !=NULL) {
 2757                 dst.sa.sa_len = sizeof(struct sockaddr_in);
 2758                 dst.sa.sa_family = AF_INET;
 2759                 dst.sin.sin_addr = ip->ip_dst;
 2760         } else {
 2761                 dst.sa.sa_len = sizeof(struct sockaddr_in6);
 2762                 dst.sa.sa_family = AF_INET6;
 2763                 dst.sin6.sin6_addr = ip6->ip6_dst;
 2764         }
 2765 
 2766         /*
 2767          * Look up an SADB entry which matches the address of the peer.
 2768          */
 2769         sav = KEY_ALLOCSA(&dst, IPPROTO_TCP, htonl(TCP_SIG_SPI));
 2770 #else
 2771         if (ip)
 2772                 sav = key_allocsa(AF_INET, (caddr_t)&ip->ip_src,
 2773                     (caddr_t)&ip->ip_dst, IPPROTO_TCP,
 2774                     htonl(TCP_SIG_SPI), 0, 0);
 2775         else
 2776                 sav = key_allocsa(AF_INET6, (caddr_t)&ip6->ip6_src,
 2777                     (caddr_t)&ip6->ip6_dst, IPPROTO_TCP,
 2778                     htonl(TCP_SIG_SPI), 0, 0);
 2779 #endif
 2780 
 2781         return (sav);   /* freesav must be performed by caller */
 2782 }
 2783 
 2784 int
 2785 tcp_signature(struct mbuf *m, struct tcphdr *th, int thoff,
 2786     struct secasvar *sav, char *sig)
 2787 {
 2788         MD5_CTX ctx;
 2789         struct ip *ip;
 2790         struct ipovly *ipovly;
 2791         struct ip6_hdr *ip6;
 2792         struct ippseudo ippseudo;
 2793         struct ip6_hdr_pseudo ip6pseudo;
 2794         struct tcphdr th0;
 2795         int l, tcphdrlen;
 2796 
 2797         if (sav == NULL)
 2798                 return (-1);
 2799 
 2800         tcphdrlen = th->th_off * 4;
 2801 
 2802         switch (mtod(m, struct ip *)->ip_v) {
 2803         case 4:
 2804                 ip = mtod(m, struct ip *);
 2805                 ip6 = NULL;
 2806                 break;
 2807         case 6:
 2808                 ip = NULL;
 2809                 ip6 = mtod(m, struct ip6_hdr *);
 2810                 break;
 2811         default:
 2812                 return (-1);
 2813         }
 2814 
 2815         MD5Init(&ctx);
 2816 
 2817         if (ip) {
 2818                 memset(&ippseudo, 0, sizeof(ippseudo));
 2819                 ipovly = (struct ipovly *)ip;
 2820                 ippseudo.ippseudo_src = ipovly->ih_src;
 2821                 ippseudo.ippseudo_dst = ipovly->ih_dst;
 2822                 ippseudo.ippseudo_pad = 0;
 2823                 ippseudo.ippseudo_p = IPPROTO_TCP;
 2824                 ippseudo.ippseudo_len = htons(m->m_pkthdr.len - thoff);
 2825                 MD5Update(&ctx, (char *)&ippseudo, sizeof(ippseudo));
 2826         } else {
 2827                 memset(&ip6pseudo, 0, sizeof(ip6pseudo));
 2828                 ip6pseudo.ip6ph_src = ip6->ip6_src;
 2829                 in6_clearscope(&ip6pseudo.ip6ph_src);
 2830                 ip6pseudo.ip6ph_dst = ip6->ip6_dst;
 2831                 in6_clearscope(&ip6pseudo.ip6ph_dst);
 2832                 ip6pseudo.ip6ph_len = htons(m->m_pkthdr.len - thoff);
 2833                 ip6pseudo.ip6ph_nxt = IPPROTO_TCP;
 2834                 MD5Update(&ctx, (char *)&ip6pseudo, sizeof(ip6pseudo));
 2835         }
 2836 
 2837         th0 = *th;
 2838         th0.th_sum = 0;
 2839         MD5Update(&ctx, (char *)&th0, sizeof(th0));
 2840 
 2841         l = m->m_pkthdr.len - thoff - tcphdrlen;
 2842         if (l > 0)
 2843                 m_apply(m, thoff + tcphdrlen,
 2844                     m->m_pkthdr.len - thoff - tcphdrlen,
 2845                     tcp_signature_apply, &ctx);
 2846 
 2847         MD5Update(&ctx, _KEYBUF(sav->key_auth), _KEYLEN(sav->key_auth));
 2848         MD5Final(sig, &ctx);
 2849 
 2850         return (0);
 2851 }
 2852 #endif
 2853 
 2854 static int
 2855 tcp_dooptions(struct tcpcb *tp, const u_char *cp, int cnt,
 2856     struct tcphdr *th,
 2857     struct mbuf *m, int toff, struct tcp_opt_info *oi)
 2858 {
 2859         u_int16_t mss;
 2860         int opt, optlen = 0;
 2861 #ifdef TCP_SIGNATURE
 2862         caddr_t sigp = NULL;
 2863         char sigbuf[TCP_SIGLEN];
 2864         struct secasvar *sav = NULL;
 2865 #endif
 2866 
 2867         for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
 2868                 opt = cp[0];
 2869                 if (opt == TCPOPT_EOL)
 2870                         break;
 2871                 if (opt == TCPOPT_NOP)
 2872                         optlen = 1;
 2873                 else {
 2874                         if (cnt < 2)
 2875                                 break;
 2876                         optlen = cp[1];
 2877                         if (optlen < 2 || optlen > cnt)
 2878                                 break;
 2879                 }
 2880                 switch (opt) {
 2881 
 2882                 default:
 2883                         continue;
 2884 
 2885                 case TCPOPT_MAXSEG:
 2886                         if (optlen != TCPOLEN_MAXSEG)
 2887                                 continue;
 2888                         if (!(th->th_flags & TH_SYN))
 2889                                 continue;
 2890                         if (TCPS_HAVERCVDSYN(tp->t_state))
 2891                                 continue;
 2892                         bcopy(cp + 2, &mss, sizeof(mss));
 2893                         oi->maxseg = ntohs(mss);
 2894                         break;
 2895 
 2896                 case TCPOPT_WINDOW:
 2897                         if (optlen != TCPOLEN_WINDOW)
 2898                                 continue;
 2899                         if (!(th->th_flags & TH_SYN))
 2900                                 continue;
 2901                         if (TCPS_HAVERCVDSYN(tp->t_state))
 2902                                 continue;
 2903                         tp->t_flags |= TF_RCVD_SCALE;
 2904                         tp->requested_s_scale = cp[2];
 2905                         if (tp->requested_s_scale > TCP_MAX_WINSHIFT) {
 2906 #if 0   /*XXX*/
 2907                                 char *p;
 2908 
 2909                                 if (ip)
 2910                                         p = ntohl(ip->ip_src);
 2911 #ifdef INET6
 2912                                 else if (ip6)
 2913                                         p = ip6_sprintf(&ip6->ip6_src);
 2914 #endif
 2915                                 else
 2916                                         p = "(unknown)";
 2917                                 log(LOG_ERR, "TCP: invalid wscale %d from %s, "
 2918                                     "assuming %d\n",
 2919                                     tp->requested_s_scale, p,
 2920                                     TCP_MAX_WINSHIFT);
 2921 #else
 2922                                 log(LOG_ERR, "TCP: invalid wscale %d, "
 2923                                     "assuming %d\n",
 2924                                     tp->requested_s_scale,
 2925                                     TCP_MAX_WINSHIFT);
 2926 #endif
 2927                                 tp->requested_s_scale = TCP_MAX_WINSHIFT;
 2928                         }
 2929                         break;
 2930 
 2931                 case TCPOPT_TIMESTAMP:
 2932                         if (optlen != TCPOLEN_TIMESTAMP)
 2933                                 continue;
 2934                         oi->ts_present = 1;
 2935                         bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
 2936                         NTOHL(oi->ts_val);
 2937                         bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
 2938                         NTOHL(oi->ts_ecr);
 2939 
 2940                         if (!(th->th_flags & TH_SYN))
 2941                                 continue;
 2942                         if (TCPS_HAVERCVDSYN(tp->t_state))
 2943                                 continue;
 2944                         /*
 2945                          * A timestamp received in a SYN makes
 2946                          * it ok to send timestamp requests and replies.
 2947                          */
 2948                         tp->t_flags |= TF_RCVD_TSTMP;
 2949                         tp->ts_recent = oi->ts_val;
 2950                         tp->ts_recent_age = tcp_now;
 2951                         break;
 2952 
 2953                 case TCPOPT_SACK_PERMITTED:
 2954                         if (optlen != TCPOLEN_SACK_PERMITTED)
 2955                                 continue;
 2956                         if (!(th->th_flags & TH_SYN))
 2957                                 continue;
 2958                         if (TCPS_HAVERCVDSYN(tp->t_state))
 2959                                 continue;
 2960                         if (tcp_do_sack) {
 2961                                 tp->t_flags |= TF_SACK_PERMIT;
 2962                                 tp->t_flags |= TF_WILL_SACK;
 2963                         }
 2964                         break;
 2965 
 2966                 case TCPOPT_SACK:
 2967                         tcp_sack_option(tp, th, cp, optlen);
 2968                         break;
 2969 #ifdef TCP_SIGNATURE
 2970                 case TCPOPT_SIGNATURE:
 2971                         if (optlen != TCPOLEN_SIGNATURE)
 2972                                 continue;
 2973                         if (sigp && bcmp(sigp, cp + 2, TCP_SIGLEN))
 2974                                 return (-1);
 2975 
 2976                         sigp = sigbuf;
 2977                         memcpy(sigbuf, cp + 2, TCP_SIGLEN);
 2978                         tp->t_flags |= TF_SIGNATURE;
 2979                         break;
 2980 #endif
 2981                 }
 2982         }
 2983 
 2984 #ifdef TCP_SIGNATURE
 2985         if (tp->t_flags & TF_SIGNATURE) {
 2986 
 2987                 sav = tcp_signature_getsav(m, th);
 2988 
 2989                 if (sav == NULL && tp->t_state == TCPS_LISTEN)
 2990                         return (-1);
 2991         }
 2992 
 2993         if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
 2994                 if (sav == NULL)
 2995                         return (-1);
 2996 #ifdef FAST_IPSEC
 2997                 KEY_FREESAV(&sav);
 2998 #else
 2999                 key_freesav(sav);
 3000 #endif
 3001                 return (-1);
 3002         }
 3003 
 3004         if (sigp) {
 3005                 char sig[TCP_SIGLEN];
 3006 
 3007                 TCP_FIELDS_TO_NET(th);
 3008                 if (tcp_signature(m, th, toff, sav, sig) < 0) {
 3009                         TCP_FIELDS_TO_HOST(th);
 3010                         if (sav == NULL)
 3011                                 return (-1);
 3012 #ifdef FAST_IPSEC
 3013                         KEY_FREESAV(&sav);
 3014 #else
 3015                         key_freesav(sav);
 3016 #endif
 3017                         return (-1);
 3018                 }
 3019                 TCP_FIELDS_TO_HOST(th);
 3020 
 3021                 if (bcmp(sig, sigp, TCP_SIGLEN)) {
 3022                         tcpstat.tcps_badsig++;
 3023                         if (sav == NULL)
 3024                                 return (-1);
 3025 #ifdef FAST_IPSEC
 3026                         KEY_FREESAV(&sav);
 3027 #else
 3028                         key_freesav(sav);
 3029 #endif
 3030                         return (-1);
 3031                 } else
 3032                         tcpstat.tcps_goodsig++;
 3033 
 3034                 key_sa_recordxfer(sav, m);
 3035 #ifdef FAST_IPSEC
 3036                 KEY_FREESAV(&sav);
 3037 #else
 3038                 key_freesav(sav);
 3039 #endif
 3040         }
 3041 #endif
 3042 
 3043         return (0);
 3044 }
 3045 
 3046 /*
 3047  * Pull out of band byte out of a segment so
 3048  * it doesn't appear in the user's data queue.
 3049  * It is still reflected in the segment length for
 3050  * sequencing purposes.
 3051  */
 3052 void
 3053 tcp_pulloutofband(struct socket *so, struct tcphdr *th,
 3054     struct mbuf *m, int off)
 3055 {
 3056         int cnt = off + th->th_urp - 1;
 3057 
 3058         while (cnt >= 0) {
 3059                 if (m->m_len > cnt) {
 3060                         char *cp = mtod(m, caddr_t) + cnt;
 3061                         struct tcpcb *tp = sototcpcb(so);
 3062 
 3063                         tp->t_iobc = *cp;
 3064                         tp->t_oobflags |= TCPOOB_HAVEDATA;
 3065                         bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
 3066                         m->m_len--;
 3067                         return;
 3068                 }
 3069                 cnt -= m->m_len;
 3070                 m = m->m_next;
 3071                 if (m == 0)
 3072                         break;
 3073         }
 3074         panic("tcp_pulloutofband");
 3075 }
 3076 
 3077 /*
 3078  * Collect new round-trip time estimate
 3079  * and update averages and current timeout.
 3080  */
 3081 void
 3082 tcp_xmit_timer(struct tcpcb *tp, uint32_t rtt)
 3083 {
 3084         int32_t delta;
 3085 
 3086         tcpstat.tcps_rttupdated++;
 3087         if (tp->t_srtt != 0) {
 3088                 /*
 3089                  * srtt is stored as fixed point with 3 bits after the
 3090                  * binary point (i.e., scaled by 8).  The following magic
 3091                  * is equivalent to the smoothing algorithm in rfc793 with
 3092                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
 3093                  * point).  Adjust rtt to origin 0.
 3094                  */
 3095                 delta = (rtt << 2) - (tp->t_srtt >> TCP_RTT_SHIFT);
 3096                 if ((tp->t_srtt += delta) <= 0)
 3097                         tp->t_srtt = 1 << 2;
 3098                 /*
 3099                  * We accumulate a smoothed rtt variance (actually, a
 3100                  * smoothed mean difference), then set the retransmit
 3101                  * timer to smoothed rtt + 4 times the smoothed variance.
 3102                  * rttvar is stored as fixed point with 2 bits after the
 3103                  * binary point (scaled by 4).  The following is
 3104                  * equivalent to rfc793 smoothing with an alpha of .75
 3105                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
 3106                  * rfc793's wired-in beta.
 3107                  */
 3108                 if (delta < 0)
 3109                         delta = -delta;
 3110                 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
 3111                 if ((tp->t_rttvar += delta) <= 0)
 3112                         tp->t_rttvar = 1 << 2;
 3113         } else {
 3114                 /*
 3115                  * No rtt measurement yet - use the unsmoothed rtt.
 3116                  * Set the variance to half the rtt (so our first
 3117                  * retransmit happens at 3*rtt).
 3118                  */
 3119                 tp->t_srtt = rtt << (TCP_RTT_SHIFT + 2);
 3120                 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT + 2 - 1);
 3121         }
 3122         tp->t_rtttime = 0;
 3123         tp->t_rxtshift = 0;
 3124 
 3125         /*
 3126          * the retransmit should happen at rtt + 4 * rttvar.
 3127          * Because of the way we do the smoothing, srtt and rttvar
 3128          * will each average +1/2 tick of bias.  When we compute
 3129          * the retransmit timer, we want 1/2 tick of rounding and
 3130          * 1 extra tick because of +-1/2 tick uncertainty in the
 3131          * firing of the timer.  The bias will give us exactly the
 3132          * 1.5 tick we need.  But, because the bias is
 3133          * statistical, we have to test that we don't drop below
 3134          * the minimum feasible timer (which is 2 ticks).
 3135          */
 3136         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
 3137             max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
 3138 
 3139         /*
 3140          * We received an ack for a packet that wasn't retransmitted;
 3141          * it is probably safe to discard any error indications we've
 3142          * received recently.  This isn't quite right, but close enough
 3143          * for now (a route might have failed after we sent a segment,
 3144          * and the return path might not be symmetrical).
 3145          */
 3146         tp->t_softerror = 0;
 3147 }
 3148 
 3149 
 3150 /*
 3151  * TCP compressed state engine.  Currently used to hold compressed
 3152  * state for SYN_RECEIVED.
 3153  */
 3154 
 3155 u_long  syn_cache_count;
 3156 u_int32_t syn_hash1, syn_hash2;
 3157 
 3158 #define SYN_HASH(sa, sp, dp) \
 3159         ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
 3160                                      ((u_int32_t)(sp)))^syn_hash2)))
 3161 #ifndef INET6
 3162 #define SYN_HASHALL(hash, src, dst) \
 3163 do {                                                                    \
 3164         hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
 3165                 ((const struct sockaddr_in *)(src))->sin_port,          \
 3166                 ((const struct sockaddr_in *)(dst))->sin_port);         \
 3167 } while (/*CONSTCOND*/ 0)
 3168 #else
 3169 #define SYN_HASH6(sa, sp, dp) \
 3170         ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
 3171           (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
 3172          & 0x7fffffff)
 3173 
 3174 #define SYN_HASHALL(hash, src, dst) \
 3175 do {                                                                    \
 3176         switch ((src)->sa_family) {                                     \
 3177         case AF_INET:                                                   \
 3178                 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
 3179                         ((const struct sockaddr_in *)(src))->sin_port,  \
 3180                         ((const struct sockaddr_in *)(dst))->sin_port); \
 3181                 break;                                                  \
 3182         case AF_INET6:                                                  \
 3183                 hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
 3184                         ((const struct sockaddr_in6 *)(src))->sin6_port,        \
 3185                         ((const struct sockaddr_in6 *)(dst))->sin6_port);       \
 3186                 break;                                                  \
 3187         default:                                                        \
 3188                 hash = 0;                                               \
 3189         }                                                               \
 3190 } while (/*CONSTCOND*/0)
 3191 #endif /* INET6 */
 3192 
 3193 #define SYN_CACHE_RM(sc)                                                \
 3194 do {                                                                    \
 3195         TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket,     \
 3196             (sc), sc_bucketq);                                          \
 3197         (sc)->sc_tp = NULL;                                             \
 3198         LIST_REMOVE((sc), sc_tpq);                                      \
 3199         tcp_syn_cache[(sc)->sc_bucketidx].sch_length--;                 \
 3200         callout_stop(&(sc)->sc_timer);                                  \
 3201         syn_cache_count--;                                              \
 3202 } while (/*CONSTCOND*/0)
 3203 
 3204 #define SYN_CACHE_PUT(sc)                                               \
 3205 do {                                                                    \
 3206         if ((sc)->sc_ipopts)                                            \
 3207                 (void) m_free((sc)->sc_ipopts);                         \
 3208         if ((sc)->sc_route4.ro_rt != NULL)                              \
 3209                 RTFREE((sc)->sc_route4.ro_rt);                          \
 3210         if (callout_invoking(&(sc)->sc_timer))                          \
 3211                 (sc)->sc_flags |= SCF_DEAD;                             \
 3212         else                                                            \
 3213                 pool_put(&syn_cache_pool, (sc));                        \
 3214 } while (/*CONSTCOND*/0)
 3215 
 3216 POOL_INIT(syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0, "synpl", NULL);
 3217 
 3218 /*
 3219  * We don't estimate RTT with SYNs, so each packet starts with the default
 3220  * RTT and each timer step has a fixed timeout value.
 3221  */
 3222 #define SYN_CACHE_TIMER_ARM(sc)                                         \
 3223 do {                                                                    \
 3224         TCPT_RANGESET((sc)->sc_rxtcur,                                  \
 3225             TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
 3226             TCPTV_REXMTMAX);                                            \
 3227         callout_reset(&(sc)->sc_timer,                                  \
 3228             (sc)->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, (sc)); \
 3229 } while (/*CONSTCOND*/0)
 3230 
 3231 #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)
 3232 
 3233 void
 3234 syn_cache_init(void)
 3235 {
 3236         int i;
 3237 
 3238         /* Initialize the hash buckets. */
 3239         for (i = 0; i < tcp_syn_cache_size; i++)
 3240                 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
 3241 }
 3242 
 3243 void
 3244 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
 3245 {
 3246         struct syn_cache_head *scp;
 3247         struct syn_cache *sc2;
 3248         int s;
 3249 
 3250         /*
 3251          * If there are no entries in the hash table, reinitialize
 3252          * the hash secrets.
 3253          */
 3254         if (syn_cache_count == 0) {
 3255                 syn_hash1 = arc4random();
 3256                 syn_hash2 = arc4random();
 3257         }
 3258 
 3259         SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
 3260         sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
 3261         scp = &tcp_syn_cache[sc->sc_bucketidx];
 3262 
 3263         /*
 3264          * Make sure that we don't overflow the per-bucket
 3265          * limit or the total cache size limit.
 3266          */
 3267         s = splsoftnet();
 3268         if (scp->sch_length >= tcp_syn_bucket_limit) {
 3269                 tcpstat.tcps_sc_bucketoverflow++;
 3270                 /*
 3271                  * The bucket is full.  Toss the oldest element in the
 3272                  * bucket.  This will be the first entry in the bucket.
 3273                  */
 3274                 sc2 = TAILQ_FIRST(&scp->sch_bucket);
 3275 #ifdef DIAGNOSTIC
 3276                 /*
 3277                  * This should never happen; we should always find an
 3278                  * entry in our bucket.
 3279                  */
 3280                 if (sc2 == NULL)
 3281                         panic("syn_cache_insert: bucketoverflow: impossible");
 3282 #endif
 3283                 SYN_CACHE_RM(sc2);
 3284                 SYN_CACHE_PUT(sc2);     /* calls pool_put but see spl above */
 3285         } else if (syn_cache_count >= tcp_syn_cache_limit) {
 3286                 struct syn_cache_head *scp2, *sce;
 3287 
 3288                 tcpstat.tcps_sc_overflowed++;
 3289                 /*
 3290                  * The cache is full.  Toss the oldest entry in the
 3291                  * first non-empty bucket we can find.
 3292                  *
 3293                  * XXX We would really like to toss the oldest
 3294                  * entry in the cache, but we hope that this
 3295                  * condition doesn't happen very often.
 3296                  */
 3297                 scp2 = scp;
 3298                 if (TAILQ_EMPTY(&scp2->sch_bucket)) {
 3299                         sce = &tcp_syn_cache[tcp_syn_cache_size];
 3300                         for (++scp2; scp2 != scp; scp2++) {
 3301                                 if (scp2 >= sce)
 3302                                         scp2 = &tcp_syn_cache[0];
 3303                                 if (! TAILQ_EMPTY(&scp2->sch_bucket))
 3304                                         break;
 3305                         }
 3306 #ifdef DIAGNOSTIC
 3307                         /*
 3308                          * This should never happen; we should always find a
 3309                          * non-empty bucket.
 3310                          */
 3311                         if (scp2 == scp)
 3312                                 panic("syn_cache_insert: cacheoverflow: "
 3313                                     "impossible");
 3314 #endif
 3315                 }
 3316                 sc2 = TAILQ_FIRST(&scp2->sch_bucket);
 3317                 SYN_CACHE_RM(sc2);
 3318                 SYN_CACHE_PUT(sc2);     /* calls pool_put but see spl above */
 3319         }
 3320 
 3321         /*
 3322          * Initialize the entry's timer.
 3323          */
 3324         sc->sc_rxttot = 0;
 3325         sc->sc_rxtshift = 0;
 3326         SYN_CACHE_TIMER_ARM(sc);
 3327 
 3328         /* Link it from tcpcb entry */
 3329         LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
 3330 
 3331         /* Put it into the bucket. */
 3332         TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
 3333         scp->sch_length++;
 3334         syn_cache_count++;
 3335 
 3336         tcpstat.tcps_sc_added++;
 3337         splx(s);
 3338 }
 3339 
 3340 /*
 3341  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 3342  * If we have retransmitted an entry the maximum number of times, expire
 3343  * that entry.
 3344  */
 3345 void
 3346 syn_cache_timer(void *arg)
 3347 {
 3348         struct syn_cache *sc = arg;
 3349         int s;
 3350 
 3351         s = splsoftnet();
 3352         callout_ack(&sc->sc_timer);
 3353 
 3354         if (__predict_false(sc->sc_flags & SCF_DEAD)) {
 3355                 tcpstat.tcps_sc_delayed_free++;
 3356                 pool_put(&syn_cache_pool, sc);
 3357                 splx(s);
 3358                 return;
 3359         }
 3360 
 3361         if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
 3362                 /* Drop it -- too many retransmissions. */
 3363                 goto dropit;
 3364         }
 3365 
 3366         /*
 3367          * Compute the total amount of time this entry has
 3368          * been on a queue.  If this entry has been on longer
 3369          * than the keep alive timer would allow, expire it.
 3370          */
 3371         sc->sc_rxttot += sc->sc_rxtcur;
 3372         if (sc->sc_rxttot >= TCPTV_KEEP_INIT)
 3373                 goto dropit;
 3374 
 3375         tcpstat.tcps_sc_retransmitted++;
 3376         (void) syn_cache_respond(sc, NULL);
 3377 
 3378         /* Advance the timer back-off. */
 3379         sc->sc_rxtshift++;
 3380         SYN_CACHE_TIMER_ARM(sc);
 3381 
 3382         splx(s);
 3383         return;
 3384 
 3385  dropit:
 3386         tcpstat.tcps_sc_timed_out++;
 3387         SYN_CACHE_RM(sc);
 3388         SYN_CACHE_PUT(sc);      /* calls pool_put but see spl above */
 3389         splx(s);
 3390 }
 3391 
 3392 /*
 3393  * Remove syn cache created by the specified tcb entry,
 3394  * because this does not make sense to keep them
 3395  * (if there's no tcb entry, syn cache entry will never be used)
 3396  */
 3397 void
 3398 syn_cache_cleanup(struct tcpcb *tp)
 3399 {
 3400         struct syn_cache *sc, *nsc;
 3401         int s;
 3402 
 3403         s = splsoftnet();
 3404 
 3405         for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
 3406                 nsc = LIST_NEXT(sc, sc_tpq);
 3407 
 3408 #ifdef DIAGNOSTIC
 3409                 if (sc->sc_tp != tp)
 3410                         panic("invalid sc_tp in syn_cache_cleanup");
 3411 #endif
 3412                 SYN_CACHE_RM(sc);
 3413                 SYN_CACHE_PUT(sc);      /* calls pool_put but see spl above */
 3414         }
 3415         /* just for safety */
 3416         LIST_INIT(&tp->t_sc);
 3417 
 3418         splx(s);
 3419 }
 3420 
 3421 /*
 3422  * Find an entry in the syn cache.
 3423  */
 3424 struct syn_cache *
 3425 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
 3426     struct syn_cache_head **headp)
 3427 {
 3428         struct syn_cache *sc;
 3429         struct syn_cache_head *scp;
 3430         u_int32_t hash;
 3431         int s;
 3432 
 3433         SYN_HASHALL(hash, src, dst);
 3434 
 3435         scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
 3436         *headp = scp;
 3437         s = splsoftnet();
 3438         for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
 3439              sc = TAILQ_NEXT(sc, sc_bucketq)) {
 3440                 if (sc->sc_hash != hash)
 3441                         continue;
 3442                 if (!bcmp(&sc->sc_src, src, src->sa_len) &&
 3443                     !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
 3444                         splx(s);
 3445                         return (sc);
 3446                 }
 3447         }
 3448         splx(s);
 3449         return (NULL);
 3450 }
 3451 
 3452 /*
 3453  * This function gets called when we receive an ACK for a
 3454  * socket in the LISTEN state.  We look up the connection
 3455  * in the syn cache, and if it's there, we pull it out of
 3456  * the cache and turn it into a full-blown connection in
 3457  * the SYN-RECEIVED state.
 3458  *
 3459  * The return values may not be immediately obvious, and their effects
 3460  * can be subtle, so here they are:
 3461  *
 3462  *      NULL    SYN was not found in cache; caller should drop the
 3463  *              packet and send an RST.
 3464  *
 3465  *      -1      We were unable to create the new connection, and are
 3466  *              aborting it.  An ACK,RST is being sent to the peer
 3467  *              (unless we got screwy sequence numbers; see below),
 3468  *              because the 3-way handshake has been completed.  Caller
 3469  *              should not free the mbuf, since we may be using it.  If
 3470  *              we are not, we will free it.
 3471  *
 3472  *      Otherwise, the return value is a pointer to the new socket
 3473  *      associated with the connection.
 3474  */
 3475 struct socket *
 3476 syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
 3477     struct tcphdr *th, unsigned int hlen, unsigned int tlen,
 3478     struct socket *so, struct mbuf *m)
 3479 {
 3480         struct syn_cache *sc;
 3481         struct syn_cache_head *scp;
 3482         struct inpcb *inp = NULL;
 3483 #ifdef INET6
 3484         struct in6pcb *in6p = NULL;
 3485 #endif
 3486         struct tcpcb *tp = 0;
 3487         struct mbuf *am;
 3488         int s;
 3489         struct socket *oso;
 3490 
 3491         s = splsoftnet();
 3492         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
 3493                 splx(s);
 3494                 return (NULL);
 3495         }
 3496 
 3497         /*
 3498          * Verify the sequence and ack numbers.  Try getting the correct
 3499          * response again.
 3500          */
 3501         if ((th->th_ack != sc->sc_iss + 1) ||
 3502             SEQ_LEQ(th->th_seq, sc->sc_irs) ||
 3503             SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
 3504                 (void) syn_cache_respond(sc, m);
 3505                 splx(s);
 3506                 return ((struct socket *)(-1));
 3507         }
 3508 
 3509         /* Remove this cache entry */
 3510         SYN_CACHE_RM(sc);
 3511         splx(s);
 3512 
 3513         /*
 3514          * Ok, create the full blown connection, and set things up
 3515          * as they would have been set up if we had created the
 3516          * connection when the SYN arrived.  If we can't create
 3517          * the connection, abort it.
 3518          */
 3519         /*
 3520          * inp still has the OLD in_pcb stuff, set the
 3521          * v6-related flags on the new guy, too.   This is
 3522          * done particularly for the case where an AF_INET6
 3523          * socket is bound only to a port, and a v4 connection
 3524          * comes in on that port.
 3525          * we also copy the flowinfo from the original pcb
 3526          * to the new one.
 3527          */
 3528         oso = so;
 3529         so = sonewconn(so, SS_ISCONNECTED);
 3530         if (so == NULL)
 3531                 goto resetandabort;
 3532 
 3533         switch (so->so_proto->pr_domain->dom_family) {
 3534 #ifdef INET
 3535         case AF_INET:
 3536                 inp = sotoinpcb(so);
 3537                 break;
 3538 #endif
 3539 #ifdef INET6
 3540         case AF_INET6:
 3541                 in6p = sotoin6pcb(so);
 3542                 break;
 3543 #endif
 3544         }
 3545         switch (src->sa_family) {
 3546 #ifdef INET
 3547         case AF_INET:
 3548                 if (inp) {
 3549                         inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
 3550                         inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
 3551                         inp->inp_options = ip_srcroute();
 3552                         in_pcbstate(inp, INP_BOUND);
 3553                         if (inp->inp_options == NULL) {
 3554                                 inp->inp_options = sc->sc_ipopts;
 3555                                 sc->sc_ipopts = NULL;
 3556                         }
 3557                 }
 3558 #ifdef INET6
 3559                 else if (in6p) {
 3560                         /* IPv4 packet to AF_INET6 socket */
 3561                         bzero(&in6p->in6p_laddr, sizeof(in6p->in6p_laddr));
 3562                         in6p->in6p_laddr.s6_addr16[5] = htons(0xffff);
 3563                         bcopy(&((struct sockaddr_in *)dst)->sin_addr,
 3564                                 &in6p->in6p_laddr.s6_addr32[3],
 3565                                 sizeof(((struct sockaddr_in *)dst)->sin_addr));
 3566                         in6p->in6p_lport = ((struct sockaddr_in *)dst)->sin_port;
 3567                         in6totcpcb(in6p)->t_family = AF_INET;
 3568                         if (sotoin6pcb(oso)->in6p_flags & IN6P_IPV6_V6ONLY)
 3569                                 in6p->in6p_flags |= IN6P_IPV6_V6ONLY;
 3570                         else
 3571                                 in6p->in6p_flags &= ~IN6P_IPV6_V6ONLY;
 3572                         in6_pcbstate(in6p, IN6P_BOUND);
 3573                 }
 3574 #endif
 3575                 break;
 3576 #endif
 3577 #ifdef INET6
 3578         case AF_INET6:
 3579                 if (in6p) {
 3580                         in6p->in6p_laddr = ((struct sockaddr_in6 *)dst)->sin6_addr;
 3581                         in6p->in6p_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
 3582                         in6_pcbstate(in6p, IN6P_BOUND);
 3583                 }
 3584                 break;
 3585 #endif
 3586         }
 3587 #ifdef INET6
 3588         if (in6p && in6totcpcb(in6p)->t_family == AF_INET6 && sotoinpcb(oso)) {
 3589                 struct in6pcb *oin6p = sotoin6pcb(oso);
 3590                 /* inherit socket options from the listening socket */
 3591                 in6p->in6p_flags |= (oin6p->in6p_flags & IN6P_CONTROLOPTS);
 3592                 if (in6p->in6p_flags & IN6P_CONTROLOPTS) {
 3593                         m_freem(in6p->in6p_options);
 3594                         in6p->in6p_options = 0;
 3595                 }
 3596                 ip6_savecontrol(in6p, &in6p->in6p_options,
 3597                         mtod(m, struct ip6_hdr *), m);
 3598         }
 3599 #endif
 3600 
 3601 #if defined(IPSEC) || defined(FAST_IPSEC)
 3602         /*
 3603          * we make a copy of policy, instead of sharing the policy,
 3604          * for better behavior in terms of SA lookup and dead SA removal.
 3605          */
 3606         if (inp) {
 3607                 /* copy old policy into new socket's */
 3608                 if (ipsec_copy_pcbpolicy(sotoinpcb(oso)->inp_sp, inp->inp_sp))
 3609                         printf("tcp_input: could not copy policy\n");
 3610         }
 3611 #ifdef INET6
 3612         else if (in6p) {
 3613                 /* copy old policy into new socket's */
 3614                 if (ipsec_copy_pcbpolicy(sotoin6pcb(oso)->in6p_sp,
 3615                     in6p->in6p_sp))
 3616                         printf("tcp_input: could not copy policy\n");
 3617         }
 3618 #endif
 3619 #endif
 3620 
 3621         /*
 3622          * Give the new socket our cached route reference.
 3623          */
 3624         if (inp)
 3625                 inp->inp_route = sc->sc_route4;         /* struct assignment */
 3626 #ifdef INET6
 3627         else
 3628                 in6p->in6p_route = sc->sc_route6;
 3629 #endif
 3630         sc->sc_route4.ro_rt = NULL;
 3631 
 3632         am = m_get(M_DONTWAIT, MT_SONAME);      /* XXX */
 3633         if (am == NULL)
 3634                 goto resetandabort;
 3635         MCLAIM(am, &tcp_mowner);
 3636         am->m_len = src->sa_len;
 3637         bcopy(src, mtod(am, caddr_t), src->sa_len);
 3638         if (inp) {
 3639                 if (in_pcbconnect(inp, am, NULL)) {
 3640                         (void) m_free(am);
 3641                         goto resetandabort;
 3642                 }
 3643         }
 3644 #ifdef INET6
 3645         else if (in6p) {
 3646                 if (src->sa_family == AF_INET) {
 3647                         /* IPv4 packet to AF_INET6 socket */
 3648                         struct sockaddr_in6 *sin6;
 3649                         sin6 = mtod(am, struct sockaddr_in6 *);
 3650                         am->m_len = sizeof(*sin6);
 3651                         bzero(sin6, sizeof(*sin6));
 3652                         sin6->sin6_family = AF_INET6;
 3653                         sin6->sin6_len = sizeof(*sin6);
 3654                         sin6->sin6_port = ((struct sockaddr_in *)src)->sin_port;
 3655                         sin6->sin6_addr.s6_addr16[5] = htons(0xffff);
 3656                         bcopy(&((struct sockaddr_in *)src)->sin_addr,
 3657                                 &sin6->sin6_addr.s6_addr32[3],
 3658                                 sizeof(sin6->sin6_addr.s6_addr32[3]));
 3659                 }
 3660                 if (in6_pcbconnect(in6p, am, NULL)) {
 3661                         (void) m_free(am);
 3662                         goto resetandabort;
 3663                 }
 3664         }
 3665 #endif
 3666         else {
 3667                 (void) m_free(am);
 3668                 goto resetandabort;
 3669         }
 3670         (void) m_free(am);
 3671 
 3672         if (inp)
 3673                 tp = intotcpcb(inp);
 3674 #ifdef INET6
 3675         else if (in6p)
 3676                 tp = in6totcpcb(in6p);
 3677 #endif
 3678         else
 3679                 tp = NULL;
 3680         tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
 3681         if (sc->sc_request_r_scale != 15) {
 3682                 tp->requested_s_scale = sc->sc_requested_s_scale;
 3683                 tp->request_r_scale = sc->sc_request_r_scale;
 3684                 tp->snd_scale = sc->sc_requested_s_scale;
 3685                 tp->rcv_scale = sc->sc_request_r_scale;
 3686                 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
 3687         }
 3688         if (sc->sc_flags & SCF_TIMESTAMP)
 3689                 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
 3690         tp->ts_timebase = sc->sc_timebase;
 3691 
 3692         tp->t_template = tcp_template(tp);
 3693         if (tp->t_template == 0) {
 3694                 tp = tcp_drop(tp, ENOBUFS);     /* destroys socket */
 3695                 so = NULL;
 3696                 m_freem(m);
 3697                 goto abort;
 3698         }
 3699 
 3700         tp->iss = sc->sc_iss;
 3701         tp->irs = sc->sc_irs;
 3702         tcp_sendseqinit(tp);
 3703         tcp_rcvseqinit(tp);
 3704         tp->t_state = TCPS_SYN_RECEIVED;
 3705         TCP_TIMER_ARM(tp, TCPT_KEEP, TCPTV_KEEP_INIT);
 3706         tcpstat.tcps_accepts++;
 3707 
 3708         if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
 3709                 tp->t_flags |= TF_WILL_SACK;
 3710 
 3711         if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
 3712                 tp->t_flags |= TF_ECN_PERMIT;
 3713 
 3714 #ifdef TCP_SIGNATURE
 3715         if (sc->sc_flags & SCF_SIGNATURE)
 3716                 tp->t_flags |= TF_SIGNATURE;
 3717 #endif
 3718 
 3719         /* Initialize tp->t_ourmss before we deal with the peer's! */
 3720         tp->t_ourmss = sc->sc_ourmaxseg;
 3721         tcp_mss_from_peer(tp, sc->sc_peermaxseg);
 3722 
 3723         /*
 3724          * Initialize the initial congestion window.  If we
 3725          * had to retransmit the SYN,ACK, we must initialize cwnd
 3726          * to 1 segment (i.e. the Loss Window).
 3727          */
 3728         if (sc->sc_rxtshift)
 3729                 tp->snd_cwnd = tp->t_peermss;
 3730         else {
 3731                 int ss = tcp_init_win;
 3732 #ifdef INET
 3733                 if (inp != NULL && in_localaddr(inp->inp_faddr))
 3734                         ss = tcp_init_win_local;
 3735 #endif
 3736 #ifdef INET6
 3737                 if (in6p != NULL && in6_localaddr(&in6p->in6p_faddr))
 3738                         ss = tcp_init_win_local;
 3739 #endif
 3740                 tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
 3741         }
 3742 
 3743         tcp_rmx_rtt(tp);
 3744         tp->snd_wl1 = sc->sc_irs;
 3745         tp->rcv_up = sc->sc_irs + 1;
 3746 
 3747         /*
 3748          * This is what would have happened in tcp_output() when
 3749          * the SYN,ACK was sent.
 3750          */
 3751         tp->snd_up = tp->snd_una;
 3752         tp->snd_max = tp->snd_nxt = tp->iss+1;
 3753         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 3754         if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
 3755                 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
 3756         tp->last_ack_sent = tp->rcv_nxt;
 3757         tp->t_partialacks = -1;
 3758         tp->t_dupacks = 0;
 3759 
 3760         tcpstat.tcps_sc_completed++;
 3761         s = splsoftnet();
 3762         SYN_CACHE_PUT(sc);
 3763         splx(s);
 3764         return (so);
 3765 
 3766 resetandabort:
 3767         (void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
 3768 abort:
 3769         if (so != NULL)
 3770                 (void) soabort(so);
 3771         s = splsoftnet();
 3772         SYN_CACHE_PUT(sc);
 3773         splx(s);
 3774         tcpstat.tcps_sc_aborted++;
 3775         return ((struct socket *)(-1));
 3776 }
 3777 
 3778 /*
 3779  * This function is called when we get a RST for a
 3780  * non-existent connection, so that we can see if the
 3781  * connection is in the syn cache.  If it is, zap it.
 3782  */
 3783 
 3784 void
 3785 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
 3786 {
 3787         struct syn_cache *sc;
 3788         struct syn_cache_head *scp;
 3789         int s = splsoftnet();
 3790 
 3791         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
 3792                 splx(s);
 3793                 return;
 3794         }
 3795         if (SEQ_LT(th->th_seq, sc->sc_irs) ||
 3796             SEQ_GT(th->th_seq, sc->sc_irs+1)) {
 3797                 splx(s);
 3798                 return;
 3799         }
 3800         SYN_CACHE_RM(sc);
 3801         tcpstat.tcps_sc_reset++;
 3802         SYN_CACHE_PUT(sc);      /* calls pool_put but see spl above */
 3803         splx(s);
 3804 }
 3805 
 3806 void
 3807 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
 3808     struct tcphdr *th)
 3809 {
 3810         struct syn_cache *sc;
 3811         struct syn_cache_head *scp;
 3812         int s;
 3813 
 3814         s = splsoftnet();
 3815         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
 3816                 splx(s);
 3817                 return;
 3818         }
 3819         /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
 3820         if (ntohl (th->th_seq) != sc->sc_iss) {
 3821                 splx(s);
 3822                 return;
 3823         }
 3824 
 3825         /*
 3826          * If we've retransmitted 3 times and this is our second error,
 3827          * we remove the entry.  Otherwise, we allow it to continue on.
 3828          * This prevents us from incorrectly nuking an entry during a
 3829          * spurious network outage.
 3830          *
 3831          * See tcp_notify().
 3832          */
 3833         if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
 3834                 sc->sc_flags |= SCF_UNREACH;
 3835                 splx(s);
 3836                 return;
 3837         }
 3838 
 3839         SYN_CACHE_RM(sc);
 3840         tcpstat.tcps_sc_unreach++;
 3841         SYN_CACHE_PUT(sc);      /* calls pool_put but see spl above */
 3842         splx(s);
 3843 }
 3844 
 3845 /*
 3846  * Given a LISTEN socket and an inbound SYN request, add
 3847  * this to the syn cache, and send back a segment:
 3848  *      <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 3849  * to the source.
 3850  *
 3851  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 3852  * Doing so would require that we hold onto the data and deliver it
 3853  * to the application.  However, if we are the target of a SYN-flood
 3854  * DoS attack, an attacker could send data which would eventually
 3855  * consume all available buffer space if it were ACKed.  By not ACKing
 3856  * the data, we avoid this DoS scenario.
 3857  */
 3858 
 3859 int
 3860 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
 3861     unsigned int hlen, struct socket *so, struct mbuf *m, u_char *optp,
 3862     int optlen, struct tcp_opt_info *oi)
 3863 {
 3864         struct tcpcb tb, *tp;
 3865         long win;
 3866         struct syn_cache *sc;
 3867         struct syn_cache_head *scp;
 3868         struct mbuf *ipopts;
 3869         struct tcp_opt_info opti;
 3870         int s;
 3871 
 3872         tp = sototcpcb(so);
 3873 
 3874         bzero(&opti, sizeof(opti));
 3875 
 3876         /*
 3877          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
 3878          *
 3879          * Note this check is performed in tcp_input() very early on.
 3880          */
 3881 
 3882         /*
 3883          * Initialize some local state.
 3884          */
 3885         win = sbspace(&so->so_rcv);
 3886         if (win > TCP_MAXWIN)
 3887                 win = TCP_MAXWIN;
 3888 
 3889         switch (src->sa_family) {
 3890 #ifdef INET
 3891         case AF_INET:
 3892                 /*
 3893                  * Remember the IP options, if any.
 3894                  */
 3895                 ipopts = ip_srcroute();
 3896                 break;
 3897 #endif
 3898         default:
 3899                 ipopts = NULL;
 3900         }
 3901 
 3902 #ifdef TCP_SIGNATURE
 3903         if (optp || (tp->t_flags & TF_SIGNATURE))
 3904 #else
 3905         if (optp)
 3906 #endif
 3907         {
 3908                 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
 3909 #ifdef TCP_SIGNATURE
 3910                 tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
 3911 #endif
 3912                 tb.t_state = TCPS_LISTEN;
 3913                 if (tcp_dooptions(&tb, optp, optlen, th, m, m->m_pkthdr.len -
 3914                     sizeof(struct tcphdr) - optlen - hlen, oi) < 0)
 3915                         return (0);
 3916         } else
 3917                 tb.t_flags = 0;
 3918 
 3919         /*
 3920          * See if we already have an entry for this connection.
 3921          * If we do, resend the SYN,ACK.  We do not count this
 3922          * as a retransmission (XXX though maybe we should).
 3923          */
 3924         if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
 3925                 tcpstat.tcps_sc_dupesyn++;
 3926                 if (ipopts) {
 3927                         /*
 3928                          * If we were remembering a previous source route,
 3929                          * forget it and use the new one we've been given.
 3930                          */
 3931                         if (sc->sc_ipopts)
 3932                                 (void) m_free(sc->sc_ipopts);
 3933                         sc->sc_ipopts = ipopts;
 3934                 }
 3935                 sc->sc_timestamp = tb.ts_recent;
 3936                 if (syn_cache_respond(sc, m) == 0) {
 3937                         tcpstat.tcps_sndacks++;
 3938                         tcpstat.tcps_sndtotal++;
 3939                 }
 3940                 return (1);
 3941         }
 3942 
 3943         s = splsoftnet();
 3944         sc = pool_get(&syn_cache_pool, PR_NOWAIT);
 3945         splx(s);
 3946         if (sc == NULL) {
 3947                 if (ipopts)
 3948                         (void) m_free(ipopts);
 3949                 return (0);
 3950         }
 3951 
 3952         /*
 3953          * Fill in the cache, and put the necessary IP and TCP
 3954          * options into the reply.
 3955          */
 3956         bzero(sc, sizeof(struct syn_cache));
 3957         callout_init(&sc->sc_timer);
 3958         bcopy(src, &sc->sc_src, src->sa_len);
 3959         bcopy(dst, &sc->sc_dst, dst->sa_len);
 3960         sc->sc_flags = 0;
 3961         sc->sc_ipopts = ipopts;
 3962         sc->sc_irs = th->th_seq;
 3963         switch (src->sa_family) {
 3964 #ifdef INET
 3965         case AF_INET:
 3966             {
 3967                 struct sockaddr_in *srcin = (void *) src;
 3968                 struct sockaddr_in *dstin = (void *) dst;
 3969 
 3970                 sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
 3971                     &srcin->sin_addr, dstin->sin_port,
 3972                     srcin->sin_port, sizeof(dstin->sin_addr), 0);
 3973                 break;
 3974             }
 3975 #endif /* INET */
 3976 #ifdef INET6
 3977         case AF_INET6:
 3978             {
 3979                 struct sockaddr_in6 *srcin6 = (void *) src;
 3980                 struct sockaddr_in6 *dstin6 = (void *) dst;
 3981 
 3982                 sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
 3983                     &srcin6->sin6_addr, dstin6->sin6_port,
 3984                     srcin6->sin6_port, sizeof(dstin6->sin6_addr), 0);
 3985                 break;
 3986             }
 3987 #endif /* INET6 */
 3988         }
 3989         sc->sc_peermaxseg = oi->maxseg;
 3990         sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
 3991                                                 m->m_pkthdr.rcvif : NULL,
 3992                                                 sc->sc_src.sa.sa_family);
 3993         sc->sc_win = win;
 3994         sc->sc_timebase = tcp_now;      /* see tcp_newtcpcb() */
 3995         sc->sc_timestamp = tb.ts_recent;
 3996         if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
 3997             (TF_REQ_TSTMP|TF_RCVD_TSTMP))
 3998                 sc->sc_flags |= SCF_TIMESTAMP;
 3999         if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 4000             (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 4001                 sc->sc_requested_s_scale = tb.requested_s_scale;
 4002                 sc->sc_request_r_scale = 0;
 4003                 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
 4004                     TCP_MAXWIN << sc->sc_request_r_scale <
 4005                     so->so_rcv.sb_hiwat)
 4006                         sc->sc_request_r_scale++;
 4007         } else {
 4008                 sc->sc_requested_s_scale = 15;
 4009                 sc->sc_request_r_scale = 15;
 4010         }
 4011         if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
 4012                 sc->sc_flags |= SCF_SACK_PERMIT;
 4013 
 4014         /*
 4015          * ECN setup packet recieved.
 4016          */
 4017         if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
 4018                 sc->sc_flags |= SCF_ECN_PERMIT;
 4019 
 4020 #ifdef TCP_SIGNATURE
 4021         if (tb.t_flags & TF_SIGNATURE)
 4022                 sc->sc_flags |= SCF_SIGNATURE;
 4023 #endif
 4024         sc->sc_tp = tp;
 4025         if (syn_cache_respond(sc, m) == 0) {
 4026                 syn_cache_insert(sc, tp);
 4027                 tcpstat.tcps_sndacks++;
 4028                 tcpstat.tcps_sndtotal++;
 4029         } else {
 4030                 s = splsoftnet();
 4031                 SYN_CACHE_PUT(sc);
 4032                 splx(s);
 4033                 tcpstat.tcps_sc_dropped++;
 4034         }
 4035         return (1);
 4036 }
 4037 
 4038 int
 4039 syn_cache_respond(struct syn_cache *sc, struct mbuf *m)
 4040 {
 4041         struct route *ro;
 4042         u_int8_t *optp;
 4043         int optlen, error;
 4044         u_int16_t tlen;
 4045         struct ip *ip = NULL;
 4046 #ifdef INET6
 4047         struct ip6_hdr *ip6 = NULL;
 4048 #endif
 4049         struct tcpcb *tp = NULL;
 4050         struct tcphdr *th;
 4051         u_int hlen;
 4052         struct socket *so;
 4053 
 4054         switch (sc->sc_src.sa.sa_family) {
 4055         case AF_INET:
 4056                 hlen = sizeof(struct ip);
 4057                 ro = &sc->sc_route4;
 4058                 break;
 4059 #ifdef INET6
 4060         case AF_INET6:
 4061                 hlen = sizeof(struct ip6_hdr);
 4062                 ro = (struct route *)&sc->sc_route6;
 4063                 break;
 4064 #endif
 4065         default:
 4066                 if (m)
 4067                         m_freem(m);
 4068                 return (EAFNOSUPPORT);
 4069         }
 4070 
 4071         /* Compute the size of the TCP options. */
 4072         optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
 4073             ((sc->sc_flags & SCF_SACK_PERMIT) ? (TCPOLEN_SACK_PERMITTED + 2) : 0) +
 4074 #ifdef TCP_SIGNATURE
 4075             ((sc->sc_flags & SCF_SIGNATURE) ? (TCPOLEN_SIGNATURE + 2) : 0) +
 4076 #endif
 4077             ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
 4078 
 4079         tlen = hlen + sizeof(struct tcphdr) + optlen;
 4080 
 4081         /*
 4082          * Create the IP+TCP header from scratch.
 4083          */
 4084         if (m)
 4085                 m_freem(m);
 4086 #ifdef DIAGNOSTIC
 4087         if (max_linkhdr + tlen > MCLBYTES)
 4088                 return (ENOBUFS);
 4089 #endif
 4090         MGETHDR(m, M_DONTWAIT, MT_DATA);
 4091         if (m && tlen > MHLEN) {
 4092                 MCLGET(m, M_DONTWAIT);
 4093                 if ((m->m_flags & M_EXT) == 0) {
 4094                         m_freem(m);
 4095                         m = NULL;
 4096                 }
 4097         }
 4098         if (m == NULL)
 4099                 return (ENOBUFS);
 4100         MCLAIM(m, &tcp_tx_mowner);
 4101 
 4102         /* Fixup the mbuf. */
 4103         m->m_data += max_linkhdr;
 4104         m->m_len = m->m_pkthdr.len = tlen;
 4105         if (sc->sc_tp) {
 4106                 tp = sc->sc_tp;
 4107                 if (tp->t_inpcb)
 4108                         so = tp->t_inpcb->inp_socket;
 4109 #ifdef INET6
 4110                 else if (tp->t_in6pcb)
 4111                         so = tp->t_in6pcb->in6p_socket;
 4112 #endif
 4113                 else
 4114                         so = NULL;
 4115         } else
 4116                 so = NULL;
 4117         m->m_pkthdr.rcvif = NULL;
 4118         memset(mtod(m, u_char *), 0, tlen);
 4119 
 4120         switch (sc->sc_src.sa.sa_family) {
 4121         case AF_INET:
 4122                 ip = mtod(m, struct ip *);
 4123                 ip->ip_v = 4;
 4124                 ip->ip_dst = sc->sc_src.sin.sin_addr;
 4125                 ip->ip_src = sc->sc_dst.sin.sin_addr;
 4126                 ip->ip_p = IPPROTO_TCP;
 4127                 th = (struct tcphdr *)(ip + 1);
 4128                 th->th_dport = sc->sc_src.sin.sin_port;
 4129                 th->th_sport = sc->sc_dst.sin.sin_port;
 4130                 break;
 4131 #ifdef INET6
 4132         case AF_INET6:
 4133                 ip6 = mtod(m, struct ip6_hdr *);
 4134                 ip6->ip6_vfc = IPV6_VERSION;
 4135                 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
 4136                 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
 4137                 ip6->ip6_nxt = IPPROTO_TCP;
 4138                 /* ip6_plen will be updated in ip6_output() */
 4139                 th = (struct tcphdr *)(ip6 + 1);
 4140                 th->th_dport = sc->sc_src.sin6.sin6_port;
 4141                 th->th_sport = sc->sc_dst.sin6.sin6_port;
 4142                 break;
 4143 #endif
 4144         default:
 4145                 th = NULL;
 4146         }
 4147 
 4148         th->th_seq = htonl(sc->sc_iss);
 4149         th->th_ack = htonl(sc->sc_irs + 1);
 4150         th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 4151         th->th_flags = TH_SYN|TH_ACK;
 4152         th->th_win = htons(sc->sc_win);
 4153         /* th_sum already 0 */
 4154         /* th_urp already 0 */
 4155 
 4156         /* Tack on the TCP options. */
 4157         optp = (u_int8_t *)(th + 1);
 4158         *optp++ = TCPOPT_MAXSEG;
 4159         *optp++ = 4;
 4160         *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
 4161         *optp++ = sc->sc_ourmaxseg & 0xff;
 4162 
 4163         if (sc->sc_request_r_scale != 15) {
 4164                 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
 4165                     TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
 4166                     sc->sc_request_r_scale);
 4167                 optp += 4;
 4168         }
 4169 
 4170         if (sc->sc_flags & SCF_TIMESTAMP) {
 4171                 u_int32_t *lp = (u_int32_t *)(optp);
 4172                 /* Form timestamp option as shown in appendix A of RFC 1323. */
 4173                 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
 4174                 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
 4175                 *lp   = htonl(sc->sc_timestamp);
 4176                 optp += TCPOLEN_TSTAMP_APPA;
 4177         }
 4178 
 4179         if (sc->sc_flags & SCF_SACK_PERMIT) {
 4180                 u_int8_t *p = optp;
 4181 
 4182                 /* Let the peer know that we will SACK. */
 4183                 p[0] = TCPOPT_SACK_PERMITTED;
 4184                 p[1] = 2;
 4185                 p[2] = TCPOPT_NOP;
 4186                 p[3] = TCPOPT_NOP;
 4187                 optp += 4;
 4188         }
 4189 
 4190         /*
 4191          * Send ECN SYN-ACK setup packet.
 4192          * Routes can be asymetric, so, even if we receive a packet
 4193          * with ECE and CWR set, we must not assume no one will block
 4194          * the ECE packet we are about to send.
 4195          */
 4196         if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
 4197             SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
 4198                 th->th_flags |= TH_ECE;
 4199                 tcpstat.tcps_ecn_shs++;
 4200 
 4201                 /*
 4202                  * draft-ietf-tcpm-ecnsyn-00.txt
 4203                  *
 4204                  * "[...] a TCP node MAY respond to an ECN-setup
 4205                  * SYN packet by setting ECT in the responding
 4206                  * ECN-setup SYN/ACK packet, indicating to routers 
 4207                  * that the SYN/ACK packet is ECN-Capable.
 4208                  * This allows a congested router along the path
 4209                  * to mark the packet instead of dropping the
 4210                  * packet as an indication of congestion."
 4211                  *
 4212                  * "[...] There can be a great benefit in setting
 4213                  * an ECN-capable codepoint in SYN/ACK packets [...]
 4214                  * Congestion is  most likely to occur in
 4215                  * the server-to-client direction.  As a result,
 4216                  * setting an ECN-capable codepoint in SYN/ACK
 4217                  * packets can reduce the occurence of three-second
 4218                  * retransmit timeouts resulting from the drop
 4219                  * of SYN/ACK packets."
 4220                  *
 4221                  * Page 4 and 6, January 2006.
 4222                  */
 4223 
 4224                 switch (sc->sc_src.sa.sa_family) {
 4225 #ifdef INET
 4226                 case AF_INET:
 4227                         ip->ip_tos |= IPTOS_ECN_ECT0;
 4228                         break;
 4229 #endif
 4230 #ifdef INET6
 4231                 case AF_INET6:
 4232                         ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
 4233                         break;
 4234 #endif
 4235                 }
 4236                 tcpstat.tcps_ecn_ect++;
 4237         }
 4238 
 4239 #ifdef TCP_SIGNATURE
 4240         if (sc->sc_flags & SCF_SIGNATURE) {
 4241                 struct secasvar *sav;
 4242                 u_int8_t *sigp;
 4243 
 4244                 sav = tcp_signature_getsav(m, th);
 4245 
 4246                 if (sav == NULL) {
 4247                         if (m)
 4248                                 m_freem(m);
 4249                         return (EPERM);
 4250                 }
 4251 
 4252                 *optp++ = TCPOPT_SIGNATURE;
 4253                 *optp++ = TCPOLEN_SIGNATURE;
 4254                 sigp = optp;
 4255                 bzero(optp, TCP_SIGLEN);
 4256                 optp += TCP_SIGLEN;
 4257                 *optp++ = TCPOPT_NOP;
 4258                 *optp++ = TCPOPT_EOL;
 4259 
 4260                 (void)tcp_signature(m, th, hlen, sav, sigp);
 4261 
 4262                 key_sa_recordxfer(sav, m);
 4263 #ifdef FAST_IPSEC
 4264                 KEY_FREESAV(&sav);
 4265 #else
 4266                 key_freesav(sav);
 4267 #endif
 4268         }
 4269 #endif
 4270 
 4271         /* Compute the packet's checksum. */
 4272         switch (sc->sc_src.sa.sa_family) {
 4273         case AF_INET:
 4274                 ip->ip_len = htons(tlen - hlen);
 4275                 th->th_sum = 0;
 4276                 th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
 4277                 break;
 4278 #ifdef INET6
 4279         case AF_INET6:
 4280                 ip6->ip6_plen = htons(tlen - hlen);
 4281                 th->th_sum = 0;
 4282                 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
 4283                 break;
 4284 #endif
 4285         }
 4286 
 4287         /*
 4288          * Fill in some straggling IP bits.  Note the stack expects
 4289          * ip_len to be in host order, for convenience.
 4290          */
 4291         switch (sc->sc_src.sa.sa_family) {
 4292 #ifdef INET
 4293         case AF_INET:
 4294                 ip->ip_len = htons(tlen);
 4295                 ip->ip_ttl = ip_defttl;
 4296                 /* XXX tos? */
 4297                 break;
 4298 #endif
 4299 #ifdef INET6
 4300         case AF_INET6:
 4301                 ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 4302                 ip6->ip6_vfc |= IPV6_VERSION;
 4303                 ip6->ip6_plen = htons(tlen - hlen);
 4304                 /* ip6_hlim will be initialized afterwards */
 4305                 /* XXX flowlabel? */
 4306                 break;
 4307 #endif
 4308         }
 4309 
 4310         /* XXX use IPsec policy on listening socket, on SYN ACK */
 4311         tp = sc->sc_tp;
 4312 
 4313         switch (sc->sc_src.sa.sa_family) {
 4314 #ifdef INET
 4315         case AF_INET:
 4316                 error = ip_output(m, sc->sc_ipopts, ro,
 4317                     (ip_mtudisc ? IP_MTUDISC : 0),
 4318                     (struct ip_moptions *)NULL, so);
 4319                 break;
 4320 #endif
 4321 #ifdef INET6
 4322         case AF_INET6:
 4323                 ip6->ip6_hlim = in6_selecthlim(NULL,
 4324                                 ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
 4325 
 4326                 error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
 4327                         (struct ip6_moptions *)0, so, NULL);
 4328                 break;
 4329 #endif
 4330         default:
 4331                 error = EAFNOSUPPORT;
 4332                 break;
 4333         }
 4334         return (error);
 4335 }
Cache object: 8d629263e05be86a61bc812596da0aa0
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/netinet/tcp_input.c

FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_input.c