The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_input.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $OpenBSD: tcp_input.c,v 1.386 2023/01/22 12:05:44 mvs Exp $     */
    2 /*      $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $  */
    3 
    4 /*
    5  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
    6  *      The Regents of the University of California.  All rights reserved.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  * 3. Neither the name of the University nor the names of its contributors
   17  *    may be used to endorse or promote products derived from this software
   18  *    without specific prior written permission.
   19  *
   20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   30  * SUCH DAMAGE.
   31  *
   32  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
   33  *
   34  * NRL grants permission for redistribution and use in source and binary
   35  * forms, with or without modification, of the software and documentation
   36  * created at NRL provided that the following conditions are met:
   37  *
   38  * 1. Redistributions of source code must retain the above copyright
   39  *    notice, this list of conditions and the following disclaimer.
   40  * 2. Redistributions in binary form must reproduce the above copyright
   41  *    notice, this list of conditions and the following disclaimer in the
   42  *    documentation and/or other materials provided with the distribution.
   43  * 3. All advertising materials mentioning features or use of this software
   44  *    must display the following acknowledgements:
   45  *      This product includes software developed by the University of
   46  *      California, Berkeley and its contributors.
   47  *      This product includes software developed at the Information
   48  *      Technology Division, US Naval Research Laboratory.
   49  * 4. Neither the name of the NRL nor the names of its contributors
   50  *    may be used to endorse or promote products derived from this software
   51  *    without specific prior written permission.
   52  *
   53  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
   54  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   55  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   56  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
   57  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   58  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   59  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   60  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   61  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   62  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   63  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   64  *
   65  * The views and conclusions contained in the software and documentation
   66  * are those of the authors and should not be interpreted as representing
   67  * official policies, either expressed or implied, of the US Naval
   68  * Research Laboratory (NRL).
   69  */
   70 
   71 #include "pf.h"
   72 
   73 #include <sys/param.h>
   74 #include <sys/systm.h>
   75 #include <sys/mbuf.h>
   76 #include <sys/protosw.h>
   77 #include <sys/socket.h>
   78 #include <sys/socketvar.h>
   79 #include <sys/timeout.h>
   80 #include <sys/kernel.h>
   81 #include <sys/pool.h>
   82 
   83 #include <net/if.h>
   84 #include <net/if_var.h>
   85 #include <net/route.h>
   86 
   87 #include <netinet/in.h>
   88 #include <netinet/ip.h>
   89 #include <netinet/in_pcb.h>
   90 #include <netinet/ip_var.h>
   91 #include <netinet/tcp.h>
   92 #include <netinet/tcp_fsm.h>
   93 #include <netinet/tcp_seq.h>
   94 #include <netinet/tcp_timer.h>
   95 #include <netinet/tcp_var.h>
   96 #include <netinet/tcp_debug.h>
   97 
   98 #if NPF > 0
   99 #include <net/pfvar.h>
  100 #endif
  101 
  102 struct  tcpiphdr tcp_saveti;
  103 
  104 int tcp_mss_adv(struct mbuf *, int);
  105 int tcp_flush_queue(struct tcpcb *);
  106 
  107 #ifdef INET6
  108 #include <netinet6/in6_var.h>
  109 #include <netinet6/nd6.h>
  110 
  111 struct  tcpipv6hdr tcp_saveti6;
  112 
  113 /* for the packet header length in the mbuf */
  114 #define M_PH_LEN(m)      (((struct mbuf *)(m))->m_pkthdr.len)
  115 #define M_V6_LEN(m)      (M_PH_LEN(m) - sizeof(struct ip6_hdr))
  116 #define M_V4_LEN(m)      (M_PH_LEN(m) - sizeof(struct ip))
  117 #endif /* INET6 */
  118 
  119 int     tcprexmtthresh = 3;
  120 int     tcptv_keep_init = TCPTV_KEEP_INIT;
  121 
  122 int tcp_rst_ppslim = 100;               /* 100pps */
  123 int tcp_rst_ppslim_count = 0;
  124 struct timeval tcp_rst_ppslim_last;
  125 
  126 int tcp_ackdrop_ppslim = 100;           /* 100pps */
  127 int tcp_ackdrop_ppslim_count = 0;
  128 struct timeval tcp_ackdrop_ppslim_last;
  129 
  130 #define TCP_PAWS_IDLE   TCP_TIME(24 * 24 * 60 * 60)
  131 
  132 /* for modulo comparisons of timestamps */
  133 #define TSTMP_LT(a,b)   ((int)((a)-(b)) < 0)
  134 #define TSTMP_GEQ(a,b)  ((int)((a)-(b)) >= 0)
  135 
  136 /* for TCP SACK comparisons */
  137 #define SEQ_MIN(a,b)    (SEQ_LT(a,b) ? (a) : (b))
  138 #define SEQ_MAX(a,b)    (SEQ_GT(a,b) ? (a) : (b))
  139 
  140 /*
  141  * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
  142  */
  143 #ifdef INET6
  144 #define ND6_HINT(tp) \
  145 do { \
  146         if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
  147             rtisvalid(tp->t_inpcb->inp_route6.ro_rt)) {                 \
  148                 nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt);            \
  149         } \
  150 } while (0)
  151 #else
  152 #define ND6_HINT(tp)
  153 #endif
  154 
  155 #ifdef TCP_ECN
  156 /*
  157  * ECN (Explicit Congestion Notification) support based on RFC3168
  158  * implementation note:
  159  *   snd_last is used to track a recovery phase.
  160  *   when cwnd is reduced, snd_last is set to snd_max.
  161  *   while snd_last > snd_una, the sender is in a recovery phase and
  162  *   its cwnd should not be reduced again.
  163  *   snd_last follows snd_una when not in a recovery phase.
  164  */
  165 #endif
  166 
  167 /*
  168  * Macro to compute ACK transmission behavior.  Delay the ACK unless
  169  * we have already delayed an ACK (must send an ACK every two segments).
  170  * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
  171  * option is enabled or when the packet is coming from a loopback
  172  * interface.
  173  */
  174 #define TCP_SETUP_ACK(tp, tiflags, m) \
  175 do { \
  176         struct ifnet *ifp = NULL; \
  177         if (m && (m->m_flags & M_PKTHDR)) \
  178                 ifp = if_get(m->m_pkthdr.ph_ifidx); \
  179         if (TCP_TIMER_ISARMED(tp, TCPT_DELACK) || \
  180             (tcp_ack_on_push && (tiflags) & TH_PUSH) || \
  181             (ifp && (ifp->if_flags & IFF_LOOPBACK))) \
  182                 tp->t_flags |= TF_ACKNOW; \
  183         else \
  184                 TCP_TIMER_ARM(tp, TCPT_DELACK, tcp_delack_msecs); \
  185         if_put(ifp); \
  186 } while (0)
  187 
  188 void     tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
  189 void     tcp_newreno_partialack(struct tcpcb *, struct tcphdr *);
  190 
  191 void     syn_cache_put(struct syn_cache *);
  192 void     syn_cache_rm(struct syn_cache *);
  193 int      syn_cache_respond(struct syn_cache *, struct mbuf *, uint32_t);
  194 void     syn_cache_timer(void *);
  195 void     syn_cache_reaper(void *);
  196 void     syn_cache_insert(struct syn_cache *, struct tcpcb *);
  197 void     syn_cache_reset(struct sockaddr *, struct sockaddr *,
  198                 struct tcphdr *, u_int);
  199 int      syn_cache_add(struct sockaddr *, struct sockaddr *, struct tcphdr *,
  200                 unsigned int, struct socket *, struct mbuf *, u_char *, int,
  201                 struct tcp_opt_info *, tcp_seq *, uint32_t);
  202 struct socket *syn_cache_get(struct sockaddr *, struct sockaddr *,
  203                 struct tcphdr *, unsigned int, unsigned int, struct socket *,
  204                 struct mbuf *, uint32_t);
  205 struct syn_cache *syn_cache_lookup(struct sockaddr *, struct sockaddr *,
  206                 struct syn_cache_head **, u_int);
  207 
  208 /*
  209  * Insert segment ti into reassembly queue of tcp with
  210  * control block tp.  Return TH_FIN if reassembly now includes
  211  * a segment with FIN.  The macro form does the common case inline
  212  * (segment is the next to be received on an established connection,
  213  * and the queue is empty), avoiding linkage into and removal
  214  * from the queue and repetition of various conversions.
  215  * Set DELACK for segments received in order, but ack immediately
  216  * when segments are out of order (so fast retransmit can work).
  217  */
  218 
  219 int
  220 tcp_reass(struct tcpcb *tp, struct tcphdr *th, struct mbuf *m, int *tlen)
  221 {
  222         struct tcpqent *p, *q, *nq, *tiqe;
  223 
  224         /*
  225          * Allocate a new queue entry, before we throw away any data.
  226          * If we can't, just drop the packet.  XXX
  227          */
  228         tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
  229         if (tiqe == NULL) {
  230                 tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
  231                 if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
  232                         /* Reuse last entry since new segment fills a hole */
  233                         m_freem(tiqe->tcpqe_m);
  234                         TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
  235                 }
  236                 if (tiqe == NULL || th->th_seq != tp->rcv_nxt) {
  237                         /* Flush segment queue for this connection */
  238                         tcp_freeq(tp);
  239                         tcpstat_inc(tcps_rcvmemdrop);
  240                         m_freem(m);
  241                         return (0);
  242                 }
  243         }
  244 
  245         /*
  246          * Find a segment which begins after this one does.
  247          */
  248         for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
  249             p = q, q = TAILQ_NEXT(q, tcpqe_q))
  250                 if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
  251                         break;
  252 
  253         /*
  254          * If there is a preceding segment, it may provide some of
  255          * our data already.  If so, drop the data from the incoming
  256          * segment.  If it provides all of our data, drop us.
  257          */
  258         if (p != NULL) {
  259                 struct tcphdr *phdr = p->tcpqe_tcp;
  260                 int i;
  261 
  262                 /* conversion to int (in i) handles seq wraparound */
  263                 i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
  264                 if (i > 0) {
  265                         if (i >= *tlen) {
  266                                 tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte,
  267                                     *tlen);
  268                                 m_freem(m);
  269                                 pool_put(&tcpqe_pool, tiqe);
  270                                 return (0);
  271                         }
  272                         m_adj(m, i);
  273                         *tlen -= i;
  274                         th->th_seq += i;
  275                 }
  276         }
  277         tcpstat_pkt(tcps_rcvoopack, tcps_rcvoobyte, *tlen);
  278         tp->t_rcvoopack++;
  279 
  280         /*
  281          * While we overlap succeeding segments trim them or,
  282          * if they are completely covered, dequeue them.
  283          */
  284         for (; q != NULL; q = nq) {
  285                 struct tcphdr *qhdr = q->tcpqe_tcp;
  286                 int i = (th->th_seq + *tlen) - qhdr->th_seq;
  287 
  288                 if (i <= 0)
  289                         break;
  290                 if (i < qhdr->th_reseqlen) {
  291                         qhdr->th_seq += i;
  292                         qhdr->th_reseqlen -= i;
  293                         m_adj(q->tcpqe_m, i);
  294                         break;
  295                 }
  296                 nq = TAILQ_NEXT(q, tcpqe_q);
  297                 m_freem(q->tcpqe_m);
  298                 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
  299                 pool_put(&tcpqe_pool, q);
  300         }
  301 
  302         /* Insert the new segment queue entry into place. */
  303         tiqe->tcpqe_m = m;
  304         th->th_reseqlen = *tlen;
  305         tiqe->tcpqe_tcp = th;
  306         if (p == NULL) {
  307                 TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
  308         } else {
  309                 TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
  310         }
  311 
  312         if (th->th_seq != tp->rcv_nxt)
  313                 return (0);
  314 
  315         return (tcp_flush_queue(tp));
  316 }
  317 
  318 int
  319 tcp_flush_queue(struct tcpcb *tp)
  320 {
  321         struct socket *so = tp->t_inpcb->inp_socket;
  322         struct tcpqent *q, *nq;
  323         int flags;
  324 
  325         /*
  326          * Present data to user, advancing rcv_nxt through
  327          * completed sequence space.
  328          */
  329         if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
  330                 return (0);
  331         q = TAILQ_FIRST(&tp->t_segq);
  332         if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt)
  333                 return (0);
  334         if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen)
  335                 return (0);
  336         do {
  337                 tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
  338                 flags = q->tcpqe_tcp->th_flags & TH_FIN;
  339 
  340                 nq = TAILQ_NEXT(q, tcpqe_q);
  341                 TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
  342                 ND6_HINT(tp);
  343                 if (so->so_rcv.sb_state & SS_CANTRCVMORE)
  344                         m_freem(q->tcpqe_m);
  345                 else
  346                         sbappendstream(so, &so->so_rcv, q->tcpqe_m);
  347                 pool_put(&tcpqe_pool, q);
  348                 q = nq;
  349         } while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
  350         tp->t_flags |= TF_BLOCKOUTPUT;
  351         sorwakeup(so);
  352         tp->t_flags &= ~TF_BLOCKOUTPUT;
  353         return (flags);
  354 }
  355 
  356 /*
  357  * TCP input routine, follows pages 65-76 of the
  358  * protocol specification dated September, 1981 very closely.
  359  */
  360 int
  361 tcp_input(struct mbuf **mp, int *offp, int proto, int af)
  362 {
  363         struct mbuf *m = *mp;
  364         int iphlen = *offp;
  365         struct ip *ip = NULL;
  366         struct inpcb *inp = NULL;
  367         u_int8_t *optp = NULL;
  368         int optlen = 0;
  369         int tlen, off;
  370         struct tcpcb *otp = NULL, *tp = NULL;
  371         int tiflags;
  372         struct socket *so = NULL;
  373         int todrop, acked, ourfinisacked;
  374         int hdroptlen = 0;
  375         short ostate;
  376         caddr_t saveti;
  377         tcp_seq iss, *reuse = NULL;
  378         uint32_t now;
  379         u_long tiwin;
  380         struct tcp_opt_info opti;
  381         struct tcphdr *th;
  382 #ifdef INET6
  383         struct ip6_hdr *ip6 = NULL;
  384 #endif /* INET6 */
  385 #ifdef TCP_ECN
  386         u_char iptos;
  387 #endif
  388 
  389         tcpstat_inc(tcps_rcvtotal);
  390 
  391         opti.ts_present = 0;
  392         opti.maxseg = 0;
  393         now = tcp_now();
  394 
  395         /*
  396          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
  397          */
  398         if (m->m_flags & (M_BCAST|M_MCAST))
  399                 goto drop;
  400 
  401         /*
  402          * Get IP and TCP header together in first mbuf.
  403          * Note: IP leaves IP header in first mbuf.
  404          */
  405         IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
  406         if (!th) {
  407                 tcpstat_inc(tcps_rcvshort);
  408                 return IPPROTO_DONE;
  409         }
  410 
  411         tlen = m->m_pkthdr.len - iphlen;
  412         switch (af) {
  413         case AF_INET:
  414                 ip = mtod(m, struct ip *);
  415 #ifdef TCP_ECN
  416                 /* save ip_tos before clearing it for checksum */
  417                 iptos = ip->ip_tos;
  418 #endif
  419                 break;
  420 #ifdef INET6
  421         case AF_INET6:
  422                 ip6 = mtod(m, struct ip6_hdr *);
  423 #ifdef TCP_ECN
  424                 iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
  425 #endif
  426 
  427                 /*
  428                  * Be proactive about unspecified IPv6 address in source.
  429                  * As we use all-zero to indicate unbounded/unconnected pcb,
  430                  * unspecified IPv6 address can be used to confuse us.
  431                  *
  432                  * Note that packets with unspecified IPv6 destination is
  433                  * already dropped in ip6_input.
  434                  */
  435                 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
  436                         /* XXX stat */
  437                         goto drop;
  438                 }
  439 
  440                 /* Discard packets to multicast */
  441                 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
  442                         /* XXX stat */
  443                         goto drop;
  444                 }
  445                 break;
  446 #endif
  447         default:
  448                 unhandled_af(af);
  449         }
  450 
  451         /*
  452          * Checksum extended TCP header and data.
  453          */
  454         if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
  455                 int sum;
  456 
  457                 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
  458                         tcpstat_inc(tcps_rcvbadsum);
  459                         goto drop;
  460                 }
  461                 tcpstat_inc(tcps_inswcsum);
  462                 switch (af) {
  463                 case AF_INET:
  464                         sum = in4_cksum(m, IPPROTO_TCP, iphlen, tlen);
  465                         break;
  466 #ifdef INET6
  467                 case AF_INET6:
  468                         sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
  469                             tlen);
  470                         break;
  471 #endif
  472                 }
  473                 if (sum != 0) {
  474                         tcpstat_inc(tcps_rcvbadsum);
  475                         goto drop;
  476                 }
  477         }
  478 
  479         /*
  480          * Check that TCP offset makes sense,
  481          * pull out TCP options and adjust length.              XXX
  482          */
  483         off = th->th_off << 2;
  484         if (off < sizeof(struct tcphdr) || off > tlen) {
  485                 tcpstat_inc(tcps_rcvbadoff);
  486                 goto drop;
  487         }
  488         tlen -= off;
  489         if (off > sizeof(struct tcphdr)) {
  490                 IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
  491                 if (!th) {
  492                         tcpstat_inc(tcps_rcvshort);
  493                         return IPPROTO_DONE;
  494                 }
  495                 optlen = off - sizeof(struct tcphdr);
  496                 optp = (u_int8_t *)(th + 1);
  497                 /*
  498                  * Do quick retrieval of timestamp options ("options
  499                  * prediction?").  If timestamp is the only option and it's
  500                  * formatted as recommended in RFC 1323 appendix A, we
  501                  * quickly get the values now and not bother calling
  502                  * tcp_dooptions(), etc.
  503                  */
  504                 if ((optlen == TCPOLEN_TSTAMP_APPA ||
  505                      (optlen > TCPOLEN_TSTAMP_APPA &&
  506                       optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
  507                      *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
  508                      (th->th_flags & TH_SYN) == 0) {
  509                         opti.ts_present = 1;
  510                         opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
  511                         opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
  512                         optp = NULL;    /* we've parsed the options */
  513                 }
  514         }
  515         tiflags = th->th_flags;
  516 
  517         /*
  518          * Convert TCP protocol specific fields to host format.
  519          */
  520         th->th_seq = ntohl(th->th_seq);
  521         th->th_ack = ntohl(th->th_ack);
  522         th->th_win = ntohs(th->th_win);
  523         th->th_urp = ntohs(th->th_urp);
  524 
  525         if (th->th_dport == 0) {
  526                 tcpstat_inc(tcps_noport);
  527                 goto dropwithreset_ratelim;
  528         }
  529 
  530         /*
  531          * Locate pcb for segment.
  532          */
  533 #if NPF > 0
  534         inp = pf_inp_lookup(m);
  535 #endif
  536 findpcb:
  537         if (inp == NULL) {
  538                 switch (af) {
  539 #ifdef INET6
  540                 case AF_INET6:
  541                         inp = in6_pcblookup(&tcbtable, &ip6->ip6_src,
  542                             th->th_sport, &ip6->ip6_dst, th->th_dport,
  543                             m->m_pkthdr.ph_rtableid);
  544                         break;
  545 #endif
  546                 case AF_INET:
  547                         inp = in_pcblookup(&tcbtable, ip->ip_src,
  548                             th->th_sport, ip->ip_dst, th->th_dport,
  549                             m->m_pkthdr.ph_rtableid);
  550                         break;
  551                 }
  552         }
  553         if (inp == NULL) {
  554                 tcpstat_inc(tcps_pcbhashmiss);
  555                 switch (af) {
  556 #ifdef INET6
  557                 case AF_INET6:
  558                         inp = in6_pcblookup_listen(&tcbtable, &ip6->ip6_dst,
  559                             th->th_dport, m, m->m_pkthdr.ph_rtableid);
  560                         break;
  561 #endif /* INET6 */
  562                 case AF_INET:
  563                         inp = in_pcblookup_listen(&tcbtable, ip->ip_dst,
  564                             th->th_dport, m, m->m_pkthdr.ph_rtableid);
  565                         break;
  566                 }
  567                 /*
  568                  * If the state is CLOSED (i.e., TCB does not exist) then
  569                  * all data in the incoming segment is discarded.
  570                  * If the TCB exists but is in CLOSED state, it is embryonic,
  571                  * but should either do a listen or a connect soon.
  572                  */
  573         }
  574 #ifdef IPSEC
  575         if (ipsec_in_use) {
  576                 struct m_tag *mtag;
  577                 struct tdb *tdb = NULL;
  578                 int error;
  579 
  580                 /* Find most recent IPsec tag */
  581                 mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
  582                 if (mtag != NULL) {
  583                         struct tdb_ident *tdbi;
  584 
  585                         tdbi = (struct tdb_ident *)(mtag + 1);
  586                         tdb = gettdb(tdbi->rdomain, tdbi->spi,
  587                             &tdbi->dst, tdbi->proto);
  588                 }
  589                 error = ipsp_spd_lookup(m, af, iphlen, IPSP_DIRECTION_IN,
  590                     tdb, inp, NULL, NULL);
  591                 tdb_unref(tdb);
  592                 if (error) {
  593                         tcpstat_inc(tcps_rcvnosec);
  594                         goto drop;
  595                 }
  596         }
  597 #endif /* IPSEC */
  598 
  599         if (inp == NULL) {
  600                 tcpstat_inc(tcps_noport);
  601                 goto dropwithreset_ratelim;
  602         }
  603 
  604         KASSERT(sotoinpcb(inp->inp_socket) == inp);
  605         KASSERT(intotcpcb(inp) == NULL || intotcpcb(inp)->t_inpcb == inp);
  606         soassertlocked(inp->inp_socket);
  607 
  608         /* Check the minimum TTL for socket. */
  609         switch (af) {
  610         case AF_INET:
  611                 if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
  612                         goto drop;
  613                 break;
  614 #ifdef INET6
  615         case AF_INET6:
  616                 if (inp->inp_ip6_minhlim &&
  617                     inp->inp_ip6_minhlim > ip6->ip6_hlim)
  618                         goto drop;
  619                 break;
  620 #endif
  621         }
  622 
  623         tp = intotcpcb(inp);
  624         if (tp == NULL)
  625                 goto dropwithreset_ratelim;
  626         if (tp->t_state == TCPS_CLOSED)
  627                 goto drop;
  628 
  629         /* Unscale the window into a 32-bit value. */
  630         if ((tiflags & TH_SYN) == 0)
  631                 tiwin = th->th_win << tp->snd_scale;
  632         else
  633                 tiwin = th->th_win;
  634 
  635         so = inp->inp_socket;
  636         if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
  637                 union syn_cache_sa src;
  638                 union syn_cache_sa dst;
  639 
  640                 bzero(&src, sizeof(src));
  641                 bzero(&dst, sizeof(dst));
  642                 switch (af) {
  643                 case AF_INET:
  644                         src.sin.sin_len = sizeof(struct sockaddr_in);
  645                         src.sin.sin_family = AF_INET;
  646                         src.sin.sin_addr = ip->ip_src;
  647                         src.sin.sin_port = th->th_sport;
  648 
  649                         dst.sin.sin_len = sizeof(struct sockaddr_in);
  650                         dst.sin.sin_family = AF_INET;
  651                         dst.sin.sin_addr = ip->ip_dst;
  652                         dst.sin.sin_port = th->th_dport;
  653                         break;
  654 #ifdef INET6
  655                 case AF_INET6:
  656                         src.sin6.sin6_len = sizeof(struct sockaddr_in6);
  657                         src.sin6.sin6_family = AF_INET6;
  658                         src.sin6.sin6_addr = ip6->ip6_src;
  659                         src.sin6.sin6_port = th->th_sport;
  660 
  661                         dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
  662                         dst.sin6.sin6_family = AF_INET6;
  663                         dst.sin6.sin6_addr = ip6->ip6_dst;
  664                         dst.sin6.sin6_port = th->th_dport;
  665                         break;
  666 #endif /* INET6 */
  667                 }
  668 
  669                 if (so->so_options & SO_DEBUG) {
  670                         otp = tp;
  671                         ostate = tp->t_state;
  672                         switch (af) {
  673 #ifdef INET6
  674                         case AF_INET6:
  675                                 saveti = (caddr_t) &tcp_saveti6;
  676                                 memcpy(&tcp_saveti6.ti6_i, ip6, sizeof(*ip6));
  677                                 memcpy(&tcp_saveti6.ti6_t, th, sizeof(*th));
  678                                 break;
  679 #endif
  680                         case AF_INET:
  681                                 saveti = (caddr_t) &tcp_saveti;
  682                                 memcpy(&tcp_saveti.ti_i, ip, sizeof(*ip));
  683                                 memcpy(&tcp_saveti.ti_t, th, sizeof(*th));
  684                                 break;
  685                         }
  686                 }
  687                 if (so->so_options & SO_ACCEPTCONN) {
  688                         switch (tiflags & (TH_RST|TH_SYN|TH_ACK)) {
  689 
  690                         case TH_SYN|TH_ACK|TH_RST:
  691                         case TH_SYN|TH_RST:
  692                         case TH_ACK|TH_RST:
  693                         case TH_RST:
  694                                 syn_cache_reset(&src.sa, &dst.sa, th,
  695                                     inp->inp_rtableid);
  696                                 goto drop;
  697 
  698                         case TH_SYN|TH_ACK:
  699                                 /*
  700                                  * Received a SYN,ACK.  This should
  701                                  * never happen while we are in
  702                                  * LISTEN.  Send an RST.
  703                                  */
  704                                 goto badsyn;
  705 
  706                         case TH_ACK:
  707                                 so = syn_cache_get(&src.sa, &dst.sa,
  708                                     th, iphlen, tlen, so, m, now);
  709                                 if (so == NULL) {
  710                                         /*
  711                                          * We don't have a SYN for
  712                                          * this ACK; send an RST.
  713                                          */
  714                                         goto badsyn;
  715                                 } else if (so == (struct socket *)(-1)) {
  716                                         /*
  717                                          * We were unable to create
  718                                          * the connection.  If the
  719                                          * 3-way handshake was
  720                                          * completed, an RST has
  721                                          * been sent to the peer.
  722                                          * Since the mbuf might be
  723                                          * in use for the reply,
  724                                          * do not free it.
  725                                          */
  726                                         m = *mp = NULL;
  727                                         goto drop;
  728                                 } else {
  729                                         /*
  730                                          * We have created a
  731                                          * full-blown connection.
  732                                          */
  733                                         tp = NULL;
  734                                         in_pcbunref(inp);
  735                                         inp = in_pcbref(sotoinpcb(so));
  736                                         tp = intotcpcb(inp);
  737                                         if (tp == NULL)
  738                                                 goto badsyn;    /*XXX*/
  739 
  740                                 }
  741                                 break;
  742 
  743                         default:
  744                                 /*
  745                                  * None of RST, SYN or ACK was set.
  746                                  * This is an invalid packet for a
  747                                  * TCB in LISTEN state.  Send a RST.
  748                                  */
  749                                 goto badsyn;
  750 
  751                         case TH_SYN:
  752                                 /*
  753                                  * Received a SYN.
  754                                  */
  755 #ifdef INET6
  756                                 /*
  757                                  * If deprecated address is forbidden, we do
  758                                  * not accept SYN to deprecated interface
  759                                  * address to prevent any new inbound
  760                                  * connection from getting established.
  761                                  * When we do not accept SYN, we send a TCP
  762                                  * RST, with deprecated source address (instead
  763                                  * of dropping it).  We compromise it as it is
  764                                  * much better for peer to receive a RST, and
  765                                  * RST will be the final packet for the
  766                                  * exchange.
  767                                  *
  768                                  * If we do not forbid deprecated addresses, we
  769                                  * accept the SYN packet.  RFC2462 does not
  770                                  * suggest dropping SYN in this case.
  771                                  * If we decipher RFC2462 5.5.4, it says like
  772                                  * this:
  773                                  * 1. use of deprecated addr with existing
  774                                  *    communication is okay - "SHOULD continue
  775                                  *    to be used"
  776                                  * 2. use of it with new communication:
  777                                  *   (2a) "SHOULD NOT be used if alternate
  778                                  *        address with sufficient scope is
  779                                  *        available"
  780                                  *   (2b) nothing mentioned otherwise.
  781                                  * Here we fall into (2b) case as we have no
  782                                  * choice in our source address selection - we
  783                                  * must obey the peer.
  784                                  *
  785                                  * The wording in RFC2462 is confusing, and
  786                                  * there are multiple descriptions of
  787                                  * deprecated address handling - worse, they
  788                                  * are not exactly the same.  I believe 5.5.4
  789                                  * is the best one, so we follow 5.5.4.
  790                                  */
  791                                 if (ip6 && !ip6_use_deprecated) {
  792                                         struct in6_ifaddr *ia6;
  793                                         struct ifnet *ifp =
  794                                             if_get(m->m_pkthdr.ph_ifidx);
  795 
  796                                         if (ifp &&
  797                                             (ia6 = in6ifa_ifpwithaddr(ifp,
  798                                             &ip6->ip6_dst)) &&
  799                                             (ia6->ia6_flags &
  800                                             IN6_IFF_DEPRECATED)) {
  801                                                 tp = NULL;
  802                                                 if_put(ifp);
  803                                                 goto dropwithreset;
  804                                         }
  805                                         if_put(ifp);
  806                                 }
  807 #endif
  808 
  809                                 /*
  810                                  * LISTEN socket received a SYN
  811                                  * from itself?  This can't possibly
  812                                  * be valid; drop the packet.
  813                                  */
  814                                 if (th->th_dport == th->th_sport) {
  815                                         switch (af) {
  816 #ifdef INET6
  817                                         case AF_INET6:
  818                                                 if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
  819                                                     &ip6->ip6_dst)) {
  820                                                         tcpstat_inc(tcps_badsyn);
  821                                                         goto drop;
  822                                                 }
  823                                                 break;
  824 #endif /* INET6 */
  825                                         case AF_INET:
  826                                                 if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
  827                                                         tcpstat_inc(tcps_badsyn);
  828                                                         goto drop;
  829                                                 }
  830                                                 break;
  831                                         }
  832                                 }
  833 
  834                                 /*
  835                                  * SYN looks ok; create compressed TCP
  836                                  * state for it.
  837                                  */
  838                                 if (so->so_qlen > so->so_qlimit ||
  839                                     syn_cache_add(&src.sa, &dst.sa, th, iphlen,
  840                                     so, m, optp, optlen, &opti, reuse, now)
  841                                     == -1) {
  842                                         tcpstat_inc(tcps_dropsyn);
  843                                         goto drop;
  844                                 }
  845                                 in_pcbunref(inp);
  846                                 return IPPROTO_DONE;
  847                         }
  848                 }
  849         }
  850 
  851 #ifdef DIAGNOSTIC
  852         /*
  853          * Should not happen now that all embryonic connections
  854          * are handled with compressed state.
  855          */
  856         if (tp->t_state == TCPS_LISTEN)
  857                 panic("tcp_input: TCPS_LISTEN");
  858 #endif
  859 
  860 #if NPF > 0
  861         pf_inp_link(m, inp);
  862 #endif
  863 
  864         /*
  865          * Segment received on connection.
  866          * Reset idle time and keep-alive timer.
  867          */
  868         tp->t_rcvtime = now;
  869         if (TCPS_HAVEESTABLISHED(tp->t_state))
  870                 TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcp_keepidle));
  871 
  872         if (tp->sack_enable)
  873                 tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
  874 
  875         /*
  876          * Process options.
  877          */
  878 #ifdef TCP_SIGNATURE
  879         if (optp || (tp->t_flags & TF_SIGNATURE))
  880 #else
  881         if (optp)
  882 #endif
  883                 if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti,
  884                     m->m_pkthdr.ph_rtableid, now))
  885                         goto drop;
  886 
  887         if (opti.ts_present && opti.ts_ecr) {
  888                 int rtt_test;
  889 
  890                 /* subtract out the tcp timestamp modulator */
  891                 opti.ts_ecr -= tp->ts_modulate;
  892 
  893                 /* make sure ts_ecr is sensible */
  894                 rtt_test = now - opti.ts_ecr;
  895                 if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
  896                         opti.ts_ecr = 0;
  897         }
  898 
  899 #ifdef TCP_ECN
  900         /* if congestion experienced, set ECE bit in subsequent packets. */
  901         if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
  902                 tp->t_flags |= TF_RCVD_CE;
  903                 tcpstat_inc(tcps_ecn_rcvce);
  904         }
  905 #endif
  906         /*
  907          * Header prediction: check for the two common cases
  908          * of a uni-directional data xfer.  If the packet has
  909          * no control flags, is in-sequence, the window didn't
  910          * change and we're not retransmitting, it's a
  911          * candidate.  If the length is zero and the ack moved
  912          * forward, we're the sender side of the xfer.  Just
  913          * free the data acked & wake any higher level process
  914          * that was blocked waiting for space.  If the length
  915          * is non-zero and the ack didn't move, we're the
  916          * receiver side.  If we're getting packets in-order
  917          * (the reassembly queue is empty), add the data to
  918          * the socket buffer and note that we need a delayed ack.
  919          */
  920         if (tp->t_state == TCPS_ESTABLISHED &&
  921 #ifdef TCP_ECN
  922             (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
  923 #else
  924             (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
  925 #endif
  926             (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
  927             th->th_seq == tp->rcv_nxt &&
  928             tiwin && tiwin == tp->snd_wnd &&
  929             tp->snd_nxt == tp->snd_max) {
  930 
  931                 /*
  932                  * If last ACK falls within this segment's sequence numbers,
  933                  *  record the timestamp.
  934                  * Fix from Braden, see Stevens p. 870
  935                  */
  936                 if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
  937                         tp->ts_recent_age = now;
  938                         tp->ts_recent = opti.ts_val;
  939                 }
  940 
  941                 if (tlen == 0) {
  942                         if (SEQ_GT(th->th_ack, tp->snd_una) &&
  943                             SEQ_LEQ(th->th_ack, tp->snd_max) &&
  944                             tp->snd_cwnd >= tp->snd_wnd &&
  945                             tp->t_dupacks == 0) {
  946                                 /*
  947                                  * this is a pure ack for outstanding data.
  948                                  */
  949                                 tcpstat_inc(tcps_predack);
  950                                 if (opti.ts_present && opti.ts_ecr)
  951                                         tcp_xmit_timer(tp, now - opti.ts_ecr);
  952                                 else if (tp->t_rtttime &&
  953                                     SEQ_GT(th->th_ack, tp->t_rtseq))
  954                                         tcp_xmit_timer(tp, now - tp->t_rtttime);
  955                                 acked = th->th_ack - tp->snd_una;
  956                                 tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte,
  957                                     acked);
  958                                 tp->t_rcvacktime = now;
  959                                 ND6_HINT(tp);
  960                                 sbdrop(so, &so->so_snd, acked);
  961 
  962                                 /*
  963                                  * If we had a pending ICMP message that
  964                                  * refers to data that have just been
  965                                  * acknowledged, disregard the recorded ICMP
  966                                  * message.
  967                                  */
  968                                 if ((tp->t_flags & TF_PMTUD_PEND) &&
  969                                     SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
  970                                         tp->t_flags &= ~TF_PMTUD_PEND;
  971 
  972                                 /*
  973                                  * Keep track of the largest chunk of data
  974                                  * acknowledged since last PMTU update
  975                                  */
  976                                 if (tp->t_pmtud_mss_acked < acked)
  977                                         tp->t_pmtud_mss_acked = acked;
  978 
  979                                 tp->snd_una = th->th_ack;
  980                                 /* Pull snd_wl2 up to prevent seq wrap. */
  981                                 tp->snd_wl2 = th->th_ack;
  982                                 /*
  983                                  * We want snd_last to track snd_una so
  984                                  * as to avoid sequence wraparound problems
  985                                  * for very large transfers.
  986                                  */
  987 #ifdef TCP_ECN
  988                                 if (SEQ_GT(tp->snd_una, tp->snd_last))
  989 #endif
  990                                 tp->snd_last = tp->snd_una;
  991                                 m_freem(m);
  992 
  993                                 /*
  994                                  * If all outstanding data are acked, stop
  995                                  * retransmit timer, otherwise restart timer
  996                                  * using current (possibly backed-off) value.
  997                                  * If process is waiting for space,
  998                                  * wakeup/selwakeup/signal.  If data
  999                                  * are ready to send, let tcp_output
 1000                                  * decide between more output or persist.
 1001                                  */
 1002                                 if (tp->snd_una == tp->snd_max)
 1003                                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1004                                 else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
 1005                                         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 1006 
 1007                                 tcp_update_sndspace(tp);
 1008                                 if (sb_notify(so, &so->so_snd)) {
 1009                                         tp->t_flags |= TF_BLOCKOUTPUT;
 1010                                         sowwakeup(so);
 1011                                         tp->t_flags &= ~TF_BLOCKOUTPUT;
 1012                                 }
 1013                                 if (so->so_snd.sb_cc ||
 1014                                     tp->t_flags & TF_NEEDOUTPUT)
 1015                                         (void) tcp_output(tp);
 1016                                 in_pcbunref(inp);
 1017                                 return IPPROTO_DONE;
 1018                         }
 1019                 } else if (th->th_ack == tp->snd_una &&
 1020                     TAILQ_EMPTY(&tp->t_segq) &&
 1021                     tlen <= sbspace(so, &so->so_rcv)) {
 1022                         /*
 1023                          * This is a pure, in-sequence data packet
 1024                          * with nothing on the reassembly queue and
 1025                          * we have enough buffer space to take it.
 1026                          */
 1027                         /* Clean receiver SACK report if present */
 1028                         if (tp->sack_enable && tp->rcv_numsacks)
 1029                                 tcp_clean_sackreport(tp);
 1030                         tcpstat_inc(tcps_preddat);
 1031                         tp->rcv_nxt += tlen;
 1032                         /* Pull snd_wl1 and rcv_up up to prevent seq wrap. */
 1033                         tp->snd_wl1 = th->th_seq;
 1034                         /* Packet has most recent segment, no urgent exists. */
 1035                         tp->rcv_up = tp->rcv_nxt;
 1036                         tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
 1037                         ND6_HINT(tp);
 1038 
 1039                         TCP_SETUP_ACK(tp, tiflags, m);
 1040                         /*
 1041                          * Drop TCP, IP headers and TCP options then add data
 1042                          * to socket buffer.
 1043                          */
 1044                         if (so->so_rcv.sb_state & SS_CANTRCVMORE)
 1045                                 m_freem(m);
 1046                         else {
 1047                                 if (tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
 1048                                     now - tp->rfbuf_ts > (tp->t_srtt >>
 1049                                     (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT))) {
 1050                                         tcp_update_rcvspace(tp);
 1051                                         /* Start over with next RTT. */
 1052                                         tp->rfbuf_cnt = 0;
 1053                                         tp->rfbuf_ts = 0;
 1054                                 } else
 1055                                         tp->rfbuf_cnt += tlen;
 1056                                 m_adj(m, iphlen + off);
 1057                                 sbappendstream(so, &so->so_rcv, m);
 1058                         }
 1059                         tp->t_flags |= TF_BLOCKOUTPUT;
 1060                         sorwakeup(so);
 1061                         tp->t_flags &= ~TF_BLOCKOUTPUT;
 1062                         if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
 1063                                 (void) tcp_output(tp);
 1064                         in_pcbunref(inp);
 1065                         return IPPROTO_DONE;
 1066                 }
 1067         }
 1068 
 1069         /*
 1070          * Compute mbuf offset to TCP data segment.
 1071          */
 1072         hdroptlen = iphlen + off;
 1073 
 1074         /*
 1075          * Calculate amount of space in receive window,
 1076          * and then do TCP input processing.
 1077          * Receive window is amount of space in rcv queue,
 1078          * but not less than advertised window.
 1079          */
 1080         { int win;
 1081 
 1082         win = sbspace(so, &so->so_rcv);
 1083         if (win < 0)
 1084                 win = 0;
 1085         tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 1086         }
 1087 
 1088         switch (tp->t_state) {
 1089 
 1090         /*
 1091          * If the state is SYN_RECEIVED:
 1092          *      if seg contains SYN/ACK, send an RST.
 1093          *      if seg contains an ACK, but not for our SYN/ACK, send an RST
 1094          */
 1095 
 1096         case TCPS_SYN_RECEIVED:
 1097                 if (tiflags & TH_ACK) {
 1098                         if (tiflags & TH_SYN) {
 1099                                 tcpstat_inc(tcps_badsyn);
 1100                                 goto dropwithreset;
 1101                         }
 1102                         if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
 1103                             SEQ_GT(th->th_ack, tp->snd_max))
 1104                                 goto dropwithreset;
 1105                 }
 1106                 break;
 1107 
 1108         /*
 1109          * If the state is SYN_SENT:
 1110          *      if seg contains an ACK, but not for our SYN, drop the input.
 1111          *      if seg contains a RST, then drop the connection.
 1112          *      if seg does not contain SYN, then drop it.
 1113          * Otherwise this is an acceptable SYN segment
 1114          *      initialize tp->rcv_nxt and tp->irs
 1115          *      if seg contains ack then advance tp->snd_una
 1116          *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
 1117          *      arrange for segment to be acked (eventually)
 1118          *      continue processing rest of data/controls, beginning with URG
 1119          */
 1120         case TCPS_SYN_SENT:
 1121                 if ((tiflags & TH_ACK) &&
 1122                     (SEQ_LEQ(th->th_ack, tp->iss) ||
 1123                      SEQ_GT(th->th_ack, tp->snd_max)))
 1124                         goto dropwithreset;
 1125                 if (tiflags & TH_RST) {
 1126 #ifdef TCP_ECN
 1127                         /* if ECN is enabled, fall back to non-ecn at rexmit */
 1128                         if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
 1129                                 goto drop;
 1130 #endif
 1131                         if (tiflags & TH_ACK)
 1132                                 tp = tcp_drop(tp, ECONNREFUSED);
 1133                         goto drop;
 1134                 }
 1135                 if ((tiflags & TH_SYN) == 0)
 1136                         goto drop;
 1137                 if (tiflags & TH_ACK) {
 1138                         tp->snd_una = th->th_ack;
 1139                         if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 1140                                 tp->snd_nxt = tp->snd_una;
 1141                 }
 1142                 TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1143                 tp->irs = th->th_seq;
 1144                 tcp_mss(tp, opti.maxseg);
 1145                 /* Reset initial window to 1 segment for retransmit */
 1146                 if (tp->t_rxtshift > 0)
 1147                         tp->snd_cwnd = tp->t_maxseg;
 1148                 tcp_rcvseqinit(tp);
 1149                 tp->t_flags |= TF_ACKNOW;
 1150                 /*
 1151                  * If we've sent a SACK_PERMITTED option, and the peer
 1152                  * also replied with one, then TF_SACK_PERMIT should have
 1153                  * been set in tcp_dooptions().  If it was not, disable SACKs.
 1154                  */
 1155                 if (tp->sack_enable)
 1156                         tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
 1157 #ifdef TCP_ECN
 1158                 /*
 1159                  * if ECE is set but CWR is not set for SYN-ACK, or
 1160                  * both ECE and CWR are set for simultaneous open,
 1161                  * peer is ECN capable.
 1162                  */
 1163                 if (tcp_do_ecn) {
 1164                         switch (tiflags & (TH_ACK|TH_ECE|TH_CWR)) {
 1165                         case TH_ACK|TH_ECE:
 1166                         case TH_ECE|TH_CWR:
 1167                                 tp->t_flags |= TF_ECN_PERMIT;
 1168                                 tiflags &= ~(TH_ECE|TH_CWR);
 1169                                 tcpstat_inc(tcps_ecn_accepts);
 1170                         }
 1171                 }
 1172 #endif
 1173 
 1174                 if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
 1175                         tcpstat_inc(tcps_connects);
 1176                         tp->t_flags |= TF_BLOCKOUTPUT;
 1177                         soisconnected(so);
 1178                         tp->t_flags &= ~TF_BLOCKOUTPUT;
 1179                         tp->t_state = TCPS_ESTABLISHED;
 1180                         TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcp_keepidle));
 1181                         /* Do window scaling on this connection? */
 1182                         if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 1183                                 (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 1184                                 tp->snd_scale = tp->requested_s_scale;
 1185                                 tp->rcv_scale = tp->request_r_scale;
 1186                         }
 1187                         tcp_flush_queue(tp);
 1188 
 1189                         /*
 1190                          * if we didn't have to retransmit the SYN,
 1191                          * use its rtt as our initial srtt & rtt var.
 1192                          */
 1193                         if (tp->t_rtttime)
 1194                                 tcp_xmit_timer(tp, now - tp->t_rtttime);
 1195                         /*
 1196                          * Since new data was acked (the SYN), open the
 1197                          * congestion window by one MSS.  We do this
 1198                          * here, because we won't go through the normal
 1199                          * ACK processing below.  And since this is the
 1200                          * start of the connection, we know we are in
 1201                          * the exponential phase of slow-start.
 1202                          */
 1203                         tp->snd_cwnd += tp->t_maxseg;
 1204                 } else
 1205                         tp->t_state = TCPS_SYN_RECEIVED;
 1206 
 1207 #if 0
 1208 trimthenstep6:
 1209 #endif
 1210                 /*
 1211                  * Advance th->th_seq to correspond to first data byte.
 1212                  * If data, trim to stay within window,
 1213                  * dropping FIN if necessary.
 1214                  */
 1215                 th->th_seq++;
 1216                 if (tlen > tp->rcv_wnd) {
 1217                         todrop = tlen - tp->rcv_wnd;
 1218                         m_adj(m, -todrop);
 1219                         tlen = tp->rcv_wnd;
 1220                         tiflags &= ~TH_FIN;
 1221                         tcpstat_pkt(tcps_rcvpackafterwin, tcps_rcvbyteafterwin,
 1222                             todrop);
 1223                 }
 1224                 tp->snd_wl1 = th->th_seq - 1;
 1225                 tp->rcv_up = th->th_seq;
 1226                 goto step6;
 1227         /*
 1228          * If a new connection request is received while in TIME_WAIT,
 1229          * drop the old connection and start over if the
 1230          * timestamp or the sequence numbers are above the previous
 1231          * ones.
 1232          */
 1233         case TCPS_TIME_WAIT:
 1234                 if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
 1235                     ((opti.ts_present &&
 1236                     TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
 1237                     SEQ_GT(th->th_seq, tp->rcv_nxt))) {
 1238 #if NPF > 0
 1239                         /*
 1240                          * The socket will be recreated but the new state
 1241                          * has already been linked to the socket.  Remove the
 1242                          * link between old socket and new state.
 1243                          */
 1244                         pf_inp_unlink(inp);
 1245 #endif
 1246                         /*
 1247                         * Advance the iss by at least 32768, but
 1248                         * clear the msb in order to make sure
 1249                         * that SEQ_LT(snd_nxt, iss).
 1250                         */
 1251                         iss = tp->snd_nxt +
 1252                             ((arc4random() & 0x7fffffff) | 0x8000);
 1253                         reuse = &iss;
 1254                         tp = tcp_close(tp);
 1255                         in_pcbunref(inp);
 1256                         inp = NULL;
 1257                         goto findpcb;
 1258                 }
 1259         }
 1260 
 1261         /*
 1262          * States other than LISTEN or SYN_SENT.
 1263          * First check timestamp, if present.
 1264          * Then check that at least some bytes of segment are within
 1265          * receive window.  If segment begins before rcv_nxt,
 1266          * drop leading data (and SYN); if nothing left, just ack.
 1267          *
 1268          * RFC 1323 PAWS: If we have a timestamp reply on this segment
 1269          * and it's less than tp->ts_recent, drop it.
 1270          */
 1271         if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
 1272             TSTMP_LT(opti.ts_val, tp->ts_recent)) {
 1273 
 1274                 /* Check to see if ts_recent is over 24 days old.  */
 1275                 if ((int)(now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
 1276                         /*
 1277                          * Invalidate ts_recent.  If this segment updates
 1278                          * ts_recent, the age will be reset later and ts_recent
 1279                          * will get a valid value.  If it does not, setting
 1280                          * ts_recent to zero will at least satisfy the
 1281                          * requirement that zero be placed in the timestamp
 1282                          * echo reply when ts_recent isn't valid.  The
 1283                          * age isn't reset until we get a valid ts_recent
 1284                          * because we don't want out-of-order segments to be
 1285                          * dropped when ts_recent is old.
 1286                          */
 1287                         tp->ts_recent = 0;
 1288                 } else {
 1289                         tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, tlen);
 1290                         tcpstat_inc(tcps_pawsdrop);
 1291                         if (tlen)
 1292                                 goto dropafterack;
 1293                         goto drop;
 1294                 }
 1295         }
 1296 
 1297         todrop = tp->rcv_nxt - th->th_seq;
 1298         if (todrop > 0) {
 1299                 if (tiflags & TH_SYN) {
 1300                         tiflags &= ~TH_SYN;
 1301                         th->th_seq++;
 1302                         if (th->th_urp > 1)
 1303                                 th->th_urp--;
 1304                         else
 1305                                 tiflags &= ~TH_URG;
 1306                         todrop--;
 1307                 }
 1308                 if (todrop > tlen ||
 1309                     (todrop == tlen && (tiflags & TH_FIN) == 0)) {
 1310                         /*
 1311                          * Any valid FIN must be to the left of the
 1312                          * window.  At this point, FIN must be a
 1313                          * duplicate or out-of-sequence, so drop it.
 1314                          */
 1315                         tiflags &= ~TH_FIN;
 1316                         /*
 1317                          * Send ACK to resynchronize, and drop any data,
 1318                          * but keep on processing for RST or ACK.
 1319                          */
 1320                         tp->t_flags |= TF_ACKNOW;
 1321                         todrop = tlen;
 1322                         tcpstat_pkt(tcps_rcvduppack, tcps_rcvdupbyte, todrop);
 1323                 } else {
 1324                         tcpstat_pkt(tcps_rcvpartduppack, tcps_rcvpartdupbyte,
 1325                             todrop);
 1326                 }
 1327                 hdroptlen += todrop;    /* drop from head afterwards */
 1328                 th->th_seq += todrop;
 1329                 tlen -= todrop;
 1330                 if (th->th_urp > todrop)
 1331                         th->th_urp -= todrop;
 1332                 else {
 1333                         tiflags &= ~TH_URG;
 1334                         th->th_urp = 0;
 1335                 }
 1336         }
 1337 
 1338         /*
 1339          * If new data are received on a connection after the
 1340          * user processes are gone, then RST the other end.
 1341          */
 1342         if ((so->so_state & SS_NOFDREF) &&
 1343             tp->t_state > TCPS_CLOSE_WAIT && tlen) {
 1344                 tp = tcp_close(tp);
 1345                 tcpstat_inc(tcps_rcvafterclose);
 1346                 goto dropwithreset;
 1347         }
 1348 
 1349         /*
 1350          * If segment ends after window, drop trailing data
 1351          * (and PUSH and FIN); if nothing left, just ACK.
 1352          */
 1353         todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
 1354         if (todrop > 0) {
 1355                 tcpstat_inc(tcps_rcvpackafterwin);
 1356                 if (todrop >= tlen) {
 1357                         tcpstat_add(tcps_rcvbyteafterwin, tlen);
 1358                         /*
 1359                          * If window is closed can only take segments at
 1360                          * window edge, and have to drop data and PUSH from
 1361                          * incoming segments.  Continue processing, but
 1362                          * remember to ack.  Otherwise, drop segment
 1363                          * and ack.
 1364                          */
 1365                         if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
 1366                                 tp->t_flags |= TF_ACKNOW;
 1367                                 tcpstat_inc(tcps_rcvwinprobe);
 1368                         } else
 1369                                 goto dropafterack;
 1370                 } else
 1371                         tcpstat_add(tcps_rcvbyteafterwin, todrop);
 1372                 m_adj(m, -todrop);
 1373                 tlen -= todrop;
 1374                 tiflags &= ~(TH_PUSH|TH_FIN);
 1375         }
 1376 
 1377         /*
 1378          * If last ACK falls within this segment's sequence numbers,
 1379          * record its timestamp if it's more recent.
 1380          * NOTE that the test is modified according to the latest
 1381          * proposal of the tcplw@cray.com list (Braden 1993/04/26).
 1382          */
 1383         if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
 1384             SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
 1385                 tp->ts_recent_age = now;
 1386                 tp->ts_recent = opti.ts_val;
 1387         }
 1388 
 1389         /*
 1390          * If the RST bit is set examine the state:
 1391          *    SYN_RECEIVED STATE:
 1392          *      If passive open, return to LISTEN state.
 1393          *      If active open, inform user that connection was refused.
 1394          *    ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
 1395          *      Inform user that connection was reset, and close tcb.
 1396          *    CLOSING, LAST_ACK, TIME_WAIT STATES
 1397          *      Close the tcb.
 1398          */
 1399         if (tiflags & TH_RST) {
 1400                 if (th->th_seq != tp->last_ack_sent &&
 1401                     th->th_seq != tp->rcv_nxt &&
 1402                     th->th_seq != (tp->rcv_nxt + 1))
 1403                         goto drop;
 1404 
 1405                 switch (tp->t_state) {
 1406                 case TCPS_SYN_RECEIVED:
 1407 #ifdef TCP_ECN
 1408                         /* if ECN is enabled, fall back to non-ecn at rexmit */
 1409                         if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
 1410                                 goto drop;
 1411 #endif
 1412                         so->so_error = ECONNREFUSED;
 1413                         goto close;
 1414 
 1415                 case TCPS_ESTABLISHED:
 1416                 case TCPS_FIN_WAIT_1:
 1417                 case TCPS_FIN_WAIT_2:
 1418                 case TCPS_CLOSE_WAIT:
 1419                         so->so_error = ECONNRESET;
 1420                 close:
 1421                         tp->t_state = TCPS_CLOSED;
 1422                         tcpstat_inc(tcps_drops);
 1423                         tp = tcp_close(tp);
 1424                         goto drop;
 1425                 case TCPS_CLOSING:
 1426                 case TCPS_LAST_ACK:
 1427                 case TCPS_TIME_WAIT:
 1428                         tp = tcp_close(tp);
 1429                         goto drop;
 1430                 }
 1431         }
 1432 
 1433         /*
 1434          * If a SYN is in the window, then this is an
 1435          * error and we ACK and drop the packet.
 1436          */
 1437         if (tiflags & TH_SYN)
 1438                 goto dropafterack_ratelim;
 1439 
 1440         /*
 1441          * If the ACK bit is off we drop the segment and return.
 1442          */
 1443         if ((tiflags & TH_ACK) == 0) {
 1444                 if (tp->t_flags & TF_ACKNOW)
 1445                         goto dropafterack;
 1446                 else
 1447                         goto drop;
 1448         }
 1449 
 1450         /*
 1451          * Ack processing.
 1452          */
 1453         switch (tp->t_state) {
 1454 
 1455         /*
 1456          * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
 1457          * ESTABLISHED state and continue processing.
 1458          * The ACK was checked above.
 1459          */
 1460         case TCPS_SYN_RECEIVED:
 1461                 tcpstat_inc(tcps_connects);
 1462                 tp->t_flags |= TF_BLOCKOUTPUT;
 1463                 soisconnected(so);
 1464                 tp->t_flags &= ~TF_BLOCKOUTPUT;
 1465                 tp->t_state = TCPS_ESTABLISHED;
 1466                 TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcp_keepidle));
 1467                 /* Do window scaling? */
 1468                 if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 1469                         (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 1470                         tp->snd_scale = tp->requested_s_scale;
 1471                         tp->rcv_scale = tp->request_r_scale;
 1472                         tiwin = th->th_win << tp->snd_scale;
 1473                 }
 1474                 tcp_flush_queue(tp);
 1475                 tp->snd_wl1 = th->th_seq - 1;
 1476                 /* fall into ... */
 1477 
 1478         /*
 1479          * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
 1480          * ACKs.  If the ack is in the range
 1481          *      tp->snd_una < th->th_ack <= tp->snd_max
 1482          * then advance tp->snd_una to th->th_ack and drop
 1483          * data from the retransmission queue.  If this ACK reflects
 1484          * more up to date window information we update our window information.
 1485          */
 1486         case TCPS_ESTABLISHED:
 1487         case TCPS_FIN_WAIT_1:
 1488         case TCPS_FIN_WAIT_2:
 1489         case TCPS_CLOSE_WAIT:
 1490         case TCPS_CLOSING:
 1491         case TCPS_LAST_ACK:
 1492         case TCPS_TIME_WAIT:
 1493 #ifdef TCP_ECN
 1494                 /*
 1495                  * if we receive ECE and are not already in recovery phase,
 1496                  * reduce cwnd by half but don't slow-start.
 1497                  * advance snd_last to snd_max not to reduce cwnd again
 1498                  * until all outstanding packets are acked.
 1499                  */
 1500                 if (tcp_do_ecn && (tiflags & TH_ECE)) {
 1501                         if ((tp->t_flags & TF_ECN_PERMIT) &&
 1502                             SEQ_GEQ(tp->snd_una, tp->snd_last)) {
 1503                                 u_int win;
 1504 
 1505                                 win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
 1506                                 if (win > 1) {
 1507                                         tp->snd_ssthresh = win / 2 * tp->t_maxseg;
 1508                                         tp->snd_cwnd = tp->snd_ssthresh;
 1509                                         tp->snd_last = tp->snd_max;
 1510                                         tp->t_flags |= TF_SEND_CWR;
 1511                                         tcpstat_inc(tcps_cwr_ecn);
 1512                                 }
 1513                         }
 1514                         tcpstat_inc(tcps_ecn_rcvece);
 1515                 }
 1516                 /*
 1517                  * if we receive CWR, we know that the peer has reduced
 1518                  * its congestion window.  stop sending ecn-echo.
 1519                  */
 1520                 if ((tiflags & TH_CWR)) {
 1521                         tp->t_flags &= ~TF_RCVD_CE;
 1522                         tcpstat_inc(tcps_ecn_rcvcwr);
 1523                 }
 1524 #endif /* TCP_ECN */
 1525 
 1526                 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
 1527                         /*
 1528                          * Duplicate/old ACK processing.
 1529                          * Increments t_dupacks:
 1530                          *      Pure duplicate (same seq/ack/window, no data)
 1531                          * Doesn't affect t_dupacks:
 1532                          *      Data packets.
 1533                          *      Normal window updates (window opens)
 1534                          * Resets t_dupacks:
 1535                          *      New data ACKed.
 1536                          *      Window shrinks
 1537                          *      Old ACK
 1538                          */
 1539                         if (tlen) {
 1540                                 /* Drop very old ACKs unless th_seq matches */
 1541                                 if (th->th_seq != tp->rcv_nxt &&
 1542                                    SEQ_LT(th->th_ack,
 1543                                    tp->snd_una - tp->max_sndwnd)) {
 1544                                         tcpstat_inc(tcps_rcvacktooold);
 1545                                         goto drop;
 1546                                 }
 1547                                 break;
 1548                         }
 1549                         /*
 1550                          * If we get an old ACK, there is probably packet
 1551                          * reordering going on.  Be conservative and reset
 1552                          * t_dupacks so that we are less aggressive in
 1553                          * doing a fast retransmit.
 1554                          */
 1555                         if (th->th_ack != tp->snd_una) {
 1556                                 tp->t_dupacks = 0;
 1557                                 break;
 1558                         }
 1559                         if (tiwin == tp->snd_wnd) {
 1560                                 tcpstat_inc(tcps_rcvdupack);
 1561                                 /*
 1562                                  * If we have outstanding data (other than
 1563                                  * a window probe), this is a completely
 1564                                  * duplicate ack (ie, window info didn't
 1565                                  * change), the ack is the biggest we've
 1566                                  * seen and we've seen exactly our rexmt
 1567                                  * threshold of them, assume a packet
 1568                                  * has been dropped and retransmit it.
 1569                                  * Kludge snd_nxt & the congestion
 1570                                  * window so we send only this one
 1571                                  * packet.
 1572                                  *
 1573                                  * We know we're losing at the current
 1574                                  * window size so do congestion avoidance
 1575                                  * (set ssthresh to half the current window
 1576                                  * and pull our congestion window back to
 1577                                  * the new ssthresh).
 1578                                  *
 1579                                  * Dup acks mean that packets have left the
 1580                                  * network (they're now cached at the receiver)
 1581                                  * so bump cwnd by the amount in the receiver
 1582                                  * to keep a constant cwnd's worth of packets
 1583                                  * in the network.
 1584                                  */
 1585                                 if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
 1586                                         tp->t_dupacks = 0;
 1587                                 else if (++tp->t_dupacks == tcprexmtthresh) {
 1588                                         tcp_seq onxt = tp->snd_nxt;
 1589                                         u_long win =
 1590                                             ulmin(tp->snd_wnd, tp->snd_cwnd) /
 1591                                                 2 / tp->t_maxseg;
 1592 
 1593                                         if (SEQ_LT(th->th_ack, tp->snd_last)){
 1594                                                 /*
 1595                                                  * False fast retx after
 1596                                                  * timeout.  Do not cut window.
 1597                                                  */
 1598                                                 tp->t_dupacks = 0;
 1599                                                 goto drop;
 1600                                         }
 1601                                         if (win < 2)
 1602                                                 win = 2;
 1603                                         tp->snd_ssthresh = win * tp->t_maxseg;
 1604                                         tp->snd_last = tp->snd_max;
 1605                                         if (tp->sack_enable) {
 1606                                                 TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1607                                                 tp->t_rtttime = 0;
 1608 #ifdef TCP_ECN
 1609                                                 tp->t_flags |= TF_SEND_CWR;
 1610 #endif
 1611                                                 tcpstat_inc(tcps_cwr_frecovery);
 1612                                                 tcpstat_inc(tcps_sack_recovery_episode);
 1613                                                 /*
 1614                                                  * tcp_output() will send
 1615                                                  * oldest SACK-eligible rtx.
 1616                                                  */
 1617                                                 (void) tcp_output(tp);
 1618                                                 tp->snd_cwnd = tp->snd_ssthresh+
 1619                                                    tp->t_maxseg * tp->t_dupacks;
 1620                                                 goto drop;
 1621                                         }
 1622                                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1623                                         tp->t_rtttime = 0;
 1624                                         tp->snd_nxt = th->th_ack;
 1625                                         tp->snd_cwnd = tp->t_maxseg;
 1626 #ifdef TCP_ECN
 1627                                         tp->t_flags |= TF_SEND_CWR;
 1628 #endif
 1629                                         tcpstat_inc(tcps_cwr_frecovery);
 1630                                         tcpstat_inc(tcps_sndrexmitfast);
 1631                                         (void) tcp_output(tp);
 1632 
 1633                                         tp->snd_cwnd = tp->snd_ssthresh +
 1634                                             tp->t_maxseg * tp->t_dupacks;
 1635                                         if (SEQ_GT(onxt, tp->snd_nxt))
 1636                                                 tp->snd_nxt = onxt;
 1637                                         goto drop;
 1638                                 } else if (tp->t_dupacks > tcprexmtthresh) {
 1639                                         tp->snd_cwnd += tp->t_maxseg;
 1640                                         (void) tcp_output(tp);
 1641                                         goto drop;
 1642                                 }
 1643                         } else if (tiwin < tp->snd_wnd) {
 1644                                 /*
 1645                                  * The window was retracted!  Previous dup
 1646                                  * ACKs may have been due to packets arriving
 1647                                  * after the shrunken window, not a missing
 1648                                  * packet, so play it safe and reset t_dupacks
 1649                                  */
 1650                                 tp->t_dupacks = 0;
 1651                         }
 1652                         break;
 1653                 }
 1654                 /*
 1655                  * If the congestion window was inflated to account
 1656                  * for the other side's cached packets, retract it.
 1657                  */
 1658                 if (tp->t_dupacks >= tcprexmtthresh) {
 1659                         /* Check for a partial ACK */
 1660                         if (SEQ_LT(th->th_ack, tp->snd_last)) {
 1661                                 if (tp->sack_enable)
 1662                                         tcp_sack_partialack(tp, th);
 1663                                 else
 1664                                         tcp_newreno_partialack(tp, th);
 1665                         } else {
 1666                                 /* Out of fast recovery */
 1667                                 tp->snd_cwnd = tp->snd_ssthresh;
 1668                                 if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
 1669                                     tp->snd_ssthresh)
 1670                                         tp->snd_cwnd =
 1671                                             tcp_seq_subtract(tp->snd_max,
 1672                                             th->th_ack);
 1673                                 tp->t_dupacks = 0;
 1674                         }
 1675                 } else {
 1676                         /*
 1677                          * Reset the duplicate ACK counter if we
 1678                          * were not in fast recovery.
 1679                          */
 1680                         tp->t_dupacks = 0;
 1681                 }
 1682                 if (SEQ_GT(th->th_ack, tp->snd_max)) {
 1683                         tcpstat_inc(tcps_rcvacktoomuch);
 1684                         goto dropafterack_ratelim;
 1685                 }
 1686                 acked = th->th_ack - tp->snd_una;
 1687                 tcpstat_pkt(tcps_rcvackpack, tcps_rcvackbyte, acked);
 1688                 tp->t_rcvacktime = now;
 1689 
 1690                 /*
 1691                  * If we have a timestamp reply, update smoothed
 1692                  * round trip time.  If no timestamp is present but
 1693                  * transmit timer is running and timed sequence
 1694                  * number was acked, update smoothed round trip time.
 1695                  * Since we now have an rtt measurement, cancel the
 1696                  * timer backoff (cf., Phil Karn's retransmit alg.).
 1697                  * Recompute the initial retransmit timer.
 1698                  */
 1699                 if (opti.ts_present && opti.ts_ecr)
 1700                         tcp_xmit_timer(tp, now - opti.ts_ecr);
 1701                 else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
 1702                         tcp_xmit_timer(tp, now - tp->t_rtttime);
 1703 
 1704                 /*
 1705                  * If all outstanding data is acked, stop retransmit
 1706                  * timer and remember to restart (more output or persist).
 1707                  * If there is more data to be acked, restart retransmit
 1708                  * timer, using current (possibly backed-off) value.
 1709                  */
 1710                 if (th->th_ack == tp->snd_max) {
 1711                         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 1712                         tp->t_flags |= TF_NEEDOUTPUT;
 1713                 } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
 1714                         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 1715                 /*
 1716                  * When new data is acked, open the congestion window.
 1717                  * If the window gives us less than ssthresh packets
 1718                  * in flight, open exponentially (maxseg per packet).
 1719                  * Otherwise open linearly: maxseg per window
 1720                  * (maxseg^2 / cwnd per packet).
 1721                  */
 1722                 {
 1723                 u_int cw = tp->snd_cwnd;
 1724                 u_int incr = tp->t_maxseg;
 1725 
 1726                 if (cw > tp->snd_ssthresh)
 1727                         incr = max(incr * incr / cw, 1);
 1728                 if (tp->t_dupacks < tcprexmtthresh)
 1729                         tp->snd_cwnd = ulmin(cw + incr,
 1730                             TCP_MAXWIN << tp->snd_scale);
 1731                 }
 1732                 ND6_HINT(tp);
 1733                 if (acked > so->so_snd.sb_cc) {
 1734                         if (tp->snd_wnd > so->so_snd.sb_cc)
 1735                                 tp->snd_wnd -= so->so_snd.sb_cc;
 1736                         else
 1737                                 tp->snd_wnd = 0;
 1738                         sbdrop(so, &so->so_snd, (int)so->so_snd.sb_cc);
 1739                         ourfinisacked = 1;
 1740                 } else {
 1741                         sbdrop(so, &so->so_snd, acked);
 1742                         if (tp->snd_wnd > acked)
 1743                                 tp->snd_wnd -= acked;
 1744                         else
 1745                                 tp->snd_wnd = 0;
 1746                         ourfinisacked = 0;
 1747                 }
 1748 
 1749                 tcp_update_sndspace(tp);
 1750                 if (sb_notify(so, &so->so_snd)) {
 1751                         tp->t_flags |= TF_BLOCKOUTPUT;
 1752                         sowwakeup(so);
 1753                         tp->t_flags &= ~TF_BLOCKOUTPUT;
 1754                 }
 1755 
 1756                 /*
 1757                  * If we had a pending ICMP message that referred to data
 1758                  * that have just been acknowledged, disregard the recorded
 1759                  * ICMP message.
 1760                  */
 1761                 if ((tp->t_flags & TF_PMTUD_PEND) &&
 1762                     SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
 1763                         tp->t_flags &= ~TF_PMTUD_PEND;
 1764 
 1765                 /*
 1766                  * Keep track of the largest chunk of data acknowledged
 1767                  * since last PMTU update
 1768                  */
 1769                 if (tp->t_pmtud_mss_acked < acked)
 1770                         tp->t_pmtud_mss_acked = acked;
 1771 
 1772                 tp->snd_una = th->th_ack;
 1773 #ifdef TCP_ECN
 1774                 /* sync snd_last with snd_una */
 1775                 if (SEQ_GT(tp->snd_una, tp->snd_last))
 1776                         tp->snd_last = tp->snd_una;
 1777 #endif
 1778                 if (SEQ_LT(tp->snd_nxt, tp->snd_una))
 1779                         tp->snd_nxt = tp->snd_una;
 1780 
 1781                 switch (tp->t_state) {
 1782 
 1783                 /*
 1784                  * In FIN_WAIT_1 STATE in addition to the processing
 1785                  * for the ESTABLISHED state if our FIN is now acknowledged
 1786                  * then enter FIN_WAIT_2.
 1787                  */
 1788                 case TCPS_FIN_WAIT_1:
 1789                         if (ourfinisacked) {
 1790                                 /*
 1791                                  * If we can't receive any more
 1792                                  * data, then closing user can proceed.
 1793                                  * Starting the timer is contrary to the
 1794                                  * specification, but if we don't get a FIN
 1795                                  * we'll hang forever.
 1796                                  */
 1797                                 if (so->so_rcv.sb_state & SS_CANTRCVMORE) {
 1798                                         tp->t_flags |= TF_BLOCKOUTPUT;
 1799                                         soisdisconnected(so);
 1800                                         tp->t_flags &= ~TF_BLOCKOUTPUT;
 1801                                         TCP_TIMER_ARM(tp, TCPT_2MSL,
 1802                                             TCP_TIME(tcp_maxidle));
 1803                                 }
 1804                                 tp->t_state = TCPS_FIN_WAIT_2;
 1805                         }
 1806                         break;
 1807 
 1808                 /*
 1809                  * In CLOSING STATE in addition to the processing for
 1810                  * the ESTABLISHED state if the ACK acknowledges our FIN
 1811                  * then enter the TIME-WAIT state, otherwise ignore
 1812                  * the segment.
 1813                  */
 1814                 case TCPS_CLOSING:
 1815                         if (ourfinisacked) {
 1816                                 tp->t_state = TCPS_TIME_WAIT;
 1817                                 tcp_canceltimers(tp);
 1818                                 TCP_TIMER_ARM(tp, TCPT_2MSL,
 1819                                     TCP_TIME(2 * TCPTV_MSL));
 1820                                 tp->t_flags |= TF_BLOCKOUTPUT;
 1821                                 soisdisconnected(so);
 1822                                 tp->t_flags &= ~TF_BLOCKOUTPUT;
 1823                         }
 1824                         break;
 1825 
 1826                 /*
 1827                  * In LAST_ACK, we may still be waiting for data to drain
 1828                  * and/or to be acked, as well as for the ack of our FIN.
 1829                  * If our FIN is now acknowledged, delete the TCB,
 1830                  * enter the closed state and return.
 1831                  */
 1832                 case TCPS_LAST_ACK:
 1833                         if (ourfinisacked) {
 1834                                 tp = tcp_close(tp);
 1835                                 goto drop;
 1836                         }
 1837                         break;
 1838 
 1839                 /*
 1840                  * In TIME_WAIT state the only thing that should arrive
 1841                  * is a retransmission of the remote FIN.  Acknowledge
 1842                  * it and restart the finack timer.
 1843                  */
 1844                 case TCPS_TIME_WAIT:
 1845                         TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(2 * TCPTV_MSL));
 1846                         goto dropafterack;
 1847                 }
 1848         }
 1849 
 1850 step6:
 1851         /*
 1852          * Update window information.
 1853          * Don't look at window if no ACK: TAC's send garbage on first SYN.
 1854          */
 1855         if ((tiflags & TH_ACK) &&
 1856             (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
 1857             (SEQ_LT(tp->snd_wl2, th->th_ack) ||
 1858             (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
 1859                 /* keep track of pure window updates */
 1860                 if (tlen == 0 &&
 1861                     tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
 1862                         tcpstat_inc(tcps_rcvwinupd);
 1863                 tp->snd_wnd = tiwin;
 1864                 tp->snd_wl1 = th->th_seq;
 1865                 tp->snd_wl2 = th->th_ack;
 1866                 if (tp->snd_wnd > tp->max_sndwnd)
 1867                         tp->max_sndwnd = tp->snd_wnd;
 1868                 tp->t_flags |= TF_NEEDOUTPUT;
 1869         }
 1870 
 1871         /*
 1872          * Process segments with URG.
 1873          */
 1874         if ((tiflags & TH_URG) && th->th_urp &&
 1875             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 1876                 /*
 1877                  * This is a kludge, but if we receive and accept
 1878                  * random urgent pointers, we'll crash in
 1879                  * soreceive.  It's hard to imagine someone
 1880                  * actually wanting to send this much urgent data.
 1881                  */
 1882                 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
 1883                         th->th_urp = 0;                 /* XXX */
 1884                         tiflags &= ~TH_URG;             /* XXX */
 1885                         goto dodata;                    /* XXX */
 1886                 }
 1887                 /*
 1888                  * If this segment advances the known urgent pointer,
 1889                  * then mark the data stream.  This should not happen
 1890                  * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
 1891                  * a FIN has been received from the remote side.
 1892                  * In these states we ignore the URG.
 1893                  *
 1894                  * According to RFC961 (Assigned Protocols),
 1895                  * the urgent pointer points to the last octet
 1896                  * of urgent data.  We continue, however,
 1897                  * to consider it to indicate the first octet
 1898                  * of data past the urgent section as the original
 1899                  * spec states (in one of two places).
 1900                  */
 1901                 if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
 1902                         tp->rcv_up = th->th_seq + th->th_urp;
 1903                         so->so_oobmark = so->so_rcv.sb_cc +
 1904                             (tp->rcv_up - tp->rcv_nxt) - 1;
 1905                         if (so->so_oobmark == 0)
 1906                                 so->so_rcv.sb_state |= SS_RCVATMARK;
 1907                         sohasoutofband(so);
 1908                         tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 1909                 }
 1910                 /*
 1911                  * Remove out-of-band data so it isn't presented to the user.
 1912                  * This can happen independent of advancing the URG pointer,
 1913                  * but if two URG's are pending at once, some out-of-band
 1914                  * data may creep in... ick.
 1915                  */
 1916                 if (th->th_urp <= (u_int16_t) tlen &&
 1917                     (so->so_options & SO_OOBINLINE) == 0)
 1918                         tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
 1919         } else
 1920                 /*
 1921                  * If no out of band data is expected,
 1922                  * pull receive urgent pointer along
 1923                  * with the receive window.
 1924                  */
 1925                 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
 1926                         tp->rcv_up = tp->rcv_nxt;
 1927 dodata:                                                 /* XXX */
 1928 
 1929         /*
 1930          * Process the segment text, merging it into the TCP sequencing queue,
 1931          * and arranging for acknowledgment of receipt if necessary.
 1932          * This process logically involves adjusting tp->rcv_wnd as data
 1933          * is presented to the user (this happens in tcp_usrreq.c,
 1934          * case PRU_RCVD).  If a FIN has already been received on this
 1935          * connection then we just ignore the text.
 1936          */
 1937         if ((tlen || (tiflags & TH_FIN)) &&
 1938             TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 1939                 tcp_seq laststart = th->th_seq;
 1940                 tcp_seq lastend = th->th_seq + tlen;
 1941 
 1942                 if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
 1943                     tp->t_state == TCPS_ESTABLISHED) {
 1944                         TCP_SETUP_ACK(tp, tiflags, m);
 1945                         tp->rcv_nxt += tlen;
 1946                         tiflags = th->th_flags & TH_FIN;
 1947                         tcpstat_pkt(tcps_rcvpack, tcps_rcvbyte, tlen);
 1948                         ND6_HINT(tp);
 1949                         if (so->so_rcv.sb_state & SS_CANTRCVMORE)
 1950                                 m_freem(m);
 1951                         else {
 1952                                 m_adj(m, hdroptlen);
 1953                                 sbappendstream(so, &so->so_rcv, m);
 1954                         }
 1955                         tp->t_flags |= TF_BLOCKOUTPUT;
 1956                         sorwakeup(so);
 1957                         tp->t_flags &= ~TF_BLOCKOUTPUT;
 1958                 } else {
 1959                         m_adj(m, hdroptlen);
 1960                         tiflags = tcp_reass(tp, th, m, &tlen);
 1961                         tp->t_flags |= TF_ACKNOW;
 1962                 }
 1963                 if (tp->sack_enable)
 1964                         tcp_update_sack_list(tp, laststart, lastend);
 1965 
 1966                 /*
 1967                  * variable len never referenced again in modern BSD,
 1968                  * so why bother computing it ??
 1969                  */
 1970 #if 0
 1971                 /*
 1972                  * Note the amount of data that peer has sent into
 1973                  * our window, in order to estimate the sender's
 1974                  * buffer size.
 1975                  */
 1976                 len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
 1977 #endif /* 0 */
 1978         } else {
 1979                 m_freem(m);
 1980                 tiflags &= ~TH_FIN;
 1981         }
 1982 
 1983         /*
 1984          * If FIN is received ACK the FIN and let the user know
 1985          * that the connection is closing.  Ignore a FIN received before
 1986          * the connection is fully established.
 1987          */
 1988         if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
 1989                 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
 1990                         tp->t_flags |= TF_BLOCKOUTPUT;
 1991                         socantrcvmore(so);
 1992                         tp->t_flags &= ~TF_BLOCKOUTPUT;
 1993                         tp->t_flags |= TF_ACKNOW;
 1994                         tp->rcv_nxt++;
 1995                 }
 1996                 switch (tp->t_state) {
 1997 
 1998                 /*
 1999                  * In ESTABLISHED STATE enter the CLOSE_WAIT state.
 2000                  */
 2001                 case TCPS_ESTABLISHED:
 2002                         tp->t_state = TCPS_CLOSE_WAIT;
 2003                         break;
 2004 
 2005                 /*
 2006                  * If still in FIN_WAIT_1 STATE FIN has not been acked so
 2007                  * enter the CLOSING state.
 2008                  */
 2009                 case TCPS_FIN_WAIT_1:
 2010                         tp->t_state = TCPS_CLOSING;
 2011                         break;
 2012 
 2013                 /*
 2014                  * In FIN_WAIT_2 state enter the TIME_WAIT state,
 2015                  * starting the time-wait timer, turning off the other
 2016                  * standard timers.
 2017                  */
 2018                 case TCPS_FIN_WAIT_2:
 2019                         tp->t_state = TCPS_TIME_WAIT;
 2020                         tcp_canceltimers(tp);
 2021                         TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(2 * TCPTV_MSL));
 2022                         tp->t_flags |= TF_BLOCKOUTPUT;
 2023                         soisdisconnected(so);
 2024                         tp->t_flags &= ~TF_BLOCKOUTPUT;
 2025                         break;
 2026 
 2027                 /*
 2028                  * In TIME_WAIT state restart the 2 MSL time_wait timer.
 2029                  */
 2030                 case TCPS_TIME_WAIT:
 2031                         TCP_TIMER_ARM(tp, TCPT_2MSL, TCP_TIME(2 * TCPTV_MSL));
 2032                         break;
 2033                 }
 2034         }
 2035         if (otp)
 2036                 tcp_trace(TA_INPUT, ostate, tp, otp, saveti, 0, tlen);
 2037 
 2038         /*
 2039          * Return any desired output.
 2040          */
 2041         if (tp->t_flags & (TF_ACKNOW|TF_NEEDOUTPUT))
 2042                 (void) tcp_output(tp);
 2043         in_pcbunref(inp);
 2044         return IPPROTO_DONE;
 2045 
 2046 badsyn:
 2047         /*
 2048          * Received a bad SYN.  Increment counters and dropwithreset.
 2049          */
 2050         tcpstat_inc(tcps_badsyn);
 2051         tp = NULL;
 2052         goto dropwithreset;
 2053 
 2054 dropafterack_ratelim:
 2055         if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
 2056             tcp_ackdrop_ppslim) == 0) {
 2057                 /* XXX stat */
 2058                 goto drop;
 2059         }
 2060         /* ...fall into dropafterack... */
 2061 
 2062 dropafterack:
 2063         /*
 2064          * Generate an ACK dropping incoming segment if it occupies
 2065          * sequence space, where the ACK reflects our state.
 2066          */
 2067         if (tiflags & TH_RST)
 2068                 goto drop;
 2069         m_freem(m);
 2070         tp->t_flags |= TF_ACKNOW;
 2071         (void) tcp_output(tp);
 2072         in_pcbunref(inp);
 2073         return IPPROTO_DONE;
 2074 
 2075 dropwithreset_ratelim:
 2076         /*
 2077          * We may want to rate-limit RSTs in certain situations,
 2078          * particularly if we are sending an RST in response to
 2079          * an attempt to connect to or otherwise communicate with
 2080          * a port for which we have no socket.
 2081          */
 2082         if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
 2083             tcp_rst_ppslim) == 0) {
 2084                 /* XXX stat */
 2085                 goto drop;
 2086         }
 2087         /* ...fall into dropwithreset... */
 2088 
 2089 dropwithreset:
 2090         /*
 2091          * Generate a RST, dropping incoming segment.
 2092          * Make ACK acceptable to originator of segment.
 2093          * Don't bother to respond to RST.
 2094          */
 2095         if (tiflags & TH_RST)
 2096                 goto drop;
 2097         if (tiflags & TH_ACK) {
 2098                 tcp_respond(tp, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack,
 2099                     TH_RST, m->m_pkthdr.ph_rtableid, now);
 2100         } else {
 2101                 if (tiflags & TH_SYN)
 2102                         tlen++;
 2103                 tcp_respond(tp, mtod(m, caddr_t), th, th->th_seq + tlen,
 2104                     (tcp_seq)0, TH_RST|TH_ACK, m->m_pkthdr.ph_rtableid, now);
 2105         }
 2106         m_freem(m);
 2107         in_pcbunref(inp);
 2108         return IPPROTO_DONE;
 2109 
 2110 drop:
 2111         /*
 2112          * Drop space held by incoming segment and return.
 2113          */
 2114         if (otp)
 2115                 tcp_trace(TA_DROP, ostate, tp, otp, saveti, 0, tlen);
 2116 
 2117         m_freem(m);
 2118         in_pcbunref(inp);
 2119         return IPPROTO_DONE;
 2120 }
 2121 
/*
 * Parse the option bytes of a received TCP segment.
 *
 *   tp        control block of the connection the segment belongs to
 *   cp, cnt   start of the option area and its length in bytes
 *   th        TCP header of the segment; th_flags gates the SYN-only options
 *   m, iphlen mbuf holding the packet and the IP header length; only
 *             referenced by the TCP_SIGNATURE code
 *   oi        out: receives maxseg and the timestamp fields
 *   rtableid  routing table id, used for the TCP_SIGNATURE SA lookup
 *   now       value recorded in tp->ts_recent_age when a SYN carries a
 *             timestamp
 *
 * MSS, window scale, SACK-permitted and the ts_recent update are only
 * honoured on a segment with TH_SYN set while TCPS_HAVERCVDSYN() is still
 * false for the connection.  SACK blocks are handed to tcp_sack_option().
 *
 * Returns 0 on success.  Returns -1 only when TCP_SIGNATURE is compiled
 * in and the signature checks fail.
 */
int
tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
    struct mbuf *m, int iphlen, struct tcp_opt_info *oi,
    u_int rtableid, uint32_t now)
{
        u_int16_t mss = 0;
        int opt, optlen;
#ifdef TCP_SIGNATURE
        caddr_t sigp = NULL;            /* payload of the signature option */
        struct tdb *tdb = NULL;         /* SA found for this peer, if any */
#endif /* TCP_SIGNATURE */

        /*
         * Walk the options.  Both "continue" and "break" inside the switch
         * below end up at the loop increment, which steps over the
         * current option using optlen.
         */
        for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
                opt = cp[0];
                if (opt == TCPOPT_EOL)
                        break;
                if (opt == TCPOPT_NOP)
                        optlen = 1;
                else {
                        /* Need a length byte, and it must fit in what is left. */
                        if (cnt < 2)
                                break;
                        optlen = cp[1];
                        if (optlen < 2 || optlen > cnt)
                                break;
                }
                switch (opt) {

                /* Unknown options are skipped. */
                default:
                        continue;

                case TCPOPT_MAXSEG:
                        if (optlen != TCPOLEN_MAXSEG)
                                continue;
                        if (!(th->th_flags & TH_SYN))
                                continue;
                        if (TCPS_HAVERCVDSYN(tp->t_state))
                                continue;
                        /* memcpy: the option payload may be unaligned. */
                        memcpy(&mss, cp + 2, sizeof(mss));
                        mss = ntohs(mss);
                        oi->maxseg = mss;
                        break;

                case TCPOPT_WINDOW:
                        if (optlen != TCPOLEN_WINDOW)
                                continue;
                        if (!(th->th_flags & TH_SYN))
                                continue;
                        if (TCPS_HAVERCVDSYN(tp->t_state))
                                continue;
                        tp->t_flags |= TF_RCVD_SCALE;
                        /* Clamp the peer's shift count to TCP_MAX_WINSHIFT. */
                        tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
                        break;

                case TCPOPT_TIMESTAMP:
                        if (optlen != TCPOLEN_TIMESTAMP)
                                continue;
                        /*
                         * The timestamp values are reported in *oi for
                         * every segment; only the tcpcb update below is
                         * restricted to the initial SYN.
                         */
                        oi->ts_present = 1;
                        memcpy(&oi->ts_val, cp + 2, sizeof(oi->ts_val));
                        oi->ts_val = ntohl(oi->ts_val);
                        memcpy(&oi->ts_ecr, cp + 6, sizeof(oi->ts_ecr));
                        oi->ts_ecr = ntohl(oi->ts_ecr);

                        if (!(th->th_flags & TH_SYN))
                                continue;
                        if (TCPS_HAVERCVDSYN(tp->t_state))
                                continue;
                        /*
                         * A timestamp received in a SYN makes
                         * it ok to send timestamp requests and replies.
                         */
                        tp->t_flags |= TF_RCVD_TSTMP;
                        tp->ts_recent = oi->ts_val;
                        tp->ts_recent_age = now;
                        break;

                case TCPOPT_SACK_PERMITTED:
                        if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED)
                                continue;
                        if (!(th->th_flags & TH_SYN))
                                continue;
                        if (TCPS_HAVERCVDSYN(tp->t_state))
                                continue;
                        /* MUST only be set on SYN */
                        tp->t_flags |= TF_SACK_PERMIT;
                        break;
                case TCPOPT_SACK:
                        /* tcp_sack_option() validates the option itself. */
                        tcp_sack_option(tp, th, cp, optlen);
                        break;
#ifdef TCP_SIGNATURE
                case TCPOPT_SIGNATURE:
                        if (optlen != TCPOLEN_SIGNATURE)
                                continue;

                        /*
                         * A repeated signature option must carry the same
                         * 16 bytes as the one seen earlier.
                         */
                        if (sigp && timingsafe_bcmp(sigp, cp + 2, 16))
                                goto bad;

                        sigp = cp + 2;
                        break;
#endif /* TCP_SIGNATURE */
                }
        }

#ifdef TCP_SIGNATURE
        if (tp->t_flags & TF_SIGNATURE) {
                union sockaddr_union src, dst;

                memset(&src, 0, sizeof(union sockaddr_union));
                memset(&dst, 0, sizeof(union sockaddr_union));

                /*
                 * Build the lookup key from the packet's addresses;
                 * a pf of 0 is handled like AF_INET.
                 */
                switch (tp->pf) {
                case 0:
                case AF_INET:
                        src.sa.sa_len = sizeof(struct sockaddr_in);
                        src.sa.sa_family = AF_INET;
                        src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
                        dst.sa.sa_len = sizeof(struct sockaddr_in);
                        dst.sa.sa_family = AF_INET;
                        dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
                        break;
#ifdef INET6
                case AF_INET6:
                        src.sa.sa_len = sizeof(struct sockaddr_in6);
                        src.sa.sa_family = AF_INET6;
                        src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
                        dst.sa.sa_len = sizeof(struct sockaddr_in6);
                        dst.sa.sa_family = AF_INET6;
                        dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
                        break;
#endif /* INET6 */
                }

                tdb = gettdbbysrcdst(rtable_l2(rtableid),
                    0, &src, &dst, IPPROTO_TCP);

                /*
                 * We don't have an SA for this peer, so we turn off
                 * TF_SIGNATURE on the listen socket
                 */
                if (tdb == NULL && tp->t_state == TCPS_LISTEN)
                        tp->t_flags &= ~TF_SIGNATURE;

        }

        /*
         * Reject the segment when exactly one of "signature option
         * present" and "TF_SIGNATURE set on the connection" holds.
         */
        if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
                tcpstat_inc(tcps_rcvbadsig);
                goto bad;
        }

        if (sigp) {
                char sig[16];

                if (tdb == NULL) {
                        tcpstat_inc(tcps_rcvbadsig);
                        goto bad;
                }

                /* Compute the signature into sig, then compare it. */
                if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0)
                        goto bad;

                if (timingsafe_bcmp(sig, sigp, 16)) {
                        tcpstat_inc(tcps_rcvbadsig);
                        goto bad;
                }

                tcpstat_inc(tcps_rcvgoodsig);
        }

        /* NOTE(review): tdb may still be NULL here; presumably tdb_unref() accepts that. */
        tdb_unref(tdb);
#endif /* TCP_SIGNATURE */

        return (0);

#ifdef TCP_SIGNATURE
 bad:
        /* Shared failure exit: release the SA reference and report an error. */
        tdb_unref(tdb);
#endif /* TCP_SIGNATURE */
        return (-1);
}
 2300 
 2301 u_long
 2302 tcp_seq_subtract(u_long a, u_long b)
 2303 {
 2304         return ((long)(a - b));
 2305 }
 2306 
 2307 /*
 2308  * This function is called upon receipt of new valid data (while not in header
 2309  * prediction mode), and it updates the ordered list of sacks.
 2310  */
 2311 void
 2312 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart,
 2313     tcp_seq rcv_lastend)
 2314 {
 2315         /*
 2316          * First reported block MUST be the most recent one.  Subsequent
 2317          * blocks SHOULD be in the order in which they arrived at the
 2318          * receiver.  These two conditions make the implementation fully
 2319          * compliant with RFC 2018.
 2320          */
 2321         int i, j = 0, count = 0, lastpos = -1;
 2322         struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
 2323 
 2324         /* First clean up current list of sacks */
 2325         for (i = 0; i < tp->rcv_numsacks; i++) {
 2326                 sack = tp->sackblks[i];
 2327                 if (sack.start == 0 && sack.end == 0) {
 2328                         count++; /* count = number of blocks to be discarded */
 2329                         continue;
 2330                 }
 2331                 if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
 2332                         tp->sackblks[i].start = tp->sackblks[i].end = 0;
 2333                         count++;
 2334                 } else {
 2335                         temp[j].start = tp->sackblks[i].start;
 2336                         temp[j++].end = tp->sackblks[i].end;
 2337                 }
 2338         }
 2339         tp->rcv_numsacks -= count;
 2340         if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
 2341                 tcp_clean_sackreport(tp);
 2342                 if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) {
 2343                         /* ==> need first sack block */
 2344                         tp->sackblks[0].start = rcv_laststart;
 2345                         tp->sackblks[0].end = rcv_lastend;
 2346                         tp->rcv_numsacks = 1;
 2347                 }
 2348                 return;
 2349         }
 2350         /* Otherwise, sack blocks are already present. */
 2351         for (i = 0; i < tp->rcv_numsacks; i++)
 2352                 tp->sackblks[i] = temp[i]; /* first copy back sack list */
 2353         if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend))
 2354                 return;     /* sack list remains unchanged */
 2355         /*
 2356          * From here, segment just received should be (part of) the 1st sack.
 2357          * Go through list, possibly coalescing sack block entries.
 2358          */
 2359         firstsack.start = rcv_laststart;
 2360         firstsack.end = rcv_lastend;
 2361         for (i = 0; i < tp->rcv_numsacks; i++) {
 2362                 sack = tp->sackblks[i];
 2363                 if (SEQ_LT(sack.end, firstsack.start) ||
 2364                     SEQ_GT(sack.start, firstsack.end))
 2365                         continue; /* no overlap */
 2366                 if (sack.start == firstsack.start && sack.end == firstsack.end){
 2367                         /*
 2368                          * identical block; delete it here since we will
 2369                          * move it to the front of the list.
 2370                          */
 2371                         tp->sackblks[i].start = tp->sackblks[i].end = 0;
 2372                         lastpos = i;    /* last posn with a zero entry */
 2373                         continue;
 2374                 }
 2375                 if (SEQ_LEQ(sack.start, firstsack.start))
 2376                         firstsack.start = sack.start; /* merge blocks */
 2377                 if (SEQ_GEQ(sack.end, firstsack.end))
 2378                         firstsack.end = sack.end;     /* merge blocks */
 2379                 tp->sackblks[i].start = tp->sackblks[i].end = 0;
 2380                 lastpos = i;    /* last posn with a zero entry */
 2381         }
 2382         if (lastpos != -1) {    /* at least one merge */
 2383                 for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
 2384                         sack = tp->sackblks[i];
 2385                         if (sack.start == 0 && sack.end == 0)
 2386                                 continue;
 2387                         temp[j++] = sack;
 2388                 }
 2389                 tp->rcv_numsacks = j; /* including first blk (added later) */
 2390                 for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
 2391                         tp->sackblks[i] = temp[i];
 2392         } else {        /* no merges -- shift sacks by 1 */
 2393                 if (tp->rcv_numsacks < MAX_SACK_BLKS)
 2394                         tp->rcv_numsacks++;
 2395                 for (i = tp->rcv_numsacks-1; i > 0; i--)
 2396                         tp->sackblks[i] = tp->sackblks[i-1];
 2397         }
 2398         tp->sackblks[0] = firstsack;
 2399         return;
 2400 }
 2401 
/*
 * Process the TCP SACK option.  tp->snd_holes is an ordered list
 * of holes (oldest to newest, in terms of the sequence space).
 *
 *   tp      control block whose hole list is updated
 *   th      TCP header of the segment carrying the option
 *   cp      start of the option; the SACK blocks begin at cp + 2
 *   optlen  total option length, including the two leading bytes
 *
 * Each acceptable block is matched against the hole list: holes it
 * covers are freed, holes it overlaps are trimmed or split, and a new
 * hole is appended when the block lies beyond tp->rcv_lastsack.
 * Processing of the option stops early (counted in tcps_sack_drop_opts)
 * when a hole cannot be allocated or TCP_SACKHOLE_LIMIT is reached.
 */
void
tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
{
        int tmp_olen;                   /* option bytes still to parse */
        u_char *tmp_cp;                 /* cursor into the SACK blocks */
        struct sackhole *cur, *p, *temp;

        if (!tp->sack_enable)
                return;
        /* SACK without ACK doesn't make sense. */
        if ((th->th_flags & TH_ACK) == 0)
                return;
        /* Make sure the ACK on this segment is in [snd_una, snd_max]. */
        if (SEQ_LT(th->th_ack, tp->snd_una) ||
            SEQ_GT(th->th_ack, tp->snd_max))
                return;
        /* The payload must be a non-empty whole number of SACK blocks. */
        if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
                return;
        /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
        tmp_cp = cp + 2;
        tmp_olen = optlen - 2;
        tcpstat_inc(tcps_sack_rcv_opts);
        if (tp->snd_numholes < 0)
                tp->snd_numholes = 0;
        /* t_maxseg is used as a divisor when computing dups below. */
        if (tp->t_maxseg == 0)
                panic("tcp_sack_option"); /* Should never happen */
        while (tmp_olen > 0) {
                struct sackblk sack;

                /* Fetch one block (start, end) in host byte order. */
                memcpy(&sack.start, tmp_cp, sizeof(tcp_seq));
                sack.start = ntohl(sack.start);
                memcpy(&sack.end, tmp_cp + sizeof(tcp_seq), sizeof(tcp_seq));
                sack.end = ntohl(sack.end);
                tmp_olen -= TCPOLEN_SACK;
                tmp_cp += TCPOLEN_SACK;
                if (SEQ_LEQ(sack.end, sack.start))
                        continue; /* bad SACK fields */
                if (SEQ_LEQ(sack.end, tp->snd_una))
                        continue; /* old block */
                /* When this ACK advances snd_una, skip blocks starting below it. */
                if (SEQ_GT(th->th_ack, tp->snd_una)) {
                        if (SEQ_LT(sack.start, th->th_ack))
                                continue;
                }
                /* Ignore blocks that claim data we have not sent. */
                if (SEQ_GT(sack.end, tp->snd_max))
                        continue;
                if (tp->snd_holes == NULL) { /* first hole */
                        tp->snd_holes = (struct sackhole *)
                            pool_get(&sackhl_pool, PR_NOWAIT);
                        if (tp->snd_holes == NULL) {
                                /* ENOBUFS, so ignore SACKed block for now */
                                goto dropped;
                        }
                        /* The hole spans from the ACK point to the block. */
                        cur = tp->snd_holes;
                        cur->start = th->th_ack;
                        cur->end = sack.start;
                        cur->rxmit = cur->start;
                        cur->next = NULL;
                        tp->snd_numholes = 1;
                        tp->rcv_lastsack = sack.end;
                        /*
                         * dups is at least one.  If more data has been
                         * SACKed, it can be greater than one.
                         */
                        cur->dups = min(tcprexmtthresh,
                            ((sack.end - cur->end)/tp->t_maxseg));
                        if (cur->dups < 1)
                                cur->dups = 1;
                        continue; /* with next sack block */
                }
                /* Go thru list of holes:  p = previous,  cur = current */
                p = cur = tp->snd_holes;
                while (cur) {
                        if (SEQ_LEQ(sack.end, cur->start))
                                /* SACKs data before the current hole */
                                break; /* no use going through more holes */
                        if (SEQ_GEQ(sack.start, cur->end)) {
                                /* SACKs data beyond the current hole */
                                cur->dups++;
                                /* Enough data SACKed past the hole: jump to the threshold. */
                                if (((sack.end - cur->end)/tp->t_maxseg) >=
                                    tcprexmtthresh)
                                        cur->dups = tcprexmtthresh;
                                p = cur;
                                cur = cur->next;
                                continue;
                        }
                        if (SEQ_LEQ(sack.start, cur->start)) {
                                /* Data acks at least the beginning of hole */
                                if (SEQ_GEQ(sack.end, cur->end)) {
                                        /* Acks entire hole, so delete hole */
                                        if (p != cur) {
                                                /* unlink from the middle or tail */
                                                p->next = cur->next;
                                                pool_put(&sackhl_pool, cur);
                                                cur = p->next;
                                        } else {
                                                /* cur is the list head: advance the head */
                                                cur = cur->next;
                                                pool_put(&sackhl_pool, p);
                                                p = cur;
                                                tp->snd_holes = p;
                                        }
                                        tp->snd_numholes--;
                                        continue;
                                }
                                /* otherwise, move start of hole forward */
                                cur->start = sack.end;
                                cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
                                p = cur;
                                cur = cur->next;
                                continue;
                        }
                        /* move end of hole backward */
                        if (SEQ_GEQ(sack.end, cur->end)) {
                                cur->end = sack.start;
                                cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
                                cur->dups++;
                                if (((sack.end - cur->end)/tp->t_maxseg) >=
                                    tcprexmtthresh)
                                        cur->dups = tcprexmtthresh;
                                p = cur;
                                cur = cur->next;
                                continue;
                        }
                        if (SEQ_LT(cur->start, sack.start) &&
                            SEQ_GT(cur->end, sack.end)) {
                                /*
                                 * ACKs some data in middle of a hole; need to
                                 * split current hole
                                 */
                                if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT)
                                        goto dropped;
                                temp = (struct sackhole *)
                                    pool_get(&sackhl_pool, PR_NOWAIT);
                                if (temp == NULL)
                                        goto dropped; /* ENOBUFS */
                                /* temp becomes the upper part of the split hole. */
                                temp->next = cur->next;
                                temp->start = sack.end;
                                temp->end = cur->end;
                                temp->dups = cur->dups;
                                temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
                                /* cur shrinks to the lower part. */
                                cur->end = sack.start;
                                cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
                                cur->dups++;
                                if (((sack.end - cur->end)/tp->t_maxseg) >=
                                        tcprexmtthresh)
                                        cur->dups = tcprexmtthresh;
                                cur->next = temp;
                                p = temp;
                                cur = p->next;
                                tp->snd_numholes++;
                        }
                }
                /* At this point, p points to the last hole on the list */
                if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
                        /*
                         * Need to append new hole at end.
                         * Last hole is p (and it's not NULL).
                         */
                        if (tp->snd_numholes >= TCP_SACKHOLE_LIMIT)
                                goto dropped;
                        temp = (struct sackhole *)
                            pool_get(&sackhl_pool, PR_NOWAIT);
                        if (temp == NULL)
                                goto dropped; /* ENOBUFS */
                        /* New hole covers the gap up to this block. */
                        temp->start = tp->rcv_lastsack;
                        temp->end = sack.start;
                        temp->dups = min(tcprexmtthresh,
                            ((sack.end - sack.start)/tp->t_maxseg));
                        if (temp->dups < 1)
                                temp->dups = 1;
                        temp->rxmit = temp->start;
                        temp->next = 0;
                        p->next = temp;
                        tp->rcv_lastsack = sack.end;
                        tp->snd_numholes++;
                }
        }
        return;
dropped:
        /* Remaining blocks of this option are not processed. */
        tcpstat_inc(tcps_sack_drop_opts);
}
 2586 
 2587 /*
 2588  * Delete stale (i.e, cumulatively ack'd) holes.  Hole is deleted only if
 2589  * it is completely acked; otherwise, tcp_sack_option(), called from
 2590  * tcp_dooptions(), will fix up the hole.
 2591  */
 2592 void
 2593 tcp_del_sackholes(struct tcpcb *tp, struct tcphdr *th)
 2594 {
 2595         if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
 2596                 /* max because this could be an older ack just arrived */
 2597                 tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
 2598                         th->th_ack : tp->snd_una;
 2599                 struct sackhole *cur = tp->snd_holes;
 2600                 struct sackhole *prev;
 2601                 while (cur)
 2602                         if (SEQ_LEQ(cur->end, lastack)) {
 2603                                 prev = cur;
 2604                                 cur = cur->next;
 2605                                 pool_put(&sackhl_pool, prev);
 2606                                 tp->snd_numholes--;
 2607                         } else if (SEQ_LT(cur->start, lastack)) {
 2608                                 cur->start = lastack;
 2609                                 if (SEQ_LT(cur->rxmit, cur->start))
 2610                                         cur->rxmit = cur->start;
 2611                                 break;
 2612                         } else
 2613                                 break;
 2614                 tp->snd_holes = cur;
 2615         }
 2616 }
 2617 
 2618 /*
 2619  * Delete all receiver-side SACK information.
 2620  */
 2621 void
 2622 tcp_clean_sackreport(struct tcpcb *tp)
 2623 {
 2624         int i;
 2625 
 2626         tp->rcv_numsacks = 0;
 2627         for (i = 0; i < MAX_SACK_BLKS; i++)
 2628                 tp->sackblks[i].start = tp->sackblks[i].end=0;
 2629 
 2630 }
 2631 
 2632 /*
 2633  * Partial ack handling within a sack recovery episode.  When a partial ack
 2634  * arrives, turn off retransmission timer, deflate the window, do not clear
 2635  * tp->t_dupacks.
 2636  */
 2637 void
 2638 tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
 2639 {
 2640         /* Turn off retx. timer (will start again next segment) */
 2641         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 2642         tp->t_rtttime = 0;
 2643         /*
 2644          * Partial window deflation.  This statement relies on the
 2645          * fact that tp->snd_una has not been updated yet.
 2646          */
 2647         if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
 2648                 tp->snd_cwnd -= th->th_ack - tp->snd_una;
 2649                 tp->snd_cwnd += tp->t_maxseg;
 2650         } else
 2651                 tp->snd_cwnd = tp->t_maxseg;
 2652         tp->snd_cwnd += tp->t_maxseg;
 2653         tp->t_flags |= TF_NEEDOUTPUT;
 2654 }
 2655 
 2656 /*
 2657  * Pull out of band byte out of a segment so
 2658  * it doesn't appear in the user's data queue.
 2659  * It is still reflected in the segment length for
 2660  * sequencing purposes.
 2661  */
 2662 void
 2663 tcp_pulloutofband(struct socket *so, u_int urgent, struct mbuf *m, int off)
 2664 {
 2665         int cnt = off + urgent - 1;
 2666 
 2667         while (cnt >= 0) {
 2668                 if (m->m_len > cnt) {
 2669                         char *cp = mtod(m, caddr_t) + cnt;
 2670                         struct tcpcb *tp = sototcpcb(so);
 2671 
 2672                         tp->t_iobc = *cp;
 2673                         tp->t_oobflags |= TCPOOB_HAVEDATA;
 2674                         memmove(cp, cp + 1, m->m_len - cnt - 1);
 2675                         m->m_len--;
 2676                         return;
 2677                 }
 2678                 cnt -= m->m_len;
 2679                 m = m->m_next;
 2680                 if (m == NULL)
 2681                         break;
 2682         }
 2683         panic("tcp_pulloutofband");
 2684 }
 2685 
 2686 /*
 2687  * Collect new round-trip time estimate
 2688  * and update averages and current timeout.
 2689  */
 2690 void
 2691 tcp_xmit_timer(struct tcpcb *tp, int rtt)
 2692 {
 2693         int delta, rttmin;
 2694 
 2695         if (rtt < 0)
 2696                 rtt = 0;
 2697         else if (rtt > TCP_RTT_MAX)
 2698                 rtt = TCP_RTT_MAX;
 2699 
 2700         tcpstat_inc(tcps_rttupdated);
 2701         if (tp->t_srtt != 0) {
 2702                 /*
 2703                  * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits
 2704                  * after the binary point (scaled by 4), whereas
 2705                  * srtt is stored as fixed point with 5 bits after the
 2706                  * binary point (i.e., scaled by 32).  The following magic
 2707                  * is equivalent to the smoothing algorithm in rfc793 with
 2708                  * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
 2709                  * point).
 2710                  */
 2711                 delta = (rtt << TCP_RTT_BASE_SHIFT) -
 2712                     (tp->t_srtt >> TCP_RTT_SHIFT);
 2713                 if ((tp->t_srtt += delta) <= 0)
 2714                         tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT;
 2715                 /*
 2716                  * We accumulate a smoothed rtt variance (actually, a
 2717                  * smoothed mean difference), then set the retransmit
 2718                  * timer to smoothed rtt + 4 times the smoothed variance.
 2719                  * rttvar is stored as fixed point with 4 bits after the
 2720                  * binary point (scaled by 16).  The following is
 2721                  * equivalent to rfc793 smoothing with an alpha of .75
 2722                  * (rttvar = rttvar*3/4 + |delta| / 4).  This replaces
 2723                  * rfc793's wired-in beta.
 2724                  */
 2725                 if (delta < 0)
 2726                         delta = -delta;
 2727                 delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
 2728                 if ((tp->t_rttvar += delta) <= 0)
 2729                         tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT;
 2730         } else {
 2731                 /*
 2732                  * No rtt measurement yet - use the unsmoothed rtt.
 2733                  * Set the variance to half the rtt (so our first
 2734                  * retransmit happens at 3*rtt).
 2735                  */
 2736                 tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
 2737                 tp->t_rttvar = (rtt + 1) <<
 2738                     (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1);
 2739         }
 2740         tp->t_rtttime = 0;
 2741         tp->t_rxtshift = 0;
 2742 
 2743         /*
 2744          * the retransmit should happen at rtt + 4 * rttvar.
 2745          * Because of the way we do the smoothing, srtt and rttvar
 2746          * will each average +1/2 tick of bias.  When we compute
 2747          * the retransmit timer, we want 1/2 tick of rounding and
 2748          * 1 extra tick because of +-1/2 tick uncertainty in the
 2749          * firing of the timer.  The bias will give us exactly the
 2750          * 1.5 tick we need.  But, because the bias is
 2751          * statistical, we have to test that we don't drop below
 2752          * the minimum feasible timer (which is 2 ticks).
 2753          */
 2754         rttmin = min(max(tp->t_rttmin, rtt + 2 * (TCP_TIME(1) / hz)),
 2755             TCPTV_REXMTMAX);
 2756         TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);
 2757 
 2758         /*
 2759          * We received an ack for a packet that wasn't retransmitted;
 2760          * it is probably safe to discard any error indications we've
 2761          * received recently.  This isn't quite right, but close enough
 2762          * for now (a route might have failed after we sent a segment,
 2763          * and the return path might not be symmetrical).
 2764          */
 2765         tp->t_softerror = 0;
 2766 }
 2767 
 2768 /*
 2769  * Determine a reasonable value for maxseg size.
 2770  * If the route is known, check route for mtu.
 2771  * If none, use an mss that can be handled on the outgoing
 2772  * interface without forcing IP to fragment; if bigger than
 2773  * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
 2774  * to utilize large mbufs.  If no route is found, route has no mtu,
 2775  * or the destination isn't local, use a default, hopefully conservative
 2776  * size (usually 512 or the default IP max size, but no more than the mtu
 2777  * of the interface), as we can't discover anything about intervening
 2778  * gateways or networks.  We also initialize the congestion/slow start
 2779  * window to be a single segment if the destination isn't local.
 2780  * While looking at the routing entry, we also initialize other path-dependent
 2781  * parameters from pre-set or cached values in the routing entry.
 2782  *
 2783  * Also take into account the space needed for options that we
 2784  * send regularly.  Make maxseg shorter by that amount to assure
 2785  * that we can send maxseg amount of data even when the options
 2786  * are present.  Store the upper limit of the length of options plus
 2787  * data in maxopd.
 2788  *
 2789  * NOTE: offer == -1 indicates that the maxseg size changed due to
 2790  * Path MTU discovery.
 2791  */
 2792 int
 2793 tcp_mss(struct tcpcb *tp, int offer)
 2794 {
 2795         struct rtentry *rt;
 2796         struct ifnet *ifp = NULL;
 2797         int mss, mssopt;
 2798         int iphlen;
 2799         struct inpcb *inp;
 2800 
 2801         inp = tp->t_inpcb;
 2802 
 2803         mssopt = mss = tcp_mssdflt;
 2804 
 2805         rt = in_pcbrtentry(inp);
 2806 
 2807         if (rt == NULL)
 2808                 goto out;
 2809 
 2810         ifp = if_get(rt->rt_ifidx);
 2811         if (ifp == NULL)
 2812                 goto out;
 2813 
 2814         switch (tp->pf) {
 2815 #ifdef INET6
 2816         case AF_INET6:
 2817                 iphlen = sizeof(struct ip6_hdr);
 2818                 break;
 2819 #endif
 2820         case AF_INET:
 2821                 iphlen = sizeof(struct ip);
 2822                 break;
 2823         default:
 2824                 /* the family does not support path MTU discovery */
 2825                 goto out;
 2826         }
 2827 
 2828         /*
 2829          * if there's an mtu associated with the route and we support
 2830          * path MTU discovery for the underlying protocol family, use it.
 2831          */
 2832         if (rt->rt_mtu) {
 2833                 /*
 2834                  * One may wish to lower MSS to take into account options,
 2835                  * especially security-related options.
 2836                  */
 2837                 if (tp->pf == AF_INET6 && rt->rt_mtu < IPV6_MMTU) {
 2838                         /*
 2839                          * RFC2460 section 5, last paragraph: if path MTU is
 2840                          * smaller than 1280, use 1280 as packet size and
 2841                          * attach fragment header.
 2842                          */
 2843                         mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
 2844                             sizeof(struct tcphdr);
 2845                 } else {
 2846                         mss = rt->rt_mtu - iphlen -
 2847                             sizeof(struct tcphdr);
 2848                 }
 2849         } else if (ifp->if_flags & IFF_LOOPBACK) {
 2850                 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
 2851         } else if (tp->pf == AF_INET) {
 2852                 if (ip_mtudisc)
 2853                         mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
 2854         }
 2855 #ifdef INET6
 2856         else if (tp->pf == AF_INET6) {
 2857                 /*
 2858                  * for IPv6, path MTU discovery is always turned on,
 2859                  * or the node must use packet size <= 1280.
 2860                  */
 2861                 mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
 2862         }
 2863 #endif /* INET6 */
 2864 
 2865         /* Calculate the value that we offer in TCPOPT_MAXSEG */
 2866         if (offer != -1) {
 2867                 mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
 2868                 mssopt = max(tcp_mssdflt, mssopt);
 2869         }
 2870  out:
 2871         if_put(ifp);
 2872         /*
 2873          * The current mss, t_maxseg, is initialized to the default value.
 2874          * If we compute a smaller value, reduce the current mss.
 2875          * If we compute a larger value, return it for use in sending
 2876          * a max seg size option, but don't store it for use
 2877          * unless we received an offer at least that large from peer.
 2878          *
 2879          * However, do not accept offers lower than the minimum of
 2880          * the interface MTU and 216.
 2881          */
 2882         if (offer > 0)
 2883                 tp->t_peermss = offer;
 2884         if (tp->t_peermss)
 2885                 mss = min(mss, max(tp->t_peermss, 216));
 2886 
 2887         /* sanity - at least max opt. space */
 2888         mss = max(mss, 64);
 2889 
 2890         /*
 2891          * maxopd stores the maximum length of data AND options
 2892          * in a segment; maxseg is the amount of data in a normal
 2893          * segment.  We need to store this value (maxopd) apart
 2894          * from maxseg, because now every segment carries options
 2895          * and thus we normally have somewhat less data in segments.
 2896          */
 2897         tp->t_maxopd = mss;
 2898 
 2899         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 2900             (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
 2901                 mss -= TCPOLEN_TSTAMP_APPA;
 2902 #ifdef TCP_SIGNATURE
 2903         if (tp->t_flags & TF_SIGNATURE)
 2904                 mss -= TCPOLEN_SIGLEN;
 2905 #endif
 2906 
 2907         if (offer == -1) {
 2908                 /* mss changed due to Path MTU discovery */
 2909                 tp->t_flags &= ~TF_PMTUD_PEND;
 2910                 tp->t_pmtud_mtu_sent = 0;
 2911                 tp->t_pmtud_mss_acked = 0;
 2912                 if (mss < tp->t_maxseg) {
 2913                         /*
 2914                          * Follow suggestion in RFC 2414 to reduce the
 2915                          * congestion window by the ratio of the old
 2916                          * segment size to the new segment size.
 2917                          */
 2918                         tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
 2919                             mss, mss);
 2920                 }
 2921         } else if (tcp_do_rfc3390 == 2) {
 2922                 /* increase initial window  */
 2923                 tp->snd_cwnd = ulmin(10 * mss, ulmax(2 * mss, 14600));
 2924         } else if (tcp_do_rfc3390) {
 2925                 /* increase initial window  */
 2926                 tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
 2927         } else
 2928                 tp->snd_cwnd = mss;
 2929 
 2930         tp->t_maxseg = mss;
 2931 
 2932         return (offer != -1 ? mssopt : mss);
 2933 }
 2934 
 2935 u_int
 2936 tcp_hdrsz(struct tcpcb *tp)
 2937 {
 2938         u_int hlen;
 2939 
 2940         switch (tp->pf) {
 2941 #ifdef INET6
 2942         case AF_INET6:
 2943                 hlen = sizeof(struct ip6_hdr);
 2944                 break;
 2945 #endif
 2946         case AF_INET:
 2947                 hlen = sizeof(struct ip);
 2948                 break;
 2949         default:
 2950                 hlen = 0;
 2951                 break;
 2952         }
 2953         hlen += sizeof(struct tcphdr);
 2954 
 2955         if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
 2956             (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
 2957                 hlen += TCPOLEN_TSTAMP_APPA;
 2958 #ifdef TCP_SIGNATURE
 2959         if (tp->t_flags & TF_SIGNATURE)
 2960                 hlen += TCPOLEN_SIGLEN;
 2961 #endif
 2962         return (hlen);
 2963 }
 2964 
 2965 /*
 2966  * Set connection variables based on the effective MSS.
 2967  * We are passed the TCPCB for the actual connection.  If we
 2968  * are the server, we are called by the compressed state engine
 2969  * when the 3-way handshake is complete.  If we are the client,
 2970  * we are called when we receive the SYN,ACK from the server.
 2971  *
 2972  * NOTE: The t_maxseg value must be initialized in the TCPCB
 2973  * before this routine is called!
 2974  */
 2975 void
 2976 tcp_mss_update(struct tcpcb *tp)
 2977 {
 2978         int mss;
 2979         u_long bufsize;
 2980         struct rtentry *rt;
 2981         struct socket *so;
 2982 
 2983         so = tp->t_inpcb->inp_socket;
 2984         mss = tp->t_maxseg;
 2985 
 2986         rt = in_pcbrtentry(tp->t_inpcb);
 2987 
 2988         if (rt == NULL)
 2989                 return;
 2990 
 2991         bufsize = so->so_snd.sb_hiwat;
 2992         if (bufsize < mss) {
 2993                 mss = bufsize;
 2994                 /* Update t_maxseg and t_maxopd */
 2995                 tcp_mss(tp, mss);
 2996         } else {
 2997                 bufsize = roundup(bufsize, mss);
 2998                 if (bufsize > sb_max)
 2999                         bufsize = sb_max;
 3000                 (void)sbreserve(so, &so->so_snd, bufsize);
 3001         }
 3002 
 3003         bufsize = so->so_rcv.sb_hiwat;
 3004         if (bufsize > mss) {
 3005                 bufsize = roundup(bufsize, mss);
 3006                 if (bufsize > sb_max)
 3007                         bufsize = sb_max;
 3008                 (void)sbreserve(so, &so->so_rcv, bufsize);
 3009         }
 3010 
 3011 }
 3012 
 3013 /*
 3014  * When a partial ack arrives, force the retransmission of the
 3015  * next unacknowledged segment.  Do not clear tp->t_dupacks.
 3016  * By setting snd_nxt to ti_ack, this forces retransmission timer
 3017  * to be started again.
 3018  */
 3019 void
 3020 tcp_newreno_partialack(struct tcpcb *tp, struct tcphdr *th)
 3021 {
 3022         /*
 3023          * snd_una has not been updated and the socket send buffer
 3024          * not yet drained of the acked data, so we have to leave
 3025          * snd_una as it was to get the correct data offset in
 3026          * tcp_output().
 3027          */
 3028         tcp_seq onxt = tp->snd_nxt;
 3029         u_long  ocwnd = tp->snd_cwnd;
 3030 
 3031         TCP_TIMER_DISARM(tp, TCPT_REXMT);
 3032         tp->t_rtttime = 0;
 3033         tp->snd_nxt = th->th_ack;
 3034         /*
 3035          * Set snd_cwnd to one segment beyond acknowledged offset
 3036          * (tp->snd_una not yet updated when this function is called)
 3037          */
 3038         tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
 3039         (void)tcp_output(tp);
 3040         tp->snd_cwnd = ocwnd;
 3041         if (SEQ_GT(onxt, tp->snd_nxt))
 3042                 tp->snd_nxt = onxt;
 3043         /*
 3044          * Partial window deflation.  Relies on fact that tp->snd_una
 3045          * not updated yet.
 3046          */
 3047         if (tp->snd_cwnd > th->th_ack - tp->snd_una)
 3048                 tp->snd_cwnd -= th->th_ack - tp->snd_una;
 3049         else
 3050                 tp->snd_cwnd = 0;
 3051         tp->snd_cwnd += tp->t_maxseg;
 3052 }
 3053 
 3054 int
 3055 tcp_mss_adv(struct mbuf *m, int af)
 3056 {
 3057         int mss = 0;
 3058         int iphlen;
 3059         struct ifnet *ifp = NULL;
 3060 
 3061         if (m && (m->m_flags & M_PKTHDR))
 3062                 ifp = if_get(m->m_pkthdr.ph_ifidx);
 3063 
 3064         switch (af) {
 3065         case AF_INET:
 3066                 if (ifp != NULL)
 3067                         mss = ifp->if_mtu;
 3068                 iphlen = sizeof(struct ip);
 3069                 break;
 3070 #ifdef INET6
 3071         case AF_INET6:
 3072                 if (ifp != NULL)
 3073                         mss = ifp->if_mtu;
 3074                 iphlen = sizeof(struct ip6_hdr);
 3075                 break;
 3076 #endif
 3077         default:
 3078                 unhandled_af(af);
 3079         }
 3080         if_put(ifp);
 3081         mss = mss - iphlen - sizeof(struct tcphdr);
 3082         return (max(mss, tcp_mssdflt));
 3083 }
 3084 
 3085 /*
 3086  * TCP compressed state engine.  Currently used to hold compressed
 3087  * state for SYN_RECEIVED.
 3088  */
 3089 
 3090 /* syn hash parameters */
 3091 int     tcp_syn_hash_size = TCP_SYN_HASH_SIZE;
 3092 int     tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
 3093 int     tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
 3094 int     tcp_syn_use_limit = 100000;
 3095 
 3096 struct syn_cache_set tcp_syn_cache[2];
 3097 int tcp_syn_cache_active;
 3098 
 3099 #define SYN_HASH(sa, sp, dp, rand) \
 3100         (((sa)->s_addr ^ (rand)[0]) *                           \
 3101         (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4]))
 3102 #ifndef INET6
 3103 #define SYN_HASHALL(hash, src, dst, rand) \
 3104 do {                                                                    \
 3105         hash = SYN_HASH(&satosin(src)->sin_addr,                        \
 3106                 satosin(src)->sin_port,                                 \
 3107                 satosin(dst)->sin_port, (rand));                        \
 3108 } while (/*CONSTCOND*/ 0)
 3109 #else
 3110 #define SYN_HASH6(sa, sp, dp, rand) \
 3111         (((sa)->s6_addr32[0] ^ (rand)[0]) *                     \
 3112         ((sa)->s6_addr32[1] ^ (rand)[1]) *                      \
 3113         ((sa)->s6_addr32[2] ^ (rand)[2]) *                      \
 3114         ((sa)->s6_addr32[3] ^ (rand)[3]) *                      \
 3115         (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp))) ^ (rand)[4]))
 3116 
 3117 #define SYN_HASHALL(hash, src, dst, rand) \
 3118 do {                                                                    \
 3119         switch ((src)->sa_family) {                                     \
 3120         case AF_INET:                                                   \
 3121                 hash = SYN_HASH(&satosin(src)->sin_addr,                \
 3122                         satosin(src)->sin_port,                         \
 3123                         satosin(dst)->sin_port, (rand));                \
 3124                 break;                                                  \
 3125         case AF_INET6:                                                  \
 3126                 hash = SYN_HASH6(&satosin6(src)->sin6_addr,             \
 3127                         satosin6(src)->sin6_port,                       \
 3128                         satosin6(dst)->sin6_port, (rand));              \
 3129                 break;                                                  \
 3130         default:                                                        \
 3131                 hash = 0;                                               \
 3132         }                                                               \
 3133 } while (/*CONSTCOND*/0)
 3134 #endif /* INET6 */
 3135 
 3136 void
 3137 syn_cache_rm(struct syn_cache *sc)
 3138 {
 3139         sc->sc_flags |= SCF_DEAD;
 3140         TAILQ_REMOVE(&sc->sc_buckethead->sch_bucket, sc, sc_bucketq);
 3141         sc->sc_tp = NULL;
 3142         LIST_REMOVE(sc, sc_tpq);
 3143         sc->sc_buckethead->sch_length--;
 3144         timeout_del(&sc->sc_timer);
 3145         sc->sc_set->scs_count--;
 3146 }
 3147 
 3148 void
 3149 syn_cache_put(struct syn_cache *sc)
 3150 {
 3151         m_free(sc->sc_ipopts);
 3152         if (sc->sc_route4.ro_rt != NULL) {
 3153                 rtfree(sc->sc_route4.ro_rt);
 3154                 sc->sc_route4.ro_rt = NULL;
 3155         }
 3156         timeout_set(&sc->sc_timer, syn_cache_reaper, sc);
 3157         timeout_add(&sc->sc_timer, 0);
 3158 }
 3159 
 3160 struct pool syn_cache_pool;
 3161 
 3162 /*
 3163  * We don't estimate RTT with SYNs, so each packet starts with the default
 3164  * RTT and each timer step has a fixed timeout value.
 3165  */
 3166 #define SYN_CACHE_TIMER_ARM(sc)                                         \
 3167 do {                                                                    \
 3168         TCPT_RANGESET((sc)->sc_rxtcur,                                  \
 3169             TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN, \
 3170             TCPTV_REXMTMAX);                                            \
 3171         if (!timeout_initialized(&(sc)->sc_timer))                      \
 3172                 timeout_set_proc(&(sc)->sc_timer, syn_cache_timer, (sc)); \
 3173         timeout_add_msec(&(sc)->sc_timer, (sc)->sc_rxtcur);             \
 3174 } while (/*CONSTCOND*/0)
 3175 
 3176 void
 3177 syn_cache_init(void)
 3178 {
 3179         int i;
 3180 
 3181         /* Initialize the hash buckets. */
 3182         tcp_syn_cache[0].scs_buckethead = mallocarray(tcp_syn_hash_size,
 3183             sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO);
 3184         tcp_syn_cache[1].scs_buckethead = mallocarray(tcp_syn_hash_size,
 3185             sizeof(struct syn_cache_head), M_SYNCACHE, M_WAITOK|M_ZERO);
 3186         tcp_syn_cache[0].scs_size = tcp_syn_hash_size;
 3187         tcp_syn_cache[1].scs_size = tcp_syn_hash_size;
 3188         for (i = 0; i < tcp_syn_hash_size; i++) {
 3189                 TAILQ_INIT(&tcp_syn_cache[0].scs_buckethead[i].sch_bucket);
 3190                 TAILQ_INIT(&tcp_syn_cache[1].scs_buckethead[i].sch_bucket);
 3191         }
 3192 
 3193         /* Initialize the syn cache pool. */
 3194         pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, IPL_SOFTNET,
 3195             0, "syncache", NULL);
 3196 }
 3197 
 3198 void
 3199 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
 3200 {
 3201         struct syn_cache_set *set = &tcp_syn_cache[tcp_syn_cache_active];
 3202         struct syn_cache_head *scp;
 3203         struct syn_cache *sc2;
 3204         int i;
 3205 
 3206         NET_ASSERT_LOCKED();
 3207 
 3208         /*
 3209          * If there are no entries in the hash table, reinitialize
 3210          * the hash secrets.  To avoid useless cache swaps and
 3211          * reinitialization, use it until the limit is reached.
 3212          * An empty cache is also the opportunity to resize the hash.
 3213          */
 3214         if (set->scs_count == 0 && set->scs_use <= 0) {
 3215                 set->scs_use = tcp_syn_use_limit;
 3216                 if (set->scs_size != tcp_syn_hash_size) {
 3217                         scp = mallocarray(tcp_syn_hash_size, sizeof(struct
 3218                             syn_cache_head), M_SYNCACHE, M_NOWAIT|M_ZERO);
 3219                         if (scp == NULL) {
 3220                                 /* Try again next time. */
 3221                                 set->scs_use = 0;
 3222                         } else {
 3223                                 free(set->scs_buckethead, M_SYNCACHE,
 3224                                     set->scs_size *
 3225                                     sizeof(struct syn_cache_head));
 3226                                 set->scs_buckethead = scp;
 3227                                 set->scs_size = tcp_syn_hash_size;
 3228                                 for (i = 0; i < tcp_syn_hash_size; i++)
 3229                                         TAILQ_INIT(&scp[i].sch_bucket);
 3230                         }
 3231                 }
 3232                 arc4random_buf(set->scs_random, sizeof(set->scs_random));
 3233                 tcpstat_inc(tcps_sc_seedrandom);
 3234         }
 3235 
 3236         SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa,
 3237             set->scs_random);
 3238         scp = &set->scs_buckethead[sc->sc_hash % set->scs_size];
 3239         sc->sc_buckethead = scp;
 3240 
 3241         /*
 3242          * Make sure that we don't overflow the per-bucket
 3243          * limit or the total cache size limit.
 3244          */
 3245         if (scp->sch_length >= tcp_syn_bucket_limit) {
 3246                 tcpstat_inc(tcps_sc_bucketoverflow);
 3247                 /*
 3248                  * Someone might attack our bucket hash function.  Reseed
 3249                  * with random as soon as the passive syn cache gets empty.
 3250                  */
 3251                 set->scs_use = 0;
 3252                 /*
 3253                  * The bucket is full.  Toss the oldest element in the
 3254                  * bucket.  This will be the first entry in the bucket.
 3255                  */
 3256                 sc2 = TAILQ_FIRST(&scp->sch_bucket);
 3257 #ifdef DIAGNOSTIC
 3258                 /*
 3259                  * This should never happen; we should always find an
 3260                  * entry in our bucket.
 3261                  */
 3262                 if (sc2 == NULL)
 3263                         panic("%s: bucketoverflow: impossible", __func__);
 3264 #endif
 3265                 syn_cache_rm(sc2);
 3266                 syn_cache_put(sc2);
 3267         } else if (set->scs_count >= tcp_syn_cache_limit) {
 3268                 struct syn_cache_head *scp2, *sce;
 3269 
 3270                 tcpstat_inc(tcps_sc_overflowed);
 3271                 /*
 3272                  * The cache is full.  Toss the oldest entry in the
 3273                  * first non-empty bucket we can find.
 3274                  *
 3275                  * XXX We would really like to toss the oldest
 3276                  * entry in the cache, but we hope that this
 3277                  * condition doesn't happen very often.
 3278                  */
 3279                 scp2 = scp;
 3280                 if (TAILQ_EMPTY(&scp2->sch_bucket)) {
 3281                         sce = &set->scs_buckethead[set->scs_size];
 3282                         for (++scp2; scp2 != scp; scp2++) {
 3283                                 if (scp2 >= sce)
 3284                                         scp2 = &set->scs_buckethead[0];
 3285                                 if (! TAILQ_EMPTY(&scp2->sch_bucket))
 3286                                         break;
 3287                         }
 3288 #ifdef DIAGNOSTIC
 3289                         /*
 3290                          * This should never happen; we should always find a
 3291                          * non-empty bucket.
 3292                          */
 3293                         if (scp2 == scp)
 3294                                 panic("%s: cacheoverflow: impossible",
 3295                                     __func__);
 3296 #endif
 3297                 }
 3298                 sc2 = TAILQ_FIRST(&scp2->sch_bucket);
 3299                 syn_cache_rm(sc2);
 3300                 syn_cache_put(sc2);
 3301         }
 3302 
 3303         /*
 3304          * Initialize the entry's timer.
 3305          */
 3306         sc->sc_rxttot = 0;
 3307         sc->sc_rxtshift = 0;
 3308         SYN_CACHE_TIMER_ARM(sc);
 3309 
 3310         /* Link it from tcpcb entry */
 3311         LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
 3312 
 3313         /* Put it into the bucket. */
 3314         TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
 3315         scp->sch_length++;
 3316         sc->sc_set = set;
 3317         set->scs_count++;
 3318         set->scs_use--;
 3319 
 3320         tcpstat_inc(tcps_sc_added);
 3321 
 3322         /*
 3323          * If the active cache has exceeded its use limit and
 3324          * the passive syn cache is empty, exchange their roles.
 3325          */
 3326         if (set->scs_use <= 0 &&
 3327             tcp_syn_cache[!tcp_syn_cache_active].scs_count == 0)
 3328                 tcp_syn_cache_active = !tcp_syn_cache_active;
 3329 }
 3330 
 3331 /*
 3332  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
 3333  * If we have retransmitted an entry the maximum number of times, expire
 3334  * that entry.
 3335  */
 3336 void
 3337 syn_cache_timer(void *arg)
 3338 {
 3339         struct syn_cache *sc = arg;
 3340         uint32_t now;
 3341 
 3342         NET_LOCK();
 3343         if (sc->sc_flags & SCF_DEAD)
 3344                 goto out;
 3345 
 3346         now = tcp_now();
 3347 
 3348         if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
 3349                 /* Drop it -- too many retransmissions. */
 3350                 goto dropit;
 3351         }
 3352 
 3353         /*
 3354          * Compute the total amount of time this entry has
 3355          * been on a queue.  If this entry has been on longer
 3356          * than the keep alive timer would allow, expire it.
 3357          */
 3358         sc->sc_rxttot += sc->sc_rxtcur;
 3359         if (sc->sc_rxttot >= TCP_TIME(tcptv_keep_init))
 3360                 goto dropit;
 3361 
 3362         tcpstat_inc(tcps_sc_retransmitted);
 3363         (void) syn_cache_respond(sc, NULL, now);
 3364 
 3365         /* Advance the timer back-off. */
 3366         sc->sc_rxtshift++;
 3367         SYN_CACHE_TIMER_ARM(sc);
 3368 
 3369  out:
 3370         NET_UNLOCK();
 3371         return;
 3372 
 3373  dropit:
 3374         tcpstat_inc(tcps_sc_timed_out);
 3375         syn_cache_rm(sc);
 3376         syn_cache_put(sc);
 3377         NET_UNLOCK();
 3378 }
 3379 
 3380 void
 3381 syn_cache_reaper(void *arg)
 3382 {
 3383         struct syn_cache *sc = arg;
 3384 
 3385         pool_put(&syn_cache_pool, (sc));
 3386         return;
 3387 }
 3388 
 3389 /*
 3390  * Remove syn cache created by the specified tcb entry,
 3391  * because this does not make sense to keep them
 3392  * (if there's no tcb entry, syn cache entry will never be used)
 3393  */
 3394 void
 3395 syn_cache_cleanup(struct tcpcb *tp)
 3396 {
 3397         struct syn_cache *sc, *nsc;
 3398 
 3399         NET_ASSERT_LOCKED();
 3400 
 3401         LIST_FOREACH_SAFE(sc, &tp->t_sc, sc_tpq, nsc) {
 3402 #ifdef DIAGNOSTIC
 3403                 if (sc->sc_tp != tp)
 3404                         panic("invalid sc_tp in syn_cache_cleanup");
 3405 #endif
 3406                 syn_cache_rm(sc);
 3407                 syn_cache_put(sc);
 3408         }
 3409         /* just for safety */
 3410         LIST_INIT(&tp->t_sc);
 3411 }
 3412 
 3413 /*
 3414  * Find an entry in the syn cache.
 3415  */
 3416 struct syn_cache *
 3417 syn_cache_lookup(struct sockaddr *src, struct sockaddr *dst,
 3418     struct syn_cache_head **headp, u_int rtableid)
 3419 {
 3420         struct syn_cache_set *sets[2];
 3421         struct syn_cache *sc;
 3422         struct syn_cache_head *scp;
 3423         u_int32_t hash;
 3424         int i;
 3425 
 3426         NET_ASSERT_LOCKED();
 3427 
 3428         /* Check the active cache first, the passive cache is likely empty. */
 3429         sets[0] = &tcp_syn_cache[tcp_syn_cache_active];
 3430         sets[1] = &tcp_syn_cache[!tcp_syn_cache_active];
 3431         for (i = 0; i < 2; i++) {
 3432                 if (sets[i]->scs_count == 0)
 3433                         continue;
 3434                 SYN_HASHALL(hash, src, dst, sets[i]->scs_random);
 3435                 scp = &sets[i]->scs_buckethead[hash % sets[i]->scs_size];
 3436                 *headp = scp;
 3437                 TAILQ_FOREACH(sc, &scp->sch_bucket, sc_bucketq) {
 3438                         if (sc->sc_hash != hash)
 3439                                 continue;
 3440                         if (!bcmp(&sc->sc_src, src, src->sa_len) &&
 3441                             !bcmp(&sc->sc_dst, dst, dst->sa_len) &&
 3442                             rtable_l2(rtableid) == rtable_l2(sc->sc_rtableid))
 3443                                 return (sc);
 3444                 }
 3445         }
 3446         return (NULL);
 3447 }
 3448 
 3449 /*
 3450  * This function gets called when we receive an ACK for a
 3451  * socket in the LISTEN state.  We look up the connection
 3452  * in the syn cache, and if it's there, we pull it out of
 3453  * the cache and turn it into a full-blown connection in
 3454  * the SYN-RECEIVED state.
 3455  *
       * src and dst are the lookup key for the cache entry, th is the TCP
       * header of the received segment, so is the listening socket, m is
       * the received packet and now is the timestamp passed on to
       * syn_cache_respond() and tcp_respond().  hlen and tlen are not
       * referenced by this function.
       *
 3456  * The return values may not be immediately obvious, and their effects
 3457  * can be subtle, so here they are:
 3458  *
 3459  *      NULL    SYN was not found in cache; caller should drop the
 3460  *              packet and send an RST.
 3461  *
 3462  *      -1      We were unable to create the new connection, and are
 3463  *              aborting it.  An ACK,RST is being sent to the peer
 3464  *              (unless we got screwy sequence numbers; see below),
 3465  *              because the 3-way handshake has been completed.  Caller
 3466  *              should not free the mbuf, since we may be using it.  If
 3467  *              we are not, we will free it.
 3468  *
 3469  *      Otherwise, the return value is a pointer to the new socket
 3470  *      associated with the connection.
 3471  */
 3472 struct socket *
 3473 syn_cache_get(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
 3474     u_int hlen, u_int tlen, struct socket *so, struct mbuf *m, uint32_t now)
 3475 {
 3476         struct syn_cache *sc;
 3477         struct syn_cache_head *scp;
 3478         struct inpcb *inp, *oldinp;
 3479         struct tcpcb *tp = NULL;
 3480         struct mbuf *am;
 3481         struct socket *oso;
 3482 
 3483         NET_ASSERT_LOCKED();
 3484 
 3485         sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
 3486         if (sc == NULL)
 3487                 return (NULL);
 3488 
 3489         /*
 3490          * Verify the sequence and ack numbers.  Try getting the correct
 3491          * response again.
               *
               * The entry stays in the cache on this path; m is consumed by
               * syn_cache_respond(), which frees it before building the reply.
 3492          */
 3493         if ((th->th_ack != sc->sc_iss + 1) ||
 3494             SEQ_LEQ(th->th_seq, sc->sc_irs) ||
 3495             SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
 3496                 (void) syn_cache_respond(sc, m, now);
 3497                 return ((struct socket *)(-1));
 3498         }
 3499 
 3500         /* Remove this cache entry */
 3501         syn_cache_rm(sc);
 3502 
 3503         /*
 3504          * Ok, create the full blown connection, and set things up
 3505          * as they would have been set up if we had created the
 3506          * connection when the SYN arrived.  If we can't create
 3507          * the connection, abort it.
 3508          */
              /* Keep the listening socket in oso; so is the new socket from here on. */
 3509         oso = so;
 3510         so = sonewconn(so, SS_ISCONNECTED, M_DONTWAIT);
 3511         if (so == NULL)
 3512                 goto resetandabort;
 3513 
 3514         oldinp = sotoinpcb(oso);
 3515         inp = sotoinpcb(so);
 3516 
 3517 #ifdef IPSEC
 3518         /*
 3519          * We need to copy the required security levels
 3520          * from the old pcb. Ditto for any other
 3521          * IPsec-related information.
 3522          */
 3523         memcpy(inp->inp_seclevel, oldinp->inp_seclevel,
 3524             sizeof(oldinp->inp_seclevel));
 3525 #endif /* IPSEC */
 3526 #ifdef INET6
 3527         /*
 3528          * inp still has the OLD in_pcb stuff, set the
 3529          * v6-related flags on the new guy, too.
 3530          */
 3531         inp->inp_flags |= (oldinp->inp_flags & INP_IPV6);
 3532         if (inp->inp_flags & INP_IPV6) {
 3533                 inp->inp_ipv6.ip6_hlim = oldinp->inp_ipv6.ip6_hlim;
 3534                 inp->inp_hops = oldinp->inp_hops;
 3535         } else
 3536 #endif /* INET6 */
 3537         {
 3538                 inp->inp_ip.ip_ttl = oldinp->inp_ip.ip_ttl;
 3539         }
 3540 
              /*
               * With pf compiled in, a diverted packet takes its routing
               * table from the divert state; otherwise (and always without
               * pf) the assignment after the #endif is used.
               */
 3541 #if NPF > 0
 3542         if (m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
 3543                 struct pf_divert *divert;
 3544 
 3545                 divert = pf_find_divert(m);
 3546                 KASSERT(divert != NULL);
 3547                 inp->inp_rtableid = divert->rdomain;
 3548         } else
 3549 #endif
 3550         /* inherit rtable from listening socket */
 3551         inp->inp_rtableid = sc->sc_rtableid;
 3552 
              /* Bind the new pcb to the local port and address the SYN was sent to. */
 3553         inp->inp_lport = th->th_dport;
 3554         switch (src->sa_family) {
 3555 #ifdef INET6
 3556         case AF_INET6:
 3557                 inp->inp_laddr6 = satosin6(dst)->sin6_addr;
 3558                 break;
 3559 #endif /* INET6 */
 3560         case AF_INET:
 3561                 inp->inp_laddr = satosin(dst)->sin_addr;
                      /*
                       * Prefer options from ip_srcroute() on this packet; if
                       * there are none, take over the options saved in the
                       * cache entry and clear the entry's pointer.
                       */
 3562                 inp->inp_options = ip_srcroute(m);
 3563                 if (inp->inp_options == NULL) {
 3564                         inp->inp_options = sc->sc_ipopts;
 3565                         sc->sc_ipopts = NULL;
 3566                 }
 3567                 break;
 3568         }
 3569         in_pcbrehash(inp);
 3570 
 3571         /*
 3572          * Give the new socket our cached route reference.
 3573          */
 3574         if (src->sa_family == AF_INET)
 3575                 inp->inp_route = sc->sc_route4;         /* struct assignment */
 3576 #ifdef INET6
 3577         else
 3578                 inp->inp_route6 = sc->sc_route6;
 3579 #endif
              /*
               * The route now belongs to inp, so drop the entry's pointer.
               * NOTE(review): this clears sc_route4 for IPv6 entries as well;
               * presumably sc_route4 and sc_route6 share storage -- confirm.
               */
 3580         sc->sc_route4.ro_rt = NULL;
 3581 
              /* in_pcbconnect() takes the peer address in an mbuf. */
 3582         am = m_get(M_DONTWAIT, MT_SONAME);      /* XXX */
 3583         if (am == NULL)
 3584                 goto resetandabort;
 3585         am->m_len = src->sa_len;
 3586         memcpy(mtod(am, caddr_t), src, src->sa_len);
 3587         if (in_pcbconnect(inp, am)) {
 3588                 (void) m_free(am);
 3589                 goto resetandabort;
 3590         }
 3591         (void) m_free(am);
 3592 
 3593         tp = intotcpcb(inp);
              /* Only TF_NOPUSH and TF_NODELAY are inherited from the listener. */
 3594         tp->t_flags = sototcpcb(oso)->t_flags & (TF_NOPUSH|TF_NODELAY);
              /* 15 is what syn_cache_add() stores when no window scaling was agreed. */
 3595         if (sc->sc_request_r_scale != 15) {
 3596                 tp->requested_s_scale = sc->sc_requested_s_scale;
 3597                 tp->request_r_scale = sc->sc_request_r_scale;
 3598                 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
 3599         }
 3600         if (sc->sc_flags & SCF_TIMESTAMP)
 3601                 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
 3602 
 3603         tp->t_template = tcp_template(tp);
 3604         if (tp->t_template == 0) {
                      /* so is cleared so that the abort path does not soabort() it. */
 3605                 tp = tcp_drop(tp, ENOBUFS);     /* destroys socket */
 3606                 so = NULL;
 3607                 goto abort;
 3608         }
              /* Copy the state negotiated while the entry was in the cache. */
 3609         tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
 3610         tp->ts_modulate = sc->sc_modulate;
 3611         tp->ts_recent = sc->sc_timestamp;
 3612         tp->iss = sc->sc_iss;
 3613         tp->irs = sc->sc_irs;
 3614         tcp_sendseqinit(tp);
 3615         tp->snd_last = tp->snd_una;
 3616 #ifdef TCP_ECN
 3617         if (sc->sc_flags & SCF_ECN_PERMIT) {
 3618                 tp->t_flags |= TF_ECN_PERMIT;
 3619                 tcpstat_inc(tcps_ecn_accepts);
 3620         }
 3621 #endif
 3622         if (sc->sc_flags & SCF_SACK_PERMIT)
 3623                 tp->t_flags |= TF_SACK_PERMIT;
 3624 #ifdef TCP_SIGNATURE
 3625         if (sc->sc_flags & SCF_SIGNATURE)
 3626                 tp->t_flags |= TF_SIGNATURE;
 3627 #endif
 3628         tcp_rcvseqinit(tp);
 3629         tp->t_state = TCPS_SYN_RECEIVED;
 3630         tp->t_rcvtime = now;
 3631         tp->t_sndtime = now;
 3632         tp->t_rcvacktime = now;
 3633         tp->t_sndacktime = now;
 3634         TCP_TIMER_ARM(tp, TCPT_KEEP, TCP_TIME(tcptv_keep_init));
 3635         tcpstat_inc(tcps_accepts);
 3636 
 3637         tcp_mss(tp, sc->sc_peermaxseg);  /* sets t_maxseg */
 3638         if (sc->sc_peermaxseg)
 3639                 tcp_mss_update(tp);
 3640         /* Reset initial window to 1 segment for retransmit */
 3641         if (sc->sc_rxtshift > 0)
 3642                 tp->snd_cwnd = tp->t_maxseg;
 3643         tp->snd_wl1 = sc->sc_irs;
 3644         tp->rcv_up = sc->sc_irs + 1;
 3645 
 3646         /*
 3647          * This is what would have happened in tcp_output() when
 3648          * the SYN,ACK was sent.
 3649          */
 3650         tp->snd_up = tp->snd_una;
 3651         tp->snd_max = tp->snd_nxt = tp->iss+1;
 3652         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
 3653         if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
 3654                 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
 3655         tp->last_ack_sent = tp->rcv_nxt;
 3656 
              /* Success: the entry is released; m is left untouched on this path. */
 3657         tcpstat_inc(tcps_sc_completed);
 3658         syn_cache_put(sc);
 3659         return (so);
 3660 
      /* Failure before a usable tcpcb exists: send an RST, then clean up. */
 3661 resetandabort:
 3662         tcp_respond(NULL, mtod(m, caddr_t), th, (tcp_seq)0, th->th_ack, TH_RST,
 3663             m->m_pkthdr.ph_rtableid, now);
      /* Common cleanup: free the packet, the new socket (if any) and the entry. */
 3664 abort:
 3665         m_freem(m);
 3666         if (so != NULL)
 3667                 soabort(so);
 3668         syn_cache_put(sc);
 3669         tcpstat_inc(tcps_sc_aborted);
 3670         return ((struct socket *)(-1));
 3671 }
 3672 
 3673 /*
 3674  * This function is called when we get a RST for a
 3675  * non-existent connection, so that we can see if the
 3676  * connection is in the syn cache.  If it is, zap it.
 3677  */
 3678 
 3679 void
 3680 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
 3681     u_int rtableid)
 3682 {
 3683         struct syn_cache *sc;
 3684         struct syn_cache_head *scp;
 3685 
 3686         NET_ASSERT_LOCKED();
 3687 
 3688         if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL)
 3689                 return;
 3690         if (SEQ_LT(th->th_seq, sc->sc_irs) ||
 3691             SEQ_GT(th->th_seq, sc->sc_irs + 1))
 3692                 return;
 3693         syn_cache_rm(sc);
 3694         tcpstat_inc(tcps_sc_reset);
 3695         syn_cache_put(sc);
 3696 }
 3697 
 3698 void
 3699 syn_cache_unreach(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
 3700     u_int rtableid)
 3701 {
 3702         struct syn_cache *sc;
 3703         struct syn_cache_head *scp;
 3704 
 3705         NET_ASSERT_LOCKED();
 3706 
 3707         if ((sc = syn_cache_lookup(src, dst, &scp, rtableid)) == NULL)
 3708                 return;
 3709         /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
 3710         if (ntohl (th->th_seq) != sc->sc_iss) {
 3711                 return;
 3712         }
 3713 
 3714         /*
 3715          * If we've retransmitted 3 times and this is our second error,
 3716          * we remove the entry.  Otherwise, we allow it to continue on.
 3717          * This prevents us from incorrectly nuking an entry during a
 3718          * spurious network outage.
 3719          *
 3720          * See tcp_notify().
 3721          */
 3722         if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
 3723                 sc->sc_flags |= SCF_UNREACH;
 3724                 return;
 3725         }
 3726 
 3727         syn_cache_rm(sc);
 3728         tcpstat_inc(tcps_sc_unreach);
 3729         syn_cache_put(sc);
 3730 }
 3731 
 3732 /*
 3733  * Given a LISTEN socket and an inbound SYN request, add
 3734  * this to the syn cache, and send back a segment:
 3735  *      <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
 3736  * to the source.
 3737  *
 3738  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
 3739  * Doing so would require that we hold onto the data and deliver it
 3740  * to the application.  However, if we are the target of a SYN-flood
 3741  * DoS attack, an attacker could send data which would eventually
 3742  * consume all available buffer space if it were ACKed.  By not ACKing
 3743  * the data, we avoid this DoS scenario.
       *
       * Returns -1 if tcp_dooptions() reports an error or if no cache
       * entry can be allocated; this function does not free m on those
       * paths.  Returns 0 in every other case, including when sending the
       * SYN,ACK failed and the new entry was dropped again; on those paths
       * m has been handed to syn_cache_respond(), which frees it.
       *
       * If issp is non-NULL, *issp is used as the initial send sequence
       * number; otherwise a random one is chosen.
 3744  */
 3745 
 3746 int
 3747 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
 3748     u_int iphlen, struct socket *so, struct mbuf *m, u_char *optp, int optlen,
 3749     struct tcp_opt_info *oi, tcp_seq *issp, uint32_t now)
 3750 {
 3751         struct tcpcb tb, *tp;
 3752         long win;
 3753         struct syn_cache *sc;
 3754         struct syn_cache_head *scp;
 3755         struct mbuf *ipopts;
 3756 
              /* tp is the listening socket's tcpcb. */
 3757         tp = sototcpcb(so);
 3758 
 3759         /*
 3760          * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
 3761          *
 3762          * Note this check is performed in tcp_input() very early on.
 3763          */
 3764 
 3765         /*
 3766          * Initialize some local state.
 3767          */
              /* Window to advertise: free receive buffer space, capped at TCP_MAXWIN. */
 3768         win = sbspace(so, &so->so_rcv);
 3769         if (win > TCP_MAXWIN)
 3770                 win = TCP_MAXWIN;
 3771 
              /*
               * tb is a zeroed scratch tcpcb that only collects the result of
               * option parsing.  Options are parsed when the segment carries
               * any (optp != NULL) or, with TCP_SIGNATURE, when the listener
               * has TF_SIGNATURE set.
               */
 3772         bzero(&tb, sizeof(tb));
 3773 #ifdef TCP_SIGNATURE
 3774         if (optp || (tp->t_flags & TF_SIGNATURE)) {
 3775 #else
 3776         if (optp) {
 3777 #endif
 3778                 tb.pf = tp->pf;
 3779                 tb.sack_enable = tp->sack_enable;
 3780                 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
 3781 #ifdef TCP_SIGNATURE
 3782                 if (tp->t_flags & TF_SIGNATURE)
 3783                         tb.t_flags |= TF_SIGNATURE;
 3784 #endif
 3785                 tb.t_state = TCPS_LISTEN;
 3786                 if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi,
 3787                     sotoinpcb(so)->inp_rtableid, now))
 3788                         return (-1);
 3789         }
 3790 
 3791         switch (src->sa_family) {
 3792         case AF_INET:
 3793                 /*
 3794                  * Remember the IP options, if any.
 3795                  */
 3796                 ipopts = ip_srcroute(m);
 3797                 break;
 3798         default:
 3799                 ipopts = NULL;
 3800         }
 3801 
 3802         /*
 3803          * See if we already have an entry for this connection.
 3804          * If we do, resend the SYN,ACK.  We do not count this
 3805          * as a retransmission (XXX though maybe we should).
 3806          */
 3807         sc = syn_cache_lookup(src, dst, &scp, sotoinpcb(so)->inp_rtableid);
 3808         if (sc != NULL) {
 3809                 tcpstat_inc(tcps_sc_dupesyn);
 3810                 if (ipopts) {
 3811                         /*
 3812                          * If we were remembering a previous source route,
 3813                          * forget it and use the new one we've been given.
 3814                          */
 3815                         m_free(sc->sc_ipopts);
 3816                         sc->sc_ipopts = ipopts;
 3817                 }
                      /* Refresh the stored timestamp from this SYN before replying. */
 3818                 sc->sc_timestamp = tb.ts_recent;
 3819                 if (syn_cache_respond(sc, m, now) == 0) {
 3820                         tcpstat_inc(tcps_sndacks);
 3821                         tcpstat_inc(tcps_sndtotal);
 3822                 }
 3823                 return (0);
 3824         }
 3825 
              /* New connection attempt: allocate a zeroed entry without sleeping. */
 3826         sc = pool_get(&syn_cache_pool, PR_NOWAIT|PR_ZERO);
 3827         if (sc == NULL) {
 3828                 m_free(ipopts);
 3829                 return (-1);
 3830         }
 3831 
 3832         /*
 3833          * Fill in the cache, and put the necessary IP and TCP
 3834          * options into the reply.
 3835          */
 3836         memcpy(&sc->sc_src, src, src->sa_len);
 3837         memcpy(&sc->sc_dst, dst, dst->sa_len);
 3838         sc->sc_rtableid = sotoinpcb(so)->inp_rtableid;
 3839         sc->sc_flags = 0;
 3840         sc->sc_ipopts = ipopts;
 3841         sc->sc_irs = th->th_seq;
 3842 
              /* Use the caller-supplied ISS if there is one, else a random value. */
 3843         sc->sc_iss = issp ? *issp : arc4random();
 3844         sc->sc_peermaxseg = oi->maxseg;
 3845         sc->sc_ourmaxseg = tcp_mss_adv(m, sc->sc_src.sa.sa_family);
 3846         sc->sc_win = win;
 3847         sc->sc_timestamp = tb.ts_recent;
              /* Timestamps are used only if both flags ended up set in tb. */
 3848         if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
 3849             (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
 3850                 sc->sc_flags |= SCF_TIMESTAMP;
 3851                 sc->sc_modulate = arc4random();
 3852         }
 3853         if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 3854             (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 3855                 sc->sc_requested_s_scale = tb.requested_s_scale;
 3856                 sc->sc_request_r_scale = 0;
 3857                 /*
 3858                  * Pick the smallest possible scaling factor that
 3859                  * will still allow us to scale up to sb_max.
 3860                  *
 3861                  * We do this because there are broken firewalls that
 3862                  * will corrupt the window scale option, leading to
 3863                  * the other endpoint believing that our advertised
 3864                  * window is unscaled.  At scale factors larger than
 3865                  * 5 the unscaled window will drop below 1500 bytes,
 3866                  * leading to serious problems when traversing these
 3867                  * broken firewalls.
 3868                  *
 3869                  * With the default sbmax of 256K, a scale factor
 3870                  * of 3 will be chosen by this algorithm.  Those who
 3871                  * choose a larger sbmax should watch out
 3872                  * for the compatibility problems mentioned above.
 3873                  *
 3874                  * RFC1323: The Window field in a SYN (i.e., a <SYN>
 3875                  * or <SYN,ACK>) segment itself is never scaled.
 3876                  */
 3877                 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
 3878                     (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
 3879                         sc->sc_request_r_scale++;
 3880         } else {
                      /*
                       * 15 marks "no window scaling"; syn_cache_respond() and
                       * syn_cache_get() test sc_request_r_scale against it.
                       */
 3881                 sc->sc_requested_s_scale = 15;
 3882                 sc->sc_request_r_scale = 15;
 3883         }
 3884 #ifdef TCP_ECN
 3885         /*
 3886          * if both ECE and CWR flag bits are set, peer is ECN capable.
 3887          */
 3888         if (tcp_do_ecn &&
 3889             (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
 3890                 sc->sc_flags |= SCF_ECN_PERMIT;
 3891 #endif
 3892         /*
 3893          * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
 3894          * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
 3895          */
 3896         if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
 3897                 sc->sc_flags |= SCF_SACK_PERMIT;
 3898 #ifdef TCP_SIGNATURE
 3899         if (tb.t_flags & TF_SIGNATURE)
 3900                 sc->sc_flags |= SCF_SIGNATURE;
 3901 #endif
 3902         sc->sc_tp = tp;
              /*
               * The entry is inserted into the cache only if the SYN,ACK was
               * sent successfully; otherwise it is released again.  Either
               * way the function returns 0.
               */
 3903         if (syn_cache_respond(sc, m, now) == 0) {
 3904                 syn_cache_insert(sc, tp);
 3905                 tcpstat_inc(tcps_sndacks);
 3906                 tcpstat_inc(tcps_sndtotal);
 3907         } else {
 3908                 syn_cache_put(sc);
 3909                 tcpstat_inc(tcps_sc_dropped);
 3910         }
 3911 
 3912         return (0);
 3913 }
 3914 
      /*
       * Build and send the SYN,ACK for the syn cache entry sc.
       *
       * m is the packet that triggered the reply; it is always freed here
       * and may be NULL (syn_cache_timer() passes NULL).  A fresh mbuf is
       * allocated for the reply.  now is used for the timestamp option.
       *
       * Returns the result of ip_output()/ip6_output(), or EAFNOSUPPORT
       * for an unsupported address family, ENOBUFS if no mbuf could be
       * obtained, and with TCP_SIGNATURE: EPERM if no tdb is found or
       * EINVAL if tcp_signature() fails.
       */
 3915 int
 3916 syn_cache_respond(struct syn_cache *sc, struct mbuf *m, uint32_t now)
 3917 {
 3918         u_int8_t *optp;
 3919         int optlen, error;
 3920         u_int16_t tlen;
 3921         struct ip *ip = NULL;
 3922 #ifdef INET6
 3923         struct ip6_hdr *ip6 = NULL;
 3924 #endif
 3925         struct tcphdr *th;
 3926         u_int hlen;
 3927         struct inpcb *inp;
 3928 
              /* hlen is the size of the IP header for the entry's address family. */
 3929         switch (sc->sc_src.sa.sa_family) {
 3930         case AF_INET:
 3931                 hlen = sizeof(struct ip);
 3932                 break;
 3933 #ifdef INET6
 3934         case AF_INET6:
 3935                 hlen = sizeof(struct ip6_hdr);
 3936                 break;
 3937 #endif
 3938         default:
 3939                 m_freem(m);
 3940                 return (EAFNOSUPPORT);
 3941         }
 3942 
 3943         /* Compute the size of the TCP options. */
              /*
               * The leading 4 bytes are the MSS option, which is always sent.
               * A sc_request_r_scale of 15 means window scaling is not used.
               */
 3944         optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
 3945             ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
 3946 #ifdef TCP_SIGNATURE
 3947             ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
 3948 #endif
 3949             ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
 3950 
 3951         tlen = hlen + sizeof(struct tcphdr) + optlen;
 3952 
 3953         /*
 3954          * Create the IP+TCP header from scratch.
 3955          */
              /* The incoming packet is not reused; release it before allocating. */
 3956         m_freem(m);
 3957 #ifdef DIAGNOSTIC
 3958         if (max_linkhdr + tlen > MCLBYTES)
 3959                 return (ENOBUFS);
 3960 #endif
 3961         MGETHDR(m, M_DONTWAIT, MT_DATA);
              /* Attach a cluster if the headers do not fit into the mbuf itself. */
 3962         if (m && max_linkhdr + tlen > MHLEN) {
 3963                 MCLGET(m, M_DONTWAIT);
 3964                 if ((m->m_flags & M_EXT) == 0) {
 3965                         m_freem(m);
 3966                         m = NULL;
 3967                 }
 3968         }
 3969         if (m == NULL)
 3970                 return (ENOBUFS);
 3971 
 3972         /* Fixup the mbuf. */
              /*
               * Leave max_linkhdr bytes in front of the packet and zero the
               * whole header area, so fields not set below stay 0.
               */
 3973         m->m_data += max_linkhdr;
 3974         m->m_len = m->m_pkthdr.len = tlen;
 3975         m->m_pkthdr.ph_ifidx = 0;
 3976         m->m_pkthdr.ph_rtableid = sc->sc_rtableid;
 3977         memset(mtod(m, u_char *), 0, tlen);
 3978 
              /*
               * Addresses and ports are swapped relative to the cache entry:
               * the reply goes from sc_dst (us) to sc_src (the peer).
               */
 3979         switch (sc->sc_src.sa.sa_family) {
 3980         case AF_INET:
 3981                 ip = mtod(m, struct ip *);
 3982                 ip->ip_dst = sc->sc_src.sin.sin_addr;
 3983                 ip->ip_src = sc->sc_dst.sin.sin_addr;
 3984                 ip->ip_p = IPPROTO_TCP;
 3985                 th = (struct tcphdr *)(ip + 1);
 3986                 th->th_dport = sc->sc_src.sin.sin_port;
 3987                 th->th_sport = sc->sc_dst.sin.sin_port;
 3988                 break;
 3989 #ifdef INET6
 3990         case AF_INET6:
 3991                 ip6 = mtod(m, struct ip6_hdr *);
 3992                 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
 3993                 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
 3994                 ip6->ip6_nxt = IPPROTO_TCP;
 3995                 /* ip6_plen will be updated in ip6_output() */
 3996                 th = (struct tcphdr *)(ip6 + 1);
 3997                 th->th_dport = sc->sc_src.sin6.sin6_port;
 3998                 th->th_sport = sc->sc_dst.sin6.sin6_port;
 3999                 break;
 4000 #endif
 4001         default:
 4002                 unhandled_af(sc->sc_src.sa.sa_family);
 4003         }
 4004 
 4005         th->th_seq = htonl(sc->sc_iss);
 4006         th->th_ack = htonl(sc->sc_irs + 1);
 4007         th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 4008         th->th_flags = TH_SYN|TH_ACK;
 4009 #ifdef TCP_ECN
 4010         /* Set ECE for SYN-ACK if peer supports ECN. */
 4011         if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
 4012                 th->th_flags |= TH_ECE;
 4013 #endif
 4014         th->th_win = htons(sc->sc_win);
 4015         /* th_sum already 0 */
 4016         /* th_urp already 0 */
 4017 
 4018         /* Tack on the TCP options. */
              /* MSS first: kind, length 4, then sc_ourmaxseg high byte first. */
 4019         optp = (u_int8_t *)(th + 1);
 4020         *optp++ = TCPOPT_MAXSEG;
 4021         *optp++ = 4;
 4022         *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
 4023         *optp++ = sc->sc_ourmaxseg & 0xff;
 4024 
 4025         /* Include SACK_PERMIT_HDR option if peer has already done so. */
 4026         if (sc->sc_flags & SCF_SACK_PERMIT) {
 4027                 *((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
 4028                 optp += 4;
 4029         }
 4030 
              /* Window scale option, preceded by a NOP to fill a 32-bit word. */
 4031         if (sc->sc_request_r_scale != 15) {
 4032                 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
 4033                     TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
 4034                     sc->sc_request_r_scale);
 4035                 optp += 4;
 4036         }
 4037 
 4038         if (sc->sc_flags & SCF_TIMESTAMP) {
 4039                 u_int32_t *lp = (u_int32_t *)(optp);
 4040                 /* Form timestamp option as shown in appendix A of RFC 1323. */
                      /* Our value is now offset by sc_modulate; the echo is sc_timestamp. */
 4041                 *lp++ = htonl(TCPOPT_TSTAMP_HDR);
 4042                 *lp++ = htonl(now + sc->sc_modulate);
 4043                 *lp   = htonl(sc->sc_timestamp);
 4044                 optp += TCPOLEN_TSTAMP_APPA;
 4045         }
 4046 
 4047 #ifdef TCP_SIGNATURE
 4048         if (sc->sc_flags & SCF_SIGNATURE) {
 4049                 union sockaddr_union src, dst;
 4050                 struct tdb *tdb;
 4051 
                      /* Build the address pair for the tdb lookup from the reply header. */
 4052                 bzero(&src, sizeof(union sockaddr_union));
 4053                 bzero(&dst, sizeof(union sockaddr_union));
 4054                 src.sa.sa_len = sc->sc_src.sa.sa_len;
 4055                 src.sa.sa_family = sc->sc_src.sa.sa_family;
 4056                 dst.sa.sa_len = sc->sc_dst.sa.sa_len;
 4057                 dst.sa.sa_family = sc->sc_dst.sa.sa_family;
 4058 
 4059                 switch (sc->sc_src.sa.sa_family) {
 4060                 case 0: /*default to PF_INET*/
 4061                 case AF_INET:
 4062                         src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
 4063                         dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
 4064                         break;
 4065 #ifdef INET6
 4066                 case AF_INET6:
 4067                         src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
 4068                         dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
 4069                         break;
 4070 #endif /* INET6 */
 4071                 }
 4072 
 4073                 tdb = gettdbbysrcdst(rtable_l2(sc->sc_rtableid),
 4074                     0, &src, &dst, IPPROTO_TCP);
 4075                 if (tdb == NULL) {
 4076                         m_freem(m);
 4077                         return (EPERM);
 4078                 }
 4079 
 4080                 /* Send signature option */
 4081                 *(optp++) = TCPOPT_SIGNATURE;
 4082                 *(optp++) = TCPOLEN_SIGNATURE;
 4083 
 4084                 if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
 4085                     hlen, 0, optp) < 0) {
 4086                         m_freem(m);
 4087                         tdb_unref(tdb);
 4088                         return (EINVAL);
 4089                 }
 4090                 tdb_unref(tdb);
                      /* Skip 16 bytes; presumably the digest tcp_signature() stored at optp. */
 4091                 optp += 16;
 4092 
 4093                 /* Pad options list to the next 32 bit boundary and
 4094                  * terminate it.
 4095                  */
 4096                 *optp++ = TCPOPT_NOP;
 4097                 *optp++ = TCPOPT_EOL;
 4098         }
 4099 #endif /* TCP_SIGNATURE */
 4100 
 4101         /* Compute the packet's checksum. */
              /*
               * The length field temporarily holds the TCP length (tlen - hlen)
               * while the checksum is computed; for IPv4 the full length is
               * stored further down.  NOTE(review): presumably the otherwise
               * zeroed IPv4 header acts as the pseudo-header here -- confirm.
               */
 4102         switch (sc->sc_src.sa.sa_family) {
 4103         case AF_INET:
 4104                 ip->ip_len = htons(tlen - hlen);
 4105                 th->th_sum = 0;
 4106                 th->th_sum = in_cksum(m, tlen);
 4107                 break;
 4108 #ifdef INET6
 4109         case AF_INET6:
 4110                 ip6->ip6_plen = htons(tlen - hlen);
 4111                 th->th_sum = 0;
 4112                 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
 4113                 break;
 4114 #endif
 4115         }
 4116 
 4117         /* use IPsec policy and ttl from listening socket, on SYN ACK */
 4118         inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
 4119 
 4120         /*
 4121          * Fill in some straggling IP bits.  ip_len now gets the full
 4122          * packet length; it is stored with htons() like ip6_plen.
 4123          */
 4124         switch (sc->sc_src.sa.sa_family) {
 4125         case AF_INET:
 4126                 ip->ip_len = htons(tlen);
 4127                 ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
 4128                 if (inp != NULL)
 4129                         ip->ip_tos = inp->inp_ip.ip_tos;
 4130                 break;
 4131 #ifdef INET6
 4132         case AF_INET6:
 4133                 ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 4134                 ip6->ip6_vfc |= IPV6_VERSION;
 4135                 ip6->ip6_plen = htons(tlen - hlen);
 4136                 /* ip6_hlim will be initialized afterwards */
 4137                 /* leave flowlabel = 0, it is legal and require no state mgmt */
 4138                 break;
 4139 #endif
 4140         }
 4141 
              /* Hand the packet to the IP layer, using the entry's cached route. */
 4142         switch (sc->sc_src.sa.sa_family) {
 4143         case AF_INET:
 4144                 error = ip_output(m, sc->sc_ipopts, &sc->sc_route4,
 4145                     (ip_mtudisc ? IP_MTUDISC : 0),  NULL, inp, 0);
 4146                 break;
 4147 #ifdef INET6
 4148         case AF_INET6:
 4149                 ip6->ip6_hlim = in6_selecthlim(inp);
 4150 
 4151                 error = ip6_output(m, NULL /*XXX*/, &sc->sc_route6, 0,
 4152                     NULL, NULL);
 4153                 break;
 4154 #endif
 4155         default:
 4156                 error = EAFNOSUPPORT;
 4157                 break;
 4158         }
 4159         return (error);
 4160 }

Cache object: cf95ea4efe7d4bff221cc58b13f02344


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.