The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_syncache.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*      $NetBSD: tcp_syncache.c,v 1.6 2022/11/04 09:01:53 ozaki-r Exp $ */
    2 
    3 /*
    4  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
    5  * All rights reserved.
    6  *
    7  * Redistribution and use in source and binary forms, with or without
    8  * modification, are permitted provided that the following conditions
    9  * are met:
   10  * 1. Redistributions of source code must retain the above copyright
   11  *    notice, this list of conditions and the following disclaimer.
   12  * 2. Redistributions in binary form must reproduce the above copyright
   13  *    notice, this list of conditions and the following disclaimer in the
   14  *    documentation and/or other materials provided with the distribution.
   15  * 3. Neither the name of the project nor the names of its contributors
   16  *    may be used to endorse or promote products derived from this software
   17  *    without specific prior written permission.
   18  *
   19  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
   20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
   23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   29  * SUCH DAMAGE.
   30  */
   31 
   32 /*
   33  *      @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
   34  *
   35  * NRL grants permission for redistribution and use in source and binary
   36  * forms, with or without modification, of the software and documentation
   37  * created at NRL provided that the following conditions are met:
   38  *
   39  * 1. Redistributions of source code must retain the above copyright
   40  *    notice, this list of conditions and the following disclaimer.
   41  * 2. Redistributions in binary form must reproduce the above copyright
   42  *    notice, this list of conditions and the following disclaimer in the
   43  *    documentation and/or other materials provided with the distribution.
   44  * 3. All advertising materials mentioning features or use of this software
   45  *    must display the following acknowledgements:
   46  *      This product includes software developed by the University of
   47  *      California, Berkeley and its contributors.
   48  *      This product includes software developed at the Information
   49  *      Technology Division, US Naval Research Laboratory.
   50  * 4. Neither the name of the NRL nor the names of its contributors
   51  *    may be used to endorse or promote products derived from this software
   52  *    without specific prior written permission.
   53  *
   54  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
   55  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   56  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
   57  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
   58  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   59  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   60  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   61  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   62  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   63  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   64  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
   65  *
   66  * The views and conclusions contained in the software and documentation
   67  * are those of the authors and should not be interpreted as representing
   68  * official policies, either expressed or implied, of the US Naval
   69  * Research Laboratory (NRL).
   70  */
   71 
   72 /*-
   73  * Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
   74  * 2011 The NetBSD Foundation, Inc.
   75  * All rights reserved.
   76  *
   77  * This code is derived from software contributed to The NetBSD Foundation
   78  * by Coyote Point Systems, Inc.
   79  * This code is derived from software contributed to The NetBSD Foundation
   80  * by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
   81  * Facility, NASA Ames Research Center.
   82  * This code is derived from software contributed to The NetBSD Foundation
   83  * by Charles M. Hannum.
   84  * This code is derived from software contributed to The NetBSD Foundation
   85  * by Rui Paulo.
   86  *
   87  * Redistribution and use in source and binary forms, with or without
   88  * modification, are permitted provided that the following conditions
   89  * are met:
   90  * 1. Redistributions of source code must retain the above copyright
   91  *    notice, this list of conditions and the following disclaimer.
   92  * 2. Redistributions in binary form must reproduce the above copyright
   93  *    notice, this list of conditions and the following disclaimer in the
   94  *    documentation and/or other materials provided with the distribution.
   95  *
   96  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   97  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   98  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   99  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  100  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  101  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  102  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  103  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  104  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  105  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  106  * POSSIBILITY OF SUCH DAMAGE.
  107  */
  108 
  109 /*
  110  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  111  *      The Regents of the University of California.  All rights reserved.
  112  *
  113  * Redistribution and use in source and binary forms, with or without
  114  * modification, are permitted provided that the following conditions
  115  * are met:
  116  * 1. Redistributions of source code must retain the above copyright
  117  *    notice, this list of conditions and the following disclaimer.
  118  * 2. Redistributions in binary form must reproduce the above copyright
  119  *    notice, this list of conditions and the following disclaimer in the
  120  *    documentation and/or other materials provided with the distribution.
  121  * 3. Neither the name of the University nor the names of its contributors
  122  *    may be used to endorse or promote products derived from this software
  123  *    without specific prior written permission.
  124  *
  125  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  126  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  127  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  128  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  129  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  130  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  131  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  132  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  133  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  134  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  135  * SUCH DAMAGE.
  136  *
  137  *      @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
  138  */
  139 
  140 /*
  141  *      TODO list for SYN cache stuff:
  142  *
  143  *      Find room for a "state" field, which is needed to keep a
  144  *      compressed state for TIME_WAIT TCBs.  It's been noted already
  145  *      that this is fairly important for very high-volume web and
  146  *      mail servers, which use a large number of short-lived
  147  *      connections.
  148  */
  149 
  150 #include <sys/cdefs.h>
  151 __KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.6 2022/11/04 09:01:53 ozaki-r Exp $");
  152 
  153 #ifdef _KERNEL_OPT
  154 #include "opt_inet.h"
  155 #include "opt_ipsec.h"
  156 #endif
  157 
  158 #include <sys/param.h>
  159 #include <sys/systm.h>
  160 #include <sys/mbuf.h>
  161 #include <sys/protosw.h>
  162 #include <sys/socket.h>
  163 #include <sys/socketvar.h>
  164 #include <sys/errno.h>
  165 #include <sys/syslog.h>
  166 #include <sys/pool.h>
  167 #include <sys/domain.h>
  168 #include <sys/kernel.h>
  169 #include <sys/lwp.h> /* for lwp0 */
  170 #include <sys/cprng.h>
  171 
  172 #include <netinet/in.h>
  173 #include <netinet/ip.h>
  174 #include <netinet/in_pcb.h>
  175 #include <netinet/in_var.h>
  176 #include <netinet/ip_var.h>
  177 
  178 #include <netinet/ip6.h>
  179 #ifdef INET6
  180 #include <netinet6/ip6_var.h>
  181 #include <netinet6/in6_pcb.h>
  182 #include <netinet6/ip6_var.h>
  183 #include <netinet6/in6_var.h>
  184 #endif
  185 
  186 #include <netinet/tcp.h>
  187 #include <netinet/tcp_fsm.h>
  188 #include <netinet/tcp_seq.h>
  189 #include <netinet/tcp_timer.h>
  190 #include <netinet/tcp_var.h>
  191 #include <netinet/tcp_private.h>
  192 #include <netinet/tcp_syncache.h>
  193 
  194 #ifdef TCP_SIGNATURE
  195 #ifdef IPSEC
  196 #include <netipsec/ipsec.h>
  197 #include <netipsec/key.h>
  198 #ifdef INET6
  199 #include <netipsec/ipsec6.h>
  200 #endif
  201 #endif  /* IPSEC*/
  202 #endif
  203 
  204 static void     syn_cache_timer(void *);
  205 static struct syn_cache *
  206                 syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
  207                 struct syn_cache_head **);
  208 static int      syn_cache_respond(struct syn_cache *);
  209 
  210 /* syn hash parameters */
  211 #define TCP_SYN_HASH_SIZE       293
  212 #define TCP_SYN_BUCKET_SIZE     35
  213 static int      tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
  214 int             tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
  215 int             tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
  216 static struct   syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
  217 
  218 /*
  219  * TCP compressed state engine.  Currently used to hold compressed
  220  * state for SYN_RECEIVED.
  221  */
  222 
  223 u_long  syn_cache_count;
  224 static u_int32_t syn_hash1, syn_hash2;
  225 
  226 #define SYN_HASH(sa, sp, dp) \
  227         ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
  228                                      ((u_int32_t)(sp)))^syn_hash2)))
  229 #ifndef INET6
  230 #define SYN_HASHALL(hash, src, dst) \
  231 do {                                                                    \
  232         hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
  233                 ((const struct sockaddr_in *)(src))->sin_port,          \
  234                 ((const struct sockaddr_in *)(dst))->sin_port);         \
  235 } while (/*CONSTCOND*/ 0)
  236 #else
  237 #define SYN_HASH6(sa, sp, dp) \
  238         ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
  239           (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
  240          & 0x7fffffff)
  241 
  242 #define SYN_HASHALL(hash, src, dst) \
  243 do {                                                                    \
  244         switch ((src)->sa_family) {                                     \
  245         case AF_INET:                                                   \
  246                 hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
  247                         ((const struct sockaddr_in *)(src))->sin_port,  \
  248                         ((const struct sockaddr_in *)(dst))->sin_port); \
  249                 break;                                                  \
  250         case AF_INET6:                                                  \
  251                 hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
  252                         ((const struct sockaddr_in6 *)(src))->sin6_port,        \
  253                         ((const struct sockaddr_in6 *)(dst))->sin6_port);       \
  254                 break;                                                  \
  255         default:                                                        \
  256                 hash = 0;                                               \
  257         }                                                               \
  258 } while (/*CONSTCOND*/0)
  259 #endif /* INET6 */
  260 
  261 static struct pool syn_cache_pool;
  262 
  263 /*
  264  * We don't estimate RTT with SYNs, so each packet starts with the default
  265  * RTT and each timer step has a fixed timeout value.
  266  */
  267 static inline void
  268 syn_cache_timer_arm(struct syn_cache *sc)
  269 {
  270 
  271         TCPT_RANGESET(sc->sc_rxtcur,
  272             TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
  273             TCPTV_REXMTMAX);
  274         callout_reset(&sc->sc_timer,
  275             sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
  276 }
  277 
  278 #define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)
  279 
  280 static inline void
  281 syn_cache_rm(struct syn_cache *sc)
  282 {
  283         TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
  284             sc, sc_bucketq);
  285         sc->sc_tp = NULL;
  286         LIST_REMOVE(sc, sc_tpq);
  287         tcp_syn_cache[sc->sc_bucketidx].sch_length--;
  288         callout_stop(&sc->sc_timer);
  289         syn_cache_count--;
  290 }
  291 
  292 static inline void
  293 syn_cache_put(struct syn_cache *sc)
  294 {
  295         if (sc->sc_ipopts)
  296                 (void) m_free(sc->sc_ipopts);
  297         rtcache_free(&sc->sc_route);
  298         sc->sc_flags |= SCF_DEAD;
  299         if (!callout_invoking(&sc->sc_timer))
  300                 callout_schedule(&(sc)->sc_timer, 1);
  301 }
  302 
  303 void
  304 syn_cache_init(void)
  305 {
  306         int i;
  307 
  308         pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
  309             "synpl", NULL, IPL_SOFTNET);
  310 
  311         /* Initialize the hash buckets. */
  312         for (i = 0; i < tcp_syn_cache_size; i++)
  313                 TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
  314 }
  315 
  316 void
  317 syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
  318 {
  319         struct syn_cache_head *scp;
  320         struct syn_cache *sc2;
  321         int s;
  322 
  323         /*
  324          * If there are no entries in the hash table, reinitialize
  325          * the hash secrets.
  326          */
  327         if (syn_cache_count == 0) {
  328                 syn_hash1 = cprng_fast32();
  329                 syn_hash2 = cprng_fast32();
  330         }
  331 
  332         SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
  333         sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
  334         scp = &tcp_syn_cache[sc->sc_bucketidx];
  335 
  336         /*
  337          * Make sure that we don't overflow the per-bucket
  338          * limit or the total cache size limit.
  339          */
  340         s = splsoftnet();
  341         if (scp->sch_length >= tcp_syn_bucket_limit) {
  342                 TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
  343                 /*
  344                  * The bucket is full.  Toss the oldest element in the
  345                  * bucket.  This will be the first entry in the bucket.
  346                  */
  347                 sc2 = TAILQ_FIRST(&scp->sch_bucket);
  348 #ifdef DIAGNOSTIC
  349                 /*
  350                  * This should never happen; we should always find an
  351                  * entry in our bucket.
  352                  */
  353                 if (sc2 == NULL)
  354                         panic("syn_cache_insert: bucketoverflow: impossible");
  355 #endif
  356                 syn_cache_rm(sc2);
  357                 syn_cache_put(sc2);     /* calls pool_put but see spl above */
  358         } else if (syn_cache_count >= tcp_syn_cache_limit) {
  359                 struct syn_cache_head *scp2, *sce;
  360 
  361                 TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
  362                 /*
  363                  * The cache is full.  Toss the oldest entry in the
  364                  * first non-empty bucket we can find.
  365                  *
  366                  * XXX We would really like to toss the oldest
  367                  * entry in the cache, but we hope that this
  368                  * condition doesn't happen very often.
  369                  */
  370                 scp2 = scp;
  371                 if (TAILQ_EMPTY(&scp2->sch_bucket)) {
  372                         sce = &tcp_syn_cache[tcp_syn_cache_size];
  373                         for (++scp2; scp2 != scp; scp2++) {
  374                                 if (scp2 >= sce)
  375                                         scp2 = &tcp_syn_cache[0];
  376                                 if (! TAILQ_EMPTY(&scp2->sch_bucket))
  377                                         break;
  378                         }
  379 #ifdef DIAGNOSTIC
  380                         /*
  381                          * This should never happen; we should always find a
  382                          * non-empty bucket.
  383                          */
  384                         if (scp2 == scp)
  385                                 panic("syn_cache_insert: cacheoverflow: "
  386                                     "impossible");
  387 #endif
  388                 }
  389                 sc2 = TAILQ_FIRST(&scp2->sch_bucket);
  390                 syn_cache_rm(sc2);
  391                 syn_cache_put(sc2);     /* calls pool_put but see spl above */
  392         }
  393 
  394         /*
  395          * Initialize the entry's timer.
  396          */
  397         sc->sc_rxttot = 0;
  398         sc->sc_rxtshift = 0;
  399         syn_cache_timer_arm(sc);
  400 
  401         /* Link it from tcpcb entry */
  402         LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
  403 
  404         /* Put it into the bucket. */
  405         TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
  406         scp->sch_length++;
  407         syn_cache_count++;
  408 
  409         TCP_STATINC(TCP_STAT_SC_ADDED);
  410         splx(s);
  411 }
  412 
  413 /*
  414  * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
  415  * If we have retransmitted an entry the maximum number of times, expire
  416  * that entry.
  417  */
  418 static void
  419 syn_cache_timer(void *arg)
  420 {
  421         struct syn_cache *sc = arg;
  422 
  423         mutex_enter(softnet_lock);
  424         KERNEL_LOCK(1, NULL);
  425 
  426         callout_ack(&sc->sc_timer);
  427 
  428         if (__predict_false(sc->sc_flags & SCF_DEAD)) {
  429                 TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
  430                 goto free;
  431         }
  432 
  433         if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
  434                 /* Drop it -- too many retransmissions. */
  435                 goto dropit;
  436         }
  437 
  438         /*
  439          * Compute the total amount of time this entry has
  440          * been on a queue.  If this entry has been on longer
  441          * than the keep alive timer would allow, expire it.
  442          */
  443         sc->sc_rxttot += sc->sc_rxtcur;
  444         if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS))
  445                 goto dropit;
  446 
  447         TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
  448         (void)syn_cache_respond(sc);
  449 
  450         /* Advance the timer back-off. */
  451         sc->sc_rxtshift++;
  452         syn_cache_timer_arm(sc);
  453 
  454         goto out;
  455 
  456  dropit:
  457         TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
  458         syn_cache_rm(sc);
  459         if (sc->sc_ipopts)
  460                 (void) m_free(sc->sc_ipopts);
  461         rtcache_free(&sc->sc_route);
  462 
  463  free:
  464         callout_destroy(&sc->sc_timer);
  465         pool_put(&syn_cache_pool, sc);
  466 
  467  out:
  468         KERNEL_UNLOCK_ONE(NULL);
  469         mutex_exit(softnet_lock);
  470 }
  471 
  472 /*
  473  * Remove syn cache created by the specified tcb entry,
  474  * because this does not make sense to keep them
  475  * (if there's no tcb entry, syn cache entry will never be used)
  476  */
  477 void
  478 syn_cache_cleanup(struct tcpcb *tp)
  479 {
  480         struct syn_cache *sc, *nsc;
  481         int s;
  482 
  483         s = splsoftnet();
  484 
  485         for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
  486                 nsc = LIST_NEXT(sc, sc_tpq);
  487 
  488 #ifdef DIAGNOSTIC
  489                 if (sc->sc_tp != tp)
  490                         panic("invalid sc_tp in syn_cache_cleanup");
  491 #endif
  492                 syn_cache_rm(sc);
  493                 syn_cache_put(sc);      /* calls pool_put but see spl above */
  494         }
  495         /* just for safety */
  496         LIST_INIT(&tp->t_sc);
  497 
  498         splx(s);
  499 }
  500 
  501 /*
  502  * Find an entry in the syn cache.
  503  */
  504 static struct syn_cache *
  505 syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
  506     struct syn_cache_head **headp)
  507 {
  508         struct syn_cache *sc;
  509         struct syn_cache_head *scp;
  510         u_int32_t hash;
  511         int s;
  512 
  513         SYN_HASHALL(hash, src, dst);
  514 
  515         scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
  516         *headp = scp;
  517         s = splsoftnet();
  518         for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
  519              sc = TAILQ_NEXT(sc, sc_bucketq)) {
  520                 if (sc->sc_hash != hash)
  521                         continue;
  522                 if (!memcmp(&sc->sc_src, src, src->sa_len) &&
  523                     !memcmp(&sc->sc_dst, dst, dst->sa_len)) {
  524                         splx(s);
  525                         return (sc);
  526                 }
  527         }
  528         splx(s);
  529         return (NULL);
  530 }
  531 
  532 /*
  533  * This function gets called when we receive an ACK for a socket in the
  534  * LISTEN state. We look up the connection in the syn cache, and if it's
  535  * there, we pull it out of the cache and turn it into a full-blown
  536  * connection in the SYN-RECEIVED state.
  537  *
  538  * The return values may not be immediately obvious, and their effects
  539  * can be subtle, so here they are:
  540  *
  541  *      NULL    SYN was not found in cache; caller should drop the
  542  *              packet and send an RST.
  543  *
  544  *      -1      We were unable to create the new connection, and are
  545  *              aborting it.  An ACK,RST is being sent to the peer
  546  *              (unless we got screwy sequence numbers; see below),
  547  *              because the 3-way handshake has been completed.  Caller
  548  *              should not free the mbuf, since we may be using it.  If
  549  *              we are not, we will free it.
  550  *
  551  *      Otherwise, the return value is a pointer to the new socket
  552  *      associated with the connection.
  553  */
  554 struct socket *
  555 syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
  556     struct tcphdr *th, struct socket *so, struct mbuf *m)
  557 {
  558         struct syn_cache *sc;
  559         struct syn_cache_head *scp;
  560         struct inpcb *inp = NULL;
  561         struct tcpcb *tp;
  562         int s;
  563         struct socket *oso;
  564 
  565         s = splsoftnet();
  566         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
  567                 splx(s);
  568                 return NULL;
  569         }
  570 
  571         /*
  572          * Verify the sequence and ack numbers.  Try getting the correct
  573          * response again.
  574          */
  575         if ((th->th_ack != sc->sc_iss + 1) ||
  576             SEQ_LEQ(th->th_seq, sc->sc_irs) ||
  577             SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
  578                 m_freem(m);
  579                 (void)syn_cache_respond(sc);
  580                 splx(s);
  581                 return ((struct socket *)(-1));
  582         }
  583 
  584         /* Remove this cache entry */
  585         syn_cache_rm(sc);
  586         splx(s);
  587 
  588         /*
  589          * Ok, create the full blown connection, and set things up
  590          * as they would have been set up if we had created the
  591          * connection when the SYN arrived.  If we can't create
  592          * the connection, abort it.
  593          */
  594         /*
  595          * inp still has the OLD in_pcb stuff, set the
  596          * v6-related flags on the new guy, too.   This is
  597          * done particularly for the case where an AF_INET6
  598          * socket is bound only to a port, and a v4 connection
  599          * comes in on that port.
  600          * we also copy the flowinfo from the original pcb
  601          * to the new one.
  602          */
  603         oso = so;
  604         so = sonewconn(so, true);
  605         if (so == NULL)
  606                 goto resetandabort;
  607 
  608         inp = sotoinpcb(so);
  609 
  610         switch (src->sa_family) {
  611         case AF_INET:
  612                 if (inp->inp_af == AF_INET) {
  613                         in4p_laddr(inp) = ((struct sockaddr_in *)dst)->sin_addr;
  614                         inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
  615                         inp->inp_options = ip_srcroute(m);
  616                         inpcb_set_state(inp, INP_BOUND);
  617                         if (inp->inp_options == NULL) {
  618                                 inp->inp_options = sc->sc_ipopts;
  619                                 sc->sc_ipopts = NULL;
  620                         }
  621                 }
  622 #ifdef INET6
  623                 else if (inp->inp_af == AF_INET6) {
  624                         /* IPv4 packet to AF_INET6 socket */
  625                         memset(&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp)));
  626                         in6p_laddr(inp).s6_addr16[5] = htons(0xffff);
  627                         bcopy(&((struct sockaddr_in *)dst)->sin_addr,
  628                                 &in6p_laddr(inp).s6_addr32[3],
  629                                 sizeof(((struct sockaddr_in *)dst)->sin_addr));
  630                         inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
  631                         intotcpcb(inp)->t_family = AF_INET;
  632                         if (sotoinpcb(oso)->inp_flags & IN6P_IPV6_V6ONLY)
  633                                 inp->inp_flags |= IN6P_IPV6_V6ONLY;
  634                         else
  635                                 inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
  636                         inpcb_set_state(inp, INP_BOUND);
  637                 }
  638 #endif
  639                 break;
  640 #ifdef INET6
  641         case AF_INET6:
  642                 if (inp->inp_af == AF_INET6) {
  643                         in6p_laddr(inp) = ((struct sockaddr_in6 *)dst)->sin6_addr;
  644                         inp->inp_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
  645                         inpcb_set_state(inp, INP_BOUND);
  646                 }
  647                 break;
  648 #endif
  649         }
  650 
  651 #ifdef INET6
  652         if (inp && intotcpcb(inp)->t_family == AF_INET6 && sotoinpcb(oso)) {
  653                 struct inpcb *oinp = sotoinpcb(oso);
  654                 /* inherit socket options from the listening socket */
  655                 inp->inp_flags |= (oinp->inp_flags & IN6P_CONTROLOPTS);
  656                 if (inp->inp_flags & IN6P_CONTROLOPTS) {
  657                         m_freem(inp->inp_options);
  658                         inp->inp_options = NULL;
  659                 }
  660                 ip6_savecontrol(inp, &inp->inp_options,
  661                     mtod(m, struct ip6_hdr *), m);
  662         }
  663 #endif
  664 
  665         /*
  666          * Give the new socket our cached route reference.
  667          */
  668         rtcache_copy(&inp->inp_route, &sc->sc_route);
  669         rtcache_free(&sc->sc_route);
  670 
  671         if (inp->inp_af == AF_INET) {
  672                 struct sockaddr_in sin;
  673                 memcpy(&sin, src, src->sa_len);
  674                 if (inpcb_connect(inp, &sin, &lwp0)) {
  675                         goto resetandabort;
  676                 }
  677         }
  678 #ifdef INET6
  679         else if (inp->inp_af == AF_INET6) {
  680                 struct sockaddr_in6 sin6;
  681                 memcpy(&sin6, src, src->sa_len);
  682                 if (src->sa_family == AF_INET) {
  683                         /* IPv4 packet to AF_INET6 socket */
  684                         in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
  685                 }
  686                 if (in6pcb_connect(inp, &sin6, NULL)) {
  687                         goto resetandabort;
  688                 }
  689         }
  690 #endif
  691         else {
  692                 goto resetandabort;
  693         }
  694 
  695         tp = intotcpcb(inp);
  696 
  697         tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
  698         if (sc->sc_request_r_scale != 15) {
  699                 tp->requested_s_scale = sc->sc_requested_s_scale;
  700                 tp->request_r_scale = sc->sc_request_r_scale;
  701                 tp->snd_scale = sc->sc_requested_s_scale;
  702                 tp->rcv_scale = sc->sc_request_r_scale;
  703                 tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
  704         }
  705         if (sc->sc_flags & SCF_TIMESTAMP)
  706                 tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
  707         tp->ts_timebase = sc->sc_timebase;
  708 
  709         tp->t_template = tcp_template(tp);
  710         if (tp->t_template == 0) {
  711                 tp = tcp_drop(tp, ENOBUFS);     /* destroys socket */
  712                 so = NULL;
  713                 m_freem(m);
  714                 goto abort;
  715         }
  716 
  717         tp->iss = sc->sc_iss;
  718         tp->irs = sc->sc_irs;
  719         tcp_sendseqinit(tp);
  720         tcp_rcvseqinit(tp);
  721         tp->t_state = TCPS_SYN_RECEIVED;
  722         TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
  723         TCP_STATINC(TCP_STAT_ACCEPTS);
  724 
  725         if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
  726                 tp->t_flags |= TF_WILL_SACK;
  727 
  728         if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
  729                 tp->t_flags |= TF_ECN_PERMIT;
  730 
  731 #ifdef TCP_SIGNATURE
  732         if (sc->sc_flags & SCF_SIGNATURE)
  733                 tp->t_flags |= TF_SIGNATURE;
  734 #endif
  735 
  736         /* Initialize tp->t_ourmss before we deal with the peer's! */
  737         tp->t_ourmss = sc->sc_ourmaxseg;
  738         tcp_mss_from_peer(tp, sc->sc_peermaxseg);
  739 
  740         /*
  741          * Initialize the initial congestion window.  If we
  742          * had to retransmit the SYN,ACK, we must initialize cwnd
  743          * to 1 segment (i.e. the Loss Window).
  744          */
  745         if (sc->sc_rxtshift)
  746                 tp->snd_cwnd = tp->t_peermss;
  747         else {
  748                 int ss = tcp_init_win;
  749                 if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
  750                         ss = tcp_init_win_local;
  751 #ifdef INET6
  752                 else if (inp->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(inp)))
  753                         ss = tcp_init_win_local;
  754 #endif
  755                 tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
  756         }
  757 
  758         tcp_rmx_rtt(tp);
  759         tp->snd_wl1 = sc->sc_irs;
  760         tp->rcv_up = sc->sc_irs + 1;
  761 
  762         /*
  763          * This is what would have happened in tcp_output() when
  764          * the SYN,ACK was sent.
  765          */
  766         tp->snd_up = tp->snd_una;
  767         tp->snd_max = tp->snd_nxt = tp->iss+1;
  768         TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
  769         if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
  770                 tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
  771         tp->last_ack_sent = tp->rcv_nxt;
  772         tp->t_partialacks = -1;
  773         tp->t_dupacks = 0;
  774 
  775         TCP_STATINC(TCP_STAT_SC_COMPLETED);
  776         s = splsoftnet();
  777         syn_cache_put(sc);
  778         splx(s);
  779         return so;
  780 
  781 resetandabort:
  782         (void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
  783 abort:
  784         if (so != NULL) {
  785                 (void) soqremque(so, 1);
  786                 (void) soabort(so);
  787                 mutex_enter(softnet_lock);
  788         }
  789         s = splsoftnet();
  790         syn_cache_put(sc);
  791         splx(s);
  792         TCP_STATINC(TCP_STAT_SC_ABORTED);
  793         return ((struct socket *)(-1));
  794 }
  795 
  796 /*
  797  * This function is called when we get a RST for a
  798  * non-existent connection, so that we can see if the
  799  * connection is in the syn cache.  If it is, zap it.
  800  */
  801 
  802 void
  803 syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
  804 {
  805         struct syn_cache *sc;
  806         struct syn_cache_head *scp;
  807         int s = splsoftnet();
  808 
  809         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
  810                 splx(s);
  811                 return;
  812         }
  813         if (SEQ_LT(th->th_seq, sc->sc_irs) ||
  814             SEQ_GT(th->th_seq, sc->sc_irs+1)) {
  815                 splx(s);
  816                 return;
  817         }
  818         syn_cache_rm(sc);
  819         TCP_STATINC(TCP_STAT_SC_RESET);
  820         syn_cache_put(sc);      /* calls pool_put but see spl above */
  821         splx(s);
  822 }
  823 
  824 void
  825 syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
  826     struct tcphdr *th)
  827 {
  828         struct syn_cache *sc;
  829         struct syn_cache_head *scp;
  830         int s;
  831 
  832         s = splsoftnet();
  833         if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
  834                 splx(s);
  835                 return;
  836         }
  837         /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
  838         if (ntohl(th->th_seq) != sc->sc_iss) {
  839                 splx(s);
  840                 return;
  841         }
  842 
  843         /*
  844          * If we've retransmitted 3 times and this is our second error,
  845          * we remove the entry.  Otherwise, we allow it to continue on.
  846          * This prevents us from incorrectly nuking an entry during a
  847          * spurious network outage.
  848          *
  849          * See tcp_notify().
  850          */
  851         if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
  852                 sc->sc_flags |= SCF_UNREACH;
  853                 splx(s);
  854                 return;
  855         }
  856 
  857         syn_cache_rm(sc);
  858         TCP_STATINC(TCP_STAT_SC_UNREACH);
  859         syn_cache_put(sc);      /* calls pool_put but see spl above */
  860         splx(s);
  861 }
  862 
  863 /*
  864  * Given a LISTEN socket and an inbound SYN request, add this to the syn
  865  * cache, and send back a segment:
  866  *      <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
  867  * to the source.
  868  *
  869  * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
  870  * Doing so would require that we hold onto the data and deliver it
  871  * to the application.  However, if we are the target of a SYN-flood
  872  * DoS attack, an attacker could send data which would eventually
  873  * consume all available buffer space if it were ACKed.  By not ACKing
  874  * the data, we avoid this DoS scenario.
  875  */
  876 int
  877 syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
  878     unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
  879     int optlen, struct tcp_opt_info *oi)
  880 {
  881         struct tcpcb tb, *tp;
  882         long win;
  883         struct syn_cache *sc;
  884         struct syn_cache_head *scp;
  885         struct mbuf *ipopts;
  886         int s;
  887 
  888         tp = sototcpcb(so);
  889 
  890         /*
  891          * Initialize some local state.
  892          */
  893         win = sbspace(&so->so_rcv);
  894         if (win > TCP_MAXWIN)
  895                 win = TCP_MAXWIN;
  896 
  897 #ifdef TCP_SIGNATURE
  898         if (optp || (tp->t_flags & TF_SIGNATURE))
  899 #else
  900         if (optp)
  901 #endif
  902         {
  903                 tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
  904 #ifdef TCP_SIGNATURE
  905                 tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
  906 #endif
  907                 tb.t_state = TCPS_LISTEN;
  908                 if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
  909                         return 0;
  910         } else
  911                 tb.t_flags = 0;
  912 
  913         switch (src->sa_family) {
  914         case AF_INET:
  915                 /* Remember the IP options, if any. */
  916                 ipopts = ip_srcroute(m);
  917                 break;
  918         default:
  919                 ipopts = NULL;
  920         }
  921 
  922         /*
  923          * See if we already have an entry for this connection.
  924          * If we do, resend the SYN,ACK.  We do not count this
  925          * as a retransmission (XXX though maybe we should).
  926          */
  927         if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
  928                 TCP_STATINC(TCP_STAT_SC_DUPESYN);
  929                 if (ipopts) {
  930                         /*
  931                          * If we were remembering a previous source route,
  932                          * forget it and use the new one we've been given.
  933                          */
  934                         if (sc->sc_ipopts)
  935                                 (void)m_free(sc->sc_ipopts);
  936                         sc->sc_ipopts = ipopts;
  937                 }
  938                 sc->sc_timestamp = tb.ts_recent;
  939                 m_freem(m);
  940                 if (syn_cache_respond(sc) == 0) {
  941                         uint64_t *tcps = TCP_STAT_GETREF();
  942                         tcps[TCP_STAT_SNDACKS]++;
  943                         tcps[TCP_STAT_SNDTOTAL]++;
  944                         TCP_STAT_PUTREF();
  945                 }
  946                 return 1;
  947         }
  948 
  949         s = splsoftnet();
  950         sc = pool_get(&syn_cache_pool, PR_NOWAIT);
  951         splx(s);
  952         if (sc == NULL) {
  953                 if (ipopts)
  954                         (void)m_free(ipopts);
  955                 return 0;
  956         }
  957 
  958         /*
  959          * Fill in the cache, and put the necessary IP and TCP
  960          * options into the reply.
  961          */
  962         memset(sc, 0, sizeof(struct syn_cache));
  963         callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
  964         memcpy(&sc->sc_src, src, src->sa_len);
  965         memcpy(&sc->sc_dst, dst, dst->sa_len);
  966         sc->sc_flags = 0;
  967         sc->sc_ipopts = ipopts;
  968         sc->sc_irs = th->th_seq;
  969         switch (src->sa_family) {
  970         case AF_INET:
  971             {
  972                 struct sockaddr_in *srcin = (void *)src;
  973                 struct sockaddr_in *dstin = (void *)dst;
  974 
  975                 sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
  976                     &srcin->sin_addr, dstin->sin_port,
  977                     srcin->sin_port, sizeof(dstin->sin_addr));
  978                 break;
  979             }
  980 #ifdef INET6
  981         case AF_INET6:
  982             {
  983                 struct sockaddr_in6 *srcin6 = (void *)src;
  984                 struct sockaddr_in6 *dstin6 = (void *)dst;
  985 
  986                 sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
  987                     &srcin6->sin6_addr, dstin6->sin6_port,
  988                     srcin6->sin6_port, sizeof(dstin6->sin6_addr));
  989                 break;
  990             }
  991 #endif
  992         }
  993         sc->sc_peermaxseg = oi->maxseg;
  994         sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
  995             m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
  996         sc->sc_win = win;
  997         sc->sc_timebase = tcp_now - 1;  /* see tcp_newtcpcb() */
  998         sc->sc_timestamp = tb.ts_recent;
  999         if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
 1000             (TF_REQ_TSTMP|TF_RCVD_TSTMP))
 1001                 sc->sc_flags |= SCF_TIMESTAMP;
 1002         if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
 1003             (TF_RCVD_SCALE|TF_REQ_SCALE)) {
 1004                 sc->sc_requested_s_scale = tb.requested_s_scale;
 1005                 sc->sc_request_r_scale = 0;
 1006                 /*
 1007                  * Pick the smallest possible scaling factor that
 1008                  * will still allow us to scale up to sb_max.
 1009                  *
 1010                  * We do this because there are broken firewalls that
 1011                  * will corrupt the window scale option, leading to
 1012                  * the other endpoint believing that our advertised
 1013                  * window is unscaled.  At scale factors larger than
 1014                  * 5 the unscaled window will drop below 1500 bytes,
 1015                  * leading to serious problems when traversing these
 1016                  * broken firewalls.
 1017                  *
 1018                  * With the default sbmax of 256K, a scale factor
 1019                  * of 3 will be chosen by this algorithm.  Those who
 1020                  * choose a larger sbmax should watch out
 1021                  * for the compatibility problems mentioned above.
 1022                  *
 1023                  * RFC1323: The Window field in a SYN (i.e., a <SYN>
 1024                  * or <SYN,ACK>) segment itself is never scaled.
 1025                  */
 1026                 while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
 1027                     (TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
 1028                         sc->sc_request_r_scale++;
 1029         } else {
 1030                 sc->sc_requested_s_scale = 15;
 1031                 sc->sc_request_r_scale = 15;
 1032         }
 1033         if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
 1034                 sc->sc_flags |= SCF_SACK_PERMIT;
 1035 
 1036         /*
 1037          * ECN setup packet received.
 1038          */
 1039         if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
 1040                 sc->sc_flags |= SCF_ECN_PERMIT;
 1041 
 1042 #ifdef TCP_SIGNATURE
 1043         if (tb.t_flags & TF_SIGNATURE)
 1044                 sc->sc_flags |= SCF_SIGNATURE;
 1045 #endif
 1046         sc->sc_tp = tp;
 1047         m_freem(m);
 1048         if (syn_cache_respond(sc) == 0) {
 1049                 uint64_t *tcps = TCP_STAT_GETREF();
 1050                 tcps[TCP_STAT_SNDACKS]++;
 1051                 tcps[TCP_STAT_SNDTOTAL]++;
 1052                 TCP_STAT_PUTREF();
 1053                 syn_cache_insert(sc, tp);
 1054         } else {
 1055                 s = splsoftnet();
 1056                 /*
 1057                  * syn_cache_put() will try to schedule the timer, so
 1058                  * we need to initialize it
 1059                  */
 1060                 syn_cache_timer_arm(sc);
 1061                 syn_cache_put(sc);
 1062                 splx(s);
 1063                 TCP_STATINC(TCP_STAT_SC_DROPPED);
 1064         }
 1065         return 1;
 1066 }
 1067 
 1068 /*
 1069  * syn_cache_respond: (re)send SYN+ACK.
 1070  *
 1071  * Returns 0 on success.
 1072  */
 1073 
 1074 static int
 1075 syn_cache_respond(struct syn_cache *sc)
 1076 {
 1077 #ifdef INET6
 1078         struct rtentry *rt = NULL;
 1079 #endif
 1080         struct route *ro;
 1081         u_int8_t *optp;
 1082         int optlen, error;
 1083         u_int16_t tlen;
 1084         struct ip *ip = NULL;
 1085 #ifdef INET6
 1086         struct ip6_hdr *ip6 = NULL;
 1087 #endif
 1088         struct tcpcb *tp;
 1089         struct tcphdr *th;
 1090         struct mbuf *m;
 1091         u_int hlen;
 1092 #ifdef TCP_SIGNATURE
 1093         struct secasvar *sav = NULL;
 1094         u_int8_t *sigp = NULL;
 1095 #endif
 1096 
 1097         ro = &sc->sc_route;
 1098         switch (sc->sc_src.sa.sa_family) {
 1099         case AF_INET:
 1100                 hlen = sizeof(struct ip);
 1101                 break;
 1102 #ifdef INET6
 1103         case AF_INET6:
 1104                 hlen = sizeof(struct ip6_hdr);
 1105                 break;
 1106 #endif
 1107         default:
 1108                 return EAFNOSUPPORT;
 1109         }
 1110 
 1111         /* Worst case scenario, since we don't know the option size yet. */
 1112         tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
 1113         KASSERT(max_linkhdr + tlen <= MCLBYTES);
 1114 
 1115         /*
 1116          * Create the IP+TCP header from scratch.
 1117          */
 1118         MGETHDR(m, M_DONTWAIT, MT_DATA);
 1119         if (m && (max_linkhdr + tlen) > MHLEN) {
 1120                 MCLGET(m, M_DONTWAIT);
 1121                 if ((m->m_flags & M_EXT) == 0) {
 1122                         m_freem(m);
 1123                         m = NULL;
 1124                 }
 1125         }
 1126         if (m == NULL)
 1127                 return ENOBUFS;
 1128         MCLAIM(m, &tcp_tx_mowner);
 1129 
 1130         tp = sc->sc_tp;
 1131 
 1132         /* Fixup the mbuf. */
 1133         m->m_data += max_linkhdr;
 1134         m_reset_rcvif(m);
 1135         memset(mtod(m, void *), 0, tlen);
 1136 
 1137         switch (sc->sc_src.sa.sa_family) {
 1138         case AF_INET:
 1139                 ip = mtod(m, struct ip *);
 1140                 ip->ip_v = 4;
 1141                 ip->ip_dst = sc->sc_src.sin.sin_addr;
 1142                 ip->ip_src = sc->sc_dst.sin.sin_addr;
 1143                 ip->ip_p = IPPROTO_TCP;
 1144                 th = (struct tcphdr *)(ip + 1);
 1145                 th->th_dport = sc->sc_src.sin.sin_port;
 1146                 th->th_sport = sc->sc_dst.sin.sin_port;
 1147                 break;
 1148 #ifdef INET6
 1149         case AF_INET6:
 1150                 ip6 = mtod(m, struct ip6_hdr *);
 1151                 ip6->ip6_vfc = IPV6_VERSION;
 1152                 ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
 1153                 ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
 1154                 ip6->ip6_nxt = IPPROTO_TCP;
 1155                 /* ip6_plen will be updated in ip6_output() */
 1156                 th = (struct tcphdr *)(ip6 + 1);
 1157                 th->th_dport = sc->sc_src.sin6.sin6_port;
 1158                 th->th_sport = sc->sc_dst.sin6.sin6_port;
 1159                 break;
 1160 #endif
 1161         default:
 1162                 panic("%s: impossible (1)", __func__);
 1163         }
 1164 
 1165         th->th_seq = htonl(sc->sc_iss);
 1166         th->th_ack = htonl(sc->sc_irs + 1);
 1167         th->th_flags = TH_SYN|TH_ACK;
 1168         th->th_win = htons(sc->sc_win);
 1169         /* th_x2, th_sum, th_urp already 0 from memset */
 1170 
 1171         /* Tack on the TCP options. */
 1172         optp = (u_int8_t *)(th + 1);
 1173         optlen = 0;
 1174         *optp++ = TCPOPT_MAXSEG;
 1175         *optp++ = TCPOLEN_MAXSEG;
 1176         *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
 1177         *optp++ = sc->sc_ourmaxseg & 0xff;
 1178         optlen += TCPOLEN_MAXSEG;
 1179 
 1180         if (sc->sc_request_r_scale != 15) {
 1181                 *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
 1182                     TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
 1183                     sc->sc_request_r_scale);
 1184                 optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
 1185                 optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
 1186         }
 1187 
 1188         if (sc->sc_flags & SCF_SACK_PERMIT) {
 1189                 /* Let the peer know that we will SACK. */
 1190                 *optp++ = TCPOPT_SACK_PERMITTED;
 1191                 *optp++ = TCPOLEN_SACK_PERMITTED;
 1192                 optlen += TCPOLEN_SACK_PERMITTED;
 1193         }
 1194 
 1195         if (sc->sc_flags & SCF_TIMESTAMP) {
 1196                 while (optlen % 4 != 2) {
 1197                         optlen += TCPOLEN_NOP;
 1198                         *optp++ = TCPOPT_NOP;
 1199                 }
 1200                 *optp++ = TCPOPT_TIMESTAMP;
 1201                 *optp++ = TCPOLEN_TIMESTAMP;
 1202                 u_int32_t *lp = (u_int32_t *)(optp);
 1203                 /* Form timestamp option as shown in appendix A of RFC 1323. */
 1204                 *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
 1205                 *lp   = htonl(sc->sc_timestamp);
 1206                 optp += TCPOLEN_TIMESTAMP - 2;
 1207                 optlen += TCPOLEN_TIMESTAMP;
 1208         }
 1209 
 1210 #ifdef TCP_SIGNATURE
 1211         if (sc->sc_flags & SCF_SIGNATURE) {
 1212                 sav = tcp_signature_getsav(m);
 1213                 if (sav == NULL) {
 1214                         m_freem(m);
 1215                         return EPERM;
 1216                 }
 1217 
 1218                 *optp++ = TCPOPT_SIGNATURE;
 1219                 *optp++ = TCPOLEN_SIGNATURE;
 1220                 sigp = optp;
 1221                 memset(optp, 0, TCP_SIGLEN);
 1222                 optp += TCP_SIGLEN;
 1223                 optlen += TCPOLEN_SIGNATURE;
 1224         }
 1225 #endif
 1226 
 1227         /*
 1228          * Terminate and pad TCP options to a 4 byte boundary.
 1229          *
 1230          * According to RFC793: "The content of the header beyond the
 1231          * End-of-Option option must be header padding (i.e., zero)."
 1232          * And later: "The padding is composed of zeros."
 1233          */
 1234         if (optlen % 4) {
 1235                 optlen += TCPOLEN_EOL;
 1236                 *optp++ = TCPOPT_EOL;
 1237         }
 1238         while (optlen % 4) {
 1239                 optlen += TCPOLEN_PAD;
 1240                 *optp++ = TCPOPT_PAD;
 1241         }
 1242 
 1243         /* Compute the actual values now that we've added the options. */
 1244         tlen = hlen + sizeof(struct tcphdr) + optlen;
 1245         m->m_len = m->m_pkthdr.len = tlen;
 1246         th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
 1247 
 1248 #ifdef TCP_SIGNATURE
 1249         if (sav) {
 1250                 (void)tcp_signature(m, th, hlen, sav, sigp);
 1251                 key_sa_recordxfer(sav, m);
 1252                 KEY_SA_UNREF(&sav);
 1253         }
 1254 #endif
 1255 
 1256         /*
 1257          * Send ECN SYN-ACK setup packet.
 1258          * Routes can be asymmetric, so, even if we receive a packet
 1259          * with ECE and CWR set, we must not assume no one will block
 1260          * the ECE packet we are about to send.
 1261          */
 1262         if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
 1263             SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
 1264                 th->th_flags |= TH_ECE;
 1265                 TCP_STATINC(TCP_STAT_ECN_SHS);
 1266 
 1267                 /*
 1268                  * draft-ietf-tcpm-ecnsyn-00.txt
 1269                  *
 1270                  * "[...] a TCP node MAY respond to an ECN-setup
 1271                  * SYN packet by setting ECT in the responding
 1272                  * ECN-setup SYN/ACK packet, indicating to routers 
 1273                  * that the SYN/ACK packet is ECN-Capable.
 1274                  * This allows a congested router along the path
 1275                  * to mark the packet instead of dropping the
 1276                  * packet as an indication of congestion."
 1277                  *
 1278                  * "[...] There can be a great benefit in setting
 1279                  * an ECN-capable codepoint in SYN/ACK packets [...]
 1280                  * Congestion is  most likely to occur in
 1281                  * the server-to-client direction.  As a result,
 1282                  * setting an ECN-capable codepoint in SYN/ACK
 1283                  * packets can reduce the occurrence of three-second
 1284                  * retransmit timeouts resulting from the drop
 1285                  * of SYN/ACK packets."
 1286                  *
 1287                  * Page 4 and 6, January 2006.
 1288                  */
 1289 
 1290                 switch (sc->sc_src.sa.sa_family) {
 1291                 case AF_INET:
 1292                         ip->ip_tos |= IPTOS_ECN_ECT0;
 1293                         break;
 1294 #ifdef INET6
 1295                 case AF_INET6:
 1296                         ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
 1297                         break;
 1298 #endif
 1299                 }
 1300                 TCP_STATINC(TCP_STAT_ECN_ECT);
 1301         }
 1302 
 1303 
 1304         /*
 1305          * Compute the packet's checksum.
 1306          *
 1307          * Fill in some straggling IP bits.  Note the stack expects
 1308          * ip_len to be in host order, for convenience.
 1309          */
 1310         switch (sc->sc_src.sa.sa_family) {
 1311         case AF_INET:
 1312                 ip->ip_len = htons(tlen - hlen);
 1313                 th->th_sum = 0;
 1314                 th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
 1315                 ip->ip_len = htons(tlen);
 1316                 ip->ip_ttl = ip_defttl;
 1317                 /* XXX tos? */
 1318                 break;
 1319 #ifdef INET6
 1320         case AF_INET6:
 1321                 ip6->ip6_plen = htons(tlen - hlen);
 1322                 th->th_sum = 0;
 1323                 th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
 1324                 ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
 1325                 ip6->ip6_vfc |= IPV6_VERSION;
 1326                 ip6->ip6_plen = htons(tlen - hlen);
 1327                 /* ip6_hlim will be initialized afterwards */
 1328                 /* XXX flowlabel? */
 1329                 break;
 1330 #endif
 1331         }
 1332 
 1333         /* XXX use IPsec policy on listening socket, on SYN ACK */
 1334         tp = sc->sc_tp;
 1335 
 1336         switch (sc->sc_src.sa.sa_family) {
 1337         case AF_INET:
 1338                 error = ip_output(m, sc->sc_ipopts, ro,
 1339                     (ip_mtudisc ? IP_MTUDISC : 0),
 1340                     NULL, tp ? tp->t_inpcb : NULL);
 1341                 break;
 1342 #ifdef INET6
 1343         case AF_INET6:
 1344                 ip6->ip6_hlim = in6pcb_selecthlim(NULL,
 1345                     (rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
 1346                 rtcache_unref(rt, ro);
 1347 
 1348                 error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
 1349                     tp ? tp->t_inpcb : NULL, NULL);
 1350                 break;
 1351 #endif
 1352         default:
 1353                 panic("%s: impossible (2)", __func__);
 1354         }
 1355 
 1356         return error;
 1357 }

Cache object: 634a75da96250214150ee622494de656


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.