The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/tcp_vtw.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * Copyright (c) 2011 The NetBSD Foundation, Inc.
    3  * All rights reserved.
    4  *
    5  * This code is derived from software contributed to The NetBSD Foundation
    6  * by Coyote Point Systems, Inc.
    7  *
    8  * Redistribution and use in source and binary forms, with or without
    9  * modification, are permitted provided that the following conditions
   10  * are met:
   11  * 1. Redistributions of source code must retain the above copyright
   12  *    notice, this list of conditions and the following disclaimer.
   13  * 2. Redistributions in binary form must reproduce the above copyright
   14  *    notice, this list of conditions and the following disclaimer in the
   15  *    documentation and/or other materials provided with the distribution.
   16  *
   17  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
   18  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
   19  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   20  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
   21  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
   22  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
   23  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
   24  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
   25  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   26  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   27  * POSSIBILITY OF SUCH DAMAGE.
   28  */
   29 
   30 /*
   31  * Reduces the resources demanded by TCP sessions in TIME_WAIT-state using
   32  * methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime
   33  * Truncation (MSLT).
   34  * 
   35  * MSLT and VTW were contributed by Coyote Point Systems, Inc.
   36  * 
   37  * Even after a TCP session enters the TIME_WAIT state, its corresponding
   38  * socket and protocol control blocks (PCBs) stick around until the TCP
   39  * Maximum Segment Lifetime (MSL) expires.  On a host whose workload
   40  * necessarily creates and closes down many TCP sockets, the sockets & PCBs
   41  * for TCP sessions in TIME_WAIT state amount to many megabytes of dead
   42  * weight in RAM.
   43  * 
   44  * Maximum Segment Lifetime Truncation (MSLT) assigns each TCP session to
   45  * a class based on the nearness of the peer.  Corresponding to each class
   46  * is an MSL, and a session uses the MSL of its class.  The classes are
   47  * loopback (local host equals remote host), local (local host and remote
   48  * host are on the same link/subnet), and remote (local host and remote
   49  * host communicate via one or more gateways).  Classes corresponding to
   50  * nearer peers have lower MSLs by default: 2 seconds for loopback, 10
   51  * seconds for local, 60 seconds for remote.  Loopback and local sessions
   52  * expire more quickly when MSLT is used.
   53  * 
   54  * Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket
   55  * dead weight with a compact representation of the session, called a
   56  * "vestigial PCB".  VTW data structures are designed to be very fast and
   57  * memory-efficient: for fast insertion and lookup of vestigial PCBs,
   58  * the PCBs are stored in a hash table that is designed to minimize the
   59  * number of cacheline visits per lookup/insertion.  The memory both
   60  * for vestigial PCBs and for elements of the PCB hashtable comes from
   61  * fixed-size pools, and linked data structures exploit this to conserve
   62  * memory by representing references with a narrow index/offset from the
   63  * start of a pool instead of a pointer.  When space for new vestigial PCBs
   64  * runs out, VTW makes room by discarding old vestigial PCBs, oldest first.
   65  * VTW cooperates with MSLT.
   66  * 
   67  * It may help to think of VTW as a "FIN cache" by analogy to the SYN
   68  * cache.
   69  * 
   70  * A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT
   71  * sessions as fast as it can is approximately 17% idle when VTW is active
   72  * versus 0% idle when VTW is inactive.  It has 103 megabytes more free RAM
   73  * when VTW is active (approximately 64k vestigial PCBs are created) than
   74  * when it is inactive.
   75  */
   76 
   77 #include <sys/cdefs.h>
   78 
   79 #ifdef _KERNEL_OPT
   80 #include "opt_ddb.h"
   81 #include "opt_inet.h"
   82 #include "opt_inet_csum.h"
   83 #include "opt_tcp_debug.h"
   84 #endif
   85 
   86 #include <sys/param.h>
   87 #include <sys/systm.h>
   88 #include <sys/kmem.h>
   89 #include <sys/mbuf.h>
   90 #include <sys/protosw.h>
   91 #include <sys/socket.h>
   92 #include <sys/socketvar.h>
   93 #include <sys/errno.h>
   94 #include <sys/syslog.h>
   95 #include <sys/pool.h>
   96 #include <sys/domain.h>
   97 #include <sys/kernel.h>
   98 #include <net/if.h>
   99 #include <net/if_types.h>
  100 
  101 #include <netinet/in.h>
  102 #include <netinet/in_systm.h>
  103 #include <netinet/ip.h>
  104 #include <netinet/in_pcb.h>
  105 #include <netinet/in_var.h>
  106 #include <netinet/ip_var.h>
  107 #include <netinet/in_offload.h>
  108 #include <netinet/ip6.h>
  109 #include <netinet6/ip6_var.h>
  110 #include <netinet6/in6_pcb.h>
  111 #include <netinet6/ip6_var.h>
  112 #include <netinet6/in6_var.h>
  113 #include <netinet/icmp6.h>
  114 
  115 #include <netinet/tcp.h>
  116 #include <netinet/tcp_fsm.h>
  117 #include <netinet/tcp_seq.h>
  118 #include <netinet/tcp_timer.h>
  119 #include <netinet/tcp_var.h>
  120 #include <netinet/tcp_private.h>
  121 
  122 #include <netinet/tcp_vtw.h>
  123 
  124 __KERNEL_RCSID(0, "$NetBSD: tcp_vtw.c,v 1.24 2022/11/04 09:00:58 ozaki-r Exp $");
  125 
  126 #define db_trace(__a, __b)      do { } while (/*CONSTCOND*/0)   /* tracing is compiled out: both arguments are discarded */
  127 
  128 static void vtw_debug_init(void);       /* forward declaration only; no definition in this part of the file */
  129 
      /* File-scope control state.
       * fat_tcpv4/fat_tcpv6 are fat-pointer controls and vtw_tcpv4/vtw_tcpv6
       * are arrays of VTW_NCLASS vtw controls; the suffixes suggest one set
       * per address family (NOTE(review): confirm against the init code).
       * vtw_stats holds the counters bumped in this file (ins, del, look,
       * probe, hit, losing).
       */
  130 fatp_ctl_t fat_tcpv4;
  131 fatp_ctl_t fat_tcpv6;
  132 vtw_ctl_t  vtw_tcpv4[VTW_NCLASS];
  133 vtw_ctl_t  vtw_tcpv6[VTW_NCLASS];
  134 vtw_stats_t vtw_stats;
  135 
  136 /* We provide state for the lookup_ports iterator.
  137  * As currently we are netlock-protected, there is one.
  138  * If we were finer-grain, we would have one per CPU.
  139  * I do not want to be in the business of alloc/free.
  140  * The best alternate would be allocate on the caller's
  141  * stack, but that would require them to know the struct,
  142  * or at least the size.
  143  * See how she goes.
       *
       * NOTE(review): the code that fills and walks this structure is not
       * in this part of the file, so the field notes below are read off
       * the names and types only -- confirm against the iterator code.
  144  */
  145 struct tcp_ports_iterator {
  146         union {
  147                 struct in_addr  v4;
  148                 struct in6_addr v6;
  149         }               addr;           /* address, in either family */
  150         u_int           port;
  151 
  152         uint32_t        wild    : 1;    /* single-bit flag; presumably "wildcard" -- TODO confirm */
  153 
  154         vtw_ctl_t       *ctl;           /* presumably the control currently being walked */
  155         fatp_t          *fp;            /* presumably the fat pointer currently being walked */
  156 
  157         uint16_t        slot_idx;
  158         uint16_t        ctl_idx;
  159 };
  160 
      /* Statically allocated iterator state; the names suggest one per
       * address family.
       */
  161 static struct tcp_ports_iterator tcp_ports_iterator_v4;
  162 static struct tcp_ports_iterator tcp_ports_iterator_v6;
  163 
      /* Forward declaration: fatp_vtw_inshash() calls vtw_age() when it
       * cannot allocate a fat pointer.
       */
  164 static int vtw_age(vtw_ctl_t *, struct timeval *);
  165 
  166 /*!\brief allocate a fat pointer from a collection.
  167  */
  168 static fatp_t *
  169 fatp_alloc(fatp_ctl_t *fat)
  170 {
  171         fatp_t  *fp     = 0;
  172 
  173         if (fat->nfree) {
  174                 fp = fat->free;
  175                 if (fp) {
  176                         fat->free = fatp_next(fat, fp);
  177                         --fat->nfree;
  178                         ++fat->nalloc;
  179                         fp->nxt = 0;
  180 
  181                         KASSERT(!fp->inuse);
  182                 }
  183         }
  184 
  185         return fp;
  186 }
  187 
  188 /*!\brief free a fat pointer.
  189  */
  190 static void
  191 fatp_free(fatp_ctl_t *fat, fatp_t *fp)
  192 {
  193         if (fp) {
  194                 KASSERT(!fp->inuse);
  195                 KASSERT(!fp->nxt);
  196 
  197                 fp->nxt = fatp_index(fat, fat->free);
  198                 fat->free = fp;
  199 
  200                 ++fat->nfree;
  201                 --fat->nalloc;
  202         }
  203 }
  204 
  205 /*!\brief initialise a collection of fat pointers.
  206  *
  207  *\param n      # hash buckets
  208  *\param m      total # fat pointers to allocate
  209  *
  210  * We allocate 2x as much, as we have two hashes: full and lport only.
  211  */
  212 static void
  213 fatp_init(fatp_ctl_t *fat, uint32_t n, uint32_t m,
  214     fatp_t *fat_base, fatp_t **fat_hash)
  215 {
  216         fatp_t  *fp;
  217 
  218         KASSERT(n <= FATP_MAX / 2);
  219 
  220         fat->hash = fat_hash;
  221         fat->base = fat_base;
  222 
  223         fat->port = &fat->hash[m];
  224 
  225         fat->mask   = m - 1;    // ASSERT is power of 2 (m)
  226         fat->lim    = fat->base + 2*n - 1;
  227         fat->nfree  = 0;
  228         fat->nalloc = 2*n;
  229 
  230         /* Initialise the free list.
  231          */
  232         for (fp = fat->lim; fp >= fat->base; --fp) {
  233                 fatp_free(fat, fp);
  234         }
  235 }
  236 
  /*
   * The `xtra' is XORed into the tag stored: slot i of a fat pointer
   * uses fatp_xtra[i].  The table has 32 entries, which covers the
   * largest slot count handled in this file (31).  It is only ever
   * read, hence const.
   */
  static const uint32_t fatp_xtra[] = {
          0x11111111,0x22222222,0x33333333,0x44444444,
          0x55555555,0x66666666,0x77777777,0x88888888,
          0x12121212,0x21212121,0x34343434,0x43434343,
          0x56565656,0x65656565,0x78787878,0x87878787,
          0x11221122,0x22112211,0x33443344,0x44334433,
          0x55665566,0x66556655,0x77887788,0x88778877,
          0x11112222,0x22221111,0x33334444,0x44443333,
          0x55556666,0x66665555,0x77778888,0x88887777,
  };
  250 
  251 /*!\brief turn a {fatp_t*,slot} into an integral key.
  252  *
  253  * The key can be used to obtain the fatp_t, and the slot,
  254  * as it directly encodes them.
  255  */
  256 static inline uint32_t
  257 fatp_key(fatp_ctl_t *fat, fatp_t *fp, uint32_t slot)
  258 {
  259         CTASSERT(CACHE_LINE_SIZE == 32 ||
  260                  CACHE_LINE_SIZE == 64 ||
  261                  CACHE_LINE_SIZE == 128);
  262 
  263         switch (fatp_ntags()) {
  264         case 7:
  265                 return (fatp_index(fat, fp) << 3) | slot;
  266         case 15:
  267                 return (fatp_index(fat, fp) << 4) | slot;
  268         case 31:
  269                 return (fatp_index(fat, fp) << 5) | slot;
  270         default:
  271                 KASSERT(0 && "no support, for no good reason");
  272                 return ~0;
  273         }
  274 }
  275 
  276 static inline uint32_t
  277 fatp_slot_from_key(fatp_ctl_t *fat, uint32_t key)
  278 {
  279         CTASSERT(CACHE_LINE_SIZE == 32 ||
  280                  CACHE_LINE_SIZE == 64 ||
  281                  CACHE_LINE_SIZE == 128);
  282 
  283         switch (fatp_ntags()) {
  284         case 7:
  285                 return key & 7;
  286         case 15:
  287                 return key & 15;
  288         case 31:
  289                 return key & 31;
  290         default:
  291                 KASSERT(0 && "no support, for no good reason");
  292                 return ~0;
  293         }
  294 }
  295 
  296 static inline fatp_t *
  297 fatp_from_key(fatp_ctl_t *fat, uint32_t key)
  298 {
  299         CTASSERT(CACHE_LINE_SIZE == 32 ||
  300                  CACHE_LINE_SIZE == 64 ||
  301                  CACHE_LINE_SIZE == 128);
  302 
  303         switch (fatp_ntags()) {
  304         case 7:
  305                 key >>= 3;
  306                 break;
  307         case 15:
  308                 key >>= 4;
  309                 break;
  310         case 31:
  311                 key >>= 5;
  312                 break;
  313         default:
  314                 KASSERT(0 && "no support, for no good reason");
  315                 return 0;
  316         }
  317 
  318         return key ? fat->base + key - 1 : 0;
  319 }
  320 
  321 static inline uint32_t
  322 idx_encode(vtw_ctl_t *ctl, uint32_t idx)
  323 {
  324         return (idx << ctl->idx_bits) | idx;
  325 }
  326 
  327 static inline uint32_t
  328 idx_decode(vtw_ctl_t *ctl, uint32_t bits)
  329 {
  330         uint32_t        idx     = bits & ctl->idx_mask;
  331 
  332         if (idx_encode(ctl, idx) == bits)
  333                 return idx;
  334         else
  335                 return ~0;
  336 }
  337 
  338 /*!\brief       insert index into fatp hash
  339  *
  340  *\param        idx     -       index of element being placed in hash chain
  341  *\param        tag     -       32-bit tag identifier
  342  *
  343  *\returns
  344  *      value which can be used to locate entry.
  345  *
  346  *\note
  347  *      we rely on the fact that there are unused high bits in the index
  348  *      for verification purposes on lookup.
  349  */
  350 
  351 static inline uint32_t
  352 fatp_vtw_inshash(fatp_ctl_t *fat, uint32_t idx, uint32_t tag, int which,
  353     void *dbg)
  354 {
  355         fatp_t  *fp;
  356         fatp_t  **hash = (which ? fat->port : fat->hash);
  357         int     i;
  358 
  359         fp = hash[tag & fat->mask];
  360 
  361         while (!fp || fatp_full(fp)) {
  362                 fatp_t  *fq;
  363 
  364                 /* All entries are inuse at the top level.
  365                  * We allocate a spare, and push the top level
  366                  * down one.  All entries in the fp we push down
  367                  * (think of a tape worm here) will be expelled sooner than
  368                  * any entries added subsequently to this hash bucket.
  369                  * This is a property of the time waits we are exploiting.
  370                  */
  371 
  372                 fq = fatp_alloc(fat);
  373                 if (!fq) {
  374                         vtw_age(fat->vtw, 0);
  375                         fp = hash[tag & fat->mask];
  376                         continue;
  377                 }
  378 
  379                 fq->inuse = 0;
  380                 fq->nxt   = fatp_index(fat, fp);
  381 
  382                 hash[tag & fat->mask] = fq;
  383 
  384                 fp = fq;
  385         }
  386 
  387         KASSERT(!fatp_full(fp));
  388 
  389         /* Fill highest index first.  Lookup is lowest first.
  390          */
  391         for (i = fatp_ntags(); --i >= 0; ) {
  392                 if (!((1 << i) & fp->inuse)) {
  393                         break;
  394                 }
  395         }
  396 
  397         fp->inuse |= 1 << i;
  398         fp->tag[i] = tag ^ idx_encode(fat->vtw, idx) ^ fatp_xtra[i];
  399 
  400         db_trace(KTR_VTW
  401                  , (fp, "fat: inuse %5.5x tag[%x] %8.8x"
  402                     , fp->inuse
  403                     , i, fp->tag[i]));
  404 
  405         return fatp_key(fat, fp, i);
  406 }
  407 
  408 static inline int
  409 vtw_alive(const vtw_t *vtw)
  410 {
  411         return vtw->hashed && vtw->expire.tv_sec;
  412 }
  413 
  414 static inline uint32_t
  415 vtw_index_v4(vtw_ctl_t *ctl, vtw_v4_t *v4)
  416 {
  417         if (ctl->base.v4 <= v4 && v4 <= ctl->lim.v4)
  418                 return v4 - ctl->base.v4;
  419 
  420         KASSERT(0 && "vtw out of bounds");
  421 
  422         return ~0;
  423 }
  424 
  425 static inline uint32_t
  426 vtw_index_v6(vtw_ctl_t *ctl, vtw_v6_t *v6)
  427 {
  428         if (ctl->base.v6 <= v6 && v6 <= ctl->lim.v6)
  429                 return v6 - ctl->base.v6;
  430 
  431         KASSERT(0 && "vtw out of bounds");
  432 
  433         return ~0;
  434 }
  435 
  436 static inline uint32_t
  437 vtw_index(vtw_ctl_t *ctl, vtw_t *vtw)
  438 {
  439         if (ctl->clidx)
  440                 ctl = ctl->ctl;
  441 
  442         if (ctl->is_v4)
  443                 return vtw_index_v4(ctl, (vtw_v4_t *)vtw);
  444 
  445         if (ctl->is_v6)
  446                 return vtw_index_v6(ctl, (vtw_v6_t *)vtw);
  447 
  448         KASSERT(0 && "neither 4 nor 6.  most curious.");
  449 
  450         return ~0;
  451 }
  452 
  453 static inline vtw_t *
  454 vtw_from_index(vtw_ctl_t *ctl, uint32_t idx)
  455 {
  456         if (ctl->clidx)
  457                 ctl = ctl->ctl;
  458 
  459         /* See if the index looks like it might be an index.
  460          * Bits on outside of the valid index bits is a give away.
  461          */
  462         idx = idx_decode(ctl, idx);
  463 
  464         if (idx == ~0) {
  465                 return 0;
  466         } else if (ctl->is_v4) {
  467                 vtw_v4_t        *vtw = ctl->base.v4 + idx;
  468 
  469                 return (ctl->base.v4 <= vtw && vtw <= ctl->lim.v4)
  470                         ? &vtw->common : 0;
  471         } else if (ctl->is_v6) {
  472                 vtw_v6_t        *vtw = ctl->base.v6 + idx;
  473 
  474                 return (ctl->base.v6 <= vtw && vtw <= ctl->lim.v6)
  475                         ? &vtw->common : 0;
  476         } else {
  477                 KASSERT(0 && "badness");
  478                 return 0;
  479         }
  480 }
  481 
  482 /*!\brief return the next vtw after this one.
  483  *
  484  * Due to the differing sizes of the entries in differing
  485  * arenas, we have to ensure we ++ the correct pointer type.
  486  *
  487  * Also handles wrap.
  488  */
  489 static inline vtw_t *
  490 vtw_next(vtw_ctl_t *ctl, vtw_t *vtw)
  491 {
  492         if (ctl->is_v4) {
  493                 vtw_v4_t        *v4 = (void*)vtw;
  494 
  495                 vtw = &(++v4)->common;
  496         } else {
  497                 vtw_v6_t        *v6 = (void*)vtw;
  498 
  499                 vtw = &(++v6)->common;
  500         }
  501 
  502         if (vtw > ctl->lim.v)
  503                 vtw = ctl->base.v;
  504 
  505         return vtw;
  506 }
  507 
  508 /*!\brief       remove entry from FATP hash chains
       *
       * Clears the entry's slot in both tables: fat->hash[] (located via
       * vtw->key) and fat->port[] (located via vtw->port_key).  A slot is
       * only cleared when its inuse bit is set and the stored tag equals
       * tag ^ idx_encode(ctl, idx) ^ fatp_xtra[slot].  A fat pointer that
       * becomes empty is unlinked from its bucket chain and freed, unless
       * it is the only one in the bucket, in which case it is retained.
       * After a successful removal the corresponding key is bit-inverted.
       * vtw->hashed is cleared on the way out whether or not the slots
       * matched.
       *
       * An entry that is not hashed asserts and is otherwise left alone.
  509  */
  510 static inline void
  511 vtw_unhash(vtw_ctl_t *ctl, vtw_t *vtw)
  512 {
  513         fatp_ctl_t      *fat    = ctl->fat;
  514         fatp_t          *fp;
  515         uint32_t        key = vtw->key;
  516         uint32_t        tag, slot, idx;
              /* Both views alias the same entry; only the one matching
               * the control's family is read below.
               */
  517         vtw_v4_t        *v4 = (void*)vtw;
  518         vtw_v6_t        *v6 = (void*)vtw;
  519 
  520         if (!vtw->hashed) {
  521                 KASSERT(0 && "unhashed");
  522                 return;
  523         }
  524 
              /* Recompute the full (4-tuple) tag that was used at insert.
               */
  525         if (fat->vtw->is_v4) {
  526                 tag = v4_tag(v4->faddr, v4->fport, v4->laddr, v4->lport);
  527         } else if (fat->vtw->is_v6) {
  528                 tag = v6_tag(&v6->faddr, v6->fport, &v6->laddr, v6->lport);
  529         } else {
  530                 tag = 0;
  531                 KASSERT(0 && "not reached");
  532         }
  533 
  534         /* Remove from fat->hash[]
  535          */
  536         slot = fatp_slot_from_key(fat, key);
  537         fp   = fatp_from_key(fat, key);
  538         idx  = vtw_index(ctl, vtw);
  539 
  540         db_trace(KTR_VTW
  541                  , (fp, "fat: del inuse %5.5x slot %x idx %x key %x tag %x"
  542                     , fp->inuse, slot, idx, key, tag));
  543 
              /* NOTE(review): fp is dereferenced without a null check;
               * fatp_from_key() yields 0 for a key whose index part is 0.
               */
  544         KASSERT(fp->inuse & (1 << slot));
  545         KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
  546                                   ^ fatp_xtra[slot]));
  547 
              /* Same conditions as the assertions above, re-checked so
               * that a mismatch is skipped when assertions are compiled
               * out.
               */
  548         if ((fp->inuse & (1 << slot))
  549             && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
  550                                  ^ fatp_xtra[slot])) {
  551                 fp->inuse ^= 1 << slot;
  552                 fp->tag[slot] = 0;
  553 
  554                 /* When we delete entries, we do not compact.  This is
  555                  * due to temporality.  We add entries, and they
  556                  * (eventually) expire. Older entries will be further
  557                  * down the chain.
  558                  */
  559                 if (!fp->inuse) {
  560                         uint32_t hi = tag & fat->mask;
  561                         fatp_t  *fq = 0;
  562                         fatp_t  *fr = fat->hash[hi];
  563 
                              /* Walk the bucket chain to fp; fq trails
                               * one step behind as its predecessor.
                               */
  564                         while (fr && fr != fp) {
  565                                 fr = fatp_next(fat, fq = fr);
  566                         }
  567 
  568                         if (fr == fp) {
  569                                 if (fq) {
                                              /* Mid-chain: splice out
                                               * and free.
                                               */
  570                                         fq->nxt = fp->nxt;
  571                                         fp->nxt = 0;
  572                                         fatp_free(fat, fp);
  573                                 } else {
  574                                         KASSERT(fat->hash[hi] == fp);
  575 
  576                                         if (fp->nxt) {
                                                      /* Head with a
                                                       * successor: the
                                                       * successor
                                                       * becomes head.
                                                       */
  577                                                 fat->hash[hi]
  578                                                         = fatp_next(fat, fp);
  579                                                 fp->nxt = 0;
  580                                                 fatp_free(fat, fp);
  581                                         } else {
  582                                                 /* retain for next use.
  583                                                  */
  584                                                 ;
  585                                         }
  586                                 }
  587                         } else {
                                      /* fp was not found in its bucket:
                                       * trace the chain and assert.
                                       */
  588                                 fr = fat->hash[hi];
  589 
  590                                 do {
  591                                         db_trace(KTR_VTW
  592                                                  , (fr
  593                                                     , "fat:*del inuse %5.5x"
  594                                                     " nxt %x"
  595                                                     , fr->inuse, fr->nxt));
  596 
  597                                         fr = fatp_next(fat, fq = fr);
  598                                 } while (fr && fr != fp);
  599 
  600                                 KASSERT(0 && "oops");
  601                         }
  602                 }
                      /* Invert every bit of the stored key.
                       */
  603                 vtw->key ^= ~0;
  604         }
  605         
              /* Switch to the lport-only tag.  If the control is neither
               * v4 nor v6, tag keeps the value assigned above.
               */
  606         if (fat->vtw->is_v4) {
  607                 tag = v4_port_tag(v4->lport);
  608         } else if (fat->vtw->is_v6) {
  609                 tag = v6_port_tag(v6->lport);
  610         }
  611 
  612         /* Remove from fat->port[]
  613          */
  614         key  = vtw->port_key;
  615         slot = fatp_slot_from_key(fat, key);
  616         fp   = fatp_from_key(fat, key);
  617         idx  = vtw_index(ctl, vtw);
  618 
  619         db_trace(KTR_VTW
  620                  , (fp, "fatport: del inuse %5.5x"
  621                     " slot %x idx %x key %x tag %x"
  622                     , fp->inuse, slot, idx, key, tag));
  623 
  624         KASSERT(fp->inuse & (1 << slot));
  625         KASSERT(fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
  626                                   ^ fatp_xtra[slot]));
  627 
  628         if ((fp->inuse & (1 << slot))
  629             && fp->tag[slot] == (tag ^ idx_encode(ctl, idx)
  630                                  ^ fatp_xtra[slot])) {
  631                 fp->inuse ^= 1 << slot;
  632                 fp->tag[slot] = 0;
  633 
  634                 if (!fp->inuse) {
  635                         uint32_t hi = tag & fat->mask;
  636                         fatp_t  *fq = 0;
  637                         fatp_t  *fr = fat->port[hi];
  638 
  639                         while (fr && fr != fp) {
  640                                 fr = fatp_next(fat, fq = fr);
  641                         }
  642 
                              /* Same unlink logic as for fat->hash[],
                               * except that a fat pointer missing from
                               * its bucket is silently ignored here.
                               */
  643                         if (fr == fp) {
  644                                 if (fq) {
  645                                         fq->nxt = fp->nxt;
  646                                         fp->nxt = 0;
  647                                         fatp_free(fat, fp);
  648                                 } else {
  649                                         KASSERT(fat->port[hi] == fp);
  650 
  651                                         if (fp->nxt) {
  652                                                 fat->port[hi]
  653                                                         = fatp_next(fat, fp);
  654                                                 fp->nxt = 0;
  655                                                 fatp_free(fat, fp);
  656                                         } else {
  657                                                 /* retain for next use.
  658                                                  */
  659                                                 ;
  660                                         }
  661                                 }
  662                         }
  663                 }
  664                 vtw->port_key ^= ~0;
  665         }
  666 
  667         vtw->hashed = 0;
  668 }
  669 
  670 /*!\brief       remove entry from hash, possibly free.
  671  */
  672 void
  673 vtw_del(vtw_ctl_t *ctl, vtw_t *vtw)
  674 {
  675         KASSERT(mutex_owned(softnet_lock));
  676 
  677         if (vtw->hashed) {
  678                 ++vtw_stats.del;
  679                 vtw_unhash(ctl, vtw);
  680         }
  681 
  682         /* We only delete the oldest entry.
  683          */
  684         if (vtw != ctl->oldest.v)
  685                 return;
  686 
  687         --ctl->nalloc;
  688         ++ctl->nfree;
  689 
  690         vtw->expire.tv_sec  = 0;
  691         vtw->expire.tv_usec = ~0;
  692 
  693         if (!ctl->nalloc)
  694                 ctl->oldest.v = 0;
  695 
  696         ctl->oldest.v = vtw_next(ctl, vtw);
  697 }
  698 
  699 /*!\brief       insert vestigial timewait in hash chain
  700  */
  701 static void
  702 vtw_inshash_v4(vtw_ctl_t *ctl, vtw_t *vtw)
  703 {
  704         uint32_t        idx     = vtw_index(ctl, vtw);
  705         uint32_t        tag;
  706         vtw_v4_t        *v4 = (void*)vtw;
  707 
  708         KASSERT(mutex_owned(softnet_lock));
  709         KASSERT(!vtw->hashed);
  710         KASSERT(ctl->clidx == vtw->msl_class);
  711 
  712         ++vtw_stats.ins;
  713 
  714         tag = v4_tag(v4->faddr, v4->fport,
  715                      v4->laddr, v4->lport);
  716 
  717         vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
  718 
  719         db_trace(KTR_VTW, (ctl
  720                            , "vtw: ins %8.8x:%4.4x %8.8x:%4.4x"
  721                            " tag %8.8x key %8.8x"
  722                            , v4->faddr, v4->fport
  723                            , v4->laddr, v4->lport
  724                            , tag
  725                            , vtw->key));
  726 
  727         tag = v4_port_tag(v4->lport);
  728         vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
  729 
  730         db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
  731                            , v4->lport, v4->lport
  732                            , tag
  733                            , vtw->key));
  734 
  735         vtw->hashed = 1;
  736 }
  737 
  738 /*!\brief       insert vestigial timewait in hash chain
  739  */
  740 static void
  741 vtw_inshash_v6(vtw_ctl_t *ctl, vtw_t *vtw)
  742 {
  743         uint32_t        idx     = vtw_index(ctl, vtw);
  744         uint32_t        tag;
  745         vtw_v6_t        *v6     = (void*)vtw;
  746 
  747         KASSERT(mutex_owned(softnet_lock));
  748         KASSERT(!vtw->hashed);
  749         KASSERT(ctl->clidx == vtw->msl_class);
  750 
  751         ++vtw_stats.ins;
  752 
  753         tag = v6_tag(&v6->faddr, v6->fport,
  754                      &v6->laddr, v6->lport);
  755 
  756         vtw->key = fatp_vtw_inshash(ctl->fat, idx, tag, 0, vtw);
  757 
  758         tag = v6_port_tag(v6->lport);
  759         vtw->port_key = fatp_vtw_inshash(ctl->fat, idx, tag, 1, vtw);
  760 
  761         db_trace(KTR_VTW, (ctl, "vtw: ins %P - %4.4x tag %8.8x key %8.8x"
  762                            , v6->lport, v6->lport
  763                            , tag
  764                            , vtw->key));
  765 
  766         vtw->hashed = 1;
  767 }
  768 
  769 static vtw_t *
  770 vtw_lookup_hash_v4(vtw_ctl_t *ctl, uint32_t faddr, uint16_t fport
  771                                  , uint32_t laddr, uint16_t lport
  772                                  , int which)
  773 {
  774         vtw_v4_t        *v4;
  775         vtw_t           *vtw;
  776         uint32_t        tag;
  777         fatp_t          *fp;
  778         int             i;
  779         uint32_t        fatps = 0, probes = 0, losings = 0;
  780 
  781         if (!ctl || !ctl->fat)
  782                 return 0;
  783 
  784         ++vtw_stats.look[which];
  785 
  786         if (which) {
  787                 tag = v4_port_tag(lport);
  788                 fp  = ctl->fat->port[tag & ctl->fat->mask];
  789         } else {
  790                 tag = v4_tag(faddr, fport, laddr, lport);
  791                 fp  = ctl->fat->hash[tag & ctl->fat->mask];
  792         }
  793 
  794         while (fp && fp->inuse) {
  795                 uint32_t        inuse = fp->inuse;
  796 
  797                 ++fatps;
  798 
  799                 for (i = 0; inuse && i < fatp_ntags(); ++i) {
  800                         uint32_t        idx;
  801 
  802                         if (!(inuse & (1 << i)))
  803                                 continue;
  804 
  805                         inuse ^= 1 << i;
  806 
  807                         ++probes;
  808                         ++vtw_stats.probe[which];
  809 
  810                         idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
  811                         vtw = vtw_from_index(ctl, idx);
  812 
  813                         if (!vtw) {
  814                                 /* Hopefully fast path.
  815                                  */
  816                                 db_trace(KTR_VTW
  817                                          , (fp, "vtw: fast %A:%P %A:%P"
  818                                             " idx %x tag %x"
  819                                             , faddr, fport
  820                                             , laddr, lport
  821                                             , idx, tag));
  822                                 continue;
  823                         }
  824 
  825                         v4 = (void*)vtw;
  826 
  827                         /* The de-referencing of vtw is what we want to avoid.
  828                          * Losing.
  829                          */
  830                         if (vtw_alive(vtw)
  831                             && ((which ? vtw->port_key : vtw->key)
  832                                 == fatp_key(ctl->fat, fp, i))
  833                             && (which
  834                                 || (v4->faddr == faddr && v4->laddr == laddr
  835                                     && v4->fport == fport))
  836                             && v4->lport == lport) {
  837                                 ++vtw_stats.hit[which];
  838 
  839                                 db_trace(KTR_VTW
  840                                          , (fp, "vtw: hit %8.8x:%4.4x"
  841                                             " %8.8x:%4.4x idx %x key %x"
  842                                             , faddr, fport
  843                                             , laddr, lport
  844                                             , idx_decode(ctl, idx), vtw->key));
  845 
  846                                 KASSERT(vtw->hashed);
  847 
  848                                 goto out;
  849                         }
  850                         ++vtw_stats.losing[which];
  851                         ++losings;
  852                         
  853                         if (vtw_alive(vtw)) {
  854                                 db_trace(KTR_VTW
  855                                          , (fp, "vtw:!mis %8.8x:%4.4x"
  856                                             " %8.8x:%4.4x key %x tag %x"
  857                                             , faddr, fport
  858                                             , laddr, lport
  859                                             , fatp_key(ctl->fat, fp, i)
  860                                             , v4_tag(faddr, fport
  861                                                      , laddr, lport)));
  862                                 db_trace(KTR_VTW
  863                                          , (vtw, "vtw:!mis %8.8x:%4.4x"
  864                                             " %8.8x:%4.4x key %x tag %x"
  865                                             , v4->faddr, v4->fport
  866                                             , v4->laddr, v4->lport
  867                                             , vtw->key
  868                                             , v4_tag(v4->faddr, v4->fport
  869                                                      , v4->laddr, v4->lport)));
  870 
  871                                 if (vtw->key == fatp_key(ctl->fat, fp, i)) {
  872                                         db_trace(KTR_VTW
  873                                                  , (vtw, "vtw:!mis %8.8x:%4.4x"
  874                                                     " %8.8x:%4.4x key %x"
  875                                                     " which %x"
  876                                                     , v4->faddr, v4->fport
  877                                                     , v4->laddr, v4->lport
  878                                                     , vtw->key
  879                                                     , which));
  880 
  881                                 } else {
  882                                         db_trace(KTR_VTW
  883                                                  , (vtw
  884                                                     , "vtw:!mis"
  885                                                     " key %8.8x != %8.8x"
  886                                                     " idx %x i %x which %x"
  887                                                     , vtw->key
  888                                                     , fatp_key(ctl->fat, fp, i)
  889                                                     , idx_decode(ctl, idx)
  890                                                     , i
  891                                                     , which));
  892                                 }
  893                         } else {
  894                                 db_trace(KTR_VTW
  895                                          , (fp
  896                                             , "vtw:!mis free entry"
  897                                             " idx %x vtw %p which %x"
  898                                             , idx_decode(ctl, idx)
  899                                             , vtw, which));
  900                         }
  901                 }
  902 
  903                 if (fp->nxt) {
  904                         fp = fatp_next(ctl->fat, fp);
  905                 } else {
  906                         break;
  907                 }
  908         }
  909         ++vtw_stats.miss[which];
  910         vtw = 0;
  911 out:
  912         if (fatps > vtw_stats.max_chain[which])
  913                 vtw_stats.max_chain[which] = fatps;
  914         if (probes > vtw_stats.max_probe[which])
  915                 vtw_stats.max_probe[which] = probes;
  916         if (losings > vtw_stats.max_loss[which])
  917                 vtw_stats.max_loss[which] = losings;
  918 
  919         return vtw;
  920 }
  921 
  922 static vtw_t *
  923 vtw_lookup_hash_v6(vtw_ctl_t *ctl, const struct in6_addr *faddr, uint16_t fport
  924                                  , const struct in6_addr *laddr, uint16_t lport
  925                                  , int which)
  926 {
  927         vtw_v6_t        *v6;
  928         vtw_t           *vtw;
  929         uint32_t        tag;
  930         fatp_t          *fp;
  931         int             i;
  932         uint32_t        fatps = 0, probes = 0, losings = 0;
  933 
  934         ++vtw_stats.look[which];
  935 
  936         if (!ctl || !ctl->fat)
  937                 return 0;
  938 
  939         if (which) {
  940                 tag = v6_port_tag(lport);
  941                 fp  = ctl->fat->port[tag & ctl->fat->mask];
  942         } else {
  943                 tag = v6_tag(faddr, fport, laddr, lport);
  944                 fp  = ctl->fat->hash[tag & ctl->fat->mask];
  945         }
  946 
  947         while (fp && fp->inuse) {
  948                 uint32_t        inuse = fp->inuse;
  949 
  950                 ++fatps;
  951 
  952                 for (i = 0; inuse && i < fatp_ntags(); ++i) {
  953                         uint32_t        idx;
  954 
  955                         if (!(inuse & (1 << i)))
  956                                 continue;
  957 
  958                         inuse ^= 1 << i;
  959 
  960                         ++probes;
  961                         ++vtw_stats.probe[which];
  962 
  963                         idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
  964                         vtw = vtw_from_index(ctl, idx);
  965 
  966                         db_trace(KTR_VTW
  967                                  , (fp, "probe: %2d %6A:%4.4x %6A:%4.4x idx %x"
  968                                     , i
  969                                     , db_store(faddr, sizeof (*faddr)), fport
  970                                     , db_store(laddr, sizeof (*laddr)), lport
  971                                     , idx_decode(ctl, idx)));
  972 
  973                         if (!vtw) {
  974                                 /* Hopefully fast path.
  975                                  */
  976                                 continue;
  977                         }
  978 
  979                         v6 = (void*)vtw;
  980 
  981                         if (vtw_alive(vtw)
  982                             && ((which ? vtw->port_key : vtw->key)
  983                                 == fatp_key(ctl->fat, fp, i))
  984                             && v6->lport == lport
  985                             && (which
  986                                 || (v6->fport == fport
  987                                     && !bcmp(&v6->faddr, faddr, sizeof (*faddr))
  988                                     && !bcmp(&v6->laddr, laddr
  989                                              , sizeof (*laddr))))) {
  990                                 ++vtw_stats.hit[which];
  991 
  992                                 KASSERT(vtw->hashed);
  993                                 goto out;
  994                         } else {
  995                                 ++vtw_stats.losing[which];
  996                                 ++losings;
  997                         }
  998                 }
  999 
 1000                 if (fp->nxt) {
 1001                         fp = fatp_next(ctl->fat, fp);
 1002                 } else {
 1003                         break;
 1004                 }
 1005         }
 1006         ++vtw_stats.miss[which];
 1007         vtw = 0;
 1008 out:
 1009         if (fatps > vtw_stats.max_chain[which])
 1010                 vtw_stats.max_chain[which] = fatps;
 1011         if (probes > vtw_stats.max_probe[which])
 1012                 vtw_stats.max_probe[which] = probes;
 1013         if (losings > vtw_stats.max_loss[which])
 1014                 vtw_stats.max_loss[which] = losings;
 1015 
 1016         return vtw;
 1017 }
 1018 
 1019 /*!\brief port iterator
 1020  */
 1021 static vtw_t *
 1022 vtw_next_port_v4(struct tcp_ports_iterator *it)
 1023 {
 1024         vtw_ctl_t       *ctl = it->ctl;
 1025         vtw_v4_t        *v4;
 1026         vtw_t           *vtw;
 1027         uint32_t        tag;
 1028         uint16_t        lport = it->port;
 1029         fatp_t          *fp;
 1030         int             i;
 1031         uint32_t        fatps = 0, probes = 0, losings = 0;
 1032 
 1033         tag = v4_port_tag(lport);
 1034         if (!it->fp) {
 1035                 it->fp = ctl->fat->port[tag & ctl->fat->mask];
 1036                 it->slot_idx = 0;
 1037         }
 1038         fp  = it->fp;
 1039 
 1040         while (fp) {
 1041                 uint32_t        inuse = fp->inuse;
 1042 
 1043                 ++fatps;
 1044 
 1045                 for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
 1046                         uint32_t        idx;
 1047 
 1048                         if (!(inuse & (1 << i)))
 1049                                 continue;
 1050 
 1051                         inuse &= ~0U << i;
 1052 
 1053                         if (i < it->slot_idx)
 1054                                 continue;
 1055 
 1056                         ++vtw_stats.probe[1];
 1057                         ++probes;
 1058 
 1059                         idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
 1060                         vtw = vtw_from_index(ctl, idx);
 1061 
 1062                         if (!vtw) {
 1063                                 /* Hopefully fast path.
 1064                                  */
 1065                                 continue;
 1066                         }
 1067 
 1068                         v4 = (void*)vtw;
 1069 
 1070                         if (vtw_alive(vtw)
 1071                             && vtw->port_key == fatp_key(ctl->fat, fp, i)
 1072                             && v4->lport == lport) {
 1073                                 ++vtw_stats.hit[1];
 1074 
 1075                                 it->slot_idx = i + 1;
 1076 
 1077                                 goto out;
 1078                         } else if (vtw_alive(vtw)) {
 1079                                 ++vtw_stats.losing[1];
 1080                                 ++losings;
 1081 
 1082                                 db_trace(KTR_VTW
 1083                                          , (vtw, "vtw:!mis"
 1084                                             " port %8.8x:%4.4x %8.8x:%4.4x"
 1085                                             " key %x port %x"
 1086                                             , v4->faddr, v4->fport
 1087                                             , v4->laddr, v4->lport
 1088                                             , vtw->key
 1089                                             , lport));
 1090                         } else {
 1091                                 /* Really losing here.  We are coming
 1092                                  * up with references to free entries.
 1093                                  * Might find it better to use
 1094                                  * traditional, or need another
 1095                                  * add-hockery.  The other add-hockery
 1096                                  * would be to pul more into into the
 1097                                  * cache line to reject the false
 1098                                  * hits.
 1099                                  */
 1100                                 ++vtw_stats.losing[1];
 1101                                 ++losings;
 1102                                 db_trace(KTR_VTW
 1103                                          , (fp, "vtw:!mis port %x"
 1104                                             " - free entry idx %x vtw %p"
 1105                                             , lport
 1106                                             , idx_decode(ctl, idx)
 1107                                             , vtw));
 1108                         }
 1109                 }
 1110 
 1111                 if (fp->nxt) {
 1112                         it->fp = fp = fatp_next(ctl->fat, fp);
 1113                         it->slot_idx = 0;
 1114                 } else {
 1115                         it->fp = 0;
 1116                         break;
 1117                 }
 1118         }
 1119         ++vtw_stats.miss[1];
 1120 
 1121         vtw = 0;
 1122 out:
 1123         if (fatps > vtw_stats.max_chain[1])
 1124                 vtw_stats.max_chain[1] = fatps;
 1125         if (probes > vtw_stats.max_probe[1])
 1126                 vtw_stats.max_probe[1] = probes;
 1127         if (losings > vtw_stats.max_loss[1])
 1128                 vtw_stats.max_loss[1] = losings;
 1129 
 1130         return vtw;
 1131 }
 1132 
 1133 /*!\brief port iterator
 1134  */
 1135 static vtw_t *
 1136 vtw_next_port_v6(struct tcp_ports_iterator *it)
 1137 {
 1138         vtw_ctl_t       *ctl = it->ctl;
 1139         vtw_v6_t        *v6;
 1140         vtw_t           *vtw;
 1141         uint32_t        tag;
 1142         uint16_t        lport = it->port;
 1143         fatp_t          *fp;
 1144         int             i;
 1145         uint32_t        fatps = 0, probes = 0, losings = 0;
 1146 
 1147         tag = v6_port_tag(lport);
 1148         if (!it->fp) {
 1149                 it->fp = ctl->fat->port[tag & ctl->fat->mask];
 1150                 it->slot_idx = 0;
 1151         }
 1152         fp  = it->fp;
 1153 
 1154         while (fp) {
 1155                 uint32_t        inuse = fp->inuse;
 1156 
 1157                 ++fatps;
 1158 
 1159                 for (i = it->slot_idx; inuse && i < fatp_ntags(); ++i) {
 1160                         uint32_t        idx;
 1161 
 1162                         if (!(inuse & (1 << i)))
 1163                                 continue;
 1164 
 1165                         inuse &= ~0U << i;
 1166 
 1167                         if (i < it->slot_idx)
 1168                                 continue;
 1169 
 1170                         ++vtw_stats.probe[1];
 1171                         ++probes;
 1172 
 1173                         idx = fp->tag[i] ^ tag ^ fatp_xtra[i];
 1174                         vtw = vtw_from_index(ctl, idx);
 1175 
 1176                         if (!vtw) {
 1177                                 /* Hopefully fast path.
 1178                                  */
 1179                                 continue;
 1180                         }
 1181 
 1182                         v6 = (void*)vtw;
 1183 
 1184                         db_trace(KTR_VTW
 1185                                  , (vtw, "vtw: i %x idx %x fp->tag %x"
 1186                                     " tag %x xtra %x"
 1187                                     , i, idx_decode(ctl, idx)
 1188                                     , fp->tag[i], tag, fatp_xtra[i]));
 1189 
 1190                         if (vtw_alive(vtw)
 1191                             && vtw->port_key == fatp_key(ctl->fat, fp, i)
 1192                             && v6->lport == lport) {
 1193                                 ++vtw_stats.hit[1];
 1194 
 1195                                 db_trace(KTR_VTW
 1196                                          , (fp, "vtw: nxt port %P - %4.4x"
 1197                                             " idx %x key %x"
 1198                                             , lport, lport
 1199                                             , idx_decode(ctl, idx), vtw->key));
 1200 
 1201                                 it->slot_idx = i + 1;
 1202                                 goto out;
 1203                         } else if (vtw_alive(vtw)) {
 1204                                 ++vtw_stats.losing[1];
 1205 
 1206                                 db_trace(KTR_VTW
 1207                                          , (vtw, "vtw:!mis port %6A:%4.4x"
 1208                                             " %6A:%4.4x key %x port %x"
 1209                                             , db_store(&v6->faddr
 1210                                                        , sizeof (v6->faddr))
 1211                                             , v6->fport
 1212                                             , db_store(&v6->laddr
 1213                                                        , sizeof (v6->faddr))
 1214                                             , v6->lport
 1215                                             , vtw->key
 1216                                             , lport));
 1217                         } else {
 1218                                 /* Really losing here.  We are coming
 1219                                  * up with references to free entries.
 1220                                  * Might find it better to use
 1221                                  * traditional, or need another
 1222                                  * add-hockery.  The other add-hockery
 1223                                  * would be to pul more into into the
 1224                                  * cache line to reject the false
 1225                                  * hits.
 1226                                  */
 1227                                 ++vtw_stats.losing[1];
 1228                                 ++losings;
 1229 
 1230                                 db_trace(KTR_VTW
 1231                                          , (fp
 1232                                             , "vtw:!mis port %x"
 1233                                             " - free entry idx %x vtw %p"
 1234                                             , lport, idx_decode(ctl, idx)
 1235                                             , vtw));
 1236                         }
 1237                 }
 1238 
 1239                 if (fp->nxt) {
 1240                         it->fp = fp = fatp_next(ctl->fat, fp);
 1241                         it->slot_idx = 0;
 1242                 } else {
 1243                         it->fp = 0;
 1244                         break;
 1245                 }
 1246         }
 1247         ++vtw_stats.miss[1];
 1248 
 1249         vtw = 0;
 1250 out:
 1251         if (fatps > vtw_stats.max_chain[1])
 1252                 vtw_stats.max_chain[1] = fatps;
 1253         if (probes > vtw_stats.max_probe[1])
 1254                 vtw_stats.max_probe[1] = probes;
 1255         if (losings > vtw_stats.max_loss[1])
 1256                 vtw_stats.max_loss[1] = losings;
 1257 
 1258         return vtw;
 1259 }
 1260 
 1261 /*!\brief initialise the VTW allocation arena
 1262  *
 1263  * There are 1+3 allocation classes:
 1264  *      0       classless
 1265  *      {1,2,3} MSL-class based allocation
 1266  *
 1267  * The allocation arenas are all initialised.  Classless gets all the
 1268  * space.  MSL-class based divides the arena, so that allocation
 1269  * within a class can proceed without having to consider entries
 1270  * (aka: cache lines) from different classes.
 1271  *
 1272  * Usually, we are completely classless or class-based, but there can be
 1273  * transition periods, corresponding to dynamic adjustments in the config
 1274  * by the operator.
 1275  */
 1276 static void
 1277 vtw_init(fatp_ctl_t *fat, vtw_ctl_t *ctl, const uint32_t n, vtw_t *ctl_base_v)
 1278 {
 1279         int class_n, i;
 1280         vtw_t   *base;
 1281 
 1282         ctl->base.v = ctl_base_v;
 1283 
 1284         if (ctl->is_v4) {
 1285                 ctl->lim.v4    = ctl->base.v4 + n - 1;
 1286                 ctl->alloc.v4  = ctl->base.v4;
 1287         } else {
 1288                 ctl->lim.v6    = ctl->base.v6 + n - 1;
 1289                 ctl->alloc.v6  = ctl->base.v6;
 1290         }
 1291 
 1292         ctl->nfree  = n;
 1293         ctl->ctl    = ctl;
 1294 
 1295         ctl->idx_bits = 32;
 1296         for (ctl->idx_mask = ~0; (ctl->idx_mask & (n-1)) == n-1; ) {
 1297                 ctl->idx_mask >>= 1;
 1298                 ctl->idx_bits  -= 1;
 1299         }
 1300 
 1301         ctl->idx_mask <<= 1;
 1302         ctl->idx_mask  |= 1;
 1303         ctl->idx_bits  += 1;
 1304 
 1305         ctl->fat = fat;
 1306         fat->vtw = ctl;
 1307 
 1308         /* Divide the resources equally amongst the classes.
 1309          * This is not optimal, as the different classes
 1310          * arrive and leave at different rates, but it is
 1311          * the best I can do for now.
 1312          */
 1313         class_n = n / (VTW_NCLASS-1);
 1314         base    = ctl->base.v;
 1315 
 1316         for (i = 1; i < VTW_NCLASS; ++i) {
 1317                 int j;
 1318 
 1319                 ctl[i] = ctl[0];
 1320                 ctl[i].clidx = i;
 1321 
 1322                 ctl[i].base.v = base;
 1323                 ctl[i].alloc  = ctl[i].base;
 1324 
 1325                 for (j = 0; j < class_n - 1; ++j) {
 1326                         if (tcp_msl_enable)
 1327                                 base->msl_class = i;
 1328                         base = vtw_next(ctl, base);
 1329                 }
 1330 
 1331                 ctl[i].lim.v = base;
 1332                 base = vtw_next(ctl, base);
 1333                 ctl[i].nfree = class_n;
 1334         }
 1335 
 1336         vtw_debug_init();
 1337 }
 1338 
 1339 /*!\brief       map class to TCP MSL
 1340  */
 1341 static inline uint32_t
 1342 class_to_msl(int msl_class)
 1343 {
 1344         switch (msl_class) {
 1345         case 0:
 1346         case 1:
 1347                 return tcp_msl_remote ? tcp_msl_remote : (TCPTV_MSL >> 0);
 1348         case 2:
 1349                 return tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
 1350         default:
 1351                 return tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
 1352         }
 1353 }
 1354 
 1355 /*!\brief       map TCP MSL to class
 1356  */
 1357 static inline uint32_t
 1358 msl_to_class(int msl)
 1359 {
 1360         if (tcp_msl_enable) {
 1361                 if (msl <= (tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2)))
 1362                         return 1+2;
 1363                 if (msl <= (tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1)))
 1364                         return 1+1;
 1365                 return 1;
 1366         }
 1367         return 0;
 1368 }
 1369 
 1370 /*!\brief allocate a vtw entry
 1371  */
 1372 static inline vtw_t *
 1373 vtw_alloc(vtw_ctl_t *ctl)
 1374 {
 1375         vtw_t   *vtw    = 0;
 1376         int     stuck   = 0;
 1377         int     avail   = ctl ? (ctl->nalloc + ctl->nfree) : 0;
 1378         int     msl;
 1379 
 1380         KASSERT(mutex_owned(softnet_lock));
 1381 
 1382         /* If no resources, we will not get far.
 1383          */
 1384         if (!ctl || !ctl->base.v4 || avail <= 0)
 1385                 return 0;
 1386 
 1387         /* Obtain a free one.
 1388          */
 1389         while (!ctl->nfree) {
 1390                 vtw_age(ctl, 0);
 1391 
 1392                 if (++stuck > avail) {
 1393                         /* When in transition between
 1394                          * schemes (classless, classed) we
 1395                          * can be stuck having to await the
 1396                          * expiration of cross-allocated entries.
 1397                          *
 1398                          * Returning zero means we will fall back to the
 1399                          * traditional TIME_WAIT handling, except in the
 1400                          * case of a re-shed, in which case we cannot
 1401                          * perform the reshecd, but will retain the extant
 1402                          * entry.
 1403                          */
 1404                         db_trace(KTR_VTW
 1405                                  , (ctl, "vtw:!none free in class %x %x/%x"
 1406                                     , ctl->clidx
 1407                                     , ctl->nalloc, ctl->nfree));
 1408 
 1409                         return 0;
 1410                 }
 1411         }
 1412 
 1413         vtw = ctl->alloc.v;
 1414 
 1415         if (vtw->msl_class != ctl->clidx) {
 1416                 /* Usurping rules:
 1417                  *      0 -> {1,2,3} or {1,2,3} -> 0
 1418                  */
 1419                 KASSERT(!vtw->msl_class || !ctl->clidx);
 1420 
 1421                 if (vtw->hashed || vtw->expire.tv_sec) {
 1422                     /* As this is owned by some other class,
 1423                      * we must wait for it to expire it.
 1424                      * This will only happen on class/classless
 1425                      * transitions, which are guaranteed to progress
 1426                      * to completion in small finite time, barring bugs.
 1427                      */
 1428                     db_trace(KTR_VTW
 1429                              , (ctl, "vtw:!%p class %x!=%x %x:%x%s"
 1430                                 , vtw, vtw->msl_class, ctl->clidx
 1431                                 , vtw->expire.tv_sec
 1432                                 , vtw->expire.tv_usec
 1433                                 , vtw->hashed ? " hashed" : ""));
 1434 
 1435                     return 0;
 1436                 }
 1437 
 1438                 db_trace(KTR_VTW
 1439                          , (ctl, "vtw:!%p usurped from %x to %x"
 1440                             , vtw, vtw->msl_class, ctl->clidx));
 1441 
 1442                 vtw->msl_class = ctl->clidx;
 1443         }
 1444 
 1445         if (vtw_alive(vtw)) {
 1446                 KASSERT(0 && "next free not free");
 1447                 return 0;
 1448         }
 1449 
 1450         /* Advance allocation pointer.
 1451          */
 1452         ctl->alloc.v = vtw_next(ctl, vtw);
 1453 
 1454         --ctl->nfree;
 1455         ++ctl->nalloc;
 1456 
 1457         msl = (2 * class_to_msl(ctl->clidx) * 1000) / PR_SLOWHZ;        // msec
 1458 
 1459         /* mark expiration
 1460          */
 1461         getmicrouptime(&vtw->expire);
 1462 
 1463         /* Move expiration into the future.
 1464          */
 1465         vtw->expire.tv_sec  += msl / 1000;
 1466         vtw->expire.tv_usec += 1000 * (msl % 1000);
 1467 
 1468         while (vtw->expire.tv_usec >= 1000*1000) {
 1469                 vtw->expire.tv_usec -= 1000*1000;
 1470                 vtw->expire.tv_sec  += 1;
 1471         }
 1472 
 1473         if (!ctl->oldest.v)
 1474                 ctl->oldest.v = vtw;
 1475 
 1476         return vtw;
 1477 }
 1478 
 1479 /*!\brief expiration
 1480  */
 1481 static int
 1482 vtw_age(vtw_ctl_t *ctl, struct timeval *_when)
 1483 {
 1484         vtw_t   *vtw;
 1485         struct timeval then, *when = _when;
 1486         int     maxtries = 0;
 1487 
 1488         if (!ctl->oldest.v) {
 1489                 KASSERT(!ctl->nalloc);
 1490                 return 0;
 1491         }
 1492 
 1493         for (vtw = ctl->oldest.v; vtw && ctl->nalloc; ) {
 1494                 if (++maxtries > ctl->nalloc)
 1495                         break;
 1496 
 1497                 if (vtw->msl_class != ctl->clidx) {
 1498                         db_trace(KTR_VTW
 1499                                  , (vtw, "vtw:!age class mismatch %x != %x"
 1500                                     , vtw->msl_class, ctl->clidx));
 1501                         /* XXXX
 1502                          * See if the appropriate action is to skip to the next.
 1503                          * XXXX
 1504                          */
 1505                         ctl->oldest.v = vtw = vtw_next(ctl, vtw);
 1506                         continue;
 1507                 }
 1508                 if (!when) {
 1509                         /* Latch oldest timeval if none specified.
 1510                          */
 1511                         then = vtw->expire;
 1512                         when = &then;
 1513                 }
 1514 
 1515                 if (!timercmp(&vtw->expire, when, <=))
 1516                         break;
 1517 
 1518                 db_trace(KTR_VTW
 1519                          , (vtw, "vtw: expire %x %8.8x:%8.8x %x/%x"
 1520                             , ctl->clidx
 1521                             , vtw->expire.tv_sec
 1522                             , vtw->expire.tv_usec
 1523                             , ctl->nalloc
 1524                             , ctl->nfree));
 1525 
 1526                 if (!_when)
 1527                         ++vtw_stats.kill;
 1528 
 1529                 vtw_del(ctl, vtw);
 1530                 vtw = ctl->oldest.v;
 1531         }
 1532 
 1533         return ctl->nalloc;     // # remaining allocated
 1534 }
 1535 
 1536 static callout_t vtw_cs;
 1537 
 1538 /*!\brief notice the passage of time.
 1539  * It seems to be getting faster.  What happened to the year?
 1540  */
 1541 static void
 1542 vtw_tick(void *arg)
 1543 {
 1544         struct timeval now;
 1545         int i, cnt = 0;
 1546 
 1547         getmicrouptime(&now);
 1548 
 1549         db_trace(KTR_VTW, (arg, "vtk: tick - now %8.8x:%8.8x"
 1550                            , now.tv_sec, now.tv_usec));
 1551 
 1552         mutex_enter(softnet_lock);
 1553 
 1554         for (i = 0; i < VTW_NCLASS; ++i) {
 1555                 cnt += vtw_age(&vtw_tcpv4[i], &now);
 1556                 cnt += vtw_age(&vtw_tcpv6[i], &now);
 1557         }
 1558 
 1559         /* Keep ticks coming while we need them.
 1560          */
 1561         if (cnt)
 1562                 callout_schedule(&vtw_cs, hz / 5);
 1563         else {
 1564                 tcp_vtw_was_enabled = 0;
 1565                 tcbtable.vestige    = 0;
 1566         }
 1567         mutex_exit(softnet_lock);
 1568 }
 1569 
 1570 /* inpcb_lookup_locals assist for handling vestigial entries.
 1571  */
 1572 static void *
 1573 tcp_init_ports_v4(struct in_addr addr, u_int port, int wild)
 1574 {
 1575         struct tcp_ports_iterator *it = &tcp_ports_iterator_v4;
 1576 
 1577         bzero(it, sizeof (*it));
 1578 
 1579         /* Note: the reference to vtw_tcpv4[0] is fine.
 1580          * We do not need per-class iteration.  We just
 1581          * need to get to the fat, and there is one
 1582          * shared fat.
 1583          */
 1584         if (vtw_tcpv4[0].fat) {
 1585                 it->addr.v4 = addr;
 1586                 it->port = port;
 1587                 it->wild = !!wild;
 1588                 it->ctl  = &vtw_tcpv4[0];
 1589 
 1590                 ++vtw_stats.look[1];
 1591         }
 1592 
 1593         return it;
 1594 }
 1595 
 1596 /*!\brief export an IPv4 vtw.
 1597  */
 1598 static int
 1599 vtw_export_v4(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
 1600 {
 1601         vtw_v4_t        *v4 = (void*)vtw;
 1602 
 1603         bzero(res, sizeof (*res));
 1604 
 1605         if (ctl && vtw) {
 1606                 if (!ctl->clidx && vtw->msl_class)
 1607                         ctl += vtw->msl_class;
 1608                 else
 1609                         KASSERT(ctl->clidx == vtw->msl_class);
 1610 
 1611                 res->valid = 1;
 1612                 res->v4    = 1;
 1613 
 1614                 res->faddr.v4.s_addr = v4->faddr;
 1615                 res->laddr.v4.s_addr = v4->laddr;
 1616                 res->fport      = v4->fport;
 1617                 res->lport      = v4->lport;
 1618                 res->vtw        = vtw;          // netlock held over call(s)
 1619                 res->ctl        = ctl;
 1620                 res->reuse_addr = vtw->reuse_addr;
 1621                 res->reuse_port = vtw->reuse_port;
 1622                 res->snd_nxt    = vtw->snd_nxt;
 1623                 res->rcv_nxt    = vtw->rcv_nxt;
 1624                 res->rcv_wnd    = vtw->rcv_wnd;
 1625                 res->uid        = vtw->uid;
 1626         }
 1627 
 1628         return res->valid;
 1629 }
 1630 
 1631 /*!\brief return next port in the port iterator.  yowza.
 1632  */
 1633 static int
 1634 tcp_next_port_v4(void *arg, struct vestigial_inpcb *res)
 1635 {
 1636         struct tcp_ports_iterator *it = arg;
 1637         vtw_t           *vtw = 0;
 1638 
 1639         if (it->ctl)
 1640                 vtw = vtw_next_port_v4(it);
 1641 
 1642         if (!vtw)
 1643                 it->ctl = 0;
 1644 
 1645         return vtw_export_v4(it->ctl, vtw, res);
 1646 }
 1647 
 1648 static int
 1649 tcp_lookup_v4(struct in_addr faddr, uint16_t fport,
 1650               struct in_addr laddr, uint16_t lport,
 1651               struct vestigial_inpcb *res)
 1652 {
 1653         vtw_t           *vtw;
 1654         vtw_ctl_t       *ctl;
 1655 
 1656 
 1657         db_trace(KTR_VTW
 1658                  , (res, "vtw: lookup %A:%P %A:%P"
 1659                     , faddr, fport
 1660                     , laddr, lport));
 1661 
 1662         vtw = vtw_lookup_hash_v4((ctl = &vtw_tcpv4[0])
 1663                                  , faddr.s_addr, fport
 1664                                  , laddr.s_addr, lport, 0);
 1665 
 1666         return vtw_export_v4(ctl, vtw, res);
 1667 }
 1668 
 1669 /* inpcb_lookup_locals assist for handling vestigial entries.
 1670  */
 1671 static void *
 1672 tcp_init_ports_v6(const struct in6_addr *addr, u_int port, int wild)
 1673 {
 1674         struct tcp_ports_iterator *it = &tcp_ports_iterator_v6;
 1675 
 1676         bzero(it, sizeof (*it));
 1677 
 1678         /* Note: the reference to vtw_tcpv6[0] is fine.
 1679          * We do not need per-class iteration.  We just
 1680          * need to get to the fat, and there is one
 1681          * shared fat.
 1682          */
 1683         if (vtw_tcpv6[0].fat) {
 1684                 it->addr.v6 = *addr;
 1685                 it->port = port;
 1686                 it->wild = !!wild;
 1687                 it->ctl  = &vtw_tcpv6[0];
 1688 
 1689                 ++vtw_stats.look[1];
 1690         }
 1691 
 1692         return it;
 1693 }
 1694 
 1695 /*!\brief export an IPv6 vtw.
 1696  */
 1697 static int
 1698 vtw_export_v6(vtw_ctl_t *ctl, vtw_t *vtw, vestigial_inpcb_t *res)
 1699 {
 1700         vtw_v6_t        *v6 = (void*)vtw;
 1701 
 1702         bzero(res, sizeof (*res));
 1703 
 1704         if (ctl && vtw) {
 1705                 if (!ctl->clidx && vtw->msl_class)
 1706                         ctl += vtw->msl_class;
 1707                 else
 1708                         KASSERT(ctl->clidx == vtw->msl_class);
 1709 
 1710                 res->valid = 1;
 1711                 res->v4    = 0;
 1712 
 1713                 res->faddr.v6   = v6->faddr;
 1714                 res->laddr.v6   = v6->laddr;
 1715                 res->fport      = v6->fport;
 1716                 res->lport      = v6->lport;
 1717                 res->vtw        = vtw;          // netlock held over call(s)
 1718                 res->ctl        = ctl;
 1719 
 1720                 res->v6only     = vtw->v6only;
 1721                 res->reuse_addr = vtw->reuse_addr;
 1722                 res->reuse_port = vtw->reuse_port;
 1723 
 1724                 res->snd_nxt    = vtw->snd_nxt;
 1725                 res->rcv_nxt    = vtw->rcv_nxt;
 1726                 res->rcv_wnd    = vtw->rcv_wnd;
 1727                 res->uid        = vtw->uid;
 1728         }
 1729 
 1730         return res->valid;
 1731 }
 1732 
 1733 static int
 1734 tcp_next_port_v6(void *arg, struct vestigial_inpcb *res)
 1735 {
 1736         struct tcp_ports_iterator *it = arg;
 1737         vtw_t           *vtw = 0;
 1738 
 1739         if (it->ctl)
 1740                 vtw = vtw_next_port_v6(it);
 1741 
 1742         if (!vtw)
 1743                 it->ctl = 0;
 1744 
 1745         return vtw_export_v6(it->ctl, vtw, res);
 1746 }
 1747 
 1748 static int
 1749 tcp_lookup_v6(const struct in6_addr *faddr, uint16_t fport,
 1750               const struct in6_addr *laddr, uint16_t lport,
 1751               struct vestigial_inpcb *res)
 1752 {
 1753         vtw_ctl_t       *ctl;
 1754         vtw_t           *vtw;
 1755 
 1756         db_trace(KTR_VTW
 1757                  , (res, "vtw: lookup %6A:%P %6A:%P"
 1758                     , db_store(faddr, sizeof (*faddr)), fport
 1759                     , db_store(laddr, sizeof (*laddr)), lport));
 1760 
 1761         vtw = vtw_lookup_hash_v6((ctl = &vtw_tcpv6[0])
 1762                                  , faddr, fport
 1763                                  , laddr, lport, 0);
 1764 
 1765         return vtw_export_v6(ctl, vtw, res);
 1766 }
 1767 
 1768 static vestigial_hooks_t tcp_hooks = {
 1769         .init_ports4    = tcp_init_ports_v4,
 1770         .next_port4     = tcp_next_port_v4,
 1771         .lookup4        = tcp_lookup_v4,
 1772         .init_ports6    = tcp_init_ports_v6,
 1773         .next_port6     = tcp_next_port_v6,
 1774         .lookup6        = tcp_lookup_v6,
 1775 };
 1776 
 1777 static bool
 1778 vtw_select(int af, fatp_ctl_t **fatp, vtw_ctl_t **ctlp)
 1779 {
 1780         fatp_ctl_t      *fat;
 1781         vtw_ctl_t       *ctl;
 1782 
 1783         switch (af) {
 1784         case AF_INET:
 1785                 fat = &fat_tcpv4;
 1786                 ctl = &vtw_tcpv4[0];
 1787                 break;
 1788         case AF_INET6:
 1789                 fat = &fat_tcpv6;
 1790                 ctl = &vtw_tcpv6[0];
 1791                 break;
 1792         default:
 1793                 return false;
 1794         }
 1795         if (fatp != NULL)
 1796                 *fatp = fat;
 1797         if (ctlp != NULL)
 1798                 *ctlp = ctl;
 1799         return true;
 1800 }
 1801 
 1802 /*!\brief       initialize controlling instance
 1803  */
 1804 static int
 1805 vtw_control_init(int af)
 1806 {
 1807         fatp_ctl_t      *fat;
 1808         vtw_ctl_t       *ctl;
 1809         fatp_t          *fat_base;
 1810         fatp_t          **fat_hash;
 1811         vtw_t           *ctl_base_v;
 1812         uint32_t        n, m;
 1813         size_t sz;
 1814 
 1815         KASSERT(powerof2(tcp_vtw_entries));
 1816 
 1817         if (!vtw_select(af, &fat, &ctl))
 1818                 return EAFNOSUPPORT;
 1819 
 1820         if (fat->hash != NULL) {
 1821                 KASSERT(fat->base != NULL && ctl->base.v != NULL);
 1822                 return 0;
 1823         }
 1824 
 1825         /* Allocate 10% more capacity in the fat pointers.
 1826          * We should only need ~#hash additional based on
 1827          * how they age, but TIME_WAIT assassination could cause
 1828          * sparse fat pointer utilisation.
 1829          */
 1830         m = 512;
 1831         n = 2*m + (11 * (tcp_vtw_entries / fatp_ntags())) / 10;
 1832         sz = (ctl->is_v4 ? sizeof(vtw_v4_t) : sizeof(vtw_v6_t));
 1833 
 1834         fat_hash = kmem_zalloc(2*m * sizeof(fatp_t *), KM_SLEEP);
 1835         fat_base = kmem_zalloc(2*n * sizeof(fatp_t), KM_SLEEP);
 1836         ctl_base_v = kmem_zalloc(tcp_vtw_entries * sz, KM_SLEEP);
 1837         fatp_init(fat, n, m, fat_base, fat_hash);
 1838         vtw_init(fat, ctl, tcp_vtw_entries, ctl_base_v);
 1839 
 1840         return 0;
 1841 }
 1842 
 1843 /*!\brief       select controlling instance
 1844  */
 1845 static vtw_ctl_t *
 1846 vtw_control(int af, uint32_t msl)
 1847 {
 1848         fatp_ctl_t      *fat;
 1849         vtw_ctl_t       *ctl;
 1850         int             msl_class = msl_to_class(msl);
 1851 
 1852         if (!vtw_select(af, &fat, &ctl))
 1853                 return NULL;
 1854 
 1855         if (!fat->base || !ctl->base.v)
 1856                 return NULL;
 1857 
 1858         if (!tcp_vtw_was_enabled) {
 1859                 /* This guarantees is timer ticks until we no longer need them.
 1860                  */
 1861                 tcp_vtw_was_enabled = 1;
 1862 
 1863                 callout_schedule(&vtw_cs, hz / 5);
 1864 
 1865                 tcbtable.vestige = &tcp_hooks;
 1866         }
 1867 
 1868         return ctl + msl_class;
 1869 }
 1870 
 1871 /*!\brief       add TCP pcb to vestigial timewait
 1872  */
 1873 int
 1874 vtw_add(int af, struct tcpcb *tp)
 1875 {
 1876 #ifdef VTW_DEBUG
 1877         int             enable;
 1878 #endif
 1879         vtw_ctl_t       *ctl;
 1880         vtw_t           *vtw;
 1881 
 1882         KASSERT(mutex_owned(softnet_lock));
 1883 
 1884         ctl = vtw_control(af, tp->t_msl);
 1885         if (!ctl)
 1886                 return 0;
 1887 
 1888 #ifdef VTW_DEBUG
 1889         enable = (af == AF_INET) ? tcp4_vtw_enable : tcp6_vtw_enable;
 1890 #endif
 1891 
 1892         vtw = vtw_alloc(ctl);
 1893 
 1894         if (vtw) {
 1895                 vtw->snd_nxt = tp->snd_nxt;
 1896                 vtw->rcv_nxt = tp->rcv_nxt;
 1897 
 1898                 switch (af) {
 1899                 case AF_INET: {
 1900                         struct inpcb    *inp = tp->t_inpcb;
 1901                         vtw_v4_t        *v4  = (void*)vtw;
 1902 
 1903                         v4->faddr = in4p_faddr(inp).s_addr;
 1904                         v4->laddr = in4p_laddr(inp).s_addr;
 1905                         v4->fport = inp->inp_fport;
 1906                         v4->lport = inp->inp_lport;
 1907 
 1908                         vtw->reuse_port = !!(inp->inp_socket->so_options
 1909                                              & SO_REUSEPORT);
 1910                         vtw->reuse_addr = !!(inp->inp_socket->so_options
 1911                                              & SO_REUSEADDR);
 1912                         vtw->v6only     = 0;
 1913                         vtw->uid        = inp->inp_socket->so_uidinfo->ui_uid;
 1914 
 1915                         vtw_inshash_v4(ctl, vtw);
 1916 
 1917 
 1918 #ifdef VTW_DEBUG
 1919                         /* Immediate lookup (connected and port) to
 1920                          * ensure at least that works!
 1921                          */
 1922                         if (enable & 4) {
 1923                                 KASSERT(vtw_lookup_hash_v4
 1924                                         (ctl
 1925                                          , in4p_faddr(inp).s_addr, inp->inp_fport
 1926                                          , in4p_laddr(inp).s_addr, inp->inp_lport
 1927                                          , 0)
 1928                                         == vtw);
 1929                                 KASSERT(vtw_lookup_hash_v4
 1930                                         (ctl
 1931                                          , in4p_faddr(inp).s_addr, inp->inp_fport
 1932                                          , in4p_laddr(inp).s_addr, inp->inp_lport
 1933                                          , 1));
 1934                         }
 1935                         /* Immediate port iterator functionality check: not wild
 1936                          */
 1937                         if (enable & 8) {
 1938                                 struct tcp_ports_iterator *it;
 1939                                 struct vestigial_inpcb res;
 1940                                 int cnt = 0;
 1941 
 1942                                 it = tcp_init_ports_v4(in4p_laddr(inp)
 1943                                                        , inp->inp_lport, 0);
 1944 
 1945                                 while (tcp_next_port_v4(it, &res)) {
 1946                                         ++cnt;
 1947                                 }
 1948                                 KASSERT(cnt);
 1949                         }
 1950                         /* Immediate port iterator functionality check: wild
 1951                          */
 1952                         if (enable & 16) {
 1953                                 struct tcp_ports_iterator *it;
 1954                                 struct vestigial_inpcb res;
 1955                                 struct in_addr any;
 1956                                 int cnt = 0;
 1957 
 1958                                 any.s_addr = htonl(INADDR_ANY);
 1959 
 1960                                 it = tcp_init_ports_v4(any, inp->inp_lport, 1);
 1961 
 1962                                 while (tcp_next_port_v4(it, &res)) {
 1963                                         ++cnt;
 1964                                 }
 1965                                 KASSERT(cnt);
 1966                         }
 1967 #endif /* VTW_DEBUG */
 1968                         break;
 1969                 }
 1970 
 1971                 case AF_INET6: {
 1972                         struct inpcb    *inp = tp->t_inpcb;
 1973                         vtw_v6_t        *v6  = (void*)vtw;
 1974 
 1975                         v6->faddr = in6p_faddr(inp);
 1976                         v6->laddr = in6p_laddr(inp);
 1977                         v6->fport = inp->inp_fport;
 1978                         v6->lport = inp->inp_lport;
 1979 
 1980                         vtw->reuse_port = !!(inp->inp_socket->so_options
 1981                                              & SO_REUSEPORT);
 1982                         vtw->reuse_addr = !!(inp->inp_socket->so_options
 1983                                              & SO_REUSEADDR);
 1984                         vtw->v6only     = !!(inp->inp_flags
 1985                                              & IN6P_IPV6_V6ONLY);
 1986                         vtw->uid        = inp->inp_socket->so_uidinfo->ui_uid;
 1987 
 1988                         vtw_inshash_v6(ctl, vtw);
 1989 #ifdef VTW_DEBUG
 1990                         /* Immediate lookup (connected and port) to
 1991                          * ensure at least that works!
 1992                          */
 1993                         if (enable & 4) {
 1994                                 KASSERT(vtw_lookup_hash_v6(ctl
 1995                                          , &in6p_faddr(inp), inp->inp_fport
 1996                                          , &in6p_laddr(inp), inp->inp_lport
 1997                                          , 0)
 1998                                         == vtw);
 1999                                 KASSERT(vtw_lookup_hash_v6
 2000                                         (ctl
 2001                                          , &in6p_faddr(inp), inp->inp_fport
 2002                                          , &in6p_laddr(inp), inp->inp_lport
 2003                                          , 1));
 2004                         }
 2005                         /* Immediate port iterator functionality check: not wild
 2006                          */
 2007                         if (enable & 8) {
 2008                                 struct tcp_ports_iterator *it;
 2009                                 struct vestigial_inpcb res;
 2010                                 int cnt = 0;
 2011 
 2012                                 it = tcp_init_ports_v6(&in6p_laddr(inp)
 2013                                                        , inp->inp_lport, 0);
 2014 
 2015                                 while (tcp_next_port_v6(it, &res)) {
 2016                                         ++cnt;
 2017                                 }
 2018                                 KASSERT(cnt);
 2019                         }
 2020                         /* Immediate port iterator functionality check: wild
 2021                          */
 2022                         if (enable & 16) {
 2023                                 struct tcp_ports_iterator *it;
 2024                                 struct vestigial_inpcb res;
 2025                                 static struct in6_addr any = IN6ADDR_ANY_INIT;
 2026                                 int cnt = 0;
 2027 
 2028                                 it = tcp_init_ports_v6(&any
 2029                                                        , inp->inp_lport, 1);
 2030 
 2031                                 while (tcp_next_port_v6(it, &res)) {
 2032                                         ++cnt;
 2033                                 }
 2034                                 KASSERT(cnt);
 2035                         }
 2036 #endif /* VTW_DEBUG */
 2037                         break;
 2038                 }
 2039                 }
 2040 
 2041                 tcp_canceltimers(tp);
 2042                 tp = tcp_close(tp);
 2043                 KASSERT(!tp);
 2044 
 2045                 return 1;
 2046         }
 2047 
 2048         return 0;
 2049 }
 2050 
 2051 /*!\brief       restart timer for vestigial time-wait entry
 2052  */
 2053 static void
 2054 vtw_restart_v4(vestigial_inpcb_t *vp)
 2055 {
 2056         vtw_v4_t        copy = *(vtw_v4_t*)vp->vtw;
 2057         vtw_t           *vtw;
 2058         vtw_t           *cp  = &copy.common;
 2059         vtw_ctl_t       *ctl;
 2060 
 2061         KASSERT(mutex_owned(softnet_lock));
 2062 
 2063         db_trace(KTR_VTW
 2064                  , (vp->vtw, "vtw: restart %A:%P %A:%P"
 2065                     , vp->faddr.v4.s_addr, vp->fport
 2066                     , vp->laddr.v4.s_addr, vp->lport));
 2067 
 2068         /* Class might have changed, so have a squiz.
 2069          */
 2070         ctl = vtw_control(AF_INET, class_to_msl(cp->msl_class));
 2071         vtw = vtw_alloc(ctl);
 2072 
 2073         if (vtw) {
 2074                 vtw_v4_t        *v4  = (void*)vtw;
 2075 
 2076                 /* Safe now to unhash the old entry
 2077                  */
 2078                 vtw_del(vp->ctl, vp->vtw);
 2079 
 2080                 vtw->snd_nxt = cp->snd_nxt;
 2081                 vtw->rcv_nxt = cp->rcv_nxt;
 2082 
 2083                 v4->faddr = copy.faddr;
 2084                 v4->laddr = copy.laddr;
 2085                 v4->fport = copy.fport;
 2086                 v4->lport = copy.lport;
 2087 
 2088                 vtw->reuse_port = cp->reuse_port;
 2089                 vtw->reuse_addr = cp->reuse_addr;
 2090                 vtw->v6only     = 0;
 2091                 vtw->uid        = cp->uid;
 2092 
 2093                 vtw_inshash_v4(ctl, vtw);
 2094         }
 2095 
 2096         vp->valid = 0;
 2097 }
 2098 
 2099 /*!\brief       restart timer for vestigial time-wait entry
 2100  */
 2101 static void
 2102 vtw_restart_v6(vestigial_inpcb_t *vp)
 2103 {
 2104         vtw_v6_t        copy = *(vtw_v6_t*)vp->vtw;
 2105         vtw_t           *vtw;
 2106         vtw_t           *cp  = &copy.common;
 2107         vtw_ctl_t       *ctl;
 2108 
 2109         KASSERT(mutex_owned(softnet_lock));
 2110 
 2111         db_trace(KTR_VTW
 2112                  , (vp->vtw, "vtw: restart %6A:%P %6A:%P"
 2113                     , db_store(&vp->faddr.v6, sizeof (vp->faddr.v6))
 2114                     , vp->fport
 2115                     , db_store(&vp->laddr.v6, sizeof (vp->laddr.v6))
 2116                     , vp->lport));
 2117 
 2118         /* Class might have changed, so have a squiz.
 2119          */
 2120         ctl = vtw_control(AF_INET6, class_to_msl(cp->msl_class));
 2121         vtw = vtw_alloc(ctl);
 2122 
 2123         if (vtw) {
 2124                 vtw_v6_t        *v6  = (void*)vtw;
 2125 
 2126                 /* Safe now to unhash the old entry
 2127                  */
 2128                 vtw_del(vp->ctl, vp->vtw);
 2129 
 2130                 vtw->snd_nxt = cp->snd_nxt;
 2131                 vtw->rcv_nxt = cp->rcv_nxt;
 2132 
 2133                 v6->faddr = copy.faddr;
 2134                 v6->laddr = copy.laddr;
 2135                 v6->fport = copy.fport;
 2136                 v6->lport = copy.lport;
 2137 
 2138                 vtw->reuse_port = cp->reuse_port;
 2139                 vtw->reuse_addr = cp->reuse_addr;
 2140                 vtw->v6only     = cp->v6only;
 2141                 vtw->uid        = cp->uid;
 2142 
 2143                 vtw_inshash_v6(ctl, vtw);
 2144         }
 2145 
 2146         vp->valid = 0;
 2147 }
 2148 
 2149 /*!\brief       restart timer for vestigial time-wait entry
 2150  */
 2151 void
 2152 vtw_restart(vestigial_inpcb_t *vp)
 2153 {
 2154         if (!vp || !vp->valid)
 2155                 return;
 2156 
 2157         if (vp->v4)
 2158                 vtw_restart_v4(vp);
 2159         else
 2160                 vtw_restart_v6(vp);
 2161 }
 2162 
 2163 int
 2164 sysctl_tcp_vtw_enable(SYSCTLFN_ARGS)
 2165 {  
 2166         int en, rc;
 2167         struct sysctlnode node;
 2168 
 2169         node = *rnode;
 2170         en = *(int *)rnode->sysctl_data;
 2171         node.sysctl_data = &en;
 2172 
 2173         rc = sysctl_lookup(SYSCTLFN_CALL(&node));
 2174         if (rc != 0 || newp == NULL)
 2175                 return rc;
 2176 
 2177         if (rnode->sysctl_data != &tcp4_vtw_enable &&
 2178             rnode->sysctl_data != &tcp6_vtw_enable)
 2179                 rc = ENOENT;
 2180         else if ((en & 1) == 0)
 2181                 rc = 0;
 2182         else if (rnode->sysctl_data == &tcp4_vtw_enable)
 2183                 rc = vtw_control_init(AF_INET);
 2184         else /* rnode->sysctl_data == &tcp6_vtw_enable */
 2185                 rc = vtw_control_init(AF_INET6);
 2186 
 2187         if (rc == 0)
 2188                 *(int *)rnode->sysctl_data = en;
 2189 
 2190         return rc;
 2191 }
 2192 
 2193 int
 2194 vtw_earlyinit(void)
 2195 {
 2196         int i, rc;
 2197 
 2198         callout_init(&vtw_cs, 0);
 2199         callout_setfunc(&vtw_cs, vtw_tick, 0);
 2200 
 2201         for (i = 0; i < VTW_NCLASS; ++i) {
 2202                 vtw_tcpv4[i].is_v4 = 1;
 2203                 vtw_tcpv6[i].is_v6 = 1;
 2204         }
 2205 
 2206         if ((tcp4_vtw_enable & 1) != 0 &&
 2207             (rc = vtw_control_init(AF_INET)) != 0)
 2208                 return rc;
 2209 
 2210         if ((tcp6_vtw_enable & 1) != 0 &&
 2211             (rc = vtw_control_init(AF_INET6)) != 0)
 2212                 return rc;
 2213 
 2214         return 0;
 2215 }
 2216 
 2217 #ifdef VTW_DEBUG
 2218 #include <sys/syscallargs.h>
 2219 #include <sys/sysctl.h>
 2220 
 2221 /*!\brief       add lalp, fafp entries for debug
 2222  */
 2223 int
 2224 vtw_debug_add(int af, sin_either_t *la, sin_either_t *fa, int msl, int msl_class)
 2225 {
 2226         vtw_ctl_t       *ctl;
 2227         vtw_t           *vtw;
 2228 
 2229         ctl = vtw_control(af, msl ? msl : class_to_msl(msl_class));
 2230         if (!ctl)
 2231                 return 0;
 2232 
 2233         vtw = vtw_alloc(ctl);
 2234 
 2235         if (vtw) {
 2236                 vtw->snd_nxt = 0;
 2237                 vtw->rcv_nxt = 0;
 2238 
 2239                 switch (af) {
 2240                 case AF_INET: {
 2241                         vtw_v4_t        *v4  = (void*)vtw;
 2242 
 2243                         v4->faddr = fa->sin_addr.v4.s_addr;
 2244                         v4->laddr = la->sin_addr.v4.s_addr;
 2245                         v4->fport = fa->sin_port;
 2246                         v4->lport = la->sin_port;
 2247 
 2248                         vtw->reuse_port = 1;
 2249                         vtw->reuse_addr = 1;
 2250                         vtw->v6only     = 0;
 2251                         vtw->uid        = 0;
 2252 
 2253                         vtw_inshash_v4(ctl, vtw);
 2254                         break;
 2255                 }
 2256 
 2257                 case AF_INET6: {
 2258                         vtw_v6_t        *v6  = (void*)vtw;
 2259 
 2260                         v6->faddr = fa->sin_addr.v6;
 2261                         v6->laddr = la->sin_addr.v6;
 2262 
 2263                         v6->fport = fa->sin_port;
 2264                         v6->lport = la->sin_port;
 2265 
 2266                         vtw->reuse_port = 1;
 2267                         vtw->reuse_addr = 1;
 2268                         vtw->v6only     = 0;
 2269                         vtw->uid        = 0;
 2270 
 2271                         vtw_inshash_v6(ctl, vtw);
 2272                         break;
 2273                 }
 2274 
 2275                 default:
 2276                         break;
 2277                 }
 2278 
 2279                 return 1;
 2280         }
 2281 
 2282         return 0;
 2283 }
 2284 
 2285 static int vtw_syscall = 0;
 2286 
 2287 static int
 2288 vtw_debug_process(vtw_sysargs_t *ap)
 2289 {
 2290         struct vestigial_inpcb vestige;
 2291         int     rc = 0;
 2292 
 2293         mutex_enter(softnet_lock);
 2294 
 2295         switch (ap->op) {
 2296         case 0:         // insert
 2297                 vtw_debug_add(ap->la.sin_family
 2298                               , &ap->la
 2299                               , &ap->fa
 2300                               , TCPTV_MSL
 2301                               , 0);
 2302                 break;
 2303 
 2304         case 1:         // lookup
 2305         case 2:         // restart
 2306                 switch (ap->la.sin_family) {
 2307                 case AF_INET:
 2308                         if (tcp_lookup_v4(ap->fa.sin_addr.v4, ap->fa.sin_port,
 2309                                           ap->la.sin_addr.v4, ap->la.sin_port,
 2310                                           &vestige)) {
 2311                                 if (ap->op == 2) {
 2312                                         vtw_restart(&vestige);
 2313                                 }
 2314                                 rc = 0;
 2315                         } else
 2316                                 rc = ESRCH;
 2317                         break;
 2318 
 2319                 case AF_INET6:
 2320                         if (tcp_lookup_v6(&ap->fa.sin_addr.v6, ap->fa.sin_port,
 2321                                           &ap->la.sin_addr.v6, ap->la.sin_port,
 2322                                           &vestige)) {
 2323                                 if (ap->op == 2) {
 2324                                         vtw_restart(&vestige);
 2325                                 }
 2326                                 rc = 0;
 2327                         } else
 2328                                 rc = ESRCH;
 2329                         break;
 2330                 default:
 2331                         rc = EINVAL;
 2332                 }
 2333                 break;
 2334 
 2335         default:
 2336                 rc = EINVAL;
 2337         }
 2338 
 2339         mutex_exit(softnet_lock);
 2340         return rc;
 2341 }
 2342 
 2343 struct sys_vtw_args {
 2344         syscallarg(const vtw_sysargs_t *) req;
 2345         syscallarg(size_t) len;
 2346 };
 2347 
 2348 static int
 2349 vtw_sys(struct lwp *l, const void *_, register_t *retval)
 2350 {
 2351         const struct sys_vtw_args *uap = _;
 2352         void    *buf;
 2353         int     rc;
 2354         size_t  len     = SCARG(uap, len);
 2355 
 2356         if (len != sizeof (vtw_sysargs_t))
 2357                 return EINVAL;
 2358 
 2359         buf = kmem_alloc(len, KM_SLEEP);
 2360         rc = copyin(SCARG(uap, req), buf, len);
 2361         if (!rc) {
 2362                 rc = vtw_debug_process(buf);
 2363         }
 2364         kmem_free(buf, len);
 2365 
 2366         return rc;
 2367 }
 2368 
 2369 static void
 2370 vtw_sanity_check(void)
 2371 {
 2372         vtw_ctl_t       *ctl;
 2373         vtw_t           *vtw;
 2374         int             i;
 2375         int             n;
 2376 
 2377         for (i = 0; i < VTW_NCLASS; ++i) {
 2378                 ctl = &vtw_tcpv4[i];
 2379 
 2380                 if (!ctl->base.v || ctl->nalloc)
 2381                         continue;
 2382 
 2383                 for (n = 0, vtw = ctl->base.v; ; ) {
 2384                         ++n;
 2385                         vtw = vtw_next(ctl, vtw);
 2386                         if (vtw == ctl->base.v)
 2387                                 break;
 2388                 }
 2389                 db_trace(KTR_VTW
 2390                          , (ctl, "sanity: class %x n %x nfree %x"
 2391                             , i, n, ctl->nfree));
 2392 
 2393                 KASSERT(n == ctl->nfree);
 2394         }
 2395 
 2396         for (i = 0; i < VTW_NCLASS; ++i) {
 2397                 ctl = &vtw_tcpv6[i];
 2398 
 2399                 if (!ctl->base.v || ctl->nalloc)
 2400                         continue;
 2401 
 2402                 for (n = 0, vtw = ctl->base.v; ; ) {
 2403                         ++n;
 2404                         vtw = vtw_next(ctl, vtw);
 2405                         if (vtw == ctl->base.v)
 2406                                 break;
 2407                 }
 2408                 db_trace(KTR_VTW
 2409                          , (ctl, "sanity: class %x n %x nfree %x"
 2410                             , i, n, ctl->nfree));
 2411                 KASSERT(n == ctl->nfree);
 2412         }
 2413 }
 2414                 
 2415 /*!\brief       Initialise debug support.
 2416  */
 2417 static void
 2418 vtw_debug_init(void)
 2419 {
 2420         int     i;
 2421 
 2422         vtw_sanity_check();
 2423 
 2424         if (vtw_syscall)
 2425                 return;
 2426 
 2427         for (i = 511; i; --i) {
 2428                 if (sysent[i].sy_call == sys_nosys) {
 2429                         sysent[i].sy_call    = vtw_sys;
 2430                         sysent[i].sy_narg    = 2;
 2431                         sysent[i].sy_argsize = sizeof (struct sys_vtw_args);
 2432                         sysent[i].sy_flags   = 0;
 2433 
 2434                         vtw_syscall = i;
 2435                         break;
 2436                 }
 2437         }
 2438         if (i) {
 2439                 const struct sysctlnode *node;
 2440                 uint32_t        flags;
 2441 
 2442                 flags = sysctl_root.sysctl_flags;
 2443 
 2444                 sysctl_root.sysctl_flags |= CTLFLAG_READWRITE;
 2445                 sysctl_root.sysctl_flags &= ~CTLFLAG_PERMANENT;
 2446 
 2447                 sysctl_createv(0, 0, 0, &node,
 2448                                CTLFLAG_PERMANENT, CTLTYPE_NODE,
 2449                                "koff",
 2450                                SYSCTL_DESCR("Kernel Obscure Feature Finder"),
 2451                                0, 0, 0, 0, CTL_CREATE, CTL_EOL);
 2452 
 2453                 if (!node) {
 2454                         sysctl_createv(0, 0, 0, &node,
 2455                                        CTLFLAG_PERMANENT, CTLTYPE_NODE,
 2456                                        "koffka",
 2457                                        SYSCTL_DESCR("The Real(tm) Kernel"
 2458                                                     " Obscure Feature Finder"),
 2459                                        0, 0, 0, 0, CTL_CREATE, CTL_EOL);
 2460                 }
 2461                 if (node) {
 2462                         sysctl_createv(0, 0, 0, 0,
 2463                                        CTLFLAG_PERMANENT|CTLFLAG_READONLY,
 2464                                        CTLTYPE_INT, "vtw_debug_syscall",
 2465                                        SYSCTL_DESCR("vtw debug"
 2466                                                     " system call number"),
 2467                                        0, 0, &vtw_syscall, 0, node->sysctl_num,
 2468                                        CTL_CREATE, CTL_EOL);
 2469                 }
 2470                 sysctl_root.sysctl_flags = flags;
 2471         }
 2472 }
 2473 #else /* !VTW_DEBUG */
 2474 static void
 2475 vtw_debug_init(void)
 2476 {
 2477         return;
 2478 }
 2479 #endif /* !VTW_DEBUG */

Cache object: 5f3e2e483a01eab59e43367f918664a0


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.