The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/in_pcb.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1982, 1986, 1991, 1993, 1995
    5  *      The Regents of the University of California.
    6  * Copyright (c) 2007-2009 Robert N. M. Watson
    7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
    8  * All rights reserved.
    9  *
   10  * Portions of this software were developed by Robert N. M. Watson under
   11  * contract to Juniper Networks, Inc.
   12  *
   13  * Redistribution and use in source and binary forms, with or without
   14  * modification, are permitted provided that the following conditions
   15  * are met:
   16  * 1. Redistributions of source code must retain the above copyright
   17  *    notice, this list of conditions and the following disclaimer.
   18  * 2. Redistributions in binary form must reproduce the above copyright
   19  *    notice, this list of conditions and the following disclaimer in the
   20  *    documentation and/or other materials provided with the distribution.
   21  * 3. Neither the name of the University nor the names of its contributors
   22  *    may be used to endorse or promote products derived from this software
   23  *    without specific prior written permission.
   24  *
   25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   35  * SUCH DAMAGE.
   36  *
   37  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
   38  */
   39 
   40 #include <sys/cdefs.h>
   41 __FBSDID("$FreeBSD$");
   42 
   43 #include "opt_ddb.h"
   44 #include "opt_ipsec.h"
   45 #include "opt_inet.h"
   46 #include "opt_inet6.h"
   47 #include "opt_ratelimit.h"
   48 #include "opt_pcbgroup.h"
   49 #include "opt_rss.h"
   50 
   51 #include <sys/param.h>
   52 #include <sys/systm.h>
   53 #include <sys/lock.h>
   54 #include <sys/malloc.h>
   55 #include <sys/mbuf.h>
   56 #include <sys/callout.h>
   57 #include <sys/eventhandler.h>
   58 #include <sys/domain.h>
   59 #include <sys/protosw.h>
   60 #include <sys/rmlock.h>
   61 #include <sys/smp.h>
   62 #include <sys/socket.h>
   63 #include <sys/socketvar.h>
   64 #include <sys/sockio.h>
   65 #include <sys/priv.h>
   66 #include <sys/proc.h>
   67 #include <sys/refcount.h>
   68 #include <sys/jail.h>
   69 #include <sys/kernel.h>
   70 #include <sys/sysctl.h>
   71 
   72 #ifdef DDB
   73 #include <ddb/ddb.h>
   74 #endif
   75 
   76 #include <vm/uma.h>
   77 
   78 #include <net/if.h>
   79 #include <net/if_var.h>
   80 #include <net/if_types.h>
   81 #include <net/if_llatbl.h>
   82 #include <net/route.h>
   83 #include <net/rss_config.h>
   84 #include <net/vnet.h>
   85 
   86 #if defined(INET) || defined(INET6)
   87 #include <netinet/in.h>
   88 #include <netinet/in_pcb.h>
   89 #ifdef INET
   90 #include <netinet/in_var.h>
   91 #endif
   92 #include <netinet/ip_var.h>
   93 #include <netinet/tcp_var.h>
   94 #ifdef TCPHPTS
   95 #include <netinet/tcp_hpts.h>
   96 #endif
   97 #include <netinet/udp.h>
   98 #include <netinet/udp_var.h>
   99 #ifdef INET6
  100 #include <netinet/ip6.h>
  101 #include <netinet6/in6_pcb.h>
  102 #include <netinet6/in6_var.h>
  103 #include <netinet6/ip6_var.h>
  104 #endif /* INET6 */
  105 #endif
  106 
  107 #include <netipsec/ipsec_support.h>
  108 
  109 #include <security/mac/mac_framework.h>
  110 
  111 #define INPCBLBGROUP_SIZMIN     8
  112 #define INPCBLBGROUP_SIZMAX     256
  113 
  114 static struct callout   ipport_tick_callout;
  115 
  116 /*
  117  * These configure the range of local port addresses assigned to
  118  * "unspecified" outgoing connections/packets/whatever.
  119  */
  120 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;    /* 1023 */
  121 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;    /* 600 */
  122 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;     /* 10000 */
  123 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;       /* 65535 */
  124 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;      /* 49152 */
  125 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;        /* 65535 */
  126 
  127 /*
  128  * Reserved ports accessible only to root. There are significant
  129  * security considerations that must be accounted for when changing these,
  130  * but the security benefits can be great. Please be careful.
  131  */
  132 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;    /* 1023 */
  133 VNET_DEFINE(int, ipport_reservedlow);
  134 
  135 /* Variables dealing with random ephemeral port allocation. */
  136 VNET_DEFINE(int, ipport_randomized) = 1;        /* user controlled via sysctl */
  137 VNET_DEFINE(int, ipport_randomcps) = 10;        /* user controlled via sysctl */
  138 VNET_DEFINE(int, ipport_randomtime) = 45;       /* user controlled via sysctl */
  139 VNET_DEFINE(int, ipport_stoprandom);            /* toggled by ipport_tick */
  140 VNET_DEFINE(int, ipport_tcpallocs);
  141 VNET_DEFINE_STATIC(int, ipport_tcplastcount);
  142 
  143 #define V_ipport_tcplastcount           VNET(ipport_tcplastcount)
  144 
  145 static void     in_pcbremlists(struct inpcb *inp);
  146 #ifdef INET
  147 static struct inpcb     *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
  148                             struct in_addr faddr, u_int fport_arg,
  149                             struct in_addr laddr, u_int lport_arg,
  150                             int lookupflags, struct ifnet *ifp);
  151 
  152 #define RANGECHK(var, min, max) \
  153         if ((var) < (min)) { (var) = (min); } \
  154         else if ((var) > (max)) { (var) = (max); }
  155 
  156 static int
  157 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
  158 {
  159         int error;
  160 
  161         error = sysctl_handle_int(oidp, arg1, arg2, req);
  162         if (error == 0) {
  163                 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
  164                 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
  165                 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
  166                 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
  167                 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
  168                 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
  169         }
  170         return (error);
  171 }
  172 
  173 #undef RANGECHK
  174 
  175 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
  176     "IP Ports");
  177 
  178 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
  179         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
  180         &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", "");
  181 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
  182         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
  183         &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", "");
  184 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
  185         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
  186         &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", "");
  187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
  188         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
  189         &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", "");
  190 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
  191         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
  192         &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", "");
  193 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
  194         CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
  195         &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", "");
  196 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
  197         CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
  198         &VNET_NAME(ipport_reservedhigh), 0, "");
  199 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
  200         CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
  201 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
  202         CTLFLAG_VNET | CTLFLAG_RW,
  203         &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
  204 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
  205         CTLFLAG_VNET | CTLFLAG_RW,
  206         &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
  207         "allocations before switching to a sequential one");
  208 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
  209         CTLFLAG_VNET | CTLFLAG_RW,
  210         &VNET_NAME(ipport_randomtime), 0,
  211         "Minimum time to keep sequential port "
  212         "allocation before switching to a random one");
  213 #endif /* INET */
  214 
  215 /*
  216  * in_pcb.c: manage the Protocol Control Blocks.
  217  *
  218  * NOTE: It is assumed that most of these functions will be called with
  219  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
  220  * functions often modify hash chains or addresses in pcbs.
  221  */
  222 
  223 static struct inpcblbgroup *
  224 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
  225     uint16_t port, const union in_dependaddr *addr, int size)
  226 {
  227         struct inpcblbgroup *grp;
  228         size_t bytes;
  229 
  230         bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
  231         grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
  232         if (!grp)
  233                 return (NULL);
  234         grp->il_vflag = vflag;
  235         grp->il_lport = port;
  236         grp->il_dependladdr = *addr;
  237         grp->il_inpsiz = size;
  238         CK_LIST_INSERT_HEAD(hdr, grp, il_list);
  239         return (grp);
  240 }
  241 
  242 static void
  243 in_pcblbgroup_free_deferred(epoch_context_t ctx)
  244 {
  245         struct inpcblbgroup *grp;
  246 
  247         grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
  248         free(grp, M_PCB);
  249 }
  250 
  251 static void
  252 in_pcblbgroup_free(struct inpcblbgroup *grp)
  253 {
  254 
  255         CK_LIST_REMOVE(grp, il_list);
  256         epoch_call(net_epoch_preempt, &grp->il_epoch_ctx,
  257             in_pcblbgroup_free_deferred);
  258 }
  259 
  260 static struct inpcblbgroup *
  261 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
  262     struct inpcblbgroup *old_grp, int size)
  263 {
  264         struct inpcblbgroup *grp;
  265         int i;
  266 
  267         grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
  268             old_grp->il_lport, &old_grp->il_dependladdr, size);
  269         if (grp == NULL)
  270                 return (NULL);
  271 
  272         KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
  273             ("invalid new local group size %d and old local group count %d",
  274              grp->il_inpsiz, old_grp->il_inpcnt));
  275 
  276         for (i = 0; i < old_grp->il_inpcnt; ++i)
  277                 grp->il_inp[i] = old_grp->il_inp[i];
  278         grp->il_inpcnt = old_grp->il_inpcnt;
  279         in_pcblbgroup_free(old_grp);
  280         return (grp);
  281 }
  282 
  283 /*
  284  * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
  285  * and shrink group if possible.
  286  */
  287 static void
  288 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
  289     int i)
  290 {
  291         struct inpcblbgroup *grp, *new_grp;
  292 
  293         grp = *grpp;
  294         for (; i + 1 < grp->il_inpcnt; ++i)
  295                 grp->il_inp[i] = grp->il_inp[i + 1];
  296         grp->il_inpcnt--;
  297 
  298         if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
  299             grp->il_inpcnt <= grp->il_inpsiz / 4) {
  300                 /* Shrink this group. */
  301                 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
  302                 if (new_grp != NULL)
  303                         *grpp = new_grp;
  304         }
  305 }
  306 
  307 /*
  308  * Add PCB to load balance group for SO_REUSEPORT_LB option.
  309  */
  310 static int
  311 in_pcbinslbgrouphash(struct inpcb *inp)
  312 {
  313         const static struct timeval interval = { 60, 0 };
  314         static struct timeval lastprint;
  315         struct inpcbinfo *pcbinfo;
  316         struct inpcblbgrouphead *hdr;
  317         struct inpcblbgroup *grp;
  318         uint32_t idx;
  319 
  320         pcbinfo = inp->inp_pcbinfo;
  321 
  322         INP_WLOCK_ASSERT(inp);
  323         INP_HASH_WLOCK_ASSERT(pcbinfo);
  324 
  325         /*
  326          * Don't allow jailed socket to join local group.
  327          */
  328         if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
  329                 return (0);
  330 
  331 #ifdef INET6
  332         /*
  333          * Don't allow IPv4 mapped INET6 wild socket.
  334          */
  335         if ((inp->inp_vflag & INP_IPV4) &&
  336             inp->inp_laddr.s_addr == INADDR_ANY &&
  337             INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
  338                 return (0);
  339         }
  340 #endif
  341 
  342         idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
  343         hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
  344         CK_LIST_FOREACH(grp, hdr, il_list) {
  345                 if (grp->il_vflag == inp->inp_vflag &&
  346                     grp->il_lport == inp->inp_lport &&
  347                     memcmp(&grp->il_dependladdr,
  348                     &inp->inp_inc.inc_ie.ie_dependladdr,
  349                     sizeof(grp->il_dependladdr)) == 0)
  350                         break;
  351         }
  352         if (grp == NULL) {
  353                 /* Create new load balance group. */
  354                 grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
  355                     inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
  356                     INPCBLBGROUP_SIZMIN);
  357                 if (grp == NULL)
  358                         return (ENOBUFS);
  359         } else if (grp->il_inpcnt == grp->il_inpsiz) {
  360                 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
  361                         if (ratecheck(&lastprint, &interval))
  362                                 printf("lb group port %d, limit reached\n",
  363                                     ntohs(grp->il_lport));
  364                         return (0);
  365                 }
  366 
  367                 /* Expand this local group. */
  368                 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
  369                 if (grp == NULL)
  370                         return (ENOBUFS);
  371         }
  372 
  373         KASSERT(grp->il_inpcnt < grp->il_inpsiz,
  374             ("invalid local group size %d and count %d", grp->il_inpsiz,
  375             grp->il_inpcnt));
  376 
  377         grp->il_inp[grp->il_inpcnt] = inp;
  378         grp->il_inpcnt++;
  379         return (0);
  380 }
  381 
  382 /*
  383  * Remove PCB from load balance group.
  384  */
  385 static void
  386 in_pcbremlbgrouphash(struct inpcb *inp)
  387 {
  388         struct inpcbinfo *pcbinfo;
  389         struct inpcblbgrouphead *hdr;
  390         struct inpcblbgroup *grp;
  391         int i;
  392 
  393         pcbinfo = inp->inp_pcbinfo;
  394 
  395         INP_WLOCK_ASSERT(inp);
  396         INP_HASH_WLOCK_ASSERT(pcbinfo);
  397 
  398         hdr = &pcbinfo->ipi_lbgrouphashbase[
  399             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
  400         CK_LIST_FOREACH(grp, hdr, il_list) {
  401                 for (i = 0; i < grp->il_inpcnt; ++i) {
  402                         if (grp->il_inp[i] != inp)
  403                                 continue;
  404 
  405                         if (grp->il_inpcnt == 1) {
  406                                 /* We are the last, free this local group. */
  407                                 in_pcblbgroup_free(grp);
  408                         } else {
  409                                 /* Pull up inpcbs, shrink group if possible. */
  410                                 in_pcblbgroup_reorder(hdr, &grp, i);
  411                         }
  412                         return;
  413                 }
  414         }
  415 }
  416 
  417 /*
  418  * Different protocols initialize their inpcbs differently - giving
  419  * different name to the lock.  But they all are disposed the same.
  420  */
  421 static void
  422 inpcb_fini(void *mem, int size)
  423 {
  424         struct inpcb *inp = mem;
  425 
  426         INP_LOCK_DESTROY(inp);
  427 }
  428 
  429 /*
  430  * Initialize an inpcbinfo -- we should be able to reduce the number of
  431  * arguments in time.
  432  */
  433 void
  434 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
  435     struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
  436     char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
  437 {
  438 
  439         porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
  440 
  441         INP_INFO_LOCK_INIT(pcbinfo, name);
  442         INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");     /* XXXRW: argument? */
  443         INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
  444 #ifdef VIMAGE
  445         pcbinfo->ipi_vnet = curvnet;
  446 #endif
  447         pcbinfo->ipi_listhead = listhead;
  448         CK_LIST_INIT(pcbinfo->ipi_listhead);
  449         pcbinfo->ipi_count = 0;
  450         pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
  451             &pcbinfo->ipi_hashmask);
  452         pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
  453             &pcbinfo->ipi_porthashmask);
  454         pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
  455             &pcbinfo->ipi_lbgrouphashmask);
  456 #ifdef PCBGROUP
  457         in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
  458 #endif
  459         pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
  460             NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
  461         uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
  462         uma_zone_set_warning(pcbinfo->ipi_zone,
  463             "kern.ipc.maxsockets limit reached");
  464 }
  465 
  466 /*
  467  * Destroy an inpcbinfo.
  468  */
  469 void
  470 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
  471 {
  472 
  473         KASSERT(pcbinfo->ipi_count == 0,
  474             ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
  475 
  476         hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
  477         hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
  478             pcbinfo->ipi_porthashmask);
  479         hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
  480             pcbinfo->ipi_lbgrouphashmask);
  481 #ifdef PCBGROUP
  482         in_pcbgroup_destroy(pcbinfo);
  483 #endif
  484         uma_zdestroy(pcbinfo->ipi_zone);
  485         INP_LIST_LOCK_DESTROY(pcbinfo);
  486         INP_HASH_LOCK_DESTROY(pcbinfo);
  487         INP_INFO_LOCK_DESTROY(pcbinfo);
  488 }
  489 
  490 /*
  491  * Allocate a PCB and associate it with the socket.
  492  * On success return with the PCB locked.
  493  */
  494 int
  495 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
  496 {
  497         struct inpcb *inp;
  498         int error;
  499 
  500 #ifdef INVARIANTS
  501         if (pcbinfo == &V_tcbinfo) {
  502                 INP_INFO_RLOCK_ASSERT(pcbinfo);
  503         } else {
  504                 INP_INFO_WLOCK_ASSERT(pcbinfo);
  505         }
  506 #endif
  507 
  508         error = 0;
  509         inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
  510         if (inp == NULL)
  511                 return (ENOBUFS);
  512         bzero(&inp->inp_start_zero, inp_zero_size);
  513         inp->inp_pcbinfo = pcbinfo;
  514         inp->inp_socket = so;
  515         inp->inp_cred = crhold(so->so_cred);
  516         inp->inp_inc.inc_fibnum = so->so_fibnum;
  517 #ifdef MAC
  518         error = mac_inpcb_init(inp, M_NOWAIT);
  519         if (error != 0)
  520                 goto out;
  521         mac_inpcb_create(so, inp);
  522 #endif
  523 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
  524         error = ipsec_init_pcbpolicy(inp);
  525         if (error != 0) {
  526 #ifdef MAC
  527                 mac_inpcb_destroy(inp);
  528 #endif
  529                 goto out;
  530         }
  531 #endif /*IPSEC*/
  532 #ifdef INET6
  533         if (INP_SOCKAF(so) == AF_INET6) {
  534                 inp->inp_vflag |= INP_IPV6PROTO;
  535                 if (V_ip6_v6only)
  536                         inp->inp_flags |= IN6P_IPV6_V6ONLY;
  537         }
  538 #endif
  539         INP_WLOCK(inp);
  540         INP_LIST_WLOCK(pcbinfo);
  541         CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
  542         pcbinfo->ipi_count++;
  543         so->so_pcb = (caddr_t)inp;
  544 #ifdef INET6
  545         if (V_ip6_auto_flowlabel)
  546                 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
  547 #endif
  548         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
  549         refcount_init(&inp->inp_refcount, 1);   /* Reference from inpcbinfo */
  550 
  551         /*
  552          * Routes in inpcb's can cache L2 as well; they are guaranteed
  553          * to be cleaned up.
  554          */
  555         inp->inp_route.ro_flags = RT_LLE_CACHE;
  556         INP_LIST_WUNLOCK(pcbinfo);
  557 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
  558 out:
  559         if (error != 0) {
  560                 crfree(inp->inp_cred);
  561                 uma_zfree(pcbinfo->ipi_zone, inp);
  562         }
  563 #endif
  564         return (error);
  565 }
  566 
  567 #ifdef INET
  568 int
  569 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
  570 {
  571         int anonport, error;
  572 
  573         INP_WLOCK_ASSERT(inp);
  574         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
  575 
  576         if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
  577                 return (EINVAL);
  578         anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
  579         error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
  580             &inp->inp_lport, cred);
  581         if (error)
  582                 return (error);
  583         if (in_pcbinshash(inp) != 0) {
  584                 inp->inp_laddr.s_addr = INADDR_ANY;
  585                 inp->inp_lport = 0;
  586                 return (EAGAIN);
  587         }
  588         if (anonport)
  589                 inp->inp_flags |= INP_ANONPORT;
  590         return (0);
  591 }
  592 #endif
  593 
  594 #if defined(INET) || defined(INET6)
  595 /*
  596  * Assign a local port like in_pcb_lport(), but also used with connect()
  597  * and a foreign address and port.  If fsa is non-NULL, choose a local port
  598  * that is unused with those, otherwise one that is completely unused.
  599  * lsa can be NULL for IPv6.
  600  */
  601 int
  602 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
  603     struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
  604 {
  605         struct inpcbinfo *pcbinfo;
  606         struct inpcb *tmpinp;
  607         unsigned short *lastport;
  608         int count, dorandom, error;
  609         u_short aux, first, last, lport;
  610 #ifdef INET
  611         struct in_addr laddr, faddr;
  612 #endif
  613 #ifdef INET6
  614         struct in6_addr *laddr6, *faddr6;
  615 #endif
  616 
  617         pcbinfo = inp->inp_pcbinfo;
  618 
  619         /*
  620          * Because no actual state changes occur here, a global write lock on
  621          * the pcbinfo isn't required.
  622          */
  623         INP_LOCK_ASSERT(inp);
  624         INP_HASH_LOCK_ASSERT(pcbinfo);
  625 
  626         if (inp->inp_flags & INP_HIGHPORT) {
  627                 first = V_ipport_hifirstauto;   /* sysctl */
  628                 last  = V_ipport_hilastauto;
  629                 lastport = &pcbinfo->ipi_lasthi;
  630         } else if (inp->inp_flags & INP_LOWPORT) {
  631                 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
  632                 if (error)
  633                         return (error);
  634                 first = V_ipport_lowfirstauto;  /* 1023 */
  635                 last  = V_ipport_lowlastauto;   /* 600 */
  636                 lastport = &pcbinfo->ipi_lastlow;
  637         } else {
  638                 first = V_ipport_firstauto;     /* sysctl */
  639                 last  = V_ipport_lastauto;
  640                 lastport = &pcbinfo->ipi_lastport;
  641         }
  642         /*
  643          * For UDP(-Lite), use random port allocation as long as the user
  644          * allows it.  For TCP (and as of yet unknown) connections,
  645          * use random port allocation only if the user allows it AND
  646          * ipport_tick() allows it.
  647          */
  648         if (V_ipport_randomized &&
  649                 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
  650                 pcbinfo == &V_ulitecbinfo))
  651                 dorandom = 1;
  652         else
  653                 dorandom = 0;
  654         /*
  655          * It makes no sense to do random port allocation if
  656          * we have the only port available.
  657          */
  658         if (first == last)
  659                 dorandom = 0;
  660         /* Make sure to not include UDP(-Lite) packets in the count. */
  661         if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
  662                 V_ipport_tcpallocs++;
  663         /*
  664          * Instead of having two loops further down counting up or down
  665          * make sure that first is always <= last and go with only one
  666          * code path implementing all logic.
  667          */
  668         if (first > last) {
  669                 aux = first;
  670                 first = last;
  671                 last = aux;
  672         }
  673 
  674 #ifdef INET
  675         laddr.s_addr = INADDR_ANY;
  676         if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
  677                 if (lsa != NULL)
  678                         laddr = ((struct sockaddr_in *)lsa)->sin_addr;
  679                 if (fsa != NULL)
  680                         faddr = ((struct sockaddr_in *)fsa)->sin_addr;
  681         }
  682 #endif
  683 #ifdef INET6
  684         laddr6 = NULL;
  685         if ((inp->inp_vflag & INP_IPV6) != 0) {
  686                 if (lsa != NULL)
  687                         laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
  688                 if (fsa != NULL)
  689                         faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
  690         }
  691 #endif
  692 
  693         tmpinp = NULL;
  694         lport = *lportp;
  695 
  696         if (dorandom)
  697                 *lastport = first + (arc4random() % (last - first));
  698 
  699         count = last - first;
  700 
  701         do {
  702                 if (count-- < 0)        /* completely used? */
  703                         return (EADDRNOTAVAIL);
  704                 ++*lastport;
  705                 if (*lastport < first || *lastport > last)
  706                         *lastport = first;
  707                 lport = htons(*lastport);
  708 
  709                 if (fsa != NULL) {
  710 
  711 #ifdef INET
  712                         if (lsa->sa_family == AF_INET) {
  713                                 tmpinp = in_pcblookup_hash_locked(pcbinfo,
  714                                     faddr, fport, laddr, lport, lookupflags,
  715                                     NULL);
  716                         }
  717 #endif
  718 #ifdef INET6
  719                         if (lsa->sa_family == AF_INET6) {
  720                                 tmpinp = in6_pcblookup_hash_locked(pcbinfo,
  721                                     faddr6, fport, laddr6, lport, lookupflags,
  722                                     NULL);
  723                         }
  724 #endif
  725                 } else {
  726 #ifdef INET6
  727                         if ((inp->inp_vflag & INP_IPV6) != 0)
  728                                 tmpinp = in6_pcblookup_local(pcbinfo,
  729                                     &inp->in6p_laddr, lport, lookupflags, cred);
  730 #endif
  731 #if defined(INET) && defined(INET6)
  732                         else
  733 #endif
  734 #ifdef INET
  735                                 tmpinp = in_pcblookup_local(pcbinfo, laddr,
  736                                     lport, lookupflags, cred);
  737 #endif
  738                 }
  739         } while (tmpinp != NULL);
  740 
  741         *lportp = lport;
  742 
  743         return (0);
  744 }
  745 
  746 /*
  747  * Select a local port (number) to use.
  748  */
  749 int
  750 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
  751     struct ucred *cred, int lookupflags)
  752 {
  753         struct sockaddr_in laddr;
  754 
  755         if (laddrp) {
  756                 bzero(&laddr, sizeof(laddr));
  757                 laddr.sin_family = AF_INET;
  758                 laddr.sin_addr = *laddrp;
  759         }
  760         return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
  761             NULL, lportp, NULL, 0, cred, lookupflags));
  762 }
  763 
  764 /*
  765  * Return cached socket options.
  766  */
  767 int
  768 inp_so_options(const struct inpcb *inp)
  769 {
  770         int so_options;
  771 
  772         so_options = 0;
  773 
  774         if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
  775                 so_options |= SO_REUSEPORT_LB;
  776         if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
  777                 so_options |= SO_REUSEPORT;
  778         if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
  779                 so_options |= SO_REUSEADDR;
  780         return (so_options);
  781 }
  782 #endif /* INET || INET6 */
  783 
  784 /*
  785  * Check if a new BINDMULTI socket is allowed to be created.
  786  *
  787  * ni points to the new inp.
  788  * oi points to the existing inp.
  789  *
  790  * This checks whether the existing inp also has BINDMULTI and
  791  * whether the credentials match.
  792  */
  793 int
  794 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
  795 {
  796         /* Check permissions match */
  797         if ((ni->inp_flags2 & INP_BINDMULTI) &&
  798             (ni->inp_cred->cr_uid !=
  799             oi->inp_cred->cr_uid))
  800                 return (0);
  801 
  802         /* Check the existing inp has BINDMULTI set */
  803         if ((ni->inp_flags2 & INP_BINDMULTI) &&
  804             ((oi->inp_flags2 & INP_BINDMULTI) == 0))
  805                 return (0);
  806 
  807         /*
  808          * We're okay - either INP_BINDMULTI isn't set on ni, or
  809          * it is and it matches the checks.
  810          */
  811         return (1);
  812 }
  813 
  814 #ifdef INET
  815 /*
  816  * Set up a bind operation on a PCB, performing port allocation
  817  * as required, but do not actually modify the PCB. Callers can
  818  * either complete the bind by setting inp_laddr/inp_lport and
  819  * calling in_pcbinshash(), or they can just use the resulting
  820  * port and address to authorise the sending of a once-off packet.
  821  *
       * Parameters:
       *   inp    - the PCB the bind is being prepared for (only read here).
       *   nam    - requested local endpoint as a struct sockaddr_in, or NULL
       *            when the caller has no specific address/port to request.
       *   laddrp - in/out: local address; must be INADDR_ANY on entry when
       *            nam is supplied, receives the selected address on success.
       *   lportp - in/out: local port in network byte order; 0 on entry
       *            means "not yet assigned", receives the selected port on
       *            success.
       *   cred   - credentials used for the jail and privilege checks.
       *
       * Returns 0 on success.  Errors produced directly by this function
       * are EINVAL (rebinding an address or changing an assigned port, or a
       * wrongly sized nam), EADDRNOTAVAIL (address is not local and
       * INP_BINDANY is not set), EACCES (reserved port without privilege)
       * and EADDRINUSE (conflicting binding).  Errors from
       * prison_local_ip4() and in_pcb_lport() are passed through.
       *
  822  * On error, the values of *laddrp and *lportp are not changed.
  823  */
  824 int
  825 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
  826     u_short *lportp, struct ucred *cred)
  827 {
  828         struct socket *so = inp->inp_socket;
  829         struct sockaddr_in *sin;
  830         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
  831         struct in_addr laddr;
  832         u_short lport = 0;
  833         int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
  834         int error;
  835 
  836         /*
  837          * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
  838          * so that we don't have to add to the (already messy) code below.
  839          */
  840         int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
  841 
  842         /*
  843          * No state changes, so read locks are sufficient here.
  844          */
  845         INP_LOCK_ASSERT(inp);
  846         INP_HASH_LOCK_ASSERT(pcbinfo);
  847 
  848         laddr.s_addr = *laddrp;
              /* A name may only be supplied while *laddrp is still INADDR_ANY. */
  849         if (nam != NULL && laddr.s_addr != INADDR_ANY)
  850                 return (EINVAL);
              /*
               * With none of the reuse options set on the socket, pass
               * INPLOOKUP_WILDCARD to the conflict lookups further down.
               */
  851         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
  852                 lookupflags = INPLOOKUP_WILDCARD;
  853         if (nam == NULL) {
                      /* No endpoint requested: only the jail check applies. */
  854                 if ((error = prison_local_ip4(cred, &laddr)) != 0)
  855                         return (error);
  856         } else {
  857                 sin = (struct sockaddr_in *)nam;
  858                 if (nam->sa_len != sizeof (*sin))
  859                         return (EINVAL);
  860 #ifdef notdef
  861                 /*
  862                  * We should check the family, but old programs
  863                  * incorrectly fail to initialize it.
  864                  */
  865                 if (sin->sin_family != AF_INET)
  866                         return (EAFNOSUPPORT);
  867 #endif
  868                 error = prison_local_ip4(cred, &sin->sin_addr);
  869                 if (error)
  870                         return (error);
  871                 if (sin->sin_port != *lportp) {
  872                         /* Don't allow the port to change. */
  873                         if (*lportp != 0)
  874                                 return (EINVAL);
  875                         lport = sin->sin_port;
  876                 }
  877                 /* NB: lport is left as 0 if the port isn't being changed. */
  878                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
  879                         /*
  880                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
  881                          * allow complete duplication of binding if
  882                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
  883                          * and a multicast address is bound on both
  884                          * new and duplicated sockets.
  885                          */
  886                         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
  887                                 reuseport = SO_REUSEADDR|SO_REUSEPORT;
  888                         /*
  889                          * XXX: How to deal with SO_REUSEPORT_LB here?
  890                          * Treat same as SO_REUSEPORT for now.
  891                          */
  892                         if ((so->so_options &
  893                             (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
  894                                 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
  895                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
                              /*
                               * The caller's sockaddr is scrubbed in place (port
                               * and padding zeroed) before it is handed to
                               * ifa_ifwithaddr_check() below.
                               */
  896                         sin->sin_port = 0;              /* yech... */
  897                         bzero(&sin->sin_zero, sizeof(sin->sin_zero));
  898                         /*
  899                          * Is the address a local IP address?
  900                          * If INP_BINDANY is set, then the socket may be bound
  901                          * to any endpoint address, local or not.
  902                          */
  903                         if ((inp->inp_flags & INP_BINDANY) == 0 &&
  904                             ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
  905                                 return (EADDRNOTAVAIL);
  906                 }
  907                 laddr = sin->sin_addr;
                      /* A specific port was requested: look for conflicts. */
  908                 if (lport) {
  909                         struct inpcb *t;
  910                         struct tcptw *tw;
  911 
  912                         /*
                               * GROSS: ports within [V_ipport_reservedlow,
                               * V_ipport_reservedhigh] require
                               * PRIV_NETINET_RESERVEDPORT.
                               */
  913                         if (ntohs(lport) <= V_ipport_reservedhigh &&
  914                             ntohs(lport) >= V_ipport_reservedlow &&
  915                             priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
  916                             0))
  917                                 return (EACCES);
                              /*
                               * For non-multicast addresses, and only when the
                               * PCB's owner lacks PRIV_NETINET_REUSEPORT, check
                               * for an existing binding owned by a different uid.
                               */
  918                         if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
  919                             priv_check_cred(inp->inp_cred,
  920                             PRIV_NETINET_REUSEPORT, 0) != 0) {
  921                                 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
  922                                     lport, INPLOOKUP_WILDCARD, cred);
  923                                 /*
  924                                  * XXX
  925                                  * This entire block sorely needs a rewrite.
  926                                  */
  927                                 if (t &&
  928                                     ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
  929                                     ((t->inp_flags & INP_TIMEWAIT) == 0) &&
  930                                     (so->so_type != SOCK_STREAM ||
  931                                      ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
  932                                     (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
  933                                      ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
  934                                      (t->inp_flags2 & INP_REUSEPORT) ||
  935                                      (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
  936                                     (inp->inp_cred->cr_uid !=
  937                                      t->inp_cred->cr_uid))
  938                                         return (EADDRINUSE);
  939 
  940                                 /*
  941                                  * If the socket is a BINDMULTI socket, then
  942                                  * the credentials need to match and the
  943                                  * original socket also has to have been bound
  944                                  * with BINDMULTI.
  945                                  */
  946                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
  947                                         return (EADDRINUSE);
  948                         }
                              /*
                               * Second lookup, this time with the flags derived
                               * from the socket's reuse options.
                               */
  949                         t = in_pcblookup_local(pcbinfo, sin->sin_addr,
  950                             lport, lookupflags, cred);
  951                         if (t && (t->inp_flags & INP_TIMEWAIT)) {
  952                                 /*
  953                                  * XXXRW: If an inpcb has had its timewait
  954                                  * state recycled, we treat the address as
  955                                  * being in use (for now).  This is better
  956                                  * than a panic, but not desirable.
  957                                  */
  958                                 tw = intotw(t);
  959                                 if (tw == NULL ||
  960                                     ((reuseport & tw->tw_so_options) == 0 &&
  961                                         (reuseport_lb &
  962                                             tw->tw_so_options) == 0)) {
  963                                         return (EADDRINUSE);
  964                                 }
  965                         } else if (t &&
  966                                    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
  967                                    (reuseport & inp_so_options(t)) == 0 &&
  968                                    (reuseport_lb & inp_so_options(t)) == 0) {
                                      /*
                                       * NB: with INET6 compiled in, the condition
                                       * below guards the first return; without it
                                       * that return is unconditional.
                                       */
  969 #ifdef INET6
  970                                 if (ntohl(sin->sin_addr.s_addr) !=
  971                                     INADDR_ANY ||
  972                                     ntohl(t->inp_laddr.s_addr) !=
  973                                     INADDR_ANY ||
  974                                     (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
  975                                     (t->inp_vflag & INP_IPV6PROTO) == 0)
  976 #endif
  977                                                 return (EADDRINUSE);
  978                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
  979                                         return (EADDRINUSE);
  980                         }
  981                 }
  982         }
              /*
               * A port already assigned on entry is kept; if there is still
               * no port at this point, have in_pcb_lport() select one.
               */
  983         if (*lportp != 0)
  984                 lport = *lportp;
  985         if (lport == 0) {
  986                 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
  987                 if (error != 0)
  988                         return (error);
  989 
  990         }
              /* Success: only now are the caller's values updated. */
  991         *laddrp = laddr.s_addr;
  992         *lportp = lport;
  993         return (0);
  994 }
  995 
  996 /*
  997  * Connect from a socket to a specified address.
  998  * Both address and port must be specified in argument sin.
  999  * If don't have a local address for this socket yet,
 1000  * then pick one.
 1001  */
 1002 int
 1003 in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
 1004     struct ucred *cred, struct mbuf *m, bool rehash)
 1005 {
 1006         u_short lport, fport;
 1007         in_addr_t laddr, faddr;
 1008         int anonport, error;
 1009 
 1010         INP_WLOCK_ASSERT(inp);
 1011         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 1012 
 1013         lport = inp->inp_lport;
 1014         laddr = inp->inp_laddr.s_addr;
 1015         anonport = (lport == 0);
 1016         error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
 1017             NULL, cred);
 1018         if (error)
 1019                 return (error);
 1020 
 1021         /* Do the initial binding of the local address if required. */
 1022         if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
 1023                 KASSERT(rehash == true,
 1024                     ("Rehashing required for unbound inps"));
 1025                 inp->inp_lport = lport;
 1026                 inp->inp_laddr.s_addr = laddr;
 1027                 if (in_pcbinshash(inp) != 0) {
 1028                         inp->inp_laddr.s_addr = INADDR_ANY;
 1029                         inp->inp_lport = 0;
 1030                         return (EAGAIN);
 1031                 }
 1032         }
 1033 
 1034         /* Commit the remaining changes. */
 1035         inp->inp_lport = lport;
 1036         inp->inp_laddr.s_addr = laddr;
 1037         inp->inp_faddr.s_addr = faddr;
 1038         inp->inp_fport = fport;
 1039         if (rehash) {
 1040                 in_pcbrehash_mbuf(inp, m);
 1041         } else {
 1042                 in_pcbinshash_mbuf(inp, m);
 1043         }
 1044 
 1045         if (anonport)
 1046                 inp->inp_flags |= INP_ANONPORT;
 1047         return (0);
 1048 }
 1049 
 1050 int
 1051 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
 1052 {
 1053         /*
               * Convenience wrapper around in_pcbconnect_mbuf(): no mbuf is
               * supplied (NULL) and rehash is requested (true).  Its return
               * value is passed back unchanged.
               */
 1054         return (in_pcbconnect_mbuf(inp, nam, cred, NULL, true));
 1055 }
 1056 
 1057 /*
 1058  * Do proper source address selection on an unbound socket in case
 1059  * of connect. Take jails into account as well.
 1060  */
 1061 int
 1062 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
 1063     struct ucred *cred)
 1064 {
 1065         struct ifaddr *ifa;
 1066         struct sockaddr *sa;
 1067         struct sockaddr_in *sin;
 1068         struct route sro;
 1069         int error;
 1070 
 1071         KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
 1072         /*
 1073          * Bypass source address selection and use the primary jail IP
 1074          * if requested.
 1075          */
 1076         if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
 1077                 return (0);
 1078 
 1079         error = 0;
 1080         bzero(&sro, sizeof(sro));
 1081 
 1082         sin = (struct sockaddr_in *)&sro.ro_dst;
 1083         sin->sin_family = AF_INET;
 1084         sin->sin_len = sizeof(struct sockaddr_in);
 1085         sin->sin_addr.s_addr = faddr->s_addr;
 1086 
 1087         /*
 1088          * If route is known our src addr is taken from the i/f,
 1089          * else punt.
 1090          *
 1091          * Find out route to destination.
 1092          */
 1093         if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
 1094                 in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
 1095 
 1096         /*
 1097          * If we found a route, use the address corresponding to
 1098          * the outgoing interface.
 1099          * 
 1100          * Otherwise assume faddr is reachable on a directly connected
 1101          * network and try to find a corresponding interface to take
 1102          * the source address from.
 1103          */
 1104         NET_EPOCH_ENTER();
 1105         if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
 1106                 struct in_ifaddr *ia;
 1107                 struct ifnet *ifp;
 1108 
 1109                 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
 1110                                         inp->inp_socket->so_fibnum));
 1111                 if (ia == NULL) {
 1112                         ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
 1113                                                 inp->inp_socket->so_fibnum));
 1114 
 1115                 }
 1116                 if (ia == NULL) {
 1117                         error = ENETUNREACH;
 1118                         goto done;
 1119                 }
 1120 
 1121                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 1122                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1123                         goto done;
 1124                 }
 1125 
 1126                 ifp = ia->ia_ifp;
 1127                 ia = NULL;
 1128                 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 1129 
 1130                         sa = ifa->ifa_addr;
 1131                         if (sa->sa_family != AF_INET)
 1132                                 continue;
 1133                         sin = (struct sockaddr_in *)sa;
 1134                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 1135                                 ia = (struct in_ifaddr *)ifa;
 1136                                 break;
 1137                         }
 1138                 }
 1139                 if (ia != NULL) {
 1140                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1141                         goto done;
 1142                 }
 1143 
 1144                 /* 3. As a last resort return the 'default' jail address. */
 1145                 error = prison_get_ip4(cred, laddr);
 1146                 goto done;
 1147         }
 1148 
 1149         /*
 1150          * If the outgoing interface on the route found is not
 1151          * a loopback interface, use the address from that interface.
 1152          * In case of jails do those three steps:
 1153          * 1. check if the interface address belongs to the jail. If so use it.
 1154          * 2. check if we have any address on the outgoing interface
 1155          *    belonging to this jail. If so use it.
 1156          * 3. as a last resort return the 'default' jail address.
 1157          */
 1158         if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
 1159                 struct in_ifaddr *ia;
 1160                 struct ifnet *ifp;
 1161 
 1162                 /* If not jailed, use the default returned. */
 1163                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 1164                         ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 1165                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1166                         goto done;
 1167                 }
 1168 
 1169                 /* Jailed. */
 1170                 /* 1. Check if the iface address belongs to the jail. */
 1171                 sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
 1172                 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 1173                         ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
 1174                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1175                         goto done;
 1176                 }
 1177 
 1178                 /*
 1179                  * 2. Check if we have any address on the outgoing interface
 1180                  *    belonging to this jail.
 1181                  */
 1182                 ia = NULL;
 1183                 ifp = sro.ro_rt->rt_ifp;
 1184                 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 1185                         sa = ifa->ifa_addr;
 1186                         if (sa->sa_family != AF_INET)
 1187                                 continue;
 1188                         sin = (struct sockaddr_in *)sa;
 1189                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 1190                                 ia = (struct in_ifaddr *)ifa;
 1191                                 break;
 1192                         }
 1193                 }
 1194                 if (ia != NULL) {
 1195                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1196                         goto done;
 1197                 }
 1198 
 1199                 /* 3. As a last resort return the 'default' jail address. */
 1200                 error = prison_get_ip4(cred, laddr);
 1201                 goto done;
 1202         }
 1203 
 1204         /*
 1205          * The outgoing interface is marked with 'loopback net', so a route
 1206          * to ourselves is here.
 1207          * Try to find the interface of the destination address and then
 1208          * take the address from there. That interface is not necessarily
 1209          * a loopback interface.
 1210          * In case of jails, check that it is an address of the jail
 1211          * and if we cannot find, fall back to the 'default' jail address.
 1212          */
 1213         if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
 1214                 struct sockaddr_in sain;
 1215                 struct in_ifaddr *ia;
 1216 
 1217                 bzero(&sain, sizeof(struct sockaddr_in));
 1218                 sain.sin_family = AF_INET;
 1219                 sain.sin_len = sizeof(struct sockaddr_in);
 1220                 sain.sin_addr.s_addr = faddr->s_addr;
 1221 
 1222                 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain),
 1223                                         inp->inp_socket->so_fibnum));
 1224                 if (ia == NULL)
 1225                         ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
 1226                                                 inp->inp_socket->so_fibnum));
 1227                 if (ia == NULL)
 1228                         ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
 1229 
 1230                 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
 1231                         if (ia == NULL) {
 1232                                 error = ENETUNREACH;
 1233                                 goto done;
 1234                         }
 1235                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1236                         goto done;
 1237                 }
 1238 
 1239                 /* Jailed. */
 1240                 if (ia != NULL) {
 1241                         struct ifnet *ifp;
 1242 
 1243                         ifp = ia->ia_ifp;
 1244                         ia = NULL;
 1245                         CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 1246                                 sa = ifa->ifa_addr;
 1247                                 if (sa->sa_family != AF_INET)
 1248                                         continue;
 1249                                 sin = (struct sockaddr_in *)sa;
 1250                                 if (prison_check_ip4(cred,
 1251                                     &sin->sin_addr) == 0) {
 1252                                         ia = (struct in_ifaddr *)ifa;
 1253                                         break;
 1254                                 }
 1255                         }
 1256                         if (ia != NULL) {
 1257                                 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1258                                 goto done;
 1259                         }
 1260                 }
 1261 
 1262                 /* 3. As a last resort return the 'default' jail address. */
 1263                 error = prison_get_ip4(cred, laddr);
 1264                 goto done;
 1265         }
 1266 
 1267 done:
 1268         NET_EPOCH_EXIT();
 1269         if (sro.ro_rt != NULL)
 1270                 RTFREE(sro.ro_rt);
 1271         return (error);
 1272 }
 1273 
 1274 /*
 1275  * Set up for a connect from a socket to the specified address.
 1276  * On entry, *laddrp and *lportp should contain the current local
 1277  * address and port for the PCB; these are updated to the values
 1278  * that should be placed in inp_laddr and inp_lport to complete
 1279  * the connect.
 1280  *
 1281  * On success, *faddrp and *fportp will be set to the remote address
 1282  * and port. These are not updated in the error case.
 1283  *
 1284  * If the operation fails because the connection already exists,
 1285  * *oinpp will be set to the PCB of that connection so that the
 1286  * caller can decide to override it. In all other cases, *oinpp
 1287  * is set to NULL.
 1288  */
 1289 int
 1290 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
 1291     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
 1292     struct inpcb **oinpp, struct ucred *cred)
 1293 {
 1294         struct rm_priotracker in_ifa_tracker;
 1295         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 1296         struct in_ifaddr *ia;
 1297         struct inpcb *oinp;
 1298         struct in_addr laddr, faddr;
 1299         u_short lport, fport;
 1300         int error;
 1301 
 1302         /*
 1303          * Because a global state change doesn't actually occur here, a read
 1304          * lock is sufficient.
 1305          */
 1306         INP_LOCK_ASSERT(inp);
 1307         INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
 1308 
 1309         if (oinpp != NULL)
 1310                 *oinpp = NULL;
 1311         if (nam->sa_len != sizeof (*sin))
 1312                 return (EINVAL);
 1313         if (sin->sin_family != AF_INET)
 1314                 return (EAFNOSUPPORT);
 1315         if (sin->sin_port == 0)
 1316                 return (EADDRNOTAVAIL);
 1317         laddr.s_addr = *laddrp;
 1318         lport = *lportp;
 1319         faddr = sin->sin_addr;
 1320         fport = sin->sin_port;
 1321 
 1322         if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
 1323                 /*
 1324                  * If the destination address is INADDR_ANY,
 1325                  * use the primary local address.
 1326                  * If the supplied address is INADDR_BROADCAST,
 1327                  * and the primary interface supports broadcast,
 1328                  * choose the broadcast address for that interface.
 1329                  */
 1330                 if (faddr.s_addr == INADDR_ANY) {
 1331                         IN_IFADDR_RLOCK(&in_ifa_tracker);
 1332                         faddr =
 1333                             IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
 1334                         IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 1335                         if (cred != NULL &&
 1336                             (error = prison_get_ip4(cred, &faddr)) != 0)
 1337                                 return (error);
 1338                 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
 1339                         IN_IFADDR_RLOCK(&in_ifa_tracker);
 1340                         if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
 1341                             IFF_BROADCAST)
 1342                                 faddr = satosin(&CK_STAILQ_FIRST(
 1343                                     &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
 1344                         IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 1345                 }
 1346         }
 1347         if (laddr.s_addr == INADDR_ANY) {
 1348                 error = in_pcbladdr(inp, &faddr, &laddr, cred);
 1349                 /*
 1350                  * If the destination address is multicast and an outgoing
 1351                  * interface has been set as a multicast option, prefer the
 1352                  * address of that interface as our source address.
 1353                  */
 1354                 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
 1355                     inp->inp_moptions != NULL) {
 1356                         struct ip_moptions *imo;
 1357                         struct ifnet *ifp;
 1358 
 1359                         imo = inp->inp_moptions;
 1360                         if (imo->imo_multicast_ifp != NULL) {
 1361                                 ifp = imo->imo_multicast_ifp;
 1362                                 IN_IFADDR_RLOCK(&in_ifa_tracker);
 1363                                 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 1364                                         if ((ia->ia_ifp == ifp) &&
 1365                                             (cred == NULL ||
 1366                                             prison_check_ip4(cred,
 1367                                             &ia->ia_addr.sin_addr) == 0))
 1368                                                 break;
 1369                                 }
 1370                                 if (ia == NULL)
 1371                                         error = EADDRNOTAVAIL;
 1372                                 else {
 1373                                         laddr = ia->ia_addr.sin_addr;
 1374                                         error = 0;
 1375                                 }
 1376                                 IN_IFADDR_RUNLOCK(&in_ifa_tracker);
 1377                         }
 1378                 }
 1379                 if (error)
 1380                         return (error);
 1381         }
 1382         if (lport != 0) {
 1383                 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
 1384                     fport, laddr, lport, 0, NULL);
 1385                 if (oinp != NULL) {
 1386                         if (oinpp != NULL)
 1387                                 *oinpp = oinp;
 1388                         return (EADDRINUSE);
 1389                 }
 1390         } else {
 1391                 struct sockaddr_in lsin, fsin;
 1392 
 1393                 bzero(&lsin, sizeof(lsin));
 1394                 bzero(&fsin, sizeof(fsin));
 1395                 lsin.sin_family = AF_INET;
 1396                 lsin.sin_addr = laddr;
 1397                 fsin.sin_family = AF_INET;
 1398                 fsin.sin_addr = faddr;
 1399                 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
 1400                     &lport, (struct sockaddr *)& fsin, fport, cred,
 1401                     INPLOOKUP_WILDCARD);
 1402                 if (error)
 1403                         return (error);
 1404         }
 1405         *laddrp = laddr.s_addr;
 1406         *lportp = lport;
 1407         *faddrp = faddr.s_addr;
 1408         *fportp = fport;
 1409         return (0);
 1410 }
 1411 
 1412 void
 1413 in_pcbdisconnect(struct inpcb *inp)
 1414 {
 1415 
 1416         INP_WLOCK_ASSERT(inp);
 1417         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 1418 
 1419         inp->inp_faddr.s_addr = INADDR_ANY;
 1420         inp->inp_fport = 0;
 1421         in_pcbrehash(inp);
 1422 }
 1423 #endif /* INET */
 1424 
 1425 /*
 1426  * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
 1427  * For most protocols, this will be invoked immediately prior to calling
 1428  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
 1429  * socket, in which case in_pcbfree() is deferred.
 1430  */
 1431 void
 1432 in_pcbdetach(struct inpcb *inp)
 1433 {
 1434 
 1435         KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 1436 
 1437 #ifdef RATELIMIT
 1438         if (inp->inp_snd_tag != NULL)
 1439                 in_pcbdetach_txrtlmt(inp);
 1440 #endif
 1441         inp->inp_socket->so_pcb = NULL;
 1442         inp->inp_socket = NULL;
 1443 }
 1444 
 1445 /*
 1446  * in_pcbref() bumps the reference count on an inpcb in order to maintain
 1447  * stability of an inpcb pointer despite the inpcb lock being released.  This
 1448  * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
 1449  * but where the inpcb lock may already be held, or when acquiring a
      * reference
 1450  * via a pcbgroup.
 1451  *
 1452  * in_pcbref() should be used only to provide brief memory stability, and
 1453  * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
 1454  * garbage collect the inpcb if it has been in_pcbfree()'d from another
 1455  * context.  Until in_pcbrele() has returned that the inpcb is still valid,
 1456  * lock and rele are the *only* safe operations that may be performed on the
 1457  * inpcb.
 1458  *
 1459  * While the inpcb will not be freed, releasing the inpcb lock means that the
 1460  * connection's state may change, so the caller should be careful to
 1461  * revalidate any cached state on reacquiring the lock.  Drop the reference
 1462  * using in_pcbrele().
      *
      * The reference count must already be non-zero when this is called
      * (asserted below).
 1463  */
 1464 void
 1465 in_pcbref(struct inpcb *inp)
 1466 {
 1467 
 1468         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 1469 
 1470         refcount_acquire(&inp->inp_refcount);
 1471 }
 1472 
 1473 /*
 1474  * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
 1475  * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
 1476  * return a flag indicating whether or not the inpcb remains valid.  If it is
 1477  * valid, we return with the inpcb lock held.
 1478  *
 1479  * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
 1480  * reference on an inpcb.  Historically more work was done here (actually, in
 1481  * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
 1482  * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
 1483  * about memory stability (and continued use of the write lock).
 1484  */
 1485 int
 1486 in_pcbrele_rlocked(struct inpcb *inp)
 1487 {
 1488         struct inpcbinfo *pcbinfo;
 1489 
 1490         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 1491 
 1492         INP_RLOCK_ASSERT(inp);
 1493 
 1494         if (refcount_release(&inp->inp_refcount) == 0) {
 1495                 /*
 1496                  * If the inpcb has been freed, let the caller know, even if
 1497                  * this isn't the last reference.
 1498                  */
 1499                 if (inp->inp_flags2 & INP_FREED) {
 1500                         INP_RUNLOCK(inp);
 1501                         return (1);
 1502                 }
 1503                 return (0);
 1504         }
 1505         
 1506         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 1507 #ifdef TCPHPTS
 1508         if (inp->inp_in_hpts || inp->inp_in_input) {
 1509                 struct tcp_hpts_entry *hpts;
 1510                 /*
 1511                  * We should not be on the hpts at 
 1512                  * this point in any form. we must
 1513                  * get the lock to be sure.
 1514                  */
 1515                 hpts = tcp_hpts_lock(inp);
 1516                 if (inp->inp_in_hpts)
 1517                         panic("Hpts:%p inp:%p at free still on hpts",
 1518                               hpts, inp);
 1519                 mtx_unlock(&hpts->p_mtx);
 1520                 hpts = tcp_input_lock(inp);
 1521                 if (inp->inp_in_input) 
 1522                         panic("Hpts:%p inp:%p at free still on input hpts",
 1523                               hpts, inp);
 1524                 mtx_unlock(&hpts->p_mtx);
 1525         }
 1526 #endif
 1527         INP_RUNLOCK(inp);
 1528         pcbinfo = inp->inp_pcbinfo;
 1529         uma_zfree(pcbinfo->ipi_zone, inp);
 1530         return (1);
 1531 }
 1532 
 1533 int
 1534 in_pcbrele_wlocked(struct inpcb *inp)
 1535 {
 1536         struct inpcbinfo *pcbinfo;
 1537 
 1538         KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
 1539 
 1540         INP_WLOCK_ASSERT(inp);
 1541 
 1542         if (refcount_release(&inp->inp_refcount) == 0) {
 1543                 /*
 1544                  * If the inpcb has been freed, let the caller know, even if
 1545                  * this isn't the last reference.
 1546                  */
 1547                 if (inp->inp_flags2 & INP_FREED) {
 1548                         INP_WUNLOCK(inp);
 1549                         return (1);
 1550                 }
 1551                 return (0);
 1552         }
 1553 
 1554         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 1555 #ifdef TCPHPTS
 1556         if (inp->inp_in_hpts || inp->inp_in_input) {
 1557                 struct tcp_hpts_entry *hpts;
 1558                 /*
 1559                  * We should not be on the hpts at 
 1560                  * this point in any form. we must
 1561                  * get the lock to be sure.
 1562                  */
 1563                 hpts = tcp_hpts_lock(inp);
 1564                 if (inp->inp_in_hpts)
 1565                         panic("Hpts:%p inp:%p at free still on hpts",
 1566                               hpts, inp);
 1567                 mtx_unlock(&hpts->p_mtx);
 1568                 hpts = tcp_input_lock(inp);
 1569                 if (inp->inp_in_input) 
 1570                         panic("Hpts:%p inp:%p at free still on input hpts",
 1571                               hpts, inp);
 1572                 mtx_unlock(&hpts->p_mtx);
 1573         }
 1574 #endif
 1575         INP_WUNLOCK(inp);
 1576         pcbinfo = inp->inp_pcbinfo;
 1577         uma_zfree(pcbinfo->ipi_zone, inp);
 1578         return (1);
 1579 }
 1580 
/*
 * Temporary wrapper: drops a reference on a write-locked inpcb by forwarding
 * to in_pcbrele_wlocked() and returning its result unchanged.
 */
int
in_pcbrele(struct inpcb *inp)
{

        return (in_pcbrele_wlocked(inp));
}
 1590 
 1591 void
 1592 in_pcblist_rele_rlocked(epoch_context_t ctx)
 1593 {
 1594         struct in_pcblist *il;
 1595         struct inpcb *inp;
 1596         struct inpcbinfo *pcbinfo;
 1597         int i, n;
 1598 
 1599         il = __containerof(ctx, struct in_pcblist, il_epoch_ctx);
 1600         pcbinfo = il->il_pcbinfo;
 1601         n = il->il_count;
 1602         INP_INFO_WLOCK(pcbinfo);
 1603         for (i = 0; i < n; i++) {
 1604                 inp = il->il_inp_list[i];
 1605                 INP_RLOCK(inp);
 1606                 if (!in_pcbrele_rlocked(inp))
 1607                         INP_RUNLOCK(inp);
 1608         }
 1609         INP_INFO_WUNLOCK(pcbinfo);
 1610         free(il, M_TEMP);
 1611 }
 1612 
/*
 * Epoch callback that releases an inpcbport.  The structure is recovered
 * from its embedded phd_epoch_ctx and freed with the M_PCB malloc type.
 */
static void
inpcbport_free(epoch_context_t ctx)
{
        struct inpcbport *phd;

        phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
        free(phd, M_PCB);
}
 1621 
/*
 * Epoch callback scheduled by in_pcbfree() that completes the teardown of an
 * inpcb.  With the inpcb write-locked and its vnet set as current, it
 * releases the per-pcb resources (IPsec policy, IPv6 output options, IPv4
 * options mbuf, credential, MAC label) and then drops a reference with
 * in_pcbrele_wlocked().  Multicast options are detached up front and freed
 * last, through saved pointers.
 */
static void
in_pcbfree_deferred(epoch_context_t ctx)
{
        struct inpcb *inp;
        int released __unused;

        /* Recover the inpcb from its embedded epoch context. */
        inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);

        INP_WLOCK(inp);
        CURVNET_SET(inp->inp_vnet);
#ifdef INET
        /* Detach the IPv4 multicast options now; they are freed below. */
        struct ip_moptions *imo = inp->inp_moptions;
        inp->inp_moptions = NULL;
#endif
        /* XXXRW: Do as much as possible here. */
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
        if (inp->inp_sp != NULL)
                ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET6
        /* Likewise detach the IPv6 multicast options for a later free. */
        struct ip6_moptions *im6o = NULL;
        if (inp->inp_vflag & INP_IPV6PROTO) {
                ip6_freepcbopts(inp->in6p_outputopts);
                im6o = inp->in6p_moptions;
                inp->in6p_moptions = NULL;
        }
#endif
        if (inp->inp_options)
                (void)m_free(inp->inp_options);
        inp->inp_vflag = 0;
        crfree(inp->inp_cred);
#ifdef MAC
        mac_inpcb_destroy(inp);
#endif
        /*
         * in_pcbrele_wlocked() returns non-zero only after it has dropped
         * the write lock, so inp is not dereferenced past this point; only
         * the pointers saved above are used.
         */
        released = in_pcbrele_wlocked(inp);
        MPASS(released);
#ifdef INET6
        ip6_freemoptions(im6o);
#endif
#ifdef INET
        inp_freemoptions(imo);
#endif  
        CURVNET_RESTORE();
}
 1666 
 1667 /*
 1668  * Unconditionally schedule an inpcb to be freed by decrementing its
 1669  * reference count, which should occur only after the inpcb has been detached
 1670  * from its socket.  If another thread holds a temporary reference (acquired
 1671  * using in_pcbref()) then the free is deferred until that reference is
 1672  * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
 1673  * work, including removal from global lists, is done in this context, where
 1674  * the pcbinfo lock is held.
 1675  */
 1676 void
 1677 in_pcbfree(struct inpcb *inp)
 1678 {
 1679         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 1680 
 1681         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 1682         KASSERT((inp->inp_flags2 & INP_FREED) == 0,
 1683             ("%s: called twice for pcb %p", __func__, inp));
 1684         if (inp->inp_flags2 & INP_FREED) {
 1685                 INP_WUNLOCK(inp);
 1686                 return;
 1687         }
 1688 
 1689 #ifdef INVARIANTS
 1690         if (pcbinfo == &V_tcbinfo) {
 1691                 INP_INFO_LOCK_ASSERT(pcbinfo);
 1692         } else {
 1693                 INP_INFO_WLOCK_ASSERT(pcbinfo);
 1694         }
 1695 #endif
 1696         INP_WLOCK_ASSERT(inp);
 1697         INP_LIST_WLOCK(pcbinfo);
 1698         in_pcbremlists(inp);
 1699         INP_LIST_WUNLOCK(pcbinfo);
 1700         RO_INVALIDATE_CACHE(&inp->inp_route);
 1701         /* mark as destruction in progress */
 1702         inp->inp_flags2 |= INP_FREED;
 1703         INP_WUNLOCK(inp);
 1704         epoch_call(net_epoch_preempt, &inp->inp_epoch_ctx, in_pcbfree_deferred);
 1705 }
 1706 
 1707 /*
 1708  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 1709  * port reservation, and preventing it from being returned by inpcb lookups.
 1710  *
 1711  * It is used by TCP to mark an inpcb as unused and avoid future packet
 1712  * delivery or event notification when a socket remains open but TCP has
 1713  * closed.  This might occur as a result of a shutdown()-initiated TCP close
 1714  * or a RST on the wire, and allows the port binding to be reused while still
 1715  * maintaining the invariant that so_pcb always points to a valid inpcb until
 1716  * in_pcbdetach().
 1717  *
 1718  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
 1719  * in_pcbnotifyall() and in_pcbpurgeif0()?
 1720  */
 1721 void
 1722 in_pcbdrop(struct inpcb *inp)
 1723 {
 1724 
 1725         INP_WLOCK_ASSERT(inp);
 1726 #ifdef INVARIANTS
 1727         if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
 1728                 MPASS(inp->inp_refcount > 1);
 1729 #endif
 1730 
 1731         /*
 1732          * XXXRW: Possibly we should protect the setting of INP_DROPPED with
 1733          * the hash lock...?
 1734          */
 1735         inp->inp_flags |= INP_DROPPED;
 1736         if (inp->inp_flags & INP_INHASHLIST) {
 1737                 struct inpcbport *phd = inp->inp_phd;
 1738 
 1739                 INP_HASH_WLOCK(inp->inp_pcbinfo);
 1740                 in_pcbremlbgrouphash(inp);
 1741                 CK_LIST_REMOVE(inp, inp_hash);
 1742                 CK_LIST_REMOVE(inp, inp_portlist);
 1743                 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 1744                         CK_LIST_REMOVE(phd, phd_hash);
 1745                         epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free);
 1746                 }
 1747                 INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 1748                 inp->inp_flags &= ~INP_INHASHLIST;
 1749 #ifdef PCBGROUP
 1750                 in_pcbgroup_remove(inp);
 1751 #endif
 1752         }
 1753 }
 1754 
 1755 #ifdef INET
 1756 /*
 1757  * Common routines to return the socket addresses associated with inpcbs.
 1758  */
 1759 struct sockaddr *
 1760 in_sockaddr(in_port_t port, struct in_addr *addr_p)
 1761 {
 1762         struct sockaddr_in *sin;
 1763 
 1764         sin = malloc(sizeof *sin, M_SONAME,
 1765                 M_WAITOK | M_ZERO);
 1766         sin->sin_family = AF_INET;
 1767         sin->sin_len = sizeof(*sin);
 1768         sin->sin_addr = *addr_p;
 1769         sin->sin_port = port;
 1770 
 1771         return (struct sockaddr *)sin;
 1772 }
 1773 
 1774 int
 1775 in_getsockaddr(struct socket *so, struct sockaddr **nam)
 1776 {
 1777         struct inpcb *inp;
 1778         struct in_addr addr;
 1779         in_port_t port;
 1780 
 1781         inp = sotoinpcb(so);
 1782         KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
 1783 
 1784         INP_RLOCK(inp);
 1785         port = inp->inp_lport;
 1786         addr = inp->inp_laddr;
 1787         INP_RUNLOCK(inp);
 1788 
 1789         *nam = in_sockaddr(port, &addr);
 1790         return 0;
 1791 }
 1792 
 1793 int
 1794 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
 1795 {
 1796         struct inpcb *inp;
 1797         struct in_addr addr;
 1798         in_port_t port;
 1799 
 1800         inp = sotoinpcb(so);
 1801         KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
 1802 
 1803         INP_RLOCK(inp);
 1804         port = inp->inp_fport;
 1805         addr = inp->inp_faddr;
 1806         INP_RUNLOCK(inp);
 1807 
 1808         *nam = in_sockaddr(port, &addr);
 1809         return 0;
 1810 }
 1811 
 1812 void
 1813 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
 1814     struct inpcb *(*notify)(struct inpcb *, int))
 1815 {
 1816         struct inpcb *inp, *inp_temp;
 1817 
 1818         INP_INFO_WLOCK(pcbinfo);
 1819         CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
 1820                 INP_WLOCK(inp);
 1821 #ifdef INET6
 1822                 if ((inp->inp_vflag & INP_IPV4) == 0) {
 1823                         INP_WUNLOCK(inp);
 1824                         continue;
 1825                 }
 1826 #endif
 1827                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
 1828                     inp->inp_socket == NULL) {
 1829                         INP_WUNLOCK(inp);
 1830                         continue;
 1831                 }
 1832                 if ((*notify)(inp, errno))
 1833                         INP_WUNLOCK(inp);
 1834         }
 1835         INP_INFO_WUNLOCK(pcbinfo);
 1836 }
 1837 
 1838 void
 1839 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 1840 {
 1841         struct inpcb *inp;
 1842         struct in_multi *inm;
 1843         struct in_mfilter *imf;
 1844         struct ip_moptions *imo;
 1845 
 1846         INP_INFO_WLOCK(pcbinfo);
 1847         CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
 1848                 INP_WLOCK(inp);
 1849                 imo = inp->inp_moptions;
 1850                 if ((inp->inp_vflag & INP_IPV4) &&
 1851                     imo != NULL) {
 1852                         /*
 1853                          * Unselect the outgoing interface if it is being
 1854                          * detached.
 1855                          */
 1856                         if (imo->imo_multicast_ifp == ifp)
 1857                                 imo->imo_multicast_ifp = NULL;
 1858 
 1859                         /*
 1860                          * Drop multicast group membership if we joined
 1861                          * through the interface being detached.
 1862                          *
 1863                          * XXX This can all be deferred to an epoch_call
 1864                          */
 1865 restart:
 1866                         IP_MFILTER_FOREACH(imf, &imo->imo_head) {
 1867                                 if ((inm = imf->imf_inm) == NULL)
 1868                                         continue;
 1869                                 if (inm->inm_ifp != ifp)
 1870                                         continue;
 1871                                 ip_mfilter_remove(&imo->imo_head, imf);
 1872                                 IN_MULTI_LOCK_ASSERT();
 1873                                 in_leavegroup_locked(inm, NULL);
 1874                                 ip_mfilter_free(imf);
 1875                                 goto restart;
 1876                         }
 1877                 }
 1878                 INP_WUNLOCK(inp);
 1879         }
 1880         INP_INFO_WUNLOCK(pcbinfo);
 1881 }
 1882 
 1883 /*
 1884  * Lookup a PCB based on the local address and port.  Caller must hold the
 1885  * hash lock.  No inpcb locks or references are acquired.
 1886  */
 1887 #define INP_LOOKUP_MAPPED_PCB_COST      3
 1888 struct inpcb *
 1889 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
 1890     u_short lport, int lookupflags, struct ucred *cred)
 1891 {
 1892         struct inpcb *inp;
 1893 #ifdef INET6
 1894         int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
 1895 #else
 1896         int matchwild = 3;
 1897 #endif
 1898         int wildcard;
 1899 
 1900         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 1901             ("%s: invalid lookup flags %d", __func__, lookupflags));
 1902 
 1903         INP_HASH_LOCK_ASSERT(pcbinfo);
 1904 
 1905         if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 1906                 struct inpcbhead *head;
 1907                 /*
 1908                  * Look for an unconnected (wildcard foreign addr) PCB that
 1909                  * matches the local address and port we're looking for.
 1910                  */
 1911                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 1912                     0, pcbinfo->ipi_hashmask)];
 1913                 CK_LIST_FOREACH(inp, head, inp_hash) {
 1914 #ifdef INET6
 1915                         /* XXX inp locking */
 1916                         if ((inp->inp_vflag & INP_IPV4) == 0)
 1917                                 continue;
 1918 #endif
 1919                         if (inp->inp_faddr.s_addr == INADDR_ANY &&
 1920                             inp->inp_laddr.s_addr == laddr.s_addr &&
 1921                             inp->inp_lport == lport) {
 1922                                 /*
 1923                                  * Found?
 1924                                  */
 1925                                 if (cred == NULL ||
 1926                                     prison_equal_ip4(cred->cr_prison,
 1927                                         inp->inp_cred->cr_prison))
 1928                                         return (inp);
 1929                         }
 1930                 }
 1931                 /*
 1932                  * Not found.
 1933                  */
 1934                 return (NULL);
 1935         } else {
 1936                 struct inpcbporthead *porthash;
 1937                 struct inpcbport *phd;
 1938                 struct inpcb *match = NULL;
 1939                 /*
 1940                  * Best fit PCB lookup.
 1941                  *
 1942                  * First see if this local port is in use by looking on the
 1943                  * port hash list.
 1944                  */
 1945                 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 1946                     pcbinfo->ipi_porthashmask)];
 1947                 CK_LIST_FOREACH(phd, porthash, phd_hash) {
 1948                         if (phd->phd_port == lport)
 1949                                 break;
 1950                 }
 1951                 if (phd != NULL) {
 1952                         /*
 1953                          * Port is in use by one or more PCBs. Look for best
 1954                          * fit.
 1955                          */
 1956                         CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 1957                                 wildcard = 0;
 1958                                 if (cred != NULL &&
 1959                                     !prison_equal_ip4(inp->inp_cred->cr_prison,
 1960                                         cred->cr_prison))
 1961                                         continue;
 1962 #ifdef INET6
 1963                                 /* XXX inp locking */
 1964                                 if ((inp->inp_vflag & INP_IPV4) == 0)
 1965                                         continue;
 1966                                 /*
 1967                                  * We never select the PCB that has
 1968                                  * INP_IPV6 flag and is bound to :: if
 1969                                  * we have another PCB which is bound
 1970                                  * to 0.0.0.0.  If a PCB has the
 1971                                  * INP_IPV6 flag, then we set its cost
 1972                                  * higher than IPv4 only PCBs.
 1973                                  *
 1974                                  * Note that the case only happens
 1975                                  * when a socket is bound to ::, under
 1976                                  * the condition that the use of the
 1977                                  * mapped address is allowed.
 1978                                  */
 1979                                 if ((inp->inp_vflag & INP_IPV6) != 0)
 1980                                         wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 1981 #endif
 1982                                 if (inp->inp_faddr.s_addr != INADDR_ANY)
 1983                                         wildcard++;
 1984                                 if (inp->inp_laddr.s_addr != INADDR_ANY) {
 1985                                         if (laddr.s_addr == INADDR_ANY)
 1986                                                 wildcard++;
 1987                                         else if (inp->inp_laddr.s_addr != laddr.s_addr)
 1988                                                 continue;
 1989                                 } else {
 1990                                         if (laddr.s_addr != INADDR_ANY)
 1991                                                 wildcard++;
 1992                                 }
 1993                                 if (wildcard < matchwild) {
 1994                                         match = inp;
 1995                                         matchwild = wildcard;
 1996                                         if (matchwild == 0)
 1997                                                 break;
 1998                                 }
 1999                         }
 2000                 }
 2001                 return (match);
 2002         }
 2003 }
 2004 #undef INP_LOOKUP_MAPPED_PCB_COST
 2005 
 2006 static struct inpcb *
 2007 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
 2008     const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
 2009     uint16_t fport, int lookupflags)
 2010 {
 2011         struct inpcb *local_wild;
 2012         const struct inpcblbgrouphead *hdr;
 2013         struct inpcblbgroup *grp;
 2014         uint32_t idx;
 2015 
 2016         INP_HASH_LOCK_ASSERT(pcbinfo);
 2017 
 2018         hdr = &pcbinfo->ipi_lbgrouphashbase[
 2019             INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
 2020 
 2021         /*
 2022          * Order of socket selection:
 2023          * 1. non-wild.
 2024          * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
 2025          *
 2026          * NOTE:
 2027          * - Load balanced group does not contain jailed sockets
 2028          * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
 2029          */
 2030         local_wild = NULL;
 2031         CK_LIST_FOREACH(grp, hdr, il_list) {
 2032 #ifdef INET6
 2033                 if (!(grp->il_vflag & INP_IPV4))
 2034                         continue;
 2035 #endif
 2036                 if (grp->il_lport != lport)
 2037                         continue;
 2038 
 2039                 idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
 2040                     grp->il_inpcnt;
 2041                 if (grp->il_laddr.s_addr == laddr->s_addr)
 2042                         return (grp->il_inp[idx]);
 2043                 if (grp->il_laddr.s_addr == INADDR_ANY &&
 2044                     (lookupflags & INPLOOKUP_WILDCARD) != 0)
 2045                         local_wild = grp->il_inp[idx];
 2046         }
 2047         return (local_wild);
 2048 }
 2049 
 2050 #ifdef PCBGROUP
 2051 /*
 2052  * Lookup PCB in hash list, using pcbgroup tables.
 2053  */
 2054 static struct inpcb *
 2055 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
 2056     struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
 2057     u_int lport_arg, int lookupflags, struct ifnet *ifp)
 2058 {
 2059         struct inpcbhead *head;
 2060         struct inpcb *inp, *tmpinp;
 2061         u_short fport = fport_arg, lport = lport_arg;
 2062         bool locked;
 2063 
 2064         /*
 2065          * First look for an exact match.
 2066          */
 2067         tmpinp = NULL;
 2068         INP_GROUP_LOCK(pcbgroup);
 2069         head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 2070             pcbgroup->ipg_hashmask)];
 2071         CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 2072 #ifdef INET6
 2073                 /* XXX inp locking */
 2074                 if ((inp->inp_vflag & INP_IPV4) == 0)
 2075                         continue;
 2076 #endif
 2077                 if (inp->inp_faddr.s_addr == faddr.s_addr &&
 2078                     inp->inp_laddr.s_addr == laddr.s_addr &&
 2079                     inp->inp_fport == fport &&
 2080                     inp->inp_lport == lport) {
 2081                         /*
 2082                          * XXX We should be able to directly return
 2083                          * the inp here, without any checks.
 2084                          * Well unless both bound with SO_REUSEPORT?
 2085                          */
 2086                         if (prison_flag(inp->inp_cred, PR_IP4))
 2087                                 goto found;
 2088                         if (tmpinp == NULL)
 2089                                 tmpinp = inp;
 2090                 }
 2091         }
 2092         if (tmpinp != NULL) {
 2093                 inp = tmpinp;
 2094                 goto found;
 2095         }
 2096 
 2097 #ifdef  RSS
 2098         /*
 2099          * For incoming connections, we may wish to do a wildcard
 2100          * match for an RSS-local socket.
 2101          */
 2102         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 2103                 struct inpcb *local_wild = NULL, *local_exact = NULL;
 2104 #ifdef INET6
 2105                 struct inpcb *local_wild_mapped = NULL;
 2106 #endif
 2107                 struct inpcb *jail_wild = NULL;
 2108                 struct inpcbhead *head;
 2109                 int injail;
 2110 
 2111                 /*
 2112                  * Order of socket selection - we always prefer jails.
 2113                  *      1. jailed, non-wild.
 2114                  *      2. jailed, wild.
 2115                  *      3. non-jailed, non-wild.
 2116                  *      4. non-jailed, wild.
 2117                  */
 2118 
 2119                 head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
 2120                     lport, 0, pcbgroup->ipg_hashmask)];
 2121                 CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
 2122 #ifdef INET6
 2123                         /* XXX inp locking */
 2124                         if ((inp->inp_vflag & INP_IPV4) == 0)
 2125                                 continue;
 2126 #endif
 2127                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
 2128                             inp->inp_lport != lport)
 2129                                 continue;
 2130 
 2131                         injail = prison_flag(inp->inp_cred, PR_IP4);
 2132                         if (injail) {
 2133                                 if (prison_check_ip4(inp->inp_cred,
 2134                                     &laddr) != 0)
 2135                                         continue;
 2136                         } else {
 2137                                 if (local_exact != NULL)
 2138                                         continue;
 2139                         }
 2140 
 2141                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
 2142                                 if (injail)
 2143                                         goto found;
 2144                                 else
 2145                                         local_exact = inp;
 2146                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 2147 #ifdef INET6
 2148                                 /* XXX inp locking, NULL check */
 2149                                 if (inp->inp_vflag & INP_IPV6PROTO)
 2150                                         local_wild_mapped = inp;
 2151                                 else
 2152 #endif
 2153                                         if (injail)
 2154                                                 jail_wild = inp;
 2155                                         else
 2156                                                 local_wild = inp;
 2157                         }
 2158                 } /* LIST_FOREACH */
 2159 
 2160                 inp = jail_wild;
 2161                 if (inp == NULL)
 2162                         inp = local_exact;
 2163                 if (inp == NULL)
 2164                         inp = local_wild;
 2165 #ifdef INET6
 2166                 if (inp == NULL)
 2167                         inp = local_wild_mapped;
 2168 #endif
 2169                 if (inp != NULL)
 2170                         goto found;
 2171         }
 2172 #endif
 2173 
 2174         /*
 2175          * Then look for a wildcard match, if requested.
 2176          */
 2177         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 2178                 struct inpcb *local_wild = NULL, *local_exact = NULL;
 2179 #ifdef INET6
 2180                 struct inpcb *local_wild_mapped = NULL;
 2181 #endif
 2182                 struct inpcb *jail_wild = NULL;
 2183                 struct inpcbhead *head;
 2184                 int injail;
 2185 
 2186                 /*
 2187                  * Order of socket selection - we always prefer jails.
 2188                  *      1. jailed, non-wild.
 2189                  *      2. jailed, wild.
 2190                  *      3. non-jailed, non-wild.
 2191                  *      4. non-jailed, wild.
 2192                  */
 2193                 head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
 2194                     0, pcbinfo->ipi_wildmask)];
 2195                 CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
 2196 #ifdef INET6
 2197                         /* XXX inp locking */
 2198                         if ((inp->inp_vflag & INP_IPV4) == 0)
 2199                                 continue;
 2200 #endif
 2201                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
 2202                             inp->inp_lport != lport)
 2203                                 continue;
 2204 
 2205                         injail = prison_flag(inp->inp_cred, PR_IP4);
 2206                         if (injail) {
 2207                                 if (prison_check_ip4(inp->inp_cred,
 2208                                     &laddr) != 0)
 2209                                         continue;
 2210                         } else {
 2211                                 if (local_exact != NULL)
 2212                                         continue;
 2213                         }
 2214 
 2215                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
 2216                                 if (injail)
 2217                                         goto found;
 2218                                 else
 2219                                         local_exact = inp;
 2220                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 2221 #ifdef INET6
 2222                                 /* XXX inp locking, NULL check */
 2223                                 if (inp->inp_vflag & INP_IPV6PROTO)
 2224                                         local_wild_mapped = inp;
 2225                                 else
 2226 #endif
 2227                                         if (injail)
 2228                                                 jail_wild = inp;
 2229                                         else
 2230                                                 local_wild = inp;
 2231                         }
 2232                 } /* LIST_FOREACH */
 2233                 inp = jail_wild;
 2234                 if (inp == NULL)
 2235                         inp = local_exact;
 2236                 if (inp == NULL)
 2237                         inp = local_wild;
 2238 #ifdef INET6
 2239                 if (inp == NULL)
 2240                         inp = local_wild_mapped;
 2241 #endif
 2242                 if (inp != NULL)
 2243                         goto found;
 2244         } /* if (lookupflags & INPLOOKUP_WILDCARD) */
 2245         INP_GROUP_UNLOCK(pcbgroup);
 2246         return (NULL);
 2247 
 2248 found:
 2249         if (lookupflags & INPLOOKUP_WLOCKPCB)
 2250                 locked = INP_TRY_WLOCK(inp);
 2251         else if (lookupflags & INPLOOKUP_RLOCKPCB)
 2252                 locked = INP_TRY_RLOCK(inp);
 2253         else
 2254                 panic("%s: locking bug", __func__);
 2255         if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) {
 2256                 if (lookupflags & INPLOOKUP_WLOCKPCB)
 2257                         INP_WUNLOCK(inp);
 2258                 else
 2259                         INP_RUNLOCK(inp);
 2260                 return (NULL);
 2261         } else if (!locked)
 2262                 in_pcbref(inp);
 2263         INP_GROUP_UNLOCK(pcbgroup);
 2264         if (!locked) {
 2265                 if (lookupflags & INPLOOKUP_WLOCKPCB) {
 2266                         INP_WLOCK(inp);
 2267                         if (in_pcbrele_wlocked(inp))
 2268                                 return (NULL);
 2269                 } else {
 2270                         INP_RLOCK(inp);
 2271                         if (in_pcbrele_rlocked(inp))
 2272                                 return (NULL);
 2273                 }
 2274         }
 2275 #ifdef INVARIANTS
 2276         if (lookupflags & INPLOOKUP_WLOCKPCB)
 2277                 INP_WLOCK_ASSERT(inp);
 2278         else
 2279                 INP_RLOCK_ASSERT(inp);
 2280 #endif
 2281         return (inp);
 2282 }
 2283 #endif /* PCBGROUP */
 2284 
 2285 /*
 2286  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
 2287  * that the caller has locked the hash list, and will not perform any further
 2288  * locking or reference operations on either the hash list or the connection.
 2289  */
 2290 static struct inpcb *
 2291 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
 2292     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
 2293     struct ifnet *ifp)
 2294 {
 2295         struct inpcbhead *head;
 2296         struct inpcb *inp, *tmpinp;
 2297         u_short fport = fport_arg, lport = lport_arg;
 2298 
 2299 #ifdef INVARIANTS
 2300         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 2301             ("%s: invalid lookup flags %d", __func__, lookupflags));
 2302         if (!mtx_owned(&pcbinfo->ipi_hash_lock))
 2303                 MPASS(in_epoch_verbose(net_epoch_preempt, 1));
 2304 #endif
 2305         /*
 2306          * First look for an exact match.
 2307          */
 2308         tmpinp = NULL;
 2309         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
 2310             pcbinfo->ipi_hashmask)];
 2311         CK_LIST_FOREACH(inp, head, inp_hash) {
 2312 #ifdef INET6
 2313                 /* XXX inp locking */
 2314                 if ((inp->inp_vflag & INP_IPV4) == 0)
 2315                         continue;
 2316 #endif
 2317                 if (inp->inp_faddr.s_addr == faddr.s_addr &&
 2318                     inp->inp_laddr.s_addr == laddr.s_addr &&
 2319                     inp->inp_fport == fport &&
 2320                     inp->inp_lport == lport) {
 2321                         /*
 2322                          * XXX We should be able to directly return
 2323                          * the inp here, without any checks.
 2324                          * Well unless both bound with SO_REUSEPORT?
 2325                          */
 2326                         if (prison_flag(inp->inp_cred, PR_IP4))
 2327                                 return (inp);
 2328                         if (tmpinp == NULL)
 2329                                 tmpinp = inp;
 2330                 }
 2331         }
 2332         if (tmpinp != NULL)
 2333                 return (tmpinp);
 2334 
 2335         /*
 2336          * Then look in lb group (for wildcard match).
 2337          */
 2338         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 2339                 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
 2340                     fport, lookupflags);
 2341                 if (inp != NULL)
 2342                         return (inp);
 2343         }
 2344 
 2345         /*
 2346          * Then look for a wildcard match, if requested.
 2347          */
 2348         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 2349                 struct inpcb *local_wild = NULL, *local_exact = NULL;
 2350 #ifdef INET6
 2351                 struct inpcb *local_wild_mapped = NULL;
 2352 #endif
 2353                 struct inpcb *jail_wild = NULL;
 2354                 int injail;
 2355 
 2356                 /*
 2357                  * Order of socket selection - we always prefer jails.
 2358                  *      1. jailed, non-wild.
 2359                  *      2. jailed, wild.
 2360                  *      3. non-jailed, non-wild.
 2361                  *      4. non-jailed, wild.
 2362                  */
 2363 
 2364                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
 2365                     0, pcbinfo->ipi_hashmask)];
 2366                 CK_LIST_FOREACH(inp, head, inp_hash) {
 2367 #ifdef INET6
 2368                         /* XXX inp locking */
 2369                         if ((inp->inp_vflag & INP_IPV4) == 0)
 2370                                 continue;
 2371 #endif
 2372                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
 2373                             inp->inp_lport != lport)
 2374                                 continue;
 2375 
 2376                         injail = prison_flag(inp->inp_cred, PR_IP4);
 2377                         if (injail) {
 2378                                 if (prison_check_ip4(inp->inp_cred,
 2379                                     &laddr) != 0)
 2380                                         continue;
 2381                         } else {
 2382                                 if (local_exact != NULL)
 2383                                         continue;
 2384                         }
 2385 
 2386                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
 2387                                 if (injail)
 2388                                         return (inp);
 2389                                 else
 2390                                         local_exact = inp;
 2391                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 2392 #ifdef INET6
 2393                                 /* XXX inp locking, NULL check */
 2394                                 if (inp->inp_vflag & INP_IPV6PROTO)
 2395                                         local_wild_mapped = inp;
 2396                                 else
 2397 #endif
 2398                                         if (injail)
 2399                                                 jail_wild = inp;
 2400                                         else
 2401                                                 local_wild = inp;
 2402                         }
 2403                 } /* LIST_FOREACH */
 2404                 if (jail_wild != NULL)
 2405                         return (jail_wild);
 2406                 if (local_exact != NULL)
 2407                         return (local_exact);
 2408                 if (local_wild != NULL)
 2409                         return (local_wild);
 2410 #ifdef INET6
 2411                 if (local_wild_mapped != NULL)
 2412                         return (local_wild_mapped);
 2413 #endif
 2414         } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 2415 
 2416         return (NULL);
 2417 }
 2418 
 2419 /*
 2420  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
 2421  * hash list lock, and will return the inpcb locked (i.e., requires
 2422  * INPLOOKUP_LOCKPCB).
 2423  */
 2424 static struct inpcb *
 2425 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
 2426     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
 2427     struct ifnet *ifp)
 2428 {
 2429         struct inpcb *inp;
 2430 
 2431         INP_HASH_RLOCK(pcbinfo);
 2432         inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 2433             (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
 2434         if (inp != NULL) {
 2435                 if (lookupflags & INPLOOKUP_WLOCKPCB) {
 2436                         INP_WLOCK(inp);
 2437                         if (__predict_false(inp->inp_flags2 & INP_FREED)) {
 2438                                 INP_WUNLOCK(inp);
 2439                                 inp = NULL;
 2440                         }
 2441                 } else if (lookupflags & INPLOOKUP_RLOCKPCB) {
 2442                         INP_RLOCK(inp);
 2443                         if (__predict_false(inp->inp_flags2 & INP_FREED)) {
 2444                                 INP_RUNLOCK(inp);
 2445                                 inp = NULL;
 2446                         }
 2447                 } else
 2448                         panic("%s: locking bug", __func__);
 2449 #ifdef INVARIANTS
 2450                 if (inp != NULL) {
 2451                         if (lookupflags & INPLOOKUP_WLOCKPCB)
 2452                                 INP_WLOCK_ASSERT(inp);
 2453                         else
 2454                                 INP_RLOCK_ASSERT(inp);
 2455                 }
 2456 #endif
 2457         }
 2458         INP_HASH_RUNLOCK(pcbinfo);
 2459         return (inp);
 2460 }
 2461 
 2462 /*
 2463  * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
 2464  * from which a pre-calculated hash value may be extracted.
 2465  *
 2466  * Possibly more of this logic should be in in_pcbgroup.c.
 2467  */
 2468 struct inpcb *
 2469 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
 2470     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 2471 {
 2472 #if defined(PCBGROUP) && !defined(RSS)
 2473         struct inpcbgroup *pcbgroup;
 2474 #endif
 2475 
 2476         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 2477             ("%s: invalid lookup flags %d", __func__, lookupflags));
 2478         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 2479             ("%s: LOCKPCB not set", __func__));
 2480 
 2481         /*
 2482          * When not using RSS, use connection groups in preference to the
 2483          * reservation table when looking up 4-tuples.  When using RSS, just
 2484          * use the reservation table, due to the cost of the Toeplitz hash
 2485          * in software.
 2486          *
 2487          * XXXRW: This policy belongs in the pcbgroup code, as in principle
 2488          * we could be doing RSS with a non-Toeplitz hash that is affordable
 2489          * in software.
 2490          */
 2491 #if defined(PCBGROUP) && !defined(RSS)
 2492         if (in_pcbgroup_enabled(pcbinfo)) {
 2493                 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 2494                     fport);
 2495                 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 2496                     laddr, lport, lookupflags, ifp));
 2497         }
 2498 #endif
 2499         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 2500             lookupflags, ifp));
 2501 }
 2502 
 2503 struct inpcb *
 2504 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
 2505     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
 2506     struct ifnet *ifp, struct mbuf *m)
 2507 {
 2508 #ifdef PCBGROUP
 2509         struct inpcbgroup *pcbgroup;
 2510 #endif
 2511 
 2512         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 2513             ("%s: invalid lookup flags %d", __func__, lookupflags));
 2514         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 2515             ("%s: LOCKPCB not set", __func__));
 2516 
 2517 #ifdef PCBGROUP
 2518         /*
 2519          * If we can use a hardware-generated hash to look up the connection
 2520          * group, use that connection group to find the inpcb.  Otherwise
 2521          * fall back on a software hash -- or the reservation table if we're
 2522          * using RSS.
 2523          *
 2524          * XXXRW: As above, that policy belongs in the pcbgroup code.
 2525          */
 2526         if (in_pcbgroup_enabled(pcbinfo) &&
 2527             !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
 2528                 pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
 2529                     m->m_pkthdr.flowid);
 2530                 if (pcbgroup != NULL)
 2531                         return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
 2532                             fport, laddr, lport, lookupflags, ifp));
 2533 #ifndef RSS
 2534                 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
 2535                     fport);
 2536                 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
 2537                     laddr, lport, lookupflags, ifp));
 2538 #endif
 2539         }
 2540 #endif
 2541         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 2542             lookupflags, ifp));
 2543 }
 2544 #endif /* INET */
 2545 
 2546 /*
 2547  * Insert PCB onto various hash lists.
 2548  */
 2549 static int
 2550 in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m)
 2551 {
 2552         struct inpcbhead *pcbhash;
 2553         struct inpcbporthead *pcbporthash;
 2554         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 2555         struct inpcbport *phd;
 2556         u_int32_t hashkey_faddr;
 2557         int so_options;
 2558 
 2559         INP_WLOCK_ASSERT(inp);
 2560         INP_HASH_WLOCK_ASSERT(pcbinfo);
 2561 
 2562         KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
 2563             ("in_pcbinshash: INP_INHASHLIST"));
 2564 
 2565 #ifdef INET6
 2566         if (inp->inp_vflag & INP_IPV6)
 2567                 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 2568         else
 2569 #endif
 2570         hashkey_faddr = inp->inp_faddr.s_addr;
 2571 
 2572         pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 2573                  inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 2574 
 2575         pcbporthash = &pcbinfo->ipi_porthashbase[
 2576             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 2577 
 2578         /*
 2579          * Add entry to load balance group.
 2580          * Only do this if SO_REUSEPORT_LB is set.
 2581          */
 2582         so_options = inp_so_options(inp);
 2583         if (so_options & SO_REUSEPORT_LB) {
 2584                 int ret = in_pcbinslbgrouphash(inp);
 2585                 if (ret) {
 2586                         /* pcb lb group malloc fail (ret=ENOBUFS). */
 2587                         return (ret);
 2588                 }
 2589         }
 2590 
 2591         /*
 2592          * Go through port list and look for a head for this lport.
 2593          */
 2594         CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
 2595                 if (phd->phd_port == inp->inp_lport)
 2596                         break;
 2597         }
 2598         /*
 2599          * If none exists, malloc one and tack it on.
 2600          */
 2601         if (phd == NULL) {
 2602                 phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
 2603                 if (phd == NULL) {
 2604                         return (ENOBUFS); /* XXX */
 2605                 }
 2606                 bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
 2607                 phd->phd_port = inp->inp_lport;
 2608                 CK_LIST_INIT(&phd->phd_pcblist);
 2609                 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
 2610         }
 2611         inp->inp_phd = phd;
 2612         CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 2613         CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 2614         inp->inp_flags |= INP_INHASHLIST;
 2615 #ifdef PCBGROUP
 2616         if (m != NULL) {
 2617                 in_pcbgroup_update_mbuf(inp, m);
 2618         } else {
 2619                 in_pcbgroup_update(inp);
 2620         }
 2621 #endif
 2622         return (0);
 2623 }
 2624 
 2625 int
 2626 in_pcbinshash(struct inpcb *inp)
 2627 {
 2628 
 2629         return (in_pcbinshash_internal(inp, NULL));
 2630 }
 2631 
 2632 int
 2633 in_pcbinshash_mbuf(struct inpcb *inp, struct mbuf *m)
 2634 {
 2635 
 2636         return (in_pcbinshash_internal(inp, m));
 2637 }
 2638 
 2639 /*
 2640  * Move PCB to the proper hash bucket when { faddr, fport } have  been
 2641  * changed. NOTE: This does not handle the case of the lport changing (the
 2642  * hashed port list would have to be updated as well), so the lport must
 2643  * not change after in_pcbinshash() has been called.
 2644  */
 2645 void
 2646 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
 2647 {
 2648         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 2649         struct inpcbhead *head;
 2650         u_int32_t hashkey_faddr;
 2651 
 2652         INP_WLOCK_ASSERT(inp);
 2653         INP_HASH_WLOCK_ASSERT(pcbinfo);
 2654 
 2655         KASSERT(inp->inp_flags & INP_INHASHLIST,
 2656             ("in_pcbrehash: !INP_INHASHLIST"));
 2657 
 2658 #ifdef INET6
 2659         if (inp->inp_vflag & INP_IPV6)
 2660                 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
 2661         else
 2662 #endif
 2663         hashkey_faddr = inp->inp_faddr.s_addr;
 2664 
 2665         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
 2666                 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 2667 
 2668         CK_LIST_REMOVE(inp, inp_hash);
 2669         CK_LIST_INSERT_HEAD(head, inp, inp_hash);
 2670 
 2671 #ifdef PCBGROUP
 2672         if (m != NULL)
 2673                 in_pcbgroup_update_mbuf(inp, m);
 2674         else
 2675                 in_pcbgroup_update(inp);
 2676 #endif
 2677 }
 2678 
 2679 void
 2680 in_pcbrehash(struct inpcb *inp)
 2681 {
 2682 
 2683         in_pcbrehash_mbuf(inp, NULL);
 2684 }
 2685 
 2686 /*
 2687  * Remove PCB from various lists.
 2688  */
 2689 static void
 2690 in_pcbremlists(struct inpcb *inp)
 2691 {
 2692         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 2693 
 2694 #ifdef INVARIANTS
 2695         if (pcbinfo == &V_tcbinfo) {
 2696                 INP_INFO_RLOCK_ASSERT(pcbinfo);
 2697         } else {
 2698                 INP_INFO_WLOCK_ASSERT(pcbinfo);
 2699         }
 2700 #endif
 2701 
 2702         INP_WLOCK_ASSERT(inp);
 2703         INP_LIST_WLOCK_ASSERT(pcbinfo);
 2704 
 2705         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 2706         if (inp->inp_flags & INP_INHASHLIST) {
 2707                 struct inpcbport *phd = inp->inp_phd;
 2708 
 2709                 INP_HASH_WLOCK(pcbinfo);
 2710 
 2711                 /* XXX: Only do if SO_REUSEPORT_LB set? */
 2712                 in_pcbremlbgrouphash(inp);
 2713 
 2714                 CK_LIST_REMOVE(inp, inp_hash);
 2715                 CK_LIST_REMOVE(inp, inp_portlist);
 2716                 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 2717                         CK_LIST_REMOVE(phd, phd_hash);
 2718                         epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free);
 2719                 }
 2720                 INP_HASH_WUNLOCK(pcbinfo);
 2721                 inp->inp_flags &= ~INP_INHASHLIST;
 2722         }
 2723         CK_LIST_REMOVE(inp, inp_list);
 2724         pcbinfo->ipi_count--;
 2725 #ifdef PCBGROUP
 2726         in_pcbgroup_remove(inp);
 2727 #endif
 2728 }
 2729 
 2730 /*
 2731  * Check for alternatives when higher level complains
 2732  * about service problems.  For now, invalidate cached
 2733  * routing information.  If the route was created dynamically
 2734  * (by a redirect), time to try a default gateway again.
 2735  */
 2736 void
 2737 in_losing(struct inpcb *inp)
 2738 {
 2739 
 2740         RO_INVALIDATE_CACHE(&inp->inp_route);
 2741         return;
 2742 }
 2743 
 2744 /*
 2745  * A set label operation has occurred at the socket layer, propagate the
 2746  * label change into the in_pcb for the socket.
 2747  */
 2748 void
 2749 in_pcbsosetlabel(struct socket *so)
 2750 {
 2751 #ifdef MAC
 2752         struct inpcb *inp;
 2753 
 2754         inp = sotoinpcb(so);
 2755         KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
 2756 
 2757         INP_WLOCK(inp);
 2758         SOCK_LOCK(so);
 2759         mac_inpcb_sosetlabel(so, inp);
 2760         SOCK_UNLOCK(so);
 2761         INP_WUNLOCK(inp);
 2762 #endif
 2763 }
 2764 
 2765 /*
 2766  * ipport_tick runs once per second, determining if random port allocation
 2767  * should be continued.  If more than ipport_randomcps ports have been
 2768  * allocated in the last second, then we return to sequential port
 2769  * allocation. We return to random allocation only once we drop below
 2770  * ipport_randomcps for at least ipport_randomtime seconds.
 2771  */
 2772 static void
 2773 ipport_tick(void *xtp)
 2774 {
 2775         VNET_ITERATOR_DECL(vnet_iter);
 2776 
 2777         VNET_LIST_RLOCK_NOSLEEP();
 2778         VNET_FOREACH(vnet_iter) {
 2779                 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */
 2780                 if (V_ipport_tcpallocs <=
 2781                     V_ipport_tcplastcount + V_ipport_randomcps) {
 2782                         if (V_ipport_stoprandom > 0)
 2783                                 V_ipport_stoprandom--;
 2784                 } else
 2785                         V_ipport_stoprandom = V_ipport_randomtime;
 2786                 V_ipport_tcplastcount = V_ipport_tcpallocs;
 2787                 CURVNET_RESTORE();
 2788         }
 2789         VNET_LIST_RUNLOCK_NOSLEEP();
 2790         callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
 2791 }
 2792 
 2793 static void
 2794 ip_fini(void *xtp)
 2795 {
 2796 
 2797         callout_stop(&ipport_tick_callout);
 2798 }
 2799 
 2800 /* 
 2801  * The ipport_callout should start running at about the time we attach the
 2802  * inet or inet6 domains.
 2803  */
 2804 static void
 2805 ipport_tick_init(const void *unused __unused)
 2806 {
 2807 
 2808         /* Start ipport_tick. */
 2809         callout_init(&ipport_tick_callout, 1);
 2810         callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
 2811         EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
 2812                 SHUTDOWN_PRI_DEFAULT);
 2813 }
 2814 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 
 2815     ipport_tick_init, NULL);
 2816 
 2817 void
 2818 inp_wlock(struct inpcb *inp)
 2819 {
 2820 
 2821         INP_WLOCK(inp);
 2822 }
 2823 
 2824 void
 2825 inp_wunlock(struct inpcb *inp)
 2826 {
 2827 
 2828         INP_WUNLOCK(inp);
 2829 }
 2830 
 2831 void
 2832 inp_rlock(struct inpcb *inp)
 2833 {
 2834 
 2835         INP_RLOCK(inp);
 2836 }
 2837 
 2838 void
 2839 inp_runlock(struct inpcb *inp)
 2840 {
 2841 
 2842         INP_RUNLOCK(inp);
 2843 }
 2844 
 2845 #ifdef INVARIANT_SUPPORT
 2846 void
 2847 inp_lock_assert(struct inpcb *inp)
 2848 {
 2849 
 2850         INP_WLOCK_ASSERT(inp);
 2851 }
 2852 
 2853 void
 2854 inp_unlock_assert(struct inpcb *inp)
 2855 {
 2856 
 2857         INP_UNLOCK_ASSERT(inp);
 2858 }
 2859 #endif
 2860 
 2861 void
 2862 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
 2863 {
 2864         struct inpcb *inp;
 2865 
 2866         INP_INFO_WLOCK(&V_tcbinfo);
 2867         CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
 2868                 INP_WLOCK(inp);
 2869                 func(inp, arg);
 2870                 INP_WUNLOCK(inp);
 2871         }
 2872         INP_INFO_WUNLOCK(&V_tcbinfo);
 2873 }
 2874 
 2875 struct socket *
 2876 inp_inpcbtosocket(struct inpcb *inp)
 2877 {
 2878 
 2879         INP_WLOCK_ASSERT(inp);
 2880         return (inp->inp_socket);
 2881 }
 2882 
 2883 struct tcpcb *
 2884 inp_inpcbtotcpcb(struct inpcb *inp)
 2885 {
 2886 
 2887         INP_WLOCK_ASSERT(inp);
 2888         return ((struct tcpcb *)inp->inp_ppcb);
 2889 }
 2890 
 2891 int
 2892 inp_ip_tos_get(const struct inpcb *inp)
 2893 {
 2894 
 2895         return (inp->inp_ip_tos);
 2896 }
 2897 
 2898 void
 2899 inp_ip_tos_set(struct inpcb *inp, int val)
 2900 {
 2901 
 2902         inp->inp_ip_tos = val;
 2903 }
 2904 
 2905 void
 2906 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
 2907     uint32_t *faddr, uint16_t *fp)
 2908 {
 2909 
 2910         INP_LOCK_ASSERT(inp);
 2911         *laddr = inp->inp_laddr.s_addr;
 2912         *faddr = inp->inp_faddr.s_addr;
 2913         *lp = inp->inp_lport;
 2914         *fp = inp->inp_fport;
 2915 }
 2916 
 2917 struct inpcb *
 2918 so_sotoinpcb(struct socket *so)
 2919 {
 2920 
 2921         return (sotoinpcb(so));
 2922 }
 2923 
 2924 struct tcpcb *
 2925 so_sototcpcb(struct socket *so)
 2926 {
 2927 
 2928         return (sototcpcb(so));
 2929 }
 2930 
 2931 /*
 2932  * Create an external-format (``xinpcb'') structure using the information in
 2933  * the kernel-format in_pcb structure pointed to by inp.  This is done to
 2934  * reduce the spew of irrelevant information over this interface, to isolate
 2935  * user code from changes in the kernel structure, and potentially to provide
 2936  * information-hiding if we decide that some of this information should be
 2937  * hidden from users.
 2938  */
 2939 void
 2940 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
 2941 {
 2942 
 2943         bzero(xi, sizeof(*xi));
 2944         xi->xi_len = sizeof(struct xinpcb);
 2945         if (inp->inp_socket)
 2946                 sotoxsocket(inp->inp_socket, &xi->xi_socket);
 2947         bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
 2948         xi->inp_gencnt = inp->inp_gencnt;
 2949         xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
 2950         xi->inp_flow = inp->inp_flow;
 2951         xi->inp_flowid = inp->inp_flowid;
 2952         xi->inp_flowtype = inp->inp_flowtype;
 2953         xi->inp_flags = inp->inp_flags;
 2954         xi->inp_flags2 = inp->inp_flags2;
 2955         xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
 2956         xi->in6p_cksum = inp->in6p_cksum;
 2957         xi->in6p_hops = inp->in6p_hops;
 2958         xi->inp_ip_tos = inp->inp_ip_tos;
 2959         xi->inp_vflag = inp->inp_vflag;
 2960         xi->inp_ip_ttl = inp->inp_ip_ttl;
 2961         xi->inp_ip_p = inp->inp_ip_p;
 2962         xi->inp_ip_minttl = inp->inp_ip_minttl;
 2963 }
 2964 
 2965 #ifdef DDB
 2966 static void
 2967 db_print_indent(int indent)
 2968 {
 2969         int i;
 2970 
 2971         for (i = 0; i < indent; i++)
 2972                 db_printf(" ");
 2973 }
 2974 
 2975 static void
 2976 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
 2977 {
 2978         char faddr_str[48], laddr_str[48];
 2979 
 2980         db_print_indent(indent);
 2981         db_printf("%s at %p\n", name, inc);
 2982 
 2983         indent += 2;
 2984 
 2985 #ifdef INET6
 2986         if (inc->inc_flags & INC_ISIPV6) {
 2987                 /* IPv6. */
 2988                 ip6_sprintf(laddr_str, &inc->inc6_laddr);
 2989                 ip6_sprintf(faddr_str, &inc->inc6_faddr);
 2990         } else
 2991 #endif
 2992         {
 2993                 /* IPv4. */
 2994                 inet_ntoa_r(inc->inc_laddr, laddr_str);
 2995                 inet_ntoa_r(inc->inc_faddr, faddr_str);
 2996         }
 2997         db_print_indent(indent);
 2998         db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
 2999             ntohs(inc->inc_lport));
 3000         db_print_indent(indent);
 3001         db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
 3002             ntohs(inc->inc_fport));
 3003 }
 3004 
 3005 static void
 3006 db_print_inpflags(int inp_flags)
 3007 {
 3008         int comma;
 3009 
 3010         comma = 0;
 3011         if (inp_flags & INP_RECVOPTS) {
 3012                 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
 3013                 comma = 1;
 3014         }
 3015         if (inp_flags & INP_RECVRETOPTS) {
 3016                 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
 3017                 comma = 1;
 3018         }
 3019         if (inp_flags & INP_RECVDSTADDR) {
 3020                 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
 3021                 comma = 1;
 3022         }
 3023         if (inp_flags & INP_ORIGDSTADDR) {
 3024                 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
 3025                 comma = 1;
 3026         }
 3027         if (inp_flags & INP_HDRINCL) {
 3028                 db_printf("%sINP_HDRINCL", comma ? ", " : "");
 3029                 comma = 1;
 3030         }
 3031         if (inp_flags & INP_HIGHPORT) {
 3032                 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
 3033                 comma = 1;
 3034         }
 3035         if (inp_flags & INP_LOWPORT) {
 3036                 db_printf("%sINP_LOWPORT", comma ? ", " : "");
 3037                 comma = 1;
 3038         }
 3039         if (inp_flags & INP_ANONPORT) {
 3040                 db_printf("%sINP_ANONPORT", comma ? ", " : "");
 3041                 comma = 1;
 3042         }
 3043         if (inp_flags & INP_RECVIF) {
 3044                 db_printf("%sINP_RECVIF", comma ? ", " : "");
 3045                 comma = 1;
 3046         }
 3047         if (inp_flags & INP_MTUDISC) {
 3048                 db_printf("%sINP_MTUDISC", comma ? ", " : "");
 3049                 comma = 1;
 3050         }
 3051         if (inp_flags & INP_RECVTTL) {
 3052                 db_printf("%sINP_RECVTTL", comma ? ", " : "");
 3053                 comma = 1;
 3054         }
 3055         if (inp_flags & INP_DONTFRAG) {
 3056                 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 3057                 comma = 1;
 3058         }
 3059         if (inp_flags & INP_RECVTOS) {
 3060                 db_printf("%sINP_RECVTOS", comma ? ", " : "");
 3061                 comma = 1;
 3062         }
 3063         if (inp_flags & IN6P_IPV6_V6ONLY) {
 3064                 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 3065                 comma = 1;
 3066         }
 3067         if (inp_flags & IN6P_PKTINFO) {
 3068                 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
 3069                 comma = 1;
 3070         }
 3071         if (inp_flags & IN6P_HOPLIMIT) {
 3072                 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
 3073                 comma = 1;
 3074         }
 3075         if (inp_flags & IN6P_HOPOPTS) {
 3076                 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
 3077                 comma = 1;
 3078         }
 3079         if (inp_flags & IN6P_DSTOPTS) {
 3080                 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
 3081                 comma = 1;
 3082         }
 3083         if (inp_flags & IN6P_RTHDR) {
 3084                 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
 3085                 comma = 1;
 3086         }
 3087         if (inp_flags & IN6P_RTHDRDSTOPTS) {
 3088                 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
 3089                 comma = 1;
 3090         }
 3091         if (inp_flags & IN6P_TCLASS) {
 3092                 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
 3093                 comma = 1;
 3094         }
 3095         if (inp_flags & IN6P_AUTOFLOWLABEL) {
 3096                 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
 3097                 comma = 1;
 3098         }
 3099         if (inp_flags & INP_TIMEWAIT) {
 3100                 db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
 3101                 comma  = 1;
 3102         }
 3103         if (inp_flags & INP_ONESBCAST) {
 3104                 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
 3105                 comma  = 1;
 3106         }
 3107         if (inp_flags & INP_DROPPED) {
 3108                 db_printf("%sINP_DROPPED", comma ? ", " : "");
 3109                 comma  = 1;
 3110         }
 3111         if (inp_flags & INP_SOCKREF) {
 3112                 db_printf("%sINP_SOCKREF", comma ? ", " : "");
 3113                 comma  = 1;
 3114         }
 3115         if (inp_flags & IN6P_RFC2292) {
 3116                 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
 3117                 comma = 1;
 3118         }
 3119         if (inp_flags & IN6P_MTU) {
 3120                 db_printf("IN6P_MTU%s", comma ? ", " : "");
 3121                 comma = 1;
 3122         }
 3123 }
 3124 
 3125 static void
 3126 db_print_inpvflag(u_char inp_vflag)
 3127 {
 3128         int comma;
 3129 
 3130         comma = 0;
 3131         if (inp_vflag & INP_IPV4) {
 3132                 db_printf("%sINP_IPV4", comma ? ", " : "");
 3133                 comma  = 1;
 3134         }
 3135         if (inp_vflag & INP_IPV6) {
 3136                 db_printf("%sINP_IPV6", comma ? ", " : "");
 3137                 comma  = 1;
 3138         }
 3139         if (inp_vflag & INP_IPV6PROTO) {
 3140                 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
 3141                 comma  = 1;
 3142         }
 3143 }
 3144 
 3145 static void
 3146 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
 3147 {
 3148 
 3149         db_print_indent(indent);
 3150         db_printf("%s at %p\n", name, inp);
 3151 
 3152         indent += 2;
 3153 
 3154         db_print_indent(indent);
 3155         db_printf("inp_flow: 0x%x\n", inp->inp_flow);
 3156 
 3157         db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
 3158 
 3159         db_print_indent(indent);
 3160         db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
 3161             inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
 3162 
 3163         db_print_indent(indent);
 3164         db_printf("inp_label: %p   inp_flags: 0x%x (",
 3165            inp->inp_label, inp->inp_flags);
 3166         db_print_inpflags(inp->inp_flags);
 3167         db_printf(")\n");
 3168 
 3169         db_print_indent(indent);
 3170         db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
 3171             inp->inp_vflag);
 3172         db_print_inpvflag(inp->inp_vflag);
 3173         db_printf(")\n");
 3174 
 3175         db_print_indent(indent);
 3176         db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
 3177             inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
 3178 
 3179         db_print_indent(indent);
 3180 #ifdef INET6
 3181         if (inp->inp_vflag & INP_IPV6) {
 3182                 db_printf("in6p_options: %p   in6p_outputopts: %p   "
 3183                     "in6p_moptions: %p\n", inp->in6p_options,
 3184                     inp->in6p_outputopts, inp->in6p_moptions);
 3185                 db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
 3186                     "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
 3187                     inp->in6p_hops);
 3188         } else
 3189 #endif
 3190         {
 3191                 db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
 3192                     "inp_ip_moptions: %p\n", inp->inp_ip_tos,
 3193                     inp->inp_options, inp->inp_moptions);
 3194         }
 3195 
 3196         db_print_indent(indent);
 3197         db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
 3198             (uintmax_t)inp->inp_gencnt);
 3199 }
 3200 
 3201 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 3202 {
 3203         struct inpcb *inp;
 3204 
 3205         if (!have_addr) {
 3206                 db_printf("usage: show inpcb <addr>\n");
 3207                 return;
 3208         }
 3209         inp = (struct inpcb *)addr;
 3210 
 3211         db_print_inpcb(inp, "inpcb", 0);
 3212 }
 3213 #endif /* DDB */
 3214 
 3215 #ifdef RATELIMIT
 3216 /*
 3217  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
 3218  * if any.
 3219  */
 3220 int
 3221 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
 3222 {
 3223         union if_snd_tag_modify_params params = {
 3224                 .rate_limit.max_rate = max_pacing_rate,
 3225         };
 3226         struct m_snd_tag *mst;
 3227         struct ifnet *ifp;
 3228         int error;
 3229 
 3230         mst = inp->inp_snd_tag;
 3231         if (mst == NULL)
 3232                 return (EINVAL);
 3233 
 3234         ifp = mst->ifp;
 3235         if (ifp == NULL)
 3236                 return (EINVAL);
 3237 
 3238         if (ifp->if_snd_tag_modify == NULL) {
 3239                 error = EOPNOTSUPP;
 3240         } else {
 3241                 error = ifp->if_snd_tag_modify(mst, &params);
 3242         }
 3243         return (error);
 3244 }
 3245 
 3246 /*
 3247  * Query existing TX rate limit based on the existing
 3248  * "inp->inp_snd_tag", if any.
 3249  */
 3250 int
 3251 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
 3252 {
 3253         union if_snd_tag_query_params params = { };
 3254         struct m_snd_tag *mst;
 3255         struct ifnet *ifp;
 3256         int error;
 3257 
 3258         mst = inp->inp_snd_tag;
 3259         if (mst == NULL)
 3260                 return (EINVAL);
 3261 
 3262         ifp = mst->ifp;
 3263         if (ifp == NULL)
 3264                 return (EINVAL);
 3265 
 3266         if (ifp->if_snd_tag_query == NULL) {
 3267                 error = EOPNOTSUPP;
 3268         } else {
 3269                 error = ifp->if_snd_tag_query(mst, &params);
 3270                 if (error == 0 &&  p_max_pacing_rate != NULL)
 3271                         *p_max_pacing_rate = params.rate_limit.max_rate;
 3272         }
 3273         return (error);
 3274 }
 3275 
 3276 /*
 3277  * Query existing TX queue level based on the existing
 3278  * "inp->inp_snd_tag", if any.
 3279  */
 3280 int
 3281 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
 3282 {
 3283         union if_snd_tag_query_params params = { };
 3284         struct m_snd_tag *mst;
 3285         struct ifnet *ifp;
 3286         int error;
 3287 
 3288         mst = inp->inp_snd_tag;
 3289         if (mst == NULL)
 3290                 return (EINVAL);
 3291 
 3292         ifp = mst->ifp;
 3293         if (ifp == NULL)
 3294                 return (EINVAL);
 3295 
 3296         if (ifp->if_snd_tag_query == NULL)
 3297                 return (EOPNOTSUPP);
 3298 
 3299         error = ifp->if_snd_tag_query(mst, &params);
 3300         if (error == 0 &&  p_txqueue_level != NULL)
 3301                 *p_txqueue_level = params.rate_limit.queue_level;
 3302         return (error);
 3303 }
 3304 
 3305 /*
 3306  * Allocate a new TX rate limit send tag from the network interface
 3307  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
 3308  */
 3309 int
 3310 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
 3311     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
 3312 {
 3313         union if_snd_tag_alloc_params params = {
 3314                 .rate_limit.hdr.type = (max_pacing_rate == -1U) ?
 3315                     IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
 3316                 .rate_limit.hdr.flowid = flowid,
 3317                 .rate_limit.hdr.flowtype = flowtype,
 3318                 .rate_limit.max_rate = max_pacing_rate,
 3319         };
 3320         int error;
 3321 
 3322         INP_WLOCK_ASSERT(inp);
 3323 
 3324         /*
 3325          * If there is already a send tag, or the INP is being torn
 3326          * down, allocating a new send tag is not allowed. Else send
 3327          * tags may leak.
 3328          */
 3329         if (inp->inp_snd_tag != NULL || (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0)
 3330                 return (EINVAL);
 3331 
 3332         if (ifp->if_snd_tag_alloc == NULL) {
 3333                 error = EOPNOTSUPP;
 3334         } else {
 3335                 error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
 3336 
 3337                 /*
 3338                  * At success increment the refcount on
 3339                  * the send tag's network interface:
 3340                  */
 3341                 if (error == 0)
 3342                         if_ref(inp->inp_snd_tag->ifp);
 3343         }
 3344         return (error);
 3345 }
 3346 
 3347 /*
 3348  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
 3349  * if any:
 3350  */
 3351 void
 3352 in_pcbdetach_txrtlmt(struct inpcb *inp)
 3353 {
 3354         struct m_snd_tag *mst;
 3355         struct ifnet *ifp;
 3356 
 3357         INP_WLOCK_ASSERT(inp);
 3358 
 3359         mst = inp->inp_snd_tag;
 3360         inp->inp_snd_tag = NULL;
 3361 
 3362         if (mst == NULL)
 3363                 return;
 3364 
 3365         ifp = mst->ifp;
 3366         if (ifp == NULL)
 3367                 return;
 3368 
 3369         /*
 3370          * If the device was detached while we still had reference(s)
 3371          * on the ifp, we assume if_snd_tag_free() was replaced with
 3372          * stubs.
 3373          */
 3374         ifp->if_snd_tag_free(mst);
 3375 
 3376         /* release reference count on network interface */
 3377         if_rele(ifp);
 3378 }
 3379 
 3380 /*
 3381  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
 3382  * is set in the fast path and will attach/detach/modify the TX rate
 3383  * limit send tag based on the socket's so_max_pacing_rate value.
 3384  */
 3385 void
 3386 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
 3387 {
 3388         struct socket *socket;
 3389         uint32_t max_pacing_rate;
 3390         bool did_upgrade;
 3391         int error;
 3392 
 3393         if (inp == NULL)
 3394                 return;
 3395 
 3396         socket = inp->inp_socket;
 3397         if (socket == NULL)
 3398                 return;
 3399 
 3400         if (!INP_WLOCKED(inp)) {
 3401                 /*
 3402                  * NOTE: If the write locking fails, we need to bail
 3403                  * out and use the non-ratelimited ring for the
 3404                  * transmit until there is a new chance to get the
 3405                  * write lock.
 3406                  */
 3407                 if (!INP_TRY_UPGRADE(inp))
 3408                         return;
 3409                 did_upgrade = 1;
 3410         } else {
 3411                 did_upgrade = 0;
 3412         }
 3413 
 3414         /*
 3415          * NOTE: The so_max_pacing_rate value is read unlocked,
 3416          * because atomic updates are not required since the variable
 3417          * is checked at every mbuf we send. It is assumed that the
 3418          * variable read itself will be atomic.
 3419          */
 3420         max_pacing_rate = socket->so_max_pacing_rate;
 3421 
 3422         /*
 3423          * NOTE: When attaching to a network interface a reference is
 3424          * made to ensure the network interface doesn't go away until
 3425          * all ratelimit connections are gone. The network interface
 3426          * pointers compared below represent valid network interfaces,
 3427          * except when comparing towards NULL.
 3428          */
 3429         if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
 3430                 error = 0;
 3431         } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
 3432                 if (inp->inp_snd_tag != NULL)
 3433                         in_pcbdetach_txrtlmt(inp);
 3434                 error = 0;
 3435         } else if (inp->inp_snd_tag == NULL) {
 3436                 /*
 3437                  * In order to utilize packet pacing with RSS, we need
 3438                  * to wait until there is a valid RSS hash before we
 3439                  * can proceed:
 3440                  */
 3441                 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
 3442                         error = EAGAIN;
 3443                 } else {
 3444                         error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
 3445                             mb->m_pkthdr.flowid, max_pacing_rate);
 3446                 }
 3447         } else {
 3448                 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
 3449         }
 3450         if (error == 0 || error == EOPNOTSUPP)
 3451                 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 3452         if (did_upgrade)
 3453                 INP_DOWNGRADE(inp);
 3454 }
 3455 
 3456 /*
 3457  * Track route changes for TX rate limiting.
 3458  */
 3459 void
 3460 in_pcboutput_eagain(struct inpcb *inp)
 3461 {
 3462         bool did_upgrade;
 3463 
 3464         if (inp == NULL)
 3465                 return;
 3466 
 3467         if (inp->inp_snd_tag == NULL)
 3468                 return;
 3469 
 3470         if (!INP_WLOCKED(inp)) {
 3471                 /*
 3472                  * NOTE: If the write locking fails, we need to bail
 3473                  * out and use the non-ratelimited ring for the
 3474                  * transmit until there is a new chance to get the
 3475                  * write lock.
 3476                  */
 3477                 if (!INP_TRY_UPGRADE(inp))
 3478                         return;
 3479                 did_upgrade = 1;
 3480         } else {
 3481                 did_upgrade = 0;
 3482         }
 3483 
 3484         /* detach rate limiting */
 3485         in_pcbdetach_txrtlmt(inp);
 3486 
 3487         /* make sure new mbuf send tag allocation is made */
 3488         inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 3489 
 3490         if (did_upgrade)
 3491                 INP_DOWNGRADE(inp);
 3492 }
 3493 #endif /* RATELIMIT */

Cache object: 78a1d13de9e166f6da6adf927e07bb27


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.