The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/in_pcb.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1982, 1986, 1991, 1993, 1995
    5  *      The Regents of the University of California.
    6  * Copyright (c) 2007-2009 Robert N. M. Watson
    7  * Copyright (c) 2010-2011 Juniper Networks, Inc.
    8  * All rights reserved.
    9  *
   10  * Portions of this software were developed by Robert N. M. Watson under
   11  * contract to Juniper Networks, Inc.
   12  *
   13  * Redistribution and use in source and binary forms, with or without
   14  * modification, are permitted provided that the following conditions
   15  * are met:
   16  * 1. Redistributions of source code must retain the above copyright
   17  *    notice, this list of conditions and the following disclaimer.
   18  * 2. Redistributions in binary form must reproduce the above copyright
   19  *    notice, this list of conditions and the following disclaimer in the
   20  *    documentation and/or other materials provided with the distribution.
   21  * 3. Neither the name of the University nor the names of its contributors
   22  *    may be used to endorse or promote products derived from this software
   23  *    without specific prior written permission.
   24  *
   25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   35  * SUCH DAMAGE.
   36  *
   37  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
   38  */
   39 
   40 #include <sys/cdefs.h>
   41 __FBSDID("$FreeBSD$");
   42 
   43 #include "opt_ddb.h"
   44 #include "opt_ipsec.h"
   45 #include "opt_inet.h"
   46 #include "opt_inet6.h"
   47 #include "opt_ratelimit.h"
   48 #include "opt_route.h"
   49 #include "opt_rss.h"
   50 
   51 #include <sys/param.h>
   52 #include <sys/hash.h>
   53 #include <sys/systm.h>
   54 #include <sys/libkern.h>
   55 #include <sys/lock.h>
   56 #include <sys/malloc.h>
   57 #include <sys/mbuf.h>
   58 #include <sys/eventhandler.h>
   59 #include <sys/domain.h>
   60 #include <sys/protosw.h>
   61 #include <sys/smp.h>
   62 #include <sys/socket.h>
   63 #include <sys/socketvar.h>
   64 #include <sys/sockio.h>
   65 #include <sys/priv.h>
   66 #include <sys/proc.h>
   67 #include <sys/refcount.h>
   68 #include <sys/jail.h>
   69 #include <sys/kernel.h>
   70 #include <sys/sysctl.h>
   71 
   72 #ifdef DDB
   73 #include <ddb/ddb.h>
   74 #endif
   75 
   76 #include <vm/uma.h>
   77 #include <vm/vm.h>
   78 
   79 #include <net/if.h>
   80 #include <net/if_var.h>
   81 #include <net/if_types.h>
   82 #include <net/if_llatbl.h>
   83 #include <net/route.h>
   84 #include <net/rss_config.h>
   85 #include <net/vnet.h>
   86 
   87 #if defined(INET) || defined(INET6)
   88 #include <netinet/in.h>
   89 #include <netinet/in_pcb.h>
   90 #include <netinet/in_pcb_var.h>
   91 #include <netinet/tcp.h>
   92 #ifdef INET
   93 #include <netinet/in_var.h>
   94 #include <netinet/in_fib.h>
   95 #endif
   96 #include <netinet/ip_var.h>
   97 #ifdef INET6
   98 #include <netinet/ip6.h>
   99 #include <netinet6/in6_pcb.h>
  100 #include <netinet6/in6_var.h>
  101 #include <netinet6/ip6_var.h>
  102 #endif /* INET6 */
  103 #include <net/route/nhop.h>
  104 #endif
  105 
  106 #include <netipsec/ipsec_support.h>
  107 
  108 #include <security/mac/mac_framework.h>
  109 
  110 #define INPCBLBGROUP_SIZMIN     8
  111 #define INPCBLBGROUP_SIZMAX     256
  112 #define INP_FREED       0x00000200      /* See in_pcb.h. */
  113 
  114 /*
  115  * These configure the range of local port addresses assigned to
  116  * "unspecified" outgoing connections/packets/whatever.
  117  */
  118 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;    /* 1023 */
  119 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;    /* 600 */
  120 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;     /* 10000 */
  121 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;       /* 65535 */
  122 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;      /* 49152 */
  123 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;        /* 65535 */
  124 
  125 /*
  126  * Reserved ports accessible only to root. There are significant
  127  * security considerations that must be accounted for when changing these,
  128  * but the security benefits can be great. Please be careful.
  129  */
  130 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;    /* 1023 */
  131 VNET_DEFINE(int, ipport_reservedlow);
  132 
  133 /* Enable random ephemeral port allocation by default. */
  134 VNET_DEFINE(int, ipport_randomized) = 1;
  135 
  136 #ifdef INET
  137 static struct inpcb     *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
  138                             struct in_addr faddr, u_int fport_arg,
  139                             struct in_addr laddr, u_int lport_arg,
  140                             int lookupflags, struct ifnet *ifp,
  141                             uint8_t numa_domain);
  142 
  143 #define RANGECHK(var, min, max) \
  144         if ((var) < (min)) { (var) = (min); } \
  145         else if ((var) > (max)) { (var) = (max); }
  146 
  147 static int
  148 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
  149 {
  150         int error;
  151 
  152         error = sysctl_handle_int(oidp, arg1, arg2, req);
  153         if (error == 0) {
  154                 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
  155                 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
  156                 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
  157                 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
  158                 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
  159                 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
  160         }
  161         return (error);
  162 }
  163 
  164 #undef RANGECHK
  165 
  166 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
  167     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  168     "IP Ports");
  169 
  170 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
  171     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  172     &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
  173     "");
  174 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
  175     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  176     &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
  177     "");
  178 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
  179     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  180     &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
  181     "");
  182 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
  183     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  184     &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
  185     "");
  186 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
  187     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  188     &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
  189     "");
  190 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
  191     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
  192     &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
  193     "");
  194 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
  195         CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
  196         &VNET_NAME(ipport_reservedhigh), 0, "");
  197 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
  198         CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
  199 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
  200         CTLFLAG_VNET | CTLFLAG_RW,
  201         &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
  202 
  203 #ifdef RATELIMIT
  204 counter_u64_t rate_limit_new;
  205 counter_u64_t rate_limit_chg;
  206 counter_u64_t rate_limit_active;
  207 counter_u64_t rate_limit_alloc_fail;
  208 counter_u64_t rate_limit_set_ok;
  209 
  210 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
  211     "IP Rate Limiting");
  212 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
  213     &rate_limit_active, "Active rate limited connections");
  214 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
  215    &rate_limit_alloc_fail, "Rate limited connection failures");
  216 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
  217    &rate_limit_set_ok, "Rate limited setting succeeded");
  218 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
  219    &rate_limit_new, "Total Rate limit new attempts");
  220 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
  221    &rate_limit_chg, "Total Rate limited change attempts");
  222 
  223 #endif /* RATELIMIT */
  224 
  225 #endif /* INET */
  226 
  227 VNET_DEFINE(uint32_t, in_pcbhashseed);
  228 static void
  229 in_pcbhashseed_init(void)
  230 {
  231 
  232         V_in_pcbhashseed = arc4random();
  233 }
  234 VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
  235     in_pcbhashseed_init, 0);
  236 
  237 static void in_pcbremhash(struct inpcb *);
  238 
  239 /*
  240  * in_pcb.c: manage the Protocol Control Blocks.
  241  *
  242  * NOTE: It is assumed that most of these functions will be called with
  243  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
  244  * functions often modify hash chains or addresses in pcbs.
  245  */
  246 
  247 static struct inpcblbgroup *
  248 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred,
  249     u_char vflag, uint16_t port, const union in_dependaddr *addr, int size,
  250     uint8_t numa_domain)
  251 {
  252         struct inpcblbgroup *grp;
  253         size_t bytes;
  254 
  255         bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
  256         grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
  257         if (grp == NULL)
  258                 return (NULL);
  259         grp->il_cred = crhold(cred);
  260         grp->il_vflag = vflag;
  261         grp->il_lport = port;
  262         grp->il_numa_domain = numa_domain;
  263         grp->il_dependladdr = *addr;
  264         grp->il_inpsiz = size;
  265         CK_LIST_INSERT_HEAD(hdr, grp, il_list);
  266         return (grp);
  267 }
  268 
  269 static void
  270 in_pcblbgroup_free_deferred(epoch_context_t ctx)
  271 {
  272         struct inpcblbgroup *grp;
  273 
  274         grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
  275         crfree(grp->il_cred);
  276         free(grp, M_PCB);
  277 }
  278 
  279 static void
  280 in_pcblbgroup_free(struct inpcblbgroup *grp)
  281 {
  282 
  283         CK_LIST_REMOVE(grp, il_list);
  284         NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
  285 }
  286 
  287 static struct inpcblbgroup *
  288 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
  289     struct inpcblbgroup *old_grp, int size)
  290 {
  291         struct inpcblbgroup *grp;
  292         int i;
  293 
  294         grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag,
  295             old_grp->il_lport, &old_grp->il_dependladdr, size,
  296             old_grp->il_numa_domain);
  297         if (grp == NULL)
  298                 return (NULL);
  299 
  300         KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
  301             ("invalid new local group size %d and old local group count %d",
  302              grp->il_inpsiz, old_grp->il_inpcnt));
  303 
  304         for (i = 0; i < old_grp->il_inpcnt; ++i)
  305                 grp->il_inp[i] = old_grp->il_inp[i];
  306         grp->il_inpcnt = old_grp->il_inpcnt;
  307         in_pcblbgroup_free(old_grp);
  308         return (grp);
  309 }
  310 
  311 /*
  312  * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
  313  * and shrink group if possible.
  314  */
  315 static void
  316 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
  317     int i)
  318 {
  319         struct inpcblbgroup *grp, *new_grp;
  320 
  321         grp = *grpp;
  322         for (; i + 1 < grp->il_inpcnt; ++i)
  323                 grp->il_inp[i] = grp->il_inp[i + 1];
  324         grp->il_inpcnt--;
  325 
  326         if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
  327             grp->il_inpcnt <= grp->il_inpsiz / 4) {
  328                 /* Shrink this group. */
  329                 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
  330                 if (new_grp != NULL)
  331                         *grpp = new_grp;
  332         }
  333 }
  334 
  335 /*
  336  * Add PCB to load balance group for SO_REUSEPORT_LB option.
  337  */
  338 static int
  339 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
  340 {
  341         const static struct timeval interval = { 60, 0 };
  342         static struct timeval lastprint;
  343         struct inpcbinfo *pcbinfo;
  344         struct inpcblbgrouphead *hdr;
  345         struct inpcblbgroup *grp;
  346         uint32_t idx;
  347 
  348         pcbinfo = inp->inp_pcbinfo;
  349 
  350         INP_WLOCK_ASSERT(inp);
  351         INP_HASH_WLOCK_ASSERT(pcbinfo);
  352 
  353 #ifdef INET6
  354         /*
  355          * Don't allow IPv4 mapped INET6 wild socket.
  356          */
  357         if ((inp->inp_vflag & INP_IPV4) &&
  358             inp->inp_laddr.s_addr == INADDR_ANY &&
  359             INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
  360                 return (0);
  361         }
  362 #endif
  363 
  364         idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
  365         hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
  366         CK_LIST_FOREACH(grp, hdr, il_list) {
  367                 if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
  368                     grp->il_vflag == inp->inp_vflag &&
  369                     grp->il_lport == inp->inp_lport &&
  370                     grp->il_numa_domain == numa_domain &&
  371                     memcmp(&grp->il_dependladdr,
  372                     &inp->inp_inc.inc_ie.ie_dependladdr,
  373                     sizeof(grp->il_dependladdr)) == 0) {
  374                         break;
  375                 }
  376         }
  377         if (grp == NULL) {
  378                 /* Create new load balance group. */
  379                 grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag,
  380                     inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
  381                     INPCBLBGROUP_SIZMIN, numa_domain);
  382                 if (grp == NULL)
  383                         return (ENOBUFS);
  384         } else if (grp->il_inpcnt == grp->il_inpsiz) {
  385                 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
  386                         if (ratecheck(&lastprint, &interval))
  387                                 printf("lb group port %d, limit reached\n",
  388                                     ntohs(grp->il_lport));
  389                         return (0);
  390                 }
  391 
  392                 /* Expand this local group. */
  393                 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
  394                 if (grp == NULL)
  395                         return (ENOBUFS);
  396         }
  397 
  398         KASSERT(grp->il_inpcnt < grp->il_inpsiz,
  399             ("invalid local group size %d and count %d", grp->il_inpsiz,
  400             grp->il_inpcnt));
  401 
  402         grp->il_inp[grp->il_inpcnt] = inp;
  403         grp->il_inpcnt++;
  404         return (0);
  405 }
  406 
  407 /*
  408  * Remove PCB from load balance group.
  409  */
  410 static void
  411 in_pcbremlbgrouphash(struct inpcb *inp)
  412 {
  413         struct inpcbinfo *pcbinfo;
  414         struct inpcblbgrouphead *hdr;
  415         struct inpcblbgroup *grp;
  416         int i;
  417 
  418         pcbinfo = inp->inp_pcbinfo;
  419 
  420         INP_WLOCK_ASSERT(inp);
  421         INP_HASH_WLOCK_ASSERT(pcbinfo);
  422 
  423         hdr = &pcbinfo->ipi_lbgrouphashbase[
  424             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
  425         CK_LIST_FOREACH(grp, hdr, il_list) {
  426                 for (i = 0; i < grp->il_inpcnt; ++i) {
  427                         if (grp->il_inp[i] != inp)
  428                                 continue;
  429 
  430                         if (grp->il_inpcnt == 1) {
  431                                 /* We are the last, free this local group. */
  432                                 in_pcblbgroup_free(grp);
  433                         } else {
  434                                 /* Pull up inpcbs, shrink group if possible. */
  435                                 in_pcblbgroup_reorder(hdr, &grp, i);
  436                         }
  437                         return;
  438                 }
  439         }
  440 }
  441 
  442 int
  443 in_pcblbgroup_numa(struct inpcb *inp, int arg)
  444 {
  445         struct inpcbinfo *pcbinfo;
  446         struct inpcblbgrouphead *hdr;
  447         struct inpcblbgroup *grp;
  448         int err, i;
  449         uint8_t numa_domain;
  450 
  451         switch (arg) {
  452         case TCP_REUSPORT_LB_NUMA_NODOM:
  453                 numa_domain = M_NODOM;
  454                 break;
  455         case TCP_REUSPORT_LB_NUMA_CURDOM:
  456                 numa_domain = PCPU_GET(domain);
  457                 break;
  458         default:
  459                 if (arg < 0 || arg >= vm_ndomains)
  460                         return (EINVAL);
  461                 numa_domain = arg;
  462         }
  463 
  464         err = 0;
  465         pcbinfo = inp->inp_pcbinfo;
  466         INP_WLOCK_ASSERT(inp);
  467         INP_HASH_WLOCK(pcbinfo);
  468         hdr = &pcbinfo->ipi_lbgrouphashbase[
  469             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
  470         CK_LIST_FOREACH(grp, hdr, il_list) {
  471                 for (i = 0; i < grp->il_inpcnt; ++i) {
  472                         if (grp->il_inp[i] != inp)
  473                                 continue;
  474 
  475                         if (grp->il_numa_domain == numa_domain) {
  476                                 goto abort_with_hash_wlock;
  477                         }
  478 
  479                         /* Remove it from the old group. */
  480                         in_pcbremlbgrouphash(inp);
  481 
  482                         /* Add it to the new group based on numa domain. */
  483                         in_pcbinslbgrouphash(inp, numa_domain);
  484                         goto abort_with_hash_wlock;
  485                 }
  486         }
  487         err = ENOENT;
  488 abort_with_hash_wlock:
  489         INP_HASH_WUNLOCK(pcbinfo);
  490         return (err);
  491 }
  492 
  493 /* Make sure it is safe to use hashinit(9) on CK_LIST. */
  494 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));
  495 
  496 /*
  497  * Initialize an inpcbinfo - a per-VNET instance of connections db.
  498  */
  499 void
  500 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
  501     u_int hash_nelements, u_int porthash_nelements)
  502 {
  503 
  504         mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
  505         mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
  506             NULL, MTX_DEF);
  507 #ifdef VIMAGE
  508         pcbinfo->ipi_vnet = curvnet;
  509 #endif
  510         CK_LIST_INIT(&pcbinfo->ipi_listhead);
  511         pcbinfo->ipi_count = 0;
  512         pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
  513             &pcbinfo->ipi_hashmask);
  514         porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
  515         pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
  516             &pcbinfo->ipi_porthashmask);
  517         pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
  518             &pcbinfo->ipi_lbgrouphashmask);
  519         pcbinfo->ipi_zone = pcbstor->ips_zone;
  520         pcbinfo->ipi_portzone = pcbstor->ips_portzone;
  521         pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
  522 }
  523 
  524 /*
  525  * Destroy an inpcbinfo.
  526  */
  527 void
  528 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
  529 {
  530 
  531         KASSERT(pcbinfo->ipi_count == 0,
  532             ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));
  533 
  534         hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
  535         hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
  536             pcbinfo->ipi_porthashmask);
  537         hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
  538             pcbinfo->ipi_lbgrouphashmask);
  539         mtx_destroy(&pcbinfo->ipi_hash_lock);
  540         mtx_destroy(&pcbinfo->ipi_lock);
  541 }
  542 
  543 /*
  544  * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
  545  */
  546 static void inpcb_dtor(void *, int, void *);
  547 static void inpcb_fini(void *, int);
  548 void
  549 in_pcbstorage_init(void *arg)
  550 {
  551         struct inpcbstorage *pcbstor = arg;
  552 
  553         pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
  554             pcbstor->ips_size, NULL, inpcb_dtor, pcbstor->ips_pcbinit,
  555             inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
  556         pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
  557             sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
  558         uma_zone_set_smr(pcbstor->ips_portzone,
  559             uma_zone_get_smr(pcbstor->ips_zone));
  560 }
  561 
  562 /*
  563  * Destroy a pcbstorage - used by unloadable protocols.
  564  */
  565 void
  566 in_pcbstorage_destroy(void *arg)
  567 {
  568         struct inpcbstorage *pcbstor = arg;
  569 
  570         uma_zdestroy(pcbstor->ips_zone);
  571         uma_zdestroy(pcbstor->ips_portzone);
  572 }
  573 
  574 /*
  575  * Allocate a PCB and associate it with the socket.
  576  * On success return with the PCB locked.
  577  */
  578 int
  579 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
  580 {
  581         struct inpcb *inp;
  582 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
  583         int error;
  584 #endif
  585 
  586         inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
  587         if (inp == NULL)
  588                 return (ENOBUFS);
  589         bzero(&inp->inp_start_zero, inp_zero_size);
  590 #ifdef NUMA
  591         inp->inp_numa_domain = M_NODOM;
  592 #endif
  593         inp->inp_pcbinfo = pcbinfo;
  594         inp->inp_socket = so;
  595         inp->inp_cred = crhold(so->so_cred);
  596         inp->inp_inc.inc_fibnum = so->so_fibnum;
  597 #ifdef MAC
  598         error = mac_inpcb_init(inp, M_NOWAIT);
  599         if (error != 0)
  600                 goto out;
  601         mac_inpcb_create(so, inp);
  602 #endif
  603 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
  604         error = ipsec_init_pcbpolicy(inp);
  605         if (error != 0) {
  606 #ifdef MAC
  607                 mac_inpcb_destroy(inp);
  608 #endif
  609                 goto out;
  610         }
  611 #endif /*IPSEC*/
  612 #ifdef INET6
  613         if (INP_SOCKAF(so) == AF_INET6) {
  614                 inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
  615                 if (V_ip6_v6only)
  616                         inp->inp_flags |= IN6P_IPV6_V6ONLY;
  617 #ifdef INET
  618                 else
  619                         inp->inp_vflag |= INP_IPV4;
  620 #endif
  621                 if (V_ip6_auto_flowlabel)
  622                         inp->inp_flags |= IN6P_AUTOFLOWLABEL;
  623                 inp->in6p_hops = -1;    /* use kernel default */
  624         }
  625 #endif
  626 #if defined(INET) && defined(INET6)
  627         else
  628 #endif
  629 #ifdef INET
  630                 inp->inp_vflag |= INP_IPV4;
  631 #endif
  632         /*
  633          * Routes in inpcb's can cache L2 as well; they are guaranteed
  634          * to be cleaned up.
  635          */
  636         inp->inp_route.ro_flags = RT_LLE_CACHE;
  637         refcount_init(&inp->inp_refcount, 1);   /* Reference from socket. */
  638         INP_WLOCK(inp);
  639         INP_INFO_WLOCK(pcbinfo);
  640         pcbinfo->ipi_count++;
  641         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
  642         CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
  643         INP_INFO_WUNLOCK(pcbinfo);
  644         so->so_pcb = inp;
  645 
  646         return (0);
  647 
  648 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
  649 out:
  650         uma_zfree_smr(pcbinfo->ipi_zone, inp);
  651         return (error);
  652 #endif
  653 }
  654 
  655 #ifdef INET
  656 int
  657 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
  658 {
  659         int anonport, error;
  660 
  661         KASSERT(nam == NULL || nam->sa_family == AF_INET,
  662             ("%s: invalid address family for %p", __func__, nam));
  663         KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in),
  664             ("%s: invalid address length for %p", __func__, nam));
  665         INP_WLOCK_ASSERT(inp);
  666         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
  667 
  668         if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
  669                 return (EINVAL);
  670         anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
  671         error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
  672             &inp->inp_lport, cred);
  673         if (error)
  674                 return (error);
  675         if (in_pcbinshash(inp) != 0) {
  676                 inp->inp_laddr.s_addr = INADDR_ANY;
  677                 inp->inp_lport = 0;
  678                 return (EAGAIN);
  679         }
  680         if (anonport)
  681                 inp->inp_flags |= INP_ANONPORT;
  682         return (0);
  683 }
  684 #endif
  685 
  686 #if defined(INET) || defined(INET6)
  687 /*
  688  * Assign a local port like in_pcb_lport(), but also used with connect()
  689  * and a foreign address and port.  If fsa is non-NULL, choose a local port
  690  * that is unused with those, otherwise one that is completely unused.
  691  * lsa can be NULL for IPv6.
  692  */
  693 int
  694 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
  695     struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
  696 {
  697         struct inpcbinfo *pcbinfo;
  698         struct inpcb *tmpinp;
  699         unsigned short *lastport;
  700         int count, error;
  701         u_short aux, first, last, lport;
  702 #ifdef INET
  703         struct in_addr laddr, faddr;
  704 #endif
  705 #ifdef INET6
  706         struct in6_addr *laddr6, *faddr6;
  707 #endif
  708 
  709         pcbinfo = inp->inp_pcbinfo;
  710 
  711         /*
  712          * Because no actual state changes occur here, a global write lock on
  713          * the pcbinfo isn't required.
  714          */
  715         INP_LOCK_ASSERT(inp);
  716         INP_HASH_LOCK_ASSERT(pcbinfo);
  717 
  718         if (inp->inp_flags & INP_HIGHPORT) {
  719                 first = V_ipport_hifirstauto;   /* sysctl */
  720                 last  = V_ipport_hilastauto;
  721                 lastport = &pcbinfo->ipi_lasthi;
  722         } else if (inp->inp_flags & INP_LOWPORT) {
  723                 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
  724                 if (error)
  725                         return (error);
  726                 first = V_ipport_lowfirstauto;  /* 1023 */
  727                 last  = V_ipport_lowlastauto;   /* 600 */
  728                 lastport = &pcbinfo->ipi_lastlow;
  729         } else {
  730                 first = V_ipport_firstauto;     /* sysctl */
  731                 last  = V_ipport_lastauto;
  732                 lastport = &pcbinfo->ipi_lastport;
  733         }
  734 
  735         /*
  736          * Instead of having two loops further down counting up or down
  737          * make sure that first is always <= last and go with only one
  738          * code path implementing all logic.
  739          */
  740         if (first > last) {
  741                 aux = first;
  742                 first = last;
  743                 last = aux;
  744         }
  745 
  746 #ifdef INET
  747         laddr.s_addr = INADDR_ANY;      /* used by INET6+INET below too */
  748         if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
  749                 if (lsa != NULL)
  750                         laddr = ((struct sockaddr_in *)lsa)->sin_addr;
  751                 if (fsa != NULL)
  752                         faddr = ((struct sockaddr_in *)fsa)->sin_addr;
  753         }
  754 #endif
  755 #ifdef INET6
  756         laddr6 = NULL;
  757         if ((inp->inp_vflag & INP_IPV6) != 0) {
  758                 if (lsa != NULL)
  759                         laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
  760                 if (fsa != NULL)
  761                         faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
  762         }
  763 #endif
  764 
  765         tmpinp = NULL;
  766         lport = *lportp;
  767 
  768         if (V_ipport_randomized)
  769                 *lastport = first + (arc4random() % (last - first));
  770 
  771         count = last - first;
  772 
  773         do {
  774                 if (count-- < 0)        /* completely used? */
  775                         return (EADDRNOTAVAIL);
  776                 ++*lastport;
  777                 if (*lastport < first || *lastport > last)
  778                         *lastport = first;
  779                 lport = htons(*lastport);
  780 
  781                 if (fsa != NULL) {
  782 #ifdef INET
  783                         if (lsa->sa_family == AF_INET) {
  784                                 tmpinp = in_pcblookup_hash_locked(pcbinfo,
  785                                     faddr, fport, laddr, lport, lookupflags,
  786                                     NULL, M_NODOM);
  787                         }
  788 #endif
  789 #ifdef INET6
  790                         if (lsa->sa_family == AF_INET6) {
  791                                 tmpinp = in6_pcblookup_hash_locked(pcbinfo,
  792                                     faddr6, fport, laddr6, lport, lookupflags,
  793                                     NULL, M_NODOM);
  794                         }
  795 #endif
  796                 } else {
  797 #ifdef INET6
  798                         if ((inp->inp_vflag & INP_IPV6) != 0) {
  799                                 tmpinp = in6_pcblookup_local(pcbinfo,
  800                                     &inp->in6p_laddr, lport, lookupflags, cred);
  801 #ifdef INET
  802                                 if (tmpinp == NULL &&
  803                                     (inp->inp_vflag & INP_IPV4))
  804                                         tmpinp = in_pcblookup_local(pcbinfo,
  805                                             laddr, lport, lookupflags, cred);
  806 #endif
  807                         }
  808 #endif
  809 #if defined(INET) && defined(INET6)
  810                         else
  811 #endif
  812 #ifdef INET
  813                                 tmpinp = in_pcblookup_local(pcbinfo, laddr,
  814                                     lport, lookupflags, cred);
  815 #endif
  816                 }
  817         } while (tmpinp != NULL);
  818 
  819         *lportp = lport;
  820 
  821         return (0);
  822 }
  823 
  824 /*
  825  * Select a local port (number) to use.
  826  */
  827 int
  828 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
  829     struct ucred *cred, int lookupflags)
  830 {
  831         struct sockaddr_in laddr;
  832 
  833         if (laddrp) {
  834                 bzero(&laddr, sizeof(laddr));
  835                 laddr.sin_family = AF_INET;
  836                 laddr.sin_addr = *laddrp;
  837         }
  838         return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
  839             NULL, lportp, NULL, 0, cred, lookupflags));
  840 }
  841 
  842 /*
  843  * Return cached socket options.
  844  */
  845 int
  846 inp_so_options(const struct inpcb *inp)
  847 {
  848         int so_options;
  849 
  850         so_options = 0;
  851 
  852         if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
  853                 so_options |= SO_REUSEPORT_LB;
  854         if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
  855                 so_options |= SO_REUSEPORT;
  856         if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
  857                 so_options |= SO_REUSEADDR;
  858         return (so_options);
  859 }
  860 #endif /* INET || INET6 */
  861 
  862 /*
  863  * Check if a new BINDMULTI socket is allowed to be created.
  864  *
  865  * ni points to the new inp.
  866  * oi points to the existing inp.
  867  *
  868  * This checks whether the existing inp also has BINDMULTI and
  869  * whether the credentials match.
  870  */
  871 int
  872 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
  873 {
  874         /* Check permissions match */
  875         if ((ni->inp_flags2 & INP_BINDMULTI) &&
  876             (ni->inp_cred->cr_uid !=
  877             oi->inp_cred->cr_uid))
  878                 return (0);
  879 
  880         /* Check the existing inp has BINDMULTI set */
  881         if ((ni->inp_flags2 & INP_BINDMULTI) &&
  882             ((oi->inp_flags2 & INP_BINDMULTI) == 0))
  883                 return (0);
  884 
  885         /*
  886          * We're okay - either INP_BINDMULTI isn't set on ni, or
  887          * it is and it matches the checks.
  888          */
  889         return (1);
  890 }
  891 
  892 #ifdef INET
  893 /*
  894  * Set up a bind operation on a PCB, performing port allocation
  895  * as required, but do not actually modify the PCB. Callers can
  896  * either complete the bind by setting inp_laddr/inp_lport and
  897  * calling in_pcbinshash(), or they can just use the resulting
  898  * port and address to authorise the sending of a once-off packet.
  899  *
  900  * On error, the values of *laddrp and *lportp are not changed.
       *
       * inp     PCB being bound; its flags and credential feed the
       *         conflict checks.
       * nam     Requested address as a sockaddr_in, or NULL to keep the
       *         address passed in *laddrp (subject to prison_local_ip4()).
       *         For a specific unicast address, sin_port and sin_zero of
       *         *nam are cleared as a side effect.
       * laddrp  In: current local address, which must be INADDR_ANY when
       *         nam is given.  Out: address to bind to.
       * lportp  In: current local port (0 if none).  Out: port to bind to,
       *         in network byte order.
       * cred    Credential used for the jail, reserved-port and lookup
       *         checks.
       *
       * Returns 0 on success.  Errors produced here are EINVAL (address
       * already set, or an attempt to change an existing port),
       * EADDRNOTAVAIL (address is not local and INP_BINDANY is clear),
       * EACCES (reserved port without PRIV_NETINET_RESERVEDPORT) and
       * EADDRINUSE (conflict with an existing PCB); failures of
       * prison_local_ip4() and in_pcb_lport() are passed through.
  901  */
  902 int
  903 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
  904     u_short *lportp, struct ucred *cred)
  905 {
  906         struct socket *so = inp->inp_socket;
  907         struct sockaddr_in *sin;
  908         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
  909         struct in_addr laddr;
  910         u_short lport = 0;
  911         int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
  912         int error;
  913 
  914         /*
  915          * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
  916          * so that we don't have to add to the (already messy) code below.
  917          */
  918         int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
  919 
  920         /*
  921          * No state changes, so read locks are sufficient here.
  922          */
  923         INP_LOCK_ASSERT(inp);
  924         INP_HASH_LOCK_ASSERT(pcbinfo);
  925 
  926         laddr.s_addr = *laddrp;
  927         if (nam != NULL && laddr.s_addr != INADDR_ANY)
  928                 return (EINVAL);
              /* With no reuse option set, pass INPLOOKUP_WILDCARD to the lookups below. */
  929         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
  930                 lookupflags = INPLOOKUP_WILDCARD;
  931         if (nam == NULL) {
  932                 if ((error = prison_local_ip4(cred, &laddr)) != 0)
  933                         return (error);
  934         } else {
  935                 sin = (struct sockaddr_in *)nam;
  936                 KASSERT(sin->sin_family == AF_INET,
  937                     ("%s: invalid family for address %p", __func__, sin));
  938                 KASSERT(sin->sin_len == sizeof(*sin),
  939                     ("%s: invalid length for address %p", __func__, sin));
  940 
  941                 error = prison_local_ip4(cred, &sin->sin_addr);
  942                 if (error)
  943                         return (error);
  944                 if (sin->sin_port != *lportp) {
  945                         /* Don't allow the port to change. */
  946                         if (*lportp != 0)
  947                                 return (EINVAL);
  948                         lport = sin->sin_port;
  949                 }
  950                 /* NB: lport is left as 0 if the port isn't being changed. */
  951                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
  952                         /*
  953                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
  954                          * allow complete duplication of binding if
  955                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
  956                          * and a multicast address is bound on both
  957                          * new and duplicated sockets.
  958                          */
  959                         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
  960                                 reuseport = SO_REUSEADDR|SO_REUSEPORT;
  961                         /*
  962                          * XXX: How to deal with SO_REUSEPORT_LB here?
  963                          * Treat same as SO_REUSEPORT for now.
  964                          */
  965                         if ((so->so_options &
  966                             (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
  967                                 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
  968                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
  969                         sin->sin_port = 0;              /* yech... */
  970                         bzero(&sin->sin_zero, sizeof(sin->sin_zero));
  971                         /*
  972                          * Is the address a local IP address?
  973                          * If INP_BINDANY is set, then the socket may be bound
  974                          * to any endpoint address, local or not.
  975                          */
  976                         if ((inp->inp_flags & INP_BINDANY) == 0 &&
  977                             ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
  978                                 return (EADDRNOTAVAIL);
  979                 }
  980                 laddr = sin->sin_addr;
  981                 if (lport) {
  982                         struct inpcb *t;
  983 
  984                         /* GROSS */
  985                         if (ntohs(lport) <= V_ipport_reservedhigh &&
  986                             ntohs(lport) >= V_ipport_reservedlow &&
  987                             priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
  988                                 return (EACCES);
  989                         if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
  990                             priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
  991                                 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
  992                                     lport, INPLOOKUP_WILDCARD, cred);
  993         /*
  994          * XXX
  995          * This entire block sorely needs a rewrite.
  996          */
  997                                 if (t &&
  998                                     ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
  999                                     (so->so_type != SOCK_STREAM ||
 1000                                      ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
 1001                                     (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
 1002                                      ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
 1003                                      (t->inp_flags2 & INP_REUSEPORT) ||
 1004                                      (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
 1005                                     (inp->inp_cred->cr_uid !=
 1006                                      t->inp_cred->cr_uid))
 1007                                         return (EADDRINUSE);
 1008 
 1009                                 /*
 1010                                  * If the socket is a BINDMULTI socket, then
 1011                                  * the credentials need to match and the
 1012                                  * original socket also has to have been bound
 1013                                  * with BINDMULTI.
 1014                                  */
 1015                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 1016                                         return (EADDRINUSE);
 1017                         }
 1018                         t = in_pcblookup_local(pcbinfo, sin->sin_addr,
 1019                             lport, lookupflags, cred);
 1020                         if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
 1021                             (reuseport & inp_so_options(t)) == 0 &&
 1022                             (reuseport_lb & inp_so_options(t)) == 0) {
 1023 #ifdef INET6
 1024                                 if (ntohl(sin->sin_addr.s_addr) !=
 1025                                     INADDR_ANY ||
 1026                                     ntohl(t->inp_laddr.s_addr) !=
 1027                                     INADDR_ANY ||
 1028                                     (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
 1029                                     (t->inp_vflag & INP_IPV6PROTO) == 0)
 1030 #endif
 1031                                                 return (EADDRINUSE);
 1032                                 if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 1033                                         return (EADDRINUSE);
 1034                         }
 1035                 }
 1036         }
              /*
               * A port already present in *lportp takes precedence; if there
               * is still no port at this point, have in_pcb_lport() pick one.
               */
 1037         if (*lportp != 0)
 1038                 lport = *lportp;
 1039         if (lport == 0) {
 1040                 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
 1041                 if (error != 0)
 1042                         return (error);
 1043         }
 1044         *laddrp = laddr.s_addr;
 1045         *lportp = lport;
 1046         return (0);
 1047 }
 1048 
 1049 /*
 1050  * Connect from a socket to a specified address.
 1051  * Both address and port must be specified in argument sin.
 1052  * If don't have a local address for this socket yet,
 1053  * then pick one.
 1054  */
 1055 int
 1056 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred,
 1057     bool rehash)
 1058 {
 1059         u_short lport, fport;
 1060         in_addr_t laddr, faddr;
 1061         int anonport, error;
 1062 
 1063         INP_WLOCK_ASSERT(inp);
 1064         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 1065 
 1066         lport = inp->inp_lport;
 1067         laddr = inp->inp_laddr.s_addr;
 1068         anonport = (lport == 0);
 1069         error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
 1070             NULL, cred);
 1071         if (error)
 1072                 return (error);
 1073 
 1074         /* Do the initial binding of the local address if required. */
 1075         if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
 1076                 KASSERT(rehash == true,
 1077                     ("Rehashing required for unbound inps"));
 1078                 inp->inp_lport = lport;
 1079                 inp->inp_laddr.s_addr = laddr;
 1080                 if (in_pcbinshash(inp) != 0) {
 1081                         inp->inp_laddr.s_addr = INADDR_ANY;
 1082                         inp->inp_lport = 0;
 1083                         return (EAGAIN);
 1084                 }
 1085         }
 1086 
 1087         /* Commit the remaining changes. */
 1088         inp->inp_lport = lport;
 1089         inp->inp_laddr.s_addr = laddr;
 1090         inp->inp_faddr.s_addr = faddr;
 1091         inp->inp_fport = fport;
 1092         if (rehash) {
 1093                 in_pcbrehash(inp);
 1094         } else {
 1095                 in_pcbinshash(inp);
 1096         }
 1097 
 1098         if (anonport)
 1099                 inp->inp_flags |= INP_ANONPORT;
 1100         return (0);
 1101 }
 1102 
 1103 /*
 1104  * Do proper source address selection on an unbound socket in case
 1105  * of connect. Take jails into account as well.
       *
       * inp    PCB of the socket; supplies SO_DONTROUTE and the FIB numbers
       *        used for the route and interface lookups.
       * faddr  Destination address the source address is selected for.
       * laddr  Out: selected source address; must not be NULL.
       * cred   Credential consulted by the jail (prison_*) checks.
       *
       * Returns 0 on success, ENETUNREACH when no suitable interface
       * address can be found, or the error from prison_get_ip4() when the
       * jail's default address is used as the last resort.
       * Must be called inside the network epoch.
 1106  */
 1107 int
 1108 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
 1109     struct ucred *cred)
 1110 {
 1111         struct ifaddr *ifa;
 1112         struct sockaddr *sa;
 1113         struct sockaddr_in *sin, dst;
 1114         struct nhop_object *nh;
 1115         int error;
 1116 
 1117         NET_EPOCH_ASSERT();
 1118         KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
 1119 
 1120         /*
 1121          * Bypass source address selection and use the primary jail IP
 1122          * if requested.
 1123          */
 1124         if (!prison_saddrsel_ip4(cred, laddr))
 1125                 return (0);
 1126 
 1127         error = 0;
 1128 
 1129         nh = NULL;
              /* Build a sockaddr_in for the destination for the ifa_ifwith*() lookups. */
 1130         bzero(&dst, sizeof(dst));
 1131         sin = &dst;
 1132         sin->sin_family = AF_INET;
 1133         sin->sin_len = sizeof(struct sockaddr_in);
 1134         sin->sin_addr.s_addr = faddr->s_addr;
 1135 
 1136         /*
 1137          * If route is known our src addr is taken from the i/f,
 1138          * else punt.
 1139          *
 1140          * Find out route to destination.
 1141          */
 1142         if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
 1143                 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
 1144                     0, NHR_NONE, 0);
 1145 
 1146         /*
 1147          * If we found a route, use the address corresponding to
 1148          * the outgoing interface.
 1149          *
 1150          * Otherwise assume faddr is reachable on a directly connected
 1151          * network and try to find a corresponding interface to take
 1152          * the source address from.
 1153          */
 1154         if (nh == NULL || nh->nh_ifp == NULL) {
 1155                 struct in_ifaddr *ia;
 1156                 struct ifnet *ifp;
 1157 
 1158                 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
 1159                                         inp->inp_socket->so_fibnum));
 1160                 if (ia == NULL) {
 1161                         ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
 1162                                                 inp->inp_socket->so_fibnum));
 1163                 }
 1164                 if (ia == NULL) {
 1165                         error = ENETUNREACH;
 1166                         goto done;
 1167                 }
 1168 
 1169                 if (!prison_flag(cred, PR_IP4)) {
 1170                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1171                         goto done;
 1172                 }
 1173 
                      /* Jailed: look for an address on that interface the jail may use. */
 1174                 ifp = ia->ia_ifp;
 1175                 ia = NULL;
 1176                 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 1177                         sa = ifa->ifa_addr;
 1178                         if (sa->sa_family != AF_INET)
 1179                                 continue;
 1180                         sin = (struct sockaddr_in *)sa;
 1181                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 1182                                 ia = (struct in_ifaddr *)ifa;
 1183                                 break;
 1184                         }
 1185                 }
 1186                 if (ia != NULL) {
 1187                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1188                         goto done;
 1189                 }
 1190 
 1191                 /* 3. As a last resort return the 'default' jail address. */
 1192                 error = prison_get_ip4(cred, laddr);
 1193                 goto done;
 1194         }
 1195 
 1196         /*
 1197          * If the outgoing interface on the route found is not
 1198          * a loopback interface, use the address from that interface.
 1199          * In case of jails do those three steps:
 1200          * 1. check if the interface address belongs to the jail. If so use it.
 1201          * 2. check if we have any address on the outgoing interface
 1202          *    belonging to this jail. If so use it.
 1203          * 3. as a last resort return the 'default' jail address.
 1204          */
 1205         if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
 1206                 struct in_ifaddr *ia;
 1207                 struct ifnet *ifp;
 1208 
 1209                 /* If not jailed, use the default returned. */
 1210                 if (!prison_flag(cred, PR_IP4)) {
 1211                         ia = (struct in_ifaddr *)nh->nh_ifa;
 1212                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1213                         goto done;
 1214                 }
 1215 
 1216                 /* Jailed. */
 1217                 /* 1. Check if the iface address belongs to the jail. */
 1218                 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
 1219                 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 1220                         ia = (struct in_ifaddr *)nh->nh_ifa;
 1221                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1222                         goto done;
 1223                 }
 1224 
 1225                 /*
 1226                  * 2. Check if we have any address on the outgoing interface
 1227                  *    belonging to this jail.
 1228                  */
 1229                 ia = NULL;
 1230                 ifp = nh->nh_ifp;
 1231                 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 1232                         sa = ifa->ifa_addr;
 1233                         if (sa->sa_family != AF_INET)
 1234                                 continue;
 1235                         sin = (struct sockaddr_in *)sa;
 1236                         if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
 1237                                 ia = (struct in_ifaddr *)ifa;
 1238                                 break;
 1239                         }
 1240                 }
 1241                 if (ia != NULL) {
 1242                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1243                         goto done;
 1244                 }
 1245 
 1246                 /* 3. As a last resort return the 'default' jail address. */
 1247                 error = prison_get_ip4(cred, laddr);
 1248                 goto done;
 1249         }
 1250 
 1251         /*
 1252          * The outgoing interface is marked with 'loopback net', so a route
 1253          * to ourselves is here.
 1254          * Try to find the interface of the destination address and then
 1255          * take the address from there. That interface is not necessarily
 1256          * a loopback interface.
 1257          * In case of jails, check that it is an address of the jail
 1258          * and if we cannot find, fall back to the 'default' jail address.
 1259          */
              /*
               * NOTE: this test is always true here, because every path through
               * the non-loopback branch above leaves via 'done'.
               */
 1260         if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
 1261                 struct in_ifaddr *ia;
 1262 
 1263                 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
 1264                                         inp->inp_socket->so_fibnum));
 1265                 if (ia == NULL)
 1266                         ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
 1267                                                 inp->inp_socket->so_fibnum));
 1268                 if (ia == NULL)
 1269                         ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));
 1270 
 1271                 if (!prison_flag(cred, PR_IP4)) {
 1272                         if (ia == NULL) {
 1273                                 error = ENETUNREACH;
 1274                                 goto done;
 1275                         }
 1276                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1277                         goto done;
 1278                 }
 1279 
 1280                 /* Jailed. */
 1281                 if (ia != NULL) {
 1282                         struct ifnet *ifp;
 1283 
 1284                         ifp = ia->ia_ifp;
 1285                         ia = NULL;
 1286                         CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 1287                                 sa = ifa->ifa_addr;
 1288                                 if (sa->sa_family != AF_INET)
 1289                                         continue;
 1290                                 sin = (struct sockaddr_in *)sa;
 1291                                 if (prison_check_ip4(cred,
 1292                                     &sin->sin_addr) == 0) {
 1293                                         ia = (struct in_ifaddr *)ifa;
 1294                                         break;
 1295                                 }
 1296                         }
 1297                         if (ia != NULL) {
 1298                                 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
 1299                                 goto done;
 1300                         }
 1301                 }
 1302 
 1303                 /* 3. As a last resort return the 'default' jail address. */
 1304                 error = prison_get_ip4(cred, laddr);
 1305                 goto done;
 1306         }
 1307 
 1308 done:
 1309         return (error);
 1310 }
 1311 
 1312 /*
 1313  * Set up for a connect from a socket to the specified address.
 1314  * On entry, *laddrp and *lportp should contain the current local
 1315  * address and port for the PCB; these are updated to the values
 1316  * that should be placed in inp_laddr and inp_lport to complete
 1317  * the connect.
 1318  *
 1319  * On success, *faddrp and *fportp will be set to the remote address
 1320  * and port. These are not updated in the error case.
 1321  *
 1322  * If the operation fails because the connection already exists,
 1323  * *oinpp will be set to the PCB of that connection so that the
 1324  * caller can decide to override it. In all other cases, *oinpp
 1325  * is set to NULL.
       *
       * nam must be a sockaddr_in with a non-zero port; oinpp may be NULL,
       * in which case the conflicting PCB is not reported.
       *
       * Returns 0 on success, EADDRNOTAVAIL (zero destination port, or no
       * usable address on the selected multicast interface), EADDRINUSE
       * (the connection already exists), or an error passed through from
       * prison_get_ip4(), in_pcbladdr() or in_pcb_lport_dest().
 1326  */
 1327 int
 1328 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
 1329     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
 1330     struct inpcb **oinpp, struct ucred *cred)
 1331 {
 1332         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
 1333         struct in_ifaddr *ia;
 1334         struct inpcb *oinp;
 1335         struct in_addr laddr, faddr;
 1336         u_short lport, fport;
 1337         int error;
 1338 
 1339         KASSERT(sin->sin_family == AF_INET,
 1340             ("%s: invalid address family for %p", __func__, sin));
 1341         KASSERT(sin->sin_len == sizeof(*sin),
 1342             ("%s: invalid address length for %p", __func__, sin));
 1343 
 1344         /*
 1345          * Because a global state change doesn't actually occur here, a read
 1346          * lock is sufficient.
 1347          */
 1348         NET_EPOCH_ASSERT();
 1349         INP_LOCK_ASSERT(inp);
 1350         INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
 1351 
 1352         if (oinpp != NULL)
 1353                 *oinpp = NULL;
 1354         if (sin->sin_port == 0)
 1355                 return (EADDRNOTAVAIL);
 1356         laddr.s_addr = *laddrp;
 1357         lport = *lportp;
 1358         faddr = sin->sin_addr;
 1359         fport = sin->sin_port;
 1360 #ifdef ROUTE_MPATH
              /*
               * NOTE(review): unlike the rest of this function, this stores
               * into the inpcb (inp_flowid/inp_flowtype), and it hashes the
               * addresses before the adjustments made below -- confirm that
               * both are intended.
               */
 1361         if (CALC_FLOWID_OUTBOUND) {
 1362                 uint32_t hash_val, hash_type;
 1363 
 1364                 hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
 1365                     inp->inp_socket->so_proto->pr_protocol, &hash_type);
 1366 
 1367                 inp->inp_flowid = hash_val;
 1368                 inp->inp_flowtype = hash_type;
 1369         }
 1370 #endif
 1371         if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
 1372                 /*
 1373                  * If the destination address is INADDR_ANY,
 1374                  * use the primary local address.
 1375                  * If the supplied address is INADDR_BROADCAST,
 1376                  * and the primary interface supports broadcast,
 1377                  * choose the broadcast address for that interface.
 1378                  */
 1379                 if (faddr.s_addr == INADDR_ANY) {
 1380                         faddr =
 1381                             IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
 1382                         if ((error = prison_get_ip4(cred, &faddr)) != 0)
 1383                                 return (error);
 1384                 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
 1385                         if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
 1386                             IFF_BROADCAST)
 1387                                 faddr = satosin(&CK_STAILQ_FIRST(
 1388                                     &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
 1389                 }
 1390         }
 1391         if (laddr.s_addr == INADDR_ANY) {
 1392                 error = in_pcbladdr(inp, &faddr, &laddr, cred);
 1393                 /*
 1394                  * If the destination address is multicast and an outgoing
 1395                  * interface has been set as a multicast option, prefer the
 1396                  * address of that interface as our source address.
 1397                  */
 1398                 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
 1399                     inp->inp_moptions != NULL) {
 1400                         struct ip_moptions *imo;
 1401                         struct ifnet *ifp;
 1402 
 1403                         imo = inp->inp_moptions;
 1404                         if (imo->imo_multicast_ifp != NULL) {
 1405                                 ifp = imo->imo_multicast_ifp;
 1406                                 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
 1407                                         if (ia->ia_ifp == ifp &&
 1408                                             prison_check_ip4(cred,
 1409                                             &ia->ia_addr.sin_addr) == 0)
 1410                                                 break;
 1411                                 }
                                      /*
                                       * The outcome of this search replaces
                                       * whatever in_pcbladdr() returned above.
                                       */
 1412                                 if (ia == NULL)
 1413                                         error = EADDRNOTAVAIL;
 1414                                 else {
 1415                                         laddr = ia->ia_addr.sin_addr;
 1416                                         error = 0;
 1417                                 }
 1418                         }
 1419                 }
 1420                 if (error)
 1421                         return (error);
 1422         }
 1423 
              /*
               * With a local port already set, only check for an existing
               * identical connection; otherwise allocate a local port for
               * this local/foreign address pair.
               */
 1424         if (lport != 0) {
 1425                 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
 1426                     fport, laddr, lport, 0, NULL, M_NODOM);
 1427                 if (oinp != NULL) {
 1428                         if (oinpp != NULL)
 1429                                 *oinpp = oinp;
 1430                         return (EADDRINUSE);
 1431                 }
 1432         } else {
 1433                 struct sockaddr_in lsin, fsin;
 1434 
 1435                 bzero(&lsin, sizeof(lsin));
 1436                 bzero(&fsin, sizeof(fsin));
 1437                 lsin.sin_family = AF_INET;
 1438                 lsin.sin_addr = laddr;
 1439                 fsin.sin_family = AF_INET;
 1440                 fsin.sin_addr = faddr;
 1441                 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
 1442                     &lport, (struct sockaddr *)& fsin, fport, cred,
 1443                     INPLOOKUP_WILDCARD);
 1444                 if (error)
 1445                         return (error);
 1446         }
 1447         *laddrp = laddr.s_addr;
 1448         *lportp = lport;
 1449         *faddrp = faddr.s_addr;
 1450         *fportp = fport;
 1451         return (0);
 1452 }
 1453 
 1454 void
 1455 in_pcbdisconnect(struct inpcb *inp)
 1456 {
 1457 
 1458         INP_WLOCK_ASSERT(inp);
 1459         INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 1460 
 1461         inp->inp_faddr.s_addr = INADDR_ANY;
 1462         inp->inp_fport = 0;
 1463         in_pcbrehash(inp);
 1464 }
 1465 #endif /* INET */
 1466 
 1467 /*
 1468  * in_pcbdetach() is responsible for disassociating a socket from an inpcb.
 1469  * For most protocols, this will be invoked immediately prior to calling
 1470  * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
 1471  * socket, in which case in_pcbfree() is deferred.
 1472  */
 1473 void
 1474 in_pcbdetach(struct inpcb *inp)
 1475 {
 1476 
 1477         KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
 1478 
 1479 #ifdef RATELIMIT
 1480         if (inp->inp_snd_tag != NULL)
 1481                 in_pcbdetach_txrtlmt(inp);
 1482 #endif
 1483         inp->inp_socket->so_pcb = NULL;
 1484         inp->inp_socket = NULL;
 1485 }
 1486 
 1487 /*
 1488  * inpcb hash lookups are protected by SMR section.
 1489  *
 1490  * Once desired pcb has been found, switching from SMR section to a pcb
 1491  * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
 1492  * here because SMR is a critical section.
 1493  * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
 1494  */
 1495 static inline void
 1496 inp_lock(struct inpcb *inp, const inp_lookup_t lock)
 1497 {
 1498 
 1499         lock == INPLOOKUP_RLOCKPCB ?
 1500             rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
 1501 }
 1502 
 1503 static inline void
 1504 inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
 1505 {
 1506 
 1507         lock == INPLOOKUP_RLOCKPCB ?
 1508             rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
 1509 }
 1510 
 1511 static inline int
 1512 inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
 1513 {
 1514 
 1515         return (lock == INPLOOKUP_RLOCKPCB ?
 1516             rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
 1517 }
 1518 
 1519 static inline bool
 1520 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
 1521 {
 1522 
 1523         return (lock == INPLOOKUP_RLOCKPCB ?
 1524             in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
 1525 }
 1526 
 1527 static inline bool
 1528 _inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
 1529 {
 1530 
 1531         MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
 1532         SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);
 1533 
 1534         if (__predict_true(inp_trylock(inp, lock))) {
 1535                 if (__predict_false(inp->inp_flags & ignflags)) {
 1536                         smr_exit(inp->inp_pcbinfo->ipi_smr);
 1537                         inp_unlock(inp, lock);
 1538                         return (false);
 1539                 }
 1540                 smr_exit(inp->inp_pcbinfo->ipi_smr);
 1541                 return (true);
 1542         }
 1543 
 1544         if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
 1545                 smr_exit(inp->inp_pcbinfo->ipi_smr);
 1546                 inp_lock(inp, lock);
 1547                 if (__predict_false(in_pcbrele(inp, lock)))
 1548                         return (false);
 1549                 /*
 1550                  * inp acquired through refcount & lock for sure didn't went
 1551                  * through uma_zfree().  However, it may have already went
 1552                  * through in_pcbfree() and has another reference, that
 1553                  * prevented its release by our in_pcbrele().
 1554                  */
 1555                 if (__predict_false(inp->inp_flags & ignflags)) {
 1556                         inp_unlock(inp, lock);
 1557                         return (false);
 1558                 }
 1559                 return (true);
 1560         } else {
 1561                 smr_exit(inp->inp_pcbinfo->ipi_smr);
 1562                 return (false);
 1563         }
 1564 }
 1565 
 1566 bool
 1567 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
 1568 {
 1569 
 1570         /*
 1571          * in_pcblookup() family of functions ignore not only freed entries,
 1572          * that may be found due to lockless access to the hash, but dropped
 1573          * entries, too.
 1574          */
 1575         return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
 1576 }
 1577 
 1578 /*
 1579  * inp_next() - inpcb hash/list traversal iterator
 1580  *
 1581  * Requires initialized struct inpcb_iterator for context.
 1582  * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
 1583  *
 1584  * - Iterator can have either write-lock or read-lock semantics, that can not
 1585  *   be changed later.
 1586  * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
 1587  *   a single hash slot.  Note: only rip_input() does the latter.
 1588  * - Iterator may have optional bool matching function.  The matching function
 1589  *   will be executed for each inpcb in the SMR context, so it can not acquire
 1590  *   locks and can safely access only immutable fields of inpcb.
 1591  *
 1592  * A fresh initialized iterator has NULL inpcb in its context and that
 1593  * means that inp_next() call would return the very first inpcb on the list
 1594  * locked with desired semantic.  In all following calls the context pointer
 1595  * shall hold the current inpcb pointer.  The KPI user is not supposed to
 1596  * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
 1597  * and write NULL to its context.  After end of traversal an iterator can be
 1598  * reused.
 1599  *
 1600  * List traversals have the following features/constraints:
 1601  * - New entries won't be seen, as they are always added to the head of a list.
 1602  * - Removed entries won't stop traversal as long as they are not added to
 1603  *   a different list. This is violated by in_pcbrehash().
 1604  */
 1605 #define II_LIST_FIRST(ipi, hash)                                        \
 1606                 (((hash) == INP_ALL_LIST) ?                             \
 1607                     CK_LIST_FIRST(&(ipi)->ipi_listhead) :               \
 1608                     CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
 1609 #define II_LIST_NEXT(inp, hash)                                         \
 1610                 (((hash) == INP_ALL_LIST) ?                             \
 1611                     CK_LIST_NEXT((inp), inp_list) :                     \
 1612                     CK_LIST_NEXT((inp), inp_hash))
 1613 #define II_LOCK_ASSERT(inp, lock)                                       \
 1614                 rw_assert(&(inp)->inp_lock,                             \
 1615                     (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
 1616 struct inpcb *
 1617 inp_next(struct inpcb_iterator *ii)
 1618 {
 1619         const struct inpcbinfo *ipi = ii->ipi;
 1620         inp_match_t *match = ii->match;
 1621         void *ctx = ii->ctx;
 1622         inp_lookup_t lock = ii->lock;
 1623         int hash = ii->hash;
 1624         struct inpcb *inp;
 1625 
 1626         if (ii->inp == NULL) {          /* First call. */
 1627                 smr_enter(ipi->ipi_smr);
 1628                 /* This is unrolled CK_LIST_FOREACH(). */
 1629                 for (inp = II_LIST_FIRST(ipi, hash);
 1630                     inp != NULL;
 1631                     inp = II_LIST_NEXT(inp, hash)) {
 1632                         if (match != NULL && (match)(inp, ctx) == false)
 1633                                 continue;
 1634                         if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
 1635                                 break;
 1636                         else {
 1637                                 smr_enter(ipi->ipi_smr);
 1638                                 MPASS(inp != II_LIST_FIRST(ipi, hash));
 1639                                 inp = II_LIST_FIRST(ipi, hash);
 1640                                 if (inp == NULL)
 1641                                         break;
 1642                         }
 1643                 }
 1644 
 1645                 if (inp == NULL)
 1646                         smr_exit(ipi->ipi_smr);
 1647                 else
 1648                         ii->inp = inp;
 1649 
 1650                 return (inp);
 1651         }
 1652 
 1653         /* Not a first call. */
 1654         smr_enter(ipi->ipi_smr);
 1655 restart:
 1656         inp = ii->inp;
 1657         II_LOCK_ASSERT(inp, lock);
 1658 next:
 1659         inp = II_LIST_NEXT(inp, hash);
 1660         if (inp == NULL) {
 1661                 smr_exit(ipi->ipi_smr);
 1662                 goto found;
 1663         }
 1664 
 1665         if (match != NULL && (match)(inp, ctx) == false)
 1666                 goto next;
 1667 
 1668         if (__predict_true(inp_trylock(inp, lock))) {
 1669                 if (__predict_false(inp->inp_flags & INP_FREED)) {
 1670                         /*
 1671                          * Entries are never inserted in middle of a list, thus
 1672                          * as long as we are in SMR, we can continue traversal.
 1673                          * Jump to 'restart' should yield in the same result,
 1674                          * but could produce unnecessary looping.  Could this
 1675                          * looping be unbound?
 1676                          */
 1677                         inp_unlock(inp, lock);
 1678                         goto next;
 1679                 } else {
 1680                         smr_exit(ipi->ipi_smr);
 1681                         goto found;
 1682                 }
 1683         }
 1684 
 1685         /*
 1686          * Can't obtain lock immediately, thus going hard.  Once we exit the
 1687          * SMR section we can no longer jump to 'next', and our only stable
 1688          * anchoring point is ii->inp, which we keep locked for this case, so
 1689          * we jump to 'restart'.
 1690          */
 1691         if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
 1692                 smr_exit(ipi->ipi_smr);
 1693                 inp_lock(inp, lock);
 1694                 if (__predict_false(in_pcbrele(inp, lock))) {
 1695                         smr_enter(ipi->ipi_smr);
 1696                         goto restart;
 1697                 }
 1698                 /*
 1699                  * See comment in inp_smr_lock().
 1700                  */
 1701                 if (__predict_false(inp->inp_flags & INP_FREED)) {
 1702                         inp_unlock(inp, lock);
 1703                         smr_enter(ipi->ipi_smr);
 1704                         goto restart;
 1705                 }
 1706         } else
 1707                 goto next;
 1708 
 1709 found:
 1710         inp_unlock(ii->inp, lock);
 1711         ii->inp = inp;
 1712 
 1713         return (ii->inp);
 1714 }
 1715 
 1716 /*
 1717  * in_pcbref() bumps the reference count on an inpcb in order to maintain
 1718  * stability of an inpcb pointer despite the inpcb lock being released or
 1719  * SMR section exited.
 1720  *
 1721  * To free a reference later in_pcbrele_(r|w)locked() must be performed.
 1722  */
 1723 void
 1724 in_pcbref(struct inpcb *inp)
 1725 {
 1726         u_int old __diagused;
 1727 
 1728         old = refcount_acquire(&inp->inp_refcount);
 1729         KASSERT(old > 0, ("%s: refcount 0", __func__));
 1730 }
 1731 
 1732 /*
 1733  * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
 1734  * freeing the pcb, if the reference was very last.
 1735  */
 1736 bool
 1737 in_pcbrele_rlocked(struct inpcb *inp)
 1738 {
 1739 
 1740         INP_RLOCK_ASSERT(inp);
 1741 
 1742         if (refcount_release(&inp->inp_refcount) == 0)
 1743                 return (false);
 1744 
 1745         MPASS(inp->inp_flags & INP_FREED);
 1746         MPASS(inp->inp_socket == NULL);
 1747         MPASS(inp->inp_in_hpts == 0);
 1748         INP_RUNLOCK(inp);
 1749         uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
 1750         return (true);
 1751 }
 1752 
 1753 bool
 1754 in_pcbrele_wlocked(struct inpcb *inp)
 1755 {
 1756 
 1757         INP_WLOCK_ASSERT(inp);
 1758 
 1759         if (refcount_release(&inp->inp_refcount) == 0)
 1760                 return (false);
 1761 
 1762         MPASS(inp->inp_flags & INP_FREED);
 1763         MPASS(inp->inp_socket == NULL);
 1764         MPASS(inp->inp_in_hpts == 0);
 1765         INP_WUNLOCK(inp);
 1766         uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
 1767         return (true);
 1768 }
 1769 
 1770 /*
 1771  * Unconditionally schedule an inpcb to be freed by decrementing its
 1772  * reference count, which should occur only after the inpcb has been detached
 1773  * from its socket.  If another thread holds a temporary reference (acquired
 1774  * using in_pcbref()) then the free is deferred until that reference is
 1775  * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
 1776  *  Almost all work, including removal from global lists, is done in this
 1777  * context, where the pcbinfo lock is held.
 1778  */
 1779 void
 1780 in_pcbfree(struct inpcb *inp)
 1781 {
 1782         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 1783 #ifdef INET
 1784         struct ip_moptions *imo;
 1785 #endif
 1786 #ifdef INET6
 1787         struct ip6_moptions *im6o;
 1788 #endif
 1789 
 1790         INP_WLOCK_ASSERT(inp);
 1791         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
 1792         KASSERT((inp->inp_flags & INP_FREED) == 0,
 1793             ("%s: called twice for pcb %p", __func__, inp));
 1794 
 1795         inp->inp_flags |= INP_FREED;
 1796         INP_INFO_WLOCK(pcbinfo);
 1797         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
 1798         pcbinfo->ipi_count--;
 1799         CK_LIST_REMOVE(inp, inp_list);
 1800         INP_INFO_WUNLOCK(pcbinfo);
 1801 
 1802         if (inp->inp_flags & INP_INHASHLIST)
 1803                 in_pcbremhash(inp);
 1804 
 1805         RO_INVALIDATE_CACHE(&inp->inp_route);
 1806 #ifdef MAC
 1807         mac_inpcb_destroy(inp);
 1808 #endif
 1809 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 1810         if (inp->inp_sp != NULL)
 1811                 ipsec_delete_pcbpolicy(inp);
 1812 #endif
 1813 #ifdef INET
 1814         if (inp->inp_options)
 1815                 (void)m_free(inp->inp_options);
 1816         imo = inp->inp_moptions;
 1817 #endif
 1818 #ifdef INET6
 1819         if (inp->inp_vflag & INP_IPV6PROTO) {
 1820                 ip6_freepcbopts(inp->in6p_outputopts);
 1821                 im6o = inp->in6p_moptions;
 1822         } else
 1823                 im6o = NULL;
 1824 #endif
 1825 
 1826         if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
 1827                 INP_WUNLOCK(inp);
 1828         }
 1829 #ifdef INET6
 1830         ip6_freemoptions(im6o);
 1831 #endif
 1832 #ifdef INET
 1833         inp_freemoptions(imo);
 1834 #endif
 1835         /* Destruction is finalized in inpcb_dtor(). */
 1836 }
 1837 
 1838 static void
 1839 inpcb_dtor(void *mem, int size, void *arg)
 1840 {
 1841         struct inpcb *inp = mem;
 1842 
 1843         crfree(inp->inp_cred);
 1844 #ifdef INVARIANTS
 1845         inp->inp_cred = NULL;
 1846 #endif
 1847 }
 1848 
 1849 /*
 1850  * Different protocols initialize their inpcbs differently - giving
 1851  * different name to the lock.  But they all are disposed the same.
 1852  */
 1853 static void
 1854 inpcb_fini(void *mem, int size)
 1855 {
 1856         struct inpcb *inp = mem;
 1857 
 1858         INP_LOCK_DESTROY(inp);
 1859 }
 1860 
 1861 /*
 1862  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 1863  * port reservation, and preventing it from being returned by inpcb lookups.
 1864  *
 1865  * It is used by TCP to mark an inpcb as unused and avoid future packet
 1866  * delivery or event notification when a socket remains open but TCP has
 1867  * closed.  This might occur as a result of a shutdown()-initiated TCP close
 1868  * or a RST on the wire, and allows the port binding to be reused while still
 1869  * maintaining the invariant that so_pcb always points to a valid inpcb until
 1870  * in_pcbdetach().
 1871  *
 1872  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
 1873  * in_pcbnotifyall() and in_pcbpurgeif0()?
 1874  */
 1875 void
 1876 in_pcbdrop(struct inpcb *inp)
 1877 {
 1878 
 1879         INP_WLOCK_ASSERT(inp);
 1880 #ifdef INVARIANTS
 1881         if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
 1882                 MPASS(inp->inp_refcount > 1);
 1883 #endif
 1884 
 1885         inp->inp_flags |= INP_DROPPED;
 1886         if (inp->inp_flags & INP_INHASHLIST)
 1887                 in_pcbremhash(inp);
 1888 }
 1889 
 1890 #ifdef INET
 1891 /*
 1892  * Common routines to return the socket addresses associated with inpcbs.
 1893  */
 1894 struct sockaddr *
 1895 in_sockaddr(in_port_t port, struct in_addr *addr_p)
 1896 {
 1897         struct sockaddr_in *sin;
 1898 
 1899         sin = malloc(sizeof *sin, M_SONAME,
 1900                 M_WAITOK | M_ZERO);
 1901         sin->sin_family = AF_INET;
 1902         sin->sin_len = sizeof(*sin);
 1903         sin->sin_addr = *addr_p;
 1904         sin->sin_port = port;
 1905 
 1906         return (struct sockaddr *)sin;
 1907 }
 1908 
 1909 int
 1910 in_getsockaddr(struct socket *so, struct sockaddr **nam)
 1911 {
 1912         struct inpcb *inp;
 1913         struct in_addr addr;
 1914         in_port_t port;
 1915 
 1916         inp = sotoinpcb(so);
 1917         KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
 1918 
 1919         INP_RLOCK(inp);
 1920         port = inp->inp_lport;
 1921         addr = inp->inp_laddr;
 1922         INP_RUNLOCK(inp);
 1923 
 1924         *nam = in_sockaddr(port, &addr);
 1925         return 0;
 1926 }
 1927 
 1928 int
 1929 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
 1930 {
 1931         struct inpcb *inp;
 1932         struct in_addr addr;
 1933         in_port_t port;
 1934 
 1935         inp = sotoinpcb(so);
 1936         KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
 1937 
 1938         INP_RLOCK(inp);
 1939         port = inp->inp_fport;
 1940         addr = inp->inp_faddr;
 1941         INP_RUNLOCK(inp);
 1942 
 1943         *nam = in_sockaddr(port, &addr);
 1944         return 0;
 1945 }
 1946 
 1947 void
 1948 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
 1949     struct inpcb *(*notify)(struct inpcb *, int))
 1950 {
 1951         struct inpcb *inp, *inp_temp;
 1952 
 1953         INP_INFO_WLOCK(pcbinfo);
 1954         CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) {
 1955                 INP_WLOCK(inp);
 1956 #ifdef INET6
 1957                 if ((inp->inp_vflag & INP_IPV4) == 0) {
 1958                         INP_WUNLOCK(inp);
 1959                         continue;
 1960                 }
 1961 #endif
 1962                 if (inp->inp_faddr.s_addr != faddr.s_addr ||
 1963                     inp->inp_socket == NULL) {
 1964                         INP_WUNLOCK(inp);
 1965                         continue;
 1966                 }
 1967                 if ((*notify)(inp, errno))
 1968                         INP_WUNLOCK(inp);
 1969         }
 1970         INP_INFO_WUNLOCK(pcbinfo);
 1971 }
 1972 
 1973 static bool
 1974 inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
 1975 {
 1976 
 1977         if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
 1978                 return (true);
 1979         else
 1980                 return (false);
 1981 }
 1982 
 1983 void
 1984 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
 1985 {
 1986         struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
 1987             inp_v4_multi_match, NULL);
 1988         struct inpcb *inp;
 1989         struct in_multi *inm;
 1990         struct in_mfilter *imf;
 1991         struct ip_moptions *imo;
 1992 
 1993         IN_MULTI_LOCK_ASSERT();
 1994 
 1995         while ((inp = inp_next(&inpi)) != NULL) {
 1996                 INP_WLOCK_ASSERT(inp);
 1997 
 1998                 imo = inp->inp_moptions;
 1999                 /*
 2000                  * Unselect the outgoing interface if it is being
 2001                  * detached.
 2002                  */
 2003                 if (imo->imo_multicast_ifp == ifp)
 2004                         imo->imo_multicast_ifp = NULL;
 2005 
 2006                 /*
 2007                  * Drop multicast group membership if we joined
 2008                  * through the interface being detached.
 2009                  *
 2010                  * XXX This can all be deferred to an epoch_call
 2011                  */
 2012 restart:
 2013                 IP_MFILTER_FOREACH(imf, &imo->imo_head) {
 2014                         if ((inm = imf->imf_inm) == NULL)
 2015                                 continue;
 2016                         if (inm->inm_ifp != ifp)
 2017                                 continue;
 2018                         ip_mfilter_remove(&imo->imo_head, imf);
 2019                         in_leavegroup_locked(inm, NULL);
 2020                         ip_mfilter_free(imf);
 2021                         goto restart;
 2022                 }
 2023         }
 2024 }
 2025 
 2026 /*
 2027  * Lookup a PCB based on the local address and port.  Caller must hold the
 2028  * hash lock.  No inpcb locks or references are acquired.
 2029  */
 2030 #define INP_LOOKUP_MAPPED_PCB_COST      3
 2031 struct inpcb *
 2032 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
 2033     u_short lport, int lookupflags, struct ucred *cred)
 2034 {
 2035         struct inpcb *inp;
 2036 #ifdef INET6
 2037         int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
 2038 #else
 2039         int matchwild = 3;
 2040 #endif
 2041         int wildcard;
 2042 
 2043         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 2044             ("%s: invalid lookup flags %d", __func__, lookupflags));
 2045         INP_HASH_LOCK_ASSERT(pcbinfo);
 2046 
 2047         if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
 2048                 struct inpcbhead *head;
 2049                 /*
 2050                  * Look for an unconnected (wildcard foreign addr) PCB that
 2051                  * matches the local address and port we're looking for.
 2052                  */
 2053                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
 2054                     pcbinfo->ipi_hashmask)];
 2055                 CK_LIST_FOREACH(inp, head, inp_hash) {
 2056 #ifdef INET6
 2057                         /* XXX inp locking */
 2058                         if ((inp->inp_vflag & INP_IPV4) == 0)
 2059                                 continue;
 2060 #endif
 2061                         if (inp->inp_faddr.s_addr == INADDR_ANY &&
 2062                             inp->inp_laddr.s_addr == laddr.s_addr &&
 2063                             inp->inp_lport == lport) {
 2064                                 /*
 2065                                  * Found?
 2066                                  */
 2067                                 if (prison_equal_ip4(cred->cr_prison,
 2068                                     inp->inp_cred->cr_prison))
 2069                                         return (inp);
 2070                         }
 2071                 }
 2072                 /*
 2073                  * Not found.
 2074                  */
 2075                 return (NULL);
 2076         } else {
 2077                 struct inpcbporthead *porthash;
 2078                 struct inpcbport *phd;
 2079                 struct inpcb *match = NULL;
 2080                 /*
 2081                  * Best fit PCB lookup.
 2082                  *
 2083                  * First see if this local port is in use by looking on the
 2084                  * port hash list.
 2085                  */
 2086                 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
 2087                     pcbinfo->ipi_porthashmask)];
 2088                 CK_LIST_FOREACH(phd, porthash, phd_hash) {
 2089                         if (phd->phd_port == lport)
 2090                                 break;
 2091                 }
 2092                 if (phd != NULL) {
 2093                         /*
 2094                          * Port is in use by one or more PCBs. Look for best
 2095                          * fit.
 2096                          */
 2097                         CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
 2098                                 wildcard = 0;
 2099                                 if (!prison_equal_ip4(inp->inp_cred->cr_prison,
 2100                                     cred->cr_prison))
 2101                                         continue;
 2102 #ifdef INET6
 2103                                 /* XXX inp locking */
 2104                                 if ((inp->inp_vflag & INP_IPV4) == 0)
 2105                                         continue;
 2106                                 /*
 2107                                  * We never select the PCB that has
 2108                                  * INP_IPV6 flag and is bound to :: if
 2109                                  * we have another PCB which is bound
 2110                                  * to 0.0.0.0.  If a PCB has the
 2111                                  * INP_IPV6 flag, then we set its cost
 2112                                  * higher than IPv4 only PCBs.
 2113                                  *
 2114                                  * Note that the case only happens
 2115                                  * when a socket is bound to ::, under
 2116                                  * the condition that the use of the
 2117                                  * mapped address is allowed.
 2118                                  */
 2119                                 if ((inp->inp_vflag & INP_IPV6) != 0)
 2120                                         wildcard += INP_LOOKUP_MAPPED_PCB_COST;
 2121 #endif
 2122                                 if (inp->inp_faddr.s_addr != INADDR_ANY)
 2123                                         wildcard++;
 2124                                 if (inp->inp_laddr.s_addr != INADDR_ANY) {
 2125                                         if (laddr.s_addr == INADDR_ANY)
 2126                                                 wildcard++;
 2127                                         else if (inp->inp_laddr.s_addr != laddr.s_addr)
 2128                                                 continue;
 2129                                 } else {
 2130                                         if (laddr.s_addr != INADDR_ANY)
 2131                                                 wildcard++;
 2132                                 }
 2133                                 if (wildcard < matchwild) {
 2134                                         match = inp;
 2135                                         matchwild = wildcard;
 2136                                         if (matchwild == 0)
 2137                                                 break;
 2138                                 }
 2139                         }
 2140                 }
 2141                 return (match);
 2142         }
 2143 }
 2144 #undef INP_LOOKUP_MAPPED_PCB_COST
 2145 
 2146 static bool
 2147 in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain)
 2148 {
 2149         return (domain == M_NODOM || domain == grp->il_numa_domain);
 2150 }
 2151 
 2152 static struct inpcb *
 2153 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
 2154     const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
 2155     uint16_t fport, int lookupflags, int domain)
 2156 {
 2157         const struct inpcblbgrouphead *hdr;
 2158         struct inpcblbgroup *grp;
 2159         struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
 2160 
 2161         INP_HASH_LOCK_ASSERT(pcbinfo);
 2162 
 2163         hdr = &pcbinfo->ipi_lbgrouphashbase[
 2164             INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
 2165 
 2166         /*
 2167          * Search for an LB group match based on the following criteria:
 2168          * - prefer jailed groups to non-jailed groups
 2169          * - prefer exact source address matches to wildcard matches
 2170          * - prefer groups bound to the specified NUMA domain
 2171          */
 2172         jail_exact = jail_wild = local_exact = local_wild = NULL;
 2173         CK_LIST_FOREACH(grp, hdr, il_list) {
 2174                 bool injail;
 2175 
 2176 #ifdef INET6
 2177                 if (!(grp->il_vflag & INP_IPV4))
 2178                         continue;
 2179 #endif
 2180                 if (grp->il_lport != lport)
 2181                         continue;
 2182 
 2183                 injail = prison_flag(grp->il_cred, PR_IP4) != 0;
 2184                 if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
 2185                     laddr) != 0)
 2186                         continue;
 2187 
 2188                 if (grp->il_laddr.s_addr == laddr->s_addr) {
 2189                         if (injail) {
 2190                                 jail_exact = grp;
 2191                                 if (in_pcblookup_lb_numa_match(grp, domain))
 2192                                         /* This is a perfect match. */
 2193                                         goto out;
 2194                         } else if (local_exact == NULL ||
 2195                             in_pcblookup_lb_numa_match(grp, domain)) {
 2196                                 local_exact = grp;
 2197                         }
 2198                 } else if (grp->il_laddr.s_addr == INADDR_ANY &&
 2199                     (lookupflags & INPLOOKUP_WILDCARD) != 0) {
 2200                         if (injail) {
 2201                                 if (jail_wild == NULL ||
 2202                                     in_pcblookup_lb_numa_match(grp, domain))
 2203                                         jail_wild = grp;
 2204                         } else if (local_wild == NULL ||
 2205                             in_pcblookup_lb_numa_match(grp, domain)) {
 2206                                 local_wild = grp;
 2207                         }
 2208                 }
 2209         }
 2210 
 2211         if (jail_exact != NULL)
 2212                 grp = jail_exact;
 2213         else if (jail_wild != NULL)
 2214                 grp = jail_wild;
 2215         else if (local_exact != NULL)
 2216                 grp = local_exact;
 2217         else
 2218                 grp = local_wild;
 2219         if (grp == NULL)
 2220                 return (NULL);
 2221 out:
 2222         return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
 2223             grp->il_inpcnt]);
 2224 }
 2225 
 2226 /*
 2227  * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
 2228  * that the caller has either locked the hash list, which usually happens
 2229  * for bind(2) operations, or is in SMR section, which happens when sorting
 2230  * out incoming packets.
 2231  */
 2232 static struct inpcb *
 2233 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
 2234     u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
 2235     struct ifnet *ifp, uint8_t numa_domain)
 2236 {
 2237         struct inpcbhead *head;
 2238         struct inpcb *inp, *tmpinp;
 2239         u_short fport = fport_arg, lport = lport_arg;
 2240 
 2241         KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
 2242             ("%s: invalid lookup flags %d", __func__, lookupflags));
 2243         INP_HASH_LOCK_ASSERT(pcbinfo);
 2244 
 2245         /*
 2246          * First look for an exact match.
 2247          */
 2248         tmpinp = NULL;
 2249         head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport,
 2250             pcbinfo->ipi_hashmask)];
 2251         CK_LIST_FOREACH(inp, head, inp_hash) {
 2252 #ifdef INET6
 2253                 /* XXX inp locking */
 2254                 if ((inp->inp_vflag & INP_IPV4) == 0)
 2255                         continue;
 2256 #endif
 2257                 if (inp->inp_faddr.s_addr == faddr.s_addr &&
 2258                     inp->inp_laddr.s_addr == laddr.s_addr &&
 2259                     inp->inp_fport == fport &&
 2260                     inp->inp_lport == lport) {
 2261                         /*
 2262                          * XXX We should be able to directly return
 2263                          * the inp here, without any checks.
 2264                          * Well unless both bound with SO_REUSEPORT?
 2265                          */
 2266                         if (prison_flag(inp->inp_cred, PR_IP4))
 2267                                 return (inp);
 2268                         if (tmpinp == NULL)
 2269                                 tmpinp = inp;
 2270                 }
 2271         }
 2272         if (tmpinp != NULL)
 2273                 return (tmpinp);
 2274 
 2275         /*
 2276          * Then look for a wildcard match, if requested.
 2277          */
 2278         if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
 2279                 struct inpcb *local_wild = NULL, *local_exact = NULL;
 2280 #ifdef INET6
 2281                 struct inpcb *local_wild_mapped = NULL;
 2282 #endif
 2283                 struct inpcb *jail_wild = NULL;
 2284                 int injail;
 2285 
 2286                 /*
 2287                  * First see if an LB group matches the request before scanning
 2288                  * all sockets on this port.
 2289                  */
 2290                 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
 2291                     fport, lookupflags, numa_domain);
 2292                 if (inp != NULL)
 2293                         return (inp);
 2294 
 2295                 /*
 2296                  * Order of socket selection - we always prefer jails.
 2297                  *      1. jailed, non-wild.
 2298                  *      2. jailed, wild.
 2299                  *      3. non-jailed, non-wild.
 2300                  *      4. non-jailed, wild.
 2301                  */
 2302 
 2303                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
 2304                     pcbinfo->ipi_hashmask)];
 2305                 CK_LIST_FOREACH(inp, head, inp_hash) {
 2306 #ifdef INET6
 2307                         /* XXX inp locking */
 2308                         if ((inp->inp_vflag & INP_IPV4) == 0)
 2309                                 continue;
 2310 #endif
 2311                         if (inp->inp_faddr.s_addr != INADDR_ANY ||
 2312                             inp->inp_lport != lport)
 2313                                 continue;
 2314 
 2315                         injail = prison_flag(inp->inp_cred, PR_IP4);
 2316                         if (injail) {
 2317                                 if (prison_check_ip4_locked(
 2318                                     inp->inp_cred->cr_prison, &laddr) != 0)
 2319                                         continue;
 2320                         } else {
 2321                                 if (local_exact != NULL)
 2322                                         continue;
 2323                         }
 2324 
 2325                         if (inp->inp_laddr.s_addr == laddr.s_addr) {
 2326                                 if (injail)
 2327                                         return (inp);
 2328                                 else
 2329                                         local_exact = inp;
 2330                         } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
 2331 #ifdef INET6
 2332                                 /* XXX inp locking, NULL check */
 2333                                 if (inp->inp_vflag & INP_IPV6PROTO)
 2334                                         local_wild_mapped = inp;
 2335                                 else
 2336 #endif
 2337                                         if (injail)
 2338                                                 jail_wild = inp;
 2339                                         else
 2340                                                 local_wild = inp;
 2341                         }
 2342                 } /* LIST_FOREACH */
 2343                 if (jail_wild != NULL)
 2344                         return (jail_wild);
 2345                 if (local_exact != NULL)
 2346                         return (local_exact);
 2347                 if (local_wild != NULL)
 2348                         return (local_wild);
 2349 #ifdef INET6
 2350                 if (local_wild_mapped != NULL)
 2351                         return (local_wild_mapped);
 2352 #endif
 2353         } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */
 2354 
 2355         return (NULL);
 2356 }
 2357 
 2358 /*
 2359  * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
 2360  * hash list lock, and will return the inpcb locked (i.e., requires
 2361  * INPLOOKUP_LOCKPCB).
 2362  */
 2363 static struct inpcb *
 2364 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
 2365     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
 2366     struct ifnet *ifp, uint8_t numa_domain)
 2367 {
 2368         struct inpcb *inp;
 2369 
 2370         smr_enter(pcbinfo->ipi_smr);
 2371         inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
 2372             lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
 2373         if (inp != NULL) {
 2374                 if (__predict_false(inp_smr_lock(inp,
 2375                     (lookupflags & INPLOOKUP_LOCKMASK)) == false))
 2376                         inp = NULL;
 2377         } else
 2378                 smr_exit(pcbinfo->ipi_smr);
 2379 
 2380         return (inp);
 2381 }
 2382 
/*
 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
 * from which the packet's NUMA domain is taken for the lookup.
 */
 2387 struct inpcb *
 2388 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
 2389     struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
 2390 {
 2391 
 2392         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 2393             ("%s: invalid lookup flags %d", __func__, lookupflags));
 2394         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 2395             ("%s: LOCKPCB not set", __func__));
 2396 
 2397         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 2398             lookupflags, ifp, M_NODOM));
 2399 }
 2400 
 2401 struct inpcb *
 2402 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
 2403     u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
 2404     struct ifnet *ifp, struct mbuf *m)
 2405 {
 2406 
 2407         KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
 2408             ("%s: invalid lookup flags %d", __func__, lookupflags));
 2409         KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
 2410             ("%s: LOCKPCB not set", __func__));
 2411 
 2412         return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
 2413             lookupflags, ifp, m->m_pkthdr.numa_domain));
 2414 }
 2415 #endif /* INET */
 2416 
 2417 /*
 2418  * Insert PCB onto various hash lists.
 2419  */
 2420 int
 2421 in_pcbinshash(struct inpcb *inp)
 2422 {
 2423         struct inpcbhead *pcbhash;
 2424         struct inpcbporthead *pcbporthash;
 2425         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 2426         struct inpcbport *phd;
 2427 
 2428         INP_WLOCK_ASSERT(inp);
 2429         INP_HASH_WLOCK_ASSERT(pcbinfo);
 2430 
 2431         KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
 2432             ("in_pcbinshash: INP_INHASHLIST"));
 2433 
 2434 #ifdef INET6
 2435         if (inp->inp_vflag & INP_IPV6)
 2436                 pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
 2437                     inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 2438         else
 2439 #endif
 2440                 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
 2441                     inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 2442 
 2443         pcbporthash = &pcbinfo->ipi_porthashbase[
 2444             INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 2445 
 2446         /*
 2447          * Add entry to load balance group.
 2448          * Only do this if SO_REUSEPORT_LB is set.
 2449          */
 2450         if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) {
 2451                 int error = in_pcbinslbgrouphash(inp, M_NODOM);
 2452                 if (error != 0)
 2453                         return (error);
 2454         }
 2455 
 2456         /*
 2457          * Go through port list and look for a head for this lport.
 2458          */
 2459         CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
 2460                 if (phd->phd_port == inp->inp_lport)
 2461                         break;
 2462         }
 2463 
 2464         /*
 2465          * If none exists, malloc one and tack it on.
 2466          */
 2467         if (phd == NULL) {
 2468                 phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
 2469                 if (phd == NULL) {
 2470                         if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
 2471                                 in_pcbremlbgrouphash(inp);
 2472                         return (ENOMEM);
 2473                 }
 2474                 phd->phd_port = inp->inp_lport;
 2475                 CK_LIST_INIT(&phd->phd_pcblist);
 2476                 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
 2477         }
 2478         inp->inp_phd = phd;
 2479         CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 2480         CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 2481         inp->inp_flags |= INP_INHASHLIST;
 2482 
 2483         return (0);
 2484 }
 2485 
 2486 static void
 2487 in_pcbremhash(struct inpcb *inp)
 2488 {
 2489         struct inpcbport *phd = inp->inp_phd;
 2490 
 2491         INP_WLOCK_ASSERT(inp);
 2492         MPASS(inp->inp_flags & INP_INHASHLIST);
 2493 
 2494         INP_HASH_WLOCK(inp->inp_pcbinfo);
 2495         if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
 2496                 in_pcbremlbgrouphash(inp);
 2497         CK_LIST_REMOVE(inp, inp_hash);
 2498         CK_LIST_REMOVE(inp, inp_portlist);
 2499         if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
 2500                 CK_LIST_REMOVE(phd, phd_hash);
 2501                 uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
 2502         }
 2503         INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 2504         inp->inp_flags &= ~INP_INHASHLIST;
 2505 }
 2506 
 2507 /*
 2508  * Move PCB to the proper hash bucket when { faddr, fport } have  been
 2509  * changed. NOTE: This does not handle the case of the lport changing (the
 2510  * hashed port list would have to be updated as well), so the lport must
 2511  * not change after in_pcbinshash() has been called.
 2512  *
 2513  * XXXGL: a race between this function and SMR-protected hash iterator
 2514  * will lead to iterator traversing a possibly wrong hash list. However,
 2515  * this race should have been here since change from rwlock to epoch.
 2516  */
 2517 void
 2518 in_pcbrehash(struct inpcb *inp)
 2519 {
 2520         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 2521         struct inpcbhead *head;
 2522 
 2523         INP_WLOCK_ASSERT(inp);
 2524         INP_HASH_WLOCK_ASSERT(pcbinfo);
 2525 
 2526         KASSERT(inp->inp_flags & INP_INHASHLIST,
 2527             ("in_pcbrehash: !INP_INHASHLIST"));
 2528 
 2529 #ifdef INET6
 2530         if (inp->inp_vflag & INP_IPV6)
 2531                 head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
 2532                     inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 2533         else
 2534 #endif
 2535                 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
 2536                     inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
 2537 
 2538         CK_LIST_REMOVE(inp, inp_hash);
 2539         CK_LIST_INSERT_HEAD(head, inp, inp_hash);
 2540 }
 2541 
 2542 /*
 2543  * Check for alternatives when higher level complains
 2544  * about service problems.  For now, invalidate cached
 2545  * routing information.  If the route was created dynamically
 2546  * (by a redirect), time to try a default gateway again.
 2547  */
 2548 void
 2549 in_losing(struct inpcb *inp)
 2550 {
 2551 
 2552         RO_INVALIDATE_CACHE(&inp->inp_route);
 2553         return;
 2554 }
 2555 
 2556 /*
 2557  * A set label operation has occurred at the socket layer, propagate the
 2558  * label change into the in_pcb for the socket.
 2559  */
 2560 void
 2561 in_pcbsosetlabel(struct socket *so)
 2562 {
 2563 #ifdef MAC
 2564         struct inpcb *inp;
 2565 
 2566         inp = sotoinpcb(so);
 2567         KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));
 2568 
 2569         INP_WLOCK(inp);
 2570         SOCK_LOCK(so);
 2571         mac_inpcb_sosetlabel(so, inp);
 2572         SOCK_UNLOCK(so);
 2573         INP_WUNLOCK(inp);
 2574 #endif
 2575 }
 2576 
 2577 void
 2578 inp_wlock(struct inpcb *inp)
 2579 {
 2580 
 2581         INP_WLOCK(inp);
 2582 }
 2583 
 2584 void
 2585 inp_wunlock(struct inpcb *inp)
 2586 {
 2587 
 2588         INP_WUNLOCK(inp);
 2589 }
 2590 
 2591 void
 2592 inp_rlock(struct inpcb *inp)
 2593 {
 2594 
 2595         INP_RLOCK(inp);
 2596 }
 2597 
 2598 void
 2599 inp_runlock(struct inpcb *inp)
 2600 {
 2601 
 2602         INP_RUNLOCK(inp);
 2603 }
 2604 
 2605 #ifdef INVARIANT_SUPPORT
 2606 void
 2607 inp_lock_assert(struct inpcb *inp)
 2608 {
 2609 
 2610         INP_WLOCK_ASSERT(inp);
 2611 }
 2612 
 2613 void
 2614 inp_unlock_assert(struct inpcb *inp)
 2615 {
 2616 
 2617         INP_UNLOCK_ASSERT(inp);
 2618 }
 2619 #endif
 2620 
 2621 void
 2622 inp_apply_all(struct inpcbinfo *pcbinfo,
 2623     void (*func)(struct inpcb *, void *), void *arg)
 2624 {
 2625         struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
 2626             INPLOOKUP_WLOCKPCB);
 2627         struct inpcb *inp;
 2628 
 2629         while ((inp = inp_next(&inpi)) != NULL)
 2630                 func(inp, arg);
 2631 }
 2632 
 2633 struct socket *
 2634 inp_inpcbtosocket(struct inpcb *inp)
 2635 {
 2636 
 2637         INP_WLOCK_ASSERT(inp);
 2638         return (inp->inp_socket);
 2639 }
 2640 
 2641 struct tcpcb *
 2642 inp_inpcbtotcpcb(struct inpcb *inp)
 2643 {
 2644 
 2645         INP_WLOCK_ASSERT(inp);
 2646         return ((struct tcpcb *)inp->inp_ppcb);
 2647 }
 2648 
 2649 int
 2650 inp_ip_tos_get(const struct inpcb *inp)
 2651 {
 2652 
 2653         return (inp->inp_ip_tos);
 2654 }
 2655 
 2656 void
 2657 inp_ip_tos_set(struct inpcb *inp, int val)
 2658 {
 2659 
 2660         inp->inp_ip_tos = val;
 2661 }
 2662 
 2663 void
 2664 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
 2665     uint32_t *faddr, uint16_t *fp)
 2666 {
 2667 
 2668         INP_LOCK_ASSERT(inp);
 2669         *laddr = inp->inp_laddr.s_addr;
 2670         *faddr = inp->inp_faddr.s_addr;
 2671         *lp = inp->inp_lport;
 2672         *fp = inp->inp_fport;
 2673 }
 2674 
 2675 struct inpcb *
 2676 so_sotoinpcb(struct socket *so)
 2677 {
 2678 
 2679         return (sotoinpcb(so));
 2680 }
 2681 
 2682 /*
 2683  * Create an external-format (``xinpcb'') structure using the information in
 2684  * the kernel-format in_pcb structure pointed to by inp.  This is done to
 2685  * reduce the spew of irrelevant information over this interface, to isolate
 2686  * user code from changes in the kernel structure, and potentially to provide
 2687  * information-hiding if we decide that some of this information should be
 2688  * hidden from users.
 2689  */
 2690 void
 2691 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
 2692 {
 2693 
 2694         bzero(xi, sizeof(*xi));
 2695         xi->xi_len = sizeof(struct xinpcb);
 2696         if (inp->inp_socket)
 2697                 sotoxsocket(inp->inp_socket, &xi->xi_socket);
 2698         bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
 2699         xi->inp_gencnt = inp->inp_gencnt;
 2700         xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
 2701         xi->inp_flow = inp->inp_flow;
 2702         xi->inp_flowid = inp->inp_flowid;
 2703         xi->inp_flowtype = inp->inp_flowtype;
 2704         xi->inp_flags = inp->inp_flags;
 2705         xi->inp_flags2 = inp->inp_flags2;
 2706         xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
 2707         xi->in6p_cksum = inp->in6p_cksum;
 2708         xi->in6p_hops = inp->in6p_hops;
 2709         xi->inp_ip_tos = inp->inp_ip_tos;
 2710         xi->inp_vflag = inp->inp_vflag;
 2711         xi->inp_ip_ttl = inp->inp_ip_ttl;
 2712         xi->inp_ip_p = inp->inp_ip_p;
 2713         xi->inp_ip_minttl = inp->inp_ip_minttl;
 2714 }
 2715 
 2716 int
 2717 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
 2718     int (*ctloutput_set)(struct inpcb *, struct sockopt *))
 2719 {
 2720         struct sockopt sopt;
 2721         struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
 2722             INPLOOKUP_WLOCKPCB);
 2723         struct inpcb *inp;
 2724         struct sockopt_parameters *params;
 2725         struct socket *so;
 2726         int error;
 2727         char buf[1024];
 2728 
 2729         if (req->oldptr != NULL || req->oldlen != 0)
 2730                 return (EINVAL);
 2731         if (req->newptr == NULL)
 2732                 return (EPERM);
 2733         if (req->newlen > sizeof(buf))
 2734                 return (ENOMEM);
 2735         error = SYSCTL_IN(req, buf, req->newlen);
 2736         if (error != 0)
 2737                 return (error);
 2738         if (req->newlen < sizeof(struct sockopt_parameters))
 2739                 return (EINVAL);
 2740         params = (struct sockopt_parameters *)buf;
 2741         sopt.sopt_level = params->sop_level;
 2742         sopt.sopt_name = params->sop_optname;
 2743         sopt.sopt_dir = SOPT_SET;
 2744         sopt.sopt_val = params->sop_optval;
 2745         sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters);
 2746         sopt.sopt_td = NULL;
 2747 #ifdef INET6
 2748         if (params->sop_inc.inc_flags & INC_ISIPV6) {
 2749                 if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr))
 2750                         params->sop_inc.inc6_laddr.s6_addr16[1] =
 2751                             htons(params->sop_inc.inc6_zoneid & 0xffff);
 2752                 if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr))
 2753                         params->sop_inc.inc6_faddr.s6_addr16[1] =
 2754                             htons(params->sop_inc.inc6_zoneid & 0xffff);
 2755         }
 2756 #endif
 2757         if (params->sop_inc.inc_lport != htons(0)) {
 2758                 if (params->sop_inc.inc_fport == htons(0))
 2759                         inpi.hash = INP_PCBHASH_WILD(params->sop_inc.inc_lport,
 2760                             pcbinfo->ipi_hashmask);
 2761                 else
 2762 #ifdef INET6
 2763                         if (params->sop_inc.inc_flags & INC_ISIPV6)
 2764                                 inpi.hash = INP6_PCBHASH(
 2765                                     &params->sop_inc.inc6_faddr,
 2766                                     params->sop_inc.inc_lport,
 2767                                     params->sop_inc.inc_fport,
 2768                                     pcbinfo->ipi_hashmask);
 2769                         else
 2770 #endif
 2771                                 inpi.hash = INP_PCBHASH(
 2772                                     &params->sop_inc.inc_faddr,
 2773                                     params->sop_inc.inc_lport,
 2774                                     params->sop_inc.inc_fport,
 2775                                     pcbinfo->ipi_hashmask);
 2776         }
 2777         while ((inp = inp_next(&inpi)) != NULL)
 2778                 if (inp->inp_gencnt == params->sop_id) {
 2779                         if (inp->inp_flags & INP_DROPPED) {
 2780                                 INP_WUNLOCK(inp);
 2781                                 return (ECONNRESET);
 2782                         }
 2783                         so = inp->inp_socket;
 2784                         KASSERT(so != NULL, ("inp_socket == NULL"));
 2785                         soref(so);
 2786                         error = (*ctloutput_set)(inp, &sopt);
 2787                         sorele(so);
 2788                         break;
 2789                 }
 2790         if (inp == NULL)
 2791                 error = ESRCH;
 2792         return (error);
 2793 }
 2794 
 2795 #ifdef DDB
 2796 static void
 2797 db_print_indent(int indent)
 2798 {
 2799         int i;
 2800 
 2801         for (i = 0; i < indent; i++)
 2802                 db_printf(" ");
 2803 }
 2804 
 2805 static void
 2806 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
 2807 {
 2808         char faddr_str[48], laddr_str[48];
 2809 
 2810         db_print_indent(indent);
 2811         db_printf("%s at %p\n", name, inc);
 2812 
 2813         indent += 2;
 2814 
 2815 #ifdef INET6
 2816         if (inc->inc_flags & INC_ISIPV6) {
 2817                 /* IPv6. */
 2818                 ip6_sprintf(laddr_str, &inc->inc6_laddr);
 2819                 ip6_sprintf(faddr_str, &inc->inc6_faddr);
 2820         } else
 2821 #endif
 2822         {
 2823                 /* IPv4. */
 2824                 inet_ntoa_r(inc->inc_laddr, laddr_str);
 2825                 inet_ntoa_r(inc->inc_faddr, faddr_str);
 2826         }
 2827         db_print_indent(indent);
 2828         db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
 2829             ntohs(inc->inc_lport));
 2830         db_print_indent(indent);
 2831         db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
 2832             ntohs(inc->inc_fport));
 2833 }
 2834 
 2835 static void
 2836 db_print_inpflags(int inp_flags)
 2837 {
 2838         int comma;
 2839 
 2840         comma = 0;
 2841         if (inp_flags & INP_RECVOPTS) {
 2842                 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
 2843                 comma = 1;
 2844         }
 2845         if (inp_flags & INP_RECVRETOPTS) {
 2846                 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
 2847                 comma = 1;
 2848         }
 2849         if (inp_flags & INP_RECVDSTADDR) {
 2850                 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
 2851                 comma = 1;
 2852         }
 2853         if (inp_flags & INP_ORIGDSTADDR) {
 2854                 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
 2855                 comma = 1;
 2856         }
 2857         if (inp_flags & INP_HDRINCL) {
 2858                 db_printf("%sINP_HDRINCL", comma ? ", " : "");
 2859                 comma = 1;
 2860         }
 2861         if (inp_flags & INP_HIGHPORT) {
 2862                 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
 2863                 comma = 1;
 2864         }
 2865         if (inp_flags & INP_LOWPORT) {
 2866                 db_printf("%sINP_LOWPORT", comma ? ", " : "");
 2867                 comma = 1;
 2868         }
 2869         if (inp_flags & INP_ANONPORT) {
 2870                 db_printf("%sINP_ANONPORT", comma ? ", " : "");
 2871                 comma = 1;
 2872         }
 2873         if (inp_flags & INP_RECVIF) {
 2874                 db_printf("%sINP_RECVIF", comma ? ", " : "");
 2875                 comma = 1;
 2876         }
 2877         if (inp_flags & INP_MTUDISC) {
 2878                 db_printf("%sINP_MTUDISC", comma ? ", " : "");
 2879                 comma = 1;
 2880         }
 2881         if (inp_flags & INP_RECVTTL) {
 2882                 db_printf("%sINP_RECVTTL", comma ? ", " : "");
 2883                 comma = 1;
 2884         }
 2885         if (inp_flags & INP_DONTFRAG) {
 2886                 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
 2887                 comma = 1;
 2888         }
 2889         if (inp_flags & INP_RECVTOS) {
 2890                 db_printf("%sINP_RECVTOS", comma ? ", " : "");
 2891                 comma = 1;
 2892         }
 2893         if (inp_flags & IN6P_IPV6_V6ONLY) {
 2894                 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
 2895                 comma = 1;
 2896         }
 2897         if (inp_flags & IN6P_PKTINFO) {
 2898                 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
 2899                 comma = 1;
 2900         }
 2901         if (inp_flags & IN6P_HOPLIMIT) {
 2902                 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
 2903                 comma = 1;
 2904         }
 2905         if (inp_flags & IN6P_HOPOPTS) {
 2906                 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
 2907                 comma = 1;
 2908         }
 2909         if (inp_flags & IN6P_DSTOPTS) {
 2910                 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
 2911                 comma = 1;
 2912         }
 2913         if (inp_flags & IN6P_RTHDR) {
 2914                 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
 2915                 comma = 1;
 2916         }
 2917         if (inp_flags & IN6P_RTHDRDSTOPTS) {
 2918                 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
 2919                 comma = 1;
 2920         }
 2921         if (inp_flags & IN6P_TCLASS) {
 2922                 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
 2923                 comma = 1;
 2924         }
 2925         if (inp_flags & IN6P_AUTOFLOWLABEL) {
 2926                 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
 2927                 comma = 1;
 2928         }
 2929         if (inp_flags & INP_ONESBCAST) {
 2930                 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
 2931                 comma  = 1;
 2932         }
 2933         if (inp_flags & INP_DROPPED) {
 2934                 db_printf("%sINP_DROPPED", comma ? ", " : "");
 2935                 comma  = 1;
 2936         }
 2937         if (inp_flags & INP_SOCKREF) {
 2938                 db_printf("%sINP_SOCKREF", comma ? ", " : "");
 2939                 comma  = 1;
 2940         }
 2941         if (inp_flags & IN6P_RFC2292) {
 2942                 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
 2943                 comma = 1;
 2944         }
 2945         if (inp_flags & IN6P_MTU) {
 2946                 db_printf("IN6P_MTU%s", comma ? ", " : "");
 2947                 comma = 1;
 2948         }
 2949 }
 2950 
 2951 static void
 2952 db_print_inpvflag(u_char inp_vflag)
 2953 {
 2954         int comma;
 2955 
 2956         comma = 0;
 2957         if (inp_vflag & INP_IPV4) {
 2958                 db_printf("%sINP_IPV4", comma ? ", " : "");
 2959                 comma  = 1;
 2960         }
 2961         if (inp_vflag & INP_IPV6) {
 2962                 db_printf("%sINP_IPV6", comma ? ", " : "");
 2963                 comma  = 1;
 2964         }
 2965         if (inp_vflag & INP_IPV6PROTO) {
 2966                 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
 2967                 comma  = 1;
 2968         }
 2969 }
 2970 
 2971 static void
 2972 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
 2973 {
 2974 
 2975         db_print_indent(indent);
 2976         db_printf("%s at %p\n", name, inp);
 2977 
 2978         indent += 2;
 2979 
 2980         db_print_indent(indent);
 2981         db_printf("inp_flow: 0x%x\n", inp->inp_flow);
 2982 
 2983         db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
 2984 
 2985         db_print_indent(indent);
 2986         db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
 2987             inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
 2988 
 2989         db_print_indent(indent);
 2990         db_printf("inp_label: %p   inp_flags: 0x%x (",
 2991            inp->inp_label, inp->inp_flags);
 2992         db_print_inpflags(inp->inp_flags);
 2993         db_printf(")\n");
 2994 
 2995         db_print_indent(indent);
 2996         db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
 2997             inp->inp_vflag);
 2998         db_print_inpvflag(inp->inp_vflag);
 2999         db_printf(")\n");
 3000 
 3001         db_print_indent(indent);
 3002         db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
 3003             inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
 3004 
 3005         db_print_indent(indent);
 3006 #ifdef INET6
 3007         if (inp->inp_vflag & INP_IPV6) {
 3008                 db_printf("in6p_options: %p   in6p_outputopts: %p   "
 3009                     "in6p_moptions: %p\n", inp->in6p_options,
 3010                     inp->in6p_outputopts, inp->in6p_moptions);
 3011                 db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
 3012                     "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
 3013                     inp->in6p_hops);
 3014         } else
 3015 #endif
 3016         {
 3017                 db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
 3018                     "inp_ip_moptions: %p\n", inp->inp_ip_tos,
 3019                     inp->inp_options, inp->inp_moptions);
 3020         }
 3021 
 3022         db_print_indent(indent);
 3023         db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
 3024             (uintmax_t)inp->inp_gencnt);
 3025 }
 3026 
 3027 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
 3028 {
 3029         struct inpcb *inp;
 3030 
 3031         if (!have_addr) {
 3032                 db_printf("usage: show inpcb <addr>\n");
 3033                 return;
 3034         }
 3035         inp = (struct inpcb *)addr;
 3036 
 3037         db_print_inpcb(inp, "inpcb", 0);
 3038 }
 3039 #endif /* DDB */
 3040 
 3041 #ifdef RATELIMIT
 3042 /*
 3043  * Modify TX rate limit based on the existing "inp->inp_snd_tag",
 3044  * if any.
 3045  */
 3046 int
 3047 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
 3048 {
 3049         union if_snd_tag_modify_params params = {
 3050                 .rate_limit.max_rate = max_pacing_rate,
 3051                 .rate_limit.flags = M_NOWAIT,
 3052         };
 3053         struct m_snd_tag *mst;
 3054         int error;
 3055 
 3056         mst = inp->inp_snd_tag;
 3057         if (mst == NULL)
 3058                 return (EINVAL);
 3059 
 3060         if (mst->sw->snd_tag_modify == NULL) {
 3061                 error = EOPNOTSUPP;
 3062         } else {
 3063                 error = mst->sw->snd_tag_modify(mst, &params);
 3064         }
 3065         return (error);
 3066 }
 3067 
 3068 /*
 3069  * Query existing TX rate limit based on the existing
 3070  * "inp->inp_snd_tag", if any.
 3071  */
 3072 int
 3073 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
 3074 {
 3075         union if_snd_tag_query_params params = { };
 3076         struct m_snd_tag *mst;
 3077         int error;
 3078 
 3079         mst = inp->inp_snd_tag;
 3080         if (mst == NULL)
 3081                 return (EINVAL);
 3082 
 3083         if (mst->sw->snd_tag_query == NULL) {
 3084                 error = EOPNOTSUPP;
 3085         } else {
 3086                 error = mst->sw->snd_tag_query(mst, &params);
 3087                 if (error == 0 && p_max_pacing_rate != NULL)
 3088                         *p_max_pacing_rate = params.rate_limit.max_rate;
 3089         }
 3090         return (error);
 3091 }
 3092 
 3093 /*
 3094  * Query existing TX queue level based on the existing
 3095  * "inp->inp_snd_tag", if any.
 3096  */
 3097 int
 3098 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
 3099 {
 3100         union if_snd_tag_query_params params = { };
 3101         struct m_snd_tag *mst;
 3102         int error;
 3103 
 3104         mst = inp->inp_snd_tag;
 3105         if (mst == NULL)
 3106                 return (EINVAL);
 3107 
 3108         if (mst->sw->snd_tag_query == NULL)
 3109                 return (EOPNOTSUPP);
 3110 
 3111         error = mst->sw->snd_tag_query(mst, &params);
 3112         if (error == 0 && p_txqueue_level != NULL)
 3113                 *p_txqueue_level = params.rate_limit.queue_level;
 3114         return (error);
 3115 }
 3116 
 3117 /*
 3118  * Allocate a new TX rate limit send tag from the network interface
 3119  * given by the "ifp" argument and save it in "inp->inp_snd_tag":
 3120  */
 3121 int
 3122 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
 3123     uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
 3124 
 3125 {
 3126         union if_snd_tag_alloc_params params = {
 3127                 .rate_limit.hdr.type = (max_pacing_rate == -1U) ?
 3128                     IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
 3129                 .rate_limit.hdr.flowid = flowid,
 3130                 .rate_limit.hdr.flowtype = flowtype,
 3131                 .rate_limit.hdr.numa_domain = inp->inp_numa_domain,
 3132                 .rate_limit.max_rate = max_pacing_rate,
 3133                 .rate_limit.flags = M_NOWAIT,
 3134         };
 3135         int error;
 3136 
 3137         INP_WLOCK_ASSERT(inp);
 3138 
 3139         /*
 3140          * If there is already a send tag, or the INP is being torn
 3141          * down, allocating a new send tag is not allowed. Else send
 3142          * tags may leak.
 3143          */
 3144         if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
 3145                 return (EINVAL);
 3146 
 3147         error = m_snd_tag_alloc(ifp, &params, st);
 3148 #ifdef INET
 3149         if (error == 0) {
 3150                 counter_u64_add(rate_limit_set_ok, 1);
 3151                 counter_u64_add(rate_limit_active, 1);
 3152         } else if (error != EOPNOTSUPP)
 3153                   counter_u64_add(rate_limit_alloc_fail, 1);
 3154 #endif
 3155         return (error);
 3156 }
 3157 
 3158 void
 3159 in_pcbdetach_tag(struct m_snd_tag *mst)
 3160 {
 3161 
 3162         m_snd_tag_rele(mst);
 3163 #ifdef INET
 3164         counter_u64_add(rate_limit_active, -1);
 3165 #endif
 3166 }
 3167 
 3168 /*
 3169  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
 3170  * if any:
 3171  */
 3172 void
 3173 in_pcbdetach_txrtlmt(struct inpcb *inp)
 3174 {
 3175         struct m_snd_tag *mst;
 3176 
 3177         INP_WLOCK_ASSERT(inp);
 3178 
 3179         mst = inp->inp_snd_tag;
 3180         inp->inp_snd_tag = NULL;
 3181 
 3182         if (mst == NULL)
 3183                 return;
 3184 
 3185         m_snd_tag_rele(mst);
 3186 #ifdef INET
 3187         counter_u64_add(rate_limit_active, -1);
 3188 #endif
 3189 }
 3190 
 3191 int
 3192 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
 3193 {
 3194         int error;
 3195 
 3196         /*
 3197          * If the existing send tag is for the wrong interface due to
 3198          * a route change, first drop the existing tag.  Set the
 3199          * CHANGED flag so that we will keep trying to allocate a new
 3200          * tag if we fail to allocate one this time.
 3201          */
 3202         if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
 3203                 in_pcbdetach_txrtlmt(inp);
 3204                 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 3205         }
 3206 
 3207         /*
 3208          * NOTE: When attaching to a network interface a reference is
 3209          * made to ensure the network interface doesn't go away until
 3210          * all ratelimit connections are gone. The network interface
 3211          * pointers compared below represent valid network interfaces,
 3212          * except when comparing towards NULL.
 3213          */
 3214         if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
 3215                 error = 0;
 3216         } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
 3217                 if (inp->inp_snd_tag != NULL)
 3218                         in_pcbdetach_txrtlmt(inp);
 3219                 error = 0;
 3220         } else if (inp->inp_snd_tag == NULL) {
 3221                 /*
 3222                  * In order to utilize packet pacing with RSS, we need
 3223                  * to wait until there is a valid RSS hash before we
 3224                  * can proceed:
 3225                  */
 3226                 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
 3227                         error = EAGAIN;
 3228                 } else {
 3229                         error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
 3230                             mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
 3231                 }
 3232         } else {
 3233                 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
 3234         }
 3235         if (error == 0 || error == EOPNOTSUPP)
 3236                 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
 3237 
 3238         return (error);
 3239 }
 3240 
 3241 /*
 3242  * This function should be called when the INP_RATE_LIMIT_CHANGED flag
 3243  * is set in the fast path and will attach/detach/modify the TX rate
 3244  * limit send tag based on the socket's so_max_pacing_rate value.
 3245  */
 3246 void
 3247 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
 3248 {
 3249         struct socket *socket;
 3250         uint32_t max_pacing_rate;
 3251         bool did_upgrade;
 3252 
 3253         if (inp == NULL)
 3254                 return;
 3255 
 3256         socket = inp->inp_socket;
 3257         if (socket == NULL)
 3258                 return;
 3259 
 3260         if (!INP_WLOCKED(inp)) {
 3261                 /*
 3262                  * NOTE: If the write locking fails, we need to bail
 3263                  * out and use the non-ratelimited ring for the
 3264                  * transmit until there is a new chance to get the
 3265                  * write lock.
 3266                  */
 3267                 if (!INP_TRY_UPGRADE(inp))
 3268                         return;
 3269                 did_upgrade = 1;
 3270         } else {
 3271                 did_upgrade = 0;
 3272         }
 3273 
 3274         /*
 3275          * NOTE: The so_max_pacing_rate value is read unlocked,
 3276          * because atomic updates are not required since the variable
 3277          * is checked at every mbuf we send. It is assumed that the
 3278          * variable read itself will be atomic.
 3279          */
 3280         max_pacing_rate = socket->so_max_pacing_rate;
 3281 
 3282         in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
 3283 
 3284         if (did_upgrade)
 3285                 INP_DOWNGRADE(inp);
 3286 }
 3287 
 3288 /*
 3289  * Track route changes for TX rate limiting.
 3290  */
 3291 void
 3292 in_pcboutput_eagain(struct inpcb *inp)
 3293 {
 3294         bool did_upgrade;
 3295 
 3296         if (inp == NULL)
 3297                 return;
 3298 
 3299         if (inp->inp_snd_tag == NULL)
 3300                 return;
 3301 
 3302         if (!INP_WLOCKED(inp)) {
 3303                 /*
 3304                  * NOTE: If the write locking fails, we need to bail
 3305                  * out and use the non-ratelimited ring for the
 3306                  * transmit until there is a new chance to get the
 3307                  * write lock.
 3308                  */
 3309                 if (!INP_TRY_UPGRADE(inp))
 3310                         return;
 3311                 did_upgrade = 1;
 3312         } else {
 3313                 did_upgrade = 0;
 3314         }
 3315 
 3316         /* detach rate limiting */
 3317         in_pcbdetach_txrtlmt(inp);
 3318 
 3319         /* make sure new mbuf send tag allocation is made */
 3320         inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 3321 
 3322         if (did_upgrade)
 3323                 INP_DOWNGRADE(inp);
 3324 }
 3325 
 3326 #ifdef INET
 3327 static void
 3328 rl_init(void *st)
 3329 {
 3330         rate_limit_new = counter_u64_alloc(M_WAITOK);
 3331         rate_limit_chg = counter_u64_alloc(M_WAITOK);
 3332         rate_limit_active = counter_u64_alloc(M_WAITOK);
 3333         rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
 3334         rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
 3335 }
 3336 
 3337 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
 3338 #endif
 3339 #endif /* RATELIMIT */

Cache object: dbbf85f428682c1048b43f44f8da4f7a


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.