The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/in_pcb.h

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-3-Clause
    3  *
    4  * Copyright (c) 1982, 1986, 1990, 1993
    5  *      The Regents of the University of California.
    6  * Copyright (c) 2010-2011 Juniper Networks, Inc.
    7  * All rights reserved.
    8  *
    9  * Portions of this software were developed by Robert N. M. Watson under
   10  * contract to Juniper Networks, Inc.
   11  *
   12  * Redistribution and use in source and binary forms, with or without
   13  * modification, are permitted provided that the following conditions
   14  * are met:
   15  * 1. Redistributions of source code must retain the above copyright
   16  *    notice, this list of conditions and the following disclaimer.
   17  * 2. Redistributions in binary form must reproduce the above copyright
   18  *    notice, this list of conditions and the following disclaimer in the
   19  *    documentation and/or other materials provided with the distribution.
   20  * 3. Neither the name of the University nor the names of its contributors
   21  *    may be used to endorse or promote products derived from this software
   22  *    without specific prior written permission.
   23  *
   24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
   25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
   28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   34  * SUCH DAMAGE.
   35  *
   36  *      @(#)in_pcb.h    8.1 (Berkeley) 6/10/93
   37  * $FreeBSD$
   38  */
   39 
   40 #ifndef _NETINET_IN_PCB_H_
   41 #define _NETINET_IN_PCB_H_
   42 
   43 #include <sys/queue.h>
   44 #include <sys/epoch.h>
   45 #include <sys/_lock.h>
   46 #include <sys/_mutex.h>
   47 #include <sys/_rwlock.h>
   48 #include <net/route.h>
   49 
   50 #ifdef _KERNEL
   51 #include <sys/lock.h>
   52 #include <sys/proc.h>
   53 #include <sys/rwlock.h>
   54 #include <sys/smr.h>
   55 #include <sys/sysctl.h>
   56 #include <net/vnet.h>
   57 #include <vm/uma.h>
   58 #endif
   59 #include <sys/ck.h>
   60 
   61 /*
   62  * struct inpcb is the common protocol control block structure used in most
   63  * IP transport protocols.
   64  *
   65  * Pointers to local and foreign host table entries, local and foreign socket
   66  * numbers, and pointers up (to a socket structure) and down (to a
   67  * protocol-specific control block) are stored here.
   68  */
   69 CK_LIST_HEAD(inpcbhead, inpcb);                 /* struct inpcbhead: list head of struct inpcb */
   70 CK_LIST_HEAD(inpcbporthead, inpcbport);         /* struct inpcbporthead: list head of struct inpcbport */
   71 CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup);   /* struct inpcblbgrouphead: list head of struct inpcblbgroup */
   72 typedef uint64_t        inp_gen_t;              /* type of pcb generation counts */
   73 
   74 /*
   75  * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet.
   76  * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing
   77  * the following structure.  This requires padding always be zeroed out,
   78  * which is done right after inpcb allocation and stays through its lifetime.
   79  */
   80 struct in_addr_4in6 {
   81         u_int32_t       ia46_pad32[3];  /* 96 bits of padding; must stay zeroed */
   82         struct  in_addr ia46_addr4;     /* IPv4 address, placed after the padding */
   83 };
   84 
   85 union in_dependaddr {
   86         struct in_addr_4in6 id46_addr;  /* IPv4 view: zero padding + in_addr */
   87         struct in6_addr id6_addr;       /* IPv6 view of the same storage */
   88 };
   89 
   90 /*
   91  * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553.  in_conninfo has
   92  * some extra padding to accomplish this.
   93  * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport,
   94  * lport, faddr to generate hash, so these fields shouldn't be moved.
   95  */
   96 struct in_endpoints {
   97         u_int16_t       ie_fport;               /* foreign port */
   98         u_int16_t       ie_lport;               /* local port */
   99         /* protocol dependent part, local and foreign addr */
  100         union in_dependaddr ie_dependfaddr;     /* foreign host table entry */
  101         union in_dependaddr ie_dependladdr;     /* local host table entry */
  102 #define ie_faddr        ie_dependfaddr.id46_addr.ia46_addr4     /* IPv4 foreign addr */
  103 #define ie_laddr        ie_dependladdr.id46_addr.ia46_addr4     /* IPv4 local addr */
  104 #define ie6_faddr       ie_dependfaddr.id6_addr                 /* IPv6 foreign addr */
  105 #define ie6_laddr       ie_dependladdr.id6_addr                 /* IPv6 local addr */
  106         u_int32_t       ie6_zoneid;             /* scope zone id */
  107 };
  108 
  109 /*
  110  * XXX The defines for inc_* are hacks and should be changed to direct
  111  * references.
  112  */
  113 struct in_conninfo {
  114         u_int8_t        inc_flags;      /* INC_* flags */
  115         u_int8_t        inc_len;
  116         u_int16_t       inc_fibnum;     /* XXX was pad, 16 bits is plenty */
  117         /* protocol dependent part */
  118         struct  in_endpoints inc_ie;    /* ports, addresses and zone id */
  119 };
  120 
  121 /*
  122  * Flags for inc_flags.
  123  */
  124 #define INC_ISIPV6      0x01
  125 #define INC_IPV6MINMTU  0x02
  126 
  127 #define inc_fport       inc_ie.ie_fport         /* foreign port */
  128 #define inc_lport       inc_ie.ie_lport         /* local port */
  129 #define inc_faddr       inc_ie.ie_faddr         /* IPv4 foreign addr */
  130 #define inc_laddr       inc_ie.ie_laddr         /* IPv4 local addr */
  131 #define inc6_faddr      inc_ie.ie6_faddr        /* IPv6 foreign addr */
  132 #define inc6_laddr      inc_ie.ie6_laddr        /* IPv6 local addr */
  133 #define inc6_zoneid     inc_ie.ie6_zoneid       /* scope zone id */
  134 
  135 #if defined(_KERNEL) || defined(_WANT_INPCB)
  136 /*
  137  * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and
  138  * IPv6 sockets.  In the case of TCP and UDP, further per-connection state is
  139  * hung off of inp_ppcb most of the time.  Almost all fields of struct inpcb
  140  * are static after creation or protected by a per-inpcb rwlock, inp_lock.
  141  *
  142  * An inpcb database is indexed by an addresses/ports hash as well as by a list
  143  * of all pcbs that belong to a certain proto. Database lookups or list
  144  * traversals are to be performed inside an SMR section. Once the desired PCB
  145  * is found, its own lock is to be obtained and the SMR section exited.
  146  *
  147  * Key:
  148  * (b) - Protected by the hpts lock.
  149  * (c) - Constant after initialization
  150  * (e) - Protected by the SMR section
  151  * (i) - Protected by the inpcb lock
  152  * (p) - Protected by the pcbinfo lock for the inpcb
  153  * (h) - Protected by the pcbhash lock for the inpcb
  154  * (s) - Protected by another subsystem's locks
  155  * (x) - Undefined locking
  156  *
  157  * Notes on the tcp_hpts:
  158  *
  159  * First Hpts lock order is
  160  * 1) INP_WLOCK()
  161  * 2) HPTS_LOCK() i.e. hpts->pmtx
  162  *
  163  * To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
  164  * You may check the inp->inp_in_hpts flag without the hpts lock.
  165  * The hpts is the only one that will clear this flag holding
  166  * only the hpts lock. This means that in your tcp_output()
  167  * routine when you test for the inp_in_hpts flag to be 1
  168  * it may be transitioning to 0 (by the hpts).
  169  * That's ok since that will just mean an extra call to tcp_output
  170  * that most likely will find the call you executed
  171  * (when the mis-match occurred) will have put the TCB back
  172  * on the hpts and it will return. If your
  173  * call did not add the inp back to the hpts then you will either
  174  * over-send or the cwnd will block you from sending more.
  175  *
  176  * Note you should also be holding the INP_WLOCK() when you
  177  * call the remove from the hpts as well. Though usually
  178  * you are either doing this from a timer, where you need and have
  179  * the INP_WLOCK() or from destroying your TCB where again
  180  * you should already have the INP_WLOCK().
  181  *
  182  * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
  183  * inp_input_cpu_set fields are controlled completely by
  184  * the hpts. Do not ever set these. The inp_hpts_cpu_set
  185  * and inp_input_cpu_set fields indicate if the hpts has
  186  * setup the respective cpu field. It is advised if this
  187  * field is 0, to enqueue the packet with the appropriate
  188  * hpts_immediate() call. If the _set field is 1, then
  189  * you may compare the inp_*_cpu field to the curcpu and
  190  * may want to again insert onto the hpts if these fields
  191  * are not equal (i.e. you are not on the expected CPU).
  192  *
  193  * A note on inp_hpts_calls and inp_input_calls, these
  194  * flags are set when the hpts calls either the output
  195  * or do_segment routines respectively. If the routine
  196  * being called wants to use this, then it needs to
  197  * clear the flag before returning. The hpts will not
  198  * clear the flag. The flags can be used to tell if
  199  * the hpts is the function calling the respective
  200  * routine.
  201  *
  202  * A few other notes:
  203  *
  204  * When a read lock is held, stability of the field is guaranteed; to write
  205  * to a field, a write lock must generally be held.
  206  *
  207  * netinet/netinet6-layer code should not assume that the inp_socket pointer
  208  * is safe to dereference without inp_lock being held, there may be
  209  * close(2)-related races.
  210  *
  211  * The inp_vflag field is overloaded, and would otherwise ideally be (c).
  212  */
  213 struct icmp6_filter;            /* only used by pointer below */
  214 struct inpcbpolicy;             /* only used by pointer below */
  215 struct m_snd_tag;               /* only used by pointer below */
  216 struct inpcb {
  217         /* Cache line #1 (amd64) */
  218         CK_LIST_ENTRY(inpcb) inp_hash;  /* (w:h/r:e)  hash list */
  219         struct rwlock   inp_lock;       /* the per-inpcb rwlock */
  220         /* Cache line #2 (amd64) */
  221 #define inp_start_zero  inp_hpts        /* first field of the span sized by inp_zero_size */
  222 #define inp_zero_size   (sizeof(struct inpcb) - \
  223                             offsetof(struct inpcb, inp_start_zero))
  224         TAILQ_ENTRY(inpcb) inp_hpts;    /* pacing out queue next lock(b) */
  225         uint32_t inp_hpts_gencnt;       /* XXXGL */
  226         uint32_t inp_hpts_request;      /* Current hpts request, zero if
  227                                          * fits in the pacing window (i&b). */
  228         /*
  229          * Note the next fields are protected by a
  230          * different lock (hpts-lock). This means that
  231          * they must correspond in size to the smallest
  232          * protectable bit field (uint8_t on x86, and
  233          * other platforms potentially uint32_t?). Also
  234          * since CPU switches can occur at different times the two
  235          * fields can *not* be collapsed into a single bit field.
  236          */
  237 #if defined(__amd64__) || defined(__i386__)
  238         uint8_t inp_in_hpts; /* on output hpts (lock b) */
  239 #else
  240         uint32_t inp_in_hpts; /* on output hpts (lock b) */
  241 #endif
  242         volatile uint16_t  inp_hpts_cpu; /* Lock (i) */
  243         volatile uint16_t  inp_irq_cpu; /* Set by LRO or by the driver */
  244         u_int   inp_refcount;           /* (i) refcount */
  245         int     inp_flags;              /* (i) generic IP/datagram flags */
  246         int     inp_flags2;             /* (i) generic IP/datagram flags #2 */
  247         uint8_t inp_hpts_cpu_set :1,  /* on output hpts (i) */
  248                          inp_hpts_calls :1,     /* (i) from output hpts */
  249                          inp_irq_cpu_set :1,    /* (i) from LRO/Driver */
  250                          inp_spare_bits2 : 3;
  251         uint8_t inp_numa_domain;        /* numa domain */
  252         void    *inp_ppcb;              /* (i) pointer to per-protocol pcb */
  253         struct  socket *inp_socket;     /* (i) back pointer to socket */
  254         int32_t          inp_hptsslot;  /* Hpts wheel slot this tcb is on; lock (i&b) */
  255         uint32_t         inp_hpts_drop_reas;    /* reason we are dropping the PCB (lock i&b) */
  256         struct  inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
  257         struct  ucred   *inp_cred;      /* (c) cache of socket cred */
  258         u_int32_t inp_flow;             /* (i) IPv6 flow information */
  259         u_char  inp_vflag;              /* (i) IP version flag (v4/v6) */
  260         u_char  inp_ip_ttl;             /* (i) time to live proto */
  261         u_char  inp_ip_p;               /* (c) protocol proto */
  262         u_char  inp_ip_minttl;          /* (i) minimum TTL or drop */
  263         uint32_t inp_flowid;            /* (x) flow id / queue id */
  264         struct m_snd_tag *inp_snd_tag;  /* (i) send tag for outgoing mbufs */
  265         uint32_t inp_flowtype;          /* (x) M_HASHTYPE value */
  266         uint32_t inp_rss_listen_bucket; /* (x) overridden RSS listen bucket */
  267 
  268         /* Local and foreign ports, local and foreign addr. */
  269         struct  in_conninfo inp_inc;    /* (i) ports, addresses, fib number */
  270 
  271         /* MAC and IPSEC policy information. */
  272         struct  label *inp_label;       /* (i) MAC label */
  273         struct  inpcbpolicy *inp_sp;    /* (s) for IPSEC */
  274 
  275         /* Protocol-dependent part; options. */
  276         struct {                        /* IPv4 options (anonymous struct) */
  277                 u_char  inp_ip_tos;             /* (i) type of service proto */
  278                 struct mbuf             *inp_options;   /* (i) IP options */
  279                 struct ip_moptions      *inp_moptions;  /* (i) mcast options */
  280         };
  281         struct {                        /* IPv6 options (anonymous struct) */
  282                 /* (i) IP options */
  283                 struct mbuf             *in6p_options;
  284                 /* (i) IP6 options for outgoing packets */
  285                 struct ip6_pktopts      *in6p_outputopts;
  286                 /* (i) IP multicast options */
  287                 struct ip6_moptions     *in6p_moptions;
  288                 /* (i) ICMPv6 code type filter */
  289                 struct icmp6_filter     *in6p_icmp6filt;
  290                 /* (i) IPV6_CHECKSUM setsockopt */
  291                 int     in6p_cksum;
  292                 short   in6p_hops;
  293         };
  294         CK_LIST_ENTRY(inpcb) inp_portlist;      /* (r:e/w:h) port list */
  295         struct  inpcbport *inp_phd;     /* (r:e/w:h) head of this list */
  296         inp_gen_t       inp_gencnt;     /* (c) generation count */
  297         void            *spare_ptr;     /* Spare pointer. */
  298         rt_gen_t        inp_rt_cookie;  /* generation for route entry */
  299         union {                         /* cached L3 information */
  300                 struct route inp_route;
  301                 struct route_in6 inp_route6;
  302         };
  303         CK_LIST_ENTRY(inpcb) inp_list;  /* (r:e/w:p) all PCBs for proto */
  304 };
  305 #endif  /* _KERNEL || _WANT_INPCB */
  306 
  307 #define inp_fport       inp_inc.inc_fport       /* foreign port */
  308 #define inp_lport       inp_inc.inc_lport       /* local port */
  309 #define inp_faddr       inp_inc.inc_faddr       /* IPv4 foreign addr */
  310 #define inp_laddr       inp_inc.inc_laddr       /* IPv4 local addr */
  311 
  312 #define in6p_faddr      inp_inc.inc6_faddr      /* IPv6 foreign addr */
  313 #define in6p_laddr      inp_inc.inc6_laddr      /* IPv6 local addr */
  314 #define in6p_zoneid     inp_inc.inc6_zoneid     /* scope zone id */
  315 
  316 #define inp_vnet        inp_pcbinfo->ipi_vnet   /* network stack instance, via the pcbinfo */
  317 
  318 /*
  319  * The generation count is a 64-bit unsigned value, so its range is about
  320  * 1.8e19.  We would have to create over 500 billion connections per second
  321  * for this number to roll over in a year.  This seems sufficiently unlikely
  322  * that we simply don't concern ourselves with that possibility.
  323  */
  324 
  325 /*
  326  * Interface exported to userland by various protocols which use inpcbs.  Hack
  327  * alert -- only define if struct xsocket is in scope.
  328  * Fields prefixed with "xi_" are unique to this structure, and the rest
  329  * match fields in the struct inpcb, to ease coding and porting.
  330  *
  331  * Legend:
  332  * (s) - used by userland utilities in src
  333  * (p) - used by utilities in ports
  334  * (3) - is known to be used by third party software not in ports
  335  * (n) - no known usage
  336  */
  337 #ifdef _SYS_SOCKETVAR_H_
  338 struct xinpcb {
  339         ksize_t         xi_len;                 /* length of this structure */
  340         struct xsocket  xi_socket;              /* (s,p) */
  341         struct in_conninfo inp_inc;             /* (s,p) */
  342         uint64_t        inp_gencnt;             /* (s,p) */
  343         kvaddr_t        inp_ppcb;               /* (s) netstat(1); void * in struct inpcb */
  344         int64_t         inp_spare64[4];
  345         uint32_t        inp_flow;               /* (s) */
  346         uint32_t        inp_flowid;             /* (s) */
  347         uint32_t        inp_flowtype;           /* (s) */
  348         int32_t         inp_flags;              /* (s,p) */
  349         int32_t         inp_flags2;             /* (s) */
  350         int32_t         inp_rss_listen_bucket;  /* (n) NOTE(review): uint32_t in struct inpcb */
  351         int32_t         in6p_cksum;             /* (n) */
  352         int32_t         inp_spare32[4];
  353         uint16_t        in6p_hops;              /* (n) NOTE(review): short in struct inpcb */
  354         uint8_t         inp_ip_tos;             /* (n) */
  355         int8_t          pad8;
  356         uint8_t         inp_vflag;              /* (s,p) */
  357         uint8_t         inp_ip_ttl;             /* (n) */
  358         uint8_t         inp_ip_p;               /* (n) */
  359         uint8_t         inp_ip_minttl;          /* (n) */
  360         int8_t          inp_spare8[4];
  361 } __aligned(8);
  362 
  363 struct xinpgen {
  364         ksize_t xig_len;        /* length of this structure */
  365         u_int           xig_count;      /* number of PCBs at this time */
  366         uint32_t        _xig_spare32;
  367         inp_gen_t       xig_gen;        /* generation count at this time */
  368         so_gen_t        xig_sogen;      /* socket generation count at this time */
  369         uint64_t        _xig_spare64[4];
  370 } __aligned(8);
  371 
  372 struct sockopt_parameters {
  373         struct in_conninfo sop_inc;     /* presumably identifies the target pcb -- TODO confirm */
  374         uint64_t sop_id;
  375         int sop_level;                  /* presumably the socket option level -- TODO confirm */
  376         int sop_optname;                /* presumably the socket option name -- TODO confirm */
  377         char sop_optval[];              /* flexible array member; size is not recorded here */
  378 };
  379 
  380 #ifdef  _KERNEL
  381 int     sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo,
  382             int (*ctloutput_set)(struct inpcb *, struct sockopt *));     /* sysctl handler taking a per-protocol option setter */
  383 void    in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *);  /* presumably fills the xinpcb from the inpcb -- defined elsewhere */
  384 #endif
  385 #endif /* _SYS_SOCKETVAR_H_ */
  386 
  387 #ifdef _KERNEL
  388 /*
  389  * Per-VNET pcb database for each high-level protocol (UDP, TCP, ...) in both
  390  * IPv4 and IPv6.
  391  *
  392  * The pcbs are protected with SMR section and thus all lists in inpcbinfo
  393  * are CK-lists.  Locking is required to insert a pcb into database. Two
  394  * locks are provided: one for the hash and one for the global list of pcbs,
  395  * as well as overall count and generation count.
  396  *
  397  * Locking key:
  398  *
  399  * (c) Constant or nearly constant after initialisation
  400  * (e) Protected by SMR section
  401  * (g) Locked by ipi_lock
  402  * (h) Locked by ipi_hash_lock
  403  */
  404 struct inpcbinfo {
  405         /*
  406          * Global lock protecting inpcb list modification
  407          */
  408         struct mtx               ipi_lock;
  409         struct inpcbhead         ipi_listhead;          /* (r:e/w:g) */
  410         u_int                    ipi_count;             /* (g) */
  411 
  412         /*
  413          * Generation count -- incremented each time a connection is allocated
  414          * or freed.
  415          */
  416         u_quad_t                 ipi_gencnt;            /* (g) */
  417 
  418         /*
  419          * Fields associated with port lookup and allocation.
  420          */
  421         u_short                  ipi_lastport;          /* (h) */
  422         u_short                  ipi_lastlow;           /* (h) */
  423         u_short                  ipi_lasthi;            /* (h) */
  424 
  425         /*
  426          * UMA zones (inpcb and port) and the SMR handle for this protocol.
  427          */
  428         uma_zone_t               ipi_zone;              /* (c) */
  429         uma_zone_t               ipi_portzone;          /* (c) */
  430         smr_t                    ipi_smr;               /* (c) */
  431 
  432         /*
  433          * Global hash of inpcbs, hashed by local and foreign addresses and
  434          * port numbers.  ipi_hash_lock serialises writers of all hashes below.
  435          */
  436         struct mtx               ipi_hash_lock;
  437         struct inpcbhead        *ipi_hashbase;          /* (r:e/w:h) */
  438         u_long                   ipi_hashmask;          /* (c) */
  439 
  440         /*
  441          * Global hash of inpcbs, hashed by only local port number.
  442          */
  443         struct inpcbporthead    *ipi_porthashbase;      /* (h) */
  444         u_long                   ipi_porthashmask;      /* (h) */
  445 
  446         /*
  447          * Load balance groups used for the SO_REUSEPORT_LB option,
  448          * hashed by local port.
  449          */
  450         struct  inpcblbgrouphead *ipi_lbgrouphashbase;  /* (r:e/w:h) */
  451         u_long                   ipi_lbgrouphashmask;   /* (h) */
  452 
  453         /*
  454          * Pointer to network stack instance
  455          */
  456         struct vnet             *ipi_vnet;              /* (c) */
  457 };
  458 
  459 /*
  460  * Global allocation storage for each high-level protocol (UDP, TCP, ...).
  461  * Each corresponding per-VNET inpcbinfo points into this one.
  462  */
  463 struct inpcbstorage {
  464         uma_zone_t      ips_zone;               /* not set by INPCBSTORAGE_DEFINE */
  465         uma_zone_t      ips_portzone;           /* not set by INPCBSTORAGE_DEFINE */
  466         uma_init        ips_pcbinit;            /* per-item init hook */
  467         size_t          ips_size;               /* sizeof the protocol's pcb structure */
  468         const char *    ips_zone_name;          /* name for the pcb zone */
  469         const char *    ips_portzone_name;      /* zone name with " ports" appended */
  470         const char *    ips_infolock_name;      /* presumably names ipi_lock -- TODO confirm */
  471         const char *    ips_hashlock_name;      /* presumably names ipi_hash_lock -- TODO confirm */
  472 };
  473 /* Emits prot##_inpcb_init(), a static struct inpcbstorage named prot, and SYSINIT/SYSUNINIT hooks for it. */
  474 #define INPCBSTORAGE_DEFINE(prot, ppcb, lname, zname, iname, hname)     \
  475 static int                                                              \
  476 prot##_inpcb_init(void *mem, int size __unused, int flags __unused)     \
  477 {                                                                       \
  478         struct inpcb *inp = mem;                                        \
  479         /* mem is treated as a struct inpcb; only its lock is set up */ \
  480         rw_init_flags(&inp->inp_lock, lname, RW_RECURSE | RW_DUPOK);    \
  481         return (0);                                                     \
  482 }                                                                       \
  483 static struct inpcbstorage prot = {                                     \
  484         .ips_size = sizeof(struct ppcb),                                \
  485         .ips_pcbinit = prot##_inpcb_init,                               \
  486         .ips_zone_name = zname,                                         \
  487         .ips_portzone_name = zname " ports",                            \
  488         .ips_infolock_name = iname,                                     \
  489         .ips_hashlock_name = hname,                                     \
  490 };                                                                      \
  491 SYSINIT(prot##_inpcbstorage_init, SI_SUB_PROTO_DOMAIN,                  \
  492     SI_ORDER_SECOND, in_pcbstorage_init, &prot);                        \
  493 SYSUNINIT(prot##_inpcbstorage_uninit, SI_SUB_PROTO_DOMAIN,              \
  494     SI_ORDER_SECOND, in_pcbstorage_destroy, &prot)
  495 
  496 /*
  497  * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group
  498  * (or unique address:port combination) can be re-used at most
  499  * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which
  500  * is dynamically resized as processes bind/unbind to that specific group.
  501  */
  502 struct inpcblbgroup {
  503         CK_LIST_ENTRY(inpcblbgroup) il_list;    /* linkage on a struct inpcblbgrouphead */
  504         struct epoch_context il_epoch_ctx;
  505         struct ucred    *il_cred;
  506         uint16_t        il_lport;                       /* (c) */
  507         u_char          il_vflag;                       /* (c) */
  508         uint8_t         il_numa_domain;
  509         uint32_t        il_pad2;
  510         union in_dependaddr il_dependladdr;             /* (c) */
  511 #define il_laddr        il_dependladdr.id46_addr.ia46_addr4     /* IPv4 local addr */
  512 #define il6_laddr       il_dependladdr.id6_addr                 /* IPv6 local addr */
  513         uint32_t        il_inpsiz; /* max count in il_inp[] (h) */
  514         uint32_t        il_inpcnt; /* cur count in il_inp[] (h) */
  515         struct inpcb    *il_inp[];                      /* (h) member inpcbs, flexible array */
  516 };
  517 /* Thin wrappers over the rwlock(9) operations on the per-inpcb lock, inp_lock. */
  518 #define INP_LOCK_DESTROY(inp)   rw_destroy(&(inp)->inp_lock)
  519 #define INP_RLOCK(inp)          rw_rlock(&(inp)->inp_lock)
  520 #define INP_WLOCK(inp)          rw_wlock(&(inp)->inp_lock)
  521 #define INP_TRY_RLOCK(inp)      rw_try_rlock(&(inp)->inp_lock)
  522 #define INP_TRY_WLOCK(inp)      rw_try_wlock(&(inp)->inp_lock)
  523 #define INP_RUNLOCK(inp)        rw_runlock(&(inp)->inp_lock)
  524 #define INP_WUNLOCK(inp)        rw_wunlock(&(inp)->inp_lock)
  525 #define INP_UNLOCK(inp)         rw_unlock(&(inp)->inp_lock)
  526 #define INP_TRY_UPGRADE(inp)    rw_try_upgrade(&(inp)->inp_lock)
  527 #define INP_DOWNGRADE(inp)      rw_downgrade(&(inp)->inp_lock)
  528 #define INP_WLOCKED(inp)        rw_wowned(&(inp)->inp_lock)
  529 #define INP_LOCK_ASSERT(inp)    rw_assert(&(inp)->inp_lock, RA_LOCKED)
  530 #define INP_RLOCK_ASSERT(inp)   rw_assert(&(inp)->inp_lock, RA_RLOCKED)
  531 #define INP_WLOCK_ASSERT(inp)   rw_assert(&(inp)->inp_lock, RA_WLOCKED)
  532 #define INP_UNLOCK_ASSERT(inp)  rw_assert(&(inp)->inp_lock, RA_UNLOCKED)
  533 
  534 /*
  535  * These locking functions are for inpcb consumers outside of sys/netinet,
  536  * more specifically, they were added for the benefit of TOE drivers. The
  537  * macros are reserved for use by the stack.
  538  */
  539 void inp_wlock(struct inpcb *);
  540 void inp_wunlock(struct inpcb *);
  541 void inp_rlock(struct inpcb *);
  542 void inp_runlock(struct inpcb *);
  543 
  544 #ifdef INVARIANT_SUPPORT
  545 void inp_lock_assert(struct inpcb *);
  546 void inp_unlock_assert(struct inpcb *);
  547 #else   /* !INVARIANT_SUPPORT: the assertions expand to empty statements */
  548 #define inp_lock_assert(inp)    do {} while (0)
  549 #define inp_unlock_assert(inp)  do {} while (0)
  550 #endif  /* INVARIANT_SUPPORT */
  551 
  552 void    inp_apply_all(struct inpcbinfo *, void (*func)(struct inpcb *, void *),
  553             void *arg);
  554 int     inp_ip_tos_get(const struct inpcb *inp);
  555 void    inp_ip_tos_set(struct inpcb *inp, int val);
  556 struct socket *
  557         inp_inpcbtosocket(struct inpcb *inp);
  558 struct tcpcb *
  559         inp_inpcbtotcpcb(struct inpcb *inp);
  560 void    inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
  561                 uint32_t *faddr, uint16_t *fp); /* results returned through the pointers */
  562 int     inp_so_options(const struct inpcb *inp);
  563 
  564 #endif /* _KERNEL */
  565 /* ipi_lock / ipi_hash_lock wrappers; the *_LOCK_ASSERT forms pass if in an SMR section or if the mutex is owned. */
  566 #define INP_INFO_WLOCK(ipi)     mtx_lock(&(ipi)->ipi_lock)
  567 #define INP_INFO_WLOCKED(ipi)   mtx_owned(&(ipi)->ipi_lock)
  568 #define INP_INFO_WUNLOCK(ipi)   mtx_unlock(&(ipi)->ipi_lock)
  569 #define INP_INFO_LOCK_ASSERT(ipi)       MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
  570                                         mtx_owned(&(ipi)->ipi_lock))
  571 #define INP_INFO_WLOCK_ASSERT(ipi)      mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
  572 #define INP_INFO_WUNLOCK_ASSERT(ipi)    \
  573                                 mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED)
  574 
  575 #define INP_HASH_WLOCK(ipi)             mtx_lock(&(ipi)->ipi_hash_lock)
  576 #define INP_HASH_WUNLOCK(ipi)           mtx_unlock(&(ipi)->ipi_hash_lock)
  577 #define INP_HASH_LOCK_ASSERT(ipi)       MPASS(SMR_ENTERED((ipi)->ipi_smr) || \
  578                                         mtx_owned(&(ipi)->ipi_hash_lock))
  579 #define INP_HASH_WLOCK_ASSERT(ipi)      mtx_assert(&(ipi)->ipi_hash_lock, \
  580                                         MA_OWNED)
  581 
  582 /*
  583  * Wildcard matching hash is not just a microoptimisation!  The hash for
  584  * wildcard IPv4 and wildcard IPv6 must be the same, otherwise AF_INET6
  585  * wildcard bound pcb won't be able to receive AF_INET connections, while:
  586  * jenkins_hash(&zeroes, 1, s) != jenkins_hash(&zeroes, 4, s)
  587  * See also comment above struct in_addr_4in6.
  588  */
  589 #define IN_ADDR_JHASH32(addr)   /* NB: evaluates addr twice */          \
  590         ((addr)->s_addr == INADDR_ANY ? V_in_pcbhashseed :              \
  591             jenkins_hash32((&(addr)->s_addr), 1, V_in_pcbhashseed))
  592 #define IN6_ADDR_JHASH32(addr)  /* NB: evaluates addr up to 3 times */  \
  593         (memcmp((addr), &in6addr_any, sizeof(in6addr_any)) == 0 ?       \
  594             V_in_pcbhashseed :                                          \
  595             jenkins_hash32((addr)->__u6_addr.__u6_addr32,               \
  596             nitems((addr)->__u6_addr.__u6_addr32), V_in_pcbhashseed))
  597 /* Connection hash: hashed foreign address XOR ntohs(lport ^ fport), then masked. */
  598 #define INP_PCBHASH(faddr, lport, fport, mask)                          \
  599         ((IN_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) & (mask))
  600 #define INP6_PCBHASH(faddr, lport, fport, mask)                         \
  601         ((IN6_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) & (mask))
  602 /* Wildcard hash: the seed XOR ntohs(lport), then masked; no address is involved. */
  603 #define INP_PCBHASH_WILD(lport, mask)                                   \
  604         ((V_in_pcbhashseed ^ ntohs(lport)) & (mask))
  605 /* Same mix as INP_PCBHASH / INP6_PCBHASH, but without applying a mask. */
  606 #define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport)                     \
  607         (IN_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport)))
  608 #define INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport)                    \
  609         (IN6_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport)))
  610 /* Port hash: ntohs(lport), then masked. */
  611 #define INP_PCBPORTHASH(lport, mask)    (ntohs((lport)) & (mask))
  612 
  613 /*
  614  * Flags for inp_vflag -- historically version flags only
  615  */
  616 #define INP_IPV4        0x1
  617 #define INP_IPV6        0x2
  618 #define INP_IPV6PROTO   0x4             /* opened under IPv6 protocol */
  619 
  620 /*
  621  * Flags for inp_flags.
       *
       * Every flag is a single bit.  Counting the two positions that have no
       * definition in this header -- 0x00000200 (INP_FREED, private to
       * in_pcb.c) and 0x01000000 (formerly INP_TIMEWAIT) -- all 32 bit
       * positions from 0x00000001 to 0x80000000 are accounted for, so no
       * unassigned bit remains apart from the two INP_RESERVED_* entries.
  622  */
  623 #define INP_RECVOPTS            0x00000001 /* receive incoming IP options */
  624 #define INP_RECVRETOPTS         0x00000002 /* receive IP options for reply */
  625 #define INP_RECVDSTADDR         0x00000004 /* receive IP dst address */
  626 #define INP_HDRINCL             0x00000008 /* user supplies entire IP header */
  627 #define INP_HIGHPORT            0x00000010 /* user wants "high" port binding */
  628 #define INP_LOWPORT             0x00000020 /* user wants "low" port binding */
  629 #define INP_ANONPORT            0x00000040 /* port chosen for user */
  630 #define INP_RECVIF              0x00000080 /* receive incoming interface */
  631 #define INP_MTUDISC             0x00000100 /* user can do MTU discovery */
  632 /*      INP_FREED               0x00000200 private to in_pcb.c */
  633 #define INP_RECVTTL             0x00000400 /* receive incoming IP TTL */
  634 #define INP_DONTFRAG            0x00000800 /* don't fragment packet */
  635 #define INP_BINDANY             0x00001000 /* allow bind to any address */
  636 #define INP_INHASHLIST          0x00002000 /* in_pcbinshash() has been called */
  637 #define INP_RECVTOS             0x00004000 /* receive incoming IP TOS */
  638 #define IN6P_IPV6_V6ONLY        0x00008000 /* restrict AF_INET6 socket for v6 */
  639 #define IN6P_PKTINFO            0x00010000 /* receive IP6 dst and I/F */
  640 #define IN6P_HOPLIMIT           0x00020000 /* receive hoplimit */
  641 #define IN6P_HOPOPTS            0x00040000 /* receive hop-by-hop options */
  642 #define IN6P_DSTOPTS            0x00080000 /* receive dst options after rthdr */
  643 #define IN6P_RTHDR              0x00100000 /* receive routing header */
  644 #define IN6P_RTHDRDSTOPTS       0x00200000 /* receive dstoptions before rthdr */
  645 #define IN6P_TCLASS             0x00400000 /* receive traffic class value */
  646 #define IN6P_AUTOFLOWLABEL      0x00800000 /* attach flowlabel automatically */
  647 /* was  INP_TIMEWAIT            0x01000000 */
  648 #define INP_ONESBCAST           0x02000000 /* send all-ones broadcast */
  649 #define INP_DROPPED             0x04000000 /* protocol drop flag */
  650 #define INP_SOCKREF             0x08000000 /* strong socket reference */
  651 #define INP_RESERVED_0          0x10000000 /* reserved field */
  652 #define INP_RESERVED_1          0x20000000 /* reserved field */
  653 #define IN6P_RFC2292            0x40000000 /* used RFC2292 API on the socket */
  654 #define IN6P_MTU                0x80000000 /* receive path MTU */
  655 
      /*
       * Bitwise OR of the inp_flags bits listed below: the INP_RECV* flags
       * plus the IN6P_* flags other than IN6P_IPV6_V6ONLY.
       * NOTE(review): the name suggests these are the options that produce
       * control data on receive -- confirm against the users of this mask.
       */
  656 #define INP_CONTROLOPTS         (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\
  657                                  INP_RECVIF|INP_RECVTTL|INP_RECVTOS|\
  658                                  IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\
  659                                  IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\
  660                                  IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\
  661                                  IN6P_MTU)
  662 
  663 /*
  664  * Flags for inp_flags2.
       *
       * Every flag is a single bit.  The positions 0x00000004 and 0x00000010
       * are kept as placeholders and have no flag defined for them.
  665  */
  666 #define INP_MBUF_L_ACKS         0x00000001 /* We need large mbufs for ack compression */
  667 #define INP_MBUF_ACKCMP         0x00000002 /* TCP mbuf ack compression ok */
  668 /*                              0x00000004 */
  669 #define INP_REUSEPORT           0x00000008 /* SO_REUSEPORT option is set */
  670 /*                              0x00000010 */
  671 #define INP_REUSEADDR           0x00000020 /* SO_REUSEADDR option is set */
  672 #define INP_BINDMULTI           0x00000040 /* IP_BINDMULTI option is set */
  673 #define INP_RSS_BUCKET_SET      0x00000080 /* IP_RSS_LISTEN_BUCKET is set */
  674 #define INP_RECVFLOWID          0x00000100 /* populate recv datagram with flow info */
  675 #define INP_RECVRSSBUCKETID     0x00000200 /* populate recv datagram with bucket id */
  676 #define INP_RATE_LIMIT_CHANGED  0x00000400 /* rate limit needs attention */
  677 #define INP_ORIGDSTADDR         0x00000800 /* receive IP dst address/port */
  678 #define INP_CANNOT_DO_ECN       0x00001000 /* The stack does not do ECN */
  679 #define INP_REUSEPORT_LB        0x00002000 /* SO_REUSEPORT_LB option is set */
  680 #define INP_SUPPORTS_MBUFQ      0x00004000 /* Supports the mbuf queue method of LRO */
  681 #define INP_MBUF_QUEUE_READY    0x00008000 /* The transport is pacing, inputs can be queued */
  682 #define INP_DONT_SACK_QUEUE     0x00010000 /* If a sack arrives do not wake me */
      /*
       * Ethernet PCP bits kept in inp_flags2.  INP_2PCP_SET (0x00020000) is a
       * stand-alone flag.  INP_2PCP_BIT0..INP_2PCP_BIT2 form a three-bit field
       * at bit positions 18-20 (0x00040000-0x00100000); INP_2PCP_MASK covers
       * exactly those three bits and INP_2PCP_SHIFT (18) is the position of
       * INP_2PCP_BIT0, so the shift must change if the bits are ever moved.
       */
  683 #define INP_2PCP_SET            0x00020000 /* If the Eth PCP should be set explicitly */
  684 #define INP_2PCP_BIT0           0x00040000 /* Eth PCP Bit 0 */
  685 #define INP_2PCP_BIT1           0x00080000 /* Eth PCP Bit 1 */
  686 #define INP_2PCP_BIT2           0x00100000 /* Eth PCP Bit 2 */
  687 #define INP_2PCP_BASE   INP_2PCP_BIT0
  688 #define INP_2PCP_MASK   (INP_2PCP_BIT0 | INP_2PCP_BIT1 | INP_2PCP_BIT2)
  689 #define INP_2PCP_SHIFT          18         /* shift PCP field in/out of inp_flags2 */
  690 
  691 /*
  692  * Flags passed to in_pcblookup*(), inp_smr_lock() and inp_next().
       *
       * The enumerators are distinct bits (0x1, 0x2, 0x4) and can therefore be
       * combined with bitwise OR.
  693  */
  694 typedef enum {
  695         INPLOOKUP_WILDCARD = 0x00000001,        /* Allow wildcard sockets. */
  696         INPLOOKUP_RLOCKPCB = 0x00000002,        /* Return inpcb read-locked. */
  697         INPLOOKUP_WLOCKPCB = 0x00000004,        /* Return inpcb write-locked. */
  698 } inp_lookup_t;
  699 
      /*
       * INPLOOKUP_MASK is the OR of all three inp_lookup_t bits;
       * INPLOOKUP_LOCKMASK selects only the two lock-mode bits
       * (INPLOOKUP_RLOCKPCB and INPLOOKUP_WLOCKPCB).
       */
  700 #define INPLOOKUP_MASK  (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \
  701             INPLOOKUP_WLOCKPCB)
  702 #define INPLOOKUP_LOCKMASK      (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)
  703 
      /* Yield a socket's so_pcb pointer cast to struct inpcb *. */
  704 #define sotoinpcb(so)   ((struct inpcb *)(so)->so_pcb)
  705 
  706 #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family
  707 
  708 #define INP_CHECK_SOCKAF(so, af)        (INP_SOCKAF(so) == af)
  709 
  710 #ifdef _KERNEL
      /*
       * Per-VNET int variables declared through VNET_DECLARE(), followed by
       * the V_-prefixed accessor macros, each of which expands to VNET(name).
       * NOTE(review): the names suggest port-range bounds and a randomization
       * switch; the exact semantics are defined where the variables are set.
       */
  711 VNET_DECLARE(int, ipport_reservedhigh);
  712 VNET_DECLARE(int, ipport_reservedlow);
  713 VNET_DECLARE(int, ipport_lowfirstauto);
  714 VNET_DECLARE(int, ipport_lowlastauto);
  715 VNET_DECLARE(int, ipport_firstauto);
  716 VNET_DECLARE(int, ipport_lastauto);
  717 VNET_DECLARE(int, ipport_hifirstauto);
  718 VNET_DECLARE(int, ipport_hilastauto);
  719 VNET_DECLARE(int, ipport_randomized);
  720 
  721 #define V_ipport_reservedhigh   VNET(ipport_reservedhigh)
  722 #define V_ipport_reservedlow    VNET(ipport_reservedlow)
  723 #define V_ipport_lowfirstauto   VNET(ipport_lowfirstauto)
  724 #define V_ipport_lowlastauto    VNET(ipport_lowlastauto)
  725 #define V_ipport_firstauto      VNET(ipport_firstauto)
  726 #define V_ipport_lastauto       VNET(ipport_lastauto)
  727 #define V_ipport_hifirstauto    VNET(ipport_hifirstauto)
  728 #define V_ipport_hilastauto     VNET(ipport_hilastauto)
  729 #define V_ipport_randomized     VNET(ipport_randomized)
  730 
      /*
       * Set-up and tear-down entry points for struct inpcbinfo and for the
       * storage passed as a void pointer to in_pcbstorage_init() and
       * in_pcbstorage_destroy().
       * NOTE(review): the two u_int arguments of in_pcbinfo_init() are unnamed
       * here; presumably hash table sizes -- confirm against the definition.
       */
  731 void    in_pcbinfo_init(struct inpcbinfo *, struct inpcbstorage *,
  732             u_int, u_int);
  733 void    in_pcbinfo_destroy(struct inpcbinfo *);
  734 void    in_pcbstorage_init(void *);
  735 void    in_pcbstorage_destroy(void *);
  736 
      /*
       * Takes two inpcbs by const pointer (neither is modified through these
       * pointers) and returns an int.
       * NOTE(review): presumably ni is the new and oi the existing inpcb, and
       * the check relates to INP_BINDMULTI -- confirm against the definition.
       */
  737 int     in_pcbbind_check_bindmulti(const struct inpcb *ni,
  738             const struct inpcb *oi);
  739 
      /*
       * inpcb management routines; only the prototypes are visible in this
       * header.  The int argument that precedes the struct ifnet pointer in
       * in_pcblookup() and in_pcblookup_mbuf() is where the INPLOOKUP_* flags
       * are presumably passed (inp_lookup_t is documented as the flag set for
       * in_pcblookup*()) -- confirm against the definitions.
       */
  740 void    in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
  741 int     in_pcballoc(struct socket *, struct inpcbinfo *);
  742 int     in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *);
  743 int     in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
  744             u_short *, struct ucred *);
  745 int     in_pcbconnect(struct inpcb *, struct sockaddr *, struct ucred *, bool);
  746 int     in_pcbconnect_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
  747             u_short *, in_addr_t *, u_short *, struct inpcb **,
  748             struct ucred *);
  749 void    in_pcbdetach(struct inpcb *);
  750 void    in_pcbdisconnect(struct inpcb *);
  751 void    in_pcbdrop(struct inpcb *);
  752 void    in_pcbfree(struct inpcb *);
  753 int     in_pcbinshash(struct inpcb *);
  754 int     in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *,
  755             struct ucred *);
  756 int     in_pcblbgroup_numa(struct inpcb *, int arg);
  757 struct inpcb *
  758         in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
  759             struct in_addr, u_int, int, struct ifnet *);
  760 struct inpcb *
  761         in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
  762             struct in_addr, u_int, int, struct ifnet *, struct mbuf *);
  763 void    in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
  764             int, struct inpcb *(*)(struct inpcb *, int));
  765 void    in_pcbref(struct inpcb *);
  766 void    in_pcbrehash(struct inpcb *);
  767 bool    in_pcbrele_rlocked(struct inpcb *);
  768 bool    in_pcbrele_wlocked(struct inpcb *);
  769 
  770 typedef bool inp_match_t(const struct inpcb *, void *);
  771 struct inpcb_iterator {
  772         const struct inpcbinfo  *ipi;
  773         struct inpcb            *inp;
  774         inp_match_t             *match;
  775         void                    *ctx;
  776         int                     hash;
  777 #define INP_ALL_LIST            -1
  778         const inp_lookup_t      lock;
  779 };
  780 
  781 /* Note: sparse initializers guarantee .inp = NULL. */
      /*
       * Brace-enclosed initializers for struct inpcb_iterator.  Both set .ipi,
       * .lock and .hash = INP_ALL_LIST.  INP_ITERATOR() additionally sets
       * .match and .ctx; INP_ALL_ITERATOR() omits them, so they are
       * zero-initialized (NULL) by the same rule that covers .inp.
       */
  782 #define INP_ITERATOR(_ipi, _lock, _match, _ctx)         \
  783         {                                               \
  784                 .ipi = (_ipi),                          \
  785                 .lock = (_lock),                        \
  786                 .hash = INP_ALL_LIST,                   \
  787                 .match = (_match),                      \
  788                 .ctx = (_ctx),                          \
  789         }
  790 #define INP_ALL_ITERATOR(_ipi, _lock)                   \
  791         {                                               \
  792                 .ipi = (_ipi),                          \
  793                 .lock = (_lock),                        \
  794                 .hash = INP_ALL_LIST,                   \
  795         }
  796 
      /*
       * inp_next() takes the iterator state and returns an inpcb pointer.
       * The remaining routines operate on an inpcb or on a socket.
       * NOTE(review): in_pcbsetsolabel() and in_pcbsosetlabel() are declared
       * with identical signatures and near-identical names -- confirm that
       * both are actually defined and that neither is a stale prototype.
       */
  797 struct inpcb *inp_next(struct inpcb_iterator *);
  798 void    in_losing(struct inpcb *);
  799 void    in_pcbsetsolabel(struct socket *so);
  800 int     in_getpeeraddr(struct socket *so, struct sockaddr **nam);
  801 int     in_getsockaddr(struct socket *so, struct sockaddr **nam);
  802 struct sockaddr *
  803         in_sockaddr(in_port_t port, struct in_addr *addr);
  804 void    in_pcbsosetlabel(struct socket *so);
      /*
       * Prototypes that are only declared when RATELIMIT is defined.
       * NOTE(review): the uint32_t arguments are unnamed here; their meaning
       * (rates, flow ids, ...) has to be taken from the definitions.
       */
  805 #ifdef RATELIMIT
  806 int
  807 in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *,
  808             struct mbuf *, uint32_t);
  809 int     in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t,
  810             uint32_t, struct m_snd_tag **);
  811 void    in_pcbdetach_txrtlmt(struct inpcb *);
  812 void    in_pcbdetach_tag(struct m_snd_tag *);
  813 int     in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
  814 int     in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
  815 int     in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
  816 void    in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *);
  817 void    in_pcboutput_eagain(struct inpcb *);
  818 #endif
  819 #endif /* _KERNEL */
  820 
  821 #endif /* !_NETINET_IN_PCB_H_ */

Cache object: 009e1e5ba48f2180862296edaf30d72e


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.