[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]

FreeBSD/Linux Kernel Cross Reference
sys/netinet/in_pcb.c

Version: -  FREEBSD  -  FREEBSD7  -  FREEBSD70  -  FREEBSD6  -  FREEBSD64  -  FREEBSD63  -  FREEBSD62  -  FREEBSD61  -  FREEBSD60  -  FREEBSD5  -  FREEBSD55  -  FREEBSD54  -  FREEBSD53  -  FREEBSD52  -  FREEBSD51  -  FREEBSD50  -  FREEBSD4  -  FREEBSD3  -  FREEBSD22  -  linux-2.6  -  linux-2.4.22  -  MK83  -  MK84  -  PLAN9  -  DFBSD  -  NETBSD  -  NETBSD5  -  NETBSD4  -  NETBSD3  -  NETBSD20  -  OPENBSD  -  xnu-517  -  xnu-792  -  xnu-792.6.70  -  xnu-1228  -  OPENSOLARIS  -  minix-3-1-1  -  TRUSTEDBSD-SEBSD  -  FREEBSD-LIBC  -  FREEBSD7-LIBC  -  FREEBSD6-LIBC  -  GLIBC27 
SearchContext: -  none  -  excerpts  -  bigexcerpts 

  1 /*-
  2  * Copyright (c) 1982, 1986, 1991, 1993, 1995
  3  *      The Regents of the University of California.
  4  * Copyright (c) 2007 Robert N. M. Watson
  5  * All rights reserved.
  6  *
  7  * Redistribution and use in source and binary forms, with or without
  8  * modification, are permitted provided that the following conditions
  9  * are met:
 10  * 1. Redistributions of source code must retain the above copyright
 11  *    notice, this list of conditions and the following disclaimer.
 12  * 2. Redistributions in binary form must reproduce the above copyright
 13  *    notice, this list of conditions and the following disclaimer in the
 14  *    documentation and/or other materials provided with the distribution.
 15  * 4. Neither the name of the University nor the names of its contributors
 16  *    may be used to endorse or promote products derived from this software
 17  *    without specific prior written permission.
 18  *
 19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 29  * SUCH DAMAGE.
 30  *
 31  *      @(#)in_pcb.c    8.4 (Berkeley) 5/24/95
 32  */
 33 
 34 #include <sys/cdefs.h>
 35 __FBSDID("$FreeBSD: src/sys/netinet/in_pcb.c,v 1.231 2008/12/02 21:37:28 bz Exp $");
 36 
 37 #include "opt_ddb.h"
 38 #include "opt_ipsec.h"
 39 #include "opt_inet6.h"
 40 #include "opt_mac.h"
 41 
 42 #include <sys/param.h>
 43 #include <sys/systm.h>
 44 #include <sys/malloc.h>
 45 #include <sys/mbuf.h>
 46 #include <sys/domain.h>
 47 #include <sys/protosw.h>
 48 #include <sys/socket.h>
 49 #include <sys/socketvar.h>
 50 #include <sys/priv.h>
 51 #include <sys/proc.h>
 52 #include <sys/jail.h>
 53 #include <sys/kernel.h>
 54 #include <sys/sysctl.h>
 55 #include <sys/vimage.h>
 56 
 57 #ifdef DDB
 58 #include <ddb/ddb.h>
 59 #endif
 60 
 61 #include <vm/uma.h>
 62 
 63 #include <net/if.h>
 64 #include <net/if_types.h>
 65 #include <net/route.h>
 66 
 67 #include <netinet/in.h>
 68 #include <netinet/in_pcb.h>
 69 #include <netinet/in_var.h>
 70 #include <netinet/ip_var.h>
 71 #include <netinet/tcp_var.h>
 72 #include <netinet/udp.h>
 73 #include <netinet/udp_var.h>
 74 #include <netinet/vinet.h>
 75 #ifdef INET6
 76 #include <netinet/ip6.h>
 77 #include <netinet6/ip6_var.h>
 78 #include <netinet6/vinet6.h>
 79 #endif /* INET6 */
 80 
 81 
 82 #ifdef IPSEC
 83 #include <netipsec/ipsec.h>
 84 #include <netipsec/key.h>
 85 #endif /* IPSEC */
 86 
 87 #include <security/mac/mac_framework.h>
 88 
 89 #ifdef VIMAGE_GLOBALS
 90 /*
 91  * These configure the range of local port addresses assigned to
 92  * "unspecified" outgoing connections/packets/whatever.
 93  */
 94 int     ipport_lowfirstauto;    /* bounds of the range searched for */
 95 int     ipport_lowlastauto;     /*   INP_LOWPORT sockets */
 96 int     ipport_firstauto;       /* bounds of the default range (neither */
 97 int     ipport_lastauto;        /*   INP_LOWPORT nor INP_HIGHPORT set) */
 98 int     ipport_hifirstauto;     /* bounds of the range searched for */
 99 int     ipport_hilastauto;      /*   INP_HIGHPORT sockets */
100 
101 /*
102  * Reserved ports accessible only to root. There are significant
103  * security considerations that must be accounted for when changing these,
104  * but the security benefits can be great. Please be careful.
105  */
106 int     ipport_reservedhigh;    /* binding a port in [reservedlow, */
107 int     ipport_reservedlow;     /*   reservedhigh] needs PRIV_NETINET_RESERVEDPORT */
108 
109 /* Variables dealing with random ephemeral port allocation. */
110 int     ipport_randomized;      /* nonzero: random port allocation enabled */
111 int     ipport_randomcps;       /* sysctl "randomcps": max random allocations before going sequential */
112 int     ipport_randomtime;      /* sysctl "randomtime": min time to stay sequential */
113 int     ipport_stoprandom;      /* nonzero: no randomization for non-UDP pcbs */
114 int     ipport_tcpallocs;       /* bumped per anonymous port allocation on a non-UDP pcb */
115 int     ipport_tcplastcount;    /* not used in this part of the file; presumably an earlier ipport_tcpallocs sample -- TODO confirm */
116 #endif
117 
118 #define RANGECHK(var, min, max) \
119         if ((var) < (min)) { (var) = (min); } \
120         else if ((var) > (max)) { (var) = (max); }
121 
122 static int
123 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
124 {
125         INIT_VNET_INET(curvnet);
126         int error;
127 
128         error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
129         if (error == 0) {
130                 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
131                 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
132                 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
133                 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
134                 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
135                 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
136         }
137         return (error);
138 }
139 
140 #undef RANGECHK
141 
142 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
143 
144 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
145         lowfirst, CTLTYPE_INT|CTLFLAG_RW, ipport_lowfirstauto, 0,
146         &sysctl_net_ipport_check, "I", "");
147 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
148         lowlast, CTLTYPE_INT|CTLFLAG_RW, ipport_lowlastauto, 0,
149         &sysctl_net_ipport_check, "I", "");
150 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
151         first, CTLTYPE_INT|CTLFLAG_RW, ipport_firstauto, 0,
152         &sysctl_net_ipport_check, "I", "");
153 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
154         last, CTLTYPE_INT|CTLFLAG_RW, ipport_lastauto, 0,
155         &sysctl_net_ipport_check, "I", "");
156 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
157         hifirst, CTLTYPE_INT|CTLFLAG_RW, ipport_hifirstauto, 0, 
158         &sysctl_net_ipport_check, "I", "");
159 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
160         hilast, CTLTYPE_INT|CTLFLAG_RW, ipport_hilastauto, 0,
161         &sysctl_net_ipport_check, "I", "");
162 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
163         reservedhigh, CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedhigh, 0, "");
164 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, reservedlow,
165         CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedlow, 0, "");
166 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomized,
167         CTLFLAG_RW, ipport_randomized, 0, "Enable random port allocation");
168 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomcps,
169         CTLFLAG_RW, ipport_randomcps, 0, "Maximum number of random port "
170         "allocations before switching to a sequental one");
171 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomtime,
172         CTLFLAG_RW, ipport_randomtime, 0,
173         "Minimum time to keep sequental port "
174         "allocation before switching to a random one");
175 
176 /*
177  * in_pcb.c: manage the Protocol Control Blocks.
178  *
179  * NOTE: It is assumed that most of these functions will be called with
180  * the pcbinfo lock held, and often, the inpcb lock held, as these utility
181  * functions often modify hash chains or addresses in pcbs.
182  */
183 
184 /*
185  * Allocate a PCB and associate it with the socket.
186  * On success return with the PCB locked.
187  */
188 int
189 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
190 {
191 #ifdef INET6
192         INIT_VNET_INET6(curvnet);
193 #endif
194         struct inpcb *inp;
195         int error;
196 
197         INP_INFO_WLOCK_ASSERT(pcbinfo);
198         error = 0;
199         inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
200         if (inp == NULL)
201                 return (ENOBUFS);
202         bzero(inp, inp_zero_size);
203         inp->inp_pcbinfo = pcbinfo;
204         inp->inp_socket = so;
205         inp->inp_cred = crhold(so->so_cred);
206         inp->inp_inc.inc_fibnum = so->so_fibnum;
207 #ifdef MAC
208         error = mac_inpcb_init(inp, M_NOWAIT);
209         if (error != 0)
210                 goto out;
211         SOCK_LOCK(so);
212         mac_inpcb_create(so, inp);
213         SOCK_UNLOCK(so);
214 #endif
215 
216 #ifdef IPSEC
217         error = ipsec_init_policy(so, &inp->inp_sp);
218         if (error != 0) {
219 #ifdef MAC
220                 mac_inpcb_destroy(inp);
221 #endif
222                 goto out;
223         }
224 #endif /*IPSEC*/
225 #ifdef INET6
226         if (INP_SOCKAF(so) == AF_INET6) {
227                 inp->inp_vflag |= INP_IPV6PROTO;
228                 if (V_ip6_v6only)
229                         inp->inp_flags |= IN6P_IPV6_V6ONLY;
230         }
231 #endif
232         LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
233         pcbinfo->ipi_count++;
234         so->so_pcb = (caddr_t)inp;
235 #ifdef INET6
236         if (V_ip6_auto_flowlabel)
237                 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
238 #endif
239         INP_WLOCK(inp);
240         inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
241 
242 #if defined(IPSEC) || defined(MAC)
243 out:
244         if (error != 0) {
245                 crfree(inp->inp_cred);
246                 uma_zfree(pcbinfo->ipi_zone, inp);
247         }
248 #endif
249         return (error);
250 }
251 
252 int
253 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
254 {
255         int anonport, error;
256 
257         INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
258         INP_WLOCK_ASSERT(inp);
259 
260         if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
261                 return (EINVAL);
262         anonport = inp->inp_lport == 0 && (nam == NULL ||
263             ((struct sockaddr_in *)nam)->sin_port == 0);
264         error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
265             &inp->inp_lport, cred);
266         if (error)
267                 return (error);
268         if (in_pcbinshash(inp) != 0) {
269                 inp->inp_laddr.s_addr = INADDR_ANY;
270                 inp->inp_lport = 0;
271                 return (EAGAIN);
272         }
273         if (anonport)
274                 inp->inp_flags |= INP_ANONPORT;
275         return (0);
276 }
277 
278 /*
279  * Set up a bind operation on a PCB, performing port allocation
280  * as required, but do not actually modify the PCB. Callers can
281  * either complete the bind by setting inp_laddr/inp_lport and
282  * calling in_pcbinshash(), or they can just use the resulting
283  * port and address to authorise the sending of a once-off packet.
284  *
285  * On error, the values of *laddrp and *lportp are not changed.
286  */
287 int
288 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
289     u_short *lportp, struct ucred *cred)
290 {
291         INIT_VNET_INET(inp->inp_vnet);
292         struct socket *so = inp->inp_socket;
293         unsigned short *lastport;
294         struct sockaddr_in *sin;
295         struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
296         struct in_addr laddr;
297         u_short lport = 0;
298         int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
299         int error;
300         int dorandom;
301 
302         /*
303          * Because no actual state changes occur here, a global write lock on
304          * the pcbinfo isn't required.
305          */
306         INP_INFO_LOCK_ASSERT(pcbinfo);
307         INP_LOCK_ASSERT(inp);
308 
309         if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
310                 return (EADDRNOTAVAIL);
311         laddr.s_addr = *laddrp;
312         if (nam != NULL && laddr.s_addr != INADDR_ANY)
313                 return (EINVAL);
314         if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
315                 wild = INPLOOKUP_WILDCARD;
316         if (nam) {
317                 sin = (struct sockaddr_in *)nam;
318                 if (nam->sa_len != sizeof (*sin))
319                         return (EINVAL);
320 #ifdef notdef
321                 /*
322                  * We should check the family, but old programs
323                  * incorrectly fail to initialize it.
324                  */
325                 if (sin->sin_family != AF_INET)
326                         return (EAFNOSUPPORT);
327 #endif
328                 if (prison_local_ip4(cred, &sin->sin_addr))
329                         return (EINVAL);
330                 if (sin->sin_port != *lportp) {
331                         /* Don't allow the port to change. */
332                         if (*lportp != 0)
333                                 return (EINVAL);
334                         lport = sin->sin_port;
335                 }
336                 /* NB: lport is left as 0 if the port isn't being changed. */
337                 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
338                         /*
339                          * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
340                          * allow complete duplication of binding if
341                          * SO_REUSEPORT is set, or if SO_REUSEADDR is set
342                          * and a multicast address is bound on both
343                          * new and duplicated sockets.
344                          */
345                         if (so->so_options & SO_REUSEADDR)
346                                 reuseport = SO_REUSEADDR|SO_REUSEPORT;
347                 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
348                         sin->sin_port = 0;              /* yech... */
349                         bzero(&sin->sin_zero, sizeof(sin->sin_zero));
350                         if (ifa_ifwithaddr((struct sockaddr *)sin) == 0)
351                                 return (EADDRNOTAVAIL);
352                 }
353                 laddr = sin->sin_addr;
354                 if (lport) {
355                         struct inpcb *t;
356                         struct tcptw *tw;
357 
358                         /* GROSS */
359                         if (ntohs(lport) <= V_ipport_reservedhigh &&
360                             ntohs(lport) >= V_ipport_reservedlow &&
361                             priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
362                             0))
363                                 return (EACCES);
364                         if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
365                             priv_check_cred(inp->inp_cred,
366                             PRIV_NETINET_REUSEPORT, 0) != 0) {
367                                 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
368                                     lport, INPLOOKUP_WILDCARD, cred);
369         /*
370          * XXX
371          * This entire block sorely needs a rewrite.
372          */
373                                 if (t &&
374                                     ((t->inp_vflag & INP_TIMEWAIT) == 0) &&
375                                     (so->so_type != SOCK_STREAM ||
376                                      ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
377                                     (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
378                                      ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
379                                      (t->inp_socket->so_options &
380                                          SO_REUSEPORT) == 0) &&
381                                     (inp->inp_cred->cr_uid !=
382                                      t->inp_cred->cr_uid))
383                                         return (EADDRINUSE);
384                         }
385                         if (prison_local_ip4(cred, &sin->sin_addr))
386                                 return (EADDRNOTAVAIL);
387                         t = in_pcblookup_local(pcbinfo, sin->sin_addr,
388                             lport, wild, cred);
389                         if (t && (t->inp_vflag & INP_TIMEWAIT)) {
390                                 /*
391                                  * XXXRW: If an incpb has had its timewait
392                                  * state recycled, we treat the address as
393                                  * being in use (for now).  This is better
394                                  * than a panic, but not desirable.
395                                  */
396                                 tw = intotw(inp);
397                                 if (tw == NULL ||
398                                     (reuseport & tw->tw_so_options) == 0)
399                                         return (EADDRINUSE);
400                         } else if (t &&
401                             (reuseport & t->inp_socket->so_options) == 0) {
402 #ifdef INET6
403                                 if (ntohl(sin->sin_addr.s_addr) !=
404                                     INADDR_ANY ||
405                                     ntohl(t->inp_laddr.s_addr) !=
406                                     INADDR_ANY ||
407                                     INP_SOCKAF(so) ==
408                                     INP_SOCKAF(t->inp_socket))
409 #endif
410                                 return (EADDRINUSE);
411                         }
412                 }
413         }
414         if (*lportp != 0)
415                 lport = *lportp;
416         if (lport == 0) {
417                 u_short first, last, aux;
418                 int count;
419 
420                 if (prison_local_ip4(cred, &laddr))
421                         return (EINVAL);
422 
423                 if (inp->inp_flags & INP_HIGHPORT) {
424                         first = V_ipport_hifirstauto;   /* sysctl */
425                         last  = V_ipport_hilastauto;
426                         lastport = &pcbinfo->ipi_lasthi;
427                 } else if (inp->inp_flags & INP_LOWPORT) {
428                         error = priv_check_cred(cred,
429                             PRIV_NETINET_RESERVEDPORT, 0);
430                         if (error)
431                                 return error;
432                         first = V_ipport_lowfirstauto;  /* 1023 */
433                         last  = V_ipport_lowlastauto;   /* 600 */
434                         lastport = &pcbinfo->ipi_lastlow;
435                 } else {
436                         first = V_ipport_firstauto;     /* sysctl */
437                         last  = V_ipport_lastauto;
438                         lastport = &pcbinfo->ipi_lastport;
439                 }
440                 /*
441                  * For UDP, use random port allocation as long as the user
442                  * allows it.  For TCP (and as of yet unknown) connections,
443                  * use random port allocation only if the user allows it AND
444                  * ipport_tick() allows it.
445                  */
446                 if (V_ipport_randomized &&
447                         (!V_ipport_stoprandom || pcbinfo == &V_udbinfo))
448                         dorandom = 1;
449                 else
450                         dorandom = 0;
451                 /*
452                  * It makes no sense to do random port allocation if
453                  * we have the only port available.
454                  */
455                 if (first == last)
456                         dorandom = 0;
457                 /* Make sure to not include UDP packets in the count. */
458                 if (pcbinfo != &V_udbinfo)
459                         V_ipport_tcpallocs++;
460                 /*
461                  * Instead of having two loops further down counting up or down
462                  * make sure that first is always <= last and go with only one
463                  * code path implementing all logic.
464                  */
465                 if (first > last) {
466                         aux = first;
467                         first = last;
468                         last = aux;
469                 }
470 
471                 if (dorandom)
472                         *lastport = first +
473                                     (arc4random() % (last - first));
474 
475                 count = last - first;
476 
477                 do {
478                         if (count-- < 0)        /* completely used? */
479                                 return (EADDRNOTAVAIL);
480                         ++*lastport;
481                         if (*lastport < first || *lastport > last)
482                                 *lastport = first;
483                         lport = htons(*lastport);
484                 } while (in_pcblookup_local(pcbinfo, laddr,
485                     lport, wild, cred));
486         }
487         if (prison_local_ip4(cred, &laddr))
488                 return (EINVAL);
489         *laddrp = laddr.s_addr;
490         *lportp = lport;
491         return (0);
492 }
493 
494 /*
495  * Connect from a socket to a specified address.
496  * Both address and port must be specified in argument sin.
497  * If don't have a local address for this socket yet,
498  * then pick one.
499  */
500 int
501 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
502 {
503         u_short lport, fport;
504         in_addr_t laddr, faddr;
505         int anonport, error;
506 
507         INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
508         INP_WLOCK_ASSERT(inp);
509 
510         lport = inp->inp_lport;
511         laddr = inp->inp_laddr.s_addr;
512         anonport = (lport == 0);
513         error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
514             NULL, cred);
515         if (error)
516                 return (error);
517 
518         /* Do the initial binding of the local address if required. */
519         if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
520                 inp->inp_lport = lport;
521                 inp->inp_laddr.s_addr = laddr;
522                 if (in_pcbinshash(inp) != 0) {
523                         inp->inp_laddr.s_addr = INADDR_ANY;
524                         inp->inp_lport = 0;
525                         return (EAGAIN);
526                 }
527         }
528 
529         /* Commit the remaining changes. */
530         inp->inp_lport = lport;
531         inp->inp_laddr.s_addr = laddr;
532         inp->inp_faddr.s_addr = faddr;
533         inp->inp_fport = fport;
534         in_pcbrehash(inp);
535 
536         if (anonport)
537                 inp->inp_flags |= INP_ANONPORT;
538         return (0);
539 }
540 
541 /*
542  * Do proper source address selection on an unbound socket in case
543  * of connect. Take jails into account as well.
544  */
545 static int
546 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
547     struct ucred *cred)
548 {
549         struct in_ifaddr *ia;
550         struct ifaddr *ifa;
551         struct sockaddr *sa;
552         struct sockaddr_in *sin;
553         struct route sro;
554         int error;
555 
556         KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
557 
558         error = 0;
559         ia = NULL;
560         bzero(&sro, sizeof(sro));
561 
562         sin = (struct sockaddr_in *)&sro.ro_dst;
563         sin->sin_family = AF_INET;
564         sin->sin_len = sizeof(struct sockaddr_in);
565         sin->sin_addr.s_addr = faddr->s_addr;
566 
567         /*
568          * If route is known our src addr is taken from the i/f,
569          * else punt.
570          *
571          * Find out route to destination.
572          */
573         if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
574                 in_rtalloc_ign(&sro, RTF_CLONING, inp->inp_inc.inc_fibnum);
575 
576         /*
577          * If we found a route, use the address corresponding to
578          * the outgoing interface.
579          * 
580          * Otherwise assume faddr is reachable on a directly connected
581          * network and try to find a corresponding interface to take
582          * the source address from.
583          */
584         if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
585                 struct ifnet *ifp;
586 
587                 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin));
588                 if (ia == NULL)
589                         ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin));
590                 if (ia == NULL) {
591                         error = ENETUNREACH;
592                         goto done;
593                 }
594 
595                 if (cred == NULL || !jailed(cred)) {
596                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
597                         goto done;
598                 }
599 
600                 ifp = ia->ia_ifp;
601                 ia = NULL;
602                 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
603 
604                         sa = ifa->ifa_addr;
605                         if (sa->sa_family != AF_INET)
606                                 continue;
607                         sin = (struct sockaddr_in *)sa;
608                         if (prison_check_ip4(cred, &sin->sin_addr)) {
609                                 ia = (struct in_ifaddr *)ifa;
610                                 break;
611                         }
612                 }
613                 if (ia != NULL) {
614                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
615                         goto done;
616                 }
617 
618                 /* 3. As a last resort return the 'default' jail address. */
619                 if (prison_getip4(cred, laddr) != 0)
620                         error = EADDRNOTAVAIL;
621                 goto done;
622         }
623 
624         /*
625          * If the outgoing interface on the route found is not
626          * a loopback interface, use the address from that interface.
627          * In case of jails do those three steps:
628          * 1. check if the interface address belongs to the jail. If so use it.
629          * 2. check if we have any address on the outgoing interface
630          *    belonging to this jail. If so use it.
631          * 3. as a last resort return the 'default' jail address.
632          */
633         if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
634 
635                 /* If not jailed, use the default returned. */
636                 if (cred == NULL || !jailed(cred)) {
637                         ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
638                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
639                         goto done;
640                 }
641 
642                 /* Jailed. */
643                 /* 1. Check if the iface address belongs to the jail. */
644                 sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
645                 if (prison_check_ip4(cred, &sin->sin_addr)) {
646                         ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
647                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
648                         goto done;
649                 }
650 
651                 /*
652                  * 2. Check if we have any address on the outgoing interface
653                  *    belonging to this jail.
654                  */
655                 TAILQ_FOREACH(ifa, &sro.ro_rt->rt_ifp->if_addrhead, ifa_link) {
656 
657                         sa = ifa->ifa_addr;
658                         if (sa->sa_family != AF_INET)
659                                 continue;
660                         sin = (struct sockaddr_in *)sa;
661                         if (prison_check_ip4(cred, &sin->sin_addr)) {
662                                 ia = (struct in_ifaddr *)ifa;
663                                 break;
664                         }
665                 }
666                 if (ia != NULL) {
667                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
668                         goto done;
669                 }
670 
671                 /* 3. As a last resort return the 'default' jail address. */
672                 if (prison_getip4(cred, laddr) != 0)
673                         error = EADDRNOTAVAIL;
674                 goto done;
675         }
676 
677         /*
678          * The outgoing interface is marked with 'loopback net', so a route
679          * to ourselves is here.
680          * Try to find the interface of the destination address and then
681          * take the address from there. That interface is not necessarily
682          * a loopback interface.
683          * In case of jails, check that it is an address of the jail
684          * and if we cannot find, fall back to the 'default' jail address.
685          */
686         if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
687                 struct sockaddr_in sain;
688 
689                 bzero(&sain, sizeof(struct sockaddr_in));
690                 sain.sin_family = AF_INET;
691                 sain.sin_len = sizeof(struct sockaddr_in);
692                 sain.sin_addr.s_addr = faddr->s_addr;
693 
694                 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain)));
695                 if (ia == NULL)
696                         ia = ifatoia(ifa_ifwithnet(sintosa(&sain)));
697 
698                 if (cred == NULL || !jailed(cred)) {
699                         if (ia == NULL) {
700                                 error = ENETUNREACH;
701                                 goto done;
702                         }
703                         laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
704                         goto done;
705                 }
706 
707                 /* Jailed. */
708                 if (ia != NULL) {
709                         struct ifnet *ifp;
710 
711                         ifp = ia->ia_ifp;
712                         ia = NULL;
713                         TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
714 
715                                 sa = ifa->ifa_addr;
716                                 if (sa->sa_family != AF_INET)
717                                         continue;
718                                 sin = (struct sockaddr_in *)sa;
719                                 if (prison_check_ip4(cred, &sin->sin_addr)) {
720                                         ia = (struct in_ifaddr *)ifa;
721                                         break;
722                                 }
723                         }
724                         if (ia != NULL) {
725                                 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
726                                 goto done;
727                         }
728                 }
729 
730                 /* 3. As a last resort return the 'default' jail address. */
731                 if (prison_getip4(cred, laddr) != 0)
732                         error = EADDRNOTAVAIL;
733                 goto done;
734         }
735 
736 done:
737         if (sro.ro_rt != NULL)
738                 RTFREE(sro.ro_rt);
739         return (error);
740 }
741 
742 /*
743  * Set up for a connect from a socket to the specified address.
744  * On entry, *laddrp and *lportp should contain the current local
745  * address and port for the PCB; these are updated to the values
746  * that should be placed in inp_laddr and inp_lport to complete
747  * the connect.
748  *
749  * On success, *faddrp and *fportp will be set to the remote address
750  * and port. These are not updated in the error case.
751  *
752  * If the operation fails because the connection already exists,
753  * *oinpp will be set to the PCB of that connection so that the
754  * caller can decide to override it. In all other cases, *oinpp
755  * is set to NULL.
756  */
757 int
758 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
759     in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
760     struct inpcb **oinpp, struct ucred *cred)
761 {
762         INIT_VNET_INET(inp->inp_vnet);
763         struct sockaddr_in *sin = (struct sockaddr_in *)nam;
764         struct in_ifaddr *ia;
765         struct inpcb *oinp;
766         struct in_addr laddr, faddr, jailia;
767         u_short lport, fport;
768         int error;
769 
770         /*
771          * Because a global state change doesn't actually occur here, a read
772          * lock is sufficient.
773          */
774         INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
775         INP_LOCK_ASSERT(inp);
776 
777         if (oinpp != NULL)
778                 *oinpp = NULL;
779         if (nam->sa_len != sizeof (*sin))
780                 return (EINVAL);
781         if (sin->sin_family != AF_INET)
782                 return (EAFNOSUPPORT);
783         if (sin->sin_port == 0)
784                 return (EADDRNOTAVAIL);
785         laddr.s_addr = *laddrp;
786         lport = *lportp;
787         faddr = sin->sin_addr;
788         fport = sin->sin_port;
789 
790         if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
791                 /*
792                  * If the destination address is INADDR_ANY,
793                  * use the primary local address.
794                  * If the supplied address is INADDR_BROADCAST,
795                  * and the primary interface supports broadcast,
796                  * choose the broadcast address for that interface.
797                  */
798                 if (faddr.s_addr == INADDR_ANY) {
799                         if (cred != NULL && jailed(cred)) {
800                                 if (prison_getip4(cred, &jailia) != 0)
801                                         return (EADDRNOTAVAIL);
802                                 faddr.s_addr = jailia.s_addr;
803                         } else {
804                                 faddr =
805                                     IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->
806                                     sin_addr;
807                         }
808                 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST &&
809                     (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
810                     IFF_BROADCAST))
811                         faddr = satosin(&TAILQ_FIRST(
812                             &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
813         }
814         if (laddr.s_addr == INADDR_ANY) {
815                 error = in_pcbladdr(inp, &faddr, &laddr, cred);
816                 if (error)
817                         return (error);
818 
819                 /*
820                  * If the destination address is multicast and an outgoing
821                  * interface has been set as a multicast option, use the
822                  * address of that interface as our source address.
823                  */
824                 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
825                     inp->inp_moptions != NULL) {
826                         struct ip_moptions *imo;
827                         struct ifnet *ifp;
828 
829                         imo = inp->inp_moptions;
830                         if (imo->imo_multicast_ifp != NULL) {
831                                 ifp = imo->imo_multicast_ifp;
832                                 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link)
833                                         if (ia->ia_ifp == ifp)
834                                                 break;
835                                 if (ia == NULL)
836                                         return (EADDRNOTAVAIL);
837                                 laddr = ia->ia_addr.sin_addr;
838                         }
839                 }
840         }
841 
842         oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
843             0, NULL);
844         if (oinp != NULL) {
845                 if (oinpp != NULL)
846                         *oinpp = oinp;
847                 return (EADDRINUSE);
848         }
849         if (lport == 0) {
850                 error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
851                     cred);
852                 if (error)
853                         return (error);
854         }
855         *laddrp = laddr.s_addr;
856         *lportp = lport;
857         *faddrp = faddr.s_addr;
858         *fportp = fport;
859         return (0);
860 }
861 
862 void
863 in_pcbdisconnect(struct inpcb *inp)
864 {
865 
866         INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
867         INP_WLOCK_ASSERT(inp);
868 
869         inp->inp_faddr.s_addr = INADDR_ANY;
870         inp->inp_fport = 0;
871         in_pcbrehash(inp);
872 }
873 
874 /*
875  * Historically, in_pcbdetach() included the functionality now found in
876  * in_pcbfree() and in_pcbdrop().  They are now broken out to reflect the
877  * more complex life cycle of TCP.
878  *
879  * in_pcbdetach() is responsible for disconnecting the socket from an inpcb.
880  * For most protocols, this will be invoked immediately prior to calling
881  * in_pcbfree().  However, for TCP the inpcb may significantly outlive the
882  * socket, in which case in_pcbfree() may be deferred.
883  */
884 void
885 in_pcbdetach(struct inpcb *inp)
886 {
887 
888         KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
889 
890         inp->inp_socket->so_pcb = NULL;
891         inp->inp_socket = NULL;
892 }
893 
894 /*
895  * in_pcbfree() is responsible for freeing an already-detached inpcb, as well
896  * as removing it from any global inpcb lists it might be on.
897  */
898 void
899 in_pcbfree(struct inpcb *inp)
900 {
901         struct inpcbinfo *ipi = inp->inp_pcbinfo;
902 
903         KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
904 
905         INP_INFO_WLOCK_ASSERT(ipi);
906         INP_WLOCK_ASSERT(inp);
907 
908 #ifdef IPSEC
909         if (inp->inp_sp != NULL)
910                 ipsec_delete_pcbpolicy(inp);
911 #endif /* IPSEC */
912         inp->inp_gencnt = ++ipi->ipi_gencnt;
913         in_pcbremlists(inp);
914 #ifdef INET6
915         if (inp->inp_vflag & INP_IPV6PROTO) {
916                 ip6_freepcbopts(inp->in6p_outputopts);
917                 ip6_freemoptions(inp->in6p_moptions);
918         }
919 #endif
920         if (inp->inp_options)
921                 (void)m_free(inp->inp_options);
922         if (inp->inp_moptions != NULL)
923                 inp_freemoptions(inp->inp_moptions);
924         inp->inp_vflag = 0;
925         crfree(inp->inp_cred);
926 
927 #ifdef MAC
928         mac_inpcb_destroy(inp);
929 #endif
930         INP_WUNLOCK(inp);
931         uma_zfree(ipi->ipi_zone, inp);
932 }
933 
934 /*
935  * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
936  * port reservation, and preventing it from being returned by inpcb lookups.
937  *
938  * It is used by TCP to mark an inpcb as unused and avoid future packet
939  * delivery or event notification when a socket remains open but TCP has
940  * closed.  This might occur as a result of a shutdown()-initiated TCP close
941  * or a RST on the wire, and allows the port binding to be reused while still
942  * maintaining the invariant that so_pcb always points to a valid inpcb until
943  * in_pcbdetach().
944  *
945  * XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash
946  * lists, but can lead to confusing netstat output, as open sockets with
947  * closed TCP connections will no longer appear to have their bound port
948  * number.  An explicit flag would be better, as it would allow us to leave
949  * the port number intact after the connection is dropped.
950  *
951  * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
952  * in_pcbnotifyall() and in_pcbpurgeif0()?
953  */
954 void
955 in_pcbdrop(struct inpcb *inp)
956 {
957 
958         INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
959         INP_WLOCK_ASSERT(inp);
960 
961         inp->inp_vflag |= INP_DROPPED;
962         if (inp->inp_lport) {
963                 struct inpcbport *phd = inp->inp_phd;
964 
965                 LIST_REMOVE(inp, inp_hash);
966                 LIST_REMOVE(inp, inp_portlist);
967                 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
968                         LIST_REMOVE(phd, phd_hash);
969                         free(phd, M_PCB);
970                 }
971                 inp->inp_lport = 0;
972         }
973 }
974 
975 /*
976  * Common routines to return the socket addresses associated with inpcbs.
977  */
978 struct sockaddr *
979 in_sockaddr(in_port_t port, struct in_addr *addr_p)
980 {
981         struct