FreeBSD/Linux Kernel Cross Reference
sys/netinet/in_pcb.c
1 /*-
2 * Copyright (c) 1982, 1986, 1991, 1993, 1995
3 * The Regents of the University of California.
4 * Copyright (c) 2007 Robert N. M. Watson
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 4. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
32 */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD: src/sys/netinet/in_pcb.c,v 1.231 2008/12/02 21:37:28 bz Exp $");
36
37 #include "opt_ddb.h"
38 #include "opt_ipsec.h"
39 #include "opt_inet6.h"
40 #include "opt_mac.h"
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/malloc.h>
45 #include <sys/mbuf.h>
46 #include <sys/domain.h>
47 #include <sys/protosw.h>
48 #include <sys/socket.h>
49 #include <sys/socketvar.h>
50 #include <sys/priv.h>
51 #include <sys/proc.h>
52 #include <sys/jail.h>
53 #include <sys/kernel.h>
54 #include <sys/sysctl.h>
55 #include <sys/vimage.h>
56
57 #ifdef DDB
58 #include <ddb/ddb.h>
59 #endif
60
61 #include <vm/uma.h>
62
63 #include <net/if.h>
64 #include <net/if_types.h>
65 #include <net/route.h>
66
67 #include <netinet/in.h>
68 #include <netinet/in_pcb.h>
69 #include <netinet/in_var.h>
70 #include <netinet/ip_var.h>
71 #include <netinet/tcp_var.h>
72 #include <netinet/udp.h>
73 #include <netinet/udp_var.h>
74 #include <netinet/vinet.h>
75 #ifdef INET6
76 #include <netinet/ip6.h>
77 #include <netinet6/ip6_var.h>
78 #include <netinet6/vinet6.h>
79 #endif /* INET6 */
80
81
82 #ifdef IPSEC
83 #include <netipsec/ipsec.h>
84 #include <netipsec/key.h>
85 #endif /* IPSEC */
86
87 #include <security/mac/mac_framework.h>
88
#ifdef VIMAGE_GLOBALS
/*
 * Bounds of the local port ranges handed out to sockets that did not ask
 * for a specific port ("unspecified" outgoing connections/packets).  Each
 * pair is a first/last bound for the low, default and high ranges.
 */
int	ipport_lowfirstauto, ipport_lowlastauto;
int	ipport_firstauto, ipport_lastauto;
int	ipport_hifirstauto, ipport_hilastauto;

/*
 * Bounds of the reserved port range, which only root may bind.  Changing
 * these has significant security implications; please be careful.
 */
int	ipport_reservedhigh, ipport_reservedlow;

/* State for random ephemeral port allocation. */
int	ipport_randomized;
int	ipport_randomcps;
int	ipport_randomtime;
int	ipport_stoprandom;
int	ipport_tcpallocs;
int	ipport_tcplastcount;
#endif
117
118 #define RANGECHK(var, min, max) \
119 if ((var) < (min)) { (var) = (min); } \
120 else if ((var) > (max)) { (var) = (max); }
121
122 static int
123 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
124 {
125 INIT_VNET_INET(curvnet);
126 int error;
127
128 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
129 if (error == 0) {
130 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
131 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
132 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
133 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
134 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
135 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
136 }
137 return (error);
138 }
139
140 #undef RANGECHK
141
142 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
143
144 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
145 lowfirst, CTLTYPE_INT|CTLFLAG_RW, ipport_lowfirstauto, 0,
146 &sysctl_net_ipport_check, "I", "");
147 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
148 lowlast, CTLTYPE_INT|CTLFLAG_RW, ipport_lowlastauto, 0,
149 &sysctl_net_ipport_check, "I", "");
150 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
151 first, CTLTYPE_INT|CTLFLAG_RW, ipport_firstauto, 0,
152 &sysctl_net_ipport_check, "I", "");
153 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
154 last, CTLTYPE_INT|CTLFLAG_RW, ipport_lastauto, 0,
155 &sysctl_net_ipport_check, "I", "");
156 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
157 hifirst, CTLTYPE_INT|CTLFLAG_RW, ipport_hifirstauto, 0,
158 &sysctl_net_ipport_check, "I", "");
159 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
160 hilast, CTLTYPE_INT|CTLFLAG_RW, ipport_hilastauto, 0,
161 &sysctl_net_ipport_check, "I", "");
162 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO,
163 reservedhigh, CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedhigh, 0, "");
164 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, reservedlow,
165 CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedlow, 0, "");
166 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomized,
167 CTLFLAG_RW, ipport_randomized, 0, "Enable random port allocation");
168 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomcps,
169 CTLFLAG_RW, ipport_randomcps, 0, "Maximum number of random port "
170 "allocations before switching to a sequental one");
171 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomtime,
172 CTLFLAG_RW, ipport_randomtime, 0,
173 "Minimum time to keep sequental port "
174 "allocation before switching to a random one");
175
176 /*
177 * in_pcb.c: manage the Protocol Control Blocks.
178 *
179 * NOTE: It is assumed that most of these functions will be called with
180 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
181 * functions often modify hash chains or addresses in pcbs.
182 */
183
184 /*
185 * Allocate a PCB and associate it with the socket.
186 * On success return with the PCB locked.
187 */
188 int
189 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
190 {
191 #ifdef INET6
192 INIT_VNET_INET6(curvnet);
193 #endif
194 struct inpcb *inp;
195 int error;
196
197 INP_INFO_WLOCK_ASSERT(pcbinfo);
198 error = 0;
199 inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
200 if (inp == NULL)
201 return (ENOBUFS);
202 bzero(inp, inp_zero_size);
203 inp->inp_pcbinfo = pcbinfo;
204 inp->inp_socket = so;
205 inp->inp_cred = crhold(so->so_cred);
206 inp->inp_inc.inc_fibnum = so->so_fibnum;
207 #ifdef MAC
208 error = mac_inpcb_init(inp, M_NOWAIT);
209 if (error != 0)
210 goto out;
211 SOCK_LOCK(so);
212 mac_inpcb_create(so, inp);
213 SOCK_UNLOCK(so);
214 #endif
215
216 #ifdef IPSEC
217 error = ipsec_init_policy(so, &inp->inp_sp);
218 if (error != 0) {
219 #ifdef MAC
220 mac_inpcb_destroy(inp);
221 #endif
222 goto out;
223 }
224 #endif /*IPSEC*/
225 #ifdef INET6
226 if (INP_SOCKAF(so) == AF_INET6) {
227 inp->inp_vflag |= INP_IPV6PROTO;
228 if (V_ip6_v6only)
229 inp->inp_flags |= IN6P_IPV6_V6ONLY;
230 }
231 #endif
232 LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
233 pcbinfo->ipi_count++;
234 so->so_pcb = (caddr_t)inp;
235 #ifdef INET6
236 if (V_ip6_auto_flowlabel)
237 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
238 #endif
239 INP_WLOCK(inp);
240 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
241
242 #if defined(IPSEC) || defined(MAC)
243 out:
244 if (error != 0) {
245 crfree(inp->inp_cred);
246 uma_zfree(pcbinfo->ipi_zone, inp);
247 }
248 #endif
249 return (error);
250 }
251
252 int
253 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
254 {
255 int anonport, error;
256
257 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
258 INP_WLOCK_ASSERT(inp);
259
260 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
261 return (EINVAL);
262 anonport = inp->inp_lport == 0 && (nam == NULL ||
263 ((struct sockaddr_in *)nam)->sin_port == 0);
264 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
265 &inp->inp_lport, cred);
266 if (error)
267 return (error);
268 if (in_pcbinshash(inp) != 0) {
269 inp->inp_laddr.s_addr = INADDR_ANY;
270 inp->inp_lport = 0;
271 return (EAGAIN);
272 }
273 if (anonport)
274 inp->inp_flags |= INP_ANONPORT;
275 return (0);
276 }
277
278 /*
279 * Set up a bind operation on a PCB, performing port allocation
280 * as required, but do not actually modify the PCB. Callers can
281 * either complete the bind by setting inp_laddr/inp_lport and
282 * calling in_pcbinshash(), or they can just use the resulting
283 * port and address to authorise the sending of a once-off packet.
284 *
285 * On error, the values of *laddrp and *lportp are not changed.
286 */
287 int
288 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
289 u_short *lportp, struct ucred *cred)
290 {
291 INIT_VNET_INET(inp->inp_vnet);
292 struct socket *so = inp->inp_socket;
293 unsigned short *lastport;
294 struct sockaddr_in *sin;
295 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
296 struct in_addr laddr;
297 u_short lport = 0;
298 int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
299 int error;
300 int dorandom;
301
302 /*
303 * Because no actual state changes occur here, a global write lock on
304 * the pcbinfo isn't required.
305 */
306 INP_INFO_LOCK_ASSERT(pcbinfo);
307 INP_LOCK_ASSERT(inp);
308
309 if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
310 return (EADDRNOTAVAIL);
311 laddr.s_addr = *laddrp;
312 if (nam != NULL && laddr.s_addr != INADDR_ANY)
313 return (EINVAL);
314 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
315 wild = INPLOOKUP_WILDCARD;
316 if (nam) {
317 sin = (struct sockaddr_in *)nam;
318 if (nam->sa_len != sizeof (*sin))
319 return (EINVAL);
320 #ifdef notdef
321 /*
322 * We should check the family, but old programs
323 * incorrectly fail to initialize it.
324 */
325 if (sin->sin_family != AF_INET)
326 return (EAFNOSUPPORT);
327 #endif
328 if (prison_local_ip4(cred, &sin->sin_addr))
329 return (EINVAL);
330 if (sin->sin_port != *lportp) {
331 /* Don't allow the port to change. */
332 if (*lportp != 0)
333 return (EINVAL);
334 lport = sin->sin_port;
335 }
336 /* NB: lport is left as 0 if the port isn't being changed. */
337 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
338 /*
339 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
340 * allow complete duplication of binding if
341 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
342 * and a multicast address is bound on both
343 * new and duplicated sockets.
344 */
345 if (so->so_options & SO_REUSEADDR)
346 reuseport = SO_REUSEADDR|SO_REUSEPORT;
347 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
348 sin->sin_port = 0; /* yech... */
349 bzero(&sin->sin_zero, sizeof(sin->sin_zero));
350 if (ifa_ifwithaddr((struct sockaddr *)sin) == 0)
351 return (EADDRNOTAVAIL);
352 }
353 laddr = sin->sin_addr;
354 if (lport) {
355 struct inpcb *t;
356 struct tcptw *tw;
357
358 /* GROSS */
359 if (ntohs(lport) <= V_ipport_reservedhigh &&
360 ntohs(lport) >= V_ipport_reservedlow &&
361 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
362 0))
363 return (EACCES);
364 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
365 priv_check_cred(inp->inp_cred,
366 PRIV_NETINET_REUSEPORT, 0) != 0) {
367 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
368 lport, INPLOOKUP_WILDCARD, cred);
369 /*
370 * XXX
371 * This entire block sorely needs a rewrite.
372 */
373 if (t &&
374 ((t->inp_vflag & INP_TIMEWAIT) == 0) &&
375 (so->so_type != SOCK_STREAM ||
376 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
377 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
378 ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
379 (t->inp_socket->so_options &
380 SO_REUSEPORT) == 0) &&
381 (inp->inp_cred->cr_uid !=
382 t->inp_cred->cr_uid))
383 return (EADDRINUSE);
384 }
385 if (prison_local_ip4(cred, &sin->sin_addr))
386 return (EADDRNOTAVAIL);
387 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
388 lport, wild, cred);
389 if (t && (t->inp_vflag & INP_TIMEWAIT)) {
390 /*
391 * XXXRW: If an incpb has had its timewait
392 * state recycled, we treat the address as
393 * being in use (for now). This is better
394 * than a panic, but not desirable.
395 */
396 tw = intotw(inp);
397 if (tw == NULL ||
398 (reuseport & tw->tw_so_options) == 0)
399 return (EADDRINUSE);
400 } else if (t &&
401 (reuseport & t->inp_socket->so_options) == 0) {
402 #ifdef INET6
403 if (ntohl(sin->sin_addr.s_addr) !=
404 INADDR_ANY ||
405 ntohl(t->inp_laddr.s_addr) !=
406 INADDR_ANY ||
407 INP_SOCKAF(so) ==
408 INP_SOCKAF(t->inp_socket))
409 #endif
410 return (EADDRINUSE);
411 }
412 }
413 }
414 if (*lportp != 0)
415 lport = *lportp;
416 if (lport == 0) {
417 u_short first, last, aux;
418 int count;
419
420 if (prison_local_ip4(cred, &laddr))
421 return (EINVAL);
422
423 if (inp->inp_flags & INP_HIGHPORT) {
424 first = V_ipport_hifirstauto; /* sysctl */
425 last = V_ipport_hilastauto;
426 lastport = &pcbinfo->ipi_lasthi;
427 } else if (inp->inp_flags & INP_LOWPORT) {
428 error = priv_check_cred(cred,
429 PRIV_NETINET_RESERVEDPORT, 0);
430 if (error)
431 return error;
432 first = V_ipport_lowfirstauto; /* 1023 */
433 last = V_ipport_lowlastauto; /* 600 */
434 lastport = &pcbinfo->ipi_lastlow;
435 } else {
436 first = V_ipport_firstauto; /* sysctl */
437 last = V_ipport_lastauto;
438 lastport = &pcbinfo->ipi_lastport;
439 }
440 /*
441 * For UDP, use random port allocation as long as the user
442 * allows it. For TCP (and as of yet unknown) connections,
443 * use random port allocation only if the user allows it AND
444 * ipport_tick() allows it.
445 */
446 if (V_ipport_randomized &&
447 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo))
448 dorandom = 1;
449 else
450 dorandom = 0;
451 /*
452 * It makes no sense to do random port allocation if
453 * we have the only port available.
454 */
455 if (first == last)
456 dorandom = 0;
457 /* Make sure to not include UDP packets in the count. */
458 if (pcbinfo != &V_udbinfo)
459 V_ipport_tcpallocs++;
460 /*
461 * Instead of having two loops further down counting up or down
462 * make sure that first is always <= last and go with only one
463 * code path implementing all logic.
464 */
465 if (first > last) {
466 aux = first;
467 first = last;
468 last = aux;
469 }
470
471 if (dorandom)
472 *lastport = first +
473 (arc4random() % (last - first));
474
475 count = last - first;
476
477 do {
478 if (count-- < 0) /* completely used? */
479 return (EADDRNOTAVAIL);
480 ++*lastport;
481 if (*lastport < first || *lastport > last)
482 *lastport = first;
483 lport = htons(*lastport);
484 } while (in_pcblookup_local(pcbinfo, laddr,
485 lport, wild, cred));
486 }
487 if (prison_local_ip4(cred, &laddr))
488 return (EINVAL);
489 *laddrp = laddr.s_addr;
490 *lportp = lport;
491 return (0);
492 }
493
494 /*
495 * Connect from a socket to a specified address.
496 * Both address and port must be specified in argument sin.
497 * If don't have a local address for this socket yet,
498 * then pick one.
499 */
500 int
501 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
502 {
503 u_short lport, fport;
504 in_addr_t laddr, faddr;
505 int anonport, error;
506
507 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
508 INP_WLOCK_ASSERT(inp);
509
510 lport = inp->inp_lport;
511 laddr = inp->inp_laddr.s_addr;
512 anonport = (lport == 0);
513 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
514 NULL, cred);
515 if (error)
516 return (error);
517
518 /* Do the initial binding of the local address if required. */
519 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
520 inp->inp_lport = lport;
521 inp->inp_laddr.s_addr = laddr;
522 if (in_pcbinshash(inp) != 0) {
523 inp->inp_laddr.s_addr = INADDR_ANY;
524 inp->inp_lport = 0;
525 return (EAGAIN);
526 }
527 }
528
529 /* Commit the remaining changes. */
530 inp->inp_lport = lport;
531 inp->inp_laddr.s_addr = laddr;
532 inp->inp_faddr.s_addr = faddr;
533 inp->inp_fport = fport;
534 in_pcbrehash(inp);
535
536 if (anonport)
537 inp->inp_flags |= INP_ANONPORT;
538 return (0);
539 }
540
541 /*
542 * Do proper source address selection on an unbound socket in case
543 * of connect. Take jails into account as well.
544 */
545 static int
546 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
547 struct ucred *cred)
548 {
549 struct in_ifaddr *ia;
550 struct ifaddr *ifa;
551 struct sockaddr *sa;
552 struct sockaddr_in *sin;
553 struct route sro;
554 int error;
555
556 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
557
558 error = 0;
559 ia = NULL;
560 bzero(&sro, sizeof(sro));
561
562 sin = (struct sockaddr_in *)&sro.ro_dst;
563 sin->sin_family = AF_INET;
564 sin->sin_len = sizeof(struct sockaddr_in);
565 sin->sin_addr.s_addr = faddr->s_addr;
566
567 /*
568 * If route is known our src addr is taken from the i/f,
569 * else punt.
570 *
571 * Find out route to destination.
572 */
573 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
574 in_rtalloc_ign(&sro, RTF_CLONING, inp->inp_inc.inc_fibnum);
575
576 /*
577 * If we found a route, use the address corresponding to
578 * the outgoing interface.
579 *
580 * Otherwise assume faddr is reachable on a directly connected
581 * network and try to find a corresponding interface to take
582 * the source address from.
583 */
584 if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
585 struct ifnet *ifp;
586
587 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin));
588 if (ia == NULL)
589 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin));
590 if (ia == NULL) {
591 error = ENETUNREACH;
592 goto done;
593 }
594
595 if (cred == NULL || !jailed(cred)) {
596 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
597 goto done;
598 }
599
600 ifp = ia->ia_ifp;
601 ia = NULL;
602 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
603
604 sa = ifa->ifa_addr;
605 if (sa->sa_family != AF_INET)
606 continue;
607 sin = (struct sockaddr_in *)sa;
608 if (prison_check_ip4(cred, &sin->sin_addr)) {
609 ia = (struct in_ifaddr *)ifa;
610 break;
611 }
612 }
613 if (ia != NULL) {
614 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
615 goto done;
616 }
617
618 /* 3. As a last resort return the 'default' jail address. */
619 if (prison_getip4(cred, laddr) != 0)
620 error = EADDRNOTAVAIL;
621 goto done;
622 }
623
624 /*
625 * If the outgoing interface on the route found is not
626 * a loopback interface, use the address from that interface.
627 * In case of jails do those three steps:
628 * 1. check if the interface address belongs to the jail. If so use it.
629 * 2. check if we have any address on the outgoing interface
630 * belonging to this jail. If so use it.
631 * 3. as a last resort return the 'default' jail address.
632 */
633 if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
634
635 /* If not jailed, use the default returned. */
636 if (cred == NULL || !jailed(cred)) {
637 ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
638 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
639 goto done;
640 }
641
642 /* Jailed. */
643 /* 1. Check if the iface address belongs to the jail. */
644 sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
645 if (prison_check_ip4(cred, &sin->sin_addr)) {
646 ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
647 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
648 goto done;
649 }
650
651 /*
652 * 2. Check if we have any address on the outgoing interface
653 * belonging to this jail.
654 */
655 TAILQ_FOREACH(ifa, &sro.ro_rt->rt_ifp->if_addrhead, ifa_link) {
656
657 sa = ifa->ifa_addr;
658 if (sa->sa_family != AF_INET)
659 continue;
660 sin = (struct sockaddr_in *)sa;
661 if (prison_check_ip4(cred, &sin->sin_addr)) {
662 ia = (struct in_ifaddr *)ifa;
663 break;
664 }
665 }
666 if (ia != NULL) {
667 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
668 goto done;
669 }
670
671 /* 3. As a last resort return the 'default' jail address. */
672 if (prison_getip4(cred, laddr) != 0)
673 error = EADDRNOTAVAIL;
674 goto done;
675 }
676
677 /*
678 * The outgoing interface is marked with 'loopback net', so a route
679 * to ourselves is here.
680 * Try to find the interface of the destination address and then
681 * take the address from there. That interface is not necessarily
682 * a loopback interface.
683 * In case of jails, check that it is an address of the jail
684 * and if we cannot find, fall back to the 'default' jail address.
685 */
686 if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
687 struct sockaddr_in sain;
688
689 bzero(&sain, sizeof(struct sockaddr_in));
690 sain.sin_family = AF_INET;
691 sain.sin_len = sizeof(struct sockaddr_in);
692 sain.sin_addr.s_addr = faddr->s_addr;
693
694 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain)));
695 if (ia == NULL)
696 ia = ifatoia(ifa_ifwithnet(sintosa(&sain)));
697
698 if (cred == NULL || !jailed(cred)) {
699 if (ia == NULL) {
700 error = ENETUNREACH;
701 goto done;
702 }
703 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
704 goto done;
705 }
706
707 /* Jailed. */
708 if (ia != NULL) {
709 struct ifnet *ifp;
710
711 ifp = ia->ia_ifp;
712 ia = NULL;
713 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
714
715 sa = ifa->ifa_addr;
716 if (sa->sa_family != AF_INET)
717 continue;
718 sin = (struct sockaddr_in *)sa;
719 if (prison_check_ip4(cred, &sin->sin_addr)) {
720 ia = (struct in_ifaddr *)ifa;
721 break;
722 }
723 }
724 if (ia != NULL) {
725 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
726 goto done;
727 }
728 }
729
730 /* 3. As a last resort return the 'default' jail address. */
731 if (prison_getip4(cred, laddr) != 0)
732 error = EADDRNOTAVAIL;
733 goto done;
734 }
735
736 done:
737 if (sro.ro_rt != NULL)
738 RTFREE(sro.ro_rt);
739 return (error);
740 }
741
742 /*
743 * Set up for a connect from a socket to the specified address.
744 * On entry, *laddrp and *lportp should contain the current local
745 * address and port for the PCB; these are updated to the values
746 * that should be placed in inp_laddr and inp_lport to complete
747 * the connect.
748 *
749 * On success, *faddrp and *fportp will be set to the remote address
750 * and port. These are not updated in the error case.
751 *
752 * If the operation fails because the connection already exists,
753 * *oinpp will be set to the PCB of that connection so that the
754 * caller can decide to override it. In all other cases, *oinpp
755 * is set to NULL.
756 */
757 int
758 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
759 in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
760 struct inpcb **oinpp, struct ucred *cred)
761 {
762 INIT_VNET_INET(inp->inp_vnet);
763 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
764 struct in_ifaddr *ia;
765 struct inpcb *oinp;
766 struct in_addr laddr, faddr, jailia;
767 u_short lport, fport;
768 int error;
769
770 /*
771 * Because a global state change doesn't actually occur here, a read
772 * lock is sufficient.
773 */
774 INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
775 INP_LOCK_ASSERT(inp);
776
777 if (oinpp != NULL)
778 *oinpp = NULL;
779 if (nam->sa_len != sizeof (*sin))
780 return (EINVAL);
781 if (sin->sin_family != AF_INET)
782 return (EAFNOSUPPORT);
783 if (sin->sin_port == 0)
784 return (EADDRNOTAVAIL);
785 laddr.s_addr = *laddrp;
786 lport = *lportp;
787 faddr = sin->sin_addr;
788 fport = sin->sin_port;
789
790 if (!TAILQ_EMPTY(&V_in_ifaddrhead)) {
791 /*
792 * If the destination address is INADDR_ANY,
793 * use the primary local address.
794 * If the supplied address is INADDR_BROADCAST,
795 * and the primary interface supports broadcast,
796 * choose the broadcast address for that interface.
797 */
798 if (faddr.s_addr == INADDR_ANY) {
799 if (cred != NULL && jailed(cred)) {
800 if (prison_getip4(cred, &jailia) != 0)
801 return (EADDRNOTAVAIL);
802 faddr.s_addr = jailia.s_addr;
803 } else {
804 faddr =
805 IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->
806 sin_addr;
807 }
808 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST &&
809 (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
810 IFF_BROADCAST))
811 faddr = satosin(&TAILQ_FIRST(
812 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
813 }
814 if (laddr.s_addr == INADDR_ANY) {
815 error = in_pcbladdr(inp, &faddr, &laddr, cred);
816 if (error)
817 return (error);
818
819 /*
820 * If the destination address is multicast and an outgoing
821 * interface has been set as a multicast option, use the
822 * address of that interface as our source address.
823 */
824 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
825 inp->inp_moptions != NULL) {
826 struct ip_moptions *imo;
827 struct ifnet *ifp;
828
829 imo = inp->inp_moptions;
830 if (imo->imo_multicast_ifp != NULL) {
831 ifp = imo->imo_multicast_ifp;
832 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link)
833 if (ia->ia_ifp == ifp)
834 break;
835 if (ia == NULL)
836 return (EADDRNOTAVAIL);
837 laddr = ia->ia_addr.sin_addr;
838 }
839 }
840 }
841
842 oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
843 0, NULL);
844 if (oinp != NULL) {
845 if (oinpp != NULL)
846 *oinpp = oinp;
847 return (EADDRINUSE);
848 }
849 if (lport == 0) {
850 error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
851 cred);
852 if (error)
853 return (error);
854 }
855 *laddrp = laddr.s_addr;
856 *lportp = lport;
857 *faddrp = faddr.s_addr;
858 *fportp = fport;
859 return (0);
860 }
861
862 void
863 in_pcbdisconnect(struct inpcb *inp)
864 {
865
866 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
867 INP_WLOCK_ASSERT(inp);
868
869 inp->inp_faddr.s_addr = INADDR_ANY;
870 inp->inp_fport = 0;
871 in_pcbrehash(inp);
872 }
873
874 /*
875 * Historically, in_pcbdetach() included the functionality now found in
876 * in_pcbfree() and in_pcbdrop(). They are now broken out to reflect the
877 * more complex life cycle of TCP.
878 *
879 * in_pcbdetach() is responsibe for disconnecting the socket from an inpcb.
880 * For most protocols, this will be invoked immediately prior to calling
881 * in_pcbfree(). However, for TCP the inpcb may significantly outlive the
882 * socket, in which case in_pcbfree() may be deferred.
883 */
884 void
885 in_pcbdetach(struct inpcb *inp)
886 {
887
888 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
889
890 inp->inp_socket->so_pcb = NULL;
891 inp->inp_socket = NULL;
892 }
893
894 /*
895 * in_pcbfree() is responsible for freeing an already-detached inpcb, as well
896 * as removing it from any global inpcb lists it might be on.
897 */
898 void
899 in_pcbfree(struct inpcb *inp)
900 {
901 struct inpcbinfo *ipi = inp->inp_pcbinfo;
902
903 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
904
905 INP_INFO_WLOCK_ASSERT(ipi);
906 INP_WLOCK_ASSERT(inp);
907
908 #ifdef IPSEC
909 if (inp->inp_sp != NULL)
910 ipsec_delete_pcbpolicy(inp);
911 #endif /* IPSEC */
912 inp->inp_gencnt = ++ipi->ipi_gencnt;
913 in_pcbremlists(inp);
914 #ifdef INET6
915 if (inp->inp_vflag & INP_IPV6PROTO) {
916 ip6_freepcbopts(inp->in6p_outputopts);
917 ip6_freemoptions(inp->in6p_moptions);
918 }
919 #endif
920 if (inp->inp_options)
921 (void)m_free(inp->inp_options);
922 if (inp->inp_moptions != NULL)
923 inp_freemoptions(inp->inp_moptions);
924 inp->inp_vflag = 0;
925 crfree(inp->inp_cred);
926
927 #ifdef MAC
928 mac_inpcb_destroy(inp);
929 #endif
930 INP_WUNLOCK(inp);
931 uma_zfree(ipi->ipi_zone, inp);
932 }
933
934 /*
935 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
936 * port reservation, and preventing it from being returned by inpcb lookups.
937 *
938 * It is used by TCP to mark an inpcb as unused and avoid future packet
939 * delivery or event notification when a socket remains open but TCP has
940 * closed. This might occur as a result of a shutdown()-initiated TCP close
941 * or a RST on the wire, and allows the port binding to be reused while still
942 * maintaining the invariant that so_pcb always points to a valid inpcb until
943 * in_pcbdetach().
944 *
945 * XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash
946 * lists, but can lead to confusing netstat output, as open sockets with
947 * closed TCP connections will no longer appear to have their bound port
948 * number. An explicit flag would be better, as it would allow us to leave
949 * the port number intact after the connection is dropped.
950 *
951 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
952 * in_pcbnotifyall() and in_pcbpurgeif0()?
953 */
954 void
955 in_pcbdrop(struct inpcb *inp)
956 {
957
958 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
959 INP_WLOCK_ASSERT(inp);
960
961 inp->inp_vflag |= INP_DROPPED;
962 if (inp->inp_lport) {
963 struct inpcbport *phd = inp->inp_phd;
964
965 LIST_REMOVE(inp, inp_hash);
966 LIST_REMOVE(inp, inp_portlist);
967 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
968 LIST_REMOVE(phd, phd_hash);
969 free(phd, M_PCB);
970 }
971 inp->inp_lport = 0;
972 }
973 }
974
975 /*
976 * Common routines to return the socket addresses associated with inpcbs.
977 */
978 struct sockaddr *
979 in_sockaddr(in_port_t port, struct in_addr *addr_p)
980 {
981 struct |