FreeBSD/Linux Kernel Cross Reference
sys/netinet/in_pcb.c
1 /*-
2 * Copyright (c) 1982, 1986, 1991, 1993, 1995
3 * The Regents of the University of California.
4 * Copyright (c) 2007-2009 Robert N. M. Watson
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 4. Neither the name of the University nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 *
31 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
32 */
33
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36
37 #include "opt_ddb.h"
38 #include "opt_ipsec.h"
39 #include "opt_inet.h"
40 #include "opt_inet6.h"
41 #include "opt_mac.h"
42
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/malloc.h>
46 #include <sys/mbuf.h>
47 #include <sys/domain.h>
48 #include <sys/protosw.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/priv.h>
52 #include <sys/proc.h>
53 #include <sys/jail.h>
54 #include <sys/kernel.h>
55 #include <sys/sysctl.h>
56
57 #ifdef DDB
58 #include <ddb/ddb.h>
59 #endif
60
61 #include <vm/uma.h>
62
63 #include <net/if.h>
64 #include <net/if_types.h>
65 #include <net/route.h>
66
67 #include <netinet/in.h>
68 #include <netinet/in_pcb.h>
69 #include <netinet/in_var.h>
70 #include <netinet/ip_var.h>
71 #include <netinet/tcp_var.h>
72 #include <netinet/udp.h>
73 #include <netinet/udp_var.h>
74 #ifdef INET6
75 #include <netinet/ip6.h>
76 #include <netinet6/ip6_var.h>
77 #include <netinet6/in6_pcb.h>
78 #endif /* INET6 */
79
80
81 #ifdef IPSEC
82 #include <netipsec/ipsec.h>
83 #include <netipsec/key.h>
84 #endif /* IPSEC */
85
86 #include <security/mac/mac_framework.h>
87
88 /*
89 * These configure the range of local port addresses assigned to
90 * "unspecified" outgoing connections/packets/whatever.
91 */
92 int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */
93 int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */
94 int ipport_firstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
95 int ipport_lastauto = IPPORT_HILASTAUTO; /* 65535 */
96 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */
97 int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */
98
/*
 * Inclusive bounds of the port range that only privileged (root)
 * processes may bind.  Changing these has significant security
 * implications, although the security benefits can be great.  Please be
 * careful.
 */
int	ipport_reservedhigh = IPPORT_RESERVED - 1;	/* 1023 */
int	ipport_reservedlow = 0;
106
/*
 * State for randomised ephemeral port allocation.  The first three are
 * administrator tunables exported through sysctl; ipport_stoprandom is
 * toggled by ipport_tick().
 */
int	ipport_randomized = 1;	/* user controlled via sysctl */
int	ipport_randomcps = 10;	/* user controlled via sysctl */
int	ipport_randomtime = 45;	/* user controlled via sysctl */
int	ipport_stoprandom = 0;	/* toggled by ipport_tick */
int	ipport_tcpallocs;
int	ipport_tcplastcount;
114
115 #define RANGECHK(var, min, max) \
116 if ((var) < (min)) { (var) = (min); } \
117 else if ((var) > (max)) { (var) = (max); }
118
119 static int
120 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
121 {
122 int error;
123
124 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
125 if (error == 0) {
126 RANGECHK(ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
127 RANGECHK(ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
128 RANGECHK(ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
129 RANGECHK(ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
130 RANGECHK(ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
131 RANGECHK(ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
132 }
133 return (error);
134 }
135
136 #undef RANGECHK
137
138 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports");
139
140 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLTYPE_INT|CTLFLAG_RW,
141 &ipport_lowfirstauto, 0, &sysctl_net_ipport_check, "I", "");
142 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLTYPE_INT|CTLFLAG_RW,
143 &ipport_lowlastauto, 0, &sysctl_net_ipport_check, "I", "");
144 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLTYPE_INT|CTLFLAG_RW,
145 &ipport_firstauto, 0, &sysctl_net_ipport_check, "I", "");
146 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLTYPE_INT|CTLFLAG_RW,
147 &ipport_lastauto, 0, &sysctl_net_ipport_check, "I", "");
148 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLTYPE_INT|CTLFLAG_RW,
149 &ipport_hifirstauto, 0, &sysctl_net_ipport_check, "I", "");
150 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLTYPE_INT|CTLFLAG_RW,
151 &ipport_hilastauto, 0, &sysctl_net_ipport_check, "I", "");
152 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
153 CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedhigh, 0, "");
154 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
155 CTLFLAG_RW|CTLFLAG_SECURE, &ipport_reservedlow, 0, "");
156 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_RW,
157 &ipport_randomized, 0, "Enable random port allocation");
158 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, CTLFLAG_RW,
159 &ipport_randomcps, 0, "Maximum number of random port "
160 "allocations before switching to a sequental one");
161 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, CTLFLAG_RW,
162 &ipport_randomtime, 0, "Minimum time to keep sequental port "
163 "allocation before switching to a random one");
164
165 /*
166 * in_pcb.c: manage the Protocol Control Blocks.
167 *
168 * NOTE: It is assumed that most of these functions will be called with
169 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
170 * functions often modify hash chains or addresses in pcbs.
171 */
172
173 /*
174 * Allocate a PCB and associate it with the socket.
175 * On success return with the PCB locked.
176 */
177 int
178 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
179 {
180 struct inpcb *inp;
181 int error;
182
183 INP_INFO_WLOCK_ASSERT(pcbinfo);
184 error = 0;
185 inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
186 if (inp == NULL)
187 return (ENOBUFS);
188 bzero(inp, inp_zero_size);
189 inp->inp_pcbinfo = pcbinfo;
190 inp->inp_socket = so;
191 inp->inp_cred = crhold(so->so_cred);
192 inp->inp_inc.inc_fibnum = so->so_fibnum;
193 #ifdef MAC
194 error = mac_init_inpcb(inp, M_NOWAIT);
195 if (error != 0)
196 goto out;
197 SOCK_LOCK(so);
198 mac_create_inpcb_from_socket(so, inp);
199 SOCK_UNLOCK(so);
200 #endif
201 #ifdef IPSEC
202 error = ipsec_init_policy(so, &inp->inp_sp);
203 if (error != 0) {
204 #ifdef MAC
205 mac_destroy_inpcb(inp);
206 #endif
207 goto out;
208 }
209 #endif /*IPSEC*/
210 #ifdef INET6
211 if (INP_SOCKAF(so) == AF_INET6) {
212 inp->inp_vflag |= INP_IPV6PROTO;
213 if (ip6_v6only)
214 inp->inp_flags |= IN6P_IPV6_V6ONLY;
215 }
216 #endif
217 LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
218 pcbinfo->ipi_count++;
219 so->so_pcb = (caddr_t)inp;
220 #ifdef INET6
221 if (ip6_auto_flowlabel)
222 inp->inp_flags |= IN6P_AUTOFLOWLABEL;
223 #endif
224 INP_WLOCK(inp);
225 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
226 #if defined(IPSEC) || defined(MAC)
227 out:
228 if (error != 0) {
229 crfree(inp->inp_cred);
230 uma_zfree(pcbinfo->ipi_zone, inp);
231 }
232 #endif
233 return (error);
234 }
235
236 int
237 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
238 {
239 int anonport, error;
240
241 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
242 INP_WLOCK_ASSERT(inp);
243
244 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
245 return (EINVAL);
246 anonport = inp->inp_lport == 0 && (nam == NULL ||
247 ((struct sockaddr_in *)nam)->sin_port == 0);
248 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
249 &inp->inp_lport, cred);
250 if (error)
251 return (error);
252 if (in_pcbinshash(inp) != 0) {
253 inp->inp_laddr.s_addr = INADDR_ANY;
254 inp->inp_lport = 0;
255 return (EAGAIN);
256 }
257 if (anonport)
258 inp->inp_flags |= INP_ANONPORT;
259 return (0);
260 }
261
#if defined(INET) || defined(INET6)
/*
 * Select an unused local port for inp.
 *
 * The candidate range depends on INP_HIGHPORT / INP_LOWPORT in inp_flags
 * (the low range additionally requires PRIV_NETINET_RESERVEDPORT).  Ports
 * are tried one after another, starting either after the pcbinfo's
 * last-used port for that range or at a random position, until the
 * local lookup reports no conflicting PCB.
 *
 * On success 0 is returned and the port is stored through lportp in
 * network byte order.  For a pure IPv4 PCB, *laddrp is read as the
 * address to look up and written back unchanged.  Returns the
 * priv_check_cred() error for a refused low-port request, or
 * EADDRNOTAVAIL when every port in the range is in use.
 */
int
in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
    struct ucred *cred, int wild)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcb *conflict;
	unsigned short *cursor;
	u_short first, last, lport, swap;
	int dorandom, error, remaining;
#ifdef INET
	struct in_addr laddr;
#endif

	/*
	 * Either lock mode on the pcbinfo and inpcb is accepted here.
	 * NOTE(review): the per-range last-port cursor and
	 * ipport_tcpallocs are nevertheless written below.
	 */
	INP_INFO_LOCK_ASSERT(pcbinfo);
	INP_LOCK_ASSERT(inp);

	/* Choose the range and its "last port used" cursor. */
	if (inp->inp_flags & INP_HIGHPORT) {
		first = ipport_hifirstauto;	/* sysctl */
		last = ipport_hilastauto;
		cursor = &pcbinfo->ipi_lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
		if (error)
			return (error);
		first = ipport_lowfirstauto;	/* 1023 */
		last = ipport_lowlastauto;	/* 600 */
		cursor = &pcbinfo->ipi_lastlow;
	} else {
		first = ipport_firstauto;	/* sysctl */
		last = ipport_lastauto;
		cursor = &pcbinfo->ipi_lastport;
	}

	/*
	 * UDP uses random allocation whenever the administrator allows
	 * it; everything else additionally needs ipport_tick() not to
	 * have switched randomisation off.  A single-port range leaves
	 * nothing to randomise.
	 */
	dorandom = (ipport_randomized &&
	    (!ipport_stoprandom || pcbinfo == &udbinfo)) ? 1 : 0;
	if (first == last)
		dorandom = 0;

	/* UDP allocations are not counted. */
	if (pcbinfo != &udbinfo)
		ipport_tcpallocs++;

	/*
	 * Normalise to first <= last so that one upward-counting loop
	 * serves both ascending and descending range configurations.
	 */
	if (first > last) {
		swap = first;
		first = last;
		last = swap;
	}

#ifdef INET
	/* Make the compiler happy. */
	laddr.s_addr = 0;
	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
		KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p",
		    __func__, inp));
		laddr = *laddrp;
	}
#endif
	lport = *lportp;

	if (dorandom)
		*cursor = first + (arc4random() % (last - first));

	remaining = last - first;

	for (;;) {
		if (remaining-- < 0)	/* completely used? */
			return (EADDRNOTAVAIL);

		/* Advance the cursor, wrapping back into the range. */
		++*cursor;
		if (*cursor < first || *cursor > last)
			*cursor = first;
		lport = htons(*cursor);

#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			conflict = in6_pcblookup_local(pcbinfo,
			    &inp->in6p_laddr, lport, wild, cred);
#endif
#if defined(INET) && defined(INET6)
		else
#endif
#ifdef INET
			conflict = in_pcblookup_local(pcbinfo, laddr,
			    lport, wild, cred);
#endif
		if (conflict == NULL)
			break;
	}

#ifdef INET
	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4)
		laddrp->s_addr = laddr.s_addr;
#endif
	*lportp = lport;

	return (0);
}
#endif /* INET || INET6 */
379
380 /*
381 * Set up a bind operation on a PCB, performing port allocation
382 * as required, but do not actually modify the PCB. Callers can
383 * either complete the bind by setting inp_laddr/inp_lport and
384 * calling in_pcbinshash(), or they can just use the resulting
385 * port and address to authorise the sending of a once-off packet.
386 *
387 * On error, the values of *laddrp and *lportp are not changed.
388 */
389 int
390 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
391 u_short *lportp, struct ucred *cred)
392 {
393 struct socket *so = inp->inp_socket;
394 struct sockaddr_in *sin;
395 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
396 struct in_addr laddr;
397 u_short lport = 0;
398 int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
399 int error;
400
401 /*
402 * Because no actual state changes occur here, a global write lock on
403 * the pcbinfo isn't required.
404 */
405 INP_INFO_LOCK_ASSERT(pcbinfo);
406 INP_LOCK_ASSERT(inp);
407
408 if (TAILQ_EMPTY(&in_ifaddrhead)) /* XXX broken! */
409 return (EADDRNOTAVAIL);
410 laddr.s_addr = *laddrp;
411 if (nam != NULL && laddr.s_addr != INADDR_ANY)
412 return (EINVAL);
413 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
414 wild = INPLOOKUP_WILDCARD;
415 if (nam == NULL) {
416 if ((error = prison_local_ip4(cred, &laddr)) != 0)
417 return (error);
418 } else {
419 sin = (struct sockaddr_in *)nam;
420 if (nam->sa_len != sizeof (*sin))
421 return (EINVAL);
422 #ifdef notdef
423 /*
424 * We should check the family, but old programs
425 * incorrectly fail to initialize it.
426 */
427 if (sin->sin_family != AF_INET)
428 return (EAFNOSUPPORT);
429 #endif
430 error = prison_local_ip4(cred, &sin->sin_addr);
431 if (error)
432 return (error);
433 if (sin->sin_port != *lportp) {
434 /* Don't allow the port to change. */
435 if (*lportp != 0)
436 return (EINVAL);
437 lport = sin->sin_port;
438 }
439 /* NB: lport is left as 0 if the port isn't being changed. */
440 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
441 /*
442 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
443 * allow complete duplication of binding if
444 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
445 * and a multicast address is bound on both
446 * new and duplicated sockets.
447 */
448 if (so->so_options & SO_REUSEADDR)
449 reuseport = SO_REUSEADDR|SO_REUSEPORT;
450 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
451 sin->sin_port = 0; /* yech... */
452 bzero(&sin->sin_zero, sizeof(sin->sin_zero));
453 if (ifa_ifwithaddr((struct sockaddr *)sin) == 0)
454 return (EADDRNOTAVAIL);
455 }
456 laddr = sin->sin_addr;
457 if (lport) {
458 struct inpcb *t;
459 struct tcptw *tw;
460
461 /* GROSS */
462 if (ntohs(lport) <= ipport_reservedhigh &&
463 ntohs(lport) >= ipport_reservedlow &&
464 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
465 0))
466 return (EACCES);
467 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
468 priv_check_cred(inp->inp_cred,
469 PRIV_NETINET_REUSEPORT, 0) != 0) {
470 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
471 lport, INPLOOKUP_WILDCARD, cred);
472 /*
473 * XXX
474 * This entire block sorely needs a rewrite.
475 */
476 if (t &&
477 ((t->inp_flags & INP_TIMEWAIT) == 0) &&
478 (so->so_type != SOCK_STREAM ||
479 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
480 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
481 ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
482 (t->inp_socket->so_options &
483 SO_REUSEPORT) == 0) &&
484 (inp->inp_cred->cr_uid !=
485 t->inp_cred->cr_uid))
486 return (EADDRINUSE);
487 }
488 t = in_pcblookup_local(pcbinfo, sin->sin_addr,
489 lport, wild, cred);
490 if (t && (t->inp_flags & INP_TIMEWAIT)) {
491 /*
492 * XXXRW: If an incpb has had its timewait
493 * state recycled, we treat the address as
494 * being in use (for now). This is better
495 * than a panic, but not desirable.
496 */
497 tw = intotw(inp);
498 if (tw == NULL ||
499 (reuseport & tw->tw_so_options) == 0)
500 return (EADDRINUSE);
501 } else if (t &&
502 (reuseport & t->inp_socket->so_options) == 0) {
503 #ifdef INET6
504 if (ntohl(sin->sin_addr.s_addr) !=
505 INADDR_ANY ||
506 ntohl(t->inp_laddr.s_addr) !=
507 INADDR_ANY ||
508 INP_SOCKAF(so) ==
509 INP_SOCKAF(t->inp_socket))
510 #endif
511 return (EADDRINUSE);
512 }
513 }
514 }
515 if (*lportp != 0)
516 lport = *lportp;
517 if (lport == 0) {
518 error = in_pcb_lport(inp, &laddr, &lport, cred, wild);
519 if (error != 0)
520 return (error);
521 }
522 *laddrp = laddr.s_addr;
523 *lportp = lport;
524 return (0);
525 }
526
527 /*
528 * Connect from a socket to a specified address.
529 * Both address and port must be specified in argument sin.
530 * If don't have a local address for this socket yet,
531 * then pick one.
532 */
533 int
534 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
535 {
536 u_short lport, fport;
537 in_addr_t laddr, faddr;
538 int anonport, error;
539
540 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
541 INP_WLOCK_ASSERT(inp);
542
543 lport = inp->inp_lport;
544 laddr = inp->inp_laddr.s_addr;
545 anonport = (lport == 0);
546 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
547 NULL, cred);
548 if (error)
549 return (error);
550
551 /* Do the initial binding of the local address if required. */
552 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
553 inp->inp_lport = lport;
554 inp->inp_laddr.s_addr = laddr;
555 if (in_pcbinshash(inp) != 0) {
556 inp->inp_laddr.s_addr = INADDR_ANY;
557 inp->inp_lport = 0;
558 return (EAGAIN);
559 }
560 }
561
562 /* Commit the remaining changes. */
563 inp->inp_lport = lport;
564 inp->inp_laddr.s_addr = laddr;
565 inp->inp_faddr.s_addr = faddr;
566 inp->inp_fport = fport;
567 in_pcbrehash(inp);
568
569 if (anonport)
570 inp->inp_flags |= INP_ANONPORT;
571 return (0);
572 }
573
574 /*
575 * Do proper source address selection on an unbound socket in case
576 * of connect. Take jails into account as well.
577 */
578 static int
579 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
580 struct ucred *cred)
581 {
582 struct in_ifaddr *ia;
583 struct ifaddr *ifa;
584 struct sockaddr *sa;
585 struct sockaddr_in *sin;
586 struct route sro;
587 int error;
588
589 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
590
591 /*
592 * Bypass source address selection and use the primary jail IP
593 * if requested.
594 */
595 if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
596 return (0);
597
598 error = 0;
599 ia = NULL;
600 bzero(&sro, sizeof(sro));
601
602 sin = (struct sockaddr_in *)&sro.ro_dst;
603 sin->sin_family = AF_INET;
604 sin->sin_len = sizeof(struct sockaddr_in);
605 sin->sin_addr.s_addr = faddr->s_addr;
606
607 /*
608 * If route is known our src addr is taken from the i/f,
609 * else punt.
610 *
611 * Find out route to destination.
612 */
613 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
614 in_rtalloc_ign(&sro, RTF_CLONING, inp->inp_inc.inc_fibnum);
615
616 /*
617 * If we found a route, use the address corresponding to
618 * the outgoing interface.
619 *
620 * Otherwise assume faddr is reachable on a directly connected
621 * network and try to find a corresponding interface to take
622 * the source address from.
623 */
624 if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
625 struct ifnet *ifp;
626
627 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin));
628 if (ia == NULL)
629 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin));
630 if (ia == NULL) {
631 error = ENETUNREACH;
632 goto done;
633 }
634
635 if (cred == NULL || !jailed(cred)) {
636 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
637 goto done;
638 }
639
640 ifp = ia->ia_ifp;
641 ia = NULL;
642 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
643
644 sa = ifa->ifa_addr;
645 if (sa->sa_family != AF_INET)
646 continue;
647 sin = (struct sockaddr_in *)sa;
648 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
649 ia = (struct in_ifaddr *)ifa;
650 break;
651 }
652 }
653 if (ia != NULL) {
654 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
655 goto done;
656 }
657
658 /* 3. As a last resort return the 'default' jail address. */
659 error = prison_get_ip4(cred, laddr);
660 goto done;
661 }
662
663 /*
664 * If the outgoing interface on the route found is not
665 * a loopback interface, use the address from that interface.
666 * In case of jails do those three steps:
667 * 1. check if the interface address belongs to the jail. If so use it.
668 * 2. check if we have any address on the outgoing interface
669 * belonging to this jail. If so use it.
670 * 3. as a last resort return the 'default' jail address.
671 */
672 if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
673
674 /* If not jailed, use the default returned. */
675 if (cred == NULL || !jailed(cred)) {
676 ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
677 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
678 goto done;
679 }
680
681 /* Jailed. */
682 /* 1. Check if the iface address belongs to the jail. */
683 sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
684 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
685 ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
686 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
687 goto done;
688 }
689
690 /*
691 * 2. Check if we have any address on the outgoing interface
692 * belonging to this jail.
693 */
694 TAILQ_FOREACH(ifa, &sro.ro_rt->rt_ifp->if_addrhead, ifa_link) {
695
696 sa = ifa->ifa_addr;
697 if (sa->sa_family != AF_INET)
698 continue;
699 sin = (struct sockaddr_in *)sa;
700 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
701 ia = (struct in_ifaddr *)ifa;
702 break;
703 }
704 }
705 if (ia != NULL) {
706 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
707 goto done;
708 }
709
710 /* 3. As a last resort return the 'default' jail address. */
711 error = prison_get_ip4(cred, laddr);
712 goto done;
713 }
714
715 /*
716 * The outgoing interface is marked with 'loopback net', so a route
717 * to ourselves is here.
718 * Try to find the interface of the destination address and then
719 * take the address from there. That interface is not necessarily
720 * a loopback interface.
721 * In case of jails, check that it is an address of the jail
722 * and if we cannot find, fall back to the 'default' jail address.
723 */
724 if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
725 struct sockaddr_in sain;
726
727 bzero(&sain, sizeof(struct sockaddr_in));
728 sain.sin_family = AF_INET;
729 sain.sin_len = sizeof(struct sockaddr_in);
730 sain.sin_addr.s_addr = faddr->s_addr;
731
732 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain)));
733 if (ia == NULL)
734 ia = ifatoia(ifa_ifwithnet(sintosa(&sain)));
735
736 if (cred == NULL || !jailed(cred)) {
737 #if __FreeBSD_version < 800000
738 if (ia == NULL)
739 ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
740 #endif
741 if (ia == NULL) {
742 error = ENETUNREACH;
743 goto done;
744 }
745 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
746 goto done;
747 }
748
749 /* Jailed. */
750 if (ia != NULL) {
751 struct ifnet *ifp;
752
753 ifp = ia->ia_ifp;
754 ia = NULL;
755 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
756
757 sa = ifa->ifa_addr;
758 if (sa->sa_family != AF_INET)
759 continue;
760 sin = (struct sockaddr_in *)sa;
761 if (prison_check_ip4(cred,
762 &sin->sin_addr) == 0) {
763 ia = (struct in_ifaddr *)ifa;
764 break;
765 }
766 }
767 if (ia != NULL) {
768 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
769 goto done;
770 }
771 }
772
773 /* 3. As a last resort return the 'default' jail address. */
774 error = prison_get_ip4(cred, laddr);
775 goto done;
776 }
777
778 done:
779 if (sro.ro_rt != NULL)
780 RTFREE(sro.ro_rt);
781 return (error);
782 }
783
784 /*
785 * Set up for a connect from a socket to the specified address.
786 * On entry, *laddrp and *lportp should contain the current local
787 * address and port for the PCB; these are updated to the values
788 * that should be placed in inp_laddr and inp_lport to complete
789 * the connect.
790 *
791 * On success, *faddrp and *fportp will be set to the remote address
792 * and port. These are not updated in the error case.
793 *
794 * If the operation fails because the connection already exists,
795 * *oinpp will be set to the PCB of that connection so that the
796 * caller can decide to override it. In all other cases, *oinpp
797 * is set to NULL.
798 */
799 int
800 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
801 in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
802 struct inpcb **oinpp, struct ucred *cred)
803 {
804 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
805 struct in_ifaddr *ia;
806 struct inpcb *oinp;
807 struct in_addr laddr, faddr;
808 u_short lport, fport;
809 int error;
810
811 /*
812 * Because a global state change doesn't actually occur here, a read
813 * lock is sufficient.
814 */
815 INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo);
816 INP_LOCK_ASSERT(inp);
817
818 if (oinpp != NULL)
819 *oinpp = NULL;
820 if (nam->sa_len != sizeof (*sin))
821 return (EINVAL);
822 if (sin->sin_family != AF_INET)
823 return (EAFNOSUPPORT);
824 if (sin->sin_port == 0)
825 return (EADDRNOTAVAIL);
826 laddr.s_addr = *laddrp;
827 lport = *lportp;
828 faddr = sin->sin_addr;
829 fport = sin->sin_port;
830
831 if (!TAILQ_EMPTY(&in_ifaddrhead)) {
832 /*
833 * If the destination address is INADDR_ANY,
834 * use the primary local address.
835 * If the supplied address is INADDR_BROADCAST,
836 * and the primary interface supports broadcast,
837 * choose the broadcast address for that interface.
838 */
839 if (faddr.s_addr == INADDR_ANY) {
840 faddr =
841 IA_SIN(TAILQ_FIRST(&in_ifaddrhead))->sin_addr;
842 if (cred != NULL &&
843 (error = prison_get_ip4(cred, &faddr)) != 0)
844 return (error);
845 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST &&
846 (TAILQ_FIRST(&in_ifaddrhead)->ia_ifp->if_flags &
847 IFF_BROADCAST))
848 faddr = satosin(&TAILQ_FIRST(
849 &in_ifaddrhead)->ia_broadaddr)->sin_addr;
850 }
851 if (laddr.s_addr == INADDR_ANY) {
852 error = in_pcbladdr(inp, &faddr, &laddr, cred);
853 if (error)
854 return (error);
855
856 /*
857 * If the destination address is multicast and an outgoing
858 * interface has been set as a multicast option, use the
859 * address of that interface as our source address.
860 */
861 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
862 inp->inp_moptions != NULL) {
863 struct ip_moptions *imo;
864 struct ifnet *ifp;
865
866 imo = inp->inp_moptions;
867 if (imo->imo_multicast_ifp != NULL) {
868 ifp = imo->imo_multicast_ifp;
869 TAILQ_FOREACH(ia, &in_ifaddrhead, ia_link)
870 if (ia->ia_ifp == ifp)
871 break;
872 if (ia == NULL)
873 return (EADDRNOTAVAIL);
874 laddr = ia->ia_addr.sin_addr;
875 }
876 }
877 }
878
879 oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport,
880 0, NULL);
881 if (oinp != NULL) {
882 if (oinpp != NULL)
883 *oinpp = oinp;
884 return (EADDRINUSE);
885 }
886 if (lport == 0) {
887 error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
888 cred);
889 if (error)
890 return (error);
891 }
892 *laddrp = laddr.s_addr;
893 *lportp = lport;
894 *faddrp = faddr.s_addr;
895 *fportp = fport;
896 return (0);
897 }
898
899 void
900 in_pcbdisconnect(struct inpcb *inp)
901 {
902
903 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
904 INP_WLOCK_ASSERT(inp);
905
906 inp->inp_faddr.s_addr = INADDR_ANY;
907 inp->inp_fport = 0;
908 in_pcbrehash(inp);
909 }
910
911 /*
912 * Historically, in_pcbdetach() included the functionality now found in
913 * in_pcbfree() and in_pcbdrop(). They are now broken out to reflect the
914 * more complex life cycle of TCP.
915 *
916 * in_pcbdetach() is responsibe for disconnecting the socket from an inpcb.
917 * For most protocols, this will be invoked immediately prior to calling
918 * in_pcbfree(). However, for TCP the inpcb may significantly outlive the
919 * socket, in which case in_pcbfree() may be deferred.
920 */
921 void
922 in_pcbdetach(struct inpcb *inp)
923 {
924
925 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
926
927 inp->inp_socket->so_pcb = NULL;
928 inp->inp_socket = NULL;
929 }
930
931 /*
932 * in_pcbfree() is responsible for freeing an already-detached inpcb, as well
933 * as removing it from any global inpcb lists it might be on.
934 */
935 void
936 in_pcbfree(struct inpcb *inp)
937 {
938 struct inpcbinfo *ipi = inp->inp_pcbinfo;
939
940 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
941
942 INP_INFO_WLOCK_ASSERT(ipi);
943 INP_WLOCK_ASSERT(inp);
944
945 #ifdef IPSEC
946 if (inp->inp_sp != NULL)
947 ipsec_delete_pcbpolicy(inp);
948 #endif /* IPSEC */
949 inp->inp_gencnt = ++ipi->ipi_gencnt;
950 in_pcbremlists(inp);
951 #ifdef INET6
952 if (inp->inp_vflag & INP_IPV6PROTO) {
953 ip6_freepcbopts(inp->in6p_outputopts);
954 ip6_freemoptions(inp->in6p_moptions);
955 }
956 #endif
957 if (inp->inp_options)
958 (void)m_free(inp->inp_options);
959 if (inp->inp_moptions != NULL)
960 inp_freemoptions(inp->inp_moptions);
961 inp->inp_vflag = 0;
962 crfree(inp->inp_cred);
963
964 #ifdef MAC
965 mac_destroy_inpcb(inp);
966 #endif
967 INP_WUNLOCK(inp);
968 uma_zfree(ipi->ipi_zone, inp);
969 }
970
971 /*
972 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
973 * port reservation, and preventing it from being returned by inpcb lookups.
974 *
975 * It is used by TCP to mark an inpcb as unused and avoid future packet
976 * delivery or event notification when a socket remains open but TCP has
977 * closed. This might occur as a result of a shutdown()-initiated TCP close
978 * or a RST on the wire, and allows the port binding to be reused while still
979 * maintaining the invariant that so_pcb always points to a valid inpcb until
980 * in_pcbdetach().
981 *
982 * XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash
983 * lists, but can lead to confusing netstat output, as open sockets with
984 * closed TCP connections will no longer appear to have their bound port
985 * number. An explicit flag would be better, as it would allow us to leave
986 * the port number intact after the connection is dropped.
987 *
988 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
989 * in_pcbnotifyall() and in_pcbpurgeif0()?
990 */
991 void
992 in_pcbdrop(struct inpcb *inp)
993 {
994
995 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo);
996 INP_WLOCK_ASSERT(inp);
997
998 inp->inp_flags |= INP_DROPPED;
999 if (inp->inp_flags & INP_INHASHLIST) {
1000 struct inpcbport *phd = inp->inp_phd;
1001
1002 LIST_REMOVE(inp, inp_hash);
1003 LIST_REMOVE(inp, inp_portlist);
1004 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1005 LIST_REMOVE(phd, phd_hash);
1006 free(phd, M_PCB);
1007 }
1008 inp->inp_flags &= ~INP_INHASHLIST;
1009 }
1010 }
1011
1012 /*
1013 * Common routines to return the socket addresses associated with inpcbs.
1014 */
1015 struct sockaddr *
1016 in_sockaddr(in_port_t port, struct in_addr *addr_p)
1017 {
1018 struct sockaddr_in *sin;
1019
1020 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME,
1021 M_WAITOK | M_ZERO);
1022 sin->sin_family = AF_INET;
1023 sin->sin_len = sizeof(*sin);
1024 sin->sin_addr = *addr_p;
1025 sin->sin_port = port;
1026
1027 return (struct sockaddr *)sin;
1028 }
1029
1030 int
1031 in_getsockaddr(struct socket *so, struct sockaddr **nam)
1032 {
1033 struct inpcb *inp;
1034 struct in_addr addr;
1035 in_port_t port;
1036
1037 inp = sotoinpcb(so);
1038 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1039
1040 INP_RLOCK(inp);
1041 port = inp->inp_lport;
1042 addr = inp->inp_laddr;
1043 INP_RUNLOCK(inp);
1044
1045 *nam = in_sockaddr(port, &addr);
1046 return 0;
1047 }
1048
1049 int
1050 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1051 {
1052 struct inpcb *inp;
1053 struct in_addr addr;
1054 in_port_t port;
1055
1056 inp = sotoinpcb(so);
1057 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1058
1059 INP_RLOCK(inp);
1060 port = inp->inp_fport;
1061 addr = inp->inp_faddr;
1062 INP_RUNLOCK(inp);
1063
1064 *nam = in_sockaddr(port, &addr);
1065 return 0;
1066 }
1067
1068 void
1069 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
1070 struct inpcb *(*notify)(struct inpcb *, int))
1071 {
1072 struct inpcb *inp, *inp_temp;
1073
1074 INP_INFO_WLOCK(pcbinfo);
1075 LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
1076 INP_WLOCK(inp);
1077 #ifdef INET6
1078 if ((inp->inp_vflag & INP_IPV4) == 0) {
1079 INP_WUNLOCK(inp);
1080 continue;
1081 }
1082 #endif
1083 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1084 inp->inp_socket == NULL) {
1085 INP_WUNLOCK(inp);
1086 continue;
1087 }
1088 if ((*notify)(inp, errno))
1089 INP_WUNLOCK(inp);
1090 }
1091 INP_INFO_WUNLOCK(pcbinfo);
1092 }
1093
1094 void
1095 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1096 {
1097 struct inpcb *inp;
1098 struct ip_moptions *imo;
1099 int i, gap;
1100
1101 INP_INFO_RLOCK(pcbinfo);
1102 LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1103 INP_WLOCK(inp);
1104 imo = inp->inp_moptions;
1105 if ((inp->inp_vflag & INP_IPV4) &&
1106 imo != NULL) {
1107 /*
1108 * Unselect the outgoing interface if it is being
1109 * detached.
1110 */
1111 if (imo->imo_multicast_ifp == ifp)
1112 imo->imo_multicast_ifp = NULL;
1113
1114 /*
1115 * Drop multicast group membership if we joined
1116 * through the interface being detached.
1117 */
1118 for (i = 0, gap = 0; i < imo->imo_num_memberships;
1119 i++) {
1120 if (imo->imo_membership[i]->inm_ifp == ifp) {
1121 in_delmulti(imo->imo_membership[i]);
1122 gap++;
1123 } else if (gap != 0)
1124 imo->imo_membership[i - gap] =
1125 imo->imo_membership[i];
1126 }
1127 imo->imo_num_memberships -= gap;
1128 }
1129 INP_WUNLOCK(inp);
1130 }
1131 INP_INFO_RUNLOCK(pcbinfo);
1132 }
1133
1134 /*
1135 * Lookup a PCB based on the local address and port.
1136 */
1137 #define INP_LOOKUP_MAPPED_PCB_COST 3
1138 struct inpcb *
1139 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1140 u_short lport, int wild_okay, struct ucred *cred)
1141 {
1142 struct inpcb *inp;
1143 #ifdef INET6
1144 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
1145 #else
1146 int matchwild = 3;
1147 #endif
1148 int wildcard;
1149
1150 INP_INFO_LOCK_ASSERT(pcbinfo);
1151
1152 if (!wild_okay) {
1153 struct inpcbhead *head;
1154 /*
1155 * Look for an unconnected (wildcard foreign addr) PCB that
1156 * matches the local address and port we're looking for.
1157 */
1158 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1159 0, pcbinfo->ipi_hashmask)];
1160 LIST_FOREACH(inp, head, inp_hash) {
1161 #ifdef INET6
1162 /* XXX inp locking */
1163 if ((inp->inp_vflag & INP_IPV4) == 0)
1164 continue;
1165 #endif
1166 if (inp->inp_faddr.s_addr == INADDR_ANY &&
1167 inp->inp_laddr.s_addr == laddr.s_addr &&
1168 inp->inp_lport == lport) {
1169 /*
1170 * Found?
1171 */
1172 if (cred == NULL ||
1173 inp->inp_cred->cr_prison == cred->cr_prison)
1174 return (inp);
1175 }
1176 }
1177 /*
1178 * Not found.
1179 */
1180 return (NULL);
1181 } else {
1182 struct inpcbporthead *porthash;
1183 struct inpcbport *phd;
1184 struct inpcb *match = NULL;
1185 /*
1186 * Best fit PCB lookup.
1187 *
1188 * First see if this local port is in use by looking on the
1189 * port hash list.
1190 */
1191 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
1192 pcbinfo->ipi_porthashmask)];
1193 LIST_FOREACH(phd, porthash, phd_hash) {
1194 if (phd->phd_port == lport)
1195 break;
1196 }
1197 if (phd != NULL) {
1198 /*
1199 * Port is in use by one or more PCBs. Look for best
1200 * fit.
1201 */
1202 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1203 wildcard = 0;
1204 if (cred != NULL &&
1205 inp->inp_cred->cr_prison != cred->cr_prison)
1206 continue;
1207 #ifdef INET6
1208 /* XXX inp locking */
1209 if ((inp->inp_vflag & INP_IPV4) == 0)
1210 continue;
1211 /*
1212 * We never select the PCB that has
1213 * INP_IPV6 flag and is bound to :: if
1214 * we have another PCB which is bound
1215 * to 0.0.0.0. If a PCB has the
1216 * INP_IPV6 flag, then we set its cost
1217 * higher than IPv4 only PCBs.
1218 *
1219 * Note that the case only happens
1220 * when a socket is bound to ::, under
1221 * the condition that the use of the
1222 * mapped address is allowed.
1223 */
1224 if ((inp->inp_vflag & INP_IPV6) != 0)
1225 wildcard += INP_LOOKUP_MAPPED_PCB_COST;
1226 #endif
1227 if (inp->inp_faddr.s_addr != INADDR_ANY)
1228 wildcard++;
1229 if (inp->inp_laddr.s_addr != INADDR_ANY) {
1230 if (laddr.s_addr == INADDR_ANY)
1231 wildcard++;
1232 else if (inp->inp_laddr.s_addr != laddr.s_addr)
1233 continue;
1234 } else {
1235 if (laddr.s_addr != INADDR_ANY)
1236 wildcard++;
1237 }
1238 if (wildcard < matchwild) {
1239 match = inp;
1240 matchwild = wildcard;
1241 if (matchwild == 0)
1242 break;
1243 }
1244 }
1245 }
1246 return (match);
1247 }
1248 }
1249 #undef INP_LOOKUP_MAPPED_PCB_COST
1250
1251 /*
1252 * Lookup PCB in hash list.
1253 */
1254 struct inpcb *
1255 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
1256 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard,
1257 struct ifnet *ifp)
1258 {
1259 struct inpcbhead *head;
1260 struct inpcb *inp, *tmpinp;
1261 u_short fport = fport_arg, lport = lport_arg;
1262
1263 INP_INFO_LOCK_ASSERT(pcbinfo);
1264
1265 /*
1266 * First look for an exact match.
1267 */
1268 tmpinp = NULL;
1269 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
1270 pcbinfo->ipi_hashmask)];
1271 LIST_FOREACH(inp, head, inp_hash) {
1272 #ifdef INET6
1273 /* XXX inp locking */
1274 if ((inp->inp_vflag & INP_IPV4) == 0)
1275 continue;
1276 #endif
1277 if (inp->inp_faddr.s_addr == faddr.s_addr &&
1278 inp->inp_laddr.s_addr == laddr.s_addr &&
1279 inp->inp_fport == fport &&
1280 inp->inp_lport == lport) {
1281 /*
1282 * XXX We should be able to directly return
1283 * the inp here, without any checks.
1284 * Well unless both bound with SO_REUSEPORT?
1285 */
1286 if (jailed(inp->inp_cred))
1287 return (inp);
1288 if (tmpinp == NULL)
1289 tmpinp = inp;
1290 }
1291 }
1292 if (tmpinp != NULL)
1293 return (tmpinp);
1294
1295 /*
1296 * Then look for a wildcard match, if requested.
1297 */
1298 if (wildcard == INPLOOKUP_WILDCARD) {
1299 struct inpcb *local_wild = NULL, *local_exact = NULL;
1300 #ifdef INET6
1301 struct inpcb *local_wild_mapped = NULL;
1302 #endif
1303 struct inpcb *jail_wild = NULL;
1304 int injail;
1305
1306 /*
1307 * Order of socket selection - we always prefer jails.
1308 * 1. jailed, non-wild.
1309 * 2. jailed, wild.
1310 * 3. non-jailed, non-wild.
1311 * 4. non-jailed, wild.
1312 */
1313
1314 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1315 0, pcbinfo->ipi_hashmask)];
1316 LIST_FOREACH(inp, head, inp_hash) {
1317 #ifdef INET6
1318 /* XXX inp locking */
1319 if ((inp->inp_vflag & INP_IPV4) == 0)
1320 continue;
1321 #endif
1322 if (inp->inp_faddr.s_addr != INADDR_ANY ||
1323 inp->inp_lport != lport)
1324 continue;
1325
1326 /* XXX inp locking */
1327 if (ifp && ifp->if_type == IFT_FAITH &&
1328 (inp->inp_flags & INP_FAITH) == 0)
1329 continue;
1330
1331 injail = jailed(inp->inp_cred);
1332 if (injail) {
1333 if (prison_check_ip4(inp->inp_cred,
1334 &laddr) != 0)
1335 continue;
1336 } else {
1337 if (local_exact != NULL)
1338 continue;
1339 }
1340
1341 if (inp->inp_laddr.s_addr == laddr.s_addr) {
1342 if (injail)
1343 return (inp);
1344 else
1345 local_exact = inp;
1346 } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
1347 #ifdef INET6
1348 /* XXX inp locking, NULL check */
1349 if (inp->inp_vflag & INP_IPV6PROTO)
1350 local_wild_mapped = inp;
1351 else
1352 #endif /* INET6 */
1353 if (injail)
1354 jail_wild = inp;
1355 else
1356 local_wild = inp;
1357 }
1358 } /* LIST_FOREACH */
1359 if (jail_wild != NULL)
1360 return (jail_wild);
1361 if (local_exact != NULL)
1362 return (local_exact);
1363 if (local_wild != NULL)
1364 return (local_wild);
1365 #ifdef INET6
1366 if (local_wild_mapped != NULL)
1367 return (local_wild_mapped);
1368 #endif /* defined(INET6) */
1369 } /* if (wildcard == INPLOOKUP_WILDCARD) */
1370
1371 return (NULL);
1372 }
1373
1374 /*
1375 * Insert PCB onto various hash lists.
1376 */
1377 int
1378 in_pcbinshash(struct inpcb *inp)
1379 {
1380 struct inpcbhead *pcbhash;
1381 struct inpcbporthead *pcbporthash;
1382 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1383 struct inpcbport *phd;
1384 u_int32_t hashkey_faddr;
1385
1386 INP_INFO_WLOCK_ASSERT(pcbinfo);
1387 INP_WLOCK_ASSERT(inp);
1388 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
1389 ("in_pcbinshash: INP_INHASHLIST"));
1390
1391 #ifdef INET6
1392 if (inp->inp_vflag & INP_IPV6)
1393 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1394 else
1395 #endif /* INET6 */
1396 hashkey_faddr = inp->inp_faddr.s_addr;
1397
1398 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
1399 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
1400
1401 pcbporthash = &pcbinfo->ipi_porthashbase[
1402 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
1403
1404 /*
1405 * Go through port list and look for a head for this lport.
1406 */
1407 LIST_FOREACH(phd, pcbporthash, phd_hash) {
1408 if (phd->phd_port == inp->inp_lport)
1409 break;
1410 }
1411 /*
1412 * If none exists, malloc one and tack it on.
1413 */
1414 if (phd == NULL) {
1415 MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT);
1416 if (phd == NULL) {
1417 return (ENOBUFS); /* XXX */
1418 }
1419 phd->phd_port = inp->inp_lport;
1420 LIST_INIT(&phd->phd_pcblist);
1421 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
1422 }
1423 inp->inp_phd = phd;
1424 LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
1425 LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
1426 inp->inp_flags |= INP_INHASHLIST;
1427 return (0);
1428 }
1429
1430 /*
1431 * Move PCB to the proper hash bucket when { faddr, fport } have been
1432 * changed. NOTE: This does not handle the case of the lport changing (the
1433 * hashed port list would have to be updated as well), so the lport must
1434 * not change after in_pcbinshash() has been called.
1435 */
1436 void
1437 in_pcbrehash(struct inpcb *inp)
1438 {
1439 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1440 struct inpcbhead *head;
1441 u_int32_t hashkey_faddr;
1442
1443 INP_INFO_WLOCK_ASSERT(pcbinfo);
1444 INP_WLOCK_ASSERT(inp);
1445 KASSERT(inp->inp_flags & INP_INHASHLIST,
1446 ("in_pcbrehash: !INP_INHASHLIST"));
1447
1448 #ifdef INET6
1449 if (inp->inp_vflag & INP_IPV6)
1450 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */;
1451 else
1452 #endif /* INET6 */
1453 hashkey_faddr = inp->inp_faddr.s_addr;
1454
1455 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
1456 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
1457
1458 LIST_REMOVE(inp, inp_hash);
1459 LIST_INSERT_HEAD(head, inp, inp_hash);
1460 }
1461
1462 /*
1463 * Remove PCB from various lists.
1464 */
1465 void
1466 in_pcbremlists(struct inpcb *inp)
1467 {
1468 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1469
1470 INP_INFO_WLOCK_ASSERT(pcbinfo);
1471 INP_WLOCK_ASSERT(inp);
1472
1473 inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
1474 if (inp->inp_flags & INP_INHASHLIST) {
1475 struct inpcbport *phd = inp->inp_phd;
1476
1477 LIST_REMOVE(inp, inp_hash);
1478 LIST_REMOVE(inp, inp_portlist);
1479 if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
1480 LIST_REMOVE(phd, phd_hash);
1481 free(phd, M_PCB);
1482 }
1483 inp->inp_flags &= ~INP_INHASHLIST;
1484 }
1485 LIST_REMOVE(inp, inp_list);
1486 pcbinfo->ipi_count--;
1487 }
1488
/*
 * A set label operation has occurred at the socket layer, propagate the
 * label change into the in_pcb for the socket.
 *
 * With MAC compiled in, the inpcb write lock is taken first, then the socket
 * lock, and mac_inpcb_sosetlabel() is called with both held.  Without MAC
 * the function body is empty.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp = sotoinpcb(so);

	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
1509
1510 /*
1511 * ipport_tick runs once per second, determining if random port allocation
1512 * should be continued. If more than ipport_randomcps ports have been
1513 * allocated in the last second, then we return to sequential port
1514 * allocation. We return to random allocation only once we drop below
1515 * ipport_randomcps for at least ipport_randomtime seconds.
1516 */
1517 void
1518 ipport_tick(void *xtp)
1519 {
1520
1521 if (ipport_tcpallocs <= ipport_tcplastcount + ipport_randomcps) {
1522 if (ipport_stoprandom > 0)
1523 ipport_stoprandom--;
1524 } else
1525 ipport_stoprandom = ipport_randomtime;
1526 ipport_tcplastcount = ipport_tcpallocs;
1527 callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
1528 }
1529
1530 void
1531 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
1532 {
1533 struct inpcb *inp;
1534
1535 INP_INFO_RLOCK(&tcbinfo);
1536 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1537 INP_WLOCK(inp);
1538 func(inp, arg);
1539 INP_WUNLOCK(inp);
1540 }
1541 INP_INFO_RUNLOCK(&tcbinfo);
1542 }
1543
1544 struct socket *
1545 inp_inpcbtosocket(struct inpcb *inp)
1546 {
1547
1548 INP_WLOCK_ASSERT(inp);
1549 return (inp->inp_socket);
1550 }
1551
1552 struct tcpcb *
1553 inp_inpcbtotcpcb(struct inpcb *inp)
1554 {
1555
1556 INP_WLOCK_ASSERT(inp);
1557 return ((struct tcpcb *)inp->inp_ppcb);
1558 }
1559
1560 int
1561 inp_ip_tos_get(const struct inpcb *inp)
1562 {
1563
1564 return (inp->inp_ip_tos);
1565 }
1566
1567 void
1568 inp_ip_tos_set(struct inpcb *inp, int val)
1569 {
1570
1571 inp->inp_ip_tos = val;
1572 }
1573
1574 void
1575 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
1576 uint32_t *faddr, uint16_t *fp)
1577 {
1578
1579 INP_LOCK_ASSERT(inp);
1580 *laddr = inp->inp_laddr.s_addr;
1581 *faddr = inp->inp_faddr.s_addr;
1582 *lp = inp->inp_lport;
1583 *fp = inp->inp_fport;
1584 }
1585
/*
 * Function form of sotoinpcb(): return the inpcb associated with a socket.
 */
struct inpcb *
so_sotoinpcb(struct socket *so)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	return (inp);
}
1592
/*
 * Function form of sototcpcb(): return the tcpcb associated with a socket.
 */
struct tcpcb *
so_sototcpcb(struct socket *so)
{
	struct tcpcb *tp;

	tp = sototcpcb(so);
	return (tp);
}
1599
/*
 * Function wrapper around the INP_WLOCK() macro: write-lock the inpcb.
 */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}
1606
/*
 * Function wrapper around the INP_WUNLOCK() macro: drop the inpcb write
 * lock.
 */
void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}
1613
/*
 * Function wrapper around the INP_RLOCK() macro: read-lock the inpcb.
 */
void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}
1620
/*
 * Function wrapper around the INP_RUNLOCK() macro: drop the inpcb read
 * lock.
 */
void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
1627
1628 #ifdef INVARIANTS
/*
 * Function wrapper around INP_WLOCK_ASSERT().  Only compiled when
 * INVARIANTS is defined.
 */
void
inp_wlock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}
1635
/*
 * Function wrapper around INP_RLOCK_ASSERT().  Only compiled when
 * INVARIANTS is defined.
 */
void
inp_rlock_assert(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);
}
1642
/*
 * Function wrapper around INP_LOCK_ASSERT().  Only compiled when INVARIANTS
 * is defined.
 */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_LOCK_ASSERT(inp);
}
1649
/*
 * Function wrapper around INP_UNLOCK_ASSERT().  Only compiled when
 * INVARIANTS is defined.
 */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
1656
1657 #endif
1658
1659 #ifdef DDB
/*
 * Emit 'indent' single-space strings through db_printf().  Nothing is
 * printed when indent is zero or negative.
 */
static void
db_print_indent(int indent)
{

	while (indent-- > 0)
		db_printf(" ");
}
1668
1669 static void
1670 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
1671 {
1672 char faddr_str[48], laddr_str[48];
1673
1674 db_print_indent(indent);
1675 db_printf("%s at %p\n", name, inc);
1676
1677 indent += 2;
1678
1679 #ifdef INET6
1680 if (inc->inc_flags & INC_ISIPV6) {
1681 /* IPv6. */
1682 ip6_sprintf(laddr_str, &inc->inc6_laddr);
1683 ip6_sprintf(faddr_str, &inc->inc6_faddr);
1684 } else {
1685 #endif
1686 /* IPv4. */
1687 inet_ntoa_r(inc->inc_laddr, laddr_str);
1688 inet_ntoa_r(inc->inc_faddr, faddr_str);
1689 #ifdef INET6
1690 }
1691 #endif
1692 db_print_indent(indent);
1693 db_printf("inc_laddr %s inc_lport %u\n", laddr_str,
1694 ntohs(inc->inc_lport));
1695 db_print_indent(indent);
1696 db_printf("inc_faddr %s inc_fport %u\n", faddr_str,
1697 ntohs(inc->inc_fport));
1698 }
1699
1700 static void
1701 db_print_inpflags(int inp_flags)
1702 {
1703 int comma;
1704
1705 comma = 0;
1706 if (inp_flags & INP_RECVOPTS) {
1707 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
1708 comma = 1;
1709 }
1710 if (inp_flags & INP_RECVRETOPTS) {
1711 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
1712 comma = 1;
1713 }
1714 if (inp_flags & INP_RECVDSTADDR) {
1715 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
1716 comma = 1;
1717 }
1718 if (inp_flags & INP_HDRINCL) {
1719 db_printf("%sINP_HDRINCL", comma ? ", " : "");
1720 comma = 1;
1721 }
1722 if (inp_flags & INP_HIGHPORT) {
1723 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
1724 comma = 1;
1725 }
1726 if (inp_flags & INP_LOWPORT) {
1727 db_printf("%sINP_LOWPORT", comma ? ", " : "");
1728 comma = 1;
1729 }
1730 if (inp_flags & INP_ANONPORT) {
1731 db_printf("%sINP_ANONPORT", comma ? ", " : "");
1732 comma = 1;
1733 }
1734 if (inp_flags & INP_RECVIF) {
1735 db_printf("%sINP_RECVIF", comma ? ", " : "");
1736 comma = 1;
1737 }
1738 if (inp_flags & INP_MTUDISC) {
1739 db_printf("%sINP_MTUDISC", comma ? ", " : "");
1740 comma = 1;
1741 }
1742 if (inp_flags & INP_FAITH) {
1743 db_printf("%sINP_FAITH", comma ? ", " : "");
1744 comma = 1;
1745 }
1746 if (inp_flags & INP_RECVTTL) {
1747 db_printf("%sINP_RECVTTL", comma ? ", " : "");
1748 comma = 1;
1749 }
1750 if (inp_flags & INP_DONTFRAG) {
1751 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
1752 comma = 1;
1753 }
1754 if (inp_flags & IN6P_IPV6_V6ONLY) {
1755 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
1756 comma = 1;
1757 }
1758 if (inp_flags & IN6P_PKTINFO) {
1759 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
1760 comma = 1;
1761 }
1762 if (inp_flags & IN6P_HOPLIMIT) {
1763 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
1764 comma = 1;
1765 }
1766 if (inp_flags & IN6P_HOPOPTS) {
1767 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
1768 comma = 1;
1769 }
1770 if (inp_flags & IN6P_DSTOPTS) {
1771 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
1772 comma = 1;
1773 }
1774 if (inp_flags & IN6P_RTHDR) {
1775 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
1776 comma = 1;
1777 }
1778 if (inp_flags & IN6P_RTHDRDSTOPTS) {
1779 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
1780 comma = 1;
1781 }
1782 if (inp_flags & IN6P_TCLASS) {
1783 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
1784 comma = 1;
1785 }
1786 if (inp_flags & IN6P_AUTOFLOWLABEL) {
1787 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
1788 comma = 1;
1789 }
1790 if (inp_flags & INP_TIMEWAIT) {
1791 db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
1792 comma = 1;
1793 }
1794 if (inp_flags & INP_ONESBCAST) {
1795 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
1796 comma = 1;
1797 }
1798 if (inp_flags & INP_DROPPED) {
1799 db_printf("%sINP_DROPPED", comma ? ", " : "");
1800 comma = 1;
1801 }
1802 if (inp_flags & INP_SOCKREF) {
1803 db_printf("%sINP_SOCKREF", comma ? ", " : "");
1804 comma = 1;
1805 }
1806 if (inp_flags & IN6P_RFC2292) {
1807 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
1808 comma = 1;
1809 }
1810 if (inp_flags & IN6P_MTU) {
1811 db_printf("IN6P_MTU%s", comma ? ", " : "");
1812 comma = 1;
1813 }
1814 }
1815
1816 static void
1817 db_print_inpvflag(u_char inp_vflag)
1818 {
1819 int comma;
1820
1821 comma = 0;
1822 if (inp_vflag & INP_IPV4) {
1823 db_printf("%sINP_IPV4", comma ? ", " : "");
1824 comma = 1;
1825 }
1826 if (inp_vflag & INP_IPV6) {
1827 db_printf("%sINP_IPV6", comma ? ", " : "");
1828 comma = 1;
1829 }
1830 if (inp_vflag & INP_IPV6PROTO) {
1831 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
1832 comma = 1;
1833 }
1834 }
1835
1836 void
1837 db_print_inpcb(struct inpcb *inp, const char *name, int indent)
1838 {
1839
1840 db_print_indent(indent);
1841 db_printf("%s at %p\n", name, inp);
1842
1843 indent += 2;
1844
1845 db_print_indent(indent);
1846 db_printf("inp_flow: 0x%x\n", inp->inp_flow);
1847
1848 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);
1849
1850 db_print_indent(indent);
1851 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n",
1852 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);
1853
1854 db_print_indent(indent);
1855 db_printf("inp_label: %p inp_flags: 0x%x (",
1856 inp->inp_label, inp->inp_flags);
1857 db_print_inpflags(inp->inp_flags);
1858 db_printf(")\n");
1859
1860 db_print_indent(indent);
1861 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp,
1862 inp->inp_vflag);
1863 db_print_inpvflag(inp->inp_vflag);
1864 db_printf(")\n");
1865
1866 db_print_indent(indent);
1867 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n",
1868 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);
1869
1870 db_print_indent(indent);
1871 #ifdef INET6
1872 if (inp->inp_vflag & INP_IPV6) {
1873 db_printf("in6p_options: %p in6p_outputopts: %p "
1874 "in6p_moptions: %p\n", inp->in6p_options,
1875 inp->in6p_outputopts, inp->in6p_moptions);
1876 db_printf("in6p_icmp6filt: %p in6p_cksum %d "
1877 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
1878 inp->in6p_hops);
1879 } else
1880 #endif
1881 {
1882 db_printf("inp_ip_tos: %d inp_ip_options: %p "
1883 "inp_ip_moptions: %p\n", inp->inp_ip_tos,
1884 inp->inp_options, inp->inp_moptions);
1885 }
1886
1887 db_print_indent(indent);
1888 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd,
1889 (uintmax_t)inp->inp_gencnt);
1890 }
1891
1892 DB_SHOW_COMMAND(inpcb, db_show_inpcb)
1893 {
1894 struct inpcb *inp;
1895
1896 if (!have_addr) {
1897 db_printf("usage: show inpcb <addr>\n");
1898 return;
1899 }
1900 inp = (struct inpcb *)addr;
1901
1902 db_print_inpcb(inp, "inpcb", 0);
1903 }
1904 #endif
Cache object: 46a86cf5afd92a34b17d75efa7cebf3a
|