FreeBSD/Linux Kernel Cross Reference
sys/netinet/in_pcb.c
1 /* $OpenBSD: in_pcb.c,v 1.276 2022/10/03 16:43:52 bluhm Exp $ */
2 /* $NetBSD: in_pcb.c,v 1.25 1996/02/13 23:41:53 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1991, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. Neither the name of the University nor the names of its contributors
17 * may be used to endorse or promote products derived from this software
18 * without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
31 *
32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
33 *
34 * NRL grants permission for redistribution and use in source and binary
35 * forms, with or without modification, of the software and documentation
36 * created at NRL provided that the following conditions are met:
37 *
38 * 1. Redistributions of source code must retain the above copyright
39 * notice, this list of conditions and the following disclaimer.
40 * 2. Redistributions in binary form must reproduce the above copyright
41 * notice, this list of conditions and the following disclaimer in the
42 * documentation and/or other materials provided with the distribution.
43 * 3. All advertising materials mentioning features or use of this software
44 * must display the following acknowledgements:
45 * This product includes software developed by the University of
46 * California, Berkeley and its contributors.
47 * This product includes software developed at the Information
48 * Technology Division, US Naval Research Laboratory.
49 * 4. Neither the name of the NRL nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
64 *
65 * The views and conclusions contained in the software and documentation
66 * are those of the authors and should not be interpreted as representing
67 * official policies, either expressed or implied, of the US Naval
68 * Research Laboratory (NRL).
69 */
70
71 #include "pf.h"
72
73 #include <sys/param.h>
74 #include <sys/systm.h>
75 #include <sys/mbuf.h>
76 #include <sys/protosw.h>
77 #include <sys/socket.h>
78 #include <sys/socketvar.h>
79 #include <sys/domain.h>
80 #include <sys/mount.h>
81 #include <sys/pool.h>
82 #include <sys/proc.h>
83
84 #include <net/if.h>
85 #include <net/if_var.h>
86 #include <net/pfvar.h>
87 #include <net/route.h>
88
89 #include <netinet/in.h>
90 #include <netinet/in_var.h>
91 #include <netinet/ip.h>
92 #include <netinet/ip_var.h>
93 #include <netinet/in_pcb.h>
94 #ifdef IPSEC
95 #include <netinet/ip_esp.h>
96 #endif /* IPSEC */
97
98 #include "stoeplitz.h"
99 #if NSTOEPLITZ > 0
100 #include <net/toeplitz.h>
101 #endif
102
103 const struct in_addr zeroin_addr;
104
105 union {
106 struct in_addr za_in;
107 struct in6_addr za_in6;
108 } zeroin46_addr;
109
110 /*
111 * These configure the range of local port addresses assigned to
112 * "unspecified" outgoing connections/packets/whatever.
113 */
114 int ipport_firstauto = IPPORT_RESERVED;
115 int ipport_lastauto = IPPORT_USERRESERVED;
116 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO;
117 int ipport_hilastauto = IPPORT_HILASTAUTO;
118
119 struct baddynamicports baddynamicports;
120 struct baddynamicports rootonlyports;
121 struct pool inpcb_pool;
122
123 void in_pcbhash_insert(struct inpcb *);
124 struct inpcb *in_pcbhash_lookup(struct inpcbtable *, u_int,
125 const struct in_addr *, u_short, const struct in_addr *, u_short);
126 int in_pcbresize(struct inpcbtable *, int);
127
128 #define INPCBHASH_LOADFACTOR(_x) (((_x) * 3) / 4)
129
130 struct inpcbhead *in_pcbhash(struct inpcbtable *, u_int,
131 const struct in_addr *, u_short, const struct in_addr *, u_short);
132 struct inpcbhead *in_pcblhash(struct inpcbtable *, u_int, u_short);
133
134 /*
135 * in_pcb is used for inet and inet6. in6_pcb only contains special
136 * IPv6 cases. So the internet initializer is used for both domains.
137 */
138 void
139 in_init(void)
140 {
141 pool_init(&inpcb_pool, sizeof(struct inpcb), 0,
142 IPL_SOFTNET, 0, "inpcb", NULL);
143 }
144
145 struct inpcbhead *
146 in_pcbhash(struct inpcbtable *table, u_int rdomain,
147 const struct in_addr *faddr, u_short fport,
148 const struct in_addr *laddr, u_short lport)
149 {
150 SIPHASH_CTX ctx;
151 u_int32_t nrdom = htonl(rdomain);
152
153 SipHash24_Init(&ctx, &table->inpt_key);
154 SipHash24_Update(&ctx, &nrdom, sizeof(nrdom));
155 SipHash24_Update(&ctx, faddr, sizeof(*faddr));
156 SipHash24_Update(&ctx, &fport, sizeof(fport));
157 SipHash24_Update(&ctx, laddr, sizeof(*laddr));
158 SipHash24_Update(&ctx, &lport, sizeof(lport));
159
160 return (&table->inpt_hashtbl[SipHash24_End(&ctx) & table->inpt_mask]);
161 }
162
163 struct inpcbhead *
164 in_pcblhash(struct inpcbtable *table, u_int rdomain, u_short lport)
165 {
166 SIPHASH_CTX ctx;
167 u_int32_t nrdom = htonl(rdomain);
168
169 SipHash24_Init(&ctx, &table->inpt_lkey);
170 SipHash24_Update(&ctx, &nrdom, sizeof(nrdom));
171 SipHash24_Update(&ctx, &lport, sizeof(lport));
172
173 return (&table->inpt_lhashtbl[SipHash24_End(&ctx) & table->inpt_lmask]);
174 }
175
176 void
177 in_pcbinit(struct inpcbtable *table, int hashsize)
178 {
179 mtx_init(&table->inpt_mtx, IPL_SOFTNET);
180 rw_init(&table->inpt_notify, "inpnotify");
181 TAILQ_INIT(&table->inpt_queue);
182 table->inpt_hashtbl = hashinit(hashsize, M_PCB, M_WAITOK,
183 &table->inpt_mask);
184 table->inpt_lhashtbl = hashinit(hashsize, M_PCB, M_WAITOK,
185 &table->inpt_lmask);
186 table->inpt_count = 0;
187 table->inpt_size = hashsize;
188 arc4random_buf(&table->inpt_key, sizeof(table->inpt_key));
189 arc4random_buf(&table->inpt_lkey, sizeof(table->inpt_lkey));
190 }
191
192 /*
193 * Check if the specified port is invalid for dynamic allocation.
194 */
195 int
196 in_baddynamic(u_int16_t port, u_int16_t proto)
197 {
198 switch (proto) {
199 case IPPROTO_TCP:
200 return (DP_ISSET(baddynamicports.tcp, port));
201 case IPPROTO_UDP:
202 #ifdef IPSEC
203 /* Cannot preset this as it is a sysctl */
204 if (port == udpencap_port)
205 return (1);
206 #endif
207 return (DP_ISSET(baddynamicports.udp, port));
208 default:
209 return (0);
210 }
211 }
212
213 int
214 in_rootonly(u_int16_t port, u_int16_t proto)
215 {
216 switch (proto) {
217 case IPPROTO_TCP:
218 return (port < IPPORT_RESERVED ||
219 DP_ISSET(rootonlyports.tcp, port));
220 case IPPROTO_UDP:
221 return (port < IPPORT_RESERVED ||
222 DP_ISSET(rootonlyports.udp, port));
223 default:
224 return (0);
225 }
226 }
227
228 int
229 in_pcballoc(struct socket *so, struct inpcbtable *table, int wait)
230 {
231 struct inpcb *inp;
232
233 inp = pool_get(&inpcb_pool, (wait == M_WAIT ? PR_WAITOK : PR_NOWAIT) |
234 PR_ZERO);
235 if (inp == NULL)
236 return (ENOBUFS);
237 inp->inp_table = table;
238 inp->inp_socket = so;
239 refcnt_init_trace(&inp->inp_refcnt, DT_REFCNT_IDX_INPCB);
240 mtx_init(&inp->inp_mtx, IPL_SOFTNET);
241 inp->inp_seclevel[SL_AUTH] = IPSEC_AUTH_LEVEL_DEFAULT;
242 inp->inp_seclevel[SL_ESP_TRANS] = IPSEC_ESP_TRANS_LEVEL_DEFAULT;
243 inp->inp_seclevel[SL_ESP_NETWORK] = IPSEC_ESP_NETWORK_LEVEL_DEFAULT;
244 inp->inp_seclevel[SL_IPCOMP] = IPSEC_IPCOMP_LEVEL_DEFAULT;
245 inp->inp_rtableid = curproc->p_p->ps_rtableid;
246 inp->inp_hops = -1;
247 #ifdef INET6
248 /*
249 * Small change in this function to set the INP_IPV6 flag so routines
250 * outside pcb-specific routines don't need to use sotopf(), and all
251 * of its pointer chasing, later.
252 */
253 if (sotopf(so) == PF_INET6)
254 inp->inp_flags = INP_IPV6;
255 inp->inp_cksum6 = -1;
256 #endif /* INET6 */
257
258 mtx_enter(&table->inpt_mtx);
259 if (table->inpt_count++ > INPCBHASH_LOADFACTOR(table->inpt_size))
260 (void)in_pcbresize(table, table->inpt_size * 2);
261 TAILQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue);
262 in_pcbhash_insert(inp);
263 mtx_leave(&table->inpt_mtx);
264
265 so->so_pcb = inp;
266
267 return (0);
268 }
269
270 int
271 in_pcbbind(struct inpcb *inp, struct mbuf *nam, struct proc *p)
272 {
273 struct socket *so = inp->inp_socket;
274 u_int16_t lport = 0;
275 int wild = 0;
276 void *laddr = &zeroin46_addr;
277 int error;
278
279 if (inp->inp_lport)
280 return (EINVAL);
281
282 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 &&
283 ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 ||
284 (so->so_options & SO_ACCEPTCONN) == 0))
285 wild = INPLOOKUP_WILDCARD;
286
287 switch (sotopf(so)) {
288 #ifdef INET6
289 case PF_INET6:
290 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6))
291 return (EINVAL);
292 wild |= INPLOOKUP_IPV6;
293
294 if (nam) {
295 struct sockaddr_in6 *sin6;
296
297 if ((error = in6_nam2sin6(nam, &sin6)))
298 return (error);
299 if ((error = in6_pcbaddrisavail(inp, sin6, wild, p)))
300 return (error);
301 laddr = &sin6->sin6_addr;
302 lport = sin6->sin6_port;
303 }
304 break;
305 #endif
306 case PF_INET:
307 if (inp->inp_laddr.s_addr != INADDR_ANY)
308 return (EINVAL);
309
310 if (nam) {
311 struct sockaddr_in *sin;
312
313 if ((error = in_nam2sin(nam, &sin)))
314 return (error);
315 if ((error = in_pcbaddrisavail(inp, sin, wild, p)))
316 return (error);
317 laddr = &sin->sin_addr;
318 lport = sin->sin_port;
319 }
320 break;
321 default:
322 return (EINVAL);
323 }
324
325 if (lport == 0) {
326 if ((error = in_pcbpickport(&lport, laddr, wild, inp, p)))
327 return (error);
328 } else {
329 if (in_rootonly(ntohs(lport), so->so_proto->pr_protocol) &&
330 suser(p) != 0)
331 return (EACCES);
332 }
333 if (nam) {
334 switch (sotopf(so)) {
335 #ifdef INET6
336 case PF_INET6:
337 inp->inp_laddr6 = *(struct in6_addr *)laddr;
338 break;
339 #endif
340 case PF_INET:
341 inp->inp_laddr = *(struct in_addr *)laddr;
342 break;
343 }
344 }
345 inp->inp_lport = lport;
346 in_pcbrehash(inp);
347 return (0);
348 }
349
350 int
351 in_pcbaddrisavail(struct inpcb *inp, struct sockaddr_in *sin, int wild,
352 struct proc *p)
353 {
354 struct socket *so = inp->inp_socket;
355 struct inpcbtable *table = inp->inp_table;
356 u_int16_t lport = sin->sin_port;
357 int reuseport = (so->so_options & SO_REUSEPORT);
358
359 if (IN_MULTICAST(sin->sin_addr.s_addr)) {
360 /*
361 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
362 * allow complete duplication of binding if
363 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
364 * and a multicast address is bound on both
365 * new and duplicated sockets.
366 */
367 if (so->so_options & (SO_REUSEADDR|SO_REUSEPORT))
368 reuseport = SO_REUSEADDR|SO_REUSEPORT;
369 } else if (sin->sin_addr.s_addr != INADDR_ANY) {
370 /*
371 * we must check that we are binding to an address we
372 * own except when:
373 * - SO_BINDANY is set or
374 * - we are binding a UDP socket to 255.255.255.255 or
375 * - we are binding a UDP socket to one of our broadcast
376 * addresses
377 */
378 if (!ISSET(so->so_options, SO_BINDANY) &&
379 !(so->so_type == SOCK_DGRAM &&
380 sin->sin_addr.s_addr == INADDR_BROADCAST) &&
381 !(so->so_type == SOCK_DGRAM &&
382 in_broadcast(sin->sin_addr, inp->inp_rtableid))) {
383 struct ifaddr *ia;
384
385 sin->sin_port = 0;
386 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
387 ia = ifa_ifwithaddr(sintosa(sin), inp->inp_rtableid);
388 sin->sin_port = lport;
389
390 if (ia == NULL)
391 return (EADDRNOTAVAIL);
392 }
393 }
394 if (lport) {
395 struct inpcb *t;
396 int error = 0;
397
398 if (so->so_euid && !IN_MULTICAST(sin->sin_addr.s_addr)) {
399 t = in_pcblookup_local(table, &sin->sin_addr, lport,
400 INPLOOKUP_WILDCARD, inp->inp_rtableid);
401 if (t && (so->so_euid != t->inp_socket->so_euid))
402 error = EADDRINUSE;
403 in_pcbunref(t);
404 if (error)
405 return (error);
406 }
407 t = in_pcblookup_local(table, &sin->sin_addr, lport,
408 wild, inp->inp_rtableid);
409 if (t && (reuseport & t->inp_socket->so_options) == 0)
410 error = EADDRINUSE;
411 in_pcbunref(t);
412 if (error)
413 return (error);
414 }
415
416 return (0);
417 }
418
419 int
420 in_pcbpickport(u_int16_t *lport, void *laddr, int wild, struct inpcb *inp,
421 struct proc *p)
422 {
423 struct socket *so = inp->inp_socket;
424 struct inpcbtable *table = inp->inp_table;
425 struct inpcb *t;
426 u_int16_t first, last, lower, higher, candidate, localport;
427 int count;
428
429 if (inp->inp_flags & INP_HIGHPORT) {
430 first = ipport_hifirstauto; /* sysctl */
431 last = ipport_hilastauto;
432 } else if (inp->inp_flags & INP_LOWPORT) {
433 if (suser(p))
434 return (EACCES);
435 first = IPPORT_RESERVED-1; /* 1023 */
436 last = 600; /* not IPPORT_RESERVED/2 */
437 } else {
438 first = ipport_firstauto; /* sysctl */
439 last = ipport_lastauto;
440 }
441 if (first < last) {
442 lower = first;
443 higher = last;
444 } else {
445 lower = last;
446 higher = first;
447 }
448
449 /*
450 * Simple check to ensure all ports are not used up causing
451 * a deadlock here.
452 */
453
454 count = higher - lower;
455 candidate = lower + arc4random_uniform(count);
456
457 t = NULL;
458 do {
459 in_pcbunref(t);
460 do {
461 if (count-- < 0) /* completely used? */
462 return (EADDRNOTAVAIL);
463 ++candidate;
464 if (candidate < lower || candidate > higher)
465 candidate = lower;
466 localport = htons(candidate);
467 } while (in_baddynamic(candidate, so->so_proto->pr_protocol));
468 t = in_pcblookup_local(table, laddr, localport, wild,
469 inp->inp_rtableid);
470 } while (t != NULL);
471 *lport = localport;
472
473 return (0);
474 }
475
476 /*
477 * Connect from a socket to a specified address.
478 * Both address and port must be specified in argument sin.
479 * If don't have a local address for this socket yet,
480 * then pick one.
481 */
482 int
483 in_pcbconnect(struct inpcb *inp, struct mbuf *nam)
484 {
485 struct in_addr ina;
486 struct sockaddr_in *sin;
487 struct inpcb *t;
488 int error;
489
490 #ifdef INET6
491 if (sotopf(inp->inp_socket) == PF_INET6)
492 return (in6_pcbconnect(inp, nam));
493 KASSERT((inp->inp_flags & INP_IPV6) == 0);
494 #endif /* INET6 */
495
496 if ((error = in_nam2sin(nam, &sin)))
497 return (error);
498 if (sin->sin_port == 0)
499 return (EADDRNOTAVAIL);
500 error = in_pcbselsrc(&ina, sin, inp);
501 if (error)
502 return (error);
503
504 t = in_pcblookup(inp->inp_table, sin->sin_addr, sin->sin_port,
505 ina, inp->inp_lport, inp->inp_rtableid);
506 if (t != NULL) {
507 in_pcbunref(t);
508 return (EADDRINUSE);
509 }
510
511 KASSERT(inp->inp_laddr.s_addr == INADDR_ANY || inp->inp_lport);
512
513 if (inp->inp_laddr.s_addr == INADDR_ANY) {
514 if (inp->inp_lport == 0) {
515 error = in_pcbbind(inp, NULL, curproc);
516 if (error)
517 return (error);
518 t = in_pcblookup(inp->inp_table, sin->sin_addr,
519 sin->sin_port, ina, inp->inp_lport,
520 inp->inp_rtableid);
521 if (t != NULL) {
522 inp->inp_lport = 0;
523 in_pcbunref(t);
524 return (EADDRINUSE);
525 }
526 }
527 inp->inp_laddr = ina;
528 }
529 inp->inp_faddr = sin->sin_addr;
530 inp->inp_fport = sin->sin_port;
531 in_pcbrehash(inp);
532 #if NSTOEPLITZ > 0
533 inp->inp_flowid = stoeplitz_ip4port(inp->inp_faddr.s_addr,
534 inp->inp_laddr.s_addr, inp->inp_fport, inp->inp_lport);
535 #endif
536 return (0);
537 }
538
539 void
540 in_pcbdisconnect(struct inpcb *inp)
541 {
542 #if NPF > 0
543 if (inp->inp_pf_sk) {
544 pf_remove_divert_state(inp->inp_pf_sk);
545 /* pf_remove_divert_state() may have detached the state */
546 pf_inp_unlink(inp);
547 }
548 #endif
549 switch (sotopf(inp->inp_socket)) {
550 #ifdef INET6
551 case PF_INET6:
552 inp->inp_faddr6 = in6addr_any;
553 break;
554 #endif
555 case PF_INET:
556 inp->inp_faddr.s_addr = INADDR_ANY;
557 break;
558 }
559
560 inp->inp_fport = 0;
561 inp->inp_flowid = 0;
562 in_pcbrehash(inp);
563 if (inp->inp_socket->so_state & SS_NOFDREF)
564 in_pcbdetach(inp);
565 }
566
567 void
568 in_pcbdetach(struct inpcb *inp)
569 {
570 struct socket *so = inp->inp_socket;
571 struct inpcbtable *table = inp->inp_table;
572
573 so->so_pcb = NULL;
574 /*
575 * As long as the NET_LOCK() is the default lock for Internet
576 * sockets, do not release it to not introduce new sleeping
577 * points.
578 */
579 sofree(so, 1);
580 m_freem(inp->inp_options);
581 if (inp->inp_route.ro_rt) {
582 rtfree(inp->inp_route.ro_rt);
583 inp->inp_route.ro_rt = NULL;
584 }
585 #ifdef INET6
586 if (inp->inp_flags & INP_IPV6) {
587 ip6_freepcbopts(inp->inp_outputopts6);
588 ip6_freemoptions(inp->inp_moptions6);
589 } else
590 #endif
591 ip_freemoptions(inp->inp_moptions);
592 #if NPF > 0
593 if (inp->inp_pf_sk) {
594 pf_remove_divert_state(inp->inp_pf_sk);
595 /* pf_remove_divert_state() may have detached the state */
596 pf_inp_unlink(inp);
597 }
598 #endif
599 mtx_enter(&table->inpt_mtx);
600 LIST_REMOVE(inp, inp_lhash);
601 LIST_REMOVE(inp, inp_hash);
602 TAILQ_REMOVE(&table->inpt_queue, inp, inp_queue);
603 table->inpt_count--;
604 mtx_leave(&table->inpt_mtx);
605
606 in_pcbunref(inp);
607 }
608
609 struct inpcb *
610 in_pcbref(struct inpcb *inp)
611 {
612 if (inp == NULL)
613 return NULL;
614 refcnt_take(&inp->inp_refcnt);
615 return inp;
616 }
617
618 void
619 in_pcbunref(struct inpcb *inp)
620 {
621 if (inp == NULL)
622 return;
623 if (refcnt_rele(&inp->inp_refcnt) == 0)
624 return;
625 KASSERT((LIST_NEXT(inp, inp_hash) == NULL) ||
626 (LIST_NEXT(inp, inp_hash) == _Q_INVALID));
627 KASSERT((LIST_NEXT(inp, inp_lhash) == NULL) ||
628 (LIST_NEXT(inp, inp_lhash) == _Q_INVALID));
629 KASSERT((TAILQ_NEXT(inp, inp_queue) == NULL) ||
630 (TAILQ_NEXT(inp, inp_queue) == _Q_INVALID));
631 pool_put(&inpcb_pool, inp);
632 }
633
634 void
635 in_setsockaddr(struct inpcb *inp, struct mbuf *nam)
636 {
637 struct sockaddr_in *sin;
638
639 nam->m_len = sizeof(*sin);
640 sin = mtod(nam, struct sockaddr_in *);
641 memset(sin, 0, sizeof(*sin));
642 sin->sin_family = AF_INET;
643 sin->sin_len = sizeof(*sin);
644 sin->sin_port = inp->inp_lport;
645 sin->sin_addr = inp->inp_laddr;
646 }
647
648 void
649 in_setpeeraddr(struct inpcb *inp, struct mbuf *nam)
650 {
651 struct sockaddr_in *sin;
652
653 #ifdef INET6
654 if (sotopf(inp->inp_socket) == PF_INET6) {
655 in6_setpeeraddr(inp, nam);
656 return;
657 }
658 #endif /* INET6 */
659
660 nam->m_len = sizeof(*sin);
661 sin = mtod(nam, struct sockaddr_in *);
662 memset(sin, 0, sizeof(*sin));
663 sin->sin_family = AF_INET;
664 sin->sin_len = sizeof(*sin);
665 sin->sin_port = inp->inp_fport;
666 sin->sin_addr = inp->inp_faddr;
667 }
668
/*
 * Socket-level wrapper: store the local address of so's PCB in nam
 * via in_setsockaddr().  Always returns 0.
 */
int
in_sockaddr(struct socket *so, struct mbuf *nam)
{
	in_setsockaddr(sotoinpcb(so), nam);

	return (0);
}
679
/*
 * Socket-level wrapper: store the foreign address of so's PCB in nam
 * via in_setpeeraddr().  Always returns 0.
 */
int
in_peeraddr(struct socket *so, struct mbuf *nam)
{
	in_setpeeraddr(sotoinpcb(so), nam);

	return (0);
}
690
691 /*
692 * Pass some notification to all connections of a protocol
693 * associated with address dst. The "usual action" will be
694 * taken, depending on the ctlinput cmd. The caller must filter any
695 * cmds that are uninteresting (e.g., no error in the map).
696 * Call the protocol specific routine (if any) to report
697 * any errors for each matching socket.
698 */
699 void
700 in_pcbnotifyall(struct inpcbtable *table, struct sockaddr *dst, u_int rtable,
701 int errno, void (*notify)(struct inpcb *, int))
702 {
703 SIMPLEQ_HEAD(, inpcb) inpcblist;
704 struct inpcb *inp;
705 struct in_addr faddr;
706 u_int rdomain;
707
708 if (dst->sa_family != AF_INET)
709 return;
710 faddr = satosin(dst)->sin_addr;
711 if (faddr.s_addr == INADDR_ANY)
712 return;
713 if (notify == NULL)
714 return;
715
716 /*
717 * Use a temporary notify list protected by rwlock to run over
718 * selected PCB. This is necessary as the list of all PCB is
719 * protected by a mutex. Notify may call ip_output() eventually
720 * which may sleep as pf lock is a rwlock. Also the SRP
721 * implementation of the routing table might sleep.
722 * The same inp_notify list entry and inpt_notify rwlock are
723 * used for UDP multicast and raw IP delivery.
724 */
725 SIMPLEQ_INIT(&inpcblist);
726 rdomain = rtable_l2(rtable);
727 rw_enter_write(&table->inpt_notify);
728 mtx_enter(&table->inpt_mtx);
729 TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
730 #ifdef INET6
731 if (inp->inp_flags & INP_IPV6)
732 continue;
733 #endif
734 if (inp->inp_faddr.s_addr != faddr.s_addr ||
735 rtable_l2(inp->inp_rtableid) != rdomain ||
736 inp->inp_socket == NULL) {
737 continue;
738 }
739 in_pcbref(inp);
740 SIMPLEQ_INSERT_TAIL(&inpcblist, inp, inp_notify);
741 }
742 mtx_leave(&table->inpt_mtx);
743
744 while ((inp = SIMPLEQ_FIRST(&inpcblist)) != NULL) {
745 SIMPLEQ_REMOVE_HEAD(&inpcblist, inp_notify);
746 (*notify)(inp, errno);
747 in_pcbunref(inp);
748 }
749 rw_exit_write(&table->inpt_notify);
750 }
751
752 /*
753 * Check for alternatives when higher level complains
754 * about service problems. For now, invalidate cached
755 * routing information. If the route was created dynamically
756 * (by a redirect), time to try a default gateway again.
757 */
758 void
759 in_losing(struct inpcb *inp)
760 {
761 struct rtentry *rt = inp->inp_route.ro_rt;
762
763 if (rt) {
764 inp->inp_route.ro_rt = NULL;
765
766 if (rt->rt_flags & RTF_DYNAMIC) {
767 struct ifnet *ifp;
768
769 ifp = if_get(rt->rt_ifidx);
770 /*
771 * If the interface is gone, all its attached
772 * route entries have been removed from the table,
773 * so we're dealing with a stale cache and have
774 * nothing to do.
775 */
776 if (ifp != NULL)
777 rtdeletemsg(rt, ifp, inp->inp_rtableid);
778 if_put(ifp);
779 }
780 /*
781 * A new route can be allocated
782 * the next time output is attempted.
783 * rtfree() needs to be called in anycase because the inp
784 * is still holding a reference to rt.
785 */
786 rtfree(rt);
787 }
788 }
789
790 /*
791 * After a routing change, flush old routing
792 * and allocate a (hopefully) better one.
793 */
794 void
795 in_rtchange(struct inpcb *inp, int errno)
796 {
797 if (inp->inp_route.ro_rt) {
798 rtfree(inp->inp_route.ro_rt);
799 inp->inp_route.ro_rt = NULL;
800 /*
801 * A new route can be allocated the next time
802 * output is attempted.
803 */
804 }
805 }
806
807 struct inpcb *
808 in_pcblookup_local(struct inpcbtable *table, void *laddrp, u_int lport_arg,
809 int flags, u_int rtable)
810 {
811 struct inpcb *inp, *match = NULL;
812 int matchwild = 3, wildcard;
813 u_int16_t lport = lport_arg;
814 struct in_addr laddr = *(struct in_addr *)laddrp;
815 #ifdef INET6
816 struct in6_addr *laddr6 = (struct in6_addr *)laddrp;
817 #endif
818 struct inpcbhead *head;
819 u_int rdomain;
820
821 rdomain = rtable_l2(rtable);
822 mtx_enter(&table->inpt_mtx);
823 head = in_pcblhash(table, rdomain, lport);
824 LIST_FOREACH(inp, head, inp_lhash) {
825 if (rtable_l2(inp->inp_rtableid) != rdomain)
826 continue;
827 if (inp->inp_lport != lport)
828 continue;
829 wildcard = 0;
830 #ifdef INET6
831 if (ISSET(flags, INPLOOKUP_IPV6)) {
832 if (!ISSET(inp->inp_flags, INP_IPV6))
833 continue;
834
835 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6))
836 wildcard++;
837
838 if (!IN6_ARE_ADDR_EQUAL(&inp->inp_laddr6, laddr6)) {
839 if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_laddr6) ||
840 IN6_IS_ADDR_UNSPECIFIED(laddr6))
841 wildcard++;
842 else
843 continue;
844 }
845
846 } else
847 #endif /* INET6 */
848 {
849 #ifdef INET6
850 if (ISSET(inp->inp_flags, INP_IPV6))
851 continue;
852 #endif /* INET6 */
853
854 if (inp->inp_faddr.s_addr != INADDR_ANY)
855 wildcard++;
856
857 if (inp->inp_laddr.s_addr != laddr.s_addr) {
858 if (inp->inp_laddr.s_addr == INADDR_ANY ||
859 laddr.s_addr == INADDR_ANY)
860 wildcard++;
861 else
862 continue;
863 }
864
865 }
866 if ((!wildcard || (flags & INPLOOKUP_WILDCARD)) &&
867 wildcard < matchwild) {
868 match = inp;
869 if ((matchwild = wildcard) == 0)
870 break;
871 }
872 }
873 in_pcbref(match);
874 mtx_leave(&table->inpt_mtx);
875
876 return (match);
877 }
878
879 struct rtentry *
880 in_pcbrtentry(struct inpcb *inp)
881 {
882 struct route *ro;
883
884 ro = &inp->inp_route;
885
886 /* check if route is still valid */
887 if (!rtisvalid(ro->ro_rt)) {
888 rtfree(ro->ro_rt);
889 ro->ro_rt = NULL;
890 }
891
892 /*
893 * No route yet, so try to acquire one.
894 */
895 if (ro->ro_rt == NULL) {
896 #ifdef INET6
897 memset(ro, 0, sizeof(struct route_in6));
898 #else
899 memset(ro, 0, sizeof(struct route));
900 #endif
901
902 switch(sotopf(inp->inp_socket)) {
903 #ifdef INET6
904 case PF_INET6:
905 if (IN6_IS_ADDR_UNSPECIFIED(&inp->inp_faddr6))
906 break;
907 ro->ro_dst.sa_family = AF_INET6;
908 ro->ro_dst.sa_len = sizeof(struct sockaddr_in6);
909 satosin6(&ro->ro_dst)->sin6_addr = inp->inp_faddr6;
910 ro->ro_tableid = inp->inp_rtableid;
911 ro->ro_rt = rtalloc_mpath(&ro->ro_dst,
912 &inp->inp_laddr6.s6_addr32[0], ro->ro_tableid);
913 break;
914 #endif /* INET6 */
915 case PF_INET:
916 if (inp->inp_faddr.s_addr == INADDR_ANY)
917 break;
918 ro->ro_dst.sa_family = AF_INET;
919 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
920 satosin(&ro->ro_dst)->sin_addr = inp->inp_faddr;
921 ro->ro_tableid = inp->inp_rtableid;
922 ro->ro_rt = rtalloc_mpath(&ro->ro_dst,
923 &inp->inp_laddr.s_addr, ro->ro_tableid);
924 break;
925 }
926 }
927 return (ro->ro_rt);
928 }
929
930 /*
931 * Return an IPv4 address, which is the most appropriate for a given
932 * destination.
933 * If necessary, this function lookups the routing table and returns
934 * an entry to the caller for later use.
935 */
936 int
937 in_pcbselsrc(struct in_addr *insrc, struct sockaddr_in *sin,
938 struct inpcb *inp)
939 {
940 struct ip_moptions *mopts = inp->inp_moptions;
941 struct route *ro = &inp->inp_route;
942 struct in_addr *laddr = &inp->inp_laddr;
943 u_int rtableid = inp->inp_rtableid;
944 struct sockaddr *ip4_source = NULL;
945
946 struct sockaddr_in *sin2;
947 struct in_ifaddr *ia = NULL;
948
949 /*
950 * If the socket(if any) is already bound, use that bound address
951 * unless it is INADDR_ANY or INADDR_BROADCAST.
952 */
953 if (laddr->s_addr != INADDR_ANY &&
954 laddr->s_addr != INADDR_BROADCAST) {
955 *insrc = *laddr;
956 return (0);
957 }
958
959 /*
960 * If the destination address is multicast or limited
961 * broadcast (255.255.255.255) and an outgoing interface has
962 * been set as a multicast option, use the address of that
963 * interface as our source address.
964 */
965 if ((IN_MULTICAST(sin->sin_addr.s_addr) ||
966 sin->sin_addr.s_addr == INADDR_BROADCAST) && mopts != NULL) {
967 struct ifnet *ifp;
968
969 ifp = if_get(mopts->imo_ifidx);
970 if (ifp != NULL) {
971 if (ifp->if_rdomain == rtable_l2(rtableid))
972 IFP_TO_IA(ifp, ia);
973 if (ia == NULL) {
974 if_put(ifp);
975 return (EADDRNOTAVAIL);
976 }
977
978 *insrc = ia->ia_addr.sin_addr;
979 if_put(ifp);
980 return (0);
981 }
982 }
983
984 /*
985 * If route is known or can be allocated now,
986 * our src addr is taken from the i/f, else punt.
987 */
988 if (!rtisvalid(ro->ro_rt) || (ro->ro_tableid != rtableid) ||
989 (satosin(&ro->ro_dst)->sin_addr.s_addr != sin->sin_addr.s_addr)) {
990 rtfree(ro->ro_rt);
991 ro->ro_rt = NULL;
992 }
993 if (ro->ro_rt == NULL) {
994 /* No route yet, so try to acquire one */
995 ro->ro_dst.sa_family = AF_INET;
996 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
997 satosin(&ro->ro_dst)->sin_addr = sin->sin_addr;
998 ro->ro_tableid = rtableid;
999 ro->ro_rt = rtalloc_mpath(&ro->ro_dst, NULL, ro->ro_tableid);
1000
1001 /*
1002 * It is important to zero out the rest of the
1003 * struct sockaddr_in when mixing v6 & v4!
1004 */
1005 sin2 = satosin(&ro->ro_dst);
1006 memset(sin2->sin_zero, 0, sizeof(sin2->sin_zero));
1007 }
1008
1009 /*
1010 * If we found a route, use the address
1011 * corresponding to the outgoing interface.
1012 */
1013 if (ro->ro_rt != NULL)
1014 ia = ifatoia(ro->ro_rt->rt_ifa);
1015
1016 /*
1017 * Use preferred source address if :
1018 * - destination is not onlink
1019 * - preferred source address is set
1020 * - output interface is UP
1021 */
1022 if (ro->ro_rt && !(ro->ro_rt->rt_flags & RTF_LLINFO) &&
1023 !(ro->ro_rt->rt_flags & RTF_HOST)) {
1024 ip4_source = rtable_getsource(rtableid, AF_INET);
1025 if (ip4_source != NULL) {
1026 struct ifaddr *ifa;
1027 if ((ifa = ifa_ifwithaddr(ip4_source, rtableid)) !=
1028 NULL && ISSET(ifa->ifa_ifp->if_flags, IFF_UP)) {
1029 *insrc = satosin(ip4_source)->sin_addr;
1030 return (0);
1031 }
1032 }
1033 }
1034
1035 if (ia == NULL)
1036 return (EADDRNOTAVAIL);
1037
1038 *insrc = ia->ia_addr.sin_addr;
1039 return (0);
1040 }
1041
1042 void
1043 in_pcbrehash(struct inpcb *inp)
1044 {
1045 struct inpcbtable *table = inp->inp_table;
1046
1047 mtx_enter(&table->inpt_mtx);
1048 LIST_REMOVE(inp, inp_lhash);
1049 LIST_REMOVE(inp, inp_hash);
1050 in_pcbhash_insert(inp);
1051 mtx_leave(&table->inpt_mtx);
1052 }
1053
1054 void
1055 in_pcbhash_insert(struct inpcb *inp)
1056 {
1057 struct inpcbtable *table = inp->inp_table;
1058 struct inpcbhead *head;
1059
1060 NET_ASSERT_LOCKED();
1061 MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
1062
1063 head = in_pcblhash(table, inp->inp_rtableid, inp->inp_lport);
1064 LIST_INSERT_HEAD(head, inp, inp_lhash);
1065 #ifdef INET6
1066 if (inp->inp_flags & INP_IPV6)
1067 head = in6_pcbhash(table, rtable_l2(inp->inp_rtableid),
1068 &inp->inp_faddr6, inp->inp_fport,
1069 &inp->inp_laddr6, inp->inp_lport);
1070 else
1071 #endif /* INET6 */
1072 head = in_pcbhash(table, rtable_l2(inp->inp_rtableid),
1073 &inp->inp_faddr, inp->inp_fport,
1074 &inp->inp_laddr, inp->inp_lport);
1075 LIST_INSERT_HEAD(head, inp, inp_hash);
1076 }
1077
1078 struct inpcb *
1079 in_pcbhash_lookup(struct inpcbtable *table, u_int rdomain,
1080 const struct in_addr *faddr, u_short fport,
1081 const struct in_addr *laddr, u_short lport)
1082 {
1083 struct inpcbhead *head;
1084 struct inpcb *inp;
1085
1086 NET_ASSERT_LOCKED();
1087 MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
1088
1089 head = in_pcbhash(table, rdomain, faddr, fport, laddr, lport);
1090 LIST_FOREACH(inp, head, inp_hash) {
1091 #ifdef INET6
1092 if (ISSET(inp->inp_flags, INP_IPV6))
1093 continue;
1094 #endif
1095 if (inp->inp_fport == fport && inp->inp_lport == lport &&
1096 inp->inp_faddr.s_addr == faddr->s_addr &&
1097 inp->inp_laddr.s_addr == laddr->s_addr &&
1098 rtable_l2(inp->inp_rtableid) == rdomain) {
1099 break;
1100 }
1101 }
1102 if (inp != NULL) {
1103 /*
1104 * Move this PCB to the head of hash chain so that
1105 * repeated accesses are quicker. This is analogous to
1106 * the historic single-entry PCB cache.
1107 */
1108 if (inp != LIST_FIRST(head)) {
1109 LIST_REMOVE(inp, inp_hash);
1110 LIST_INSERT_HEAD(head, inp, inp_hash);
1111 }
1112 }
1113 return (inp);
1114 }
1115
1116 int
1117 in_pcbresize(struct inpcbtable *table, int hashsize)
1118 {
1119 u_long nmask, nlmask;
1120 int osize;
1121 void *nhashtbl, *nlhashtbl, *ohashtbl, *olhashtbl;
1122 struct inpcb *inp;
1123
1124 MUTEX_ASSERT_LOCKED(&table->inpt_mtx);
1125
1126 ohashtbl = table->inpt_hashtbl;
1127 olhashtbl = table->inpt_lhashtbl;
1128 osize = table->inpt_size;
1129
1130 nhashtbl = hashinit(hashsize, M_PCB, M_NOWAIT, &nmask);
1131 if (nhashtbl == NULL)
1132 return ENOBUFS;
1133 nlhashtbl = hashinit(hashsize, M_PCB, M_NOWAIT, &nlmask);
1134 if (nlhashtbl == NULL) {
1135 hashfree(nhashtbl, hashsize, M_PCB);
1136 return ENOBUFS;
1137 }
1138 table->inpt_hashtbl = nhashtbl;
1139 table->inpt_lhashtbl = nlhashtbl;
1140 table->inpt_mask = nmask;
1141 table->inpt_lmask = nlmask;
1142 table->inpt_size = hashsize;
1143 arc4random_buf(&table->inpt_key, sizeof(table->inpt_key));
1144 arc4random_buf(&table->inpt_lkey, sizeof(table->inpt_lkey));
1145
1146 TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
1147 LIST_REMOVE(inp, inp_lhash);
1148 LIST_REMOVE(inp, inp_hash);
1149 in_pcbhash_insert(inp);
1150 }
1151 hashfree(ohashtbl, osize, M_PCB);
1152 hashfree(olhashtbl, osize, M_PCB);
1153
1154 return (0);
1155 }
1156
#if defined(DIAGNOSTIC)
/*
 * When non-zero, in_pcblookup() and in_pcblookup_listen() print the
 * lookup key of every lookup that finds no PCB.
 */
int in_pcbnotifymiss = 0;
#endif /* DIAGNOSTIC */
1160
/*
 * The in(6)_pcblookup functions are used to locate connected sockets
 * quickly:
 *	faddr.fport <-> laddr.lport
 * No wildcard matching is done so that listening sockets are not found.
 * If the functions return NULL, in(6)_pcblookup_listen can be used to
 * find a listening/bound socket that may accept the connection.
 * After those two lookups no others are necessary.
 */
1170 struct inpcb *
1171 in_pcblookup(struct inpcbtable *table, struct in_addr faddr,
1172 u_int fport, struct in_addr laddr, u_int lport, u_int rtable)
1173 {
1174 struct inpcb *inp;
1175 u_int rdomain;
1176
1177 rdomain = rtable_l2(rtable);
1178 mtx_enter(&table->inpt_mtx);
1179 inp = in_pcbhash_lookup(table, rdomain, &faddr, fport, &laddr, lport);
1180 in_pcbref(inp);
1181 mtx_leave(&table->inpt_mtx);
1182 #ifdef DIAGNOSTIC
1183 if (inp == NULL && in_pcbnotifymiss) {
1184 printf("%s: faddr=%08x fport=%d laddr=%08x lport=%d rdom=%u\n",
1185 __func__, ntohl(faddr.s_addr), ntohs(fport),
1186 ntohl(laddr.s_addr), ntohs(lport), rdomain);
1187 }
1188 #endif
1189 return (inp);
1190 }
1191
/*
 * The in(6)_pcblookup_listen functions are used to locate listening
 * sockets quickly.  These are sockets with unspecified foreign address
 * and port:
 *	*.*     <-> laddr.lport
 *	*.*     <-> *.lport
 */
1199 struct inpcb *
1200 in_pcblookup_listen(struct inpcbtable *table, struct in_addr laddr,
1201 u_int lport_arg, struct mbuf *m, u_int rtable)
1202 {
1203 const struct in_addr *key1, *key2;
1204 struct inpcb *inp;
1205 u_int16_t lport = lport_arg;
1206 u_int rdomain;
1207
1208 key1 = &laddr;
1209 key2 = &zeroin_addr;
1210 #if NPF > 0
1211 if (m && m->m_pkthdr.pf.flags & PF_TAG_DIVERTED) {
1212 struct pf_divert *divert;
1213
1214 divert = pf_find_divert(m);
1215 KASSERT(divert != NULL);
1216 switch (divert->type) {
1217 case PF_DIVERT_TO:
1218 key1 = key2 = &divert->addr.v4;
1219 lport = divert->port;
1220 break;
1221 case PF_DIVERT_REPLY:
1222 return (NULL);
1223 default:
1224 panic("%s: unknown divert type %d, mbuf %p, divert %p",
1225 __func__, divert->type, m, divert);
1226 }
1227 } else if (m && m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST) {
1228 /*
1229 * Redirected connections should not be treated the same
1230 * as connections directed to 127.0.0.0/8 since localhost
1231 * can only be accessed from the host itself.
1232 * For example portmap(8) grants more permissions for
1233 * connections to the socket bound to 127.0.0.1 than
1234 * to the * socket.
1235 */
1236 key1 = &zeroin_addr;
1237 key2 = &laddr;
1238 }
1239 #endif
1240
1241 rdomain = rtable_l2(rtable);
1242 mtx_enter(&table->inpt_mtx);
1243 inp = in_pcbhash_lookup(table, rdomain, &zeroin_addr, 0, key1, lport);
1244 if (inp == NULL && key1->s_addr != key2->s_addr) {
1245 inp = in_pcbhash_lookup(table, rdomain,
1246 &zeroin_addr, 0, key2, lport);
1247 }
1248 in_pcbref(inp);
1249 mtx_leave(&table->inpt_mtx);
1250 #ifdef DIAGNOSTIC
1251 if (inp == NULL && in_pcbnotifymiss) {
1252 printf("%s: laddr=%08x lport=%d rdom=%u\n",
1253 __func__, ntohl(laddr.s_addr), ntohs(lport), rdomain);
1254 }
1255 #endif
1256 return (inp);
1257 }
Cache object: 00cea6aa60dbe361c9cad8ee113ddd34
|