/* FreeBSD/Linux Kernel Cross Reference: sys/netinet/in_pcb.c */
1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1991, 1993, 1995
5 * The Regents of the University of California.
6 * Copyright (c) 2007-2009 Robert N. M. Watson
7 * Copyright (c) 2010-2011 Juniper Networks, Inc.
8 * All rights reserved.
9 *
10 * Portions of this software were developed by Robert N. M. Watson under
11 * contract to Juniper Networks, Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
38 */
39
40 #include <sys/cdefs.h>
41 __FBSDID("$FreeBSD$");
42
43 #include "opt_ddb.h"
44 #include "opt_ipsec.h"
45 #include "opt_inet.h"
46 #include "opt_inet6.h"
47 #include "opt_ratelimit.h"
48 #include "opt_pcbgroup.h"
49 #include "opt_rss.h"
50
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/lock.h>
54 #include <sys/malloc.h>
55 #include <sys/mbuf.h>
56 #include <sys/callout.h>
57 #include <sys/eventhandler.h>
58 #include <sys/domain.h>
59 #include <sys/protosw.h>
60 #include <sys/rmlock.h>
61 #include <sys/smp.h>
62 #include <sys/socket.h>
63 #include <sys/socketvar.h>
64 #include <sys/sockio.h>
65 #include <sys/priv.h>
66 #include <sys/proc.h>
67 #include <sys/refcount.h>
68 #include <sys/jail.h>
69 #include <sys/kernel.h>
70 #include <sys/sysctl.h>
71
72 #ifdef DDB
73 #include <ddb/ddb.h>
74 #endif
75
76 #include <vm/uma.h>
77
78 #include <net/if.h>
79 #include <net/if_var.h>
80 #include <net/if_types.h>
81 #include <net/if_llatbl.h>
82 #include <net/route.h>
83 #include <net/rss_config.h>
84 #include <net/vnet.h>
85
86 #if defined(INET) || defined(INET6)
87 #include <netinet/in.h>
88 #include <netinet/in_pcb.h>
89 #ifdef INET
90 #include <netinet/in_var.h>
91 #endif
92 #include <netinet/ip_var.h>
93 #include <netinet/tcp_var.h>
94 #ifdef TCPHPTS
95 #include <netinet/tcp_hpts.h>
96 #endif
97 #include <netinet/udp.h>
98 #include <netinet/udp_var.h>
99 #ifdef INET6
100 #include <netinet/ip6.h>
101 #include <netinet6/in6_pcb.h>
102 #include <netinet6/in6_var.h>
103 #include <netinet6/ip6_var.h>
104 #endif /* INET6 */
105 #endif
106
107 #include <netipsec/ipsec_support.h>
108
109 #include <security/mac/mac_framework.h>
110
#define INPCBLBGROUP_SIZMIN	8	/* initial SO_REUSEPORT_LB group capacity */
#define INPCBLBGROUP_SIZMAX	256	/* hard upper bound on group capacity */

/* Periodic callout driving the random-port allocation policy. */
static struct callout ipport_tick_callout;

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */

/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_reservedlow);

/* Variables dealing with random ephemeral port allocation. */
VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
VNET_DEFINE(int, ipport_tcpallocs);		/* non-UDP allocs since last tick */
VNET_DEFINE_STATIC(int, ipport_tcplastcount);	/* snapshot compared by ipport_tick */

#define V_ipport_tcplastcount VNET(ipport_tcplastcount)

static void in_pcbremlists(struct inpcb *inp);
#ifdef INET
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
    struct in_addr faddr, u_int fport_arg,
    struct in_addr laddr, u_int lport_arg,
    int lookupflags, struct ifnet *ifp);
151
/*
 * Clamp (var) into [min, max].  Wrapped in do/while(0) so the macro
 * expands to exactly one statement: the bare if/else-if form breaks
 * (or silently mis-binds an else) when used inside an unbraced if.
 */
#define RANGECHK(var, min, max) \
	do { \
		if ((var) < (min)) \
			(var) = (min); \
		else if ((var) > (max)) \
			(var) = (max); \
	} while (0)
155
/*
 * Sysctl handler for all port-range variables: performs the usual
 * integer read/write, then clamps every range variable back into its
 * legal bounds after a successful write.
 */
static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, arg1, arg2, req);
	if (error == 0) {
		/* Re-validate all of them, not just the one written. */
		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
	}
	return (error);
}
172
173 #undef RANGECHK
174
175 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0,
176 "IP Ports");
177
178 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
179 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
180 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", "");
181 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
182 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
183 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", "");
184 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
185 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
186 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", "");
187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
188 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
189 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", "");
190 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
191 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
192 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", "");
193 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
194 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW,
195 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", "");
196 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
197 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
198 &VNET_NAME(ipport_reservedhigh), 0, "");
199 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
200 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
201 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
202 CTLFLAG_VNET | CTLFLAG_RW,
203 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
204 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
205 CTLFLAG_VNET | CTLFLAG_RW,
206 &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
207 "allocations before switching to a sequential one");
208 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
209 CTLFLAG_VNET | CTLFLAG_RW,
210 &VNET_NAME(ipport_randomtime), 0,
211 "Minimum time to keep sequential port "
212 "allocation before switching to a random one");
213 #endif /* INET */
214
215 /*
216 * in_pcb.c: manage the Protocol Control Blocks.
217 *
218 * NOTE: It is assumed that most of these functions will be called with
219 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
220 * functions often modify hash chains or addresses in pcbs.
221 */
222
223 static struct inpcblbgroup *
224 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
225 uint16_t port, const union in_dependaddr *addr, int size)
226 {
227 struct inpcblbgroup *grp;
228 size_t bytes;
229
230 bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
231 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
232 if (!grp)
233 return (NULL);
234 grp->il_vflag = vflag;
235 grp->il_lport = port;
236 grp->il_dependladdr = *addr;
237 grp->il_inpsiz = size;
238 CK_LIST_INSERT_HEAD(hdr, grp, il_list);
239 return (grp);
240 }
241
242 static void
243 in_pcblbgroup_free_deferred(epoch_context_t ctx)
244 {
245 struct inpcblbgroup *grp;
246
247 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
248 free(grp, M_PCB);
249 }
250
/*
 * Unlink a load balance group from its hash chain and schedule it for
 * deferred reclamation.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{

	CK_LIST_REMOVE(grp, il_list);
	/* Lockless readers may still hold the pointer; free after a
	 * network-epoch grace period rather than immediately. */
	epoch_call(net_epoch_preempt, &grp->il_epoch_ctx,
	    in_pcblbgroup_free_deferred);
}
259
260 static struct inpcblbgroup *
261 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
262 struct inpcblbgroup *old_grp, int size)
263 {
264 struct inpcblbgroup *grp;
265 int i;
266
267 grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
268 old_grp->il_lport, &old_grp->il_dependladdr, size);
269 if (grp == NULL)
270 return (NULL);
271
272 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
273 ("invalid new local group size %d and old local group count %d",
274 grp->il_inpsiz, old_grp->il_inpcnt));
275
276 for (i = 0; i < old_grp->il_inpcnt; ++i)
277 grp->il_inp[i] = old_grp->il_inp[i];
278 grp->il_inpcnt = old_grp->il_inpcnt;
279 in_pcblbgroup_free(old_grp);
280 return (grp);
281 }
282
/*
 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
 * and shrink group if possible.
 */
static void
in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
    int i)
{
	struct inpcblbgroup *grp, *new_grp;

	grp = *grpp;
	/* Compact the member array over the removed slot. */
	for (; i + 1 < grp->il_inpcnt; ++i)
		grp->il_inp[i] = grp->il_inp[i + 1];
	grp->il_inpcnt--;

	/* Shrink by half once occupancy drops to a quarter, but never
	 * below the minimum group size. */
	if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
	    grp->il_inpcnt <= grp->il_inpsiz / 4) {
		/* Shrink this group. */
		new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
		if (new_grp != NULL)
			*grpp = new_grp;
		/* On allocation failure, keep the oversized group. */
	}
}
306
/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 * Returns 0 on success (including the deliberate no-op cases below)
 * and ENOBUFS if a group could not be allocated or grown.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp)
{
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/*
	 * Don't allow jailed socket to join local group.
	 */
	if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
		return (0);

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	/* Look for an existing group matching vflag/port/local address. */
	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0)
			break;
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN);
		if (grp == NULL)
			return (ENOBUFS);
	} else if (grp->il_inpcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			/* Group is full: warn (rate-limited) and succeed
			 * without adding the pcb to the group. */
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOBUFS);
	}

	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));

	grp->il_inp[grp->il_inpcnt] = inp;
	grp->il_inpcnt++;
	return (0);
}
381
/*
 * Remove PCB from load balance group.
 * A pcb that is not a member of any group on its chain is silently
 * ignored.
 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		/* Scan each group on the chain for this pcb. */
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/* Pull up inpcbs, shrink group if possible. */
				in_pcblbgroup_reorder(hdr, &grp, i);
			}
			return;
		}
	}
}
416
/*
 * Different protocols initialize their inpcbs differently - giving
 * different name to the lock. But they all are disposed the same.
 * UMA zone fini hook: tear down the per-inpcb lock.
 */
static void
inpcb_fini(void *mem, int size)
{

	INP_LOCK_DESTROY((struct inpcb *)mem);
}
428
/*
 * Initialize an inpcbinfo -- we should be able to reduce the number of
 * arguments in time.
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
    struct inpcbhead *listhead, int hash_nelements, int porthash_nelements,
    char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields)
{

	/* A port hash larger than the port space is pointless. */
	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);

	INP_INFO_LOCK_INIT(pcbinfo, name);
	INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash");	/* XXXRW: argument? */
	INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist");
#ifdef VIMAGE
	pcbinfo->ipi_vnet = curvnet;
#endif
	pcbinfo->ipi_listhead = listhead;
	CK_LIST_INIT(pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;
	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
	/* The SO_REUSEPORT_LB group hash is sized like the port hash. */
	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
	in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
#endif
	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
	    NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0);
	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
	uma_zone_set_warning(pcbinfo->ipi_zone,
	    "kern.ipc.maxsockets limit reached");
}
465
/*
 * Destroy an inpcbinfo.
 * The caller must have released every pcb first (asserted below).
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	/* Tear down roughly in the reverse order of in_pcbinfo_init(). */
	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);
#ifdef PCBGROUP
	in_pcbgroup_destroy(pcbinfo);
#endif
	uma_zdestroy(pcbinfo->ipi_zone);
	INP_LIST_LOCK_DESTROY(pcbinfo);
	INP_HASH_LOCK_DESTROY(pcbinfo);
	INP_INFO_LOCK_DESTROY(pcbinfo);
}
489
/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 * Returns 0, ENOBUFS on zone exhaustion, or a MAC/IPsec init error.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
	int error;

#ifdef INVARIANTS
	if (pcbinfo == &V_tcbinfo) {
		INP_INFO_RLOCK_ASSERT(pcbinfo);
	} else {
		INP_INFO_WLOCK_ASSERT(pcbinfo);
	}
#endif

	error = 0;
	inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Only part of the pcb is zeroed per allocation; clear it here. */
	bzero(&inp->inp_start_zero, inp_zero_size);
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		/* Undo the MAC label set up above before unwinding. */
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
	}
#endif
	INP_WLOCK(inp);
	INP_LIST_WLOCK(pcbinfo);
	CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list);
	pcbinfo->ipi_count++;
	so->so_pcb = (caddr_t)inp;
#ifdef INET6
	if (V_ip6_auto_flowlabel)
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	refcount_init(&inp->inp_refcount, 1);	/* Reference from inpcbinfo */

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	INP_LIST_WUNLOCK(pcbinfo);
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	/* Error unwind: drop the cred ref and return the zone item. */
	if (error != 0) {
		crfree(inp->inp_cred);
		uma_zfree(pcbinfo->ipi_zone, inp);
	}
#endif
	return (error);
}
566
567 #ifdef INET
568 int
569 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
570 {
571 int anonport, error;
572
573 INP_WLOCK_ASSERT(inp);
574 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
575
576 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
577 return (EINVAL);
578 anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
579 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
580 &inp->inp_lport, cred);
581 if (error)
582 return (error);
583 if (in_pcbinshash(inp) != 0) {
584 inp->inp_laddr.s_addr = INADDR_ANY;
585 inp->inp_lport = 0;
586 return (EAGAIN);
587 }
588 if (anonport)
589 inp->inp_flags |= INP_ANONPORT;
590 return (0);
591 }
592 #endif
593
594 #if defined(INET) || defined(INET6)
595 /*
596 * Assign a local port like in_pcb_lport(), but also used with connect()
597 * and a foreign address and port. If fsa is non-NULL, choose a local port
598 * that is unused with those, otherwise one that is completely unused.
599 * lsa can be NULL for IPv6.
600 */
601 int
602 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
603 struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
604 {
605 struct inpcbinfo *pcbinfo;
606 struct inpcb *tmpinp;
607 unsigned short *lastport;
608 int count, dorandom, error;
609 u_short aux, first, last, lport;
610 #ifdef INET
611 struct in_addr laddr, faddr;
612 #endif
613 #ifdef INET6
614 struct in6_addr *laddr6, *faddr6;
615 #endif
616
617 pcbinfo = inp->inp_pcbinfo;
618
619 /*
620 * Because no actual state changes occur here, a global write lock on
621 * the pcbinfo isn't required.
622 */
623 INP_LOCK_ASSERT(inp);
624 INP_HASH_LOCK_ASSERT(pcbinfo);
625
626 if (inp->inp_flags & INP_HIGHPORT) {
627 first = V_ipport_hifirstauto; /* sysctl */
628 last = V_ipport_hilastauto;
629 lastport = &pcbinfo->ipi_lasthi;
630 } else if (inp->inp_flags & INP_LOWPORT) {
631 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 0);
632 if (error)
633 return (error);
634 first = V_ipport_lowfirstauto; /* 1023 */
635 last = V_ipport_lowlastauto; /* 600 */
636 lastport = &pcbinfo->ipi_lastlow;
637 } else {
638 first = V_ipport_firstauto; /* sysctl */
639 last = V_ipport_lastauto;
640 lastport = &pcbinfo->ipi_lastport;
641 }
642 /*
643 * For UDP(-Lite), use random port allocation as long as the user
644 * allows it. For TCP (and as of yet unknown) connections,
645 * use random port allocation only if the user allows it AND
646 * ipport_tick() allows it.
647 */
648 if (V_ipport_randomized &&
649 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
650 pcbinfo == &V_ulitecbinfo))
651 dorandom = 1;
652 else
653 dorandom = 0;
654 /*
655 * It makes no sense to do random port allocation if
656 * we have the only port available.
657 */
658 if (first == last)
659 dorandom = 0;
660 /* Make sure to not include UDP(-Lite) packets in the count. */
661 if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo)
662 V_ipport_tcpallocs++;
663 /*
664 * Instead of having two loops further down counting up or down
665 * make sure that first is always <= last and go with only one
666 * code path implementing all logic.
667 */
668 if (first > last) {
669 aux = first;
670 first = last;
671 last = aux;
672 }
673
674 #ifdef INET
675 laddr.s_addr = INADDR_ANY;
676 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
677 if (lsa != NULL)
678 laddr = ((struct sockaddr_in *)lsa)->sin_addr;
679 if (fsa != NULL)
680 faddr = ((struct sockaddr_in *)fsa)->sin_addr;
681 }
682 #endif
683 #ifdef INET6
684 laddr6 = NULL;
685 if ((inp->inp_vflag & INP_IPV6) != 0) {
686 if (lsa != NULL)
687 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
688 if (fsa != NULL)
689 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
690 }
691 #endif
692
693 tmpinp = NULL;
694 lport = *lportp;
695
696 if (dorandom)
697 *lastport = first + (arc4random() % (last - first));
698
699 count = last - first;
700
701 do {
702 if (count-- < 0) /* completely used? */
703 return (EADDRNOTAVAIL);
704 ++*lastport;
705 if (*lastport < first || *lastport > last)
706 *lastport = first;
707 lport = htons(*lastport);
708
709 if (fsa != NULL) {
710
711 #ifdef INET
712 if (lsa->sa_family == AF_INET) {
713 tmpinp = in_pcblookup_hash_locked(pcbinfo,
714 faddr, fport, laddr, lport, lookupflags,
715 NULL);
716 }
717 #endif
718 #ifdef INET6
719 if (lsa->sa_family == AF_INET6) {
720 tmpinp = in6_pcblookup_hash_locked(pcbinfo,
721 faddr6, fport, laddr6, lport, lookupflags,
722 NULL);
723 }
724 #endif
725 } else {
726 #ifdef INET6
727 if ((inp->inp_vflag & INP_IPV6) != 0)
728 tmpinp = in6_pcblookup_local(pcbinfo,
729 &inp->in6p_laddr, lport, lookupflags, cred);
730 #endif
731 #if defined(INET) && defined(INET6)
732 else
733 #endif
734 #ifdef INET
735 tmpinp = in_pcblookup_local(pcbinfo, laddr,
736 lport, lookupflags, cred);
737 #endif
738 }
739 } while (tmpinp != NULL);
740
741 *lportp = lport;
742
743 return (0);
744 }
745
746 /*
747 * Select a local port (number) to use.
748 */
749 int
750 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
751 struct ucred *cred, int lookupflags)
752 {
753 struct sockaddr_in laddr;
754
755 if (laddrp) {
756 bzero(&laddr, sizeof(laddr));
757 laddr.sin_family = AF_INET;
758 laddr.sin_addr = *laddrp;
759 }
760 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
761 NULL, lportp, NULL, 0, cred, lookupflags));
762 }
763
764 /*
765 * Return cached socket options.
766 */
767 int
768 inp_so_options(const struct inpcb *inp)
769 {
770 int so_options;
771
772 so_options = 0;
773
774 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
775 so_options |= SO_REUSEPORT_LB;
776 if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
777 so_options |= SO_REUSEPORT;
778 if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
779 so_options |= SO_REUSEADDR;
780 return (so_options);
781 }
782 #endif /* INET || INET6 */
783
784 /*
785 * Check if a new BINDMULTI socket is allowed to be created.
786 *
787 * ni points to the new inp.
788 * oi points to the existing inp.
789 *
790 * This checks whether the existing inp also has BINDMULTI and
791 * whether the credentials match.
792 */
793 int
794 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
795 {
796 /* Check permissions match */
797 if ((ni->inp_flags2 & INP_BINDMULTI) &&
798 (ni->inp_cred->cr_uid !=
799 oi->inp_cred->cr_uid))
800 return (0);
801
802 /* Check the existing inp has BINDMULTI set */
803 if ((ni->inp_flags2 & INP_BINDMULTI) &&
804 ((oi->inp_flags2 & INP_BINDMULTI) == 0))
805 return (0);
806
807 /*
808 * We're okay - either INP_BINDMULTI isn't set on ni, or
809 * it is and it matches the checks.
810 */
811 return (1);
812 }
813
814 #ifdef INET
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB. Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
    u_short *lportp, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct sockaddr_in *sin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct in_addr laddr;
	u_short lport = 0;
	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error;

	/*
	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
	 * so that we don't have to add to the (already messy) code below.
	 */
	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	laddr.s_addr = *laddrp;
	/* An explicit sockaddr and an already-set local address conflict. */
	if (nam != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (nam == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		sin = (struct sockaddr_in *)nam;
		if (nam->sa_len != sizeof (*sin))
			return (EINVAL);
#ifdef notdef
		/*
		 * We should check the family, but old programs
		 * incorrectly fail to initialize it.
		 */
		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);
#endif
		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		/* NB: lport is left as 0 if the port isn't being changed. */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
			/*
			 * XXX: How to deal with SO_REUSEPORT_LB here?
			 * Treat same as SO_REUSEPORT for now.
			 */
			if ((so->so_options &
			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			sin->sin_port = 0;		/* yech... */
			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
			/*
			 * Is the address a local IP address?
			 * If INP_BINDANY is set, then the socket may be bound
			 * to any endpoint address, local or not.
			 */
			if ((inp->inp_flags & INP_BINDANY) == 0 &&
			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
				return (EADDRNOTAVAIL);
		}
		laddr = sin->sin_addr;
		if (lport) {
			struct inpcb *t;
			struct tcptw *tw;

			/* GROSS */
			/* Reserved-range ports require privilege. */
			if (ntohs(lport) <= V_ipport_reservedhigh &&
			    ntohs(lport) >= V_ipport_reservedlow &&
			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT,
			    0))
				return (EACCES);
			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
			    priv_check_cred(inp->inp_cred,
			    PRIV_NETINET_REUSEPORT, 0) != 0) {
				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
				    lport, INPLOOKUP_WILDCARD, cred);
	/*
	 * XXX
	 * This entire block sorely needs a rewrite.
	 */
				if (t &&
				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
				    (so->so_type != SOCK_STREAM ||
				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				     (t->inp_flags2 & INP_REUSEPORT) ||
				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
				    (inp->inp_cred->cr_uid !=
				     t->inp_cred->cr_uid))
					return (EADDRINUSE);

				/*
				 * If the socket is a BINDMULTI socket, then
				 * the credentials need to match and the
				 * original socket also has to have been bound
				 * with BINDMULTI.
				 */
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
			    lport, lookupflags, cred);
			if (t && (t->inp_flags & INP_TIMEWAIT)) {
				/*
				 * XXXRW: If an incpb has had its timewait
				 * state recycled, we treat the address as
				 * being in use (for now). This is better
				 * than a panic, but not desirable.
				 */
				tw = intotw(t);
				if (tw == NULL ||
				    ((reuseport & tw->tw_so_options) == 0 &&
				     (reuseport_lb &
				      tw->tw_so_options) == 0)) {
					return (EADDRINUSE);
				}
			} else if (t &&
			    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
			    (reuseport & inp_so_options(t)) == 0 &&
			    (reuseport_lb & inp_so_options(t)) == 0) {
#ifdef INET6
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
				    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
					return (EADDRINUSE);
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
		}
	}
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested: pick an ephemeral one. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);

	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	return (0);
}
995
/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If don't have a local address for this socket yet,
 * then pick one.
 */
int
in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
    struct ucred *cred, struct mbuf *m, bool rehash)
{
	u_short lport, fport;
	in_addr_t laddr, faddr;
	int anonport, error;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	lport = inp->inp_lport;
	laddr = inp->inp_laddr.s_addr;
	anonport = (lport == 0);
	/* Select both endpoints without modifying the pcb yet. */
	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
	    NULL, cred);
	if (error)
		return (error);

	/* Do the initial binding of the local address if required. */
	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
		KASSERT(rehash == true,
		    ("Rehashing required for unbound inps"));
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		if (in_pcbinshash(inp) != 0) {
			/* Roll back the partial binding on failure. */
			inp->inp_laddr.s_addr = INADDR_ANY;
			inp->inp_lport = 0;
			return (EAGAIN);
		}
	}

	/* Commit the remaining changes. */
	inp->inp_lport = lport;
	inp->inp_laddr.s_addr = laddr;
	inp->inp_faddr.s_addr = faddr;
	inp->inp_fport = fport;
	if (rehash) {
		in_pcbrehash_mbuf(inp, m);
	} else {
		in_pcbinshash_mbuf(inp, m);
	}

	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
1049
1050 int
1051 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
1052 {
1053
1054 return (in_pcbconnect_mbuf(inp, nam, cred, NULL, true));
1055 }
1056
1057 /*
1058 * Do proper source address selection on an unbound socket in case
1059 * of connect. Take jails into account as well.
1060 */
1061 int
1062 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
1063 struct ucred *cred)
1064 {
1065 struct ifaddr *ifa;
1066 struct sockaddr *sa;
1067 struct sockaddr_in *sin;
1068 struct route sro;
1069 int error;
1070
1071 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
1072 /*
1073 * Bypass source address selection and use the primary jail IP
1074 * if requested.
1075 */
1076 if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
1077 return (0);
1078
1079 error = 0;
1080 bzero(&sro, sizeof(sro));
1081
/* Build the destination sockaddr in-place inside the route request. */
1082 sin = (struct sockaddr_in *)&sro.ro_dst;
1083 sin->sin_family = AF_INET;
1084 sin->sin_len = sizeof(struct sockaddr_in);
1085 sin->sin_addr.s_addr = faddr->s_addr;
1086
1087 /*
1088 * If route is known our src addr is taken from the i/f,
1089 * else punt.
1090 *
1091 * Find out route to destination.
1092 */
/* SO_DONTROUTE sockets skip the lookup; sro.ro_rt then stays NULL and
 * we fall into the directly-connected-network case below. */
1093 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
1094 in_rtalloc_ign(&sro, 0, inp->inp_inc.inc_fibnum);
1095
1096 /*
1097 * If we found a route, use the address corresponding to
1098 * the outgoing interface.
1099 *
1100 * Otherwise assume faddr is reachable on a directly connected
1101 * network and try to find a corresponding interface to take
1102 * the source address from.
1103 */
/* The epoch protects the ifaddr/ifnet pointers walked below; every exit
 * path from here must go through "done" to leave the epoch. */
1104 NET_EPOCH_ENTER();
1105 if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) {
1106 struct in_ifaddr *ia;
1107 struct ifnet *ifp;
1108
1109 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
1110 inp->inp_socket->so_fibnum));
1111 if (ia == NULL) {
1112 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
1113 inp->inp_socket->so_fibnum));
1114
1115 }
1116 if (ia == NULL) {
1117 error = ENETUNREACH;
1118 goto done;
1119 }
1120
1121 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1122 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1123 goto done;
1124 }
1125
/* Jailed: scan the interface for an AF_INET address the jail may use. */
1126 ifp = ia->ia_ifp;
1127 ia = NULL;
1128 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1129
1130 sa = ifa->ifa_addr;
1131 if (sa->sa_family != AF_INET)
1132 continue;
1133 sin = (struct sockaddr_in *)sa;
1134 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1135 ia = (struct in_ifaddr *)ifa;
1136 break;
1137 }
1138 }
1139 if (ia != NULL) {
1140 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1141 goto done;
1142 }
1143
1144 /* 3. As a last resort return the 'default' jail address. */
1145 error = prison_get_ip4(cred, laddr);
1146 goto done;
1147 }
1148
1149 /*
1150 * If the outgoing interface on the route found is not
1151 * a loopback interface, use the address from that interface.
1152 * In case of jails do those three steps:
1153 * 1. check if the interface address belongs to the jail. If so use it.
1154 * 2. check if we have any address on the outgoing interface
1155 * belonging to this jail. If so use it.
1156 * 3. as a last resort return the 'default' jail address.
1157 */
1158 if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) {
1159 struct in_ifaddr *ia;
1160 struct ifnet *ifp;
1161
1162 /* If not jailed, use the default returned. */
1163 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1164 ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
1165 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1166 goto done;
1167 }
1168
1169 /* Jailed. */
1170 /* 1. Check if the iface address belongs to the jail. */
1171 sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr;
1172 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1173 ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa;
1174 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1175 goto done;
1176 }
1177
1178 /*
1179 * 2. Check if we have any address on the outgoing interface
1180 * belonging to this jail.
1181 */
1182 ia = NULL;
1183 ifp = sro.ro_rt->rt_ifp;
1184 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1185 sa = ifa->ifa_addr;
1186 if (sa->sa_family != AF_INET)
1187 continue;
1188 sin = (struct sockaddr_in *)sa;
1189 if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
1190 ia = (struct in_ifaddr *)ifa;
1191 break;
1192 }
1193 }
1194 if (ia != NULL) {
1195 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1196 goto done;
1197 }
1198
1199 /* 3. As a last resort return the 'default' jail address. */
1200 error = prison_get_ip4(cred, laddr);
1201 goto done;
1202 }
1203
1204 /*
1205 * The outgoing interface is marked with 'loopback net', so a route
1206 * to ourselves is here.
1207 * Try to find the interface of the destination address and then
1208 * take the address from there. That interface is not necessarily
1209 * a loopback interface.
1210 * In case of jails, check that it is an address of the jail
1211 * and if we cannot find, fall back to the 'default' jail address.
1212 */
1213 if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) {
1214 struct sockaddr_in sain;
1215 struct in_ifaddr *ia;
1216
/* A fresh sockaddr is needed: "sin" above aliases sro.ro_dst and may
 * have been repointed at interface addresses by the loops above. */
1217 bzero(&sain, sizeof(struct sockaddr_in));
1218 sain.sin_family = AF_INET;
1219 sain.sin_len = sizeof(struct sockaddr_in);
1220 sain.sin_addr.s_addr = faddr->s_addr;
1221
1222 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain),
1223 inp->inp_socket->so_fibnum));
1224 if (ia == NULL)
1225 ia = ifatoia(ifa_ifwithnet(sintosa(&sain), 0,
1226 inp->inp_socket->so_fibnum));
1227 if (ia == NULL)
1228 ia = ifatoia(ifa_ifwithaddr(sintosa(&sain)));
1229
1230 if (cred == NULL || !prison_flag(cred, PR_IP4)) {
1231 if (ia == NULL) {
1232 error = ENETUNREACH;
1233 goto done;
1234 }
1235 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1236 goto done;
1237 }
1238
1239 /* Jailed. */
1240 if (ia != NULL) {
1241 struct ifnet *ifp;
1242
1243 ifp = ia->ia_ifp;
1244 ia = NULL;
1245 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
1246 sa = ifa->ifa_addr;
1247 if (sa->sa_family != AF_INET)
1248 continue;
1249 sin = (struct sockaddr_in *)sa;
1250 if (prison_check_ip4(cred,
1251 &sin->sin_addr) == 0) {
1252 ia = (struct in_ifaddr *)ifa;
1253 break;
1254 }
1255 }
1256 if (ia != NULL) {
1257 laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
1258 goto done;
1259 }
1260 }
1261
1262 /* 3. As a last resort return the 'default' jail address. */
1263 error = prison_get_ip4(cred, laddr);
1264 goto done;
1265 }
1266
/* Single exit: leave the epoch and drop the route reference (if any)
 * before returning 0 or the error set above. */
1267 done:
1268 NET_EPOCH_EXIT();
1269 if (sro.ro_rt != NULL)
1270 RTFREE(sro.ro_rt);
1271 return (error);
1272 }
1273
1274 /*
1275 * Set up for a connect from a socket to the specified address.
1276 * On entry, *laddrp and *lportp should contain the current local
1277 * address and port for the PCB; these are updated to the values
1278 * that should be placed in inp_laddr and inp_lport to complete
1279 * the connect.
1280 *
1281 * On success, *faddrp and *fportp will be set to the remote address
1282 * and port. These are not updated in the error case.
1283 *
1284 * If the operation fails because the connection already exists,
1285 * *oinpp will be set to the PCB of that connection so that the
1286 * caller can decide to override it. In all other cases, *oinpp
1287 * is set to NULL.
1288 */
1289 int
1290 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
1291 in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
1292 struct inpcb **oinpp, struct ucred *cred)
1293 {
1294 struct rm_priotracker in_ifa_tracker;
1295 struct sockaddr_in *sin = (struct sockaddr_in *)nam;
1296 struct in_ifaddr *ia;
1297 struct inpcb *oinp;
1298 struct in_addr laddr, faddr;
1299 u_short lport, fport;
1300 int error;
1301
1302 /*
1303 * Because a global state change doesn't actually occur here, a read
1304 * lock is sufficient.
1305 */
1306 INP_LOCK_ASSERT(inp);
1307 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);
1308
/* Validate the caller-supplied sockaddr before touching anything. */
1309 if (oinpp != NULL)
1310 *oinpp = NULL;
1311 if (nam->sa_len != sizeof (*sin))
1312 return (EINVAL);
1313 if (sin->sin_family != AF_INET)
1314 return (EAFNOSUPPORT);
1315 if (sin->sin_port == 0)
1316 return (EADDRNOTAVAIL);
1317 laddr.s_addr = *laddrp;
1318 lport = *lportp;
1319 faddr = sin->sin_addr;
1320 fport = sin->sin_port;
1321
1322 if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
1323 /*
1324 * If the destination address is INADDR_ANY,
1325 * use the primary local address.
1326 * If the supplied address is INADDR_BROADCAST,
1327 * and the primary interface supports broadcast,
1328 * choose the broadcast address for that interface.
1329 */
1330 if (faddr.s_addr == INADDR_ANY) {
1331 IN_IFADDR_RLOCK(&in_ifa_tracker);
1332 faddr =
1333 IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
1334 IN_IFADDR_RUNLOCK(&in_ifa_tracker);
/* In a jail the substituted destination must be the jail's own
 * address, not the host's primary one. */
1335 if (cred != NULL &&
1336 (error = prison_get_ip4(cred, &faddr)) != 0)
1337 return (error);
1338 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
1339 IN_IFADDR_RLOCK(&in_ifa_tracker);
1340 if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
1341 IFF_BROADCAST)
1342 faddr = satosin(&CK_STAILQ_FIRST(
1343 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
1344 IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1345 }
1346 }
1347 if (laddr.s_addr == INADDR_ANY) {
1348 error = in_pcbladdr(inp, &faddr, &laddr, cred);
1349 /*
1350 * If the destination address is multicast and an outgoing
1351 * interface has been set as a multicast option, prefer the
1352 * address of that interface as our source address.
1353 */
1354 if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
1355 inp->inp_moptions != NULL) {
1356 struct ip_moptions *imo;
1357 struct ifnet *ifp;
1358
1359 imo = inp->inp_moptions;
1360 if (imo->imo_multicast_ifp != NULL) {
1361 ifp = imo->imo_multicast_ifp;
1362 IN_IFADDR_RLOCK(&in_ifa_tracker);
1363 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1364 if ((ia->ia_ifp == ifp) &&
1365 (cred == NULL ||
1366 prison_check_ip4(cred,
1367 &ia->ia_addr.sin_addr) == 0))
1368 break;
1369 }
/* The multicast path deliberately overrides any error from
 * in_pcbladdr() above: a usable interface address clears it. */
1370 if (ia == NULL)
1371 error = EADDRNOTAVAIL;
1372 else {
1373 laddr = ia->ia_addr.sin_addr;
1374 error = 0;
1375 }
1376 IN_IFADDR_RUNLOCK(&in_ifa_tracker);
1377 }
1378 }
1379 if (error)
1380 return (error);
1381 }
/* With a local port already bound, the 4-tuple must be unique; report
 * the colliding inpcb through *oinpp so callers may override it. */
1382 if (lport != 0) {
1383 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
1384 fport, laddr, lport, 0, NULL);
1385 if (oinp != NULL) {
1386 if (oinpp != NULL)
1387 *oinpp = oinp;
1388 return (EADDRINUSE);
1389 }
1390 } else {
1391 struct sockaddr_in lsin, fsin;
1392
1393 bzero(&lsin, sizeof(lsin));
1394 bzero(&fsin, sizeof(fsin));
1395 lsin.sin_family = AF_INET;
1396 lsin.sin_addr = laddr;
1397 fsin.sin_family = AF_INET;
1398 fsin.sin_addr = faddr;
/* No local port yet: pick an ephemeral one that is free with respect
 * to the chosen local/foreign addresses. */
1399 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
1400 &lport, (struct sockaddr *)& fsin, fport, cred,
1401 INPLOOKUP_WILDCARD);
1402 if (error)
1403 return (error);
1404 }
/* Success: publish the resolved 4-tuple to the caller. */
1405 *laddrp = laddr.s_addr;
1406 *lportp = lport;
1407 *faddrp = faddr.s_addr;
1408 *fportp = fport;
1409 return (0);
1410 }
1411
/*
 * Disconnect a connected socket: clear the foreign address/port and move
 * the inpcb to the hash chain matching its now-wildcard foreign tuple.
 * Requires the inpcb write lock and the pcbinfo hash write lock.
 */
1412 void
1413 in_pcbdisconnect(struct inpcb *inp)
1414 {
1415
1416 INP_WLOCK_ASSERT(inp);
1417 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
1418
1419 inp->inp_faddr.s_addr = INADDR_ANY;
1420 inp->inp_fport = 0;
1421 in_pcbrehash(inp);
1422 }
1423 #endif /* INET */
1424
1425 /*
1426 * in_pcbdetach() is responsibe for disassociating a socket from an inpcb.
1427 * For most protocols, this will be invoked immediately prior to calling
1428 * in_pcbfree(). However, with TCP the inpcb may significantly outlive the
1429 * socket, in which case in_pcbfree() is deferred.
1430 */
1431 void
1432 in_pcbdetach(struct inpcb *inp)
1433 {
1434
1435 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
1436
/* Release any send tag before the socket back-pointer goes away, since
 * the rate-limit state hangs off the inpcb/socket pairing. */
1437 #ifdef RATELIMIT
1438 if (inp->inp_snd_tag != NULL)
1439 in_pcbdetach_txrtlmt(inp);
1440 #endif
/* Sever both directions of the socket <-> inpcb linkage. */
1441 inp->inp_socket->so_pcb = NULL;
1442 inp->inp_socket = NULL;
1443 }
1444
1445 /*
1446 * in_pcbref() bumps the reference count on an inpcb in order to maintain
1447 * stability of an inpcb pointer despite the inpcb lock being released. This
1448 * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
1449 * but where the inpcb lock may already held, or when acquiring a reference
1450 * via a pcbgroup.
1451 *
1452 * in_pcbref() should be used only to provide brief memory stability, and
1453 * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
1454 * garbage collect the inpcb if it has been in_pcbfree()'d from another
1455 * context. Until in_pcbrele() has returned that the inpcb is still valid,
1456 * lock and rele are the *only* safe operations that may be performed on the
1457 * inpcb.
1458 *
1459 * While the inpcb will not be freed, releasing the inpcb lock means that the
1460 * connection's state may change, so the caller should be careful to
1461 * revalidate any cached state on reacquiring the lock. Drop the reference
1462 * using in_pcbrele().
1463 */
1464 void
1465 in_pcbref(struct inpcb *inp)
1466 {
1467
/* A zero refcount would mean the inpcb is already being destroyed;
 * taking a new reference then would be a use-after-free. */
1468 KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1469
1470 refcount_acquire(&inp->inp_refcount);
1471 }
1472
1473 /*
1474 * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
1475 * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
1476 * return a flag indicating whether or not the inpcb remains valid. If it is
1477 * valid, we return with the inpcb lock held.
1478 *
1479 * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
1480 * reference on an inpcb. Historically more work was done here (actually, in
1481 * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
1482 * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely
1483 * about memory stability (and continued use of the write lock).
1484 */
1485 int
1486 in_pcbrele_rlocked(struct inpcb *inp)
1487 {
1488 struct inpcbinfo *pcbinfo;
1489
1490 KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1491
1492 INP_RLOCK_ASSERT(inp);
1493
/* Not the last reference: report "freed" anyway if in_pcbfree() already
 * ran (returning 1 tells the caller the lock was dropped here). */
1494 if (refcount_release(&inp->inp_refcount) == 0) {
1495 /*
1496 * If the inpcb has been freed, let the caller know, even if
1497 * this isn't the last reference.
1498 */
1499 if (inp->inp_flags2 & INP_FREED) {
1500 INP_RUNLOCK(inp);
1501 return (1);
1502 }
1503 return (0);
1504 }
1505
/* Last reference: the socket must have been detached already. */
1506 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1507 #ifdef TCPHPTS
1508 if (inp->inp_in_hpts || inp->inp_in_input) {
1509 struct tcp_hpts_entry *hpts;
1510 /*
1511 * We should not be on the hpts at
1512 * this point in any form. we must
1513 * get the lock to be sure.
1514 */
1515 hpts = tcp_hpts_lock(inp);
1516 if (inp->inp_in_hpts)
1517 panic("Hpts:%p inp:%p at free still on hpts",
1518 hpts, inp);
1519 mtx_unlock(&hpts->p_mtx);
1520 hpts = tcp_input_lock(inp);
1521 if (inp->inp_in_input)
1522 panic("Hpts:%p inp:%p at free still on input hpts",
1523 hpts, inp);
1524 mtx_unlock(&hpts->p_mtx);
1525 }
1526 #endif
/* Reading inp after unlock is safe: we held the last reference, so no
 * other thread can free it before the uma_zfree() below. */
1527 INP_RUNLOCK(inp);
1528 pcbinfo = inp->inp_pcbinfo;
1529 uma_zfree(pcbinfo->ipi_zone, inp);
1530 return (1);
1531 }
1532
/*
 * Write-locked variant of in_pcbrele_rlocked(): drop a reference taken by
 * in_pcbref().  Returns 1 (and drops the lock) if the inpcb was freed or
 * has been marked INP_FREED; returns 0 with the lock still held otherwise.
 */
1533 int
1534 in_pcbrele_wlocked(struct inpcb *inp)
1535 {
1536 struct inpcbinfo *pcbinfo;
1537
1538 KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
1539
1540 INP_WLOCK_ASSERT(inp);
1541
1542 if (refcount_release(&inp->inp_refcount) == 0) {
1543 /*
1544 * If the inpcb has been freed, let the caller know, even if
1545 * this isn't the last reference.
1546 */
1547 if (inp->inp_flags2 & INP_FREED) {
1548 INP_WUNLOCK(inp);
1549 return (1);
1550 }
1551 return (0);
1552 }
1553
1554 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1555 #ifdef TCPHPTS
1556 if (inp->inp_in_hpts || inp->inp_in_input) {
1557 struct tcp_hpts_entry *hpts;
1558 /*
1559 * We should not be on the hpts at
1560 * this point in any form. we must
1561 * get the lock to be sure.
1562 */
1563 hpts = tcp_hpts_lock(inp);
1564 if (inp->inp_in_hpts)
1565 panic("Hpts:%p inp:%p at free still on hpts",
1566 hpts, inp);
1567 mtx_unlock(&hpts->p_mtx);
1568 hpts = tcp_input_lock(inp);
1569 if (inp->inp_in_input)
1570 panic("Hpts:%p inp:%p at free still on input hpts",
1571 hpts, inp);
1572 mtx_unlock(&hpts->p_mtx);
1573 }
1574 #endif
/* Last reference dropped: unlock, then return the inpcb to its zone. */
1575 INP_WUNLOCK(inp);
1576 pcbinfo = inp->inp_pcbinfo;
1577 uma_zfree(pcbinfo->ipi_zone, inp);
1578 return (1);
1579 }
1580
1581 /*
1582 * Temporary wrapper.
1583 */
int
in_pcbrele(struct inpcb *inp)
{
	int freed;

	/* Historical entry point; forwards to the write-locked variant. */
	freed = in_pcbrele_wlocked(inp);
	return (freed);
}
1590
/*
 * Epoch callback: drop the references held by a snapshot list of inpcbs
 * (struct in_pcblist) and free the list itself.
 */
1591 void
1592 in_pcblist_rele_rlocked(epoch_context_t ctx)
1593 {
1594 struct in_pcblist *il;
1595 struct inpcb *inp;
1596 struct inpcbinfo *pcbinfo;
1597 int i, n;
1598
1599 il = __containerof(ctx, struct in_pcblist, il_epoch_ctx);
1600 pcbinfo = il->il_pcbinfo;
1601 n = il->il_count;
1602 INP_INFO_WLOCK(pcbinfo);
1603 for (i = 0; i < n; i++) {
1604 inp = il->il_inp_list[i];
1605 INP_RLOCK(inp);
/* in_pcbrele_rlocked() drops the lock itself when it returns nonzero;
 * only unlock here when the inpcb survived the release. */
1606 if (!in_pcbrele_rlocked(inp))
1607 INP_RUNLOCK(inp);
1608 }
1609 INP_INFO_WUNLOCK(pcbinfo);
1610 free(il, M_TEMP);
1611 }
1612
1613 static void
1614 inpcbport_free(epoch_context_t ctx)
1615 {
1616 struct inpcbport *phd;
1617
1618 phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
1619 free(phd, M_PCB);
1620 }
1621
/*
 * Epoch-deferred half of in_pcbfree(): tears down per-pcb state (options,
 * policies, multicast memberships, credential) and drops the reference that
 * in_pcbfree() left behind.  Runs after all epoch readers have drained.
 */
1622 static void
1623 in_pcbfree_deferred(epoch_context_t ctx)
1624 {
1625 struct inpcb *inp;
1626 int released __unused;
1627
1628 inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);
1629
1630 INP_WLOCK(inp);
1631 CURVNET_SET(inp->inp_vnet);
/* Detach the multicast options under the lock but free them only after
 * the final release below, to respect lock ordering. */
1632 #ifdef INET
1633 struct ip_moptions *imo = inp->inp_moptions;
1634 inp->inp_moptions = NULL;
1635 #endif
1636 /* XXXRW: Do as much as possible here. */
1637 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
1638 if (inp->inp_sp != NULL)
1639 ipsec_delete_pcbpolicy(inp);
1640 #endif
1641 #ifdef INET6
1642 struct ip6_moptions *im6o = NULL;
1643 if (inp->inp_vflag & INP_IPV6PROTO) {
1644 ip6_freepcbopts(inp->in6p_outputopts);
1645 im6o = inp->in6p_moptions;
1646 inp->in6p_moptions = NULL;
1647 }
1648 #endif
1649 if (inp->inp_options)
1650 (void)m_free(inp->inp_options);
1651 inp->inp_vflag = 0;
1652 crfree(inp->inp_cred);
1653 #ifdef MAC
1654 mac_inpcb_destroy(inp);
1655 #endif
/* This must be the last reference; in_pcbrele_wlocked() unlocks and
 * frees the inpcb, so inp is not touched afterwards. */
1656 released = in_pcbrele_wlocked(inp);
1657 MPASS(released);
1658 #ifdef INET6
1659 ip6_freemoptions(im6o);
1660 #endif
1661 #ifdef INET
1662 inp_freemoptions(imo);
1663 #endif
1664 CURVNET_RESTORE();
1665 }
1666
1667 /*
1668 * Unconditionally schedule an inpcb to be freed by decrementing its
1669 * reference count, which should occur only after the inpcb has been detached
1670 * from its socket. If another thread holds a temporary reference (acquired
1671 * using in_pcbref()) then the free is deferred until that reference is
1672 * released using in_pcbrele(), but the inpcb is still unlocked. Almost all
1673 * work, including removal from global lists, is done in this context, where
1674 * the pcbinfo lock is held.
1675 */
1676 void
1677 in_pcbfree(struct inpcb *inp)
1678 {
1679 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
1680
1681 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
1682 KASSERT((inp->inp_flags2 & INP_FREED) == 0,
1683 ("%s: called twice for pcb %p", __func__, inp));
/* Defensive duplicate of the KASSERT above: on non-INVARIANTS kernels a
 * double free degrades to a harmless early return instead of corruption. */
1684 if (inp->inp_flags2 & INP_FREED) {
1685 INP_WUNLOCK(inp);
1686 return;
1687 }
1688
1689 #ifdef INVARIANTS
1690 if (pcbinfo == &V_tcbinfo) {
1691 INP_INFO_LOCK_ASSERT(pcbinfo);
1692 } else {
1693 INP_INFO_WLOCK_ASSERT(pcbinfo);
1694 }
1695 #endif
1696 INP_WLOCK_ASSERT(inp);
/* Remove from the global lists now; per-pcb state is destroyed later in
 * in_pcbfree_deferred() once epoch readers have drained. */
1697 INP_LIST_WLOCK(pcbinfo);
1698 in_pcbremlists(inp);
1699 INP_LIST_WUNLOCK(pcbinfo);
1700 RO_INVALIDATE_CACHE(&inp->inp_route);
1701 /* mark as destruction in progress */
1702 inp->inp_flags2 |= INP_FREED;
1703 INP_WUNLOCK(inp);
1704 epoch_call(net_epoch_preempt, &inp->inp_epoch_ctx, in_pcbfree_deferred);
1705 }
1706
1707 /*
1708 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
1709 * port reservation, and preventing it from being returned by inpcb lookups.
1710 *
1711 * It is used by TCP to mark an inpcb as unused and avoid future packet
1712 * delivery or event notification when a socket remains open but TCP has
1713 * closed. This might occur as a result of a shutdown()-initiated TCP close
1714 * or a RST on the wire, and allows the port binding to be reused while still
1715 * maintaining the invariant that so_pcb always points to a valid inpcb until
1716 * in_pcbdetach().
1717 *
1718 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
1719 * in_pcbnotifyall() and in_pcbpurgeif0()?
1720 */
1721 void
1722 in_pcbdrop(struct inpcb *inp)
1723 {
1724
1725 INP_WLOCK_ASSERT(inp);
1726 #ifdef INVARIANTS
1727 if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
1728 MPASS(inp->inp_refcount > 1);
1729 #endif
1730
1731 /*
1732 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
1733 * the hash lock...?
1734 */
1735 inp->inp_flags |= INP_DROPPED;
/* If hashed, unhook from the lb-group, connection-hash and port-hash
 * chains so lookups can no longer return this inpcb. */
1736 if (inp->inp_flags & INP_INHASHLIST) {
1737 struct inpcbport *phd = inp->inp_phd;
1738
1739 INP_HASH_WLOCK(inp->inp_pcbinfo);
1740 in_pcbremlbgrouphash(inp);
1741 CK_LIST_REMOVE(inp, inp_hash);
1742 CK_LIST_REMOVE(inp, inp_portlist);
/* Last pcb on this port head: retire the head via an epoch call so
 * concurrent lockless readers can finish walking it. */
1743 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
1744 CK_LIST_REMOVE(phd, phd_hash);
1745 epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free);
1746 }
1747 INP_HASH_WUNLOCK(inp->inp_pcbinfo);
1748 inp->inp_flags &= ~INP_INHASHLIST;
1749 #ifdef PCBGROUP
1750 in_pcbgroup_remove(inp);
1751 #endif
1752 }
1753 }
1754
1755 #ifdef INET
1756 /*
1757 * Common routines to return the socket addresses associated with inpcbs.
1758 */
1759 struct sockaddr *
1760 in_sockaddr(in_port_t port, struct in_addr *addr_p)
1761 {
1762 struct sockaddr_in *sin;
1763
1764 sin = malloc(sizeof *sin, M_SONAME,
1765 M_WAITOK | M_ZERO);
1766 sin->sin_family = AF_INET;
1767 sin->sin_len = sizeof(*sin);
1768 sin->sin_addr = *addr_p;
1769 sin->sin_port = port;
1770
1771 return (struct sockaddr *)sin;
1772 }
1773
1774 int
1775 in_getsockaddr(struct socket *so, struct sockaddr **nam)
1776 {
1777 struct inpcb *inp;
1778 struct in_addr addr;
1779 in_port_t port;
1780
1781 inp = sotoinpcb(so);
1782 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));
1783
1784 INP_RLOCK(inp);
1785 port = inp->inp_lport;
1786 addr = inp->inp_laddr;
1787 INP_RUNLOCK(inp);
1788
1789 *nam = in_sockaddr(port, &addr);
1790 return 0;
1791 }
1792
1793 int
1794 in_getpeeraddr(struct socket *so, struct sockaddr **nam)
1795 {
1796 struct inpcb *inp;
1797 struct in_addr addr;
1798 in_port_t port;
1799
1800 inp = sotoinpcb(so);
1801 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));
1802
1803 INP_RLOCK(inp);
1804 port = inp->inp_fport;
1805 addr = inp->inp_faddr;
1806 INP_RUNLOCK(inp);
1807
1808 *nam = in_sockaddr(port, &addr);
1809 return 0;
1810 }
1811
/*
 * Invoke "notify" on every IPv4 inpcb in "pcbinfo" whose foreign address
 * equals "faddr" (e.g. to propagate an ICMP error to matching connections).
 */
1812 void
1813 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
1814 struct inpcb *(*notify)(struct inpcb *, int))
1815 {
1816 struct inpcb *inp, *inp_temp;
1817
1818 INP_INFO_WLOCK(pcbinfo);
1819 CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
1820 INP_WLOCK(inp);
1821 #ifdef INET6
1822 if ((inp->inp_vflag & INP_IPV4) == 0) {
1823 INP_WUNLOCK(inp);
1824 continue;
1825 }
1826 #endif
1827 if (inp->inp_faddr.s_addr != faddr.s_addr ||
1828 inp->inp_socket == NULL) {
1829 INP_WUNLOCK(inp);
1830 continue;
1831 }
/* Only unlock when the callback returns non-NULL; a NULL return means
 * the callback itself disposed of (and unlocked) the inpcb. */
1832 if ((*notify)(inp, errno))
1833 INP_WUNLOCK(inp);
1834 }
1835 INP_INFO_WUNLOCK(pcbinfo);
1836 }
1837
/*
 * Detach an interface from all inpcbs in "pcbinfo": clear it as the
 * selected multicast output interface and drop any multicast group
 * memberships joined through it.
 */
1838 void
1839 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
1840 {
1841 struct inpcb *inp;
1842 struct in_multi *inm;
1843 struct in_mfilter *imf;
1844 struct ip_moptions *imo;
1845
1846 INP_INFO_WLOCK(pcbinfo);
1847 CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
1848 INP_WLOCK(inp);
1849 imo = inp->inp_moptions;
1850 if ((inp->inp_vflag & INP_IPV4) &&
1851 imo != NULL) {
1852 /*
1853 * Unselect the outgoing interface if it is being
1854 * detached.
1855 */
1856 if (imo->imo_multicast_ifp == ifp)
1857 imo->imo_multicast_ifp = NULL;
1858
1859 /*
1860 * Drop multicast group membership if we joined
1861 * through the interface being detached.
1862 *
1863 * XXX This can all be deferred to an epoch_call
1864 */
/* Removing an entry invalidates the iterator, so restart the scan from
 * the head after each removal until no entry matches the interface. */
1865 restart:
1866 IP_MFILTER_FOREACH(imf, &imo->imo_head) {
1867 if ((inm = imf->imf_inm) == NULL)
1868 continue;
1869 if (inm->inm_ifp != ifp)
1870 continue;
1871 ip_mfilter_remove(&imo->imo_head, imf);
1872 IN_MULTI_LOCK_ASSERT();
1873 in_leavegroup_locked(inm, NULL);
1874 ip_mfilter_free(imf);
1875 goto restart;
1876 }
1877 }
1878 INP_WUNLOCK(inp);
1879 }
1880 INP_INFO_WUNLOCK(pcbinfo);
1881 }
1882
1883 /*
1884 * Lookup a PCB based on the local address and port. Caller must hold the
1885 * hash lock. No inpcb locks or references are acquired.
1886 */
1887 #define INP_LOOKUP_MAPPED_PCB_COST 3
1888 struct inpcb *
1889 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
1890 u_short lport, int lookupflags, struct ucred *cred)
1891 {
1892 struct inpcb *inp;
1893 #ifdef INET6
/* Worst-case wildcard score: up to 3 wildcard mismatches plus the
 * penalty applied to IPv6-capable (mapped-address) PCBs. */
1894 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
1895 #else
1896 int matchwild = 3;
1897 #endif
1898 int wildcard;
1899
1900 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
1901 ("%s: invalid lookup flags %d", __func__, lookupflags));
1902
1903 INP_HASH_LOCK_ASSERT(pcbinfo);
1904
1905 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
1906 struct inpcbhead *head;
1907 /*
1908 * Look for an unconnected (wildcard foreign addr) PCB that
1909 * matches the local address and port we're looking for.
1910 */
1911 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
1912 0, pcbinfo->ipi_hashmask)];
1913 CK_LIST_FOREACH(inp, head, inp_hash) {
1914 #ifdef INET6
1915 /* XXX inp locking */
1916 if ((inp->inp_vflag & INP_IPV4) == 0)
1917 continue;
1918 #endif
1919 if (inp->inp_faddr.s_addr == INADDR_ANY &&
1920 inp->inp_laddr.s_addr == laddr.s_addr &&
1921 inp->inp_lport == lport) {
1922 /*
1923 * Found?
1924 */
/* Only return a match visible to the caller's jail (or any
 * match when no credential was supplied). */
1925 if (cred == NULL ||
1926 prison_equal_ip4(cred->cr_prison,
1927 inp->inp_cred->cr_prison))
1928 return (inp);
1929 }
1930 }
1931 /*
1932 * Not found.
1933 */
1934 return (NULL);
1935 } else {
1936 struct inpcbporthead *porthash;
1937 struct inpcbport *phd;
1938 struct inpcb *match = NULL;
1939 /*
1940 * Best fit PCB lookup.
1941 *
1942 * First see if this local port is in use by looking on the
1943 * port hash list.
1944 */
1945 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
1946 pcbinfo->ipi_porthashmask)];
1947 CK_LIST_FOREACH(phd, porthash, phd_hash) {
1948 if (phd->phd_port == lport)
1949 break;
1950 }
1951 if (phd != NULL) {
1952 /*
1953 * Port is in use by one or more PCBs. Look for best
1954 * fit.
1955 */
/* Score each candidate: lower "wildcard" is a more specific
 * match; keep the lowest-scoring PCB seen so far. */
1956 CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
1957 wildcard = 0;
1958 if (cred != NULL &&
1959 !prison_equal_ip4(inp->inp_cred->cr_prison,
1960 cred->cr_prison))
1961 continue;
1962 #ifdef INET6
1963 /* XXX inp locking */
1964 if ((inp->inp_vflag & INP_IPV4) == 0)
1965 continue;
1966 /*
1967 * We never select the PCB that has
1968 * INP_IPV6 flag and is bound to :: if
1969 * we have another PCB which is bound
1970 * to 0.0.0.0. If a PCB has the
1971 * INP_IPV6 flag, then we set its cost
1972 * higher than IPv4 only PCBs.
1973 *
1974 * Note that the case only happens
1975 * when a socket is bound to ::, under
1976 * the condition that the use of the
1977 * mapped address is allowed.
1978 */
1979 if ((inp->inp_vflag & INP_IPV6) != 0)
1980 wildcard += INP_LOOKUP_MAPPED_PCB_COST;
1981 #endif
1982 if (inp->inp_faddr.s_addr != INADDR_ANY)
1983 wildcard++;
1984 if (inp->inp_laddr.s_addr != INADDR_ANY) {
1985 if (laddr.s_addr == INADDR_ANY)
1986 wildcard++;
1987 else if (inp->inp_laddr.s_addr != laddr.s_addr)
1988 continue;
1989 } else {
1990 if (laddr.s_addr != INADDR_ANY)
1991 wildcard++;
1992 }
1993 if (wildcard < matchwild) {
1994 match = inp;
1995 matchwild = wildcard;
/* A perfect (zero-wildcard) match cannot be beaten. */
1996 if (matchwild == 0)
1997 break;
1998 }
1999 }
2000 }
2001 return (match);
2002 }
2003 }
2004 #undef INP_LOOKUP_MAPPED_PCB_COST
2005
/*
 * Select an inpcb from a load-balance (SO_REUSEPORT_LB) group matching the
 * given local port.  A packet-hash over (faddr, lport, fport) modulo the
 * group size picks a member deterministically, spreading connections across
 * the group's sockets.
 */
2006 static struct inpcb *
2007 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
2008 const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
2009 uint16_t fport, int lookupflags)
2010 {
2011 struct inpcb *local_wild;
2012 const struct inpcblbgrouphead *hdr;
2013 struct inpcblbgroup *grp;
2014 uint32_t idx;
2015
2016 INP_HASH_LOCK_ASSERT(pcbinfo);
2017
2018 hdr = &pcbinfo->ipi_lbgrouphashbase[
2019 INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
2020
2021 /*
2022 * Order of socket selection:
2023 * 1. non-wild.
2024 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
2025 *
2026 * NOTE:
2027 * - Load balanced group does not contain jailed sockets
2028 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
2029 */
2030 local_wild = NULL;
2031 CK_LIST_FOREACH(grp, hdr, il_list) {
2032 #ifdef INET6
2033 if (!(grp->il_vflag & INP_IPV4))
2034 continue;
2035 #endif
2036 if (grp->il_lport != lport)
2037 continue;
2038
/* Same flow always hashes to the same member index. */
2039 idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
2040 grp->il_inpcnt;
2041 if (grp->il_laddr.s_addr == laddr->s_addr)
2042 return (grp->il_inp[idx]);
/* Remember a wildcard-bound group as fallback but keep scanning for an
 * exact local-address match. */
2043 if (grp->il_laddr.s_addr == INADDR_ANY &&
2044 (lookupflags & INPLOOKUP_WILDCARD) != 0)
2045 local_wild = grp->il_inp[idx];
2046 }
2047 return (local_wild);
2048 }
2049
2050 #ifdef PCBGROUP
2051 /*
2052 * Lookup PCB in hash list, using pcbgroup tables.
2053 */
2054 static struct inpcb *
2055 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup,
2056 struct in_addr faddr, u_int fport_arg, struct in_addr laddr,
2057 u_int lport_arg, int lookupflags, struct ifnet *ifp)
2058 {
2059 struct inpcbhead *head;
2060 struct inpcb *inp, *tmpinp;
2061 u_short fport = fport_arg, lport = lport_arg;
2062 bool locked;
2063
2064 /*
2065 * First look for an exact match.
2066 */
2067 tmpinp = NULL;
2068 INP_GROUP_LOCK(pcbgroup);
2069 head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
2070 pcbgroup->ipg_hashmask)];
2071 CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
2072 #ifdef INET6
2073 /* XXX inp locking */
2074 if ((inp->inp_vflag & INP_IPV4) == 0)
2075 continue;
2076 #endif
2077 if (inp->inp_faddr.s_addr == faddr.s_addr &&
2078 inp->inp_laddr.s_addr == laddr.s_addr &&
2079 inp->inp_fport == fport &&
2080 inp->inp_lport == lport) {
2081 /*
2082 * XXX We should be able to directly return
2083 * the inp here, without any checks.
2084 * Well unless both bound with SO_REUSEPORT?
2085 */
2086 if (prison_flag(inp->inp_cred, PR_IP4))
2087 goto found;
2088 if (tmpinp == NULL)
2089 tmpinp = inp;
2090 }
2091 }
2092 if (tmpinp != NULL) {
2093 inp = tmpinp;
2094 goto found;
2095 }
2096
2097 #ifdef RSS
2098 /*
2099 * For incoming connections, we may wish to do a wildcard
2100 * match for an RSS-local socket.
2101 */
2102 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2103 struct inpcb *local_wild = NULL, *local_exact = NULL;
2104 #ifdef INET6
2105 struct inpcb *local_wild_mapped = NULL;
2106 #endif
2107 struct inpcb *jail_wild = NULL;
2108 struct inpcbhead *head;
2109 int injail;
2110
2111 /*
2112 * Order of socket selection - we always prefer jails.
2113 * 1. jailed, non-wild.
2114 * 2. jailed, wild.
2115 * 3. non-jailed, non-wild.
2116 * 4. non-jailed, wild.
2117 */
2118
2119 head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY,
2120 lport, 0, pcbgroup->ipg_hashmask)];
2121 CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) {
2122 #ifdef INET6
2123 /* XXX inp locking */
2124 if ((inp->inp_vflag & INP_IPV4) == 0)
2125 continue;
2126 #endif
2127 if (inp->inp_faddr.s_addr != INADDR_ANY ||
2128 inp->inp_lport != lport)
2129 continue;
2130
2131 injail = prison_flag(inp->inp_cred, PR_IP4);
2132 if (injail) {
2133 if (prison_check_ip4(inp->inp_cred,
2134 &laddr) != 0)
2135 continue;
2136 } else {
2137 if (local_exact != NULL)
2138 continue;
2139 }
2140
2141 if (inp->inp_laddr.s_addr == laddr.s_addr) {
2142 if (injail)
2143 goto found;
2144 else
2145 local_exact = inp;
2146 } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2147 #ifdef INET6
2148 /* XXX inp locking, NULL check */
2149 if (inp->inp_vflag & INP_IPV6PROTO)
2150 local_wild_mapped = inp;
2151 else
2152 #endif
2153 if (injail)
2154 jail_wild = inp;
2155 else
2156 local_wild = inp;
2157 }
2158 } /* LIST_FOREACH */
2159
2160 inp = jail_wild;
2161 if (inp == NULL)
2162 inp = local_exact;
2163 if (inp == NULL)
2164 inp = local_wild;
2165 #ifdef INET6
2166 if (inp == NULL)
2167 inp = local_wild_mapped;
2168 #endif
2169 if (inp != NULL)
2170 goto found;
2171 }
2172 #endif
2173
2174 /*
2175 * Then look for a wildcard match, if requested.
2176 */
2177 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
2178 struct inpcb *local_wild = NULL, *local_exact = NULL;
2179 #ifdef INET6
2180 struct inpcb *local_wild_mapped = NULL;
2181 #endif
2182 struct inpcb *jail_wild = NULL;
2183 struct inpcbhead *head;
2184 int injail;
2185
2186 /*
2187 * Order of socket selection - we always prefer jails.
2188 * 1. jailed, non-wild.
2189 * 2. jailed, wild.
2190 * 3. non-jailed, non-wild.
2191 * 4. non-jailed, wild.
2192 */
2193 head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport,
2194 0, pcbinfo->ipi_wildmask)];
2195 CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) {
2196 #ifdef INET6
2197 /* XXX inp locking */
2198 if ((inp->inp_vflag & INP_IPV4) == 0)
2199 continue;
2200 #endif
2201 if (inp->inp_faddr.s_addr != INADDR_ANY ||
2202 inp->inp_lport != lport)
2203 continue;
2204
2205 injail = prison_flag(inp->inp_cred, PR_IP4);
2206 if (injail) {
2207 if (prison_check_ip4(inp->inp_cred,
2208 &laddr) != 0)
2209 continue;
2210 } else {
2211 if (local_exact != NULL)
2212 continue;
2213 }
2214
2215 if (inp->inp_laddr.s_addr == laddr.s_addr) {
2216 if (injail)
2217 goto found;
2218 else
2219 local_exact = inp;
2220 } else if (inp->inp_laddr.s_addr == INADDR_ANY) {
2221 #ifdef INET6
2222 /* XXX inp locking, NULL check */
2223 if (inp->inp_vflag & INP_IPV6PROTO)
2224 local_wild_mapped = inp;
2225 else
2226 #endif
2227 if (injail)
2228 jail_wild = inp;
2229 else
2230 local_wild = inp;
2231 }
2232 } /* LIST_FOREACH */
2233 inp = jail_wild;
2234 if (inp == NULL)
2235 inp = local_exact;
2236 if (inp == NULL)
2237 inp = local_wild;
2238 #ifdef INET6
2239 if (inp == NULL)
2240 inp = local_wild_mapped;
2241 #endif
2242 if (inp != NULL)
2243 goto found;
2244 } /* if (lookupflags & INPLOOKUP_WILDCARD) */
2245 INP_GROUP_UNLOCK(pcbgroup);
2246 return (NULL);
2247
2248 found:
2249 if (lookupflags & INPLOOKUP_WLOCKPCB)
2250 locked = INP_TRY_WLOCK(inp);
2251 else if (lookupflags & INPLOOKUP_RLOCKPCB)
2252 locked = INP_TRY_RLOCK(inp);
2253 else
2254 panic("%s: locking bug", __func__);
2255 if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) {
2256 if (lookupflags & INPLOOKUP_WLOCKPCB)
2257 INP_WUNLOCK(inp);
2258 else
2259 INP_RUNLOCK(inp);
2260 return (NULL);
2261 } else if (!locked)
2262 in_pcbref(inp);
2263 INP_GROUP_UNLOCK(pcbgroup);
2264 if (!locked) {
2265 if (lookupflags & INPLOOKUP_WLOCKPCB) {
2266 INP_WLOCK(inp);
2267 if (in_pcbrele_wlocked(inp))
2268 return (NULL);
2269 } else {
2270 INP_RLOCK(inp);
2271 if (in_pcbrele_rlocked(inp))
2272 return (NULL);
2273 }
2274 }
2275 #ifdef INVARIANTS
2276 if (lookupflags & INPLOOKUP_WLOCKPCB)
2277 INP_WLOCK_ASSERT(inp);
2278 else
2279 INP_RLOCK_ASSERT(inp);
2280 #endif
2281 return (inp);
2282 }
2283 #endif /* PCBGROUP */
2284
2285 /*
2286 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes
2287 * that the caller has locked the hash list, and will not perform any further
2288 * locking or reference operations on either the hash list or the connection.
2289 */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp, *tmpinp;
	u_short fport = fport_arg, lport = lport_arg;

#ifdef INVARIANTS
	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	/*
	 * The caller must either hold the pcbinfo hash lock or be
	 * running inside the network epoch; check for one of the two.
	 */
	if (!mtx_owned(&pcbinfo->ipi_hash_lock))
		MPASS(in_epoch_verbose(net_epoch_preempt, 1));
#endif
	/*
	 * First look for an exact match.
	 */
	tmpinp = NULL;
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
		/* XXX inp locking */
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			/*
			 * XXX We should be able to directly return
			 * the inp here, without any checks.
			 * Well unless both bound with SO_REUSEPORT?
			 */
			/* A jailed exact match wins immediately. */
			if (prison_flag(inp->inp_cred, PR_IP4))
				return (inp);
			/* Remember the first non-jailed exact match. */
			if (tmpinp == NULL)
				tmpinp = inp;
		}
	}
	if (tmpinp != NULL)
		return (tmpinp);

	/*
	 * Then look in lb group (for wildcard match).
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
		    fport, lookupflags);
		if (inp != NULL)
			return (inp);
	}

	/*
	 * Then look for a wildcard match, if requested.
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
		struct inpcb *local_wild_mapped = NULL;
#endif
		struct inpcb *jail_wild = NULL;
		int injail;

		/*
		 * Order of socket selection - we always prefer jails.
		 * 1. jailed, non-wild.
		 * 2. jailed, wild.
		 * 3. non-jailed, non-wild.
		 * 4. non-jailed, wild.
		 */

		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
		    0, pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY ||
			    inp->inp_lport != lport)
				continue;

			injail = prison_flag(inp->inp_cred, PR_IP4);
			if (injail) {
				/* Skip jails that may not own laddr. */
				if (prison_check_ip4(inp->inp_cred,
				    &laddr) != 0)
					continue;
			} else {
				/*
				 * A non-jailed exact match already exists;
				 * no non-jailed socket can beat it.
				 */
				if (local_exact != NULL)
					continue;
			}

			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				/* Jailed, non-wild: best possible match. */
				if (injail)
					return (inp);
				else
					local_exact = inp;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
				/* XXX inp locking, NULL check */
				if (inp->inp_vflag & INP_IPV6PROTO)
					local_wild_mapped = inp;
				else
#endif
					if (injail)
						jail_wild = inp;
					else
						local_wild = inp;
			}
		} /* LIST_FOREACH */
		/* Return remaining candidates in preference order. */
		if (jail_wild != NULL)
			return (jail_wild);
		if (local_exact != NULL)
			return (local_exact);
		if (local_wild != NULL)
			return (local_wild);
#ifdef INET6
		if (local_wild_mapped != NULL)
			return (local_wild_mapped);
#endif
	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */

	return (NULL);
}
2418
2419 /*
2420 * Lookup PCB in hash list, using pcbinfo tables. This variation locks the
2421 * hash list lock, and will return the inpcb locked (i.e., requires
2422 * INPLOOKUP_LOCKPCB).
2423 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    struct ifnet *ifp)
{
	struct inpcb *inp;

	INP_HASH_RLOCK(pcbinfo);
	/*
	 * The pcb-lock flags are stripped for the lookup itself; the
	 * requested pcb lock is acquired below once a match is found.
	 */
	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
	if (inp != NULL) {
		if (lookupflags & INPLOOKUP_WLOCKPCB) {
			INP_WLOCK(inp);
			/* The pcb may have been freed while we waited. */
			if (__predict_false(inp->inp_flags2 & INP_FREED)) {
				INP_WUNLOCK(inp);
				inp = NULL;
			}
		} else if (lookupflags & INPLOOKUP_RLOCKPCB) {
			INP_RLOCK(inp);
			/* Likewise for the read-lock variant. */
			if (__predict_false(inp->inp_flags2 & INP_FREED)) {
				INP_RUNLOCK(inp);
				inp = NULL;
			}
		} else
			panic("%s: locking bug", __func__);
#ifdef INVARIANTS
		if (inp != NULL) {
			if (lookupflags & INPLOOKUP_WLOCKPCB)
				INP_WLOCK_ASSERT(inp);
			else
				INP_RLOCK_ASSERT(inp);
		}
#endif
	}
	INP_HASH_RUNLOCK(pcbinfo);
	return (inp);
}
2461
2462 /*
2463 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
2464 * from which a pre-calculated hash value may be extracted.
2465 *
2466 * Possibly more of this logic should be in in_pcbgroup.c.
2467 */
struct inpcb *
in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
    struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
{
#if defined(PCBGROUP) && !defined(RSS)
	struct inpcbgroup *pcbgroup;
#endif

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	/* Callers must request the pcb locked one way or the other. */
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	/*
	 * When not using RSS, use connection groups in preference to the
	 * reservation table when looking up 4-tuples.  When using RSS, just
	 * use the reservation table, due to the cost of the Toeplitz hash
	 * in software.
	 *
	 * XXXRW: This policy belongs in the pcbgroup code, as in principle
	 * we could be doing RSS with a non-Toeplitz hash that is affordable
	 * in software.
	 */
#if defined(PCBGROUP) && !defined(RSS)
	if (in_pcbgroup_enabled(pcbinfo)) {
		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
		    fport);
		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
		    laddr, lport, lookupflags, ifp));
	}
#endif
	/* Fall back to the global hash table lookup. */
	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, ifp));
}
2502
struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    struct ifnet *ifp, struct mbuf *m)
{
#ifdef PCBGROUP
	struct inpcbgroup *pcbgroup;
#endif

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	/* Callers must request the pcb locked one way or the other. */
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

#ifdef PCBGROUP
	/*
	 * If we can use a hardware-generated hash to look up the connection
	 * group, use that connection group to find the inpcb.  Otherwise
	 * fall back on a software hash -- or the reservation table if we're
	 * using RSS.
	 *
	 * XXXRW: As above, that policy belongs in the pcbgroup code.
	 */
	if (in_pcbgroup_enabled(pcbinfo) &&
	    !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) {
		/* Prefer the hardware flowid carried by the mbuf. */
		pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m),
		    m->m_pkthdr.flowid);
		if (pcbgroup != NULL)
			return (in_pcblookup_group(pcbinfo, pcbgroup, faddr,
			    fport, laddr, lport, lookupflags, ifp));
#ifndef RSS
		/* No group for the flowid: hash the 4-tuple in software. */
		pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr,
		    fport);
		return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport,
		    laddr, lport, lookupflags, ifp));
#endif
	}
#endif
	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, ifp));
}
2544 #endif /* INET */
2545
2546 /*
2547 * Insert PCB onto various hash lists.
2548 */
2549 static int
2550 in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m)
2551 {
2552 struct inpcbhead *pcbhash;
2553 struct inpcbporthead *pcbporthash;
2554 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
2555 struct inpcbport *phd;
2556 u_int32_t hashkey_faddr;
2557 int so_options;
2558
2559 INP_WLOCK_ASSERT(inp);
2560 INP_HASH_WLOCK_ASSERT(pcbinfo);
2561
2562 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
2563 ("in_pcbinshash: INP_INHASHLIST"));
2564
2565 #ifdef INET6
2566 if (inp->inp_vflag & INP_IPV6)
2567 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
2568 else
2569 #endif
2570 hashkey_faddr = inp->inp_faddr.s_addr;
2571
2572 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
2573 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
2574
2575 pcbporthash = &pcbinfo->ipi_porthashbase[
2576 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
2577
2578 /*
2579 * Add entry to load balance group.
2580 * Only do this if SO_REUSEPORT_LB is set.
2581 */
2582 so_options = inp_so_options(inp);
2583 if (so_options & SO_REUSEPORT_LB) {
2584 int ret = in_pcbinslbgrouphash(inp);
2585 if (ret) {
2586 /* pcb lb group malloc fail (ret=ENOBUFS). */
2587 return (ret);
2588 }
2589 }
2590
2591 /*
2592 * Go through port list and look for a head for this lport.
2593 */
2594 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
2595 if (phd->phd_port == inp->inp_lport)
2596 break;
2597 }
2598 /*
2599 * If none exists, malloc one and tack it on.
2600 */
2601 if (phd == NULL) {
2602 phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT);
2603 if (phd == NULL) {
2604 return (ENOBUFS); /* XXX */
2605 }
2606 bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context));
2607 phd->phd_port = inp->inp_lport;
2608 CK_LIST_INIT(&phd->phd_pcblist);
2609 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
2610 }
2611 inp->inp_phd = phd;
2612 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
2613 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
2614 inp->inp_flags |= INP_INHASHLIST;
2615 #ifdef PCBGROUP
2616 if (m != NULL) {
2617 in_pcbgroup_update_mbuf(inp, m);
2618 } else {
2619 in_pcbgroup_update(inp);
2620 }
2621 #endif
2622 return (0);
2623 }
2624
/*
 * Insert PCB onto the hash lists without mbuf flow information.
 */
int
in_pcbinshash(struct inpcb *inp)
{

	return (in_pcbinshash_internal(inp, NULL));
}

/*
 * Insert PCB onto the hash lists; "m" supplies flow information used
 * for pcbgroup placement when PCBGROUP is configured.
 */
int
in_pcbinshash_mbuf(struct inpcb *inp, struct mbuf *m)
{

	return (in_pcbinshash_internal(inp, m));
}
2638
2639 /*
2640 * Move PCB to the proper hash bucket when { faddr, fport } have been
2641 * changed. NOTE: This does not handle the case of the lport changing (the
2642 * hashed port list would have to be updated as well), so the lport must
2643 * not change after in_pcbinshash() has been called.
2644 */
void
in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	u_int32_t hashkey_faddr;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("in_pcbrehash: !INP_INHASHLIST"));

	/* Key the hash on the IPv6 or IPv4 foreign address as appropriate. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6)
		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
	else
#endif
		hashkey_faddr = inp->inp_faddr.s_addr;

	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
	    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];

	/* Move the pcb from its current hash chain to the new one. */
	CK_LIST_REMOVE(inp, inp_hash);
	CK_LIST_INSERT_HEAD(head, inp, inp_hash);

#ifdef PCBGROUP
	if (m != NULL)
		in_pcbgroup_update_mbuf(inp, m);
	else
		in_pcbgroup_update(inp);
#endif
}

/*
 * Rehash without mbuf flow information.
 */
void
in_pcbrehash(struct inpcb *inp)
{

	in_pcbrehash_mbuf(inp, NULL);
}
2685
2686 /*
2687 * Remove PCB from various lists.
2688 */
static void
in_pcbremlists(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;

#ifdef INVARIANTS
	/* TCP tolerates a read lock on ipi_info; other protocols do not. */
	if (pcbinfo == &V_tcbinfo) {
		INP_INFO_RLOCK_ASSERT(pcbinfo);
	} else {
		INP_INFO_WLOCK_ASSERT(pcbinfo);
	}
#endif

	INP_WLOCK_ASSERT(inp);
	INP_LIST_WLOCK_ASSERT(pcbinfo);

	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	if (inp->inp_flags & INP_INHASHLIST) {
		struct inpcbport *phd = inp->inp_phd;

		INP_HASH_WLOCK(pcbinfo);

		/* XXX: Only do if SO_REUSEPORT_LB set? */
		in_pcbremlbgrouphash(inp);

		CK_LIST_REMOVE(inp, inp_hash);
		CK_LIST_REMOVE(inp, inp_portlist);
		/* Last pcb on this port: retire the port head via epoch. */
		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
			CK_LIST_REMOVE(phd, phd_hash);
			epoch_call(net_epoch_preempt, &phd->phd_epoch_ctx, inpcbport_free);
		}
		INP_HASH_WUNLOCK(pcbinfo);
		inp->inp_flags &= ~INP_INHASHLIST;
	}
	CK_LIST_REMOVE(inp, inp_list);
	pcbinfo->ipi_count--;
#ifdef PCBGROUP
	in_pcbgroup_remove(inp);
#endif
}
2729
2730 /*
2731 * Check for alternatives when higher level complains
2732 * about service problems. For now, invalidate cached
2733 * routing information. If the route was created dynamically
2734 * (by a redirect), time to try a default gateway again.
2735 */
2736 void
2737 in_losing(struct inpcb *inp)
2738 {
2739
2740 RO_INVALIDATE_CACHE(&inp->inp_route);
2741 return;
2742 }
2743
2744 /*
2745 * A set label operation has occurred at the socket layer, propagate the
2746 * label change into the in_pcb for the socket.
2747 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	/* Lock order: inpcb before socket, matching the MAC callback. */
	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}
2764
2765 /*
2766 * ipport_tick runs once per second, determining if random port allocation
2767 * should be continued. If more than ipport_randomcps ports have been
2768 * allocated in the last second, then we return to sequential port
2769 * allocation. We return to random allocation only once we drop below
2770 * ipport_randomcps for at least ipport_randomtime seconds.
2771 */
static void
ipport_tick(void *xtp)
{
	VNET_ITERATOR_DECL(vnet_iter);

	VNET_LIST_RLOCK_NOSLEEP();
	VNET_FOREACH(vnet_iter) {
		CURVNET_SET(vnet_iter);	/* XXX appease INVARIANTS here */
		/*
		 * If fewer than ipport_randomcps ports were allocated in
		 * the last second, count down toward re-enabling random
		 * allocation; otherwise restart the hold-off timer.
		 */
		if (V_ipport_tcpallocs <=
		    V_ipport_tcplastcount + V_ipport_randomcps) {
			if (V_ipport_stoprandom > 0)
				V_ipport_stoprandom--;
		} else
			V_ipport_stoprandom = V_ipport_randomtime;
		V_ipport_tcplastcount = V_ipport_tcpallocs;
		CURVNET_RESTORE();
	}
	VNET_LIST_RUNLOCK_NOSLEEP();
	/* Re-arm ourselves to run again in one second. */
	callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL);
}
2792
/*
 * Shutdown hook: stop the periodic port-allocation callout.
 */
static void
ip_fini(void *xtp)
{

	callout_stop(&ipport_tick_callout);
}
2799
2800 /*
2801 * The ipport_callout should start running at about the time we attach the
2802 * inet or inet6 domains.
2803 */
static void
ipport_tick_init(const void *unused __unused)
{

	/* Start ipport_tick. */
	callout_init(&ipport_tick_callout, 1);
	callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL);
	/* Ensure the callout is stopped before shutdown syncs disks. */
	EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL,
	    SHUTDOWN_PRI_DEFAULT);
}
2814 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE,
2815 ipport_tick_init, NULL);
2816
/*
 * Exported inpcb lock wrappers, for callers that cannot use the
 * INP_*LOCK() macros directly (e.g. code outside the netinet tree).
 */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}

void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}

void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}

void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}
2844
#ifdef INVARIANT_SUPPORT
/*
 * Exported lock assertion wrappers, compiled only with INVARIANT_SUPPORT.
 */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}

void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
#endif
2860
/*
 * Apply "func" to every TCP inpcb, calling it with the pcb write-locked.
 * Note: this walks V_tcbinfo only, not arbitrary pcbinfo lists.
 */
void
inp_apply_all(void (*func)(struct inpcb *, void *), void *arg)
{
	struct inpcb *inp;

	INP_INFO_WLOCK(&V_tcbinfo);
	CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
		INP_WLOCK(inp);
		func(inp, arg);
		INP_WUNLOCK(inp);
	}
	INP_INFO_WUNLOCK(&V_tcbinfo);
}
2874
/*
 * Return the socket backing this inpcb; the pcb must be write-locked.
 */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}

/*
 * Return the protocol pcb as a tcpcb; only valid for TCP inpcbs.
 */
struct tcpcb *
inp_inpcbtotcpcb(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return ((struct tcpcb *)inp->inp_ppcb);
}
2890
/*
 * Accessors for the IPv4 type-of-service byte on an inpcb.
 */
int
inp_ip_tos_get(const struct inpcb *inp)
{

	return (inp->inp_ip_tos);
}

void
inp_ip_tos_set(struct inpcb *inp, int val)
{

	inp->inp_ip_tos = val;
}
2904
/*
 * Copy out the connection 4-tuple (addresses in network byte order,
 * ports as stored in the pcb).  The pcb must be locked.
 */
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
    uint32_t *faddr, uint16_t *fp)
{

	INP_LOCK_ASSERT(inp);
	*laddr = inp->inp_laddr.s_addr;
	*faddr = inp->inp_faddr.s_addr;
	*lp = inp->inp_lport;
	*fp = inp->inp_fport;
}
2916
/*
 * Function wrappers around the sotoinpcb()/sototcpcb() macros, for
 * callers that need a real function symbol.
 */
struct inpcb *
so_sotoinpcb(struct socket *so)
{

	return (sotoinpcb(so));
}

struct tcpcb *
so_sototcpcb(struct socket *so)
{

	return (sototcpcb(so));
}
2930
2931 /*
2932 * Create an external-format (``xinpcb'') structure using the information in
2933 * the kernel-format in_pcb structure pointed to by inp. This is done to
2934 * reduce the spew of irrelevant information over this interface, to isolate
2935 * user code from changes in the kernel structure, and potentially to provide
2936 * information-hiding if we decide that some of this information should be
2937 * hidden from users.
2938 */
void
in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi)
{

	/* Zero first so unset/padding fields never leak kernel memory. */
	bzero(xi, sizeof(*xi));
	xi->xi_len = sizeof(struct xinpcb);
	if (inp->inp_socket)
		sotoxsocket(inp->inp_socket, &xi->xi_socket);
	bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo));
	xi->inp_gencnt = inp->inp_gencnt;
	/* Export the ppcb pointer as an opaque integer handle. */
	xi->inp_ppcb = (uintptr_t)inp->inp_ppcb;
	xi->inp_flow = inp->inp_flow;
	xi->inp_flowid = inp->inp_flowid;
	xi->inp_flowtype = inp->inp_flowtype;
	xi->inp_flags = inp->inp_flags;
	xi->inp_flags2 = inp->inp_flags2;
	xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket;
	xi->in6p_cksum = inp->in6p_cksum;
	xi->in6p_hops = inp->in6p_hops;
	xi->inp_ip_tos = inp->inp_ip_tos;
	xi->inp_vflag = inp->inp_vflag;
	xi->inp_ip_ttl = inp->inp_ip_ttl;
	xi->inp_ip_p = inp->inp_ip_p;
	xi->inp_ip_minttl = inp->inp_ip_minttl;
}
2964
2965 #ifdef DDB
/*
 * Emit "indent" spaces, used to nest the DDB structure dumps below.
 */
static void
db_print_indent(int indent)
{
	int n;

	for (n = indent; n > 0; n--)
		db_printf(" ");
}
2974
/*
 * DDB helper: pretty-print an in_conninfo (addresses and ports).
 */
static void
db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent)
{
	char faddr_str[48], laddr_str[48];

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inc);

	indent += 2;

#ifdef INET6
	if (inc->inc_flags & INC_ISIPV6) {
		/* IPv6. */
		ip6_sprintf(laddr_str, &inc->inc6_laddr);
		ip6_sprintf(faddr_str, &inc->inc6_faddr);
	} else
#endif
	{
		/* IPv4. */
		inet_ntoa_r(inc->inc_laddr, laddr_str);
		inet_ntoa_r(inc->inc_faddr, faddr_str);
	}
	db_print_indent(indent);
	db_printf("inc_laddr %s   inc_lport %u\n", laddr_str,
	    ntohs(inc->inc_lport));
	db_print_indent(indent);
	db_printf("inc_faddr %s   inc_fport %u\n", faddr_str,
	    ntohs(inc->inc_fport));
}
3004
3005 static void
3006 db_print_inpflags(int inp_flags)
3007 {
3008 int comma;
3009
3010 comma = 0;
3011 if (inp_flags & INP_RECVOPTS) {
3012 db_printf("%sINP_RECVOPTS", comma ? ", " : "");
3013 comma = 1;
3014 }
3015 if (inp_flags & INP_RECVRETOPTS) {
3016 db_printf("%sINP_RECVRETOPTS", comma ? ", " : "");
3017 comma = 1;
3018 }
3019 if (inp_flags & INP_RECVDSTADDR) {
3020 db_printf("%sINP_RECVDSTADDR", comma ? ", " : "");
3021 comma = 1;
3022 }
3023 if (inp_flags & INP_ORIGDSTADDR) {
3024 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : "");
3025 comma = 1;
3026 }
3027 if (inp_flags & INP_HDRINCL) {
3028 db_printf("%sINP_HDRINCL", comma ? ", " : "");
3029 comma = 1;
3030 }
3031 if (inp_flags & INP_HIGHPORT) {
3032 db_printf("%sINP_HIGHPORT", comma ? ", " : "");
3033 comma = 1;
3034 }
3035 if (inp_flags & INP_LOWPORT) {
3036 db_printf("%sINP_LOWPORT", comma ? ", " : "");
3037 comma = 1;
3038 }
3039 if (inp_flags & INP_ANONPORT) {
3040 db_printf("%sINP_ANONPORT", comma ? ", " : "");
3041 comma = 1;
3042 }
3043 if (inp_flags & INP_RECVIF) {
3044 db_printf("%sINP_RECVIF", comma ? ", " : "");
3045 comma = 1;
3046 }
3047 if (inp_flags & INP_MTUDISC) {
3048 db_printf("%sINP_MTUDISC", comma ? ", " : "");
3049 comma = 1;
3050 }
3051 if (inp_flags & INP_RECVTTL) {
3052 db_printf("%sINP_RECVTTL", comma ? ", " : "");
3053 comma = 1;
3054 }
3055 if (inp_flags & INP_DONTFRAG) {
3056 db_printf("%sINP_DONTFRAG", comma ? ", " : "");
3057 comma = 1;
3058 }
3059 if (inp_flags & INP_RECVTOS) {
3060 db_printf("%sINP_RECVTOS", comma ? ", " : "");
3061 comma = 1;
3062 }
3063 if (inp_flags & IN6P_IPV6_V6ONLY) {
3064 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : "");
3065 comma = 1;
3066 }
3067 if (inp_flags & IN6P_PKTINFO) {
3068 db_printf("%sIN6P_PKTINFO", comma ? ", " : "");
3069 comma = 1;
3070 }
3071 if (inp_flags & IN6P_HOPLIMIT) {
3072 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : "");
3073 comma = 1;
3074 }
3075 if (inp_flags & IN6P_HOPOPTS) {
3076 db_printf("%sIN6P_HOPOPTS", comma ? ", " : "");
3077 comma = 1;
3078 }
3079 if (inp_flags & IN6P_DSTOPTS) {
3080 db_printf("%sIN6P_DSTOPTS", comma ? ", " : "");
3081 comma = 1;
3082 }
3083 if (inp_flags & IN6P_RTHDR) {
3084 db_printf("%sIN6P_RTHDR", comma ? ", " : "");
3085 comma = 1;
3086 }
3087 if (inp_flags & IN6P_RTHDRDSTOPTS) {
3088 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : "");
3089 comma = 1;
3090 }
3091 if (inp_flags & IN6P_TCLASS) {
3092 db_printf("%sIN6P_TCLASS", comma ? ", " : "");
3093 comma = 1;
3094 }
3095 if (inp_flags & IN6P_AUTOFLOWLABEL) {
3096 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : "");
3097 comma = 1;
3098 }
3099 if (inp_flags & INP_TIMEWAIT) {
3100 db_printf("%sINP_TIMEWAIT", comma ? ", " : "");
3101 comma = 1;
3102 }
3103 if (inp_flags & INP_ONESBCAST) {
3104 db_printf("%sINP_ONESBCAST", comma ? ", " : "");
3105 comma = 1;
3106 }
3107 if (inp_flags & INP_DROPPED) {
3108 db_printf("%sINP_DROPPED", comma ? ", " : "");
3109 comma = 1;
3110 }
3111 if (inp_flags & INP_SOCKREF) {
3112 db_printf("%sINP_SOCKREF", comma ? ", " : "");
3113 comma = 1;
3114 }
3115 if (inp_flags & IN6P_RFC2292) {
3116 db_printf("%sIN6P_RFC2292", comma ? ", " : "");
3117 comma = 1;
3118 }
3119 if (inp_flags & IN6P_MTU) {
3120 db_printf("IN6P_MTU%s", comma ? ", " : "");
3121 comma = 1;
3122 }
3123 }
3124
3125 static void
3126 db_print_inpvflag(u_char inp_vflag)
3127 {
3128 int comma;
3129
3130 comma = 0;
3131 if (inp_vflag & INP_IPV4) {
3132 db_printf("%sINP_IPV4", comma ? ", " : "");
3133 comma = 1;
3134 }
3135 if (inp_vflag & INP_IPV6) {
3136 db_printf("%sINP_IPV6", comma ? ", " : "");
3137 comma = 1;
3138 }
3139 if (inp_vflag & INP_IPV6PROTO) {
3140 db_printf("%sINP_IPV6PROTO", comma ? ", " : "");
3141 comma = 1;
3142 }
3143 }
3144
/*
 * DDB helper: dump the interesting fields of an inpcb, indented.
 */
static void
db_print_inpcb(struct inpcb *inp, const char *name, int indent)
{

	db_print_indent(indent);
	db_printf("%s at %p\n", name, inp);

	indent += 2;

	db_print_indent(indent);
	db_printf("inp_flow: 0x%x\n", inp->inp_flow);

	db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent);

	db_print_indent(indent);
	db_printf("inp_ppcb: %p   inp_pcbinfo: %p   inp_socket: %p\n",
	    inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket);

	db_print_indent(indent);
	db_printf("inp_label: %p   inp_flags: 0x%x (",
	   inp->inp_label, inp->inp_flags);
	db_print_inpflags(inp->inp_flags);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("inp_sp: %p   inp_vflag: 0x%x (", inp->inp_sp,
	    inp->inp_vflag);
	db_print_inpvflag(inp->inp_vflag);
	db_printf(")\n");

	db_print_indent(indent);
	db_printf("inp_ip_ttl: %d   inp_ip_p: %d   inp_ip_minttl: %d\n",
	    inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl);

	db_print_indent(indent);
	/* Print the v6 or v4 option blocks depending on the pcb flavor. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		db_printf("in6p_options: %p   in6p_outputopts: %p   "
		    "in6p_moptions: %p\n", inp->in6p_options,
		    inp->in6p_outputopts, inp->in6p_moptions);
		db_printf("in6p_icmp6filt: %p   in6p_cksum %d   "
		    "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum,
		    inp->in6p_hops);
	} else
#endif
	{
		db_printf("inp_ip_tos: %d   inp_ip_options: %p   "
		    "inp_ip_moptions: %p\n", inp->inp_ip_tos,
		    inp->inp_options, inp->inp_moptions);
	}

	db_print_indent(indent);
	db_printf("inp_phd: %p   inp_gencnt: %ju\n", inp->inp_phd,
	    (uintmax_t)inp->inp_gencnt);
}
3200
/*
 * DDB "show inpcb <addr>" command: dump the inpcb at a given address.
 */
DB_SHOW_COMMAND(inpcb, db_show_inpcb)
{
	struct inpcb *inp;

	if (!have_addr) {
		db_printf("usage: show inpcb <addr>\n");
		return;
	}
	inp = (struct inpcb *)addr;

	db_print_inpcb(inp, "inpcb", 0);
}
3213 #endif /* DDB */
3214
3215 #ifdef RATELIMIT
3216 /*
3217 * Modify TX rate limit based on the existing "inp->inp_snd_tag",
3218 * if any.
3219 */
3220 int
3221 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
3222 {
3223 union if_snd_tag_modify_params params = {
3224 .rate_limit.max_rate = max_pacing_rate,
3225 };
3226 struct m_snd_tag *mst;
3227 struct ifnet *ifp;
3228 int error;
3229
3230 mst = inp->inp_snd_tag;
3231 if (mst == NULL)
3232 return (EINVAL);
3233
3234 ifp = mst->ifp;
3235 if (ifp == NULL)
3236 return (EINVAL);
3237
3238 if (ifp->if_snd_tag_modify == NULL) {
3239 error = EOPNOTSUPP;
3240 } else {
3241 error = ifp->if_snd_tag_modify(mst, ¶ms);
3242 }
3243 return (error);
3244 }
3245
3246 /*
3247 * Query existing TX rate limit based on the existing
3248 * "inp->inp_snd_tag", if any.
3249 */
3250 int
3251 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
3252 {
3253 union if_snd_tag_query_params params = { };
3254 struct m_snd_tag *mst;
3255 struct ifnet *ifp;
3256 int error;
3257
3258 mst = inp->inp_snd_tag;
3259 if (mst == NULL)
3260 return (EINVAL);
3261
3262 ifp = mst->ifp;
3263 if (ifp == NULL)
3264 return (EINVAL);
3265
3266 if (ifp->if_snd_tag_query == NULL) {
3267 error = EOPNOTSUPP;
3268 } else {
3269 error = ifp->if_snd_tag_query(mst, ¶ms);
3270 if (error == 0 && p_max_pacing_rate != NULL)
3271 *p_max_pacing_rate = params.rate_limit.max_rate;
3272 }
3273 return (error);
3274 }
3275
3276 /*
3277 * Query existing TX queue level based on the existing
3278 * "inp->inp_snd_tag", if any.
3279 */
int
in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
{
	union if_snd_tag_query_params params = { };
	struct m_snd_tag *mst;
	struct ifnet *ifp;
	int error;

	/* Without an existing send tag there is nothing to query. */
	mst = inp->inp_snd_tag;
	if (mst == NULL)
		return (EINVAL);

	ifp = mst->ifp;
	if (ifp == NULL)
		return (EINVAL);

	if (ifp->if_snd_tag_query == NULL)
		return (EOPNOTSUPP);

	error = ifp->if_snd_tag_query(mst, &params);
	if (error == 0 && p_txqueue_level != NULL)
		*p_txqueue_level = params.rate_limit.queue_level;
	return (error);
}
3304
3305 /*
3306 * Allocate a new TX rate limit send tag from the network interface
3307 * given by the "ifp" argument and save it in "inp->inp_snd_tag":
3308 */
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
{
	/*
	 * A max_pacing_rate of -1 requests an unlimited tag; any other
	 * value requests a rate-limit tag at that rate.
	 */
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.max_rate = max_pacing_rate,
	};
	int error;

	INP_WLOCK_ASSERT(inp);

	/*
	 * If there is already a send tag, or the INP is being torn
	 * down, allocating a new send tag is not allowed. Else send
	 * tags may leak.
	 */
	if (inp->inp_snd_tag != NULL || (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0)
		return (EINVAL);

	if (ifp->if_snd_tag_alloc == NULL) {
		error = EOPNOTSUPP;
	} else {
		error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);

		/*
		 * At success increment the refcount on
		 * the send tag's network interface:
		 */
		if (error == 0)
			if_ref(inp->inp_snd_tag->ifp);
	}
	return (error);
}
3346
3347 /*
3348 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
3349 * if any:
3350 */
3351 void
3352 in_pcbdetach_txrtlmt(struct inpcb *inp)
3353 {
3354 struct m_snd_tag *mst;
3355 struct ifnet *ifp;
3356
3357 INP_WLOCK_ASSERT(inp);
3358
3359 mst = inp->inp_snd_tag;
3360 inp->inp_snd_tag = NULL;
3361
3362 if (mst == NULL)
3363 return;
3364
3365 ifp = mst->ifp;
3366 if (ifp == NULL)
3367 return;
3368
3369 /*
3370 * If the device was detached while we still had reference(s)
3371 * on the ifp, we assume if_snd_tag_free() was replaced with
3372 * stubs.
3373 */
3374 ifp->if_snd_tag_free(mst);
3375
3376 /* release reference count on network interface */
3377 if_rele(ifp);
3378 }
3379
3380 /*
3381 * This function should be called when the INP_RATE_LIMIT_CHANGED flag
3382 * is set in the fast path and will attach/detach/modify the TX rate
3383 * limit send tag based on the socket's so_max_pacing_rate value.
3384 */
3385 void
3386 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
3387 {
3388 struct socket *socket;
3389 uint32_t max_pacing_rate;
3390 bool did_upgrade;
3391 int error;
3392
3393 if (inp == NULL)
3394 return;
3395
3396 socket = inp->inp_socket;
3397 if (socket == NULL)
3398 return;
3399
3400 if (!INP_WLOCKED(inp)) {
3401 /*
3402 * NOTE: If the write locking fails, we need to bail
3403 * out and use the non-ratelimited ring for the
3404 * transmit until there is a new chance to get the
3405 * write lock.
3406 */
3407 if (!INP_TRY_UPGRADE(inp))
3408 return;
3409 did_upgrade = 1;
3410 } else {
3411 did_upgrade = 0;
3412 }
3413
3414 /*
3415 * NOTE: The so_max_pacing_rate value is read unlocked,
3416 * because atomic updates are not required since the variable
3417 * is checked at every mbuf we send. It is assumed that the
3418 * variable read itself will be atomic.
3419 */
3420 max_pacing_rate = socket->so_max_pacing_rate;
3421
3422 /*
3423 * NOTE: When attaching to a network interface a reference is
3424 * made to ensure the network interface doesn't go away until
3425 * all ratelimit connections are gone. The network interface
3426 * pointers compared below represent valid network interfaces,
3427 * except when comparing towards NULL.
3428 */
3429 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
3430 error = 0;
3431 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
3432 if (inp->inp_snd_tag != NULL)
3433 in_pcbdetach_txrtlmt(inp);
3434 error = 0;
3435 } else if (inp->inp_snd_tag == NULL) {
3436 /*
3437 * In order to utilize packet pacing with RSS, we need
3438 * to wait until there is a valid RSS hash before we
3439 * can proceed:
3440 */
3441 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
3442 error = EAGAIN;
3443 } else {
3444 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
3445 mb->m_pkthdr.flowid, max_pacing_rate);
3446 }
3447 } else {
3448 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
3449 }
3450 if (error == 0 || error == EOPNOTSUPP)
3451 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
3452 if (did_upgrade)
3453 INP_DOWNGRADE(inp);
3454 }
3455
3456 /*
3457 * Track route changes for TX rate limiting.
3458 */
3459 void
3460 in_pcboutput_eagain(struct inpcb *inp)
3461 {
3462 bool did_upgrade;
3463
3464 if (inp == NULL)
3465 return;
3466
3467 if (inp->inp_snd_tag == NULL)
3468 return;
3469
3470 if (!INP_WLOCKED(inp)) {
3471 /*
3472 * NOTE: If the write locking fails, we need to bail
3473 * out and use the non-ratelimited ring for the
3474 * transmit until there is a new chance to get the
3475 * write lock.
3476 */
3477 if (!INP_TRY_UPGRADE(inp))
3478 return;
3479 did_upgrade = 1;
3480 } else {
3481 did_upgrade = 0;
3482 }
3483
3484 /* detach rate limiting */
3485 in_pcbdetach_txrtlmt(inp);
3486
3487 /* make sure new mbuf send tag allocation is made */
3488 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
3489
3490 if (did_upgrade)
3491 INP_DOWNGRADE(inp);
3492 }
3493 #endif /* RATELIMIT */
Cache object: 78a1d13de9e166f6da6adf927e07bb27
|