FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_jail.c
1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright (c) 1999 Poul-Henning Kamp.
5 * Copyright (c) 2008 Bjoern A. Zeeb.
6 * Copyright (c) 2009 James Gritton.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33
34 #include "opt_ddb.h"
35 #include "opt_inet.h"
36 #include "opt_inet6.h"
37
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/systm.h>
42 #include <sys/errno.h>
43 #include <sys/sysproto.h>
44 #include <sys/malloc.h>
45 #include <sys/osd.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/taskqueue.h>
49 #include <sys/fcntl.h>
50 #include <sys/jail.h>
51 #include <sys/linker.h>
52 #include <sys/lock.h>
53 #include <sys/mutex.h>
54 #include <sys/racct.h>
55 #include <sys/rctl.h>
56 #include <sys/refcount.h>
57 #include <sys/sx.h>
58 #include <sys/sysent.h>
59 #include <sys/namei.h>
60 #include <sys/mount.h>
61 #include <sys/queue.h>
62 #include <sys/socket.h>
63 #include <sys/syscallsubr.h>
64 #include <sys/sysctl.h>
65 #include <sys/uuid.h>
66 #include <sys/vnode.h>
67
68 #include <net/if.h>
69 #include <net/vnet.h>
70
71 #include <netinet/in.h>
72
73 #ifdef DDB
74 #include <ddb/ddb.h>
75 #endif /* DDB */
76
77 #include <security/mac/mac_framework.h>
78
79 #define PRISON0_HOSTUUID_MODULE "hostuuid"
80
81 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
82 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
83
/*
 * Keep struct prison prison0 and some code in kern_jail_set() readable.
 *
 * _PR_IP_SADDRSEL is the union of the source-address-selection flags for
 * the address families (INET and/or INET6) compiled into this kernel, or 0
 * when neither is present.  Every non-zero expansion is parenthesized so the
 * macro always behaves as a single operand, whatever operator sits next to
 * it at the point of use.
 */
#ifdef INET
#ifdef INET6
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL)
#endif
#else /* !INET */
#ifdef INET6
#define	_PR_IP_SADDRSEL	(PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	0
#endif
#endif
98
99 /* prison0 describes what is "real" about the system. */
100 struct prison prison0 = {
101 .pr_id = 0,
102 .pr_name = "",
103 .pr_ref = 1,
104 .pr_uref = 1,
105 .pr_path = "/",
106 .pr_securelevel = -1,
107 .pr_devfs_rsnum = 0,
108 .pr_childmax = JAIL_MAX,
109 .pr_hostuuid = DEFAULT_HOSTUUID,
110 .pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children),
111 #ifdef VIMAGE
112 .pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
113 #else
114 .pr_flags = PR_HOST|_PR_IP_SADDRSEL,
115 #endif
116 .pr_allow = PR_ALLOW_ALL_STATIC,
117 };
118 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
119
/*
 * A boolean jail parameter: the option name used when the flag is set, the
 * negated option name used when it is clear, and the flag bit itself.
 */
struct bool_flags {
	const char	*name;		/* name that turns the flag on */
	const char	*noname;	/* name that turns the flag off */
	unsigned int	 flag;		/* flag bit this entry controls */
};
/*
 * A "jailsys" parameter: its option name, the flag bits applied when the
 * parameter is given as JAIL_SYS_DISABLE (0 means disabling is not allowed)
 * and the flag bits applied when it is given as JAIL_SYS_NEW.
 */
struct jailsys_flags {
	const char	*name;		/* parameter name */
	unsigned int	 disable;	/* bits for JAIL_SYS_DISABLE */
	unsigned int	 new;		/* bits for JAIL_SYS_NEW */
};
130
131 /* allprison, allprison_racct and lastprid are protected by allprison_lock. */
132 struct sx allprison_lock;
133 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
134 struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
135 LIST_HEAD(, prison_racct) allprison_racct;
136 int lastprid = 0;
137
138 static int do_jail_attach(struct thread *td, struct prison *pr);
139 static void prison_complete(void *context, int pending);
140 static void prison_deref(struct prison *pr, int flags);
141 static char *prison_path(struct prison *pr1, struct prison *pr2);
142 static void prison_remove_one(struct prison *pr);
143 #ifdef RACCT
144 static void prison_racct_attach(struct prison *pr);
145 static void prison_racct_modify(struct prison *pr);
146 static void prison_racct_detach(struct prison *pr);
147 #endif
148
/*
 * Flags for prison_deref.  Each flag is a distinct bit so that callers can
 * OR several of them together in one call.
 */
#define	PD_DEREF	(1 << 0)
#define	PD_DEUREF	(1 << 1)
#define	PD_LOCKED	(1 << 2)
#define	PD_LIST_SLOCKED	(1 << 3)
#define	PD_LIST_XLOCKED	(1 << 4)
155
156 /*
157 * Parameter names corresponding to PR_* flag values. Size values are for kvm
158 * as we cannot figure out the size of a sparse array, or an array without a
159 * terminating entry.
160 */
161 static struct bool_flags pr_flag_bool[] = {
162 {"persist", "nopersist", PR_PERSIST},
163 #ifdef INET
164 {"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
165 #endif
166 #ifdef INET6
167 {"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
168 #endif
169 };
170 const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
171
172 static struct jailsys_flags pr_flag_jailsys[] = {
173 {"host", 0, PR_HOST},
174 #ifdef VIMAGE
175 {"vnet", 0, PR_VNET},
176 #endif
177 #ifdef INET
178 {"ip4", PR_IP4_USER, PR_IP4_USER},
179 #endif
180 #ifdef INET6
181 {"ip6", PR_IP6_USER, PR_IP6_USER},
182 #endif
183 };
184 const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
185
186 /* Make this array full-size so dynamic parameters can be added. */
187 static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
188 {"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
189 {"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
190 {"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
191 {"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
192 {"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
193 {"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
194 {"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
195 {"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
196 {"allow.reserved_ports", "allow.noreserved_ports",
197 PR_ALLOW_RESERVED_PORTS},
198 {"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
199 };
200 const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
201
202 #define JAIL_DEFAULT_ALLOW (PR_ALLOW_SET_HOSTNAME | PR_ALLOW_RESERVED_PORTS)
203 #define JAIL_DEFAULT_ENFORCE_STATFS 2
204 #define JAIL_DEFAULT_DEVFS_RSNUM 0
205 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
206 static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
207 static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
208 #if defined(INET) || defined(INET6)
209 static unsigned jail_max_af_ips = 255;
210 #endif
211
212 /*
213 * Initialize the parts of prison0 that can't be static-initialized with
214 * constants. This is called from proc0_init() after creating thread0 cpuset.
215 */
216 void
217 prison0_init(void)
218 {
219 uint8_t *file, *data;
220 size_t size;
221
222 prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
223 prison0.pr_osreldate = osreldate;
224 strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
225
226 /* If we have a preloaded hostuuid, use it. */
227 file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
228 if (file != NULL) {
229 data = preload_fetch_addr(file);
230 size = preload_fetch_size(file);
231 if (data != NULL) {
232 /*
233 * The preloaded data may include trailing whitespace, almost
234 * certainly a newline; skip over any whitespace or
235 * non-printable characters to be safe.
236 */
237 while (size > 0 && data[size - 1] <= 0x20) {
238 data[size--] = '\0';
239 }
240 if (validate_uuid(data, size, NULL, 0) == 0) {
241 (void)strlcpy(prison0.pr_hostuuid, data,
242 size + 1);
243 } else if (bootverbose) {
244 printf("hostuuid: preload data malformed: '%s'",
245 data);
246 }
247 }
248 }
249 if (bootverbose)
250 printf("hostuuid: using %s\n", prison0.pr_hostuuid);
251 }
252
253 /*
254 * struct jail_args {
255 * struct jail *jail;
256 * };
257 */
258 int
259 sys_jail(struct thread *td, struct jail_args *uap)
260 {
261 uint32_t version;
262 int error;
263 struct jail j;
264
265 error = copyin(uap->jail, &version, sizeof(uint32_t));
266 if (error)
267 return (error);
268
269 switch (version) {
270 case 0:
271 {
272 struct jail_v0 j0;
273
274 /* FreeBSD single IPv4 jails. */
275 bzero(&j, sizeof(struct jail));
276 error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
277 if (error)
278 return (error);
279 j.version = j0.version;
280 j.path = j0.path;
281 j.hostname = j0.hostname;
282 j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */
283 break;
284 }
285
286 case 1:
287 /*
288 * Version 1 was used by multi-IPv4 jail implementations
289 * that never made it into the official kernel.
290 */
291 return (EINVAL);
292
293 case 2: /* JAIL_API_VERSION */
294 /* FreeBSD multi-IPv4/IPv6,noIP jails. */
295 error = copyin(uap->jail, &j, sizeof(struct jail));
296 if (error)
297 return (error);
298 break;
299
300 default:
301 /* Sci-Fi jails are not supported, sorry. */
302 return (EINVAL);
303 }
304 return (kern_jail(td, &j));
305 }
306
307 int
308 kern_jail(struct thread *td, struct jail *j)
309 {
310 struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
311 #ifdef INET
312 + 1
313 #endif
314 #ifdef INET6
315 + 1
316 #endif
317 )];
318 struct uio opt;
319 char *u_path, *u_hostname, *u_name;
320 struct bool_flags *bf;
321 #ifdef INET
322 uint32_t ip4s;
323 struct in_addr *u_ip4;
324 #endif
325 #ifdef INET6
326 struct in6_addr *u_ip6;
327 #endif
328 size_t tmplen;
329 int error, enforce_statfs;
330
331 bzero(&optiov, sizeof(optiov));
332 opt.uio_iov = optiov;
333 opt.uio_iovcnt = 0;
334 opt.uio_offset = -1;
335 opt.uio_resid = -1;
336 opt.uio_segflg = UIO_SYSSPACE;
337 opt.uio_rw = UIO_READ;
338 opt.uio_td = td;
339
340 /* Set permissions for top-level jails from sysctls. */
341 if (!jailed(td->td_ucred)) {
342 for (bf = pr_flag_allow;
343 bf < pr_flag_allow + nitems(pr_flag_allow) &&
344 bf->flag != 0;
345 bf++) {
346 optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
347 (jail_default_allow & bf->flag)
348 ? bf->name : bf->noname);
349 optiov[opt.uio_iovcnt].iov_len =
350 strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
351 opt.uio_iovcnt += 2;
352 }
353 optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
354 optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
355 opt.uio_iovcnt++;
356 enforce_statfs = jail_default_enforce_statfs;
357 optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
358 optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
359 opt.uio_iovcnt++;
360 }
361
362 tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
363 #ifdef INET
364 ip4s = (j->version == 0) ? 1 : j->ip4s;
365 if (ip4s > jail_max_af_ips)
366 return (EINVAL);
367 tmplen += ip4s * sizeof(struct in_addr);
368 #else
369 if (j->ip4s > 0)
370 return (EINVAL);
371 #endif
372 #ifdef INET6
373 if (j->ip6s > jail_max_af_ips)
374 return (EINVAL);
375 tmplen += j->ip6s * sizeof(struct in6_addr);
376 #else
377 if (j->ip6s > 0)
378 return (EINVAL);
379 #endif
380 u_path = malloc(tmplen, M_TEMP, M_WAITOK);
381 u_hostname = u_path + MAXPATHLEN;
382 u_name = u_hostname + MAXHOSTNAMELEN;
383 #ifdef INET
384 u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
385 #endif
386 #ifdef INET6
387 #ifdef INET
388 u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
389 #else
390 u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
391 #endif
392 #endif
393 optiov[opt.uio_iovcnt].iov_base = "path";
394 optiov[opt.uio_iovcnt].iov_len = sizeof("path");
395 opt.uio_iovcnt++;
396 optiov[opt.uio_iovcnt].iov_base = u_path;
397 error = copyinstr(j->path, u_path, MAXPATHLEN,
398 &optiov[opt.uio_iovcnt].iov_len);
399 if (error) {
400 free(u_path, M_TEMP);
401 return (error);
402 }
403 opt.uio_iovcnt++;
404 optiov[opt.uio_iovcnt].iov_base = "host.hostname";
405 optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
406 opt.uio_iovcnt++;
407 optiov[opt.uio_iovcnt].iov_base = u_hostname;
408 error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
409 &optiov[opt.uio_iovcnt].iov_len);
410 if (error) {
411 free(u_path, M_TEMP);
412 return (error);
413 }
414 opt.uio_iovcnt++;
415 if (j->jailname != NULL) {
416 optiov[opt.uio_iovcnt].iov_base = "name";
417 optiov[opt.uio_iovcnt].iov_len = sizeof("name");
418 opt.uio_iovcnt++;
419 optiov[opt.uio_iovcnt].iov_base = u_name;
420 error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
421 &optiov[opt.uio_iovcnt].iov_len);
422 if (error) {
423 free(u_path, M_TEMP);
424 return (error);
425 }
426 opt.uio_iovcnt++;
427 }
428 #ifdef INET
429 optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
430 optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
431 opt.uio_iovcnt++;
432 optiov[opt.uio_iovcnt].iov_base = u_ip4;
433 optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
434 if (j->version == 0)
435 u_ip4->s_addr = j->ip4s;
436 else {
437 error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
438 if (error) {
439 free(u_path, M_TEMP);
440 return (error);
441 }
442 }
443 opt.uio_iovcnt++;
444 #endif
445 #ifdef INET6
446 optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
447 optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
448 opt.uio_iovcnt++;
449 optiov[opt.uio_iovcnt].iov_base = u_ip6;
450 optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
451 error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
452 if (error) {
453 free(u_path, M_TEMP);
454 return (error);
455 }
456 opt.uio_iovcnt++;
457 #endif
458 KASSERT(opt.uio_iovcnt <= nitems(optiov),
459 ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
460 error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
461 free(u_path, M_TEMP);
462 return (error);
463 }
464
465
466 /*
467 * struct jail_set_args {
468 * struct iovec *iovp;
469 * unsigned int iovcnt;
470 * int flags;
471 * };
472 */
473 int
474 sys_jail_set(struct thread *td, struct jail_set_args *uap)
475 {
476 struct uio *auio;
477 int error;
478
479 /* Check that we have an even number of iovecs. */
480 if (uap->iovcnt & 1)
481 return (EINVAL);
482
483 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
484 if (error)
485 return (error);
486 error = kern_jail_set(td, auio, uap->flags);
487 free(auio, M_IOV);
488 return (error);
489 }
490
491 int
492 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
493 {
494 struct nameidata nd;
495 #ifdef INET
496 struct in_addr *ip4;
497 #endif
498 #ifdef INET6
499 struct in6_addr *ip6;
500 #endif
501 struct vfsopt *opt;
502 struct vfsoptlist *opts;
503 struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
504 struct vnode *root;
505 char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
506 char *g_path, *osrelstr;
507 struct bool_flags *bf;
508 struct jailsys_flags *jsf;
509 #if defined(INET) || defined(INET6)
510 struct prison *tppr;
511 void *op;
512 #endif
513 unsigned long hid;
514 size_t namelen, onamelen, pnamelen;
515 int born, created, cuflags, descend, enforce;
516 int error, errmsg_len, errmsg_pos;
517 int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
518 int jid, jsys, len, level;
519 int childmax, osreldt, rsnum, slevel;
520 int fullpath_disabled;
521 #if defined(INET) || defined(INET6)
522 int ii, ij;
523 #endif
524 #ifdef INET
525 int ip4s, redo_ip4;
526 #endif
527 #ifdef INET6
528 int ip6s, redo_ip6;
529 #endif
530 uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
531 unsigned tallow;
532 char numbuf[12];
533
534 error = priv_check(td, PRIV_JAIL_SET);
535 if (!error && (flags & JAIL_ATTACH))
536 error = priv_check(td, PRIV_JAIL_ATTACH);
537 if (error)
538 return (error);
539 mypr = td->td_ucred->cr_prison;
540 if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
541 return (EPERM);
542 if (flags & ~JAIL_SET_MASK)
543 return (EINVAL);
544
545 /*
546 * Check all the parameters before committing to anything. Not all
547 * errors can be caught early, but we may as well try. Also, this
548 * takes care of some expensive stuff (path lookup) before getting
549 * the allprison lock.
550 *
551 * XXX Jails are not filesystems, and jail parameters are not mount
552 * options. But it makes more sense to re-use the vfsopt code
553 * than duplicate it under a different name.
554 */
555 error = vfs_buildopts(optuio, &opts);
556 if (error)
557 return (error);
558 #ifdef INET
559 ip4 = NULL;
560 #endif
561 #ifdef INET6
562 ip6 = NULL;
563 #endif
564 g_path = NULL;
565
566 cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
567 if (!cuflags) {
568 error = EINVAL;
569 vfs_opterror(opts, "no valid operation (create or update)");
570 goto done_errmsg;
571 }
572
573 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
574 if (error == ENOENT)
575 jid = 0;
576 else if (error != 0)
577 goto done_free;
578
579 error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
580 if (error == ENOENT)
581 gotslevel = 0;
582 else if (error != 0)
583 goto done_free;
584 else
585 gotslevel = 1;
586
587 error =
588 vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
589 if (error == ENOENT)
590 gotchildmax = 0;
591 else if (error != 0)
592 goto done_free;
593 else
594 gotchildmax = 1;
595
596 error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
597 if (error == ENOENT)
598 gotenforce = 0;
599 else if (error != 0)
600 goto done_free;
601 else if (enforce < 0 || enforce > 2) {
602 error = EINVAL;
603 goto done_free;
604 } else
605 gotenforce = 1;
606
607 error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
608 if (error == ENOENT)
609 gotrsnum = 0;
610 else if (error != 0)
611 goto done_free;
612 else
613 gotrsnum = 1;
614
615 pr_flags = ch_flags = 0;
616 for (bf = pr_flag_bool;
617 bf < pr_flag_bool + nitems(pr_flag_bool);
618 bf++) {
619 vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
620 vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
621 }
622 ch_flags |= pr_flags;
623 for (jsf = pr_flag_jailsys;
624 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
625 jsf++) {
626 error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
627 if (error == ENOENT)
628 continue;
629 if (error != 0)
630 goto done_free;
631 switch (jsys) {
632 case JAIL_SYS_DISABLE:
633 if (!jsf->disable) {
634 error = EINVAL;
635 goto done_free;
636 }
637 pr_flags |= jsf->disable;
638 break;
639 case JAIL_SYS_NEW:
640 pr_flags |= jsf->new;
641 break;
642 case JAIL_SYS_INHERIT:
643 break;
644 default:
645 error = EINVAL;
646 goto done_free;
647 }
648 ch_flags |= jsf->new | jsf->disable;
649 }
650 if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
651 && !(pr_flags & PR_PERSIST)) {
652 error = EINVAL;
653 vfs_opterror(opts, "new jail must persist or attach");
654 goto done_errmsg;
655 }
656 #ifdef VIMAGE
657 if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
658 error = EINVAL;
659 vfs_opterror(opts, "vnet cannot be changed after creation");
660 goto done_errmsg;
661 }
662 #endif
663 #ifdef INET
664 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
665 error = EINVAL;
666 vfs_opterror(opts, "ip4 cannot be changed after creation");
667 goto done_errmsg;
668 }
669 #endif
670 #ifdef INET6
671 if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
672 error = EINVAL;
673 vfs_opterror(opts, "ip6 cannot be changed after creation");
674 goto done_errmsg;
675 }
676 #endif
677
678 pr_allow = ch_allow = 0;
679 for (bf = pr_flag_allow;
680 bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
681 bf++) {
682 vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
683 vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
684 }
685 ch_allow |= pr_allow;
686
687 error = vfs_getopt(opts, "name", (void **)&name, &len);
688 if (error == ENOENT)
689 name = NULL;
690 else if (error != 0)
691 goto done_free;
692 else {
693 if (len == 0 || name[len - 1] != '\0') {
694 error = EINVAL;
695 goto done_free;
696 }
697 if (len > MAXHOSTNAMELEN) {
698 error = ENAMETOOLONG;
699 goto done_free;
700 }
701 }
702
703 error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
704 if (error == ENOENT)
705 host = NULL;
706 else if (error != 0)
707 goto done_free;
708 else {
709 ch_flags |= PR_HOST;
710 pr_flags |= PR_HOST;
711 if (len == 0 || host[len - 1] != '\0') {
712 error = EINVAL;
713 goto done_free;
714 }
715 if (len > MAXHOSTNAMELEN) {
716 error = ENAMETOOLONG;
717 goto done_free;
718 }
719 }
720
721 error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
722 if (error == ENOENT)
723 domain = NULL;
724 else if (error != 0)
725 goto done_free;
726 else {
727 ch_flags |= PR_HOST;
728 pr_flags |= PR_HOST;
729 if (len == 0 || domain[len - 1] != '\0') {
730 error = EINVAL;
731 goto done_free;
732 }
733 if (len > MAXHOSTNAMELEN) {
734 error = ENAMETOOLONG;
735 goto done_free;
736 }
737 }
738
739 error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
740 if (error == ENOENT)
741 uuid = NULL;
742 else if (error != 0)
743 goto done_free;
744 else {
745 ch_flags |= PR_HOST;
746 pr_flags |= PR_HOST;
747 if (len == 0 || uuid[len - 1] != '\0') {
748 error = EINVAL;
749 goto done_free;
750 }
751 if (len > HOSTUUIDLEN) {
752 error = ENAMETOOLONG;
753 goto done_free;
754 }
755 }
756
757 #ifdef COMPAT_FREEBSD32
758 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
759 uint32_t hid32;
760
761 error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
762 hid = hid32;
763 } else
764 #endif
765 error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
766 if (error == ENOENT)
767 gothid = 0;
768 else if (error != 0)
769 goto done_free;
770 else {
771 gothid = 1;
772 ch_flags |= PR_HOST;
773 pr_flags |= PR_HOST;
774 }
775
776 #ifdef INET
777 error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
778 if (error == ENOENT)
779 ip4s = 0;
780 else if (error != 0)
781 goto done_free;
782 else if (ip4s & (sizeof(*ip4) - 1)) {
783 error = EINVAL;
784 goto done_free;
785 } else {
786 ch_flags |= PR_IP4_USER;
787 pr_flags |= PR_IP4_USER;
788 if (ip4s > 0) {
789 ip4s /= sizeof(*ip4);
790 if (ip4s > jail_max_af_ips) {
791 error = EINVAL;
792 vfs_opterror(opts, "too many IPv4 addresses");
793 goto done_errmsg;
794 }
795 ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
796 bcopy(op, ip4, ip4s * sizeof(*ip4));
797 /*
798 * IP addresses are all sorted but ip[0] to preserve
799 * the primary IP address as given from userland.
800 * This special IP is used for unbound outgoing
801 * connections as well for "loopback" traffic in case
802 * source address selection cannot find any more fitting
803 * address to connect from.
804 */
805 if (ip4s > 1)
806 qsort(ip4 + 1, ip4s - 1, sizeof(*ip4),
807 prison_qcmp_v4);
808 /*
809 * Check for duplicate addresses and do some simple
810 * zero and broadcast checks. If users give other bogus
811 * addresses it is their problem.
812 *
813 * We do not have to care about byte order for these
814 * checks so we will do them in NBO.
815 */
816 for (ii = 0; ii < ip4s; ii++) {
817 if (ip4[ii].s_addr == INADDR_ANY ||
818 ip4[ii].s_addr == INADDR_BROADCAST) {
819 error = EINVAL;
820 goto done_free;
821 }
822 if ((ii+1) < ip4s &&
823 (ip4[0].s_addr == ip4[ii+1].s_addr ||
824 ip4[ii].s_addr == ip4[ii+1].s_addr)) {
825 error = EINVAL;
826 goto done_free;
827 }
828 }
829 }
830 }
831 #endif
832
833 #ifdef INET6
834 error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
835 if (error == ENOENT)
836 ip6s = 0;
837 else if (error != 0)
838 goto done_free;
839 else if (ip6s & (sizeof(*ip6) - 1)) {
840 error = EINVAL;
841 goto done_free;
842 } else {
843 ch_flags |= PR_IP6_USER;
844 pr_flags |= PR_IP6_USER;
845 if (ip6s > 0) {
846 ip6s /= sizeof(*ip6);
847 if (ip6s > jail_max_af_ips) {
848 error = EINVAL;
849 vfs_opterror(opts, "too many IPv6 addresses");
850 goto done_errmsg;
851 }
852 ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
853 bcopy(op, ip6, ip6s * sizeof(*ip6));
854 if (ip6s > 1)
855 qsort(ip6 + 1, ip6s - 1, sizeof(*ip6),
856 prison_qcmp_v6);
857 for (ii = 0; ii < ip6s; ii++) {
858 if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
859 error = EINVAL;
860 goto done_free;
861 }
862 if ((ii+1) < ip6s &&
863 (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
864 IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
865 {
866 error = EINVAL;
867 goto done_free;
868 }
869 }
870 }
871 }
872 #endif
873
874 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
875 if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
876 error = EINVAL;
877 vfs_opterror(opts,
878 "vnet jails cannot have IP address restrictions");
879 goto done_errmsg;
880 }
881 #endif
882
883 error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
884 if (error == ENOENT)
885 osrelstr = NULL;
886 else if (error != 0)
887 goto done_free;
888 else {
889 if (flags & JAIL_UPDATE) {
890 error = EINVAL;
891 vfs_opterror(opts,
892 "osrelease cannot be changed after creation");
893 goto done_errmsg;
894 }
895 if (len == 0 || osrelstr[len - 1] != '\0') {
896 error = EINVAL;
897 goto done_free;
898 }
899 if (len >= OSRELEASELEN) {
900 error = ENAMETOOLONG;
901 vfs_opterror(opts,
902 "osrelease string must be 1-%d bytes long",
903 OSRELEASELEN - 1);
904 goto done_errmsg;
905 }
906 }
907
908 error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
909 if (error == ENOENT)
910 osreldt = 0;
911 else if (error != 0)
912 goto done_free;
913 else {
914 if (flags & JAIL_UPDATE) {
915 error = EINVAL;
916 vfs_opterror(opts,
917 "osreldate cannot be changed after creation");
918 goto done_errmsg;
919 }
920 if (osreldt == 0) {
921 error = EINVAL;
922 vfs_opterror(opts, "osreldate cannot be 0");
923 goto done_errmsg;
924 }
925 }
926
927 fullpath_disabled = 0;
928 root = NULL;
929 error = vfs_getopt(opts, "path", (void **)&path, &len);
930 if (error == ENOENT)
931 path = NULL;
932 else if (error != 0)
933 goto done_free;
934 else {
935 if (flags & JAIL_UPDATE) {
936 error = EINVAL;
937 vfs_opterror(opts,
938 "path cannot be changed after creation");
939 goto done_errmsg;
940 }
941 if (len == 0 || path[len - 1] != '\0') {
942 error = EINVAL;
943 goto done_free;
944 }
945 if (len < 2 || (len == 2 && path[0] == '/'))
946 path = NULL;
947 else
948 {
949 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
950 path, td);
951 error = namei(&nd);
952 if (error)
953 goto done_free;
954 root = nd.ni_vp;
955 NDFREE(&nd, NDF_ONLY_PNBUF);
956 g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
957 strlcpy(g_path, path, MAXPATHLEN);
958 error = vn_path_to_global_path(td, root, g_path,
959 MAXPATHLEN);
960 if (error == 0)
961 path = g_path;
962 else if (error == ENODEV) {
963 /* means sysctl debug.disablefullpath == 1 */
964 fullpath_disabled = 1;
965 } else {
966 /* exit on other errors */
967 goto done_free;
968 }
969 if (root->v_type != VDIR) {
970 error = ENOTDIR;
971 vput(root);
972 goto done_free;
973 }
974 VOP_UNLOCK(root, 0);
975 if (fullpath_disabled) {
976 /* Leave room for a real-root full pathname. */
977 if (len + (path[0] == '/' &&
978 strcmp(mypr->pr_path, "/")
979 ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
980 error = ENAMETOOLONG;
981 vrele(root);
982 goto done_free;
983 }
984 }
985 }
986 }
987
988 /*
989 * Find the specified jail, or at least its parent.
990 * This abuses the file error codes ENOENT and EEXIST.
991 */
992 pr = NULL;
993 ppr = mypr;
994 if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
995 namelc = strrchr(name, '.');
996 jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
997 if (*p != '\0')
998 jid = 0;
999 }
1000 sx_xlock(&allprison_lock);
1001 if (jid != 0) {
1002 /*
1003 * See if a requested jid already exists. There is an
1004 * information leak here if the jid exists but is not within
1005 * the caller's jail hierarchy. Jail creators will get EEXIST
1006 * even though they cannot see the jail, and CREATE | UPDATE
1007 * will return ENOENT which is not normally a valid error.
1008 */
1009 if (jid < 0) {
1010 error = EINVAL;
1011 vfs_opterror(opts, "negative jid");
1012 goto done_unlock_list;
1013 }
1014 pr = prison_find(jid);
1015 if (pr != NULL) {
1016 ppr = pr->pr_parent;
1017 /* Create: jid must not exist. */
1018 if (cuflags == JAIL_CREATE) {
1019 mtx_unlock(&pr->pr_mtx);
1020 error = EEXIST;
1021 vfs_opterror(opts, "jail %d already exists",
1022 jid);
1023 goto done_unlock_list;
1024 }
1025 if (!prison_ischild(mypr, pr)) {
1026 mtx_unlock(&pr->pr_mtx);
1027 pr = NULL;
1028 } else if (pr->pr_uref == 0) {
1029 if (!(flags & JAIL_DYING)) {
1030 mtx_unlock(&pr->pr_mtx);
1031 error = ENOENT;
1032 vfs_opterror(opts, "jail %d is dying",
1033 jid);
1034 goto done_unlock_list;
1035 } else if ((flags & JAIL_ATTACH) ||
1036 (pr_flags & PR_PERSIST)) {
1037 /*
1038 * A dying jail might be resurrected
1039 * (via attach or persist), but first
1040 * it must determine if another jail
1041 * has claimed its name. Accomplish
1042 * this by implicitly re-setting the
1043 * name.
1044 */
1045 if (name == NULL)
1046 name = prison_name(mypr, pr);
1047 }
1048 }
1049 }
1050 if (pr == NULL) {
1051 /* Update: jid must exist. */
1052 if (cuflags == JAIL_UPDATE) {
1053 error = ENOENT;
1054 vfs_opterror(opts, "jail %d not found", jid);
1055 goto done_unlock_list;
1056 }
1057 }
1058 }
1059 /*
1060 * If the caller provided a name, look for a jail by that name.
1061 * This has different semantics for creates and updates keyed by jid
1062 * (where the name must not already exist in a different jail),
1063 * and updates keyed by the name itself (where the name must exist
1064 * because that is the jail being updated).
1065 */
1066 namelc = NULL;
1067 if (name != NULL) {
1068 namelc = strrchr(name, '.');
1069 if (namelc == NULL)
1070 namelc = name;
1071 else {
1072 /*
1073 * This is a hierarchical name. Split it into the
1074 * parent and child names, and make sure the parent
1075 * exists or matches an already found jail.
1076 */
1077 if (pr != NULL) {
1078 if (strncmp(name, ppr->pr_name, namelc - name)
1079 || ppr->pr_name[namelc - name] != '\0') {
1080 mtx_unlock(&pr->pr_mtx);
1081 error = EINVAL;
1082 vfs_opterror(opts,
1083 "cannot change jail's parent");
1084 goto done_unlock_list;
1085 }
1086 } else {
1087 *namelc = '\0';
1088 ppr = prison_find_name(mypr, name);
1089 if (ppr == NULL) {
1090 error = ENOENT;
1091 vfs_opterror(opts,
1092 "jail \"%s\" not found", name);
1093 goto done_unlock_list;
1094 }
1095 mtx_unlock(&ppr->pr_mtx);
1096 *namelc = '.';
1097 }
1098 namelc++;
1099 }
1100 if (namelc[0] != '\0') {
1101 pnamelen =
1102 (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1103 name_again:
1104 deadpr = NULL;
1105 FOREACH_PRISON_CHILD(ppr, tpr) {
1106 if (tpr != pr && tpr->pr_ref > 0 &&
1107 !strcmp(tpr->pr_name + pnamelen, namelc)) {
1108 if (pr == NULL &&
1109 cuflags != JAIL_CREATE) {
1110 mtx_lock(&tpr->pr_mtx);
1111 if (tpr->pr_ref > 0) {
1112 /*
1113 * Use this jail
1114 * for updates.
1115 */
1116 if (tpr->pr_uref > 0) {
1117 pr = tpr;
1118 break;
1119 }
1120 deadpr = tpr;
1121 }
1122 mtx_unlock(&tpr->pr_mtx);
1123 } else if (tpr->pr_uref > 0) {
1124 /*
1125 * Create, or update(jid):
1126 * name must not exist in an
1127 * active sibling jail.
1128 */
1129 error = EEXIST;
1130 if (pr != NULL)
1131 mtx_unlock(&pr->pr_mtx);
1132 vfs_opterror(opts,
1133 "jail \"%s\" already exists",
1134 name);
1135 goto done_unlock_list;
1136 }
1137 }
1138 }
1139 /* If no active jail is found, use a dying one. */
1140 if (deadpr != NULL && pr == NULL) {
1141 if (flags & JAIL_DYING) {
1142 mtx_lock(&deadpr->pr_mtx);
1143 if (deadpr->pr_ref == 0) {
1144 mtx_unlock(&deadpr->pr_mtx);
1145 goto name_again;
1146 }
1147 pr = deadpr;
1148 } else if (cuflags == JAIL_UPDATE) {
1149 error = ENOENT;
1150 vfs_opterror(opts,
1151 "jail \"%s\" is dying", name);
1152 goto done_unlock_list;
1153 }
1154 }
1155 /* Update: name must exist if no jid. */
1156 else if (cuflags == JAIL_UPDATE && pr == NULL) {
1157 error = ENOENT;
1158 vfs_opterror(opts, "jail \"%s\" not found",
1159 name);
1160 goto done_unlock_list;
1161 }
1162 }
1163 }
1164 /* Update: must provide a jid or name. */
1165 else if (cuflags == JAIL_UPDATE && pr == NULL) {
1166 error = ENOENT;
1167 vfs_opterror(opts, "update specified no jail");
1168 goto done_unlock_list;
1169 }
1170
1171 /* If there's no prison to update, create a new one and link it in. */
1172 if (pr == NULL) {
1173 for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1174 if (tpr->pr_childcount >= tpr->pr_childmax) {
1175 error = EPERM;
1176 vfs_opterror(opts, "prison limit exceeded");
1177 goto done_unlock_list;
1178 }
1179 created = 1;
1180 mtx_lock(&ppr->pr_mtx);
1181 if (ppr->pr_ref == 0) {
1182 mtx_unlock(&ppr->pr_mtx);
1183 error = ENOENT;
1184 vfs_opterror(opts, "jail \"%s\" not found",
1185 prison_name(mypr, ppr));
1186 goto done_unlock_list;
1187 }
1188 ppr->pr_ref++;
1189 ppr->pr_uref++;
1190 mtx_unlock(&ppr->pr_mtx);
1191 pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1192 if (jid == 0) {
1193 /* Find the next free jid. */
1194 jid = lastprid + 1;
1195 findnext:
1196 if (jid == JAIL_MAX)
1197 jid = 1;
1198 TAILQ_FOREACH(tpr, &allprison, pr_list) {
1199 if (tpr->pr_id < jid)
1200 continue;
1201 if (tpr->pr_id > jid || tpr->pr_ref == 0) {
1202 TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1203 break;
1204 }
1205 if (jid == lastprid) {
1206 error = EAGAIN;
1207 vfs_opterror(opts,
1208 "no available jail IDs");
1209 free(pr, M_PRISON);
1210 prison_deref(ppr, PD_DEREF |
1211 PD_DEUREF | PD_LIST_XLOCKED);
1212 goto done_releroot;
1213 }
1214 jid++;
1215 goto findnext;
1216 }
1217 lastprid = jid;
1218 } else {
1219 /*
1220 * The jail already has a jid (that did not yet exist),
1221 * so just find where to insert it.
1222 */
1223 TAILQ_FOREACH(tpr, &allprison, pr_list)
1224 if (tpr->pr_id >= jid) {
1225 TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
1226 break;
1227 }
1228 }
1229 if (tpr == NULL)
1230 TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1231 LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1232 for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1233 tpr->pr_childcount++;
1234
1235 pr->pr_parent = ppr;
1236 pr->pr_id = jid;
1237
1238 /* Set some default values, and inherit some from the parent. */
1239 if (namelc == NULL)
1240 namelc = "";
1241 if (path == NULL) {
1242 path = "/";
1243 root = mypr->pr_root;
1244 vref(root);
1245 }
1246 strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1247 pr->pr_flags |= PR_HOST;
1248 #if defined(INET) || defined(INET6)
1249 #ifdef VIMAGE
1250 if (!(pr_flags & PR_VNET))
1251 #endif
1252 {
1253 #ifdef INET
1254 if (!(ch_flags & PR_IP4_USER))
1255 pr->pr_flags |= PR_IP4 | PR_IP4_USER;
1256 else if (!(pr_flags & PR_IP4_USER)) {
1257 pr->pr_flags |= ppr->pr_flags & PR_IP4;
1258 if (ppr->pr_ip4 != NULL) {
1259 pr->pr_ip4s = ppr->pr_ip4s;
1260 pr->pr_ip4 = malloc(pr->pr_ip4s *
1261 sizeof(struct in_addr), M_PRISON,
1262 M_WAITOK);
1263 bcopy(ppr->pr_ip4, pr->pr_ip4,
1264 pr->pr_ip4s * sizeof(*pr->pr_ip4));
1265 }
1266 }
1267 #endif
1268 #ifdef INET6
1269 if (!(ch_flags & PR_IP6_USER))
1270 pr->pr_flags |= PR_IP6 | PR_IP6_USER;
1271 else if (!(pr_flags & PR_IP6_USER)) {
1272 pr->pr_flags |= ppr->pr_flags & PR_IP6;
1273 if (ppr->pr_ip6 != NULL) {
1274 pr->pr_ip6s = ppr->pr_ip6s;
1275 pr->pr_ip6 = malloc(pr->pr_ip6s *
1276 sizeof(struct in6_addr), M_PRISON,
1277 M_WAITOK);
1278 bcopy(ppr->pr_ip6, pr->pr_ip6,
1279 pr->pr_ip6s * sizeof(*pr->pr_ip6));
1280 }
1281 }
1282 #endif
1283 }
1284 #endif
1285 /* Source address selection is always on by default. */
1286 pr->pr_flags |= _PR_IP_SADDRSEL;
1287
1288 pr->pr_securelevel = ppr->pr_securelevel;
1289 pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1290 pr->pr_enforce_statfs = jail_default_enforce_statfs;
1291 pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1292
1293 pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1294 if (osrelstr == NULL)
1295 strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
1296 sizeof(pr->pr_osrelease));
1297 else
1298 strlcpy(pr->pr_osrelease, osrelstr,
1299 sizeof(pr->pr_osrelease));
1300
1301 LIST_INIT(&pr->pr_children);
1302 mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1303 TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1304
1305 #ifdef VIMAGE
1306 /* Allocate a new vnet if specified. */
1307 pr->pr_vnet = (pr_flags & PR_VNET)
1308 ? vnet_alloc() : ppr->pr_vnet;
1309 #endif
1310 /*
1311 * Allocate a dedicated cpuset for each jail.
1312 * Unlike other initial settings, this may return an error.
1313 */
1314 error = cpuset_create_root(ppr, &pr->pr_cpuset);
1315 if (error) {
1316 prison_deref(pr, PD_LIST_XLOCKED);
1317 goto done_releroot;
1318 }
1319
1320 mtx_lock(&pr->pr_mtx);
1321 /*
1322 * New prisons do not yet have a reference, because we do not
1323 * want others to see the incomplete prison once the
1324 * allprison_lock is downgraded.
1325 */
1326 } else {
1327 created = 0;
1328 /*
1329 * Grab a reference for existing prisons, to ensure they
1330 * continue to exist for the duration of the call.
1331 */
1332 pr->pr_ref++;
1333 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1334 if ((pr->pr_flags & PR_VNET) &&
1335 (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1336 error = EINVAL;
1337 vfs_opterror(opts,
1338 "vnet jails cannot have IP address restrictions");
1339 goto done_deref_locked;
1340 }
1341 #endif
1342 #ifdef INET
1343 if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1344 error = EINVAL;
1345 vfs_opterror(opts,
1346 "ip4 cannot be changed after creation");
1347 goto done_deref_locked;
1348 }
1349 #endif
1350 #ifdef INET6
1351 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1352 error = EINVAL;
1353 vfs_opterror(opts,
1354 "ip6 cannot be changed after creation");
1355 goto done_deref_locked;
1356 }
1357 #endif
1358 }
1359
1360 /* Do final error checking before setting anything. */
1361 if (gotslevel) {
1362 if (slevel < ppr->pr_securelevel) {
1363 error = EPERM;
1364 goto done_deref_locked;
1365 }
1366 }
1367 if (gotchildmax) {
1368 if (childmax >= ppr->pr_childmax) {
1369 error = EPERM;
1370 goto done_deref_locked;
1371 }
1372 }
1373 if (gotenforce) {
1374 if (enforce < ppr->pr_enforce_statfs) {
1375 error = EPERM;
1376 goto done_deref_locked;
1377 }
1378 }
1379 if (gotrsnum) {
1380 /*
1381 * devfs_rsnum is a uint16_t
1382 */
1383 if (rsnum < 0 || rsnum > 65535) {
1384 error = EINVAL;
1385 goto done_deref_locked;
1386 }
1387 /*
1388 * Nested jails always inherit parent's devfs ruleset
1389 */
1390 if (jailed(td->td_ucred)) {
1391 if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1392 error = EPERM;
1393 goto done_deref_locked;
1394 } else
1395 rsnum = ppr->pr_devfs_rsnum;
1396 }
1397 }
1398 #ifdef INET
1399 if (ip4s > 0) {
1400 if (ppr->pr_flags & PR_IP4) {
1401 /*
1402 * Make sure the new set of IP addresses is a
1403 * subset of the parent's list. Don't worry
1404 * about the parent being unlocked, as any
1405 * setting is done with allprison_lock held.
1406 */
1407 for (ij = 0; ij < ppr->pr_ip4s; ij++)
1408 if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
1409 break;
1410 if (ij == ppr->pr_ip4s) {
1411 error = EPERM;
1412 goto done_deref_locked;
1413 }
1414 if (ip4s > 1) {
1415 for (ii = ij = 1; ii < ip4s; ii++) {
1416 if (ip4[ii].s_addr ==
1417 ppr->pr_ip4[0].s_addr)
1418 continue;
1419 for (; ij < ppr->pr_ip4s; ij++)
1420 if (ip4[ii].s_addr ==
1421 ppr->pr_ip4[ij].s_addr)
1422 break;
1423 if (ij == ppr->pr_ip4s)
1424 break;
1425 }
1426 if (ij == ppr->pr_ip4s) {
1427 error = EPERM;
1428 goto done_deref_locked;
1429 }
1430 }
1431 }
1432 /*
1433 * Check for conflicting IP addresses. We permit them
1434 * if there is no more than one IP on each jail. If
1435 * there is a duplicate on a jail with more than one
1436 * IP stop checking and return error.
1437 */
1438 #ifdef VIMAGE
1439 for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
1440 if (tppr->pr_flags & PR_VNET)
1441 break;
1442 #else
1443 tppr = &prison0;
1444 #endif
1445 FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1446 if (tpr == pr ||
1447 #ifdef VIMAGE
1448 (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1449 #endif
1450 tpr->pr_uref == 0) {
1451 descend = 0;
1452 continue;
1453 }
1454 if (!(tpr->pr_flags & PR_IP4_USER))
1455 continue;
1456 descend = 0;
1457 if (tpr->pr_ip4 == NULL ||
1458 (ip4s == 1 && tpr->pr_ip4s == 1))
1459 continue;
1460 for (ii = 0; ii < ip4s; ii++) {
1461 if (prison_check_ip4_locked(tpr, &ip4[ii]) ==
1462 0) {
1463 error = EADDRINUSE;
1464 vfs_opterror(opts,
1465 "IPv4 addresses clash");
1466 goto done_deref_locked;
1467 }
1468 }
1469 }
1470 }
1471 #endif
1472 #ifdef INET6
1473 if (ip6s > 0) {
1474 if (ppr->pr_flags & PR_IP6) {
1475 /*
1476 * Make sure the new set of IP addresses is a
1477 * subset of the parent's list.
1478 */
1479 for (ij = 0; ij < ppr->pr_ip6s; ij++)
1480 if (IN6_ARE_ADDR_EQUAL(&ip6[0],
1481 &ppr->pr_ip6[ij]))
1482 break;
1483 if (ij == ppr->pr_ip6s) {
1484 error = EPERM;
1485 goto done_deref_locked;
1486 }
1487 if (ip6s > 1) {
1488 for (ii = ij = 1; ii < ip6s; ii++) {
1489 if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
1490 &ppr->pr_ip6[0]))
1491 continue;
1492 for (; ij < ppr->pr_ip6s; ij++)
1493 if (IN6_ARE_ADDR_EQUAL(
1494 &ip6[ii], &ppr->pr_ip6[ij]))
1495 break;
1496 if (ij == ppr->pr_ip6s)
1497 break;
1498 }
1499 if (ij == ppr->pr_ip6s) {
1500 error = EPERM;
1501 goto done_deref_locked;
1502 }
1503 }
1504 }
1505 /* Check for conflicting IP addresses. */
1506 #ifdef VIMAGE
1507 for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
1508 if (tppr->pr_flags & PR_VNET)
1509 break;
1510 #else
1511 tppr = &prison0;
1512 #endif
1513 FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
1514 if (tpr == pr ||
1515 #ifdef VIMAGE
1516 (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
1517 #endif
1518 tpr->pr_uref == 0) {
1519 descend = 0;
1520 continue;
1521 }
1522 if (!(tpr->pr_flags & PR_IP6_USER))
1523 continue;
1524 descend = 0;
1525 if (tpr->pr_ip6 == NULL ||
1526 (ip6s == 1 && tpr->pr_ip6s == 1))
1527 continue;
1528 for (ii = 0; ii < ip6s; ii++) {
1529 if (prison_check_ip6_locked(tpr, &ip6[ii]) ==
1530 0) {
1531 error = EADDRINUSE;
1532 vfs_opterror(opts,
1533 "IPv6 addresses clash");
1534 goto done_deref_locked;
1535 }
1536 }
1537 }
1538 }
1539 #endif
1540 onamelen = namelen = 0;
1541 if (namelc != NULL) {
1542 /* Give a default name of the jid. Also allow the name to be
1543 * explicitly the jid - but not any other number, and only in
1544 * normal form (no leading zero/etc).
1545 */
1546 if (namelc[0] == '\0')
1547 snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1548 else if ((strtoul(namelc, &p, 10) != jid ||
1549 namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1550 error = EINVAL;
1551 vfs_opterror(opts,
1552 "name cannot be numeric (unless it is the jid)");
1553 goto done_deref_locked;
1554 }
1555 /*
1556 * Make sure the name isn't too long for the prison or its
1557 * children.
1558 */
1559 pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1560 onamelen = strlen(pr->pr_name + pnamelen);
1561 namelen = strlen(namelc);
1562 if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1563 error = ENAMETOOLONG;
1564 goto done_deref_locked;
1565 }
1566 FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1567 if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1568 sizeof(pr->pr_name)) {
1569 error = ENAMETOOLONG;
1570 goto done_deref_locked;
1571 }
1572 }
1573 }
1574 if (pr_allow & ~ppr->pr_allow) {
1575 error = EPERM;
1576 goto done_deref_locked;
1577 }
1578
1579 /*
1580 * Let modules check their parameters. This requires unlocking and
1581 * then re-locking the prison, but this is still a valid state as long
1582 * as allprison_lock remains xlocked.
1583 */
1584 mtx_unlock(&pr->pr_mtx);
1585 error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1586 if (error != 0) {
1587 prison_deref(pr, created
1588 ? PD_LIST_XLOCKED
1589 : PD_DEREF | PD_LIST_XLOCKED);
1590 goto done_releroot;
1591 }
1592 mtx_lock(&pr->pr_mtx);
1593
1594 /* At this point, all valid parameters should have been noted. */
1595 TAILQ_FOREACH(opt, opts, link) {
1596 if (!opt->seen && strcmp(opt->name, "errmsg")) {
1597 error = EINVAL;
1598 vfs_opterror(opts, "unknown parameter: %s", opt->name);
1599 goto done_deref_locked;
1600 }
1601 }
1602
1603 /* Set the parameters of the prison. */
1604 #ifdef INET
1605 redo_ip4 = 0;
1606 if (pr_flags & PR_IP4_USER) {
1607 pr->pr_flags |= PR_IP4;
1608 free(pr->pr_ip4, M_PRISON);
1609 pr->pr_ip4s = ip4s;
1610 pr->pr_ip4 = ip4;
1611 ip4 = NULL;
1612 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1613 #ifdef VIMAGE
1614 if (tpr->pr_flags & PR_VNET) {
1615 descend = 0;
1616 continue;
1617 }
1618 #endif
1619 if (prison_restrict_ip4(tpr, NULL)) {
1620 redo_ip4 = 1;
1621 descend = 0;
1622 }
1623 }
1624 }
1625 #endif
1626 #ifdef INET6
1627 redo_ip6 = 0;
1628 if (pr_flags & PR_IP6_USER) {
1629 pr->pr_flags |= PR_IP6;
1630 free(pr->pr_ip6, M_PRISON);
1631 pr->pr_ip6s = ip6s;
1632 pr->pr_ip6 = ip6;
1633 ip6 = NULL;
1634 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1635 #ifdef VIMAGE
1636 if (tpr->pr_flags & PR_VNET) {
1637 descend = 0;
1638 continue;
1639 }
1640 #endif
1641 if (prison_restrict_ip6(tpr, NULL)) {
1642 redo_ip6 = 1;
1643 descend = 0;
1644 }
1645 }
1646 }
1647 #endif
1648 if (gotslevel) {
1649 pr->pr_securelevel = slevel;
1650 /* Set all child jails to be at least this level. */
1651 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1652 if (tpr->pr_securelevel < slevel)
1653 tpr->pr_securelevel = slevel;
1654 }
1655 if (gotchildmax) {
1656 pr->pr_childmax = childmax;
1657 /* Set all child jails to under this limit. */
1658 FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1659 if (tpr->pr_childmax > childmax - level)
1660 tpr->pr_childmax = childmax > level
1661 ? childmax - level : 0;
1662 }
1663 if (gotenforce) {
1664 pr->pr_enforce_statfs = enforce;
1665 /* Pass this restriction on to the children. */
1666 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1667 if (tpr->pr_enforce_statfs < enforce)
1668 tpr->pr_enforce_statfs = enforce;
1669 }
1670 if (gotrsnum) {
1671 pr->pr_devfs_rsnum = rsnum;
1672 /* Pass this restriction on to the children. */
1673 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1674 tpr->pr_devfs_rsnum = rsnum;
1675 }
1676 if (namelc != NULL) {
1677 if (ppr == &prison0)
1678 strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
1679 else
1680 snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1681 ppr->pr_name, namelc);
1682 /* Change this component of child names. */
1683 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1684 bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1685 strlen(tpr->pr_name + onamelen) + 1);
1686 bcopy(pr->pr_name, tpr->pr_name, namelen);
1687 }
1688 }
1689 if (path != NULL) {
1690 /* Try to keep a real-rooted full pathname. */
1691 if (fullpath_disabled && path[0] == '/' &&
1692 strcmp(mypr->pr_path, "/"))
1693 snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
1694 mypr->pr_path, path);
1695 else
1696 strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1697 pr->pr_root = root;
1698 }
1699 if (PR_HOST & ch_flags & ~pr_flags) {
1700 if (pr->pr_flags & PR_HOST) {
1701 /*
1702 * Copy the parent's host info. As with pr_ip4 above,
1703 * the lack of a lock on the parent is not a problem;
1704 * it is always set with allprison_lock at least
1705 * shared, and is held exclusively here.
1706 */
1707 strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1708 sizeof(pr->pr_hostname));
1709 strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1710 sizeof(pr->pr_domainname));
1711 strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1712 sizeof(pr->pr_hostuuid));
1713 pr->pr_hostid = pr->pr_parent->pr_hostid;
1714 }
1715 } else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1716 /* Set this prison, and any descendants without PR_HOST. */
1717 if (host != NULL)
1718 strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1719 if (domain != NULL)
1720 strlcpy(pr->pr_domainname, domain,
1721 sizeof(pr->pr_domainname));
1722 if (uuid != NULL)
1723 strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1724 if (gothid)
1725 pr->pr_hostid = hid;
1726 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1727 if (tpr->pr_flags & PR_HOST)
1728 descend = 0;
1729 else {
1730 if (host != NULL)
1731 strlcpy(tpr->pr_hostname,
1732 pr->pr_hostname,
1733 sizeof(tpr->pr_hostname));
1734 if (domain != NULL)
1735 strlcpy(tpr->pr_domainname,
1736 pr->pr_domainname,
1737 sizeof(tpr->pr_domainname));
1738 if (uuid != NULL)
1739 strlcpy(tpr->pr_hostuuid,
1740 pr->pr_hostuuid,
1741 sizeof(tpr->pr_hostuuid));
1742 if (gothid)
1743 tpr->pr_hostid = hid;
1744 }
1745 }
1746 }
1747 if ((tallow = ch_allow & ~pr_allow)) {
1748 /* Clear allow bits in all children. */
1749 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1750 tpr->pr_allow &= ~tallow;
1751 }
1752 pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
1753 /*
1754 * Persistent prisons get an extra reference, and prisons losing their
1755 * persist flag lose that reference. Only do this for existing prisons
1756 * for now, so new ones will remain unseen until after the module
1757 * handlers have completed.
1758 */
1759 born = pr->pr_uref == 0;
1760 if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
1761 if (pr_flags & PR_PERSIST) {
1762 pr->pr_ref++;
1763 pr->pr_uref++;
1764 } else {
1765 pr->pr_ref--;
1766 pr->pr_uref--;
1767 }
1768 }
1769 pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
1770 pr->pr_flags &= ~PR_REMOVE;
1771 mtx_unlock(&pr->pr_mtx);
1772
1773 #ifdef RACCT
1774 if (racct_enable && created)
1775 prison_racct_attach(pr);
1776 #endif
1777
1778 /* Locks may have prevented a complete restriction of child IP
1779 * addresses. If so, allocate some more memory and try again.
1780 */
1781 #ifdef INET
1782 while (redo_ip4) {
1783 ip4s = pr->pr_ip4s;
1784 ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
1785 mtx_lock(&pr->pr_mtx);
1786 redo_ip4 = 0;
1787 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1788 #ifdef VIMAGE
1789 if (tpr->pr_flags & PR_VNET) {
1790 descend = 0;
1791 continue;
1792 }
1793 #endif
1794 if (prison_restrict_ip4(tpr, ip4)) {
1795 if (ip4 != NULL)
1796 ip4 = NULL;
1797 else
1798 redo_ip4 = 1;
1799 }
1800 }
1801 mtx_unlock(&pr->pr_mtx);
1802 }
1803 #endif
1804 #ifdef INET6
1805 while (redo_ip6) {
1806 ip6s = pr->pr_ip6s;
1807 ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
1808 mtx_lock(&pr->pr_mtx);
1809 redo_ip6 = 0;
1810 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1811 #ifdef VIMAGE
1812 if (tpr->pr_flags & PR_VNET) {
1813 descend = 0;
1814 continue;
1815 }
1816 #endif
1817 if (prison_restrict_ip6(tpr, ip6)) {
1818 if (ip6 != NULL)
1819 ip6 = NULL;
1820 else
1821 redo_ip6 = 1;
1822 }
1823 }
1824 mtx_unlock(&pr->pr_mtx);
1825 }
1826 #endif
1827
1828 /* Let the modules do their work. */
1829 sx_downgrade(&allprison_lock);
1830 if (born) {
1831 error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
1832 if (error) {
1833 (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
1834 prison_deref(pr, created
1835 ? PD_LIST_SLOCKED
1836 : PD_DEREF | PD_LIST_SLOCKED);
1837 goto done_errmsg;
1838 }
1839 }
1840 error = osd_jail_call(pr, PR_METHOD_SET, opts);
1841 if (error) {
1842 if (born)
1843 (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
1844 prison_deref(pr, created
1845 ? PD_LIST_SLOCKED
1846 : PD_DEREF | PD_LIST_SLOCKED);
1847 goto done_errmsg;
1848 }
1849
1850 /* Attach this process to the prison if requested. */
1851 if (flags & JAIL_ATTACH) {
1852 mtx_lock(&pr->pr_mtx);
1853 error = do_jail_attach(td, pr);
1854 if (error) {
1855 vfs_opterror(opts, "attach failed");
1856 if (!created)
1857 prison_deref(pr, PD_DEREF);
1858 goto done_errmsg;
1859 }
1860 }
1861
1862 #ifdef RACCT
1863 if (racct_enable && !created) {
1864 if (!(flags & JAIL_ATTACH))
1865 sx_sunlock(&allprison_lock);
1866 prison_racct_modify(pr);
1867 if (!(flags & JAIL_ATTACH))
1868 sx_slock(&allprison_lock);
1869 }
1870 #endif
1871
1872 td->td_retval[0] = pr->pr_id;
1873
1874 /*
1875 * Now that it is all there, drop the temporary reference from existing
1876 * prisons. Or add a reference to newly created persistent prisons
1877 * (which was not done earlier so that the prison would not be publicly
1878 * visible).
1879 */
1880 if (!created) {
1881 prison_deref(pr, (flags & JAIL_ATTACH)
1882 ? PD_DEREF
1883 : PD_DEREF | PD_LIST_SLOCKED);
1884 } else {
1885 if (pr_flags & PR_PERSIST) {
1886 mtx_lock(&pr->pr_mtx);
1887 pr->pr_ref++;
1888 pr->pr_uref++;
1889 mtx_unlock(&pr->pr_mtx);
1890 }
1891 if (!(flags & JAIL_ATTACH))
1892 sx_sunlock(&allprison_lock);
1893 }
1894
1895 goto done_free;
1896
1897 done_deref_locked:
1898 prison_deref(pr, created
1899 ? PD_LOCKED | PD_LIST_XLOCKED
1900 : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
1901 goto done_releroot;
1902 done_unlock_list:
1903 sx_xunlock(&allprison_lock);
1904 done_releroot:
1905 if (root != NULL)
1906 vrele(root);
1907 done_errmsg:
1908 if (error) {
1909 if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
1910 &errmsg_len) == 0 && errmsg_len > 0) {
1911 errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
1912 if (optuio->uio_segflg == UIO_SYSSPACE)
1913 bcopy(errmsg,
1914 optuio->uio_iov[errmsg_pos].iov_base,
1915 errmsg_len);
1916 else
1917 copyout(errmsg,
1918 optuio->uio_iov[errmsg_pos].iov_base,
1919 errmsg_len);
1920 }
1921 }
1922 done_free:
1923 #ifdef INET
1924 free(ip4, M_PRISON);
1925 #endif
1926 #ifdef INET6
1927 free(ip6, M_PRISON);
1928 #endif
1929 if (g_path != NULL)
1930 free(g_path, M_TEMP);
1931 vfs_freeopts(opts);
1932 return (error);
1933 }
1934
1935
1936 /*
1937 * struct jail_get_args {
1938 * struct iovec *iovp;
1939 * unsigned int iovcnt;
1940 * int flags;
1941 * };
1942 */
1943 int
1944 sys_jail_get(struct thread *td, struct jail_get_args *uap)
1945 {
1946 struct uio *auio;
1947 int error;
1948
1949 /* Check that we have an even number of iovecs. */
1950 if (uap->iovcnt & 1)
1951 return (EINVAL);
1952
1953 error = copyinuio(uap->iovp, uap->iovcnt, &auio);
1954 if (error)
1955 return (error);
1956 error = kern_jail_get(td, auio, uap->flags);
1957 if (error == 0)
1958 error = copyout(auio->uio_iov, uap->iovp,
1959 uap->iovcnt * sizeof (struct iovec));
1960 free(auio, M_IOV);
1961 return (error);
1962 }
1963
/*
 * Look up one prison and report its parameters through an option list.
 *
 * td     - calling thread; its credential's prison (mypr) is the starting
 *          point for every lookup, and td_retval[0] receives the jid of the
 *          prison that was found.
 * optuio - uio whose iovecs carry the options; it is parsed with
 *          vfs_buildopts() and the value iovecs are rewritten on success.
 * flags  - any bit outside JAIL_GET_MASK is rejected with EINVAL;
 *          JAIL_DYING additionally allows prisons with pr_uref == 0.
 *
 * The prison is selected by the first of "lastjid", "jid" (non-zero) or
 * "name" that is present.  Returns 0 or an errno value; on error, if an
 * "errmsg" option was supplied, its text is copied to that option's value
 * buffer.
 */
1964 int
1965 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
1966 {
1967 struct bool_flags *bf;
1968 struct jailsys_flags *jsf;
1969 struct prison *pr, *mypr;
1970 struct vfsopt *opt;
1971 struct vfsoptlist *opts;
1972 char *errmsg, *name;
1973 int error, errmsg_len, errmsg_pos, i, jid, len, locked, pos;
1974 unsigned f;
1975
1976 if (flags & ~JAIL_GET_MASK)
1977 return (EINVAL);
1978
1979 /* Get the parameter list. */
1980 error = vfs_buildopts(optuio, &opts);
1981 if (error)
1982 return (error);
/*
 * Remember where "errmsg" sits: it is skipped when values are written
 * back and is the target of the error text in done_errmsg.
 */
1983 errmsg_pos = vfs_getopt_pos(opts, "errmsg");
1984 mypr = td->td_ucred->cr_prison;
1985
1986 /*
1987 * Find the prison specified by one of: lastjid, jid, name.
1988 */
1989 sx_slock(&allprison_lock);
1990 error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
1991 if (error == 0) {
/*
 * Walk allprison for the first prison with an id above lastjid for
 * which prison_ischild(mypr, pr) holds and which is still referenced
 * (and still has user references unless JAIL_DYING was given).  The
 * loop exits via break with pr->pr_mtx held.
 */
1992 TAILQ_FOREACH(pr, &allprison, pr_list) {
1993 if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
1994 mtx_lock(&pr->pr_mtx);
1995 if (pr->pr_ref > 0 &&
1996 (pr->pr_uref > 0 || (flags & JAIL_DYING)))
1997 break;
1998 mtx_unlock(&pr->pr_mtx);
1999 }
2000 }
2001 if (pr != NULL)
2002 goto found_prison;
2003 error = ENOENT;
2004 vfs_opterror(opts, "no jail after %d", jid);
2005 goto done_unlock_list;
/*
 * NOTE(review): ENOENT from vfs_copyopt()/vfs_getopt() is treated as
 * "try the next selector", presumably meaning the option is absent --
 * confirm.  Any other error aborts.
 */
2006 } else if (error != ENOENT)
2007 goto done_unlock_list;
2008
2009 error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
2010 if (error == 0) {
/* A jid of zero is not a selector; fall through to the name lookup. */
2011 if (jid != 0) {
/*
 * NOTE(review): prison_find_child() appears to return the prison
 * with pr_mtx held (the dying path below unlocks it) -- confirm.
 */
2012 pr = prison_find_child(mypr, jid);
2013 if (pr != NULL) {
2014 if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2015 mtx_unlock(&pr->pr_mtx);
2016 error = ENOENT;
2017 vfs_opterror(opts, "jail %d is dying",
2018 jid);
2019 goto done_unlock_list;
2020 }
2021 goto found_prison;
2022 }
2023 error = ENOENT;
2024 vfs_opterror(opts, "jail %d not found", jid);
2025 goto done_unlock_list;
2026 }
2027 } else if (error != ENOENT)
2028 goto done_unlock_list;
2029
2030 error = vfs_getopt(opts, "name", (void **)&name, &len);
2031 if (error == 0) {
/* The name must be a non-empty, NUL-terminated string. */
2032 if (len == 0 || name[len - 1] != '\0') {
2033 error = EINVAL;
2034 goto done_unlock_list;
2035 }
/*
 * NOTE(review): as with prison_find_child(), prison_find_name() appears
 * to return with pr_mtx held -- confirm.
 */
2036 pr = prison_find_name(mypr, name);
2037 if (pr != NULL) {
2038 if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
2039 mtx_unlock(&pr->pr_mtx);
2040 error = ENOENT;
2041 vfs_opterror(opts, "jail \"%s\" is dying",
2042 name);
2043 goto done_unlock_list;
2044 }
2045 goto found_prison;
2046 }
2047 error = ENOENT;
2048 vfs_opterror(opts, "jail \"%s\" not found", name);
2049 goto done_unlock_list;
2050 } else if (error != ENOENT)
2051 goto done_unlock_list;
2052
/* None of lastjid, jid or name selected a prison. */
2053 vfs_opterror(opts, "no jail specified");
2054 error = ENOENT;
2055 goto done_unlock_list;
2056
2057 found_prison:
2058 /* Get the parameters of the prison. */
/*
 * Take a reference for the duration of the call.  "locked" records
 * whether pr_mtx is held so that done_deref can pass the right flags to
 * prison_deref().
 *
 * NOTE(review): every vfs_setopt*() below tolerates ENOENT; presumably
 * that means the caller did not ask for that parameter -- confirm.
 */
2059 pr->pr_ref++;
2060 locked = PD_LOCKED;
2061 td->td_retval[0] = pr->pr_id;
2062 error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
2063 if (error != 0 && error != ENOENT)
2064 goto done_deref;
/* The parent is reported as 0 when it is the caller's own prison. */
2065 i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
2066 error = vfs_setopt(opts, "parent", &i, sizeof(i));
2067 if (error != 0 && error != ENOENT)
2068 goto done_deref;
2069 error = vfs_setopts(opts, "name", prison_name(mypr, pr));
2070 if (error != 0 && error != ENOENT)
2071 goto done_deref;
2072 error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
2073 sizeof(pr->pr_cpuset->cs_id));
2074 if (error != 0 && error != ENOENT)
2075 goto done_deref;
2076 error = vfs_setopts(opts, "path", prison_path(mypr, pr));
2077 if (error != 0 && error != ENOENT)
2078 goto done_deref;
2079 #ifdef INET
2080 error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
2081 pr->pr_ip4s * sizeof(*pr->pr_ip4));
2082 if (error != 0 && error != ENOENT)
2083 goto done_deref;
2084 #endif
2085 #ifdef INET6
2086 error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
2087 pr->pr_ip6s * sizeof(*pr->pr_ip6));
2088 if (error != 0 && error != ENOENT)
2089 goto done_deref;
2090 #endif
2091 error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
2092 sizeof(pr->pr_securelevel));
2093 if (error != 0 && error != ENOENT)
2094 goto done_deref;
2095 error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
2096 sizeof(pr->pr_childcount));
2097 if (error != 0 && error != ENOENT)
2098 goto done_deref;
2099 error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
2100 sizeof(pr->pr_childmax));
2101 if (error != 0 && error != ENOENT)
2102 goto done_deref;
2103 error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
2104 if (error != 0 && error != ENOENT)
2105 goto done_deref;
2106 error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
2107 if (error != 0 && error != ENOENT)
2108 goto done_deref;
2109 error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
2110 if (error != 0 && error != ENOENT)
2111 goto done_deref;
/* Processes flagged SV_ILP32 get the host id as a 32-bit value. */
2112 #ifdef COMPAT_FREEBSD32
2113 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
2114 uint32_t hid32 = pr->pr_hostid;
2115
2116 error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
2117 } else
2118 #endif
2119 error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
2120 sizeof(pr->pr_hostid));
2121 if (error != 0 && error != ENOENT)
2122 goto done_deref;
2123 error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
2124 sizeof(pr->pr_enforce_statfs));
2125 if (error != 0 && error != ENOENT)
2126 goto done_deref;
2127 error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
2128 sizeof(pr->pr_devfs_rsnum));
2129 if (error != 0 && error != ENOENT)
2130 goto done_deref;
/* Boolean flags are reported under both their name and "no" name. */
2131 for (bf = pr_flag_bool;
2132 bf < pr_flag_bool + nitems(pr_flag_bool);
2133 bf++) {
2134 i = (pr->pr_flags & bf->flag) ? 1 : 0;
2135 error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2136 if (error != 0 && error != ENOENT)
2137 goto done_deref;
2138 i = !i;
2139 error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2140 if (error != 0 && error != ENOENT)
2141 goto done_deref;
2142 }
/*
 * Three-state parameters: only the disable bit set maps to
 * JAIL_SYS_DISABLE, only the new bit set maps to JAIL_SYS_NEW, and
 * anything else maps to JAIL_SYS_INHERIT.
 */
2143 for (jsf = pr_flag_jailsys;
2144 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
2145 jsf++) {
2146 f = pr->pr_flags & (jsf->disable | jsf->new);
2147 i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
2148 : (f == jsf->new) ? JAIL_SYS_NEW
2149 : JAIL_SYS_INHERIT;
2150 error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
2151 if (error != 0 && error != ENOENT)
2152 goto done_deref;
2153 }
/* Permission bits; the walk stops at the first entry whose flag is 0. */
2154 for (bf = pr_flag_allow;
2155 bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
2156 bf++) {
2157 i = (pr->pr_allow & bf->flag) ? 1 : 0;
2158 error = vfs_setopt(opts, bf->name, &i, sizeof(i));
2159 if (error != 0 && error != ENOENT)
2160 goto done_deref;
2161 i = !i;
2162 error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
2163 if (error != 0 && error != ENOENT)
2164 goto done_deref;
2165 }
/* A prison with no user references left is reported as dying. */
2166 i = (pr->pr_uref == 0);
2167 error = vfs_setopt(opts, "dying", &i, sizeof(i));
2168 if (error != 0 && error != ENOENT)
2169 goto done_deref;
2170 i = !i;
2171 error = vfs_setopt(opts, "nodying", &i, sizeof(i));
2172 if (error != 0 && error != ENOENT)
2173 goto done_deref;
2174 error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
2175 sizeof(pr->pr_osreldate));
2176 if (error != 0 && error != ENOENT)
2177 goto done_deref;
2178 error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
2179 if (error != 0 && error != ENOENT)
2180 goto done_deref;
2181
2182 /* Get the module parameters. */
/*
 * The prison mutex is dropped before calling into the modules; clearing
 * "locked" keeps done_deref from passing PD_LOCKED afterwards.
 */
2183 mtx_unlock(&pr->pr_mtx);
2184 locked = 0;
2185 error = osd_jail_call(pr, PR_METHOD_GET, opts);
2186 if (error)
2187 goto done_deref;
/*
 * NOTE(review): no explicit sx_sunlock() follows on this path, so
 * prison_deref() is presumably responsible for dropping allprison_lock
 * when PD_LIST_SLOCKED is passed -- confirm.
 */
2188 prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
2189
2190 /* By now, all parameters should have been noted. */
2191 TAILQ_FOREACH(opt, opts, link) {
2192 if (!opt->seen && strcmp(opt->name, "errmsg")) {
2193 error = EINVAL;
2194 vfs_opterror(opts, "unknown parameter: %s", opt->name);
2195 goto done_errmsg;
2196 }
2197 }
2198
2199 /* Write the fetched parameters back to userspace. */
/*
 * The value of option N lives in iovec 2 * N + 1.  Options with a
 * negative pos and the errmsg option are skipped.  The iovec length is
 * always updated to the option's length; the data is copied only when a
 * value is present, using bcopy for UIO_SYSSPACE and copyout otherwise.
 */
2200 error = 0;
2201 TAILQ_FOREACH(opt, opts, link) {
2202 if (opt->pos >= 0 && opt->pos != errmsg_pos) {
2203 pos = 2 * opt->pos + 1;
2204 optuio->uio_iov[pos].iov_len = opt->len;
2205 if (opt->value != NULL) {
2206 if (optuio->uio_segflg == UIO_SYSSPACE) {
2207 bcopy(opt->value,
2208 optuio->uio_iov[pos].iov_base,
2209 opt->len);
2210 } else {
2211 error = copyout(opt->value,
2212 optuio->uio_iov[pos].iov_base,
2213 opt->len);
2214 if (error)
2215 break;
2216 }
2217 }
2218 }
2219 }
2220 goto done_errmsg;
2221
/* Error exit while the temporary reference (and possibly pr_mtx) is held. */
2222 done_deref:
2223 prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
2224 goto done_errmsg;
2225
/* Error exit before any prison reference was taken. */
2226 done_unlock_list:
2227 sx_sunlock(&allprison_lock);
/*
 * Common exit.  On error, and only if the caller supplied an "errmsg"
 * option, copy the message text into that option's value iovec.  The
 * result of copyout() is not checked here.
 */
2228 done_errmsg:
2229 if (error && errmsg_pos >= 0) {
2230 vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
2231 errmsg_pos = 2 * errmsg_pos + 1;
2232 if (errmsg_len > 0) {
2233 if (optuio->uio_segflg == UIO_SYSSPACE)
2234 bcopy(errmsg,
2235 optuio->uio_iov[errmsg_pos].iov_base,
2236 errmsg_len);
2237 else
2238 copyout(errmsg,
2239 optuio->uio_iov[errmsg_pos].iov_base,
2240 errmsg_len);
2241 }
2242 }
2243 vfs_freeopts(opts);
2244 return (error);
2245 }
2246
2247
2248 /*
2249 * struct jail_remove_args {
2250 * int jid;
2251 * };
2252 */
/*
 * System call entry point for jail_remove(2).
 *
 * Requires PRIV_JAIL_REMOVE.  Looks up uap->jid with prison_find_child()
 * starting from the caller's prison and returns EINVAL if nothing is
 * found.  Otherwise every descendant that is still referenced is removed
 * with prison_remove_one(), followed by the prison itself, and 0 is
 * returned.
 */
2253 int
2254 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2255 {
2256 struct prison *pr, *cpr, *lpr, *tpr;
2257 int descend, error;
2258
2259 error = priv_check(td, PRIV_JAIL_REMOVE);
2260 if (error)
2261 return (error);
2262
2263 sx_xlock(&allprison_lock);
/*
 * NOTE(review): prison_find_child() appears to return the prison with
 * pr_mtx held (it is unlocked below before the descendant walk) --
 * confirm.
 */
2264 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2265 if (pr == NULL) {
2266 sx_xunlock(&allprison_lock);
2267 return (EINVAL);
2268 }
2269
2270 /* Remove all descendants of this prison, then remove this prison. */
2271 pr->pr_ref++;
2272 if (!LIST_EMPTY(&pr->pr_children)) {
2273 mtx_unlock(&pr->pr_mtx);
/*
 * Removal runs one step behind the iterator: lpr is the previously
 * visited descendant (already given an extra reference) and is only
 * removed once the walk has reached the next prison.
 * NOTE(review): presumably this keeps the iterator from standing on a
 * prison that prison_remove_one() may release -- confirm.
 *
 * Each prison_remove_one() call is made with allprison_lock held
 * exclusively and the prison mutex held; allprison_lock is re-taken
 * right after the call, so the callee is expected to return with it
 * dropped.
 */
2274 lpr = NULL;
2275 FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
2276 mtx_lock(&cpr->pr_mtx);
2277 if (cpr->pr_ref > 0) {
2278 tpr = cpr;
2279 cpr->pr_ref++;
2280 } else {
2281 /* Already removed - do not do it again. */
2282 tpr = NULL;
2283 }
2284 mtx_unlock(&cpr->pr_mtx);
2285 if (lpr != NULL) {
2286 mtx_lock(&lpr->pr_mtx);
2287 prison_remove_one(lpr);
2288 sx_xlock(&allprison_lock);
2289 }
2290 lpr = tpr;
2291 }
/* Remove the last descendant the walk left pending, if any. */
2292 if (lpr != NULL) {
2293 mtx_lock(&lpr->pr_mtx);
2294 prison_remove_one(lpr);
2295 sx_xlock(&allprison_lock);
2296 }
2297 mtx_lock(&pr->pr_mtx);
2298 }
/*
 * Finally remove the requested prison itself.  No unlock follows, so
 * both pr_mtx and allprison_lock are expected to be released by
 * prison_remove_one().
 */
2299 prison_remove_one(pr);
2300 return (0);
2301 }
2302
2303 static void
2304 prison_remove_one(struct prison *pr)
2305 {
2306 struct proc *p;
2307 int deuref;
2308
2309 /*
2310 * Mark the prison as doomed, so it doesn't accidentally come back
2311 * to life. It may still be explicitly brought back by jail_set(2).
2312 */
2313 pr->pr_flags |= PR_REMOVE;
2314
2315 /* If the prison was persistent, it is not anymore. */
2316 deuref = 0;
2317 if (pr->pr_flags & PR_PERSIST) {
2318 pr->pr_ref--;
2319 deuref = PD_DEUREF;
2320 pr->pr_flags &= ~PR_PERSIST;
2321 }
2322
2323 /*
2324 * jail_remove added a reference. If that's the only one, remove
2325 * the prison now.
2326 */
2327 KASSERT(pr->pr_ref > 0,
2328 ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
2329 if (pr->pr_ref == 1) {
2330 prison_deref(pr,
2331 deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
2332 return;
2333 }
2334
2335 mtx_unlock(&pr->pr_mtx);
2336 sx_xunlock(&allprison_lock);
2337 /*
2338 * Kill all processes unfortunate enough to be attached to this prison.
2339 */
2340 sx_slock(&allproc_lock);
2341 FOREACH_PROC_IN_SYSTEM(p) {
2342 PROC_LOCK(p);
2343 if (p->p_state != PRS_NEW && p->p_ucred &&
2344 p->p_ucred->cr_prison == pr)
2345 kern_psignal(p, SIGKILL);
2346 PROC_UNLOCK(p);
2347 }
2348 sx_sunlock(&allproc_lock);
2349 /* Remove the temporary reference added by jail_remove. */
2350 prison_deref(pr, deuref | PD_DEREF);
2351 }
2352
2353
2354 /*
2355 * struct jail_attach_args {
2356 * int jid;
2357 * };
2358 */
2359 int
2360 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2361 {
2362 struct prison *pr;
2363 int error;
2364
2365 error = priv_check(td, PRIV_JAIL_ATTACH);
2366 if (error)
2367 return (error);
2368
2369 /*
2370 * Start with exclusive hold on allprison_lock to ensure that a possible
2371 * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove.
2372 * But then immediately downgrade it since we don't need to stop
2373 * readers.
2374 */
2375 sx_xlock(&allprison_lock);
2376 sx_downgrade(&allprison_lock);
2377 pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2378 if (pr == NULL) {
2379 sx_sunlock(&allprison_lock);
2380 return (EINVAL);
2381 }
2382
2383 /*
2384 * Do not allow a process to attach to a prison that is not
2385 * considered to be "alive".
2386 */
2387 if (pr->pr_uref == 0) {
2388 mtx_unlock(&pr->pr_mtx);
2389 sx_sunlock(&allprison_lock);
2390 return (EINVAL);
2391 }
2392
2393 return (do_jail_attach(td, pr));
2394 }
2395
/*
 * Attach the calling thread's process to prison pr.
 *
 * Entered with pr->pr_mtx held and allprison_lock held shared (this is how
 * sys_jail_attach() calls it); both are released on every path.  Returns 0
 * on success, otherwise the error of the first step that failed.
 */
2396 static int
2397 do_jail_attach(struct thread *td, struct prison *pr)
2398 {
2399 struct proc *p;
2400 struct ucred *newcred, *oldcred;
2401 int error;
2402
2403 /*
2404 * XXX: Note that there is a slight race here if two threads
2405 * in the same privileged process attempt to attach to two
2406 * different jails at the same time. It is important for
2407 * user processes not to do this, or they might end up with
2408 * a process root from one prison, but attached to the jail
2409 * of another.
2410 */
/* Take a reference and a user reference on behalf of the attaching process. */
2411 pr->pr_ref++;
2412 pr->pr_uref++;
2413 mtx_unlock(&pr->pr_mtx);
2414
2415 /* Let modules do whatever they need to prepare for attaching. */
2416 error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
2417 if (error) {
/*
 * Give both references back.  PD_LIST_SLOCKED makes prison_deref() release
 * the shared allprison_lock as well.
 */
2418 prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
2419 return (error);
2420 }
2421 sx_sunlock(&allprison_lock);
2422
2423 /*
2424 * Reparent the newly attached process to this jail.
2425 */
2426 p = td->td_proc;
2427 error = cpuset_setproc_update_set(p, pr->pr_cpuset);
2428 if (error)
2429 goto e_revert_osd;
2430
/* change_dir() and the MAC check run with the jail's root vnode locked. */
2431 vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
2432 if ((error = change_dir(pr->pr_root, td)) != 0)
2433 goto e_unlock;
2434 #ifdef MAC
2435 if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
2436 goto e_unlock;
2437 #endif
2438 VOP_UNLOCK(pr->pr_root, 0);
2439 if ((error = pwd_chroot_chdir(td, pr->pr_root)))
2440 goto e_revert_osd;
2441
/*
 * Install a new process credential whose cr_prison points at pr.  The
 * previous credential is kept in oldcred until its prison has been
 * dereferenced below.
 */
2442 newcred = crget();
2443 PROC_LOCK(p);
2444 oldcred = crcopysafe(p, newcred);
2445 newcred->cr_prison = pr;
2446 proc_set_cred(p, newcred);
2447 setsugid(p);
/*
 * NOTE(review): the crhold() below is compiled under RACCT while the
 * matching crfree() is compiled under RCTL - confirm that RACCT is never
 * configured without RCTL, otherwise the extra hold is never dropped.
 */
2448 #ifdef RACCT
2449 racct_proc_ucred_changed(p, oldcred, newcred);
2450 crhold(newcred);
2451 #endif
2452 PROC_UNLOCK(p);
2453 #ifdef RCTL
2454 rctl_proc_ucred_changed(p, newcred);
2455 crfree(newcred);
2456 #endif
/* The process has left its old prison: drop that prison's ref and uref. */
2457 prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF);
2458 crfree(oldcred);
2459
2460 /*
2461 * If the prison was killed while changing credentials, die along
2462 * with it.
2463 */
2464 if (pr->pr_flags & PR_REMOVE) {
2465 PROC_LOCK(p);
2466 kern_psignal(p, SIGKILL);
2467 PROC_UNLOCK(p);
2468 }
2469
2470 return (0);
2471
/*
 * Error unwinding.  e_unlock is used while the root vnode is still locked;
 * e_revert_osd once only the module state and the references taken above
 * need undoing.
 * NOTE(review): nothing on this path undoes cpuset_setproc_update_set() -
 * confirm whether that is intended.
 */
2472 e_unlock:
2473 VOP_UNLOCK(pr->pr_root, 0);
2474 e_revert_osd:
2475 /* Tell modules this thread is still in its old jail after all. */
2476 (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
2477 prison_deref(pr, PD_DEREF | PD_DEUREF);
2478 return (error);
2479 }
2480
2481
2482 /*
2483 * Returns a locked prison instance, or NULL on failure.
2484 */
2485 struct prison *
2486 prison_find(int prid)
2487 {
2488 struct prison *pr;
2489
2490 sx_assert(&allprison_lock, SX_LOCKED);
2491 TAILQ_FOREACH(pr, &allprison, pr_list) {
2492 if (pr->pr_id == prid) {
2493 mtx_lock(&pr->pr_mtx);
2494 if (pr->pr_ref > 0)
2495 return (pr);
2496 mtx_unlock(&pr->pr_mtx);
2497 }
2498 }
2499 return (NULL);
2500 }
2501
2502 /*
2503 * Find a prison that is a descendant of mypr. Returns a locked prison or NULL.
2504 */
2505 struct prison *
2506 prison_find_child(struct prison *mypr, int prid)
2507 {
2508 struct prison *pr;
2509 int descend;
2510
2511 sx_assert(&allprison_lock, SX_LOCKED);
2512 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2513 if (pr->pr_id == prid) {
2514 mtx_lock(&pr->pr_mtx);
2515 if (pr->pr_ref > 0)
2516 return (pr);
2517 mtx_unlock(&pr->pr_mtx);
2518 }
2519 }
2520 return (NULL);
2521 }
2522
2523 /*
2524 * Look for the name relative to mypr. Returns a locked prison or NULL.
2525 */
2526 struct prison *
2527 prison_find_name(struct prison *mypr, const char *name)
2528 {
2529 struct prison *pr, *deadpr;
2530 size_t mylen;
2531 int descend;
2532
2533 sx_assert(&allprison_lock, SX_LOCKED);
2534 mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2535 again:
2536 deadpr = NULL;
2537 FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2538 if (!strcmp(pr->pr_name + mylen, name)) {
2539 mtx_lock(&pr->pr_mtx);
2540 if (pr->pr_ref > 0) {
2541 if (pr->pr_uref > 0)
2542 return (pr);
2543 deadpr = pr;
2544 }
2545 mtx_unlock(&pr->pr_mtx);
2546 }
2547 }
2548 /* There was no valid prison - perhaps there was a dying one. */
2549 if (deadpr != NULL) {
2550 mtx_lock(&deadpr->pr_mtx);
2551 if (deadpr->pr_ref == 0) {
2552 mtx_unlock(&deadpr->pr_mtx);
2553 goto again;
2554 }
2555 }
2556 return (deadpr);
2557 }
2558
2559 /*
2560 * See if a prison has the specific flag set.
2561 */
2562 int
2563 prison_flag(struct ucred *cred, unsigned flag)
2564 {
2565
2566 /* This is an atomic read, so no locking is necessary. */
2567 return (cred->cr_prison->pr_flags & flag);
2568 }
2569
2570 int
2571 prison_allow(struct ucred *cred, unsigned flag)
2572 {
2573
2574 /* This is an atomic read, so no locking is necessary. */
2575 return (cred->cr_prison->pr_allow & flag);
2576 }
2577
2578 /*
2579 * Remove a prison reference. If that was the last reference, remove the
2580 * prison itself - but not in this context in case there are locks held.
2581 */
2582 void
2583 prison_free_locked(struct prison *pr)
2584 {
2585 int ref;
2586
2587 mtx_assert(&pr->pr_mtx, MA_OWNED);
2588 ref = --pr->pr_ref;
2589 mtx_unlock(&pr->pr_mtx);
2590 if (ref == 0)
2591 taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2592 }
2593
2594 void
2595 prison_free(struct prison *pr)
2596 {
2597
2598 mtx_lock(&pr->pr_mtx);
2599 prison_free_locked(pr);
2600 }
2601
2602 /*
2603 * Complete a call to either prison_free or prison_proc_free.
2604 */
2605 static void
2606 prison_complete(void *context, int pending)
2607 {
2608 struct prison *pr = context;
2609
2610 sx_xlock(&allprison_lock);
2611 mtx_lock(&pr->pr_mtx);
2612 prison_deref(pr, pr->pr_uref
2613 ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED
2614 : PD_LOCKED | PD_LIST_XLOCKED);
2615 }
2616
2617 /*
2618 * Remove a prison reference (usually). This internal version assumes no
2619 * mutexes are held, except perhaps the prison itself. If there are no more
2620 * references, release and delist the prison. On completion, the prison lock
2621 * and the allprison lock are both unlocked.
2622 */
/*
 * Flag bits read by this function:
 *   PD_LOCKED        - the caller already holds pr->pr_mtx.
 *   PD_DEUREF        - drop one user reference (pr_uref).
 *   PD_DEREF         - drop one reference (pr_ref).
 *   PD_LIST_SLOCKED  - the caller holds allprison_lock shared.
 *   PD_LIST_XLOCKED  - the caller holds allprison_lock exclusively.
 */
2623 static void
2624 prison_deref(struct prison *pr, int flags)
2625 {
2626 struct prison *ppr, *tpr;
2627 int ref, lasturef;
2628
2629 if (!(flags & PD_LOCKED))
2630 mtx_lock(&pr->pr_mtx);
/*
 * Each pass handles one prison.  A pass that ends up freeing the prison
 * continues with its parent; every other pass returns.
 */
2631 for (;;) {
2632 if (flags & PD_DEUREF) {
2633 KASSERT(pr->pr_uref > 0,
2634 ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
2635 pr->pr_id));
2636 pr->pr_uref--;
2637 lasturef = pr->pr_uref == 0;
/*
 * When the last user reference goes away, hold an extra reference across
 * the PR_METHOD_REMOVE call below; it is dropped right after that call.
 */
2638 if (lasturef)
2639 pr->pr_ref++;
2640 KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
2641 } else
2642 lasturef = 0;
2643 if (flags & PD_DEREF) {
2644 KASSERT(pr->pr_ref > 0,
2645 ("prison_deref PD_DEREF on a dead prison (jid=%d)",
2646 pr->pr_id));
2647 pr->pr_ref--;
2648 }
2649 ref = pr->pr_ref;
2650 mtx_unlock(&pr->pr_mtx);
2651
2652 /*
2653 * Tell the modules if the last user reference was removed
2654 * (even if it sticks around in dying state).
2655 */
2656 if (lasturef) {
/* The module call is made with allprison_lock held; take it if needed. */
2657 if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) {
2658 sx_xlock(&allprison_lock);
2659 flags |= PD_LIST_XLOCKED;
2660 }
2661 (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
2662 mtx_lock(&pr->pr_mtx);
2663 ref = --pr->pr_ref;
2664 mtx_unlock(&pr->pr_mtx);
2665 }
2666
2667 /* If the prison still has references, nothing else to do. */
2668 if (ref > 0) {
2669 if (flags & PD_LIST_SLOCKED)
2670 sx_sunlock(&allprison_lock);
2671 else if (flags & PD_LIST_XLOCKED)
2672 sx_xunlock(&allprison_lock);
2673 return;
2674 }
2675
/*
 * No references remain.  Delisting needs allprison_lock exclusively:
 * upgrade a shared hold (falling back to unlock + relock when the upgrade
 * fails) or take the lock if it is not held at all.
 */
2676 if (flags & PD_LIST_SLOCKED) {
2677 if (!sx_try_upgrade(&allprison_lock)) {
2678 sx_sunlock(&allprison_lock);
2679 sx_xlock(&allprison_lock);
2680 }
2681 } else if (!(flags & PD_LIST_XLOCKED))
2682 sx_xlock(&allprison_lock);
2683
/* Unlink the prison and fix up the child count of every ancestor. */
2684 TAILQ_REMOVE(&allprison, pr, pr_list);
2685 LIST_REMOVE(pr, pr_sibling);
2686 ppr = pr->pr_parent;
2687 for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
2688 tpr->pr_childcount--;
2689 sx_xunlock(&allprison_lock);
2690
/* With the list lock dropped, release what the prison owns and free it. */
2691 #ifdef VIMAGE
2692 if (pr->pr_vnet != ppr->pr_vnet)
2693 vnet_destroy(pr->pr_vnet);
2694 #endif
2695 if (pr->pr_root != NULL)
2696 vrele(pr->pr_root);
2697 mtx_destroy(&pr->pr_mtx);
2698 #ifdef INET
2699 free(pr->pr_ip4, M_PRISON);
2700 #endif
2701 #ifdef INET6
2702 free(pr->pr_ip6, M_PRISON);
2703 #endif
2704 if (pr->pr_cpuset != NULL)
2705 cpuset_rel(pr->pr_cpuset);
2706 osd_jail_exit(pr);
2707 #ifdef RACCT
2708 if (racct_enable)
2709 prison_racct_detach(pr);
2710 #endif
2711 free(pr, M_PRISON);
2712
2713 /* Removing a prison frees a reference on its parent. */
/*
 * The mutex is taken here by hand because the PD_LOCKED check sits above
 * the loop and is not repeated.  The new flags carry no list-lock bit, as
 * allprison_lock was released above.
 */
2714 pr = ppr;
2715 mtx_lock(&pr->pr_mtx);
2716 flags = PD_DEREF | PD_DEUREF;
2717 }
2718 }
2719
2720 void
2721 prison_hold_locked(struct prison *pr)
2722 {
2723
2724 mtx_assert(&pr->pr_mtx, MA_OWNED);
2725 KASSERT(pr->pr_ref > 0,
2726 ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
2727 pr->pr_ref++;
2728 }
2729
2730 void
2731 prison_hold(struct prison *pr)
2732 {
2733
2734 mtx_lock(&pr->pr_mtx);
2735 prison_hold_locked(pr);
2736 mtx_unlock(&pr->pr_mtx);
2737 }
2738
2739 void
2740 prison_proc_hold(struct prison *pr)
2741 {
2742
2743 mtx_lock(&pr->pr_mtx);
2744 KASSERT(pr->pr_uref > 0,
2745 ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2746 pr->pr_uref++;
2747 mtx_unlock(&pr->pr_mtx);
2748 }
2749
2750 void
2751 prison_proc_free(struct prison *pr)
2752 {
2753
2754 mtx_lock(&pr->pr_mtx);
2755 KASSERT(pr->pr_uref > 0,
2756 ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2757 if (pr->pr_uref > 1)
2758 pr->pr_uref--;
2759 else {
2760 /*
2761 * Don't remove the last user reference in this context, which
2762 * is expected to be a process that is not only locked, but
2763 * also half dead.
2764 */
2765 pr->pr_ref++;
2766 mtx_unlock(&pr->pr_mtx);
2767 taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2768 return;
2769 }
2770 mtx_unlock(&pr->pr_mtx);
2771 }
2772
2773 /*
2774 * Check if a jail supports the given address family.
2775 *
2776 * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
2777 * if not.
2778 */
2779 int
2780 prison_check_af(struct ucred *cred, int af)
2781 {
2782 struct prison *pr;
2783 int error;
2784
2785 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2786
2787 pr = cred->cr_prison;
2788 #ifdef VIMAGE
2789 /* Prisons with their own network stack are not limited. */
2790 if (prison_owns_vnet(cred))
2791 return (0);
2792 #endif
2793
2794 error = 0;
2795 switch (af)
2796 {
2797 #ifdef INET
2798 case AF_INET:
2799 if (pr->pr_flags & PR_IP4)
2800 {
2801 mtx_lock(&pr->pr_mtx);
2802 if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
2803 error = EAFNOSUPPORT;
2804 mtx_unlock(&pr->pr_mtx);
2805 }
2806 break;
2807 #endif
2808 #ifdef INET6
2809 case AF_INET6:
2810 if (pr->pr_flags & PR_IP6)
2811 {
2812 mtx_lock(&pr->pr_mtx);
2813 if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
2814 error = EAFNOSUPPORT;
2815 mtx_unlock(&pr->pr_mtx);
2816 }
2817 break;
2818 #endif
2819 case AF_LOCAL:
2820 case AF_ROUTE:
2821 break;
2822 default:
2823 if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
2824 error = EAFNOSUPPORT;
2825 }
2826 return (error);
2827 }
2828
2829 /*
2830 * Check if given address belongs to the jail referenced by cred (wrapper to
2831 * prison_check_ip[46]).
2832 *
2833 * Returns 0 if jail doesn't restrict the address family or if address belongs
2834 * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
2835 * the jail doesn't allow the address family. IPv4 Address passed in in NBO.
2836 */
2837 int
2838 prison_if(struct ucred *cred, struct sockaddr *sa)
2839 {
2840 #ifdef INET
2841 struct sockaddr_in *sai;
2842 #endif
2843 #ifdef INET6
2844 struct sockaddr_in6 *sai6;
2845 #endif
2846 int error;
2847
2848 KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
2849 KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
2850
2851 #ifdef VIMAGE
2852 if (prison_owns_vnet(cred))
2853 return (0);
2854 #endif
2855
2856 error = 0;
2857 switch (sa->sa_family)
2858 {
2859 #ifdef INET
2860 case AF_INET:
2861 sai = (struct sockaddr_in *)sa;
2862 error = prison_check_ip4(cred, &sai->sin_addr);
2863 break;
2864 #endif
2865 #ifdef INET6
2866 case AF_INET6:
2867 sai6 = (struct sockaddr_in6 *)sa;
2868 error = prison_check_ip6(cred, &sai6->sin6_addr);
2869 break;
2870 #endif
2871 default:
2872 if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
2873 error = EAFNOSUPPORT;
2874 }
2875 return (error);
2876 }
2877
2878 /*
2879 * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
2880 */
2881 int
2882 prison_check(struct ucred *cred1, struct ucred *cred2)
2883 {
2884
2885 return ((cred1->cr_prison == cred2->cr_prison ||
2886 prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
2887 }
2888
2889 /*
2890 * Return 1 if p2 is a child of p1, otherwise 0.
2891 */
2892 int
2893 prison_ischild(struct prison *pr1, struct prison *pr2)
2894 {
2895
2896 for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
2897 if (pr1 == pr2)
2898 return (1);
2899 return (0);
2900 }
2901
2902 /*
2903 * Return 1 if the passed credential is in a jail, otherwise 0.
2904 */
2905 int
2906 jailed(struct ucred *cred)
2907 {
2908
2909 return (cred->cr_prison != &prison0);
2910 }
2911
2912 /*
2913 * Return 1 if the passed credential is in a jail and that jail does not
2914 * have its own virtual network stack, otherwise 0.
2915 */
int
jailed_without_vnet(struct ucred *cred)
{
	int result;

	result = jailed(cred) ? 1 : 0;
#ifdef VIMAGE
	/* A jail that owns its network stack does not count. */
	if (result != 0 && prison_owns_vnet(cred))
		result = 0;
#endif
	return (result);
}
2929
2930 /*
2931 * Return the correct hostname (domainname, et al) for the passed credential.
2932 */
2933 void
2934 getcredhostname(struct ucred *cred, char *buf, size_t size)
2935 {
2936 struct prison *pr;
2937
2938 /*
2939 * A NULL credential can be used to shortcut to the physical
2940 * system's hostname.
2941 */
2942 pr = (cred != NULL) ? cred->cr_prison : &prison0;
2943 mtx_lock(&pr->pr_mtx);
2944 strlcpy(buf, pr->pr_hostname, size);
2945 mtx_unlock(&pr->pr_mtx);
2946 }
2947
2948 void
2949 getcreddomainname(struct ucred *cred, char *buf, size_t size)
2950 {
2951
2952 mtx_lock(&cred->cr_prison->pr_mtx);
2953 strlcpy(buf, cred->cr_prison->pr_domainname, size);
2954 mtx_unlock(&cred->cr_prison->pr_mtx);
2955 }
2956
2957 void
2958 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
2959 {
2960
2961 mtx_lock(&cred->cr_prison->pr_mtx);
2962 strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
2963 mtx_unlock(&cred->cr_prison->pr_mtx);
2964 }
2965
2966 void
2967 getcredhostid(struct ucred *cred, unsigned long *hostid)
2968 {
2969
2970 mtx_lock(&cred->cr_prison->pr_mtx);
2971 *hostid = cred->cr_prison->pr_hostid;
2972 mtx_unlock(&cred->cr_prison->pr_mtx);
2973 }
2974
2975 void
2976 getjailname(struct ucred *cred, char *name, size_t len)
2977 {
2978
2979 mtx_lock(&cred->cr_prison->pr_mtx);
2980 strlcpy(name, cred->cr_prison->pr_name, len);
2981 mtx_unlock(&cred->cr_prison->pr_mtx);
2982 }
2983
2984 #ifdef VIMAGE
2985 /*
2986 * Determine whether the prison represented by cred owns
2987 * its vnet rather than having it inherited.
2988 *
2989 * Returns 1 in case the prison owns the vnet, 0 otherwise.
2990 */
2991 int
2992 prison_owns_vnet(struct ucred *cred)
2993 {
2994
2995 /*
2996 * vnets cannot be added/removed after jail creation,
2997 * so no need to lock here.
2998 */
2999 return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
3000 }
3001 #endif
3002
3003 /*
3004 * Determine whether the subject represented by cred can "see"
3005 * status of a mount point.
3006 * Returns: 0 for permitted, ENOENT otherwise.
3007 * XXX: This function should be called cr_canseemount() and should be
3008 * placed in kern_prot.c.
3009 */
3010 int
3011 prison_canseemount(struct ucred *cred, struct mount *mp)
3012 {
3013 struct prison *pr;
3014 struct statfs *sp;
3015 size_t len;
3016
3017 pr = cred->cr_prison;
3018 if (pr->pr_enforce_statfs == 0)
3019 return (0);
3020 if (pr->pr_root->v_mount == mp)
3021 return (0);
3022 if (pr->pr_enforce_statfs == 2)
3023 return (ENOENT);
3024 /*
3025 * If jail's chroot directory is set to "/" we should be able to see
3026 * all mount-points from inside a jail.
3027 * This is ugly check, but this is the only situation when jail's
3028 * directory ends with '/'.
3029 */
3030 if (strcmp(pr->pr_path, "/") == 0)
3031 return (0);
3032 len = strlen(pr->pr_path);
3033 sp = &mp->mnt_stat;
3034 if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3035 return (ENOENT);
3036 /*
3037 * Be sure that we don't have situation where jail's root directory
3038 * is "/some/path" and mount point is "/some/pathpath".
3039 */
3040 if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3041 return (ENOENT);
3042 return (0);
3043 }
3044
3045 void
3046 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3047 {
3048 char jpath[MAXPATHLEN];
3049 struct prison *pr;
3050 size_t len;
3051
3052 pr = cred->cr_prison;
3053 if (pr->pr_enforce_statfs == 0)
3054 return;
3055 if (prison_canseemount(cred, mp) != 0) {
3056 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3057 strlcpy(sp->f_mntonname, "[restricted]",
3058 sizeof(sp->f_mntonname));
3059 return;
3060 }
3061 if (pr->pr_root->v_mount == mp) {
3062 /*
3063 * Clear current buffer data, so we are sure nothing from
3064 * the valid path left there.
3065 */
3066 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3067 *sp->f_mntonname = '/';
3068 return;
3069 }
3070 /*
3071 * If jail's chroot directory is set to "/" we should be able to see
3072 * all mount-points from inside a jail.
3073 */
3074 if (strcmp(pr->pr_path, "/") == 0)
3075 return;
3076 len = strlen(pr->pr_path);
3077 strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3078 /*
3079 * Clear current buffer data, so we are sure nothing from
3080 * the valid path left there.
3081 */
3082 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3083 if (*jpath == '\0') {
3084 /* Should never happen. */
3085 *sp->f_mntonname = '/';
3086 } else {
3087 strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3088 }
3089 }
3090
3091 /*
3092 * Check whether permission for a specific privilege is granted within jail. We
3093 * have a specific list of accepted privileges; the rest are denied.
3094 */
3095 int
3096 prison_priv_check(struct ucred *cred, int priv)
3097 {
3098
3099 if (!jailed(cred))
3100 return (0);
3101
3102 #ifdef VIMAGE
3103 /*
3104 * Privileges specific to prisons with a virtual network stack.
3105 * There might be a duplicate entry here in case the privilege
3106 * is only granted conditionally in the legacy jail case.
3107 */
3108 switch (priv) {
3109 #ifdef notyet
3110 /*
3111 * NFS-specific privileges.
3112 */
3113 case PRIV_NFS_DAEMON:
3114 case PRIV_NFS_LOCKD:
3115 #endif
3116 /*
3117 * Network stack privileges.
3118 */
3119 case PRIV_NET_BRIDGE:
3120 case PRIV_NET_GRE:
3121 case PRIV_NET_BPF:
3122 case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */
3123 case PRIV_NET_ROUTE:
3124 case PRIV_NET_TAP:
3125 case PRIV_NET_SETIFMTU:
3126 case PRIV_NET_SETIFFLAGS:
3127 case PRIV_NET_SETIFCAP:
3128 case PRIV_NET_SETIFDESCR:
3129 case PRIV_NET_SETIFNAME :
3130 case PRIV_NET_SETIFMETRIC:
3131 case PRIV_NET_SETIFPHYS:
3132 case PRIV_NET_SETIFMAC:
3133 case PRIV_NET_SETLANPCP:
3134 case PRIV_NET_ADDMULTI:
3135 case PRIV_NET_DELMULTI:
3136 case PRIV_NET_HWIOCTL:
3137 case PRIV_NET_SETLLADDR:
3138 case PRIV_NET_ADDIFGROUP:
3139 case PRIV_NET_DELIFGROUP:
3140 case PRIV_NET_IFCREATE:
3141 case PRIV_NET_IFDESTROY:
3142 case PRIV_NET_ADDIFADDR:
3143 case PRIV_NET_DELIFADDR:
3144 case PRIV_NET_LAGG:
3145 case PRIV_NET_GIF:
3146 case PRIV_NET_SETIFVNET:
3147 case PRIV_NET_SETIFFIB:
3148 case PRIV_NET_ME:
3149
3150 /*
3151 * 802.11-related privileges.
3152 */
3153 case PRIV_NET80211_VAP_GETKEY:
3154 case PRIV_NET80211_VAP_MANAGE:
3155
3156 #ifdef notyet
3157 /*
3158 * ATM privileges.
3159 */
3160 case PRIV_NETATM_CFG:
3161 case PRIV_NETATM_ADD:
3162 case PRIV_NETATM_DEL:
3163 case PRIV_NETATM_SET:
3164
3165 /*
3166 * Bluetooth privileges.
3167 */
3168 case PRIV_NETBLUETOOTH_RAW:
3169 #endif
3170
3171 /*
3172 * Netgraph and netgraph module privileges.
3173 */
3174 case PRIV_NETGRAPH_CONTROL:
3175 #ifdef notyet
3176 case PRIV_NETGRAPH_TTY:
3177 #endif
3178
3179 /*
3180 * IPv4 and IPv6 privileges.
3181 */
3182 case PRIV_NETINET_IPFW:
3183 case PRIV_NETINET_DIVERT:
3184 case PRIV_NETINET_PF:
3185 case PRIV_NETINET_DUMMYNET:
3186 case PRIV_NETINET_CARP:
3187 case PRIV_NETINET_MROUTE:
3188 case PRIV_NETINET_RAW:
3189 case PRIV_NETINET_ADDRCTRL6:
3190 case PRIV_NETINET_ND6:
3191 case PRIV_NETINET_SCOPE6:
3192 case PRIV_NETINET_ALIFETIME6:
3193 case PRIV_NETINET_IPSEC:
3194 case PRIV_NETINET_BINDANY:
3195
3196 #ifdef notyet
3197 /*
3198 * NCP privileges.
3199 */
3200 case PRIV_NETNCP:
3201
3202 /*
3203 * SMB privileges.
3204 */
3205 case PRIV_NETSMB:
3206 #endif
3207
3208 /*
3209 * No default: or deny here.
3210 * In case of no permit fall through to next switch().
3211 */
3212 if (cred->cr_prison->pr_flags & PR_VNET)
3213 return (0);
3214 }
3215 #endif /* VIMAGE */
3216
3217 switch (priv) {
3218
3219 /*
3220 * Allow ktrace privileges for root in jail.
3221 */
3222 case PRIV_KTRACE:
3223
3224 #if 0
3225 /*
3226 * Allow jailed processes to configure audit identity and
3227 * submit audit records (login, etc). In the future we may
3228 * want to further refine the relationship between audit and
3229 * jail.
3230 */
3231 case PRIV_AUDIT_GETAUDIT:
3232 case PRIV_AUDIT_SETAUDIT:
3233 case PRIV_AUDIT_SUBMIT:
3234 #endif
3235
3236 /*
3237 * Allow jailed processes to manipulate process UNIX
3238 * credentials in any way they see fit.
3239 */
3240 case PRIV_CRED_SETUID:
3241 case PRIV_CRED_SETEUID:
3242 case PRIV_CRED_SETGID:
3243 case PRIV_CRED_SETEGID:
3244 case PRIV_CRED_SETGROUPS:
3245 case PRIV_CRED_SETREUID:
3246 case PRIV_CRED_SETREGID:
3247 case PRIV_CRED_SETRESUID:
3248 case PRIV_CRED_SETRESGID:
3249
3250 /*
3251 * Jail implements visibility constraints already, so allow
3252 * jailed root to override uid/gid-based constraints.
3253 */
3254 case PRIV_SEEOTHERGIDS:
3255 case PRIV_SEEOTHERUIDS:
3256
3257 /*
3258 * Jail implements inter-process debugging limits already, so
3259 * allow jailed root various debugging privileges.
3260 */
3261 case PRIV_DEBUG_DIFFCRED:
3262 case PRIV_DEBUG_SUGID:
3263 case PRIV_DEBUG_UNPRIV:
3264
3265 /*
3266 * Allow jail to set various resource limits and login
3267 * properties, and for now, exceed process resource limits.
3268 */
3269 case PRIV_PROC_LIMIT:
3270 case PRIV_PROC_SETLOGIN:
3271 case PRIV_PROC_SETRLIMIT:
3272
3273 /*
3274 * System V and POSIX IPC privileges are granted in jail.
3275 */
3276 case PRIV_IPC_READ:
3277 case PRIV_IPC_WRITE:
3278 case PRIV_IPC_ADMIN:
3279 case PRIV_IPC_MSGSIZE:
3280 case PRIV_MQ_ADMIN:
3281
3282 /*
3283 * Jail operations within a jail work on child jails.
3284 */
3285 case PRIV_JAIL_ATTACH:
3286 case PRIV_JAIL_SET:
3287 case PRIV_JAIL_REMOVE:
3288
3289 /*
3290 * Jail implements its own inter-process limits, so allow
3291 * root processes in jail to change scheduling on other
3292 * processes in the same jail. Likewise for signalling.
3293 */
3294 case PRIV_SCHED_DIFFCRED:
3295 case PRIV_SCHED_CPUSET:
3296 case PRIV_SIGNAL_DIFFCRED:
3297 case PRIV_SIGNAL_SUGID:
3298
3299 /*
3300 * Allow jailed processes to write to sysctls marked as jail
3301 * writable.
3302 */
3303 case PRIV_SYSCTL_WRITEJAIL:
3304
3305 /*
3306 * Allow root in jail to manage a variety of quota
3307 * properties. These should likely be conditional on a
3308 * configuration option.
3309 */
3310 case PRIV_VFS_GETQUOTA:
3311 case PRIV_VFS_SETQUOTA:
3312
3313 /*
3314 * Since Jail relies on chroot() to implement file system
3315 * protections, grant many VFS privileges to root in jail.
3316 * Be careful to exclude mount-related and NFS-related
3317 * privileges.
3318 */
3319 case PRIV_VFS_READ:
3320 case PRIV_VFS_WRITE:
3321 case PRIV_VFS_ADMIN:
3322 case PRIV_VFS_EXEC:
3323 case PRIV_VFS_LOOKUP:
3324 case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */
3325 case PRIV_VFS_CHFLAGS_DEV:
3326 case PRIV_VFS_CHOWN:
3327 case PRIV_VFS_CHROOT:
3328 case PRIV_VFS_RETAINSUGID:
3329 case PRIV_VFS_FCHROOT:
3330 case PRIV_VFS_LINK:
3331 case PRIV_VFS_SETGID:
3332 case PRIV_VFS_STAT:
3333 case PRIV_VFS_STICKYFILE:
3334
3335 /*
3336 * As in the non-jail case, non-root users are expected to be
3337 * able to read kernel/physical memory (provided /dev/[k]mem
3338 * exists in the jail and they have permission to access it).
3339 */
3340 case PRIV_KMEM_READ:
3341 return (0);
3342
3343 /*
3344 * Depending on the global setting, allow privilege of
3345 * setting system flags.
3346 */
3347 case PRIV_VFS_SYSFLAGS:
3348 if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
3349 return (0);
3350 else
3351 return (EPERM);
3352
3353 /*
3354 * Depending on the global setting, allow privilege of
3355 * mounting/unmounting file systems.
3356 */
3357 case PRIV_VFS_MOUNT:
3358 case PRIV_VFS_UNMOUNT:
3359 case PRIV_VFS_MOUNT_NONUSER:
3360 case PRIV_VFS_MOUNT_OWNER:
3361 if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
3362 cred->cr_prison->pr_enforce_statfs < 2)
3363 return (0);
3364 else
3365 return (EPERM);
3366
3367 /*
3368 * Conditionnaly allow locking (unlocking) physical pages
3369 * in memory.
3370 */
3371 case PRIV_VM_MLOCK:
3372 case PRIV_VM_MUNLOCK:
3373 if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
3374 return (0);
3375 else
3376 return (EPERM);
3377
3378 /*
3379 * Conditionally allow jailed root to bind reserved ports.
3380 */
3381 case PRIV_NETINET_RESERVEDPORT:
3382 if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
3383 return (0);
3384 else
3385 return (EPERM);
3386
3387 /*
3388 * Allow jailed root to reuse in-use ports.
3389 */
3390 case PRIV_NETINET_REUSEPORT:
3391 return (0);
3392
3393 /*
3394 * Allow jailed root to set certain IPv4/6 (option) headers.
3395 */
3396 case PRIV_NETINET_SETHDROPTS:
3397 return (0);
3398
3399 /*
3400 * Conditionally allow creating raw sockets in jail.
3401 */
3402 case PRIV_NETINET_RAW:
3403 if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
3404 return (0);
3405 else
3406 return (EPERM);
3407
3408 /*
3409 * Since jail implements its own visibility limits on netstat
3410 * sysctls, allow getcred. This allows identd to work in
3411 * jail.
3412 */
3413 case PRIV_NETINET_GETCRED:
3414 return (0);
3415
3416 /*
3417 * Allow jailed root to set loginclass.
3418 */
3419 case PRIV_PROC_SETLOGINCLASS:
3420 return (0);
3421
3422 /*
3423 * Do not allow a process inside a jail to read the kernel
3424 * message buffer unless explicitly permitted.
3425 */
3426 case PRIV_MSGBUF:
3427 if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
3428 return (0);
3429 return (EPERM);
3430
3431 default:
3432 /*
3433 * In all remaining cases, deny the privilege request. This
3434 * includes almost all network privileges, many system
3435 * configuration privileges.
3436 */
3437 return (EPERM);
3438 }
3439 }
3440
3441 /*
3442 * Return the part of pr2's name that is relative to pr1, or the whole name
3443 * if it does not directly follow.
3444 */
3445
3446 char *
3447 prison_name(struct prison *pr1, struct prison *pr2)
3448 {
3449 char *name;
3450
3451 /* Jails see themselves as "" (if they see themselves at all). */
3452 if (pr1 == pr2)
3453 return "";
3454 name = pr2->pr_name;
3455 if (prison_ischild(pr1, pr2)) {
3456 /*
3457 * pr1 isn't locked (and allprison_lock may not be either)
3458 * so its length can't be counted on. But the number of dots
3459 * can be counted on - and counted.
3460 */
3461 for (; pr1 != &prison0; pr1 = pr1->pr_parent)
3462 name = strchr(name, '.') + 1;
3463 }
3464 return (name);
3465 }
3466
3467 /*
3468 * Return the part of pr2's path that is relative to pr1, or the whole path
3469 * if it does not directly follow.
3470 */
3471 static char *
3472 prison_path(struct prison *pr1, struct prison *pr2)
3473 {
3474 char *path1, *path2;
3475 int len1;
3476
3477 path1 = pr1->pr_path;
3478 path2 = pr2->pr_path;
3479 if (!strcmp(path1, "/"))
3480 return (path2);
3481 len1 = strlen(path1);
3482 if (strncmp(path1, path2, len1))
3483 return (path2);
3484 if (path2[len1] == '\0')
3485 return "/";
3486 if (path2[len1] == '/')
3487 return (path2 + len1);
3488 return (path2);
3489 }
3490
3491
3492 /*
3493 * Jail-related sysctls.
3494 */
3495 static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
3496 "Jails");
3497
/*
 * Sysctl handler: for every descendant of the requesting thread's prison
 * that still has references, write out one struct xprison followed by the
 * prison's IPv4 and then IPv6 address arrays.  Returns 0, or the first
 * error reported by SYSCTL_OUT().
 */
3498 static int
3499 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
3500 {
3501 struct xprison *xp;
3502 struct prison *pr, *cpr;
/* ip4/ip6 are scratch buffers that only grow; ip4s/ip6s are their capacity. */
3503 #ifdef INET
3504 struct in_addr *ip4 = NULL;
3505 int ip4s = 0;
3506 #endif
3507 #ifdef INET6
3508 struct in6_addr *ip6 = NULL;
3509 int ip6s = 0;
3510 #endif
3511 int descend, error;
3512
3513 xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
3514 pr = req->td->td_ucred->cr_prison;
3515 error = 0;
3516 sx_slock(&allprison_lock);
3517 FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
/*
 * The same prison is examined again from here each time a scratch buffer
 * had to be enlarged, because the prison mutex is dropped around realloc()
 * (presumably since an M_WAITOK allocation may sleep - confirm).
 */
3518 #if defined(INET) || defined(INET6)
3519 again:
3520 #endif
3521 mtx_lock(&cpr->pr_mtx);
3522 #ifdef INET
3523 if (cpr->pr_ip4s > 0) {
3524 if (ip4s < cpr->pr_ip4s) {
3525 ip4s = cpr->pr_ip4s;
3526 mtx_unlock(&cpr->pr_mtx);
3527 ip4 = realloc(ip4, ip4s *
3528 sizeof(struct in_addr), M_TEMP, M_WAITOK);
3529 goto again;
3530 }
/* Copy the addresses under the lock; they are written out after unlock. */
3531 bcopy(cpr->pr_ip4, ip4,
3532 cpr->pr_ip4s * sizeof(struct in_addr));
3533 }
3534 #endif
3535 #ifdef INET6
3536 if (cpr->pr_ip6s > 0) {
3537 if (ip6s < cpr->pr_ip6s) {
3538 ip6s = cpr->pr_ip6s;
3539 mtx_unlock(&cpr->pr_mtx);
3540 ip6 = realloc(ip6, ip6s *
3541 sizeof(struct in6_addr), M_TEMP, M_WAITOK);
3542 goto again;
3543 }
/* Copy the addresses under the lock; they are written out after unlock. */
3544 bcopy(cpr->pr_ip6, ip6,
3545 cpr->pr_ip6s * sizeof(struct in6_addr));
3546 }
3547 #endif
/* A prison without references is not reported. */
3548 if (cpr->pr_ref == 0) {
3549 mtx_unlock(&cpr->pr_mtx);
3550 continue;
3551 }
/*
 * Fill in the record while the prison is locked.  Path and name are
 * reported relative to the requesting prison.
 */
3552 bzero(xp, sizeof(*xp));
3553 xp->pr_version = XPRISON_VERSION;
3554 xp->pr_id = cpr->pr_id;
3555 xp->pr_state = cpr->pr_uref > 0
3556 ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
3557 strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
3558 strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
3559 strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
3560 #ifdef INET
3561 xp->pr_ip4s = cpr->pr_ip4s;
3562 #endif
3563 #ifdef INET6
3564 xp->pr_ip6s = cpr->pr_ip6s;
3565 #endif
3566 mtx_unlock(&cpr->pr_mtx);
/* Write the record, then the address arrays counted in the record. */
3567 error = SYSCTL_OUT(req, xp, sizeof(*xp));
3568 if (error)
3569 break;
3570 #ifdef INET
3571 if (xp->pr_ip4s > 0) {
3572 error = SYSCTL_OUT(req, ip4,
3573 xp->pr_ip4s * sizeof(struct in_addr));
3574 if (error)
3575 break;
3576 }
3577 #endif
3578 #ifdef INET6
3579 if (xp->pr_ip6s > 0) {
3580 error = SYSCTL_OUT(req, ip6,
3581 xp->pr_ip6s * sizeof(struct in6_addr));
3582 if (error)
3583 break;
3584 }
3585 #endif
3586 }
/* Reached on normal completion and after a break on error alike. */
3587 sx_sunlock(&allprison_lock);
3588 free(xp, M_TEMP);
3589 #ifdef INET
3590 free(ip4, M_TEMP);
3591 #endif
3592 #ifdef INET6
3593 free(ip6, M_TEMP);
3594 #endif
3595 return (error);
3596 }
3597
3598 SYSCTL_OID(_security_jail, OID_AUTO, list,
3599 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3600 sysctl_jail_list, "S", "List of active jails");
3601
3602 static int
3603 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
3604 {
3605 int error, injail;
3606
3607 injail = jailed(req->td->td_ucred);
3608 error = SYSCTL_OUT(req, &injail, sizeof(injail));
3609
3610 return (error);
3611 }
3612
3613 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
3614 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3615 sysctl_jail_jailed, "I", "Process in jail?");
3616
3617 static int
3618 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
3619 {
3620 int error, havevnet;
3621 #ifdef VIMAGE
3622 struct ucred *cred = req->td->td_ucred;
3623
3624 havevnet = jailed(cred) && prison_owns_vnet(cred);
3625 #else
3626 havevnet = 0;
3627 #endif
3628 error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
3629
3630 return (error);
3631 }
3632
3633 SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
3634 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
3635 sysctl_jail_vnet, "I", "Jail owns vnet?");
3636
#if defined(INET) || defined(INET6)
/* Writable view of jail_max_af_ips; only built with INET and/or INET6. */
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips,
    CTLFLAG_RW, &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
3642
3643 /*
3644 * Default parameters for jail(2) compatibility. For historical reasons,
3645 * the sysctl names have varying similarity to the parameter names. Prisons
3646 * just see their own parameters, and can't change them.
3647 */
3648 static int
3649 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
3650 {
3651 struct prison *pr;
3652 int allow, error, i;
3653
3654 pr = req->td->td_ucred->cr_prison;
3655 allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
3656
3657 /* Get the current flag value, and convert it to a boolean. */
3658 i = (allow & arg2) ? 1 : 0;
3659 if (arg1 != NULL)
3660 i = !i;
3661 error = sysctl_handle_int(oidp, &i, 0, req);
3662 if (error || !req->newptr)
3663 return (error);
3664 i = i ? arg2 : 0;
3665 if (arg1 != NULL)
3666 i ^= arg2;
3667 /*
3668 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
3669 * for writing.
3670 */
3671 mtx_lock(&prison0.pr_mtx);
3672 jail_default_allow = (jail_default_allow & ~arg2) | i;
3673 mtx_unlock(&prison0.pr_mtx);
3674 return (0);
3675 }
3676
3677 SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
3678 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3679 NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
3680 "Processes in jail can set their hostnames (deprecated)");
3681 SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
3682 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3683 (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
3684 "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
3685 SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
3686 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3687 NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
3688 "Processes in jail can use System V IPC primitives (deprecated)");
3689 SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
3690 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3691 NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
3692 "Prison root can create raw sockets (deprecated)");
3693 SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
3694 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3695 NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
3696 "Processes in jail can alter system file flags (deprecated)");
3697 SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
3698 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3699 NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
3700 "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
3701
3702 static int
3703 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
3704 {
3705 struct prison *pr;
3706 int level, error;
3707
3708 pr = req->td->td_ucred->cr_prison;
3709 level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
3710 error = sysctl_handle_int(oidp, &level, 0, req);
3711 if (error || !req->newptr)
3712 return (error);
3713 *(int *)arg1 = level;
3714 return (0);
3715 }
3716
3717 SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
3718 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3719 &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
3720 sysctl_jail_default_level, "I",
3721 "Processes in jail cannot see all mounted file systems (deprecated)");
3722
3723 SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
3724 CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
3725 &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
3726 sysctl_jail_default_level, "I",
3727 "Ruleset for the devfs filesystem in jail (deprecated)");
3728
3729 /*
3730 * Nodes to describe jail parameters. Maximum length of string parameters
3731 * is returned in the string itself, and the other parameters exist merely
3732 * to make themselves and their types known.
3733 */
3734 SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
3735 "Jail parameters");
3736
3737 int
3738 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
3739 {
3740 int i;
3741 long l;
3742 size_t s;
3743 char numbuf[12];
3744
3745 switch (oidp->oid_kind & CTLTYPE)
3746 {
3747 case CTLTYPE_LONG:
3748 case CTLTYPE_ULONG:
3749 l = 0;
3750 #ifdef SCTL_MASK32
3751 if (!(req->flags & SCTL_MASK32))
3752 #endif
3753 return (SYSCTL_OUT(req, &l, sizeof(l)));
3754 case CTLTYPE_INT:
3755 case CTLTYPE_UINT:
3756 i = 0;
3757 return (SYSCTL_OUT(req, &i, sizeof(i)));
3758 case CTLTYPE_STRING:
3759 snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
3760 return
3761 (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
3762 case CTLTYPE_STRUCT:
3763 s = (size_t)arg2;
3764 return (SYSCTL_OUT(req, &s, sizeof(s)));
3765 }
3766 return (0);
3767 }
3768
3769 /*
3770 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
3771 * jail creation time but cannot be changed in an existing jail.
3772 */
3773 SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
3774 SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
3775 SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
3776 SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
3777 SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
3778 "I", "Jail secure level");
3779 SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
3780 "Jail value for kern.osreldate and uname -K");
3781 SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
3782 "Jail value for kern.osrelease and uname -r");
3783 SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
3784 "I", "Jail cannot see all mounted file systems");
3785 SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
3786 "I", "Ruleset for in-jail devfs mounts");
3787 SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
3788 "B", "Jail persistence");
3789 #ifdef VIMAGE
3790 SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
3791 "E,jailsys", "Virtual network stack");
3792 #endif
3793 SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
3794 "B", "Jail is in the process of shutting down");
3795
3796 SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
3797 SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
3798 "I", "Current number of child jails");
3799 SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
3800 "I", "Maximum number of child jails");
3801
3802 SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
3803 SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
3804 "Jail hostname");
3805 SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
3806 "Jail NIS domainname");
3807 SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
3808 "Jail host UUID");
3809 SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
3810 "LU", "Jail host ID");
3811
3812 SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
3813 SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
3814
3815 #ifdef INET
3816 SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
3817 "Jail IPv4 address virtualization");
3818 SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
3819 "S,in_addr,a", "Jail IPv4 addresses");
3820 SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
3821 "B", "Do (not) use IPv4 source address selection rather than the "
3822 "primary jail IPv4 address.");
3823 #endif
3824 #ifdef INET6
3825 SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
3826 "Jail IPv6 address virtualization");
3827 SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
3828 "S,in6_addr,a", "Jail IPv6 addresses");
3829 SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
3830 "B", "Do (not) use IPv6 source address selection rather than the "
3831 "primary jail IPv6 address.");
3832 #endif
3833
3834 SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
3835 SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
3836 "B", "Jail may set hostname");
3837 SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
3838 "B", "Jail may use SYSV IPC");
3839 SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
3840 "B", "Jail may create raw sockets");
3841 SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
3842 "B", "Jail may alter system file flags");
3843 SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
3844 "B", "Jail may set file quotas");
3845 SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
3846 "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
3847 SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
3848 "B", "Jail may lock (unlock) physical pages in memory");
3849 SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
3850 "B", "Jail may bind sockets to reserved ports");
3851 SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
3852 "B", "Jail may read the kernel message buffer");
3853
3854 SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
3855 SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
3856 "B", "Jail may mount/unmount jail-friendly file systems in general");
3857
3858 /*
3859 * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>. Return
3860 * its associated bit in the pr_allow bitmask, or zero if the parameter was
3861 * not created.
3862 */
3863 unsigned
3864 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
3865 const char *descr)
3866 {
3867 struct bool_flags *bf;
3868 struct sysctl_oid *parent;
3869 char *allow_name, *allow_noname, *allowed;
3870 #ifndef NO_SYSCTL_DESCR
3871 char *descr_deprecated;
3872 #endif
3873 unsigned allow_flag;
3874
3875 if (prefix
3876 ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
3877 < 0 ||
3878 asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
3879 < 0
3880 : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
3881 asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
3882 free(allow_name, M_PRISON);
3883 return 0;
3884 }
3885
3886 /*
3887 * See if this parameter has already beed added, i.e. a module was
3888 * previously loaded/unloaded.
3889 */
3890 mtx_lock(&prison0.pr_mtx);
3891 for (bf = pr_flag_allow;
3892 bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
3893 bf++) {
3894 if (strcmp(bf->name, allow_name) == 0) {
3895 allow_flag = bf->flag;
3896 goto no_add;
3897 }
3898 }
3899
3900 /*
3901 * Find a free bit in prison0's pr_allow, failing if there are none
3902 * (which shouldn't happen as long as we keep track of how many
3903 * potential dynamic flags exist).
3904 */
3905 for (allow_flag = 1;; allow_flag <<= 1) {
3906 if (allow_flag == 0)
3907 goto no_add;
3908 if ((prison0.pr_allow & allow_flag) == 0)
3909 break;
3910 }
3911
3912 /*
3913 * Note the parameter in the next open slot in pr_flag_allow.
3914 * Set the flag last so code that checks pr_flag_allow can do so
3915 * without locking.
3916 */
3917 for (bf = pr_flag_allow; bf->flag != 0; bf++)
3918 if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
3919 /* This should never happen, but is not fatal. */
3920 allow_flag = 0;
3921 goto no_add;
3922 }
3923 prison0.pr_allow |= allow_flag;
3924 bf->name = allow_name;
3925 bf->noname = allow_noname;
3926 bf->flag = allow_flag;
3927 mtx_unlock(&prison0.pr_mtx);
3928
3929 /*
3930 * Create sysctls for the parameter, and the back-compat global
3931 * permission.
3932 */
3933 parent = prefix
3934 ? SYSCTL_ADD_NODE(NULL,
3935 SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
3936 OID_AUTO, prefix, 0, 0, prefix_descr)
3937 : &sysctl___security_jail_param_allow;
3938 (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
3939 name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
3940 NULL, 0, sysctl_jail_param, "B", descr);
3941 if ((prefix
3942 ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
3943 : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
3944 #ifndef NO_SYSCTL_DESCR
3945 (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
3946 descr);
3947 #endif
3948 (void)SYSCTL_ADD_PROC(NULL,
3949 SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
3950 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
3951 sysctl_jail_default_allow, "I", descr_deprecated);
3952 #ifndef NO_SYSCTL_DESCR
3953 free(descr_deprecated, M_TEMP);
3954 #endif
3955 free(allowed, M_TEMP);
3956 }
3957 return allow_flag;
3958
3959 no_add:
3960 mtx_unlock(&prison0.pr_mtx);
3961 free(allow_name, M_PRISON);
3962 free(allow_noname, M_PRISON);
3963 return allow_flag;
3964 }
3965
3966 /*
3967 * The VFS system will register jail-aware filesystems here. They each get
3968 * a parameter allow.mount.xxxfs and a flag to check when a jailed user
3969 * attempts to mount.
3970 */
3971 void
3972 prison_add_vfs(struct vfsconf *vfsp)
3973 {
3974 #ifdef NO_SYSCTL_DESCR
3975
3976 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
3977 NULL, NULL);
3978 #else
3979 char *descr;
3980
3981 (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
3982 vfsp->vfc_name);
3983 vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
3984 NULL, descr);
3985 free(descr, M_TEMP);
3986 #endif
3987 }
3988
3989 #ifdef RACCT
3990 void
3991 prison_racct_foreach(void (*callback)(struct racct *racct,
3992 void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
3993 void *arg2, void *arg3)
3994 {
3995 struct prison_racct *prr;
3996
3997 ASSERT_RACCT_ENABLED();
3998
3999 sx_slock(&allprison_lock);
4000 if (pre != NULL)
4001 (pre)();
4002 LIST_FOREACH(prr, &allprison_racct, prr_next)
4003 (callback)(prr->prr_racct, arg2, arg3);
4004 if (post != NULL)
4005 (post)();
4006 sx_sunlock(&allprison_lock);
4007 }
4008
4009 static struct prison_racct *
4010 prison_racct_find_locked(const char *name)
4011 {
4012 struct prison_racct *prr;
4013
4014 ASSERT_RACCT_ENABLED();
4015 sx_assert(&allprison_lock, SA_XLOCKED);
4016
4017 if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4018 return (NULL);
4019
4020 LIST_FOREACH(prr, &allprison_racct, prr_next) {
4021 if (strcmp(name, prr->prr_name) != 0)
4022 continue;
4023
4024 /* Found prison_racct with a matching name? */
4025 prison_racct_hold(prr);
4026 return (prr);
4027 }
4028
4029 /* Add new prison_racct. */
4030 prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4031 racct_create(&prr->prr_racct);
4032
4033 strcpy(prr->prr_name, name);
4034 refcount_init(&prr->prr_refcount, 1);
4035 LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4036
4037 return (prr);
4038 }
4039
4040 struct prison_racct *
4041 prison_racct_find(const char *name)
4042 {
4043 struct prison_racct *prr;
4044
4045 ASSERT_RACCT_ENABLED();
4046
4047 sx_xlock(&allprison_lock);
4048 prr = prison_racct_find_locked(name);
4049 sx_xunlock(&allprison_lock);
4050 return (prr);
4051 }
4052
4053 void
4054 prison_racct_hold(struct prison_racct *prr)
4055 {
4056
4057 ASSERT_RACCT_ENABLED();
4058
4059 refcount_acquire(&prr->prr_refcount);
4060 }
4061
4062 static void
4063 prison_racct_free_locked(struct prison_racct *prr)
4064 {
4065
4066 ASSERT_RACCT_ENABLED();
4067 sx_assert(&allprison_lock, SA_XLOCKED);
4068
4069 if (refcount_release(&prr->prr_refcount)) {
4070 racct_destroy(&prr->prr_racct);
4071 LIST_REMOVE(prr, prr_next);
4072 free(prr, M_PRISON_RACCT);
4073 }
4074 }
4075
4076 void
4077 prison_racct_free(struct prison_racct *prr)
4078 {
4079 int old;
4080
4081 ASSERT_RACCT_ENABLED();
4082 sx_assert(&allprison_lock, SA_UNLOCKED);
4083
4084 old = prr->prr_refcount;
4085 if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
4086 return;
4087
4088 sx_xlock(&allprison_lock);
4089 prison_racct_free_locked(prr);
4090 sx_xunlock(&allprison_lock);
4091 }
4092
4093 static void
4094 prison_racct_attach(struct prison *pr)
4095 {
4096 struct prison_racct *prr;
4097
4098 ASSERT_RACCT_ENABLED();
4099 sx_assert(&allprison_lock, SA_XLOCKED);
4100
4101 prr = prison_racct_find_locked(pr->pr_name);
4102 KASSERT(prr != NULL, ("cannot find prison_racct"));
4103
4104 pr->pr_prison_racct = prr;
4105 }
4106
4107 /*
4108 * Handle jail renaming. From the racct point of view, renaming means
4109 * moving from one prison_racct to another.
4110 */
4111 static void
4112 prison_racct_modify(struct prison *pr)
4113 {
4114 #ifdef RCTL
4115 struct proc *p;
4116 struct ucred *cred;
4117 #endif
4118 struct prison_racct *oldprr;
4119
4120 ASSERT_RACCT_ENABLED();
4121
4122 sx_slock(&allproc_lock);
4123 sx_xlock(&allprison_lock);
4124
4125 if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4126 sx_xunlock(&allprison_lock);
4127 sx_sunlock(&allproc_lock);
4128 return;
4129 }
4130
4131 oldprr = pr->pr_prison_racct;
4132 pr->pr_prison_racct = NULL;
4133
4134 prison_racct_attach(pr);
4135
4136 /*
4137 * Move resource utilisation records.
4138 */
4139 racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4140
4141 #ifdef RCTL
4142 /*
4143 * Force rctl to reattach rules to processes.
4144 */
4145 FOREACH_PROC_IN_SYSTEM(p) {
4146 PROC_LOCK(p);
4147 cred = crhold(p->p_ucred);
4148 PROC_UNLOCK(p);
4149 rctl_proc_ucred_changed(p, cred);
4150 crfree(cred);
4151 }
4152 #endif
4153
4154 sx_sunlock(&allproc_lock);
4155 prison_racct_free_locked(oldprr);
4156 sx_xunlock(&allprison_lock);
4157 }
4158
4159 static void
4160 prison_racct_detach(struct prison *pr)
4161 {
4162
4163 ASSERT_RACCT_ENABLED();
4164 sx_assert(&allprison_lock, SA_UNLOCKED);
4165
4166 if (pr->pr_prison_racct == NULL)
4167 return;
4168 prison_racct_free(pr->pr_prison_racct);
4169 pr->pr_prison_racct = NULL;
4170 }
4171 #endif /* RACCT */
4172
4173 #ifdef DDB
4174
4175 static void
4176 db_show_prison(struct prison *pr)
4177 {
4178 struct bool_flags *bf;
4179 struct jailsys_flags *jsf;
4180 #if defined(INET) || defined(INET6)
4181 int ii;
4182 #endif
4183 unsigned f;
4184 #ifdef INET
4185 char ip4buf[INET_ADDRSTRLEN];
4186 #endif
4187 #ifdef INET6
4188 char ip6buf[INET6_ADDRSTRLEN];
4189 #endif
4190
4191 db_printf("prison %p:\n", pr);
4192 db_printf(" jid = %d\n", pr->pr_id);
4193 db_printf(" name = %s\n", pr->pr_name);
4194 db_printf(" parent = %p\n", pr->pr_parent);
4195 db_printf(" ref = %d\n", pr->pr_ref);
4196 db_printf(" uref = %d\n", pr->pr_uref);
4197 db_printf(" path = %s\n", pr->pr_path);
4198 db_printf(" cpuset = %d\n", pr->pr_cpuset
4199 ? pr->pr_cpuset->cs_id : -1);
4200 #ifdef VIMAGE
4201 db_printf(" vnet = %p\n", pr->pr_vnet);
4202 #endif
4203 db_printf(" root = %p\n", pr->pr_root);
4204 db_printf(" securelevel = %d\n", pr->pr_securelevel);
4205 db_printf(" devfs_rsnum = %d\n", pr->pr_devfs_rsnum);
4206 db_printf(" children.max = %d\n", pr->pr_childmax);
4207 db_printf(" children.cur = %d\n", pr->pr_childcount);
4208 db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children));
4209 db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling));
4210 db_printf(" flags = 0x%x", pr->pr_flags);
4211 for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
4212 if (pr->pr_flags & bf->flag)
4213 db_printf(" %s", bf->name);
4214 for (jsf = pr_flag_jailsys;
4215 jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
4216 jsf++) {
4217 f = pr->pr_flags & (jsf->disable | jsf->new);
4218 db_printf(" %-16s= %s\n", jsf->name,
4219 (f != 0 && f == jsf->disable) ? "disable"
4220 : (f == jsf->new) ? "new"
4221 : "inherit");
4222 }
4223 db_printf(" allow = 0x%x", pr->pr_allow);
4224 for (bf = pr_flag_allow;
4225 bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
4226 bf++)
4227 if (pr->pr_allow & bf->flag)
4228 db_printf(" %s", bf->name);
4229 db_printf("\n");
4230 db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs);
4231 db_printf(" host.hostname = %s\n", pr->pr_hostname);
4232 db_printf(" host.domainname = %s\n", pr->pr_domainname);
4233 db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid);
4234 db_printf(" host.hostid = %lu\n", pr->pr_hostid);
4235 #ifdef INET
4236 db_printf(" ip4s = %d\n", pr->pr_ip4s);
4237 for (ii = 0; ii < pr->pr_ip4s; ii++)
4238 db_printf(" %s %s\n",
4239 ii == 0 ? "ip4.addr =" : " ",
4240 inet_ntoa_r(pr->pr_ip4[ii], ip4buf));
4241 #endif
4242 #ifdef INET6
4243 db_printf(" ip6s = %d\n", pr->pr_ip6s);
4244 for (ii = 0; ii < pr->pr_ip6s; ii++)
4245 db_printf(" %s %s\n",
4246 ii == 0 ? "ip6.addr =" : " ",
4247 ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
4248 #endif
4249 }
4250
4251 DB_SHOW_COMMAND(prison, db_show_prison_command)
4252 {
4253 struct prison *pr;
4254
4255 if (!have_addr) {
4256 /*
4257 * Show all prisons in the list, and prison0 which is not
4258 * listed.
4259 */
4260 db_show_prison(&prison0);
4261 if (!db_pager_quit) {
4262 TAILQ_FOREACH(pr, &allprison, pr_list) {
4263 db_show_prison(pr);
4264 if (db_pager_quit)
4265 break;
4266 }
4267 }
4268 return;
4269 }
4270
4271 if (addr == 0)
4272 pr = &prison0;
4273 else {
4274 /* Look for a prison with the ID and with references. */
4275 TAILQ_FOREACH(pr, &allprison, pr_list)
4276 if (pr->pr_id == addr && pr->pr_ref > 0)
4277 break;
4278 if (pr == NULL)
4279 /* Look again, without requiring a reference. */
4280 TAILQ_FOREACH(pr, &allprison, pr_list)
4281 if (pr->pr_id == addr)
4282 break;
4283 if (pr == NULL)
4284 /* Assume address points to a valid prison. */
4285 pr = (struct prison *)addr;
4286 }
4287 db_show_prison(pr);
4288 }
4289
4290 #endif /* DDB */
Cache object: a98dd8a4789fd61132d0ea420250a47e
|