The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_jail.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*-
    2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
    3  *
    4  * Copyright (c) 1999 Poul-Henning Kamp.
    5  * Copyright (c) 2008 Bjoern A. Zeeb.
    6  * Copyright (c) 2009 James Gritton.
    7  * All rights reserved.
    8  *
    9  * Redistribution and use in source and binary forms, with or without
   10  * modification, are permitted provided that the following conditions
   11  * are met:
   12  * 1. Redistributions of source code must retain the above copyright
   13  *    notice, this list of conditions and the following disclaimer.
   14  * 2. Redistributions in binary form must reproduce the above copyright
   15  *    notice, this list of conditions and the following disclaimer in the
   16  *    documentation and/or other materials provided with the distribution.
   17  *
   18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   28  * SUCH DAMAGE.
   29  */
   30 
   31 #include <sys/cdefs.h>
   32 __FBSDID("$FreeBSD: releng/12.0/sys/kern/kern_jail.c 340628 2018-11-19 14:19:27Z kib $");
   33 
   34 #include "opt_ddb.h"
   35 #include "opt_inet.h"
   36 #include "opt_inet6.h"
   37 
   38 #include <sys/param.h>
   39 #include <sys/types.h>
   40 #include <sys/kernel.h>
   41 #include <sys/systm.h>
   42 #include <sys/errno.h>
   43 #include <sys/sysproto.h>
   44 #include <sys/malloc.h>
   45 #include <sys/osd.h>
   46 #include <sys/priv.h>
   47 #include <sys/proc.h>
   48 #include <sys/taskqueue.h>
   49 #include <sys/fcntl.h>
   50 #include <sys/jail.h>
   51 #include <sys/lock.h>
   52 #include <sys/mutex.h>
   53 #include <sys/racct.h>
   54 #include <sys/rctl.h>
   55 #include <sys/refcount.h>
   56 #include <sys/sx.h>
   57 #include <sys/sysent.h>
   58 #include <sys/namei.h>
   59 #include <sys/mount.h>
   60 #include <sys/queue.h>
   61 #include <sys/socket.h>
   62 #include <sys/syscallsubr.h>
   63 #include <sys/sysctl.h>
   64 #include <sys/vnode.h>
   65 
   66 #include <net/if.h>
   67 #include <net/vnet.h>
   68 
   69 #include <netinet/in.h>
   70 
   71 #ifdef DDB
   72 #include <ddb/ddb.h>
   73 #endif /* DDB */
   74 
   75 #include <security/mac/mac_framework.h>
   76 
   77 #define DEFAULT_HOSTUUID        "00000000-0000-0000-0000-000000000000"
   78 
   79 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
   80 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
   81 
   82 /* Keep struct prison prison0 and some code in kern_jail_set() readable. */
   83 #ifdef INET
   84 #ifdef INET6
   85 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL
   86 #else
   87 #define _PR_IP_SADDRSEL PR_IP4_SADDRSEL
   88 #endif
   89 #else /* !INET */
   90 #ifdef INET6
   91 #define _PR_IP_SADDRSEL PR_IP6_SADDRSEL
   92 #else
   93 #define _PR_IP_SADDRSEL 0
   94 #endif
   95 #endif
   96 
   97 /*
       * prison0 describes what is "real" about the system.
       *
       * Only members with constant values are initialized here; pr_cpuset,
       * pr_osreldate and pr_osrelease are filled in at run time by
       * prison0_init().
       */
   98 struct prison prison0 = {
   99         .pr_id          = 0,
  100         .pr_name        = "",
  101         .pr_ref         = 1,
  102         .pr_uref        = 1,
  103         .pr_path        = "/",
  104         .pr_securelevel = -1,
  105         .pr_devfs_rsnum = 0,
  106         .pr_childmax    = JAIL_MAX,
  107         .pr_hostuuid    = DEFAULT_HOSTUUID,   /* all-zero UUID string */
  108         .pr_children    = LIST_HEAD_INITIALIZER(prison0.pr_children),
      /* PR_VNET is only part of the flag set when VIMAGE is configured. */
  109 #ifdef VIMAGE
  110         .pr_flags       = PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
  111 #else
  112         .pr_flags       = PR_HOST|_PR_IP_SADDRSEL,
  113 #endif
  114         .pr_allow       = PR_ALLOW_ALL_STATIC,
  115 };
      /* prison0.pr_mtx is set up through MTX_SYSINIT as an MTX_DEF mutex. */
  116 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
  117 
  118 struct bool_flags {               /* names for one boolean flag bit */
  119         const char      *name;    /* parameter name, e.g. "persist" */
  120         const char      *noname;  /* negated name, e.g. "nopersist" */
  121         unsigned         flag;    /* the flag bit both names refer to */
  122 };
  123 struct jailsys_flags {            /* one JAIL_SYS_* style parameter */
  124         const char      *name;    /* parameter name, e.g. "host" */
  125         unsigned         disable; /* bits set for JAIL_SYS_DISABLE; 0 makes
                                          * kern_jail_set() reject that value */
  126         unsigned         new;     /* bits set for JAIL_SYS_NEW */
  127 };
  128 
  129 /* allprison, allprison_racct and lastprid are protected by allprison_lock. */
  130 struct  sx allprison_lock;
  131 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
  132 struct  prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison); /* starts empty */
  133 LIST_HEAD(, prison_racct) allprison_racct;       /* list of struct prison_racct */
  134 int     lastprid = 0;     /* NOTE(review): presumably the last jail ID assigned -- confirm */
  135 
  136 static int do_jail_attach(struct thread *td, struct prison *pr); /* forward declarations */
  137 static void prison_complete(void *context, int pending);
  138 static void prison_deref(struct prison *pr, int flags);  /* flags: PD_* bits */
  139 static char *prison_path(struct prison *pr1, struct prison *pr2);
  140 static void prison_remove_one(struct prison *pr);
      /* The prison_racct_* helpers are only declared when RACCT is configured. */
  141 #ifdef RACCT
  142 static void prison_racct_attach(struct prison *pr);
  143 static void prison_racct_modify(struct prison *pr);
  144 static void prison_racct_detach(struct prison *pr);
  145 #endif
  146 
  147 /*
       * Flags for prison_deref.  Each value is a distinct single bit, passed in
       * that function's "flags" argument.
       *
       * NOTE(review): the meaning of each bit is defined by prison_deref(),
       * which is outside this excerpt.  The names suggest "drop a reference",
       * "drop a user reference" and "caller already holds the prison mutex /
       * allprison_lock shared / exclusive" -- confirm against prison_deref().
       */
  148 #define PD_DEREF        0x01
  149 #define PD_DEUREF       0x02
  150 #define PD_LOCKED       0x04
  151 #define PD_LIST_SLOCKED 0x08
  152 #define PD_LIST_XLOCKED 0x10
  153 
  154 /*
  155  * Parameter names corresponding to PR_* flag values.  Size values are for kvm
  156  * as we cannot figure out the size of a sparse array, or an array without a
  157  * terminating entry.
       *
       * Each entry is {name, noname, flag}.  kern_jail_set() walks the whole
       * table with nitems(); the address-family entries exist only when the
       * matching INET/INET6 option is configured.
  158  */
  159 static struct bool_flags pr_flag_bool[] = {
  160         {"persist", "nopersist", PR_PERSIST},
  161 #ifdef INET
  162         {"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
  163 #endif
  164 #ifdef INET6
  165         {"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
  166 #endif
  167 };
  168 const size_t pr_flag_bool_size = sizeof(pr_flag_bool);   /* in bytes, not entries */
  169 
  170 static struct jailsys_flags pr_flag_jailsys[] = { /* {name, disable, new} */
  171         {"host", 0, PR_HOST},     /* disable == 0: JAIL_SYS_DISABLE yields EINVAL */
  172 #ifdef VIMAGE
  173         {"vnet", 0, PR_VNET},     /* disable == 0: JAIL_SYS_DISABLE yields EINVAL */
  174 #endif
  175 #ifdef INET
  176         {"ip4", PR_IP4_USER, PR_IP4_USER},        /* same bit for disable and new */
  177 #endif
  178 #ifdef INET6
  179         {"ip6", PR_IP6_USER, PR_IP6_USER},        /* same bit for disable and new */
  180 #endif
  181 };
  182 const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);     /* in bytes, not entries */
  183 
  184 /*
       * Make this array full-size so dynamic parameters can be added.
       *
       * The array has NBBY * NBPW entries (presumably one per bit of a machine
       * word -- confirm against sys/param.h).  Entries not listed below are
       * zero-initialized, so the loops in kern_jail() and kern_jail_set() stop
       * at the first entry whose flag is 0.
       */
  185 static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
  186         {"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
  187         {"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
  188         {"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
  189         {"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
  190         {"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
  191         {"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
  192         {"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
  193         {"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
  194         {"allow.reserved_ports", "allow.noreserved_ports",
  195          PR_ALLOW_RESERVED_PORTS},
  196         {"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
  197 };
  198 const size_t pr_flag_allow_size = sizeof(pr_flag_allow); /* bytes of the full array */
  199 
  200 #define JAIL_DEFAULT_ALLOW              (PR_ALLOW_SET_HOSTNAME | PR_ALLOW_RESERVED_PORTS)
  201 #define JAIL_DEFAULT_ENFORCE_STATFS     2
  202 #define JAIL_DEFAULT_DEVFS_RSNUM        0
      /*
       * Run-time copies of the defaults above.  kern_jail() applies
       * jail_default_allow and jail_default_enforce_statfs when the caller is
       * not itself jailed.
       * NOTE(review): jail_default_devfs_rsnum is not used in the visible part
       * of the file -- confirm where it is consumed.
       */
  203 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
  204 static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
  205 static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
      /*
       * Largest number of addresses per address family that kern_jail() and
       * kern_jail_set() accept; larger counts are rejected with EINVAL.
       */
  206 #if defined(INET) || defined(INET6)
  207 static unsigned jail_max_af_ips = 255;
  208 #endif
  209 
  210 /*
  211  * Initialize the parts of prison0 that can't be static-initialized with
  212  * constants.  This is called from proc0_init() after creating thread0 cpuset.
  213  */
  214 void
  215 prison0_init(void)
  216 {
  217 
  218         prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
  219         prison0.pr_osreldate = osreldate;
  220         strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
  221 }
  222 
  223 /*
  224  * struct jail_args {
  225  *      struct jail *jail;
  226  * };
  227  */
  228 int
  229 sys_jail(struct thread *td, struct jail_args *uap)
  230 {
  231         uint32_t version;
  232         int error;
  233         struct jail j;
  234 
  235         error = copyin(uap->jail, &version, sizeof(uint32_t));
  236         if (error)
  237                 return (error);
  238 
  239         switch (version) {
  240         case 0:
  241         {
  242                 struct jail_v0 j0;
  243 
  244                 /* FreeBSD single IPv4 jails. */
  245                 bzero(&j, sizeof(struct jail));
  246                 error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
  247                 if (error)
  248                         return (error);
  249                 j.version = j0.version;
  250                 j.path = j0.path;
  251                 j.hostname = j0.hostname;
  252                 j.ip4s = htonl(j0.ip_number);   /* jail_v0 is host order */
  253                 break;
  254         }
  255 
  256         case 1:
  257                 /*
  258                  * Version 1 was used by multi-IPv4 jail implementations
  259                  * that never made it into the official kernel.
  260                  */
  261                 return (EINVAL);
  262 
  263         case 2: /* JAIL_API_VERSION */
  264                 /* FreeBSD multi-IPv4/IPv6,noIP jails. */
  265                 error = copyin(uap->jail, &j, sizeof(struct jail));
  266                 if (error)
  267                         return (error);
  268                 break;
  269 
  270         default:
  271                 /* Sci-Fi jails are not supported, sorry. */
  272                 return (EINVAL);
  273         }
  274         return (kern_jail(td, &j));
  275 }
  276 
  277 int
  278 kern_jail(struct thread *td, struct jail *j)
  279 {
  280         struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
  281 #ifdef INET
  282                             + 1
  283 #endif
  284 #ifdef INET6
  285                             + 1
  286 #endif
  287                             )];
  288         struct uio opt;
  289         char *u_path, *u_hostname, *u_name;
  290         struct bool_flags *bf;
  291 #ifdef INET
  292         uint32_t ip4s;
  293         struct in_addr *u_ip4;
  294 #endif
  295 #ifdef INET6
  296         struct in6_addr *u_ip6;
  297 #endif
  298         size_t tmplen;
  299         int error, enforce_statfs;
  300 
  301         bzero(&optiov, sizeof(optiov));
  302         opt.uio_iov = optiov;
  303         opt.uio_iovcnt = 0;
  304         opt.uio_offset = -1;
  305         opt.uio_resid = -1;
  306         opt.uio_segflg = UIO_SYSSPACE;
  307         opt.uio_rw = UIO_READ;
  308         opt.uio_td = td;
  309 
  310         /* Set permissions for top-level jails from sysctls. */
  311         if (!jailed(td->td_ucred)) {
  312                 for (bf = pr_flag_allow;
  313                      bf < pr_flag_allow + nitems(pr_flag_allow) &&
  314                         bf->flag != 0;
  315                      bf++) {
  316                         optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
  317                             (jail_default_allow & bf->flag)
  318                             ? bf->name : bf->noname);
  319                         optiov[opt.uio_iovcnt].iov_len =
  320                             strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
  321                         opt.uio_iovcnt += 2;
  322                 }
  323                 optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
  324                 optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
  325                 opt.uio_iovcnt++;
  326                 enforce_statfs = jail_default_enforce_statfs;
  327                 optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
  328                 optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
  329                 opt.uio_iovcnt++;
  330         }
  331 
  332         tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
  333 #ifdef INET
  334         ip4s = (j->version == 0) ? 1 : j->ip4s;
  335         if (ip4s > jail_max_af_ips)
  336                 return (EINVAL);
  337         tmplen += ip4s * sizeof(struct in_addr);
  338 #else
  339         if (j->ip4s > 0)
  340                 return (EINVAL);
  341 #endif
  342 #ifdef INET6
  343         if (j->ip6s > jail_max_af_ips)
  344                 return (EINVAL);
  345         tmplen += j->ip6s * sizeof(struct in6_addr);
  346 #else
  347         if (j->ip6s > 0)
  348                 return (EINVAL);
  349 #endif
  350         u_path = malloc(tmplen, M_TEMP, M_WAITOK);
  351         u_hostname = u_path + MAXPATHLEN;
  352         u_name = u_hostname + MAXHOSTNAMELEN;
  353 #ifdef INET
  354         u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
  355 #endif
  356 #ifdef INET6
  357 #ifdef INET
  358         u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
  359 #else
  360         u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
  361 #endif
  362 #endif
  363         optiov[opt.uio_iovcnt].iov_base = "path";
  364         optiov[opt.uio_iovcnt].iov_len = sizeof("path");
  365         opt.uio_iovcnt++;
  366         optiov[opt.uio_iovcnt].iov_base = u_path;
  367         error = copyinstr(j->path, u_path, MAXPATHLEN,
  368             &optiov[opt.uio_iovcnt].iov_len);
  369         if (error) {
  370                 free(u_path, M_TEMP);
  371                 return (error);
  372         }
  373         opt.uio_iovcnt++;
  374         optiov[opt.uio_iovcnt].iov_base = "host.hostname";
  375         optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
  376         opt.uio_iovcnt++;
  377         optiov[opt.uio_iovcnt].iov_base = u_hostname;
  378         error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
  379             &optiov[opt.uio_iovcnt].iov_len);
  380         if (error) {
  381                 free(u_path, M_TEMP);
  382                 return (error);
  383         }
  384         opt.uio_iovcnt++;
  385         if (j->jailname != NULL) {
  386                 optiov[opt.uio_iovcnt].iov_base = "name";
  387                 optiov[opt.uio_iovcnt].iov_len = sizeof("name");
  388                 opt.uio_iovcnt++;
  389                 optiov[opt.uio_iovcnt].iov_base = u_name;
  390                 error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
  391                     &optiov[opt.uio_iovcnt].iov_len);
  392                 if (error) {
  393                         free(u_path, M_TEMP);
  394                         return (error);
  395                 }
  396                 opt.uio_iovcnt++;
  397         }
  398 #ifdef INET
  399         optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
  400         optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
  401         opt.uio_iovcnt++;
  402         optiov[opt.uio_iovcnt].iov_base = u_ip4;
  403         optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
  404         if (j->version == 0)
  405                 u_ip4->s_addr = j->ip4s;
  406         else {
  407                 error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
  408                 if (error) {
  409                         free(u_path, M_TEMP);
  410                         return (error);
  411                 }
  412         }
  413         opt.uio_iovcnt++;
  414 #endif
  415 #ifdef INET6
  416         optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
  417         optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
  418         opt.uio_iovcnt++;
  419         optiov[opt.uio_iovcnt].iov_base = u_ip6;
  420         optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
  421         error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
  422         if (error) {
  423                 free(u_path, M_TEMP);
  424                 return (error);
  425         }
  426         opt.uio_iovcnt++;
  427 #endif
  428         KASSERT(opt.uio_iovcnt <= nitems(optiov),
  429                 ("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
  430         error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
  431         free(u_path, M_TEMP);
  432         return (error);
  433 }
  434 
  435 
  436 /*
  437  * struct jail_set_args {
  438  *      struct iovec *iovp;
  439  *      unsigned int iovcnt;
  440  *      int flags;
  441  * };
  442  */
  443 int
  444 sys_jail_set(struct thread *td, struct jail_set_args *uap)
  445 {
  446         struct uio *auio;
  447         int error;
  448 
  449         /* Check that we have an even number of iovecs. */
  450         if (uap->iovcnt & 1)
  451                 return (EINVAL);
  452 
  453         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
  454         if (error)
  455                 return (error);
  456         error = kern_jail_set(td, auio, uap->flags);
  457         free(auio, M_IOV);
  458         return (error);
  459 }
  460 
  461 int
  462 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
  463 {
  464         struct nameidata nd;
  465 #ifdef INET
  466         struct in_addr *ip4;
  467 #endif
  468 #ifdef INET6
  469         struct in6_addr *ip6;
  470 #endif
  471         struct vfsopt *opt;
  472         struct vfsoptlist *opts;
  473         struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
  474         struct vnode *root;
  475         char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
  476         char *g_path, *osrelstr;
  477         struct bool_flags *bf;
  478         struct jailsys_flags *jsf;
  479 #if defined(INET) || defined(INET6)
  480         struct prison *tppr;
  481         void *op;
  482 #endif
  483         unsigned long hid;
  484         size_t namelen, onamelen, pnamelen;
  485         int born, created, cuflags, descend, enforce;
  486         int error, errmsg_len, errmsg_pos;
  487         int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
  488         int jid, jsys, len, level;
  489         int childmax, osreldt, rsnum, slevel;
  490         int fullpath_disabled;
  491 #if defined(INET) || defined(INET6)
  492         int ii, ij;
  493 #endif
  494 #ifdef INET
  495         int ip4s, redo_ip4;
  496 #endif
  497 #ifdef INET6
  498         int ip6s, redo_ip6;
  499 #endif
  500         uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
  501         unsigned tallow;
  502         char numbuf[12];
  503 
  504         error = priv_check(td, PRIV_JAIL_SET);
  505         if (!error && (flags & JAIL_ATTACH))
  506                 error = priv_check(td, PRIV_JAIL_ATTACH);
  507         if (error)
  508                 return (error);
  509         mypr = td->td_ucred->cr_prison;
  510         if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
  511                 return (EPERM);
  512         if (flags & ~JAIL_SET_MASK)
  513                 return (EINVAL);
  514 
  515         /*
  516          * Check all the parameters before committing to anything.  Not all
  517          * errors can be caught early, but we may as well try.  Also, this
  518          * takes care of some expensive stuff (path lookup) before getting
  519          * the allprison lock.
  520          *
  521          * XXX Jails are not filesystems, and jail parameters are not mount
  522          *     options.  But it makes more sense to re-use the vfsopt code
  523          *     than duplicate it under a different name.
  524          */
  525         error = vfs_buildopts(optuio, &opts);
  526         if (error)
  527                 return (error);
  528 #ifdef INET
  529         ip4 = NULL;
  530 #endif
  531 #ifdef INET6
  532         ip6 = NULL;
  533 #endif
  534         g_path = NULL;
  535 
  536         cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
  537         if (!cuflags) {
  538                 error = EINVAL;
  539                 vfs_opterror(opts, "no valid operation (create or update)");
  540                 goto done_errmsg;
  541         }
  542 
  543         error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
  544         if (error == ENOENT)
  545                 jid = 0;
  546         else if (error != 0)
  547                 goto done_free;
  548 
  549         error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
  550         if (error == ENOENT)
  551                 gotslevel = 0;
  552         else if (error != 0)
  553                 goto done_free;
  554         else
  555                 gotslevel = 1;
  556 
  557         error =
  558             vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
  559         if (error == ENOENT)
  560                 gotchildmax = 0;
  561         else if (error != 0)
  562                 goto done_free;
  563         else
  564                 gotchildmax = 1;
  565 
  566         error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
  567         if (error == ENOENT)
  568                 gotenforce = 0;
  569         else if (error != 0)
  570                 goto done_free;
  571         else if (enforce < 0 || enforce > 2) {
  572                 error = EINVAL;
  573                 goto done_free;
  574         } else
  575                 gotenforce = 1;
  576 
  577         error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
  578         if (error == ENOENT)
  579                 gotrsnum = 0;
  580         else if (error != 0)
  581                 goto done_free;
  582         else
  583                 gotrsnum = 1;
  584 
  585         pr_flags = ch_flags = 0;
  586         for (bf = pr_flag_bool;
  587              bf < pr_flag_bool + nitems(pr_flag_bool);
  588              bf++) {
  589                 vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
  590                 vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
  591         }
  592         ch_flags |= pr_flags;
  593         for (jsf = pr_flag_jailsys;
  594              jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
  595              jsf++) {
  596                 error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
  597                 if (error == ENOENT)
  598                         continue;
  599                 if (error != 0)
  600                         goto done_free;
  601                 switch (jsys) {
  602                 case JAIL_SYS_DISABLE:
  603                         if (!jsf->disable) {
  604                                 error = EINVAL;
  605                                 goto done_free;
  606                         }
  607                         pr_flags |= jsf->disable;
  608                         break;
  609                 case JAIL_SYS_NEW:
  610                         pr_flags |= jsf->new;
  611                         break;
  612                 case JAIL_SYS_INHERIT:
  613                         break;
  614                 default:
  615                         error = EINVAL;
  616                         goto done_free;
  617                 }
  618                 ch_flags |= jsf->new | jsf->disable;
  619         }
  620         if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
  621             && !(pr_flags & PR_PERSIST)) {
  622                 error = EINVAL;
  623                 vfs_opterror(opts, "new jail must persist or attach");
  624                 goto done_errmsg;
  625         }
  626 #ifdef VIMAGE
  627         if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
  628                 error = EINVAL;
  629                 vfs_opterror(opts, "vnet cannot be changed after creation");
  630                 goto done_errmsg;
  631         }
  632 #endif
  633 #ifdef INET
  634         if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
  635                 error = EINVAL;
  636                 vfs_opterror(opts, "ip4 cannot be changed after creation");
  637                 goto done_errmsg;
  638         }
  639 #endif
  640 #ifdef INET6
  641         if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
  642                 error = EINVAL;
  643                 vfs_opterror(opts, "ip6 cannot be changed after creation");
  644                 goto done_errmsg;
  645         }
  646 #endif
  647 
  648         pr_allow = ch_allow = 0;
  649         for (bf = pr_flag_allow;
  650              bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
  651              bf++) {
  652                 vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
  653                 vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
  654         }
  655         ch_allow |= pr_allow;
  656 
  657         error = vfs_getopt(opts, "name", (void **)&name, &len);
  658         if (error == ENOENT)
  659                 name = NULL;
  660         else if (error != 0)
  661                 goto done_free;
  662         else {
  663                 if (len == 0 || name[len - 1] != '\0') {
  664                         error = EINVAL;
  665                         goto done_free;
  666                 }
  667                 if (len > MAXHOSTNAMELEN) {
  668                         error = ENAMETOOLONG;
  669                         goto done_free;
  670                 }
  671         }
  672 
  673         error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
  674         if (error == ENOENT)
  675                 host = NULL;
  676         else if (error != 0)
  677                 goto done_free;
  678         else {
  679                 ch_flags |= PR_HOST;
  680                 pr_flags |= PR_HOST;
  681                 if (len == 0 || host[len - 1] != '\0') {
  682                         error = EINVAL;
  683                         goto done_free;
  684                 }
  685                 if (len > MAXHOSTNAMELEN) {
  686                         error = ENAMETOOLONG;
  687                         goto done_free;
  688                 }
  689         }
  690 
  691         error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
  692         if (error == ENOENT)
  693                 domain = NULL;
  694         else if (error != 0)
  695                 goto done_free;
  696         else {
  697                 ch_flags |= PR_HOST;
  698                 pr_flags |= PR_HOST;
  699                 if (len == 0 || domain[len - 1] != '\0') {
  700                         error = EINVAL;
  701                         goto done_free;
  702                 }
  703                 if (len > MAXHOSTNAMELEN) {
  704                         error = ENAMETOOLONG;
  705                         goto done_free;
  706                 }
  707         }
  708 
  709         error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
  710         if (error == ENOENT)
  711                 uuid = NULL;
  712         else if (error != 0)
  713                 goto done_free;
  714         else {
  715                 ch_flags |= PR_HOST;
  716                 pr_flags |= PR_HOST;
  717                 if (len == 0 || uuid[len - 1] != '\0') {
  718                         error = EINVAL;
  719                         goto done_free;
  720                 }
  721                 if (len > HOSTUUIDLEN) {
  722                         error = ENAMETOOLONG;
  723                         goto done_free;
  724                 }
  725         }
  726 
  727 #ifdef COMPAT_FREEBSD32
  728         if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
  729                 uint32_t hid32;
  730 
  731                 error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
  732                 hid = hid32;
  733         } else
  734 #endif
  735                 error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
  736         if (error == ENOENT)
  737                 gothid = 0;
  738         else if (error != 0)
  739                 goto done_free;
  740         else {
  741                 gothid = 1;
  742                 ch_flags |= PR_HOST;
  743                 pr_flags |= PR_HOST;
  744         }
  745 
  746 #ifdef INET
  747         error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
  748         if (error == ENOENT)
  749                 ip4s = 0;
  750         else if (error != 0)
  751                 goto done_free;
  752         else if (ip4s & (sizeof(*ip4) - 1)) {
  753                 error = EINVAL;
  754                 goto done_free;
  755         } else {
  756                 ch_flags |= PR_IP4_USER;
  757                 pr_flags |= PR_IP4_USER;
  758                 if (ip4s > 0) {
  759                         ip4s /= sizeof(*ip4);
  760                         if (ip4s > jail_max_af_ips) {
  761                                 error = EINVAL;
  762                                 vfs_opterror(opts, "too many IPv4 addresses");
  763                                 goto done_errmsg;
  764                         }
  765                         ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
  766                         bcopy(op, ip4, ip4s * sizeof(*ip4));
  767                         /*
  768                          * IP addresses are all sorted but ip[0] to preserve
  769                          * the primary IP address as given from userland.
  770                          * This special IP is used for unbound outgoing
  771                          * connections as well for "loopback" traffic in case
  772                          * source address selection cannot find any more fitting
  773                          * address to connect from.
  774                          */
  775                         if (ip4s > 1)
  776                                 qsort(ip4 + 1, ip4s - 1, sizeof(*ip4),
  777                                     prison_qcmp_v4);
  778                         /*
  779                          * Check for duplicate addresses and do some simple
  780                          * zero and broadcast checks. If users give other bogus
  781                          * addresses it is their problem.
  782                          *
  783                          * We do not have to care about byte order for these
  784                          * checks so we will do them in NBO.
  785                          */
  786                         for (ii = 0; ii < ip4s; ii++) {
  787                                 if (ip4[ii].s_addr == INADDR_ANY ||
  788                                     ip4[ii].s_addr == INADDR_BROADCAST) {
  789                                         error = EINVAL;
  790                                         goto done_free;
  791                                 }
  792                                 if ((ii+1) < ip4s &&
  793                                     (ip4[0].s_addr == ip4[ii+1].s_addr ||
  794                                      ip4[ii].s_addr == ip4[ii+1].s_addr)) {
  795                                         error = EINVAL;
  796                                         goto done_free;
  797                                 }
  798                         }
  799                 }
  800         }
  801 #endif
  802 
  803 #ifdef INET6
  804         error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
  805         if (error == ENOENT)
  806                 ip6s = 0;
  807         else if (error != 0)
  808                 goto done_free;
  809         else if (ip6s & (sizeof(*ip6) - 1)) {
  810                 error = EINVAL;
  811                 goto done_free;
  812         } else {
  813                 ch_flags |= PR_IP6_USER;
  814                 pr_flags |= PR_IP6_USER;
  815                 if (ip6s > 0) {
  816                         ip6s /= sizeof(*ip6);
  817                         if (ip6s > jail_max_af_ips) {
  818                                 error = EINVAL;
  819                                 vfs_opterror(opts, "too many IPv6 addresses");
  820                                 goto done_errmsg;
  821                         }
  822                         ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
  823                         bcopy(op, ip6, ip6s * sizeof(*ip6));
  824                         if (ip6s > 1)
  825                                 qsort(ip6 + 1, ip6s - 1, sizeof(*ip6),
  826                                     prison_qcmp_v6);
  827                         for (ii = 0; ii < ip6s; ii++) {
  828                                 if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
  829                                         error = EINVAL;
  830                                         goto done_free;
  831                                 }
  832                                 if ((ii+1) < ip6s &&
  833                                     (IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
  834                                      IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
  835                                 {
  836                                         error = EINVAL;
  837                                         goto done_free;
  838                                 }
  839                         }
  840                 }
  841         }
  842 #endif
  843 
  844 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
  845         if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
  846                 error = EINVAL;
  847                 vfs_opterror(opts,
  848                     "vnet jails cannot have IP address restrictions");
  849                 goto done_errmsg;
  850         }
  851 #endif
  852 
  853         error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
  854         if (error == ENOENT)
  855                 osrelstr = NULL;
  856         else if (error != 0)
  857                 goto done_free;
  858         else {
  859                 if (flags & JAIL_UPDATE) {
  860                         error = EINVAL;
  861                         vfs_opterror(opts,
  862                             "osrelease cannot be changed after creation");
  863                         goto done_errmsg;
  864                 }
  865                 if (len == 0 || len >= OSRELEASELEN) {
  866                         error = EINVAL;
  867                         vfs_opterror(opts,
  868                             "osrelease string must be 1-%d bytes long",
  869                             OSRELEASELEN - 1);
  870                         goto done_errmsg;
  871                 }
  872         }
  873 
  874         error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
  875         if (error == ENOENT)
  876                 osreldt = 0;
  877         else if (error != 0)
  878                 goto done_free;
  879         else {
  880                 if (flags & JAIL_UPDATE) {
  881                         error = EINVAL;
  882                         vfs_opterror(opts,
  883                             "osreldate cannot be changed after creation");
  884                         goto done_errmsg;
  885                 }
  886                 if (osreldt == 0) {
  887                         error = EINVAL;
  888                         vfs_opterror(opts, "osreldate cannot be 0");
  889                         goto done_errmsg;
  890                 }
  891         }
  892 
  893         fullpath_disabled = 0;
  894         root = NULL;
  895         error = vfs_getopt(opts, "path", (void **)&path, &len);
  896         if (error == ENOENT)
  897                 path = NULL;
  898         else if (error != 0)
  899                 goto done_free;
  900         else {
  901                 if (flags & JAIL_UPDATE) {
  902                         error = EINVAL;
  903                         vfs_opterror(opts,
  904                             "path cannot be changed after creation");
  905                         goto done_errmsg;
  906                 }
  907                 if (len == 0 || path[len - 1] != '\0') {
  908                         error = EINVAL;
  909                         goto done_free;
  910                 }
  911                 NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
  912                     path, td);
  913                 error = namei(&nd);
  914                 if (error)
  915                         goto done_free;
  916                 root = nd.ni_vp;
  917                 NDFREE(&nd, NDF_ONLY_PNBUF);
  918                 g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
  919                 strlcpy(g_path, path, MAXPATHLEN);
  920                 error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
  921                 if (error == 0)
  922                         path = g_path;
  923                 else if (error == ENODEV) {
  924                         /* proceed if sysctl debug.disablefullpath == 1 */
  925                         fullpath_disabled = 1;
  926                         if (len < 2 || (len == 2 && path[0] == '/'))
  927                                 path = NULL;
  928                 } else {
  929                         /* exit on other errors */
  930                         goto done_free;
  931                 }
  932                 if (root->v_type != VDIR) {
  933                         error = ENOTDIR;
  934                         vput(root);
  935                         goto done_free;
  936                 }
  937                 VOP_UNLOCK(root, 0);
  938                 if (fullpath_disabled) {
  939                         /* Leave room for a real-root full pathname. */
  940                         if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
  941                             ? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
  942                                 error = ENAMETOOLONG;
  943                                 vrele(root);
  944                                 goto done_free;
  945                         }
  946                 }
  947         }
  948 
  949         /*
  950          * Find the specified jail, or at least its parent.
  951          * This abuses the file error codes ENOENT and EEXIST.
  952          */
  953         pr = NULL;
  954         ppr = mypr;
  955         if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
  956                 namelc = strrchr(name, '.');
  957                 jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
  958                 if (*p != '\0')
  959                         jid = 0;
  960         }
  961         sx_xlock(&allprison_lock);
  962         if (jid != 0) {
  963                 /*
  964                  * See if a requested jid already exists.  There is an
  965                  * information leak here if the jid exists but is not within
  966                  * the caller's jail hierarchy.  Jail creators will get EEXIST
  967                  * even though they cannot see the jail, and CREATE | UPDATE
  968                  * will return ENOENT which is not normally a valid error.
  969                  */
  970                 if (jid < 0) {
  971                         error = EINVAL;
  972                         vfs_opterror(opts, "negative jid");
  973                         goto done_unlock_list;
  974                 }
  975                 pr = prison_find(jid);
  976                 if (pr != NULL) {
  977                         ppr = pr->pr_parent;
  978                         /* Create: jid must not exist. */
  979                         if (cuflags == JAIL_CREATE) {
  980                                 mtx_unlock(&pr->pr_mtx);
  981                                 error = EEXIST;
  982                                 vfs_opterror(opts, "jail %d already exists",
  983                                     jid);
  984                                 goto done_unlock_list;
  985                         }
  986                         if (!prison_ischild(mypr, pr)) {
  987                                 mtx_unlock(&pr->pr_mtx);
  988                                 pr = NULL;
  989                         } else if (pr->pr_uref == 0) {
  990                                 if (!(flags & JAIL_DYING)) {
  991                                         mtx_unlock(&pr->pr_mtx);
  992                                         error = ENOENT;
  993                                         vfs_opterror(opts, "jail %d is dying",
  994                                             jid);
  995                                         goto done_unlock_list;
  996                                 } else if ((flags & JAIL_ATTACH) ||
  997                                     (pr_flags & PR_PERSIST)) {
  998                                         /*
  999                                          * A dying jail might be resurrected
 1000                                          * (via attach or persist), but first
 1001                                          * it must determine if another jail
 1002                                          * has claimed its name.  Accomplish
 1003                                          * this by implicitly re-setting the
 1004                                          * name.
 1005                                          */
 1006                                         if (name == NULL)
 1007                                                 name = prison_name(mypr, pr);
 1008                                 }
 1009                         }
 1010                 }
 1011                 if (pr == NULL) {
 1012                         /* Update: jid must exist. */
 1013                         if (cuflags == JAIL_UPDATE) {
 1014                                 error = ENOENT;
 1015                                 vfs_opterror(opts, "jail %d not found", jid);
 1016                                 goto done_unlock_list;
 1017                         }
 1018                 }
 1019         }
 1020         /*
 1021          * If the caller provided a name, look for a jail by that name.
 1022          * This has different semantics for creates and updates keyed by jid
 1023          * (where the name must not already exist in a different jail),
 1024          * and updates keyed by the name itself (where the name must exist
 1025          * because that is the jail being updated).
 1026          */
 1027         namelc = NULL;
 1028         if (name != NULL) {
 1029                 namelc = strrchr(name, '.');
 1030                 if (namelc == NULL)
 1031                         namelc = name;
 1032                 else {
 1033                         /*
 1034                          * This is a hierarchical name.  Split it into the
 1035                          * parent and child names, and make sure the parent
 1036                          * exists or matches an already found jail.
 1037                          */
 1038                         if (pr != NULL) {
 1039                                 if (strncmp(name, ppr->pr_name, namelc - name)
 1040                                     || ppr->pr_name[namelc - name] != '\0') {
 1041                                         mtx_unlock(&pr->pr_mtx);
 1042                                         error = EINVAL;
 1043                                         vfs_opterror(opts,
 1044                                             "cannot change jail's parent");
 1045                                         goto done_unlock_list;
 1046                                 }
 1047                         } else {
 1048                                 *namelc = '\0';
 1049                                 ppr = prison_find_name(mypr, name);
 1050                                 if (ppr == NULL) {
 1051                                         error = ENOENT;
 1052                                         vfs_opterror(opts,
 1053                                             "jail \"%s\" not found", name);
 1054                                         goto done_unlock_list;
 1055                                 }
 1056                                 mtx_unlock(&ppr->pr_mtx);
 1057                                 *namelc = '.';
 1058                         }
 1059                         namelc++;
 1060                 }
 1061                 if (namelc[0] != '\0') {
 1062                         pnamelen =
 1063                             (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
 1064  name_again:
 1065                         deadpr = NULL;
 1066                         FOREACH_PRISON_CHILD(ppr, tpr) {
 1067                                 if (tpr != pr && tpr->pr_ref > 0 &&
 1068                                     !strcmp(tpr->pr_name + pnamelen, namelc)) {
 1069                                         if (pr == NULL &&
 1070                                             cuflags != JAIL_CREATE) {
 1071                                                 mtx_lock(&tpr->pr_mtx);
 1072                                                 if (tpr->pr_ref > 0) {
 1073                                                         /*
 1074                                                          * Use this jail
 1075                                                          * for updates.
 1076                                                          */
 1077                                                         if (tpr->pr_uref > 0) {
 1078                                                                 pr = tpr;
 1079                                                                 break;
 1080                                                         }
 1081                                                         deadpr = tpr;
 1082                                                 }
 1083                                                 mtx_unlock(&tpr->pr_mtx);
 1084                                         } else if (tpr->pr_uref > 0) {
 1085                                                 /*
 1086                                                  * Create, or update(jid):
 1087                                                  * name must not exist in an
 1088                                                  * active sibling jail.
 1089                                                  */
 1090                                                 error = EEXIST;
 1091                                                 if (pr != NULL)
 1092                                                         mtx_unlock(&pr->pr_mtx);
 1093                                                 vfs_opterror(opts,
 1094                                                    "jail \"%s\" already exists",
 1095                                                    name);
 1096                                                 goto done_unlock_list;
 1097                                         }
 1098                                 }
 1099                         }
 1100                         /* If no active jail is found, use a dying one. */
 1101                         if (deadpr != NULL && pr == NULL) {
 1102                                 if (flags & JAIL_DYING) {
 1103                                         mtx_lock(&deadpr->pr_mtx);
 1104                                         if (deadpr->pr_ref == 0) {
 1105                                                 mtx_unlock(&deadpr->pr_mtx);
 1106                                                 goto name_again;
 1107                                         }
 1108                                         pr = deadpr;
 1109                                 } else if (cuflags == JAIL_UPDATE) {
 1110                                         error = ENOENT;
 1111                                         vfs_opterror(opts,
 1112                                             "jail \"%s\" is dying", name);
 1113                                         goto done_unlock_list;
 1114                                 }
 1115                         }
 1116                         /* Update: name must exist if no jid. */
 1117                         else if (cuflags == JAIL_UPDATE && pr == NULL) {
 1118                                 error = ENOENT;
 1119                                 vfs_opterror(opts, "jail \"%s\" not found",
 1120                                     name);
 1121                                 goto done_unlock_list;
 1122                         }
 1123                 }
 1124         }
 1125         /* Update: must provide a jid or name. */
 1126         else if (cuflags == JAIL_UPDATE && pr == NULL) {
 1127                 error = ENOENT;
 1128                 vfs_opterror(opts, "update specified no jail");
 1129                 goto done_unlock_list;
 1130         }
 1131 
 1132         /* If there's no prison to update, create a new one and link it in. */
 1133         if (pr == NULL) {
 1134                 for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
 1135                         if (tpr->pr_childcount >= tpr->pr_childmax) {
 1136                                 error = EPERM;
 1137                                 vfs_opterror(opts, "prison limit exceeded");
 1138                                 goto done_unlock_list;
 1139                         }
 1140                 created = 1;
 1141                 mtx_lock(&ppr->pr_mtx);
 1142                 if (ppr->pr_ref == 0) {
 1143                         mtx_unlock(&ppr->pr_mtx);
 1144                         error = ENOENT;
 1145                         vfs_opterror(opts, "jail \"%s\" not found",
 1146                             prison_name(mypr, ppr));
 1147                         goto done_unlock_list;
 1148                 }
 1149                 ppr->pr_ref++;
 1150                 ppr->pr_uref++;
 1151                 mtx_unlock(&ppr->pr_mtx);
 1152                 pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
 1153                 if (jid == 0) {
 1154                         /* Find the next free jid. */
 1155                         jid = lastprid + 1;
 1156  findnext:
 1157                         if (jid == JAIL_MAX)
 1158                                 jid = 1;
 1159                         TAILQ_FOREACH(tpr, &allprison, pr_list) {
 1160                                 if (tpr->pr_id < jid)
 1161                                         continue;
 1162                                 if (tpr->pr_id > jid || tpr->pr_ref == 0) {
 1163                                         TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
 1164                                         break;
 1165                                 }
 1166                                 if (jid == lastprid) {
 1167                                         error = EAGAIN;
 1168                                         vfs_opterror(opts,
 1169                                             "no available jail IDs");
 1170                                         free(pr, M_PRISON);
 1171                                         prison_deref(ppr, PD_DEREF |
 1172                                             PD_DEUREF | PD_LIST_XLOCKED);
 1173                                         goto done_releroot;
 1174                                 }
 1175                                 jid++;
 1176                                 goto findnext;
 1177                         }
 1178                         lastprid = jid;
 1179                 } else {
 1180                         /*
 1181                          * The jail already has a jid (that did not yet exist),
 1182                          * so just find where to insert it.
 1183                          */
 1184                         TAILQ_FOREACH(tpr, &allprison, pr_list)
 1185                                 if (tpr->pr_id >= jid) {
 1186                                         TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
 1187                                         break;
 1188                                 }
 1189                 }
 1190                 if (tpr == NULL)
 1191                         TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
 1192                 LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
 1193                 for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
 1194                         tpr->pr_childcount++;
 1195 
 1196                 pr->pr_parent = ppr;
 1197                 pr->pr_id = jid;
 1198 
 1199                 /* Set some default values, and inherit some from the parent. */
 1200                 if (namelc == NULL)
 1201                         namelc = "";
 1202                 if (path == NULL) {
 1203                         path = "/";
 1204                         root = mypr->pr_root;
 1205                         vref(root);
 1206                 }
 1207                 strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
 1208                 pr->pr_flags |= PR_HOST;
 1209 #if defined(INET) || defined(INET6)
 1210 #ifdef VIMAGE
 1211                 if (!(pr_flags & PR_VNET))
 1212 #endif
 1213                 {
 1214 #ifdef INET
 1215                         if (!(ch_flags & PR_IP4_USER))
 1216                                 pr->pr_flags |= PR_IP4 | PR_IP4_USER;
 1217                         else if (!(pr_flags & PR_IP4_USER)) {
 1218                                 pr->pr_flags |= ppr->pr_flags & PR_IP4;
 1219                                 if (ppr->pr_ip4 != NULL) {
 1220                                         pr->pr_ip4s = ppr->pr_ip4s;
 1221                                         pr->pr_ip4 = malloc(pr->pr_ip4s *
 1222                                             sizeof(struct in_addr), M_PRISON,
 1223                                             M_WAITOK);
 1224                                         bcopy(ppr->pr_ip4, pr->pr_ip4,
 1225                                             pr->pr_ip4s * sizeof(*pr->pr_ip4));
 1226                                 }
 1227                         }
 1228 #endif
 1229 #ifdef INET6
 1230                         if (!(ch_flags & PR_IP6_USER))
 1231                                 pr->pr_flags |= PR_IP6 | PR_IP6_USER;
 1232                         else if (!(pr_flags & PR_IP6_USER)) {
 1233                                 pr->pr_flags |= ppr->pr_flags & PR_IP6;
 1234                                 if (ppr->pr_ip6 != NULL) {
 1235                                         pr->pr_ip6s = ppr->pr_ip6s;
 1236                                         pr->pr_ip6 = malloc(pr->pr_ip6s *
 1237                                             sizeof(struct in6_addr), M_PRISON,
 1238                                             M_WAITOK);
 1239                                         bcopy(ppr->pr_ip6, pr->pr_ip6,
 1240                                             pr->pr_ip6s * sizeof(*pr->pr_ip6));
 1241                                 }
 1242                         }
 1243 #endif
 1244                 }
 1245 #endif
 1246                 /* Source address selection is always on by default. */
 1247                 pr->pr_flags |= _PR_IP_SADDRSEL;
 1248 
 1249                 pr->pr_securelevel = ppr->pr_securelevel;
 1250                 pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
 1251                 pr->pr_enforce_statfs = jail_default_enforce_statfs;
 1252                 pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
 1253 
 1254                 pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
 1255                 if (osrelstr == NULL)
 1256                     strcpy(pr->pr_osrelease, ppr->pr_osrelease);
 1257                 else
 1258                     strcpy(pr->pr_osrelease, osrelstr);
 1259 
 1260                 LIST_INIT(&pr->pr_children);
 1261                 mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
 1262                 TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
 1263 
 1264 #ifdef VIMAGE
 1265                 /* Allocate a new vnet if specified. */
 1266                 pr->pr_vnet = (pr_flags & PR_VNET)
 1267                     ? vnet_alloc() : ppr->pr_vnet;
 1268 #endif
 1269                 /*
 1270                  * Allocate a dedicated cpuset for each jail.
 1271                  * Unlike other initial settings, this may return an error.
 1272                  */
 1273                 error = cpuset_create_root(ppr, &pr->pr_cpuset);
 1274                 if (error) {
 1275                         prison_deref(pr, PD_LIST_XLOCKED);
 1276                         goto done_releroot;
 1277                 }
 1278 
 1279                 mtx_lock(&pr->pr_mtx);
 1280                 /*
 1281                  * New prisons do not yet have a reference, because we do not
 1282                  * want others to see the incomplete prison once the
 1283                  * allprison_lock is downgraded.
 1284                  */
 1285         } else {
 1286                 created = 0;
 1287                 /*
 1288                  * Grab a reference for existing prisons, to ensure they
 1289                  * continue to exist for the duration of the call.
 1290                  */
 1291                 pr->pr_ref++;
 1292 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
 1293                 if ((pr->pr_flags & PR_VNET) &&
 1294                     (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
 1295                         error = EINVAL;
 1296                         vfs_opterror(opts,
 1297                             "vnet jails cannot have IP address restrictions");
 1298                         goto done_deref_locked;
 1299                 }
 1300 #endif
 1301 #ifdef INET
 1302                 if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
 1303                         error = EINVAL;
 1304                         vfs_opterror(opts,
 1305                             "ip4 cannot be changed after creation");
 1306                         goto done_deref_locked;
 1307                 }
 1308 #endif
 1309 #ifdef INET6
 1310                 if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
 1311                         error = EINVAL;
 1312                         vfs_opterror(opts,
 1313                             "ip6 cannot be changed after creation");
 1314                         goto done_deref_locked;
 1315                 }
 1316 #endif
 1317         }
 1318 
 1319         /* Do final error checking before setting anything. */
 1320         if (gotslevel) {
 1321                 if (slevel < ppr->pr_securelevel) {
 1322                         error = EPERM;
 1323                         goto done_deref_locked;
 1324                 }
 1325         }
 1326         if (gotchildmax) {
 1327                 if (childmax >= ppr->pr_childmax) {
 1328                         error = EPERM;
 1329                         goto done_deref_locked;
 1330                 }
 1331         }
 1332         if (gotenforce) {
 1333                 if (enforce < ppr->pr_enforce_statfs) {
 1334                         error = EPERM;
 1335                         goto done_deref_locked;
 1336                 }
 1337         }
 1338         if (gotrsnum) {
 1339                 /*
 1340                  * devfs_rsnum is a uint16_t
 1341                  */
 1342                 if (rsnum < 0 || rsnum > 65535) {
 1343                         error = EINVAL;
 1344                         goto done_deref_locked;
 1345                 }
 1346                 /*
 1347                  * Nested jails always inherit parent's devfs ruleset
 1348                  */
 1349                 if (jailed(td->td_ucred)) {
 1350                         if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
 1351                                 error = EPERM;
 1352                                 goto done_deref_locked;
 1353                         } else
 1354                                 rsnum = ppr->pr_devfs_rsnum;
 1355                 }
 1356         }
 1357 #ifdef INET
 1358         if (ip4s > 0) {
 1359                 if (ppr->pr_flags & PR_IP4) {
 1360                         /*
 1361                          * Make sure the new set of IP addresses is a
 1362                          * subset of the parent's list.  Don't worry
 1363                          * about the parent being unlocked, as any
 1364                          * setting is done with allprison_lock held.
 1365                          */
 1366                         for (ij = 0; ij < ppr->pr_ip4s; ij++)
 1367                                 if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
 1368                                         break;
 1369                         if (ij == ppr->pr_ip4s) {
 1370                                 error = EPERM;
 1371                                 goto done_deref_locked;
 1372                         }
 1373                         if (ip4s > 1) {
 1374                                 for (ii = ij = 1; ii < ip4s; ii++) {
 1375                                         if (ip4[ii].s_addr ==
 1376                                             ppr->pr_ip4[0].s_addr)
 1377                                                 continue;
 1378                                         for (; ij < ppr->pr_ip4s; ij++)
 1379                                                 if (ip4[ii].s_addr ==
 1380                                                     ppr->pr_ip4[ij].s_addr)
 1381                                                         break;
 1382                                         if (ij == ppr->pr_ip4s)
 1383                                                 break;
 1384                                 }
 1385                                 if (ij == ppr->pr_ip4s) {
 1386                                         error = EPERM;
 1387                                         goto done_deref_locked;
 1388                                 }
 1389                         }
 1390                 }
 1391                 /*
 1392                  * Check for conflicting IP addresses.  We permit them
 1393                  * if there is no more than one IP on each jail.  If
 1394                  * there is a duplicate on a jail with more than one
 1395                  * IP stop checking and return error.
 1396                  */
 1397 #ifdef VIMAGE
 1398                 for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
 1399                         if (tppr->pr_flags & PR_VNET)
 1400                                 break;
 1401 #else
 1402                 tppr = &prison0;
 1403 #endif
 1404                 FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
 1405                         if (tpr == pr ||
 1406 #ifdef VIMAGE
 1407                             (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
 1408 #endif
 1409                             tpr->pr_uref == 0) {
 1410                                 descend = 0;
 1411                                 continue;
 1412                         }
 1413                         if (!(tpr->pr_flags & PR_IP4_USER))
 1414                                 continue;
 1415                         descend = 0;
 1416                         if (tpr->pr_ip4 == NULL ||
 1417                             (ip4s == 1 && tpr->pr_ip4s == 1))
 1418                                 continue;
 1419                         for (ii = 0; ii < ip4s; ii++) {
 1420                                 if (prison_check_ip4_locked(tpr, &ip4[ii]) ==
 1421                                     0) {
 1422                                         error = EADDRINUSE;
 1423                                         vfs_opterror(opts,
 1424                                             "IPv4 addresses clash");
 1425                                         goto done_deref_locked;
 1426                                 }
 1427                         }
 1428                 }
 1429         }
 1430 #endif
 1431 #ifdef INET6
 1432         if (ip6s > 0) {
 1433                 if (ppr->pr_flags & PR_IP6) {
 1434                         /*
 1435                          * Make sure the new set of IP addresses is a
 1436                          * subset of the parent's list.
 1437                          */
 1438                         for (ij = 0; ij < ppr->pr_ip6s; ij++)
 1439                                 if (IN6_ARE_ADDR_EQUAL(&ip6[0],
 1440                                     &ppr->pr_ip6[ij]))
 1441                                         break;
 1442                         if (ij == ppr->pr_ip6s) {
 1443                                 error = EPERM;
 1444                                 goto done_deref_locked;
 1445                         }
 1446                         if (ip6s > 1) {
 1447                                 for (ii = ij = 1; ii < ip6s; ii++) {
 1448                                         if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
 1449                                              &ppr->pr_ip6[0]))
 1450                                                 continue;
 1451                                         for (; ij < ppr->pr_ip6s; ij++)
 1452                                                 if (IN6_ARE_ADDR_EQUAL(
 1453                                                     &ip6[ii], &ppr->pr_ip6[ij]))
 1454                                                         break;
 1455                                         if (ij == ppr->pr_ip6s)
 1456                                                 break;
 1457                                 }
 1458                                 if (ij == ppr->pr_ip6s) {
 1459                                         error = EPERM;
 1460                                         goto done_deref_locked;
 1461                                 }
 1462                         }
 1463                 }
 1464                 /* Check for conflicting IP addresses. */
 1465 #ifdef VIMAGE
 1466                 for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
 1467                         if (tppr->pr_flags & PR_VNET)
 1468                                 break;
 1469 #else
 1470                 tppr = &prison0;
 1471 #endif
 1472                 FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
 1473                         if (tpr == pr ||
 1474 #ifdef VIMAGE
 1475                             (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
 1476 #endif
 1477                             tpr->pr_uref == 0) {
 1478                                 descend = 0;
 1479                                 continue;
 1480                         }
 1481                         if (!(tpr->pr_flags & PR_IP6_USER))
 1482                                 continue;
 1483                         descend = 0;
 1484                         if (tpr->pr_ip6 == NULL ||
 1485                             (ip6s == 1 && tpr->pr_ip6s == 1))
 1486                                 continue;
 1487                         for (ii = 0; ii < ip6s; ii++) {
 1488                                 if (prison_check_ip6_locked(tpr, &ip6[ii]) ==
 1489                                     0) {
 1490                                         error = EADDRINUSE;
 1491                                         vfs_opterror(opts,
 1492                                             "IPv6 addresses clash");
 1493                                         goto done_deref_locked;
 1494                                 }
 1495                         }
 1496                 }
 1497         }
 1498 #endif
 1499         onamelen = namelen = 0;
 1500         if (namelc != NULL) {
 1501                 /* Give a default name of the jid.  Also allow the name to be
 1502                  * explicitly the jid - but not any other number, and only in
 1503                  * normal form (no leading zero/etc).
 1504                  */
 1505                 if (namelc[0] == '\0')
 1506                         snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
 1507                 else if ((strtoul(namelc, &p, 10) != jid ||
 1508                           namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
 1509                         error = EINVAL;
 1510                         vfs_opterror(opts,
 1511                             "name cannot be numeric (unless it is the jid)");
 1512                         goto done_deref_locked;
 1513                 }
 1514                 /*
 1515                  * Make sure the name isn't too long for the prison or its
 1516                  * children.
 1517                  */
 1518                 pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
 1519                 onamelen = strlen(pr->pr_name + pnamelen);
 1520                 namelen = strlen(namelc);
 1521                 if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
 1522                         error = ENAMETOOLONG;
 1523                         goto done_deref_locked;
 1524                 }
 1525                 FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
 1526                         if (strlen(tpr->pr_name) + (namelen - onamelen) >=
 1527                             sizeof(pr->pr_name)) {
 1528                                 error = ENAMETOOLONG;
 1529                                 goto done_deref_locked;
 1530                         }
 1531                 }
 1532         }
 1533         if (pr_allow & ~ppr->pr_allow) {
 1534                 error = EPERM;
 1535                 goto done_deref_locked;
 1536         }
 1537 
 1538         /*
 1539          * Let modules check their parameters.  This requires unlocking and
 1540          * then re-locking the prison, but this is still a valid state as long
 1541          * as allprison_lock remains xlocked.
 1542          */
 1543         mtx_unlock(&pr->pr_mtx);
 1544         error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
 1545         if (error != 0) {
 1546                 prison_deref(pr, created
 1547                     ? PD_LIST_XLOCKED
 1548                     : PD_DEREF | PD_LIST_XLOCKED);
 1549                 goto done_releroot;
 1550         }
 1551         mtx_lock(&pr->pr_mtx);
 1552 
 1553         /* At this point, all valid parameters should have been noted. */
 1554         TAILQ_FOREACH(opt, opts, link) {
 1555                 if (!opt->seen && strcmp(opt->name, "errmsg")) {
 1556                         error = EINVAL;
 1557                         vfs_opterror(opts, "unknown parameter: %s", opt->name);
 1558                         goto done_deref_locked;
 1559                 }
 1560         }
 1561 
 1562         /* Set the parameters of the prison. */
 1563 #ifdef INET
 1564         redo_ip4 = 0;
 1565         if (pr_flags & PR_IP4_USER) {
 1566                 pr->pr_flags |= PR_IP4;
 1567                 free(pr->pr_ip4, M_PRISON);
 1568                 pr->pr_ip4s = ip4s;
 1569                 pr->pr_ip4 = ip4;
 1570                 ip4 = NULL;
 1571                 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 1572 #ifdef VIMAGE
 1573                         if (tpr->pr_flags & PR_VNET) {
 1574                                 descend = 0;
 1575                                 continue;
 1576                         }
 1577 #endif
 1578                         if (prison_restrict_ip4(tpr, NULL)) {
 1579                                 redo_ip4 = 1;
 1580                                 descend = 0;
 1581                         }
 1582                 }
 1583         }
 1584 #endif
 1585 #ifdef INET6
 1586         redo_ip6 = 0;
 1587         if (pr_flags & PR_IP6_USER) {
 1588                 pr->pr_flags |= PR_IP6;
 1589                 free(pr->pr_ip6, M_PRISON);
 1590                 pr->pr_ip6s = ip6s;
 1591                 pr->pr_ip6 = ip6;
 1592                 ip6 = NULL;
 1593                 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 1594 #ifdef VIMAGE
 1595                         if (tpr->pr_flags & PR_VNET) {
 1596                                 descend = 0;
 1597                                 continue;
 1598                         }
 1599 #endif
 1600                         if (prison_restrict_ip6(tpr, NULL)) {
 1601                                 redo_ip6 = 1;
 1602                                 descend = 0;
 1603                         }
 1604                 }
 1605         }
 1606 #endif
 1607         if (gotslevel) {
 1608                 pr->pr_securelevel = slevel;
 1609                 /* Set all child jails to be at least this level. */
 1610                 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 1611                         if (tpr->pr_securelevel < slevel)
 1612                                 tpr->pr_securelevel = slevel;
 1613         }
 1614         if (gotchildmax) {
 1615                 pr->pr_childmax = childmax;
 1616                 /* Set all child jails to under this limit. */
 1617                 FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
 1618                         if (tpr->pr_childmax > childmax - level)
 1619                                 tpr->pr_childmax = childmax > level
 1620                                     ? childmax - level : 0;
 1621         }
 1622         if (gotenforce) {
 1623                 pr->pr_enforce_statfs = enforce;
 1624                 /* Pass this restriction on to the children. */
 1625                 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 1626                         if (tpr->pr_enforce_statfs < enforce)
 1627                                 tpr->pr_enforce_statfs = enforce;
 1628         }
 1629         if (gotrsnum) {
 1630                 pr->pr_devfs_rsnum = rsnum;
 1631                 /* Pass this restriction on to the children. */
 1632                 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 1633                         tpr->pr_devfs_rsnum = rsnum;
 1634         }
 1635         if (namelc != NULL) {
 1636                 if (ppr == &prison0)
 1637                         strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
 1638                 else
 1639                         snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
 1640                             ppr->pr_name, namelc);
 1641                 /* Change this component of child names. */
 1642                 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 1643                         bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
 1644                             strlen(tpr->pr_name + onamelen) + 1);
 1645                         bcopy(pr->pr_name, tpr->pr_name, namelen);
 1646                 }
 1647         }
 1648         if (path != NULL) {
 1649                 /* Try to keep a real-rooted full pathname. */
 1650                 if (fullpath_disabled && path[0] == '/' &&
 1651                     strcmp(mypr->pr_path, "/"))
 1652                         snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
 1653                             mypr->pr_path, path);
 1654                 else
 1655                         strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
 1656                 pr->pr_root = root;
 1657         }
 1658         if (PR_HOST & ch_flags & ~pr_flags) {
 1659                 if (pr->pr_flags & PR_HOST) {
 1660                         /*
 1661                          * Copy the parent's host info.  As with pr_ip4 above,
 1662                          * the lack of a lock on the parent is not a problem;
 1663                          * it is always set with allprison_lock at least
 1664                          * shared, and is held exclusively here.
 1665                          */
 1666                         strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
 1667                             sizeof(pr->pr_hostname));
 1668                         strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
 1669                             sizeof(pr->pr_domainname));
 1670                         strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
 1671                             sizeof(pr->pr_hostuuid));
 1672                         pr->pr_hostid = pr->pr_parent->pr_hostid;
 1673                 }
 1674         } else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
 1675                 /* Set this prison, and any descendants without PR_HOST. */
 1676                 if (host != NULL)
 1677                         strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
 1678                 if (domain != NULL)
 1679                         strlcpy(pr->pr_domainname, domain, 
 1680                             sizeof(pr->pr_domainname));
 1681                 if (uuid != NULL)
 1682                         strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
 1683                 if (gothid)
 1684                         pr->pr_hostid = hid;
 1685                 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 1686                         if (tpr->pr_flags & PR_HOST)
 1687                                 descend = 0;
 1688                         else {
 1689                                 if (host != NULL)
 1690                                         strlcpy(tpr->pr_hostname,
 1691                                             pr->pr_hostname,
 1692                                             sizeof(tpr->pr_hostname));
 1693                                 if (domain != NULL)
 1694                                         strlcpy(tpr->pr_domainname, 
 1695                                             pr->pr_domainname,
 1696                                             sizeof(tpr->pr_domainname));
 1697                                 if (uuid != NULL)
 1698                                         strlcpy(tpr->pr_hostuuid,
 1699                                             pr->pr_hostuuid,
 1700                                             sizeof(tpr->pr_hostuuid));
 1701                                 if (gothid)
 1702                                         tpr->pr_hostid = hid;
 1703                         }
 1704                 }
 1705         }
 1706         if ((tallow = ch_allow & ~pr_allow)) {
 1707                 /* Clear allow bits in all children. */
 1708                 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
 1709                         tpr->pr_allow &= ~tallow;
 1710         }
 1711         pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
 1712         /*
 1713          * Persistent prisons get an extra reference, and prisons losing their
 1714          * persist flag lose that reference.  Only do this for existing prisons
 1715          * for now, so new ones will remain unseen until after the module
 1716          * handlers have completed.
 1717          */
 1718         born = pr->pr_uref == 0;
 1719         if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
 1720                 if (pr_flags & PR_PERSIST) {
 1721                         pr->pr_ref++;
 1722                         pr->pr_uref++;
 1723                 } else {
 1724                         pr->pr_ref--;
 1725                         pr->pr_uref--;
 1726                 }
 1727         }
 1728         pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
 1729         mtx_unlock(&pr->pr_mtx);
 1730 
 1731 #ifdef RACCT
 1732         if (racct_enable && created)
 1733                 prison_racct_attach(pr);
 1734 #endif
 1735 
 1736         /* Locks may have prevented a complete restriction of child IP
 1737          * addresses.  If so, allocate some more memory and try again.
 1738          */
 1739 #ifdef INET
 1740         while (redo_ip4) {
 1741                 ip4s = pr->pr_ip4s;
 1742                 ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
 1743                 mtx_lock(&pr->pr_mtx);
 1744                 redo_ip4 = 0;
 1745                 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 1746 #ifdef VIMAGE
 1747                         if (tpr->pr_flags & PR_VNET) {
 1748                                 descend = 0;
 1749                                 continue;
 1750                         }
 1751 #endif
 1752                         if (prison_restrict_ip4(tpr, ip4)) {
 1753                                 if (ip4 != NULL)
 1754                                         ip4 = NULL;
 1755                                 else
 1756                                         redo_ip4 = 1;
 1757                         }
 1758                 }
 1759                 mtx_unlock(&pr->pr_mtx);
 1760         }
 1761 #endif
 1762 #ifdef INET6
 1763         while (redo_ip6) {
 1764                 ip6s = pr->pr_ip6s;
 1765                 ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
 1766                 mtx_lock(&pr->pr_mtx);
 1767                 redo_ip6 = 0;
 1768                 FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
 1769 #ifdef VIMAGE
 1770                         if (tpr->pr_flags & PR_VNET) {
 1771                                 descend = 0;
 1772                                 continue;
 1773                         }
 1774 #endif
 1775                         if (prison_restrict_ip6(tpr, ip6)) {
 1776                                 if (ip6 != NULL)
 1777                                         ip6 = NULL;
 1778                                 else
 1779                                         redo_ip6 = 1;
 1780                         }
 1781                 }
 1782                 mtx_unlock(&pr->pr_mtx);
 1783         }
 1784 #endif
 1785 
 1786         /* Let the modules do their work. */
 1787         sx_downgrade(&allprison_lock);
 1788         if (born) {
 1789                 error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
 1790                 if (error) {
 1791                         (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
 1792                         prison_deref(pr, created
 1793                             ? PD_LIST_SLOCKED
 1794                             : PD_DEREF | PD_LIST_SLOCKED);
 1795                         goto done_errmsg;
 1796                 }
 1797         }
 1798         error = osd_jail_call(pr, PR_METHOD_SET, opts);
 1799         if (error) {
 1800                 if (born)
 1801                         (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
 1802                 prison_deref(pr, created
 1803                     ? PD_LIST_SLOCKED
 1804                     : PD_DEREF | PD_LIST_SLOCKED);
 1805                 goto done_errmsg;
 1806         }
 1807 
 1808         /* Attach this process to the prison if requested. */
 1809         if (flags & JAIL_ATTACH) {
 1810                 mtx_lock(&pr->pr_mtx);
 1811                 error = do_jail_attach(td, pr);
 1812                 if (error) {
 1813                         vfs_opterror(opts, "attach failed");
 1814                         if (!created)
 1815                                 prison_deref(pr, PD_DEREF);
 1816                         goto done_errmsg;
 1817                 }
 1818         }
 1819 
 1820 #ifdef RACCT
 1821         if (racct_enable && !created) {
 1822                 if (!(flags & JAIL_ATTACH))
 1823                         sx_sunlock(&allprison_lock);
 1824                 prison_racct_modify(pr);
 1825                 if (!(flags & JAIL_ATTACH))
 1826                         sx_slock(&allprison_lock);
 1827         }
 1828 #endif
 1829 
 1830         td->td_retval[0] = pr->pr_id;
 1831 
 1832         /*
 1833          * Now that it is all there, drop the temporary reference from existing
 1834          * prisons.  Or add a reference to newly created persistent prisons
 1835          * (which was not done earlier so that the prison would not be publicly
 1836          * visible).
 1837          */
 1838         if (!created) {
 1839                 prison_deref(pr, (flags & JAIL_ATTACH)
 1840                     ? PD_DEREF
 1841                     : PD_DEREF | PD_LIST_SLOCKED);
 1842         } else {
 1843                 if (pr_flags & PR_PERSIST) {
 1844                         mtx_lock(&pr->pr_mtx);
 1845                         pr->pr_ref++;
 1846                         pr->pr_uref++;
 1847                         mtx_unlock(&pr->pr_mtx);
 1848                 }
 1849                 if (!(flags & JAIL_ATTACH))
 1850                         sx_sunlock(&allprison_lock);
 1851         }
 1852 
 1853         goto done_free;
 1854 
 1855  done_deref_locked:
 1856         prison_deref(pr, created
 1857             ? PD_LOCKED | PD_LIST_XLOCKED
 1858             : PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
 1859         goto done_releroot;
 1860  done_unlock_list:
 1861         sx_xunlock(&allprison_lock);
 1862  done_releroot:
 1863         if (root != NULL)
 1864                 vrele(root);
 1865  done_errmsg:
 1866         if (error) {
 1867                 if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
 1868                     &errmsg_len) == 0 && errmsg_len > 0) {
 1869                         errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
 1870                         if (optuio->uio_segflg == UIO_SYSSPACE)
 1871                                 bcopy(errmsg,
 1872                                     optuio->uio_iov[errmsg_pos].iov_base,
 1873                                     errmsg_len);
 1874                         else
 1875                                 copyout(errmsg,
 1876                                     optuio->uio_iov[errmsg_pos].iov_base,
 1877                                     errmsg_len);
 1878                 }
 1879         }
 1880  done_free:
 1881 #ifdef INET
 1882         free(ip4, M_PRISON);
 1883 #endif
 1884 #ifdef INET6
 1885         free(ip6, M_PRISON);
 1886 #endif
 1887         if (g_path != NULL)
 1888                 free(g_path, M_TEMP);
 1889         vfs_freeopts(opts);
 1890         return (error);
 1891 }
 1892 
 1893 
 1894 /*
 1895  * struct jail_get_args {
 1896  *      struct iovec *iovp;
 1897  *      unsigned int iovcnt;
 1898  *      int flags;
 1899  * };
 1900  */
 1901 int
 1902 sys_jail_get(struct thread *td, struct jail_get_args *uap)
 1903 {
 1904         struct uio *auio;
 1905         int error;
 1906 
 1907         /* Check that we have an even number of iovecs. */
 1908         if (uap->iovcnt & 1)
 1909                 return (EINVAL);
 1910 
 1911         error = copyinuio(uap->iovp, uap->iovcnt, &auio);
 1912         if (error)
 1913                 return (error);
 1914         error = kern_jail_get(td, auio, uap->flags);
 1915         if (error == 0)
 1916                 error = copyout(auio->uio_iov, uap->iovp,
 1917                     uap->iovcnt * sizeof (struct iovec));
 1918         free(auio, M_IOV);
 1919         return (error);
 1920 }
 1921 
 1922 int
 1923 kern_jail_get(struct thread *td, struct uio *optuio, int flags)
 1924 {
 1925         struct bool_flags *bf;
 1926         struct jailsys_flags *jsf;
 1927         struct prison *pr, *mypr;
 1928         struct vfsopt *opt;
 1929         struct vfsoptlist *opts;
 1930         char *errmsg, *name;
 1931         int error, errmsg_len, errmsg_pos, i, jid, len, locked, pos;
 1932         unsigned f;
 1933 
 1934         if (flags & ~JAIL_GET_MASK)
 1935                 return (EINVAL);
 1936 
 1937         /* Get the parameter list. */
 1938         error = vfs_buildopts(optuio, &opts);
 1939         if (error)
 1940                 return (error);
 1941         errmsg_pos = vfs_getopt_pos(opts, "errmsg");
 1942         mypr = td->td_ucred->cr_prison;
 1943 
 1944         /*
 1945          * Find the prison specified by one of: lastjid, jid, name.
 1946          */
 1947         sx_slock(&allprison_lock);
 1948         error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
 1949         if (error == 0) {
 1950                 TAILQ_FOREACH(pr, &allprison, pr_list) {
 1951                         if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
 1952                                 mtx_lock(&pr->pr_mtx);
 1953                                 if (pr->pr_ref > 0 &&
 1954                                     (pr->pr_uref > 0 || (flags & JAIL_DYING)))
 1955                                         break;
 1956                                 mtx_unlock(&pr->pr_mtx);
 1957                         }
 1958                 }
 1959                 if (pr != NULL)
 1960                         goto found_prison;
 1961                 error = ENOENT;
 1962                 vfs_opterror(opts, "no jail after %d", jid);
 1963                 goto done_unlock_list;
 1964         } else if (error != ENOENT)
 1965                 goto done_unlock_list;
 1966 
 1967         error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
 1968         if (error == 0) {
 1969                 if (jid != 0) {
 1970                         pr = prison_find_child(mypr, jid);
 1971                         if (pr != NULL) {
 1972                                 if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
 1973                                         mtx_unlock(&pr->pr_mtx);
 1974                                         error = ENOENT;
 1975                                         vfs_opterror(opts, "jail %d is dying",
 1976                                             jid);
 1977                                         goto done_unlock_list;
 1978                                 }
 1979                                 goto found_prison;
 1980                         }
 1981                         error = ENOENT;
 1982                         vfs_opterror(opts, "jail %d not found", jid);
 1983                         goto done_unlock_list;
 1984                 }
 1985         } else if (error != ENOENT)
 1986                 goto done_unlock_list;
 1987 
 1988         error = vfs_getopt(opts, "name", (void **)&name, &len);
 1989         if (error == 0) {
 1990                 if (len == 0 || name[len - 1] != '\0') {
 1991                         error = EINVAL;
 1992                         goto done_unlock_list;
 1993                 }
 1994                 pr = prison_find_name(mypr, name);
 1995                 if (pr != NULL) {
 1996                         if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
 1997                                 mtx_unlock(&pr->pr_mtx);
 1998                                 error = ENOENT;
 1999                                 vfs_opterror(opts, "jail \"%s\" is dying",
 2000                                     name);
 2001                                 goto done_unlock_list;
 2002                         }
 2003                         goto found_prison;
 2004                 }
 2005                 error = ENOENT;
 2006                 vfs_opterror(opts, "jail \"%s\" not found", name);
 2007                 goto done_unlock_list;
 2008         } else if (error != ENOENT)
 2009                 goto done_unlock_list;
 2010 
 2011         vfs_opterror(opts, "no jail specified");
 2012         error = ENOENT;
 2013         goto done_unlock_list;
 2014 
 2015  found_prison:
 2016         /* Get the parameters of the prison. */
 2017         pr->pr_ref++;
 2018         locked = PD_LOCKED;
 2019         td->td_retval[0] = pr->pr_id;
 2020         error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
 2021         if (error != 0 && error != ENOENT)
 2022                 goto done_deref;
 2023         i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
 2024         error = vfs_setopt(opts, "parent", &i, sizeof(i));
 2025         if (error != 0 && error != ENOENT)
 2026                 goto done_deref;
 2027         error = vfs_setopts(opts, "name", prison_name(mypr, pr));
 2028         if (error != 0 && error != ENOENT)
 2029                 goto done_deref;
 2030         error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
 2031             sizeof(pr->pr_cpuset->cs_id));
 2032         if (error != 0 && error != ENOENT)
 2033                 goto done_deref;
 2034         error = vfs_setopts(opts, "path", prison_path(mypr, pr));
 2035         if (error != 0 && error != ENOENT)
 2036                 goto done_deref;
 2037 #ifdef INET
 2038         error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
 2039             pr->pr_ip4s * sizeof(*pr->pr_ip4));
 2040         if (error != 0 && error != ENOENT)
 2041                 goto done_deref;
 2042 #endif
 2043 #ifdef INET6
 2044         error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
 2045             pr->pr_ip6s * sizeof(*pr->pr_ip6));
 2046         if (error != 0 && error != ENOENT)
 2047                 goto done_deref;
 2048 #endif
 2049         error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
 2050             sizeof(pr->pr_securelevel));
 2051         if (error != 0 && error != ENOENT)
 2052                 goto done_deref;
 2053         error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
 2054             sizeof(pr->pr_childcount));
 2055         if (error != 0 && error != ENOENT)
 2056                 goto done_deref;
 2057         error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
 2058             sizeof(pr->pr_childmax));
 2059         if (error != 0 && error != ENOENT)
 2060                 goto done_deref;
 2061         error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
 2062         if (error != 0 && error != ENOENT)
 2063                 goto done_deref;
 2064         error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
 2065         if (error != 0 && error != ENOENT)
 2066                 goto done_deref;
 2067         error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
 2068         if (error != 0 && error != ENOENT)
 2069                 goto done_deref;
 2070 #ifdef COMPAT_FREEBSD32
 2071         if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
 2072                 uint32_t hid32 = pr->pr_hostid;
 2073 
 2074                 error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
 2075         } else
 2076 #endif
 2077         error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
 2078             sizeof(pr->pr_hostid));
 2079         if (error != 0 && error != ENOENT)
 2080                 goto done_deref;
 2081         error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
 2082             sizeof(pr->pr_enforce_statfs));
 2083         if (error != 0 && error != ENOENT)
 2084                 goto done_deref;
 2085         error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
 2086             sizeof(pr->pr_devfs_rsnum));
 2087         if (error != 0 && error != ENOENT)
 2088                 goto done_deref;
 2089         for (bf = pr_flag_bool;
 2090              bf < pr_flag_bool + nitems(pr_flag_bool);
 2091              bf++) {
 2092                 i = (pr->pr_flags & bf->flag) ? 1 : 0;
 2093                 error = vfs_setopt(opts, bf->name, &i, sizeof(i));
 2094                 if (error != 0 && error != ENOENT)
 2095                         goto done_deref;
 2096                 i = !i;
 2097                 error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
 2098                 if (error != 0 && error != ENOENT)
 2099                         goto done_deref;
 2100         }
 2101         for (jsf = pr_flag_jailsys;
 2102              jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
 2103              jsf++) {
 2104                 f = pr->pr_flags & (jsf->disable | jsf->new);
 2105                 i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
 2106                     : (f == jsf->new) ? JAIL_SYS_NEW
 2107                     : JAIL_SYS_INHERIT;
 2108                 error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
 2109                 if (error != 0 && error != ENOENT)
 2110                         goto done_deref;
 2111         }
 2112         for (bf = pr_flag_allow;
 2113              bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
 2114              bf++) {
 2115                 i = (pr->pr_allow & bf->flag) ? 1 : 0;
 2116                 error = vfs_setopt(opts, bf->name, &i, sizeof(i));
 2117                 if (error != 0 && error != ENOENT)
 2118                         goto done_deref;
 2119                 i = !i;
 2120                 error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
 2121                 if (error != 0 && error != ENOENT)
 2122                         goto done_deref;
 2123         }
 2124         i = (pr->pr_uref == 0);
 2125         error = vfs_setopt(opts, "dying", &i, sizeof(i));
 2126         if (error != 0 && error != ENOENT)
 2127                 goto done_deref;
 2128         i = !i;
 2129         error = vfs_setopt(opts, "nodying", &i, sizeof(i));
 2130         if (error != 0 && error != ENOENT)
 2131                 goto done_deref;
 2132         error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
 2133             sizeof(pr->pr_osreldate));
 2134         if (error != 0 && error != ENOENT)
 2135                 goto done_deref;
 2136         error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
 2137         if (error != 0 && error != ENOENT)
 2138                 goto done_deref;
 2139 
 2140         /* Get the module parameters. */
 2141         mtx_unlock(&pr->pr_mtx);
 2142         locked = 0;
 2143         error = osd_jail_call(pr, PR_METHOD_GET, opts);
 2144         if (error)
 2145                 goto done_deref;
 2146         prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
 2147 
 2148         /* By now, all parameters should have been noted. */
 2149         TAILQ_FOREACH(opt, opts, link) {
 2150                 if (!opt->seen && strcmp(opt->name, "errmsg")) {
 2151                         error = EINVAL;
 2152                         vfs_opterror(opts, "unknown parameter: %s", opt->name);
 2153                         goto done_errmsg;
 2154                 }
 2155         }
 2156 
 2157         /* Write the fetched parameters back to userspace. */
 2158         error = 0;
 2159         TAILQ_FOREACH(opt, opts, link) {
 2160                 if (opt->pos >= 0 && opt->pos != errmsg_pos) {
 2161                         pos = 2 * opt->pos + 1;
 2162                         optuio->uio_iov[pos].iov_len = opt->len;
 2163                         if (opt->value != NULL) {
 2164                                 if (optuio->uio_segflg == UIO_SYSSPACE) {
 2165                                         bcopy(opt->value,
 2166                                             optuio->uio_iov[pos].iov_base,
 2167                                             opt->len);
 2168                                 } else {
 2169                                         error = copyout(opt->value,
 2170                                             optuio->uio_iov[pos].iov_base,
 2171                                             opt->len);
 2172                                         if (error)
 2173                                                 break;
 2174                                 }
 2175                         }
 2176                 }
 2177         }
 2178         goto done_errmsg;
 2179 
 2180  done_deref:
 2181         prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
 2182         goto done_errmsg;
 2183 
 2184  done_unlock_list:
 2185         sx_sunlock(&allprison_lock);
 2186  done_errmsg:
 2187         if (error && errmsg_pos >= 0) {
 2188                 vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
 2189                 errmsg_pos = 2 * errmsg_pos + 1;
 2190                 if (errmsg_len > 0) {
 2191                         if (optuio->uio_segflg == UIO_SYSSPACE)
 2192                                 bcopy(errmsg,
 2193                                     optuio->uio_iov[errmsg_pos].iov_base,
 2194                                     errmsg_len);
 2195                         else
 2196                                 copyout(errmsg,
 2197                                     optuio->uio_iov[errmsg_pos].iov_base,
 2198                                     errmsg_len);
 2199                 }
 2200         }
 2201         vfs_freeopts(opts);
 2202         return (error);
 2203 }
 2204 
 2205 
 2206 /*
 2207  * struct jail_remove_args {
 2208  *      int jid;
 2209  * };
 2210  */
 2211 int
 2212 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
 2213 {
 2214         struct prison *pr, *cpr, *lpr, *tpr;
 2215         int descend, error;
 2216 
 2217         error = priv_check(td, PRIV_JAIL_REMOVE);
 2218         if (error)
 2219                 return (error);
 2220 
 2221         sx_xlock(&allprison_lock);
 2222         pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
 2223         if (pr == NULL) {
 2224                 sx_xunlock(&allprison_lock);
 2225                 return (EINVAL);
 2226         }
 2227 
 2228         /* Remove all descendants of this prison, then remove this prison. */
 2229         pr->pr_ref++;
 2230         if (!LIST_EMPTY(&pr->pr_children)) {
 2231                 mtx_unlock(&pr->pr_mtx);
 2232                 lpr = NULL;
 2233                 FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
 2234                         mtx_lock(&cpr->pr_mtx);
 2235                         if (cpr->pr_ref > 0) {
 2236                                 tpr = cpr;
 2237                                 cpr->pr_ref++;
 2238                         } else {
 2239                                 /* Already removed - do not do it again. */
 2240                                 tpr = NULL;
 2241                         }
 2242                         mtx_unlock(&cpr->pr_mtx);
 2243                         if (lpr != NULL) {
 2244                                 mtx_lock(&lpr->pr_mtx);
 2245                                 prison_remove_one(lpr);
 2246                                 sx_xlock(&allprison_lock);
 2247                         }
 2248                         lpr = tpr;
 2249                 }
 2250                 if (lpr != NULL) {
 2251                         mtx_lock(&lpr->pr_mtx);
 2252                         prison_remove_one(lpr);
 2253                         sx_xlock(&allprison_lock);
 2254                 }
 2255                 mtx_lock(&pr->pr_mtx);
 2256         }
 2257         prison_remove_one(pr);
 2258         return (0);
 2259 }
 2260 
 2261 static void
 2262 prison_remove_one(struct prison *pr)
 2263 {
 2264         struct proc *p;
 2265         int deuref;
 2266 
 2267         /* If the prison was persistent, it is not anymore. */
 2268         deuref = 0;
 2269         if (pr->pr_flags & PR_PERSIST) {
 2270                 pr->pr_ref--;
 2271                 deuref = PD_DEUREF;
 2272                 pr->pr_flags &= ~PR_PERSIST;
 2273         }
 2274 
 2275         /*
 2276          * jail_remove added a reference.  If that's the only one, remove
 2277          * the prison now.
 2278          */
 2279         KASSERT(pr->pr_ref > 0,
 2280             ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
 2281         if (pr->pr_ref == 1) {
 2282                 prison_deref(pr,
 2283                     deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
 2284                 return;
 2285         }
 2286 
 2287         mtx_unlock(&pr->pr_mtx);
 2288         sx_xunlock(&allprison_lock);
 2289         /*
 2290          * Kill all processes unfortunate enough to be attached to this prison.
 2291          */
 2292         sx_slock(&allproc_lock);
 2293         FOREACH_PROC_IN_SYSTEM(p) {
 2294                 PROC_LOCK(p);
 2295                 if (p->p_state != PRS_NEW && p->p_ucred &&
 2296                     p->p_ucred->cr_prison == pr)
 2297                         kern_psignal(p, SIGKILL);
 2298                 PROC_UNLOCK(p);
 2299         }
 2300         sx_sunlock(&allproc_lock);
 2301         /* Remove the temporary reference added by jail_remove. */
 2302         prison_deref(pr, deuref | PD_DEREF);
 2303 }
 2304 
 2305 
 2306 /*
 2307  * struct jail_attach_args {
 2308  *      int jid;
 2309  * };
 2310  */
 2311 int
 2312 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
 2313 {
 2314         struct prison *pr;
 2315         int error;
 2316 
 2317         error = priv_check(td, PRIV_JAIL_ATTACH);
 2318         if (error)
 2319                 return (error);
 2320 
 2321         /*
 2322          * Start with exclusive hold on allprison_lock to ensure that a possible
 2323          * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove.
 2324          * But then immediately downgrade it since we don't need to stop
 2325          * readers.
 2326          */
 2327         sx_xlock(&allprison_lock);
 2328         sx_downgrade(&allprison_lock);
 2329         pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
 2330         if (pr == NULL) {
 2331                 sx_sunlock(&allprison_lock);
 2332                 return (EINVAL);
 2333         }
 2334 
 2335         /*
 2336          * Do not allow a process to attach to a prison that is not
 2337          * considered to be "alive".
 2338          */
 2339         if (pr->pr_uref == 0) {
 2340                 mtx_unlock(&pr->pr_mtx);
 2341                 sx_sunlock(&allprison_lock);
 2342                 return (EINVAL);
 2343         }
 2344 
 2345         return (do_jail_attach(td, pr));
 2346 }
 2347 
 2348 static int
 2349 do_jail_attach(struct thread *td, struct prison *pr)
 2350 {
 2351         struct proc *p;
 2352         struct ucred *newcred, *oldcred;
 2353         int error;
 2354 
 2355         /*
 2356          * XXX: Note that there is a slight race here if two threads
 2357          * in the same privileged process attempt to attach to two
 2358          * different jails at the same time.  It is important for
 2359          * user processes not to do this, or they might end up with
 2360          * a process root from one prison, but attached to the jail
 2361          * of another.
 2362          */
 2363         pr->pr_ref++;
 2364         pr->pr_uref++;
 2365         mtx_unlock(&pr->pr_mtx);
 2366 
 2367         /* Let modules do whatever they need to prepare for attaching. */
 2368         error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
 2369         if (error) {
 2370                 prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
 2371                 return (error);
 2372         }
 2373         sx_sunlock(&allprison_lock);
 2374 
 2375         /*
 2376          * Reparent the newly attached process to this jail.
 2377          */
 2378         p = td->td_proc;
 2379         error = cpuset_setproc_update_set(p, pr->pr_cpuset);
 2380         if (error)
 2381                 goto e_revert_osd;
 2382 
 2383         vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
 2384         if ((error = change_dir(pr->pr_root, td)) != 0)
 2385                 goto e_unlock;
 2386 #ifdef MAC
 2387         if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
 2388                 goto e_unlock;
 2389 #endif
 2390         VOP_UNLOCK(pr->pr_root, 0);
 2391         if ((error = pwd_chroot(td, pr->pr_root)))
 2392                 goto e_revert_osd;
 2393 
 2394         newcred = crget();
 2395         PROC_LOCK(p);
 2396         oldcred = crcopysafe(p, newcred);
 2397         newcred->cr_prison = pr;
 2398         proc_set_cred(p, newcred);
 2399         setsugid(p);
 2400 #ifdef RACCT
 2401         racct_proc_ucred_changed(p, oldcred, newcred);
 2402         crhold(newcred);
 2403 #endif
 2404         PROC_UNLOCK(p);
 2405 #ifdef RCTL
 2406         rctl_proc_ucred_changed(p, newcred);
 2407         crfree(newcred);
 2408 #endif
 2409         prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF);
 2410         crfree(oldcred);
 2411         return (0);
 2412 
 2413  e_unlock:
 2414         VOP_UNLOCK(pr->pr_root, 0);
 2415  e_revert_osd:
 2416         /* Tell modules this thread is still in its old jail after all. */
 2417         (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
 2418         prison_deref(pr, PD_DEREF | PD_DEUREF);
 2419         return (error);
 2420 }
 2421 
 2422 
 2423 /*
 2424  * Returns a locked prison instance, or NULL on failure.
 2425  */
 2426 struct prison *
 2427 prison_find(int prid)
 2428 {
 2429         struct prison *pr;
 2430 
 2431         sx_assert(&allprison_lock, SX_LOCKED);
 2432         TAILQ_FOREACH(pr, &allprison, pr_list) {
 2433                 if (pr->pr_id == prid) {
 2434                         mtx_lock(&pr->pr_mtx);
 2435                         if (pr->pr_ref > 0)
 2436                                 return (pr);
 2437                         mtx_unlock(&pr->pr_mtx);
 2438                 }
 2439         }
 2440         return (NULL);
 2441 }
 2442 
 2443 /*
 2444  * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
 2445  */
 2446 struct prison *
 2447 prison_find_child(struct prison *mypr, int prid)
 2448 {
 2449         struct prison *pr;
 2450         int descend;
 2451 
 2452         sx_assert(&allprison_lock, SX_LOCKED);
 2453         FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
 2454                 if (pr->pr_id == prid) {
 2455                         mtx_lock(&pr->pr_mtx);
 2456                         if (pr->pr_ref > 0)
 2457                                 return (pr);
 2458                         mtx_unlock(&pr->pr_mtx);
 2459                 }
 2460         }
 2461         return (NULL);
 2462 }
 2463 
 2464 /*
 2465  * Look for the name relative to mypr.  Returns a locked prison or NULL.
 2466  */
 2467 struct prison *
 2468 prison_find_name(struct prison *mypr, const char *name)
 2469 {
 2470         struct prison *pr, *deadpr;
 2471         size_t mylen;
 2472         int descend;
 2473 
 2474         sx_assert(&allprison_lock, SX_LOCKED);
 2475         mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
 2476  again:
 2477         deadpr = NULL;
 2478         FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
 2479                 if (!strcmp(pr->pr_name + mylen, name)) {
 2480                         mtx_lock(&pr->pr_mtx);
 2481                         if (pr->pr_ref > 0) {
 2482                                 if (pr->pr_uref > 0)
 2483                                         return (pr);
 2484                                 deadpr = pr;
 2485                         }
 2486                         mtx_unlock(&pr->pr_mtx);
 2487                 }
 2488         }
 2489         /* There was no valid prison - perhaps there was a dying one. */
 2490         if (deadpr != NULL) {
 2491                 mtx_lock(&deadpr->pr_mtx);
 2492                 if (deadpr->pr_ref == 0) {
 2493                         mtx_unlock(&deadpr->pr_mtx);
 2494                         goto again;
 2495                 }
 2496         }
 2497         return (deadpr);
 2498 }
 2499 
 2500 /*
 2501  * See if a prison has the specific flag set.
 2502  */
 2503 int
 2504 prison_flag(struct ucred *cred, unsigned flag)
 2505 {
 2506 
 2507         /* This is an atomic read, so no locking is necessary. */
 2508         return (cred->cr_prison->pr_flags & flag);
 2509 }
 2510 
 2511 int
 2512 prison_allow(struct ucred *cred, unsigned flag)
 2513 {
 2514 
 2515         /* This is an atomic read, so no locking is necessary. */
 2516         return (cred->cr_prison->pr_allow & flag);
 2517 }
 2518 
 2519 /*
 2520  * Remove a prison reference.  If that was the last reference, remove the
 2521  * prison itself - but not in this context in case there are locks held.
 2522  */
 2523 void
 2524 prison_free_locked(struct prison *pr)
 2525 {
 2526         int ref;
 2527 
 2528         mtx_assert(&pr->pr_mtx, MA_OWNED);
 2529         ref = --pr->pr_ref;
 2530         mtx_unlock(&pr->pr_mtx);
 2531         if (ref == 0)
 2532                 taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 2533 }
 2534 
 2535 void
 2536 prison_free(struct prison *pr)
 2537 {
 2538 
 2539         mtx_lock(&pr->pr_mtx);
 2540         prison_free_locked(pr);
 2541 }
 2542 
 2543 /*
 2544  * Complete a call to either prison_free or prison_proc_free.
 2545  */
 2546 static void
 2547 prison_complete(void *context, int pending)
 2548 {
 2549         struct prison *pr = context;
 2550 
 2551         sx_xlock(&allprison_lock);
 2552         mtx_lock(&pr->pr_mtx);
 2553         prison_deref(pr, pr->pr_uref
 2554             ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED
 2555             : PD_LOCKED | PD_LIST_XLOCKED);
 2556 }
 2557 
 2558 /*
 2559  * Remove a prison reference (usually).  This internal version assumes no
 2560  * mutexes are held, except perhaps the prison itself.  If there are no more
 2561  * references, release and delist the prison.  On completion, the prison lock
 2562  * and the allprison lock are both unlocked.
 2563  */
 2564 static void
 2565 prison_deref(struct prison *pr, int flags)
 2566 {
 2567         struct prison *ppr, *tpr;
 2568         int ref, lasturef;
 2569 
 2570         if (!(flags & PD_LOCKED))
 2571                 mtx_lock(&pr->pr_mtx);
 2572         for (;;) {
 2573                 if (flags & PD_DEUREF) {
 2574                         KASSERT(pr->pr_uref > 0,
 2575                             ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
 2576                              pr->pr_id));
 2577                         pr->pr_uref--;
 2578                         lasturef = pr->pr_uref == 0;
 2579                         if (lasturef)
 2580                                 pr->pr_ref++;
 2581                         KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
 2582                 } else
 2583                         lasturef = 0;
 2584                 if (flags & PD_DEREF) {
 2585                         KASSERT(pr->pr_ref > 0,
 2586                             ("prison_deref PD_DEREF on a dead prison (jid=%d)",
 2587                              pr->pr_id));
 2588                         pr->pr_ref--;
 2589                 }
 2590                 ref = pr->pr_ref;
 2591                 mtx_unlock(&pr->pr_mtx);
 2592 
 2593                 /*
 2594                  * Tell the modules if the last user reference was removed
 2595                  * (even it sticks around in dying state).
 2596                  */
 2597                 if (lasturef) {
 2598                         if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) {
 2599                                 sx_xlock(&allprison_lock);
 2600                                 flags |= PD_LIST_XLOCKED;
 2601                         }
 2602                         (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
 2603                         mtx_lock(&pr->pr_mtx);
 2604                         ref = --pr->pr_ref;
 2605                         mtx_unlock(&pr->pr_mtx);
 2606                 }
 2607 
 2608                 /* If the prison still has references, nothing else to do. */
 2609                 if (ref > 0) {
 2610                         if (flags & PD_LIST_SLOCKED)
 2611                                 sx_sunlock(&allprison_lock);
 2612                         else if (flags & PD_LIST_XLOCKED)
 2613                                 sx_xunlock(&allprison_lock);
 2614                         return;
 2615                 }
 2616 
 2617                 if (flags & PD_LIST_SLOCKED) {
 2618                         if (!sx_try_upgrade(&allprison_lock)) {
 2619                                 sx_sunlock(&allprison_lock);
 2620                                 sx_xlock(&allprison_lock);
 2621                         }
 2622                 } else if (!(flags & PD_LIST_XLOCKED))
 2623                         sx_xlock(&allprison_lock);
 2624 
 2625                 TAILQ_REMOVE(&allprison, pr, pr_list);
 2626                 LIST_REMOVE(pr, pr_sibling);
 2627                 ppr = pr->pr_parent;
 2628                 for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
 2629                         tpr->pr_childcount--;
 2630                 sx_xunlock(&allprison_lock);
 2631 
 2632 #ifdef VIMAGE
 2633                 if (pr->pr_vnet != ppr->pr_vnet)
 2634                         vnet_destroy(pr->pr_vnet);
 2635 #endif
 2636                 if (pr->pr_root != NULL)
 2637                         vrele(pr->pr_root);
 2638                 mtx_destroy(&pr->pr_mtx);
 2639 #ifdef INET
 2640                 free(pr->pr_ip4, M_PRISON);
 2641 #endif
 2642 #ifdef INET6
 2643                 free(pr->pr_ip6, M_PRISON);
 2644 #endif
 2645                 if (pr->pr_cpuset != NULL)
 2646                         cpuset_rel(pr->pr_cpuset);
 2647                 osd_jail_exit(pr);
 2648 #ifdef RACCT
 2649                 if (racct_enable)
 2650                         prison_racct_detach(pr);
 2651 #endif
 2652                 free(pr, M_PRISON);
 2653 
 2654                 /* Removing a prison frees a reference on its parent. */
 2655                 pr = ppr;
 2656                 mtx_lock(&pr->pr_mtx);
 2657                 flags = PD_DEREF | PD_DEUREF;
 2658         }
 2659 }
 2660 
 2661 void
 2662 prison_hold_locked(struct prison *pr)
 2663 {
 2664 
 2665         mtx_assert(&pr->pr_mtx, MA_OWNED);
 2666         KASSERT(pr->pr_ref > 0,
 2667             ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
 2668         pr->pr_ref++;
 2669 }
 2670 
 2671 void
 2672 prison_hold(struct prison *pr)
 2673 {
 2674 
 2675         mtx_lock(&pr->pr_mtx);
 2676         prison_hold_locked(pr);
 2677         mtx_unlock(&pr->pr_mtx);
 2678 }
 2679 
 2680 void
 2681 prison_proc_hold(struct prison *pr)
 2682 {
 2683 
 2684         mtx_lock(&pr->pr_mtx);
 2685         KASSERT(pr->pr_uref > 0,
 2686             ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
 2687         pr->pr_uref++;
 2688         mtx_unlock(&pr->pr_mtx);
 2689 }
 2690 
 2691 void
 2692 prison_proc_free(struct prison *pr)
 2693 {
 2694 
 2695         mtx_lock(&pr->pr_mtx);
 2696         KASSERT(pr->pr_uref > 0,
 2697             ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
 2698         if (pr->pr_uref > 1)
 2699                 pr->pr_uref--;
 2700         else {
 2701                 /*
 2702                  * Don't remove the last user reference in this context, which
 2703                  * is expected to be a process that is not only locked, but
 2704                  * also half dead.
 2705                  */
 2706                 pr->pr_ref++;
 2707                 mtx_unlock(&pr->pr_mtx);
 2708                 taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 2709                 return;
 2710         }
 2711         mtx_unlock(&pr->pr_mtx);
 2712 }
 2713 
 2714 /*
 2715  * Check if a jail supports the given address family.
 2716  *
 2717  * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
 2718  * if not.
 2719  */
 2720 int
 2721 prison_check_af(struct ucred *cred, int af)
 2722 {
 2723         struct prison *pr;
 2724         int error;
 2725 
 2726         KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
 2727 
 2728         pr = cred->cr_prison;
 2729 #ifdef VIMAGE
 2730         /* Prisons with their own network stack are not limited. */
 2731         if (prison_owns_vnet(cred))
 2732                 return (0);
 2733 #endif
 2734 
 2735         error = 0;
 2736         switch (af)
 2737         {
 2738 #ifdef INET
 2739         case AF_INET:
 2740                 if (pr->pr_flags & PR_IP4)
 2741                 {
 2742                         mtx_lock(&pr->pr_mtx);
 2743                         if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
 2744                                 error = EAFNOSUPPORT;
 2745                         mtx_unlock(&pr->pr_mtx);
 2746                 }
 2747                 break;
 2748 #endif
 2749 #ifdef INET6
 2750         case AF_INET6:
 2751                 if (pr->pr_flags & PR_IP6)
 2752                 {
 2753                         mtx_lock(&pr->pr_mtx);
 2754                         if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
 2755                                 error = EAFNOSUPPORT;
 2756                         mtx_unlock(&pr->pr_mtx);
 2757                 }
 2758                 break;
 2759 #endif
 2760         case AF_LOCAL:
 2761         case AF_ROUTE:
 2762                 break;
 2763         default:
 2764                 if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
 2765                         error = EAFNOSUPPORT;
 2766         }
 2767         return (error);
 2768 }
 2769 
 2770 /*
 2771  * Check if given address belongs to the jail referenced by cred (wrapper to
 2772  * prison_check_ip[46]).
 2773  *
 2774  * Returns 0 if jail doesn't restrict the address family or if address belongs
 2775  * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
 2776  * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
 2777  */
 2778 int
 2779 prison_if(struct ucred *cred, struct sockaddr *sa)
 2780 {
 2781 #ifdef INET
 2782         struct sockaddr_in *sai;
 2783 #endif
 2784 #ifdef INET6
 2785         struct sockaddr_in6 *sai6;
 2786 #endif
 2787         int error;
 2788 
 2789         KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
 2790         KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
 2791 
 2792 #ifdef VIMAGE
 2793         if (prison_owns_vnet(cred))
 2794                 return (0);
 2795 #endif
 2796 
 2797         error = 0;
 2798         switch (sa->sa_family)
 2799         {
 2800 #ifdef INET
 2801         case AF_INET:
 2802                 sai = (struct sockaddr_in *)sa;
 2803                 error = prison_check_ip4(cred, &sai->sin_addr);
 2804                 break;
 2805 #endif
 2806 #ifdef INET6
 2807         case AF_INET6:
 2808                 sai6 = (struct sockaddr_in6 *)sa;
 2809                 error = prison_check_ip6(cred, &sai6->sin6_addr);
 2810                 break;
 2811 #endif
 2812         default:
 2813                 if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
 2814                         error = EAFNOSUPPORT;
 2815         }
 2816         return (error);
 2817 }
 2818 
 2819 /*
 2820  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
 2821  */
 2822 int
 2823 prison_check(struct ucred *cred1, struct ucred *cred2)
 2824 {
 2825 
 2826         return ((cred1->cr_prison == cred2->cr_prison ||
 2827             prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
 2828 }
 2829 
 2830 /*
 2831  * Return 1 if p2 is a child of p1, otherwise 0.
 2832  */
 2833 int
 2834 prison_ischild(struct prison *pr1, struct prison *pr2)
 2835 {
 2836 
 2837         for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
 2838                 if (pr1 == pr2)
 2839                         return (1);
 2840         return (0);
 2841 }
 2842 
 2843 /*
 2844  * Return 1 if the passed credential is in a jail, otherwise 0.
 2845  */
 2846 int
 2847 jailed(struct ucred *cred)
 2848 {
 2849 
 2850         return (cred->cr_prison != &prison0);
 2851 }
 2852 
 2853 /*
 2854  * Return 1 if the passed credential is in a jail and that jail does not
 2855  * have its own virtual network stack, otherwise 0.
 2856  */
 2857 int
 2858 jailed_without_vnet(struct ucred *cred)
 2859 {
 2860 
 2861         if (!jailed(cred))
 2862                 return (0);
 2863 #ifdef VIMAGE
 2864         if (prison_owns_vnet(cred))
 2865                 return (0);
 2866 #endif
 2867 
 2868         return (1);
 2869 }
 2870 
 2871 /*
 2872  * Return the correct hostname (domainname, et al) for the passed credential.
 2873  */
 2874 void
 2875 getcredhostname(struct ucred *cred, char *buf, size_t size)
 2876 {
 2877         struct prison *pr;
 2878 
 2879         /*
 2880          * A NULL credential can be used to shortcut to the physical
 2881          * system's hostname.
 2882          */
 2883         pr = (cred != NULL) ? cred->cr_prison : &prison0;
 2884         mtx_lock(&pr->pr_mtx);
 2885         strlcpy(buf, pr->pr_hostname, size);
 2886         mtx_unlock(&pr->pr_mtx);
 2887 }
 2888 
 2889 void
 2890 getcreddomainname(struct ucred *cred, char *buf, size_t size)
 2891 {
 2892 
 2893         mtx_lock(&cred->cr_prison->pr_mtx);
 2894         strlcpy(buf, cred->cr_prison->pr_domainname, size);
 2895         mtx_unlock(&cred->cr_prison->pr_mtx);
 2896 }
 2897 
 2898 void
 2899 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
 2900 {
 2901 
 2902         mtx_lock(&cred->cr_prison->pr_mtx);
 2903         strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
 2904         mtx_unlock(&cred->cr_prison->pr_mtx);
 2905 }
 2906 
 2907 void
 2908 getcredhostid(struct ucred *cred, unsigned long *hostid)
 2909 {
 2910 
 2911         mtx_lock(&cred->cr_prison->pr_mtx);
 2912         *hostid = cred->cr_prison->pr_hostid;
 2913         mtx_unlock(&cred->cr_prison->pr_mtx);
 2914 }
 2915 
 2916 #ifdef VIMAGE
 2917 /*
 2918  * Determine whether the prison represented by cred owns
 2919  * its vnet rather than having it inherited.
 2920  *
 2921  * Returns 1 in case the prison owns the vnet, 0 otherwise.
 2922  */
 2923 int
 2924 prison_owns_vnet(struct ucred *cred)
 2925 {
 2926 
 2927         /*
 2928          * vnets cannot be added/removed after jail creation,
 2929          * so no need to lock here.
 2930          */
 2931         return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
 2932 }
 2933 #endif
 2934 
 2935 /*
 2936  * Determine whether the subject represented by cred can "see"
 2937  * status of a mount point.
 2938  * Returns: 0 for permitted, ENOENT otherwise.
 2939  * XXX: This function should be called cr_canseemount() and should be
 2940  *      placed in kern_prot.c.
 2941  */
 2942 int
 2943 prison_canseemount(struct ucred *cred, struct mount *mp)
 2944 {
 2945         struct prison *pr;
 2946         struct statfs *sp;
 2947         size_t len;
 2948 
 2949         pr = cred->cr_prison;
 2950         if (pr->pr_enforce_statfs == 0)
 2951                 return (0);
 2952         if (pr->pr_root->v_mount == mp)
 2953                 return (0);
 2954         if (pr->pr_enforce_statfs == 2)
 2955                 return (ENOENT);
 2956         /*
 2957          * If jail's chroot directory is set to "/" we should be able to see
 2958          * all mount-points from inside a jail.
 2959          * This is ugly check, but this is the only situation when jail's
 2960          * directory ends with '/'.
 2961          */
 2962         if (strcmp(pr->pr_path, "/") == 0)
 2963                 return (0);
 2964         len = strlen(pr->pr_path);
 2965         sp = &mp->mnt_stat;
 2966         if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
 2967                 return (ENOENT);
 2968         /*
 2969          * Be sure that we don't have situation where jail's root directory
 2970          * is "/some/path" and mount point is "/some/pathpath".
 2971          */
 2972         if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
 2973                 return (ENOENT);
 2974         return (0);
 2975 }
 2976 
 2977 void
 2978 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
 2979 {
 2980         char jpath[MAXPATHLEN];
 2981         struct prison *pr;
 2982         size_t len;
 2983 
 2984         pr = cred->cr_prison;
 2985         if (pr->pr_enforce_statfs == 0)
 2986                 return;
 2987         if (prison_canseemount(cred, mp) != 0) {
 2988                 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 2989                 strlcpy(sp->f_mntonname, "[restricted]",
 2990                     sizeof(sp->f_mntonname));
 2991                 return;
 2992         }
 2993         if (pr->pr_root->v_mount == mp) {
 2994                 /*
 2995                  * Clear current buffer data, so we are sure nothing from
 2996                  * the valid path left there.
 2997                  */
 2998                 bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 2999                 *sp->f_mntonname = '/';
 3000                 return;
 3001         }
 3002         /*
 3003          * If jail's chroot directory is set to "/" we should be able to see
 3004          * all mount-points from inside a jail.
 3005          */
 3006         if (strcmp(pr->pr_path, "/") == 0)
 3007                 return;
 3008         len = strlen(pr->pr_path);
 3009         strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
 3010         /*
 3011          * Clear current buffer data, so we are sure nothing from
 3012          * the valid path left there.
 3013          */
 3014         bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
 3015         if (*jpath == '\0') {
 3016                 /* Should never happen. */
 3017                 *sp->f_mntonname = '/';
 3018         } else {
 3019                 strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
 3020         }
 3021 }
 3022 
/*
 * Check whether permission for a specific privilege is granted within jail.
 * We have a specific list of accepted privileges; the rest are denied.
 *
 * Returns 0 when cred is not jailed or when the privilege is granted to the
 * jail, and EPERM otherwise.
 */
int
prison_priv_check(struct ucred *cred, int priv)
{

        /* Unjailed credentials are not restricted by this function. */
        if (!jailed(cred))
                return (0);

#ifdef VIMAGE
        /*
         * Privileges specific to prisons with a virtual network stack.
         * There might be a duplicate entry here in case the privilege
         * is only granted conditionally in the legacy jail case.
         *
         * Every case label in this switch falls through to the single
         * PR_VNET test at the bottom of the switch.
         */
        switch (priv) {
#ifdef notyet
                /*
                 * NFS-specific privileges.
                 */
        case PRIV_NFS_DAEMON:
        case PRIV_NFS_LOCKD:
#endif
                /*
                 * Network stack privileges.
                 */
        case PRIV_NET_BRIDGE:
        case PRIV_NET_GRE:
        case PRIV_NET_BPF:
        case PRIV_NET_RAW:              /* Dup, cond. in legacy jail case. */
        case PRIV_NET_ROUTE:
        case PRIV_NET_TAP:
        case PRIV_NET_SETIFMTU:
        case PRIV_NET_SETIFFLAGS:
        case PRIV_NET_SETIFCAP:
        case PRIV_NET_SETIFDESCR:
        case PRIV_NET_SETIFNAME :
        case PRIV_NET_SETIFMETRIC:
        case PRIV_NET_SETIFPHYS:
        case PRIV_NET_SETIFMAC:
        case PRIV_NET_SETLANPCP:
        case PRIV_NET_ADDMULTI:
        case PRIV_NET_DELMULTI:
        case PRIV_NET_HWIOCTL:
        case PRIV_NET_SETLLADDR:
        case PRIV_NET_ADDIFGROUP:
        case PRIV_NET_DELIFGROUP:
        case PRIV_NET_IFCREATE:
        case PRIV_NET_IFDESTROY:
        case PRIV_NET_ADDIFADDR:
        case PRIV_NET_DELIFADDR:
        case PRIV_NET_LAGG:
        case PRIV_NET_GIF:
        case PRIV_NET_SETIFVNET:
        case PRIV_NET_SETIFFIB:

                /*
                 * 802.11-related privileges.
                 */
        case PRIV_NET80211_GETKEY:
#ifdef notyet
        case PRIV_NET80211_MANAGE:              /* XXX-BZ discuss with sam@ */
#endif

#ifdef notyet
                /*
                 * ATM privileges.
                 */
        case PRIV_NETATM_CFG:
        case PRIV_NETATM_ADD:
        case PRIV_NETATM_DEL:
        case PRIV_NETATM_SET:

                /*
                 * Bluetooth privileges.
                 */
        case PRIV_NETBLUETOOTH_RAW:
#endif

                /*
                 * Netgraph and netgraph module privileges.
                 */
        case PRIV_NETGRAPH_CONTROL:
#ifdef notyet
        case PRIV_NETGRAPH_TTY:
#endif

                /*
                 * IPv4 and IPv6 privileges.
                 */
        case PRIV_NETINET_IPFW:
        case PRIV_NETINET_DIVERT:
        case PRIV_NETINET_PF:
        case PRIV_NETINET_DUMMYNET:
        case PRIV_NETINET_CARP:
        case PRIV_NETINET_MROUTE:
        case PRIV_NETINET_RAW:
        case PRIV_NETINET_ADDRCTRL6:
        case PRIV_NETINET_ND6:
        case PRIV_NETINET_SCOPE6:
        case PRIV_NETINET_ALIFETIME6:
        case PRIV_NETINET_IPSEC:
        case PRIV_NETINET_BINDANY:

#ifdef notyet
                /*
                 * NCP privileges.
                 */
        case PRIV_NETNCP:

                /*
                 * SMB privileges.
                 */
        case PRIV_NETSMB:
#endif

        /*
         * No default: or deny here.
         * In case of no permit fall through to next switch().
         */
                if (cred->cr_prison->pr_flags & PR_VNET)
                        return (0);
        }
#endif /* VIMAGE */

        /*
         * Privileges evaluated for every jail.  The first run of case
         * labels shares the unconditional grant at PRIV_KMEM_READ; the
         * cases after it are decided individually.
         */
        switch (priv) {

                /*
                 * Allow ktrace privileges for root in jail.
                 */
        case PRIV_KTRACE:

#if 0
                /*
                 * Allow jailed processes to configure audit identity and
                 * submit audit records (login, etc).  In the future we may
                 * want to further refine the relationship between audit and
                 * jail.
                 */
        case PRIV_AUDIT_GETAUDIT:
        case PRIV_AUDIT_SETAUDIT:
        case PRIV_AUDIT_SUBMIT:
#endif

                /*
                 * Allow jailed processes to manipulate process UNIX
                 * credentials in any way they see fit.
                 */
        case PRIV_CRED_SETUID:
        case PRIV_CRED_SETEUID:
        case PRIV_CRED_SETGID:
        case PRIV_CRED_SETEGID:
        case PRIV_CRED_SETGROUPS:
        case PRIV_CRED_SETREUID:
        case PRIV_CRED_SETREGID:
        case PRIV_CRED_SETRESUID:
        case PRIV_CRED_SETRESGID:

                /*
                 * Jail implements visibility constraints already, so allow
                 * jailed root to override uid/gid-based constraints.
                 */
        case PRIV_SEEOTHERGIDS:
        case PRIV_SEEOTHERUIDS:

                /*
                 * Jail implements inter-process debugging limits already, so
                 * allow jailed root various debugging privileges.
                 */
        case PRIV_DEBUG_DIFFCRED:
        case PRIV_DEBUG_SUGID:
        case PRIV_DEBUG_UNPRIV:

                /*
                 * Allow jail to set various resource limits and login
                 * properties, and for now, exceed process resource limits.
                 */
        case PRIV_PROC_LIMIT:
        case PRIV_PROC_SETLOGIN:
        case PRIV_PROC_SETRLIMIT:

                /*
                 * System V and POSIX IPC privileges are granted in jail.
                 */
        case PRIV_IPC_READ:
        case PRIV_IPC_WRITE:
        case PRIV_IPC_ADMIN:
        case PRIV_IPC_MSGSIZE:
        case PRIV_MQ_ADMIN:

                /*
                 * Jail operations within a jail work on child jails.
                 */
        case PRIV_JAIL_ATTACH:
        case PRIV_JAIL_SET:
        case PRIV_JAIL_REMOVE:

                /*
                 * Jail implements its own inter-process limits, so allow
                 * root processes in jail to change scheduling on other
                 * processes in the same jail.  Likewise for signalling.
                 */
        case PRIV_SCHED_DIFFCRED:
        case PRIV_SCHED_CPUSET:
        case PRIV_SIGNAL_DIFFCRED:
        case PRIV_SIGNAL_SUGID:

                /*
                 * Allow jailed processes to write to sysctls marked as jail
                 * writable.
                 */
        case PRIV_SYSCTL_WRITEJAIL:

                /*
                 * Allow root in jail to manage a variety of quota
                 * properties.  These should likely be conditional on a
                 * configuration option.
                 */
        case PRIV_VFS_GETQUOTA:
        case PRIV_VFS_SETQUOTA:

                /*
                 * Since Jail relies on chroot() to implement file system
                 * protections, grant many VFS privileges to root in jail.
                 * Be careful to exclude mount-related and NFS-related
                 * privileges.
                 */
        case PRIV_VFS_READ:
        case PRIV_VFS_WRITE:
        case PRIV_VFS_ADMIN:
        case PRIV_VFS_EXEC:
        case PRIV_VFS_LOOKUP:
        case PRIV_VFS_BLOCKRESERVE:     /* XXXRW: Slightly surprising. */
        case PRIV_VFS_CHFLAGS_DEV:
        case PRIV_VFS_CHOWN:
        case PRIV_VFS_CHROOT:
        case PRIV_VFS_RETAINSUGID:
        case PRIV_VFS_FCHROOT:
        case PRIV_VFS_LINK:
        case PRIV_VFS_SETGID:
        case PRIV_VFS_STAT:
        case PRIV_VFS_STICKYFILE:

                /*
                 * As in the non-jail case, non-root users are expected to be
                 * able to read kernel/physical memory (provided /dev/[k]mem
                 * exists in the jail and they have permission to access it).
                 */
        case PRIV_KMEM_READ:
                /* Shared by every case label since the top of this switch. */
                return (0);

                /*
                 * Depending on the global setting, allow privilege of
                 * setting system flags.
                 */
        case PRIV_VFS_SYSFLAGS:
                if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
                        return (0);
                else
                        return (EPERM);

                /*
                 * Depending on the global setting, allow privilege of
                 * mounting/unmounting file systems.  The jail must also
                 * have an enforce_statfs level below 2.
                 */
        case PRIV_VFS_MOUNT:
        case PRIV_VFS_UNMOUNT:
        case PRIV_VFS_MOUNT_NONUSER:
        case PRIV_VFS_MOUNT_OWNER:
                if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
                    cred->cr_prison->pr_enforce_statfs < 2)
                        return (0);
                else
                        return (EPERM);

                /*
                 * Conditionally allow locking (unlocking) physical pages
                 * in memory.
                 */
        case PRIV_VM_MLOCK:
        case PRIV_VM_MUNLOCK:
                if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
                        return (0);
                else
                        return (EPERM);

                /*
                 * Conditionally allow jailed root to bind reserved ports.
                 */
        case PRIV_NETINET_RESERVEDPORT:
                if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
                        return (0);
                else
                        return (EPERM);

                /*
                 * Allow jailed root to reuse in-use ports.
                 */
        case PRIV_NETINET_REUSEPORT:
                return (0);

                /*
                 * Allow jailed root to set certain IPv4/6 (option) headers.
                 */
        case PRIV_NETINET_SETHDROPTS:
                return (0);

                /*
                 * Conditionally allow creating raw sockets in jail.
                 */
        case PRIV_NETINET_RAW:
                if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
                        return (0);
                else
                        return (EPERM);

                /*
                 * Since jail implements its own visibility limits on netstat
                 * sysctls, allow getcred.  This allows identd to work in
                 * jail.
                 */
        case PRIV_NETINET_GETCRED:
                return (0);

                /*
                 * Allow jailed root to set loginclass.
                 */
        case PRIV_PROC_SETLOGINCLASS:
                return (0);

                /*
                 * Do not allow a process inside a jail to read the kernel
                 * message buffer unless explicitly permitted.
                 */
        case PRIV_MSGBUF:
                if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
                        return (0);
                return (EPERM);

        default:
                /*
                 * In all remaining cases, deny the privilege request.  This
                 * includes almost all network privileges, many system
                 * configuration privileges.
                 */
                return (EPERM);
        }
}
 3373 
 3374 /*
 3375  * Return the part of pr2's name that is relative to pr1, or the whole name
 3376  * if it does not directly follow.
 3377  */
 3378 
 3379 char *
 3380 prison_name(struct prison *pr1, struct prison *pr2)
 3381 {
 3382         char *name;
 3383 
 3384         /* Jails see themselves as "" (if they see themselves at all). */
 3385         if (pr1 == pr2)
 3386                 return "";
 3387         name = pr2->pr_name;
 3388         if (prison_ischild(pr1, pr2)) {
 3389                 /*
 3390                  * pr1 isn't locked (and allprison_lock may not be either)
 3391                  * so its length can't be counted on.  But the number of dots
 3392                  * can be counted on - and counted.
 3393                  */
 3394                 for (; pr1 != &prison0; pr1 = pr1->pr_parent)
 3395                         name = strchr(name, '.') + 1;
 3396         }
 3397         return (name);
 3398 }
 3399 
 3400 /*
 3401  * Return the part of pr2's path that is relative to pr1, or the whole path
 3402  * if it does not directly follow.
 3403  */
 3404 static char *
 3405 prison_path(struct prison *pr1, struct prison *pr2)
 3406 {
 3407         char *path1, *path2;
 3408         int len1;
 3409 
 3410         path1 = pr1->pr_path;
 3411         path2 = pr2->pr_path;
 3412         if (!strcmp(path1, "/"))
 3413                 return (path2);
 3414         len1 = strlen(path1);
 3415         if (strncmp(path1, path2, len1))
 3416                 return (path2);
 3417         if (path2[len1] == '\0')
 3418                 return "/";
 3419         if (path2[len1] == '/')
 3420                 return (path2 + len1);
 3421         return (path2);
 3422 }
 3423 
 3424 
/*
 * Jail-related sysctls.
 *
 * This node is declared under _security with the name "jail"; the
 * _security_jail entries that follow in this file attach to it.
 */
static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
    "Jails");
 3430 
/*
 * Handler for the "list" sysctl under _security_jail.
 *
 * Walks the prisons visited by FOREACH_PRISON_DESCENDANT starting from the
 * requesting thread's prison and, for each one with a non-zero pr_ref,
 * copies out a struct xprison followed by its IPv4 addresses (INET) and
 * its IPv6 addresses (INET6).  Paths and names are reported relative to
 * the requesting prison.
 *
 * Returns 0, or the first error returned by SYSCTL_OUT.
 */
static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)
{
        struct xprison *xp;
        struct prison *pr, *cpr;
#ifdef INET
        /* Scratch copy of the current prison's IPv4 addresses and its capacity. */
        struct in_addr *ip4 = NULL;
        int ip4s = 0;
#endif
#ifdef INET6
        /* Scratch copy of the current prison's IPv6 addresses and its capacity. */
        struct in6_addr *ip6 = NULL;
        int ip6s = 0;
#endif
        int descend, error;

        xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
        pr = req->td->td_ucred->cr_prison;
        error = 0;
        /* allprison_lock is held shared for the whole walk, copy-outs included. */
        sx_slock(&allprison_lock);
        FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
#if defined(INET) || defined(INET6)
 again:
#endif
                mtx_lock(&cpr->pr_mtx);
#ifdef INET
                if (cpr->pr_ip4s > 0) {
                        /*
                         * Scratch buffer too small: drop the prison mutex,
                         * grow the buffer (M_WAITOK), and start over for
                         * this prison, re-reading the count under the lock.
                         */
                        if (ip4s < cpr->pr_ip4s) {
                                ip4s = cpr->pr_ip4s;
                                mtx_unlock(&cpr->pr_mtx);
                                ip4 = realloc(ip4, ip4s *
                                    sizeof(struct in_addr), M_TEMP, M_WAITOK);
                                goto again;
                        }
                        bcopy(cpr->pr_ip4, ip4,
                            cpr->pr_ip4s * sizeof(struct in_addr));
                }
#endif
#ifdef INET6
                if (cpr->pr_ip6s > 0) {
                        /* Same grow-and-retry scheme as for IPv4 above. */
                        if (ip6s < cpr->pr_ip6s) {
                                ip6s = cpr->pr_ip6s;
                                mtx_unlock(&cpr->pr_mtx);
                                ip6 = realloc(ip6, ip6s *
                                    sizeof(struct in6_addr), M_TEMP, M_WAITOK);
                                goto again;
                        }
                        bcopy(cpr->pr_ip6, ip6,
                            cpr->pr_ip6s * sizeof(struct in6_addr));
                }
#endif
                /* Prisons with no references left are not reported. */
                if (cpr->pr_ref == 0) {
                        mtx_unlock(&cpr->pr_mtx);
                        continue;
                }
                /* Snapshot the exported fields while the prison is locked. */
                bzero(xp, sizeof(*xp));
                xp->pr_version = XPRISON_VERSION;
                xp->pr_id = cpr->pr_id;
                xp->pr_state = cpr->pr_uref > 0
                    ? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
                strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
                strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
                strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
#ifdef INET
                xp->pr_ip4s = cpr->pr_ip4s;
#endif
#ifdef INET6
                xp->pr_ip6s = cpr->pr_ip6s;
#endif
                mtx_unlock(&cpr->pr_mtx);
                /*
                 * Copy out from the snapshot with the prison mutex released:
                 * the header first, then the IPv4 and IPv6 address arrays.
                 */
                error = SYSCTL_OUT(req, xp, sizeof(*xp));
                if (error)
                        break;
#ifdef INET
                if (xp->pr_ip4s > 0) {
                        error = SYSCTL_OUT(req, ip4,
                            xp->pr_ip4s * sizeof(struct in_addr));
                        if (error)
                                break;
                }
#endif
#ifdef INET6
                if (xp->pr_ip6s > 0) {
                        error = SYSCTL_OUT(req, ip6,
                            xp->pr_ip6s * sizeof(struct in6_addr));
                        if (error)
                                break;
                }
#endif
        }
        sx_sunlock(&allprison_lock);
        free(xp, M_TEMP);
#ifdef INET
        free(ip4, M_TEMP);
#endif
#ifdef INET6
        free(ip6, M_TEMP);
#endif
        return (error);
}

/* Read-only structured node backed by sysctl_jail_list(). */
SYSCTL_OID(_security_jail, OID_AUTO, list,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_list, "S", "List of active jails");
 3534 
 3535 static int
 3536 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
 3537 {
 3538         int error, injail;
 3539 
 3540         injail = jailed(req->td->td_ucred);
 3541         error = SYSCTL_OUT(req, &injail, sizeof(injail));
 3542 
 3543         return (error);
 3544 }
 3545 
 3546 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
 3547     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
 3548     sysctl_jail_jailed, "I", "Process in jail?");
 3549 
 3550 static int
 3551 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
 3552 {
 3553         int error, havevnet;
 3554 #ifdef VIMAGE
 3555         struct ucred *cred = req->td->td_ucred;
 3556 
 3557         havevnet = jailed(cred) && prison_owns_vnet(cred);
 3558 #else
 3559         havevnet = 0;
 3560 #endif
 3561         error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
 3562 
 3563         return (error);
 3564 }
 3565 
 3566 SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
 3567     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
 3568     sysctl_jail_vnet, "I", "Jail owns vnet?");
 3569 
/*
 * Read-write export of the jail_max_af_ips variable.  It is only compiled
 * in when at least one of INET and INET6 is configured, and its
 * description marks it as deprecated.
 */
#if defined(INET) || defined(INET6)
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
    &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
 3575 
 3576 /*
 3577  * Default parameters for jail(2) compatibility.  For historical reasons,
 3578  * the sysctl names have varying similarity to the parameter names.  Prisons
 3579  * just see their own parameters, and can't change them.
 3580  */
 3581 static int
 3582 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
 3583 {
 3584         struct prison *pr;
 3585         int allow, error, i;
 3586 
 3587         pr = req->td->td_ucred->cr_prison;
 3588         allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
 3589 
 3590         /* Get the current flag value, and convert it to a boolean. */
 3591         i = (allow & arg2) ? 1 : 0;
 3592         if (arg1 != NULL)
 3593                 i = !i;
 3594         error = sysctl_handle_int(oidp, &i, 0, req);
 3595         if (error || !req->newptr)
 3596                 return (error);
 3597         i = i ? arg2 : 0;
 3598         if (arg1 != NULL)
 3599                 i ^= arg2;
 3600         /*
 3601          * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
 3602          * for writing.
 3603          */
 3604         mtx_lock(&prison0.pr_mtx);
 3605         jail_default_allow = (jail_default_allow & ~arg2) | i;
 3606         mtx_unlock(&prison0.pr_mtx);
 3607         return (0);
 3608 }
 3609 
/*
 * Deprecated boolean sysctls, all served by sysctl_jail_default_allow().
 * Each passes its PR_ALLOW_* bit as arg2.  arg1 is NULL except for
 * socket_unixiproute_only, which passes (void *)1 so that the handler
 * presents the inverse of PR_ALLOW_SOCKET_AF.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
    "Processes in jail can set their hostnames (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
    "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
    "Processes in jail can use System V IPC primitives (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
    "Prison root can create raw sockets (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
    "Processes in jail can alter system file flags (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
    "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
 3634 
 3635 static int
 3636 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
 3637 {
 3638         struct prison *pr;
 3639         int level, error;
 3640 
 3641         pr = req->td->td_ucred->cr_prison;
 3642         level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
 3643         error = sysctl_handle_int(oidp, &level, 0, req);
 3644         if (error || !req->newptr)
 3645                 return (error);
 3646         *(int *)arg1 = level;
 3647         return (0);
 3648 }
 3649 
/*
 * Deprecated integer sysctls served by sysctl_jail_default_level().
 * arg1 points at the global default and arg2 is the offset of the
 * matching field in struct prison.  enforce_statfs is declared
 * read-write; devfs_ruleset is declared read-only.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
    sysctl_jail_default_level, "I",
    "Processes in jail cannot see all mounted file systems (deprecated)");

SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
    &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
    sysctl_jail_default_level, "I",
    "Ruleset for the devfs filesystem in jail (deprecated)");
 3661 
/*
 * Nodes to describe jail parameters.  Maximum length of string parameters
 * is returned in the string itself, and the other parameters exist merely
 * to make themselves and their types known.
 *
 * This is the parent node, declared under _security_jail with the name
 * "param".
 */
SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
    "Jail parameters");
 3669 
 3670 int
 3671 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
 3672 {
 3673         int i;
 3674         long l;
 3675         size_t s;
 3676         char numbuf[12];
 3677 
 3678         switch (oidp->oid_kind & CTLTYPE)
 3679         {
 3680         case CTLTYPE_LONG:
 3681         case CTLTYPE_ULONG:
 3682                 l = 0;
 3683 #ifdef SCTL_MASK32
 3684                 if (!(req->flags & SCTL_MASK32))
 3685 #endif
 3686                         return (SYSCTL_OUT(req, &l, sizeof(l)));
 3687         case CTLTYPE_INT:
 3688         case CTLTYPE_UINT:
 3689                 i = 0;
 3690                 return (SYSCTL_OUT(req, &i, sizeof(i)));
 3691         case CTLTYPE_STRING:
 3692                 snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
 3693                 return
 3694                     (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
 3695         case CTLTYPE_STRUCT:
 3696                 s = (size_t)arg2;
 3697                 return (SYSCTL_OUT(req, &s, sizeof(s)));
 3698         }
 3699         return (0);
 3700 }
 3701 
/*
 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
 * jail creation time but cannot be changed in an existing jail.
 */

/* Top-level parameters. */
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail secure level");
SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
    "Jail value for kern.osreldate and uname -K");
SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
    "Jail value for kern.osrelease and uname -r");
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail cannot see all mounted file systems");
SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Ruleset for in-jail devfs mounts");
SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail persistence");
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
    "E,jailsys", "Virtual network stack");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
    "B", "Jail is in the process of shutting down");

/* children.*: child jail accounting. */
SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
    "I", "Current number of child jails");
SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Maximum number of child jails");

/* host.*: host identity as seen by the jail. */
SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail hostname");
SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail NIS domainname");
SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
    "Jail host UUID");
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
    "LU", "Jail host ID");

/* cpuset.*: the jail's cpuset. */
SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");

/* ip4.* and ip6.*: compiled in only with INET and INET6 respectively. */
#ifdef INET
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
    "Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
    "S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv4 source address selection rather than the "
    "primary jail IPv4 address.");
#endif
#ifdef INET6
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
    "Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
    "S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv6 source address selection rather than the "
    "primary jail IPv6 address.");
#endif

/* allow.*: boolean permission flags. */
SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set hostname");
SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may use SYSV IPC");
SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create raw sockets");
SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may alter system file flags");
SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set file quotas");
SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may lock (unlock) physical pages in memory");
SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may bind sockets to reserved ports");
SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may read the kernel message buffer");

/* allow.mount.*: the unnamed entry is the general mount permission. */
SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount/unmount jail-friendly file systems in general");
 3790 
 3791 /*
 3792  * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>.  Return
 3793  * its associated bit in the pr_allow bitmask, or zero if the parameter was
 3794  * not created.
 3795  */
 3796 unsigned
 3797 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
 3798     const char *descr)
 3799 {
 3800         struct bool_flags *bf;
 3801         struct sysctl_oid *parent;
 3802         char *allow_name, *allow_noname, *allowed;
 3803 #ifndef NO_SYSCTL_DESCR
 3804         char *descr_deprecated;
 3805 #endif
 3806         unsigned allow_flag;
 3807 
 3808         if (prefix
 3809             ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
 3810                 < 0 ||
 3811               asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
 3812                 < 0
 3813             : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
 3814               asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
 3815                 free(allow_name, M_PRISON);
 3816                 return 0;
 3817         }
 3818 
 3819         /*
 3820          * See if this parameter has already beed added, i.e. a module was
 3821          * previously loaded/unloaded.
 3822          */
 3823         mtx_lock(&prison0.pr_mtx);
 3824         for (bf = pr_flag_allow;
 3825              bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
 3826              bf++) {
 3827                 if (strcmp(bf->name, allow_name) == 0) {
 3828                         allow_flag = bf->flag;
 3829                         goto no_add;
 3830                 }
 3831         }
 3832 
 3833         /*
 3834          * Find a free bit in prison0's pr_allow, failing if there are none
 3835          * (which shouldn't happen as long as we keep track of how many
 3836          * potential dynamic flags exist).
 3837          */
 3838         for (allow_flag = 1;; allow_flag <<= 1) {
 3839                 if (allow_flag == 0)
 3840                         goto no_add;
 3841                 if ((prison0.pr_allow & allow_flag) == 0)
 3842                         break;
 3843         }
 3844 
 3845         /*
 3846          * Note the parameter in the next open slot in pr_flag_allow.
 3847          * Set the flag last so code that checks pr_flag_allow can do so
 3848          * without locking.
 3849          */
 3850         for (bf = pr_flag_allow; bf->flag != 0; bf++)
 3851                 if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
 3852                         /* This should never happen, but is not fatal. */
 3853                         allow_flag = 0;
 3854                         goto no_add;
 3855                 }
 3856         prison0.pr_allow |= allow_flag;
 3857         bf->name = allow_name;
 3858         bf->noname = allow_noname;
 3859         bf->flag = allow_flag;
 3860         mtx_unlock(&prison0.pr_mtx);
 3861 
 3862         /*
 3863          * Create sysctls for the paramter, and the back-compat global
 3864          * permission.
 3865          */
 3866         parent = prefix
 3867             ? SYSCTL_ADD_NODE(NULL,
 3868                   SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
 3869                   OID_AUTO, prefix, 0, 0, prefix_descr)
 3870             : &sysctl___security_jail_param_allow;
 3871         (void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
 3872             name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
 3873             NULL, 0, sysctl_jail_param, "B", descr);
 3874         if ((prefix
 3875              ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
 3876              : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
 3877 #ifndef NO_SYSCTL_DESCR
 3878                 (void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
 3879                     descr);
 3880 #endif
 3881                 (void)SYSCTL_ADD_PROC(NULL,
 3882                     SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
 3883                     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
 3884                     sysctl_jail_default_allow, "I", descr_deprecated);
 3885 #ifndef NO_SYSCTL_DESCR
 3886                 free(descr_deprecated, M_TEMP);
 3887 #endif
 3888                 free(allowed, M_TEMP);
 3889         }
 3890         return allow_flag;
 3891 
 3892  no_add:
 3893         mtx_unlock(&prison0.pr_mtx);
 3894         free(allow_name, M_PRISON);
 3895         free(allow_noname, M_PRISON);
 3896         return allow_flag;
 3897 }
 3898 
 3899 /*
 3900  * The VFS system will register jail-aware filesystems here.  They each get
 3901  * a parameter allow.mount.xxxfs and a flag to check when a jailed user
 3902  * attempts to mount.
 3903  */
 3904 void
 3905 prison_add_vfs(struct vfsconf *vfsp)
 3906 {
 3907 #ifdef NO_SYSCTL_DESCR
 3908 
 3909         vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
 3910             NULL, NULL);
 3911 #else
 3912         char *descr;
 3913 
 3914         (void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
 3915             vfsp->vfc_name);
 3916         vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
 3917             NULL, descr);
 3918         free(descr, M_TEMP);
 3919 #endif
 3920 }
 3921 
 3922 #ifdef RACCT
 3923 void
 3924 prison_racct_foreach(void (*callback)(struct racct *racct,
 3925     void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
 3926     void *arg2, void *arg3)
 3927 {
 3928         struct prison_racct *prr;
 3929 
 3930         ASSERT_RACCT_ENABLED();
 3931 
 3932         sx_slock(&allprison_lock);
 3933         if (pre != NULL)
 3934                 (pre)();
 3935         LIST_FOREACH(prr, &allprison_racct, prr_next)
 3936                 (callback)(prr->prr_racct, arg2, arg3);
 3937         if (post != NULL)
 3938                 (post)();
 3939         sx_sunlock(&allprison_lock);
 3940 }
 3941 
 3942 static struct prison_racct *
 3943 prison_racct_find_locked(const char *name)
 3944 {
 3945         struct prison_racct *prr;
 3946 
 3947         ASSERT_RACCT_ENABLED();
 3948         sx_assert(&allprison_lock, SA_XLOCKED);
 3949 
 3950         if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
 3951                 return (NULL);
 3952 
 3953         LIST_FOREACH(prr, &allprison_racct, prr_next) {
 3954                 if (strcmp(name, prr->prr_name) != 0)
 3955                         continue;
 3956 
 3957                 /* Found prison_racct with a matching name? */
 3958                 prison_racct_hold(prr);
 3959                 return (prr);
 3960         }
 3961 
 3962         /* Add new prison_racct. */
 3963         prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
 3964         racct_create(&prr->prr_racct);
 3965 
 3966         strcpy(prr->prr_name, name);
 3967         refcount_init(&prr->prr_refcount, 1);
 3968         LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
 3969 
 3970         return (prr);
 3971 }
 3972 
 3973 struct prison_racct *
 3974 prison_racct_find(const char *name)
 3975 {
 3976         struct prison_racct *prr;
 3977 
 3978         ASSERT_RACCT_ENABLED();
 3979 
 3980         sx_xlock(&allprison_lock);
 3981         prr = prison_racct_find_locked(name);
 3982         sx_xunlock(&allprison_lock);
 3983         return (prr);
 3984 }
 3985 
 3986 void
 3987 prison_racct_hold(struct prison_racct *prr)
 3988 {
 3989 
 3990         ASSERT_RACCT_ENABLED();
 3991 
 3992         refcount_acquire(&prr->prr_refcount);
 3993 }
 3994 
 3995 static void
 3996 prison_racct_free_locked(struct prison_racct *prr)
 3997 {
 3998 
 3999         ASSERT_RACCT_ENABLED();
 4000         sx_assert(&allprison_lock, SA_XLOCKED);
 4001 
 4002         if (refcount_release(&prr->prr_refcount)) {
 4003                 racct_destroy(&prr->prr_racct);
 4004                 LIST_REMOVE(prr, prr_next);
 4005                 free(prr, M_PRISON_RACCT);
 4006         }
 4007 }
 4008 
 4009 void
 4010 prison_racct_free(struct prison_racct *prr)
 4011 {
 4012         int old;
 4013 
 4014         ASSERT_RACCT_ENABLED();
 4015         sx_assert(&allprison_lock, SA_UNLOCKED);
 4016 
 4017         old = prr->prr_refcount;
 4018         if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
 4019                 return;
 4020 
 4021         sx_xlock(&allprison_lock);
 4022         prison_racct_free_locked(prr);
 4023         sx_xunlock(&allprison_lock);
 4024 }
 4025 
 4026 static void
 4027 prison_racct_attach(struct prison *pr)
 4028 {
 4029         struct prison_racct *prr;
 4030 
 4031         ASSERT_RACCT_ENABLED();
 4032         sx_assert(&allprison_lock, SA_XLOCKED);
 4033 
 4034         prr = prison_racct_find_locked(pr->pr_name);
 4035         KASSERT(prr != NULL, ("cannot find prison_racct"));
 4036 
 4037         pr->pr_prison_racct = prr;
 4038 }
 4039 
 4040 /*
 4041  * Handle jail renaming.  From the racct point of view, renaming means
 4042  * moving from one prison_racct to another.
 4043  */
 4044 static void
 4045 prison_racct_modify(struct prison *pr)
 4046 {
 4047 #ifdef RCTL
 4048         struct proc *p;
 4049         struct ucred *cred;
 4050 #endif
 4051         struct prison_racct *oldprr;
 4052 
 4053         ASSERT_RACCT_ENABLED();
 4054 
 4055         sx_slock(&allproc_lock);
 4056         sx_xlock(&allprison_lock);
 4057 
 4058         if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
 4059                 sx_xunlock(&allprison_lock);
 4060                 sx_sunlock(&allproc_lock);
 4061                 return;
 4062         }
 4063 
 4064         oldprr = pr->pr_prison_racct;
 4065         pr->pr_prison_racct = NULL;
 4066 
 4067         prison_racct_attach(pr);
 4068 
 4069         /*
 4070          * Move resource utilisation records.
 4071          */
 4072         racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
 4073 
 4074 #ifdef RCTL
 4075         /*
 4076          * Force rctl to reattach rules to processes.
 4077          */
 4078         FOREACH_PROC_IN_SYSTEM(p) {
 4079                 PROC_LOCK(p);
 4080                 cred = crhold(p->p_ucred);
 4081                 PROC_UNLOCK(p);
 4082                 rctl_proc_ucred_changed(p, cred);
 4083                 crfree(cred);
 4084         }
 4085 #endif
 4086 
 4087         sx_sunlock(&allproc_lock);
 4088         prison_racct_free_locked(oldprr);
 4089         sx_xunlock(&allprison_lock);
 4090 }
 4091 
 4092 static void
 4093 prison_racct_detach(struct prison *pr)
 4094 {
 4095 
 4096         ASSERT_RACCT_ENABLED();
 4097         sx_assert(&allprison_lock, SA_UNLOCKED);
 4098 
 4099         if (pr->pr_prison_racct == NULL)
 4100                 return;
 4101         prison_racct_free(pr->pr_prison_racct);
 4102         pr->pr_prison_racct = NULL;
 4103 }
 4104 #endif /* RACCT */
 4105 
 4106 #ifdef DDB
 4107 
 4108 static void
 4109 db_show_prison(struct prison *pr)
 4110 {
 4111         struct bool_flags *bf;
 4112         struct jailsys_flags *jsf;
 4113 #if defined(INET) || defined(INET6)
 4114         int ii;
 4115 #endif
 4116         unsigned f;
 4117 #ifdef INET
 4118         char ip4buf[INET_ADDRSTRLEN];
 4119 #endif
 4120 #ifdef INET6
 4121         char ip6buf[INET6_ADDRSTRLEN];
 4122 #endif
 4123 
 4124         db_printf("prison %p:\n", pr);
 4125         db_printf(" jid             = %d\n", pr->pr_id);
 4126         db_printf(" name            = %s\n", pr->pr_name);
 4127         db_printf(" parent          = %p\n", pr->pr_parent);
 4128         db_printf(" ref             = %d\n", pr->pr_ref);
 4129         db_printf(" uref            = %d\n", pr->pr_uref);
 4130         db_printf(" path            = %s\n", pr->pr_path);
 4131         db_printf(" cpuset          = %d\n", pr->pr_cpuset
 4132             ? pr->pr_cpuset->cs_id : -1);
 4133 #ifdef VIMAGE
 4134         db_printf(" vnet            = %p\n", pr->pr_vnet);
 4135 #endif
 4136         db_printf(" root            = %p\n", pr->pr_root);
 4137         db_printf(" securelevel     = %d\n", pr->pr_securelevel);
 4138         db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
 4139         db_printf(" children.max    = %d\n", pr->pr_childmax);
 4140         db_printf(" children.cur    = %d\n", pr->pr_childcount);
 4141         db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
 4142         db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
 4143         db_printf(" flags           = 0x%x", pr->pr_flags);
 4144         for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
 4145                 if (pr->pr_flags & bf->flag)
 4146                         db_printf(" %s", bf->name);
 4147         for (jsf = pr_flag_jailsys;
 4148              jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
 4149              jsf++) {
 4150                 f = pr->pr_flags & (jsf->disable | jsf->new);
 4151                 db_printf(" %-16s= %s\n", jsf->name,
 4152                     (f != 0 && f == jsf->disable) ? "disable"
 4153                     : (f == jsf->new) ? "new"
 4154                     : "inherit");
 4155         }
 4156         db_printf(" allow           = 0x%x", pr->pr_allow);
 4157         for (bf = pr_flag_allow;
 4158              bf < pr_flag_allow + nitems(pr_flag_allow) && bf->flag != 0;
 4159              bf++)
 4160                 if (pr->pr_allow & bf->flag)
 4161                         db_printf(" %s", bf->name);
 4162         db_printf("\n");
 4163         db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
 4164         db_printf(" host.hostname   = %s\n", pr->pr_hostname);
 4165         db_printf(" host.domainname = %s\n", pr->pr_domainname);
 4166         db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
 4167         db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
 4168 #ifdef INET
 4169         db_printf(" ip4s            = %d\n", pr->pr_ip4s);
 4170         for (ii = 0; ii < pr->pr_ip4s; ii++)
 4171                 db_printf(" %s %s\n",
 4172                     ii == 0 ? "ip4.addr        =" : "                 ",
 4173                     inet_ntoa_r(pr->pr_ip4[ii], ip4buf));
 4174 #endif
 4175 #ifdef INET6
 4176         db_printf(" ip6s            = %d\n", pr->pr_ip6s);
 4177         for (ii = 0; ii < pr->pr_ip6s; ii++)
 4178                 db_printf(" %s %s\n",
 4179                     ii == 0 ? "ip6.addr        =" : "                 ",
 4180                     ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
 4181 #endif
 4182 }
 4183 
 4184 DB_SHOW_COMMAND(prison, db_show_prison_command)
 4185 {
 4186         struct prison *pr;
 4187 
 4188         if (!have_addr) {
 4189                 /*
 4190                  * Show all prisons in the list, and prison0 which is not
 4191                  * listed.
 4192                  */
 4193                 db_show_prison(&prison0);
 4194                 if (!db_pager_quit) {
 4195                         TAILQ_FOREACH(pr, &allprison, pr_list) {
 4196                                 db_show_prison(pr);
 4197                                 if (db_pager_quit)
 4198                                         break;
 4199                         }
 4200                 }
 4201                 return;
 4202         }
 4203 
 4204         if (addr == 0)
 4205                 pr = &prison0;
 4206         else {
 4207                 /* Look for a prison with the ID and with references. */
 4208                 TAILQ_FOREACH(pr, &allprison, pr_list)
 4209                         if (pr->pr_id == addr && pr->pr_ref > 0)
 4210                                 break;
 4211                 if (pr == NULL)
 4212                         /* Look again, without requiring a reference. */
 4213                         TAILQ_FOREACH(pr, &allprison, pr_list)
 4214                                 if (pr->pr_id == addr)
 4215                                         break;
 4216                 if (pr == NULL)
 4217                         /* Assume address points to a valid prison. */
 4218                         pr = (struct prison *)addr;
 4219         }
 4220         db_show_prison(pr);
 4221 }
 4222 
 4223 #endif /* DDB */

Cache object: 2a4eb2f7e7b6161546e33fc431a5c9b4


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.