The Design and Implementation of the FreeBSD Operating System, Second Edition
Now available: The Design and Implementation of the FreeBSD Operating System (Second Edition)


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]

FreeBSD/Linux Kernel Cross Reference
sys/common/os/fork.c

Version: -  FREEBSD  -  FREEBSD-13-STABLE  -  FREEBSD-13-0  -  FREEBSD-12-STABLE  -  FREEBSD-12-0  -  FREEBSD-11-STABLE  -  FREEBSD-11-0  -  FREEBSD-10-STABLE  -  FREEBSD-10-0  -  FREEBSD-9-STABLE  -  FREEBSD-9-0  -  FREEBSD-8-STABLE  -  FREEBSD-8-0  -  FREEBSD-7-STABLE  -  FREEBSD-7-0  -  FREEBSD-6-STABLE  -  FREEBSD-6-0  -  FREEBSD-5-STABLE  -  FREEBSD-5-0  -  FREEBSD-4-STABLE  -  FREEBSD-3-STABLE  -  FREEBSD22  -  l41  -  OPENBSD  -  linux-2.6  -  MK84  -  PLAN9  -  xnu-8792 
SearchContext: -  none  -  3  -  10 

    1 /*
    2  * CDDL HEADER START
    3  *
    4  * The contents of this file are subject to the terms of the
    5  * Common Development and Distribution License (the "License").
    6  * You may not use this file except in compliance with the License.
    7  *
    8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
    9  * or http://www.opensolaris.org/os/licensing.
   10  * See the License for the specific language governing permissions
   11  * and limitations under the License.
   12  *
   13  * When distributing Covered Code, include this CDDL HEADER in each
   14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
   15  * If applicable, add the following below this CDDL HEADER, with the
   16  * fields enclosed by brackets "[]" replaced with your own identifying
   17  * information: Portions Copyright [yyyy] [name of copyright owner]
   18  *
   19  * CDDL HEADER END
   20  */
   21 
   22 /*
   23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
   24  */
   25 
   26 /*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
   27 /*        All Rights Reserved   */
   28 
   29 #include <sys/types.h>
   30 #include <sys/param.h>
   31 #include <sys/sysmacros.h>
   32 #include <sys/signal.h>
   33 #include <sys/cred.h>
   34 #include <sys/policy.h>
   35 #include <sys/user.h>
   36 #include <sys/systm.h>
   37 #include <sys/cpuvar.h>
   38 #include <sys/vfs.h>
   39 #include <sys/vnode.h>
   40 #include <sys/file.h>
   41 #include <sys/errno.h>
   42 #include <sys/time.h>
   43 #include <sys/proc.h>
   44 #include <sys/cmn_err.h>
   45 #include <sys/acct.h>
   46 #include <sys/tuneable.h>
   47 #include <sys/class.h>
   48 #include <sys/kmem.h>
   49 #include <sys/session.h>
   50 #include <sys/ucontext.h>
   51 #include <sys/stack.h>
   52 #include <sys/procfs.h>
   53 #include <sys/prsystm.h>
   54 #include <sys/vmsystm.h>
   55 #include <sys/vtrace.h>
   56 #include <sys/debug.h>
   57 #include <sys/shm_impl.h>
   58 #include <sys/door_data.h>
   59 #include <vm/as.h>
   60 #include <vm/rm.h>
   61 #include <c2/audit.h>
   62 #include <sys/var.h>
   63 #include <sys/schedctl.h>
   64 #include <sys/utrap.h>
   65 #include <sys/task.h>
   66 #include <sys/resource.h>
   67 #include <sys/cyclic.h>
   68 #include <sys/lgrp.h>
   69 #include <sys/rctl.h>
   70 #include <sys/contract_impl.h>
   71 #include <sys/contract/process_impl.h>
   72 #include <sys/list.h>
   73 #include <sys/dtrace.h>
   74 #include <sys/pool.h>
   75 #include <sys/zone.h>
   76 #include <sys/sdt.h>
   77 #include <sys/class.h>
   78 #include <sys/corectl.h>
   79 #include <sys/brand.h>
   80 #include <sys/fork.h>
   81 
   82 static int64_t cfork(int, int, int);    /* (isvfork, isfork1, flags) */
   83 static int getproc(proc_t **, pid_t, uint_t);
      /* Values passed as the last argument of getproc() in this file. */
   84 #define GETPROC_USER    0x0     /* passed by cfork() */
   85 #define GETPROC_KERNEL  0x1     /* passed by newproc() for kernel-class processes */
   86 
   87 static void fork_fail(proc_t *);        /* releases what getproc() set up */
   88 static void forklwp_fail(proc_t *);     /* frees lwps already created for a failed child */
   89 
      /* NOTE(review): not referenced in the visible part of this file; purpose unconfirmed. */
   90 int fork_fail_pending;
   91 
      /*
       * Defined elsewhere.  NOTE(review): presumably the kmem cache that
       * getproc() allocates proc_t structures from -- confirm against getproc().
       */
   92 extern struct kmem_cache *process_cache;
   93 
   94 /*
   95  * The vfork() system call trap is no longer invoked by libc.
   96  * It is retained only for the benefit of applications running
   97  * within a solaris10 branded zone.  It should be eliminated
   98  * when we no longer support solaris10 branded zones.
       *
       * Behaves like forksys() subcode 2 with flags == 0: it sets t_post_sys
       * on the calling thread and returns cfork(isvfork = 1, isfork1 = 1, 0).
   99  */
  100 int64_t
  101 vfork(void)
  102 {
  103         curthread->t_post_sys = 1;      /* so vfwait() will be called */
  104         return (cfork(1, 1, 0));
  105 }
  106 
  107 /*
  108  * forksys system call - forkx, forkallx, vforkx.  This is the
  109  * interface invoked by libc for fork1(), forkall(), and vfork()
       *
       * subcode selects the variant and maps onto cfork(isvfork, isfork1, flags):
       *   0 - forkx:    cfork(0, 1, flags)
       *   1 - forkallx: cfork(0, 0, flags)
       *   2 - vforkx:   cfork(1, 1, flags), after setting t_post_sys
       * Any other subcode fails with EINVAL via set_errno().  flags is passed
       * through unchanged; cfork() validates it.
  110  */
  111 int64_t
  112 forksys(int subcode, int flags)
  113 {
  114         switch (subcode) {
  115         case 0:
  116                 return (cfork(0, 1, flags));    /* forkx(flags) */
  117         case 1:
  118                 return (cfork(0, 0, flags));    /* forkallx(flags) */
  119         case 2:
  120                 curthread->t_post_sys = 1;      /* so vfwait() will be called */
  121                 return (cfork(1, 1, flags));    /* vforkx(flags) */
  122         default:
  123                 return ((int64_t)set_errno(EINVAL));
  124         }
  125 }
  126 
  127 /* ARGSUSED */
      /*
       * cfork() - common worker behind vfork() and forksys().
       *
       *   isvfork - non-zero: the child is given the parent's address space
       *             (cp->p_as = p->p_as, SVFORK set) and, on success, the
       *             parent is flagged SVFWAIT / T_VFPARENT before returning.
       *             Zero: the address space is duplicated with as_dup().
       *   isfork1 - non-zero: only the calling lwp is duplicated in the child.
       *             Zero (forkall): every entry in the parent's lwp directory
       *             is replicated, zombies included.
       *   flags   - FORK_NOSIGCHLD and/or FORK_WAITPID; any other bit yields
       *             EINVAL.
       *
       * On success the parent's return value is r.r_vals with r_val1 set to
       * the child's pid and r_val2 set to 0; the child's clone lwp has
       * (parent pid, 1) posted through lwp_setrval() or, for a branded
       * process, the brand's b_lwp_setrval().
       *
       * On failure returns set_errno(error), where error is EINVAL (bad
       * flags), ENOTSUP (caller is the /proc agent lwp), the value returned
       * by secpolicy_basic_fork(), EINTR (holdlwps() failed), ENOMEM (from
       * as_dup()) or EAGAIN (everything else).
       */
  128 static int64_t
  129 cfork(int isvfork, int isfork1, int flags)
  130 {
  131         proc_t *p = ttoproc(curthread);
  132         struct as *as;
  133         proc_t *cp, **orphpp;
  134         klwp_t *clone;
  135         kthread_t *t;
  136         task_t *tk;
  137         rval_t  r;
  138         int error;
  139         int i;
  140         rctl_set_t *dup_set;
  141         rctl_alloc_gp_t *dup_gp;
  142         rctl_entity_p_t e;
  143         lwpdir_t *ldp;
  144         lwpent_t *lep;
  145         lwpent_t *clep;
  146 
  147         /*
  148          * Allow only these two flags.
  149          */
  150         if ((flags & ~(FORK_NOSIGCHLD | FORK_WAITPID)) != 0) {
  151                 error = EINVAL;
  152                 goto forkerr;
  153         }
  154 
  155         /*
  156          * fork is not supported for the /proc agent lwp.
  157          */
  158         if (curthread == p->p_agenttp) {
  159                 error = ENOTSUP;
  160                 goto forkerr;
  161         }
  162 
  163         if ((error = secpolicy_basic_fork(CRED())) != 0)
  164                 goto forkerr;
  165 
  166         /*
  167          * If the calling lwp is doing a fork1() then the
  168          * other lwps in this process are not duplicated and
  169          * don't need to be held where their kernel stacks can be
  170          * cloned.  If doing forkall(), the process is held with
  171          * SHOLDFORK, so that the lwps are at a point where their
  172          * stacks can be copied which is on entry or exit from
  173          * the kernel.
  174          */
  175         if (!holdlwps(isfork1 ? SHOLDFORK1 : SHOLDFORK)) {
  176                 aston(curthread);
  177                 error = EINTR;
  178                 goto forkerr;
  179         }
  180 
  181 #if defined(__sparc)
  182         /*
  183          * Ensure that the user stack is fully constructed
  184          * before creating the child process structure.
  185          */
  186         (void) flush_user_windows_to_stack(NULL);
  187 #endif
  188 
  189         mutex_enter(&p->p_lock);
  190         /*
  191          * If this is vfork(), cancel any suspend request we might
  192          * have gotten from some other thread via lwp_suspend().
  193          * Otherwise we could end up with a deadlock on return
  194          * from the vfork() in both the parent and the child.
  195          */
  196         if (isvfork)
  197                 curthread->t_proc_flag &= ~TP_HOLDLWP;
  198         /*
  199          * Prevent our resource set associations from being changed during fork.
               * Every exit path from here on calls pool_barrier_exit() with
               * p->p_lock held.
  200          */
  201         pool_barrier_enter();
  202         mutex_exit(&p->p_lock);
  203 
  204         /*
  205          * Create a child proc struct. Place a VN_HOLD on appropriate vnodes.
               * If getproc() fails there is no child to tear down, so only the
               * pool barrier is released and continuelwps() is called on the
               * parent before returning EAGAIN.
  206          */
  207         if (getproc(&cp, 0, GETPROC_USER) < 0) {
  208                 mutex_enter(&p->p_lock);
  209                 pool_barrier_exit();
  210                 continuelwps(p);
  211                 mutex_exit(&p->p_lock);
  212                 error = EAGAIN;
  213                 goto forkerr;
  214         }
  215 
  216         TRACE_2(TR_FAC_PROC, TR_PROC_FORK, "proc_fork:cp %p p %p", cp, p);
  217 
  218         /*
  219          * Assign an address space to child
  220          */
  221         if (isvfork) {
  222                 /*
  223                  * Clear any watched areas and remember the
  224                  * watched pages for restoring in vfwait().
                       * The parent's watched-page tree is moved to p->p_wpage
                       * and as->a_wpage is re-created empty; the forklwperr path
                       * below moves it back if the fork later fails.
  225                  */
  226                 as = p->p_as;
  227                 if (avl_numnodes(&as->a_wpage) != 0) {
  228                         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
  229                         as_clearwatch(as);
  230                         p->p_wpage = as->a_wpage;
  231                         avl_create(&as->a_wpage, wp_compare,
  232                             sizeof (struct watched_page),
  233                             offsetof(struct watched_page, wp_link));
  234                         AS_LOCK_EXIT(as, &as->a_lock);
  235                 }
  236                 cp->p_as = as;
  237                 cp->p_flag |= SVFORK;
  238 
  239                 /*
  240                  * Use the parent's shm segment list information for
  241                  * the child as it uses its address space till it execs.
  242                  */
  243                 cp->p_segacct = p->p_segacct;
  244         } else {
  245                 /*
  246                  * We need to hold P_PR_LOCK until the address space has
  247                  * been duplicated and we've had a chance to remove from the
  248                  * child any DTrace probes that were in the parent. Holding
  249                  * P_PR_LOCK prevents any new probes from being added and any
  250                  * extant probes from being removed.
  251                  */
  252                 mutex_enter(&p->p_lock);
  253                 sprlock_proc(p);
  254                 p->p_flag |= SFORKING;
  255                 mutex_exit(&p->p_lock);
  256 
  257                 error = as_dup(p->p_as, cp);
  258                 if (error != 0) {
                              /*
                               * as_dup() failed.  Release what getproc() set up
                               * (fork_fail), then, under pidlock, unlink cp from
                               * the parent's orphan and child/sibling lists, detach
                               * it from its task, drop its pool reference and give
                               * back its pid.  The same unlink/detach sequence
                               * appears in the forklwperr path at the end of this
                               * function.
                               *
                               * NOTE(review): sprunlock() is called with p_lock
                               * held and evidently returns with it dropped, since
                               * p_lock is entered again further down.
                               */
  259                         mutex_enter(&p->p_lock);
  260                         sprunlock(p);
  261                         fork_fail(cp);
  262                         mutex_enter(&pidlock);
  263                         orphpp = &p->p_orphan;
  264                         while (*orphpp != cp)
  265                                 orphpp = &(*orphpp)->p_nextorph;
  266                         *orphpp = cp->p_nextorph;
  267                         if (p->p_child == cp)
  268                                 p->p_child = cp->p_sibling;
  269                         if (cp->p_sibling)
  270                                 cp->p_sibling->p_psibling = cp->p_psibling;
  271                         if (cp->p_psibling)
  272                                 cp->p_psibling->p_sibling = cp->p_sibling;
  273                         mutex_enter(&cp->p_lock);
  274                         tk = cp->p_task;
  275                         task_detach(cp);
  276                         ASSERT(cp->p_pool->pool_ref > 0);
  277                         atomic_add_32(&cp->p_pool->pool_ref, -1);
  278                         mutex_exit(&cp->p_lock);
  279                         pid_exit(cp, tk);
  280                         mutex_exit(&pidlock);
  281                         task_rele(tk);
  282 
  283                         mutex_enter(&p->p_lock);
  284                         p->p_flag &= ~SFORKING;
  285                         pool_barrier_exit();
  286                         continuelwps(p);
  287                         mutex_exit(&p->p_lock);
  288                         /*
  289                          * Preserve ENOMEM error condition but
  290                          * map all others to EAGAIN.
  291                          */
  292                         error = (error == ENOMEM) ? ENOMEM : EAGAIN;
  293                         goto forkerr;
  294                 }
  295 
  296                 /*
  297                  * Remove all DTrace tracepoints from the child process. We
  298                  * need to do this _before_ duplicating USDT providers since
  299                  * any associated probes may be immediately enabled.
  300                  */
  301                 if (p->p_dtrace_count > 0)
  302                         dtrace_fasttrap_fork(p, cp);
  303 
                      /* Address space and tracepoints are done: drop P_PR_LOCK. */
  304                 mutex_enter(&p->p_lock);
  305                 sprunlock(p);
  306 
  307                 /* Duplicate parent's shared memory */
  308                 if (p->p_segacct)
  309                         shmfork(p, cp);
  310 
  311                 /*
  312                  * Duplicate any helper actions and providers. The SFORKING
  313                  * we set above informs the code to enable USDT probes that
  314                  * sprlock() may fail because the child is being forked.
  315                  */
  316                 if (p->p_dtrace_helpers != NULL) {
  317                         ASSERT(dtrace_helpers_fork != NULL);
  318                         (*dtrace_helpers_fork)(p, cp);
  319                 }
  320 
  321                 mutex_enter(&p->p_lock);
  322                 p->p_flag &= ~SFORKING;
  323                 mutex_exit(&p->p_lock);
  324         }
  325 
  326         /*
  327          * Duplicate parent's resource controls.
               * The preallocation is done without rcs_lock held and then
               * checked with rctl_set_dup_ready() under the lock; if it is not
               * ready the lock is dropped, the preallocation destroyed and the
               * loop retries.  The loop exits with rcs_lock still held, and it
               * stays held across rctl_set_dup().
  328          */
  329         dup_set = rctl_set_create();
  330         for (;;) {
  331                 dup_gp = rctl_set_dup_prealloc(p->p_rctls);
  332                 mutex_enter(&p->p_rctls->rcs_lock);
  333                 if (rctl_set_dup_ready(p->p_rctls, dup_gp))
  334                         break;
  335                 mutex_exit(&p->p_rctls->rcs_lock);
  336                 rctl_prealloc_destroy(dup_gp);
  337         }
  338         e.rcep_p.proc = cp;
  339         e.rcep_t = RCENTITY_PROCESS;
  340         cp->p_rctls = rctl_set_dup(p->p_rctls, p, cp, &e, dup_set, dup_gp,
  341             RCD_DUP | RCD_CALLBACK);
  342         mutex_exit(&p->p_rctls->rcs_lock);
  343 
  344         rctl_prealloc_destroy(dup_gp);
  345 
  346         /*
  347          * Allocate the child's lwp directory and lwpid hash table.
               * A fork1 child gets a 2-entry directory; a forkall child gets
               * one the same size as the parent's.  The loop chains the first
               * p_lwpdir_sz - 1 entries into a free list headed by p_lwpfree;
               * the last entry's ld_next is left as allocated (kmem_zalloc
               * returns zeroed memory, so it terminates the list).
  348          */
  349         if (isfork1)
  350                 cp->p_lwpdir_sz = 2;
  351         else
  352                 cp->p_lwpdir_sz = p->p_lwpdir_sz;
  353         cp->p_lwpdir = cp->p_lwpfree = ldp =
  354             kmem_zalloc(cp->p_lwpdir_sz * sizeof (lwpdir_t), KM_SLEEP);
  355         for (i = 1; i < cp->p_lwpdir_sz; i++, ldp++)
  356                 ldp->ld_next = ldp + 1;
  357         cp->p_tidhash_sz = (cp->p_lwpdir_sz + 2) / 2;
  358         cp->p_tidhash =
  359             kmem_zalloc(cp->p_tidhash_sz * sizeof (tidhash_t), KM_SLEEP);
  360 
  361         /*
  362          * Duplicate parent's lwps.
  363          * Mutual exclusion is not needed because the process is
  364          * in the hold state and only the current lwp is running.
  365          */
  366         klgrpset_clear(cp->p_lgrpset);
  367         if (isfork1) {
  368                 clone = forklwp(ttolwp(curthread), cp, curthread->t_tid);
  369                 if (clone == NULL)
  370                         goto forklwperr;
  371                 /*
  372                  * Inherit only the lwp_wait()able flag,
  373                  * Daemon threads should not call fork1(), but oh well...
  374                  */
  375                 lwptot(clone)->t_proc_flag |=
  376                     (curthread->t_proc_flag & TP_TWAIT);
  377         } else {
  378                 /* this is forkall(), no one can be in lwp_wait() */
  379                 ASSERT(p->p_lwpwait == 0 && p->p_lwpdwait == 0);
  380                 /* for each entry in the parent's lwp directory... */
  381                 for (i = 0, ldp = p->p_lwpdir; i < p->p_lwpdir_sz; i++, ldp++) {
  382                         klwp_t *clwp;
  383                         kthread_t *ct;
  384 
  385                         if ((lep = ldp->ld_entry) == NULL)
  386                                 continue;
  387 
  388                         if ((t = lep->le_thread) != NULL) {
  389                                 clwp = forklwp(ttolwp(t), cp, t->t_tid);
  390                                 if (clwp == NULL)
  391                                         goto forklwperr;
  392                                 ct = lwptot(clwp);
  393                                 /*
  394                                  * Inherit lwp_wait()able and daemon flags.
  395                                  */
  396                                 ct->t_proc_flag |=
  397                                     (t->t_proc_flag & (TP_TWAIT|TP_DAEMON));
  398                                 /*
  399                                  * Keep track of the clone of curthread to
  400                                  * post return values through lwp_setrval().
  401                                  * Mark other threads for special treatment
  402                                  * by lwp_rtt() / post_syscall().
                                       *
                                       * NOTE(review): in this branch clone is only
                                       * assigned here, so its later use relies on
                                       * curthread appearing in the parent's lwp
                                       * directory.
  403                                  */
  404                                 if (t == curthread)
  405                                         clone = clwp;
  406                                 else
  407                                         ct->t_flag |= T_FORKALL;
  408                         } else {
  409                                 /*
  410                                  * Replicate zombie lwps in the child.
                                       * Only le_lwpid and le_start are copied into
                                       * the new entry before it is hashed in.
  411                                  */
  412                                 clep = kmem_zalloc(sizeof (*clep), KM_SLEEP);
  413                                 clep->le_lwpid = lep->le_lwpid;
  414                                 clep->le_start = lep->le_start;
  415                                 lwp_hash_in(cp, clep,
  416                                     cp->p_tidhash, cp->p_tidhash_sz, 0);
  417                         }
  418                 }
  419         }
  420 
  421         /*
  422          * Put new process in the parent's process contract, or put it
  423          * in a new one if there is an active process template.  Send a
  424          * fork event (if requested) to whatever contract the child is
  425          * a member of.  Fails if the parent has been SIGKILLed.
  426          */
  427         if (contract_process_fork(NULL, cp, p, B_TRUE) == NULL)
  428                 goto forklwperr;
  429 
  430         /*
  431          * No fork failures occur beyond this point.
  432          */
  433 
  434         cp->p_lwpid = p->p_lwpid;
  435         if (!isfork1) {
  436                 cp->p_lwpdaemon = p->p_lwpdaemon;
  437                 cp->p_zombcnt = p->p_zombcnt;
  438                 /*
  439                  * If the parent's lwp ids have wrapped around, so have the
  440                  * child's.
  441                  */
  442                 cp->p_flag |= p->p_flag & SLWPWRAP;
  443         }
  444 
              /* The child shares the parent's core file path and content settings. */
  445         mutex_enter(&p->p_lock);
  446         corectl_path_hold(cp->p_corefile = p->p_corefile);
  447         corectl_content_hold(cp->p_content = p->p_content);
  448         mutex_exit(&p->p_lock);
  449 
  450         /*
  451          * Duplicate process context ops, if any.
  452          */
  453         if (p->p_pctx)
  454                 forkpctx(p, cp);
  455 
  456 #ifdef __sparc
  457         utrap_dup(p, cp);
  458 #endif
  459         /*
  460          * If the child process has been marked to stop on exit
  461          * from this fork, arrange for all other lwps to stop in
  462          * sympathy with the active lwp.
  463          */
  464         if (PTOU(cp)->u_systrap &&
  465             prismember(&PTOU(cp)->u_exitmask, curthread->t_sysnum)) {
  466                 mutex_enter(&cp->p_lock);
  467                 t = cp->p_tlist;
  468                 do {
  469                         t->t_proc_flag |= TP_PRSTOP;
  470                         aston(t);       /* so TP_PRSTOP will be seen */
  471                 } while ((t = t->t_forw) != cp->p_tlist);
  472                 mutex_exit(&cp->p_lock);
  473         }
  474         /*
  475          * If the parent process has been marked to stop on exit
  476          * from this fork, and its asynchronous-stop flag has not
  477          * been set, arrange for all other lwps to stop before
  478          * they return back to user level.
  479          */
  480         if (!(p->p_proc_flag & P_PR_ASYNC) && PTOU(p)->u_systrap &&
  481             prismember(&PTOU(p)->u_exitmask, curthread->t_sysnum)) {
  482                 mutex_enter(&p->p_lock);
  483                 t = p->p_tlist;
  484                 do {
  485                         t->t_proc_flag |= TP_PRSTOP;
  486                         aston(t);       /* so TP_PRSTOP will be seen */
  487                 } while ((t = t->t_forw) != p->p_tlist);
  488                 mutex_exit(&p->p_lock);
  489         }
  490 
              /* set return values for child: (parent's pid, 1) */
  491         if (PROC_IS_BRANDED(p))
  492                 BROP(p)->b_lwp_setrval(clone, p->p_pid, 1);
  493         else
  494                 lwp_setrval(clone, p->p_pid, 1);
  495 
  496         /* set return values for parent */
  497         r.r_val1 = (int)cp->p_pid;
  498         r.r_val2 = 0;
  499 
  500         /*
  501          * pool_barrier_exit() can now be called because the child process has:
  502          * - all identifying features cloned or set (p_pid, p_task, p_pool)
  503          * - all resource sets associated (p_tlist->*->t_cpupart, p_as->a_mset)
  504          * - any other fields set which are used in resource set binding.
  505          */
  506         mutex_enter(&p->p_lock);
  507         pool_barrier_exit();
  508         mutex_exit(&p->p_lock);
  509 
              /*
               * pidlock is taken here and released either explicitly in the
               * vfork branch below or by CL_FORKRET() in the non-vfork branch.
               */
  510         mutex_enter(&pidlock);
  511         mutex_enter(&cp->p_lock);
  512 
  513         /*
  514          * Set flags telling the child what (not) to do on exit.
  515          */
  516         if (flags & FORK_NOSIGCHLD)
  517                 cp->p_pidflag |= CLDNOSIGCHLD;
  518         if (flags & FORK_WAITPID)
  519                 cp->p_pidflag |= CLDWAITPID;
  520 
  521         /*
  522          * Now that there are lwps and threads attached, add the new
  523          * process to the process group.
  524          */
  525         pgjoin(cp, p->p_pgidp);
  526         cp->p_stat = SRUN;
  527         /*
  528          * We are now done with all the lwps in the child process.
  529          */
  530         t = cp->p_tlist;
  531         do {
  532                 /*
  533                  * Set the lwp_suspend()ed lwps running.
  534                  * They will suspend properly at syscall exit.
  535                  */
  536                 if (t->t_proc_flag & TP_HOLDLWP)
  537                         lwp_create_done(t);
  538                 else {
  539                         /* set TS_CREATE to allow continuelwps() to work */
  540                         thread_lock(t);
  541                         ASSERT(t->t_state == TS_STOPPED &&
  542                             !(t->t_schedflag & (TS_CREATE|TS_CSTART)));
  543                         t->t_schedflag |= TS_CREATE;
  544                         thread_unlock(t);
  545                 }
  546         } while ((t = t->t_forw) != cp->p_tlist);
  547         mutex_exit(&cp->p_lock);
  548 
  549         if (isvfork) {
  550                 CPU_STATS_ADDQ(CPU, sys, sysvfork, 1);
  551                 mutex_enter(&p->p_lock);
  552                 p->p_flag |= SVFWAIT;
  553                 curthread->t_flag |= T_VFPARENT;
  554                 DTRACE_PROC1(create, proc_t *, cp);
  555                 cv_broadcast(&pr_pid_cv[p->p_slot]);    /* inform /proc */
  556                 mutex_exit(&p->p_lock);
  557                 /*
  558                  * Grab child's p_lock before dropping pidlock to ensure
  559                  * the process will not disappear before we set it running.
  560                  */
  561                 mutex_enter(&cp->p_lock);
  562                 mutex_exit(&pidlock);
  563                 sigdefault(cp);
  564                 continuelwps(cp);
  565                 mutex_exit(&cp->p_lock);
  566         } else {
  567                 CPU_STATS_ADDQ(CPU, sys, sysfork, 1);
  568                 DTRACE_PROC1(create, proc_t *, cp);
  569                 /*
  570                  * It is CL_FORKRET's job to drop pidlock.
  571                  * If we do it here, the process could be set running
  572                  * and disappear before CL_FORKRET() is called.
  573                  */
  574                 CL_FORKRET(curthread, cp->p_tlist);
  575                 schedctl_set_cidpri(curthread);
  576                 ASSERT(MUTEX_NOT_HELD(&pidlock));
  577         }
  578 
  579         return (r.r_vals);
  580 
      /*
       * Reached when forklwp() or contract_process_fork() fails, i.e. after
       * the child has an address space, resource controls, an lwp directory
       * and possibly some lwps.  Everything is torn down in turn and the
       * error reported is always EAGAIN.
       */
  581 forklwperr:
  582         if (isvfork) {
  583                 if (avl_numnodes(&p->p_wpage) != 0) {
  584                         /* restore watchpoints to parent */
  585                         as = p->p_as;
  586                         AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
  587                         as->a_wpage = p->p_wpage;
  588                         avl_create(&p->p_wpage, wp_compare,
  589                             sizeof (struct watched_page),
  590                             offsetof(struct watched_page, wp_link));
  591                         as_setwatch(as);
  592                         AS_LOCK_EXIT(as, &as->a_lock);
  593                 }
  594         } else {
                      /*
                       * The child owns a duplicated address space: point p_as
                       * at kas before freeing the duplicate.
                       */
  595                 if (cp->p_segacct)
  596                         shmexit(cp);
  597                 as = cp->p_as;
  598                 cp->p_as = &kas;
  599                 as_free(as);
  600         }
  601 
              /* Free any directory entries still present, then the directory. */
  602         if (cp->p_lwpdir) {
  603                 for (i = 0, ldp = cp->p_lwpdir; i < cp->p_lwpdir_sz; i++, ldp++)
  604                         if ((lep = ldp->ld_entry) != NULL)
  605                                 kmem_free(lep, sizeof (*lep));
  606                 kmem_free(cp->p_lwpdir,
  607                     cp->p_lwpdir_sz * sizeof (*cp->p_lwpdir));
  608         }
  609         cp->p_lwpdir = NULL;
  610         cp->p_lwpfree = NULL;
  611         cp->p_lwpdir_sz = 0;
  612 
  613         if (cp->p_tidhash)
  614                 kmem_free(cp->p_tidhash,
  615                     cp->p_tidhash_sz * sizeof (*cp->p_tidhash));
  616         cp->p_tidhash = NULL;
  617         cp->p_tidhash_sz = 0;
  618 
  619         forklwp_fail(cp);
  620         fork_fail(cp);
  621         rctl_set_free(cp->p_rctls);
  622         mutex_enter(&pidlock);
  623 
  624         /*
  625          * Detach failed child from task.
  626          */
  627         mutex_enter(&cp->p_lock);
  628         tk = cp->p_task;
  629         task_detach(cp);
  630         ASSERT(cp->p_pool->pool_ref > 0);
  631         atomic_add_32(&cp->p_pool->pool_ref, -1);
  632         mutex_exit(&cp->p_lock);
  633 
              /* Unlink cp from the parent's orphan and child/sibling lists. */
  634         orphpp = &p->p_orphan;
  635         while (*orphpp != cp)
  636                 orphpp = &(*orphpp)->p_nextorph;
  637         *orphpp = cp->p_nextorph;
  638         if (p->p_child == cp)
  639                 p->p_child = cp->p_sibling;
  640         if (cp->p_sibling)
  641                 cp->p_sibling->p_psibling = cp->p_psibling;
  642         if (cp->p_psibling)
  643                 cp->p_psibling->p_sibling = cp->p_sibling;
  644         pid_exit(cp, tk);
  645         mutex_exit(&pidlock);
  646 
  647         task_rele(tk);
  648 
  649         mutex_enter(&p->p_lock);
  650         pool_barrier_exit();
  651         continuelwps(p);
  652         mutex_exit(&p->p_lock);
  653         error = EAGAIN;
      /* Common error return; error has been set by whoever jumped here. */
  654 forkerr:
  655         return ((int64_t)set_errno(error));
  656 }
  657 
  658 /*
  659  * Free allocated resources from getproc() if a fork failed.
       *
       * cp is the failed child.  In order, this decrements the file-info
       * count via fcnt_add(), discards queued signals, decrements the
       * per-uid/per-zone process count under pidlock, drops the credential
       * reference, frees the file list array, releases vnode and cwd string
       * holds, and clears the brand if the child is branded.
       *
       * The caller is still responsible for unlinking cp from the parent's
       * lists, detaching it from its task and releasing its pid; cfork()
       * does this after calling fork_fail().
  660  */
  661 static void
  662 fork_fail(proc_t *cp)
  663 {
  664         uf_info_t *fip = P_FINFO(cp);
  665 
  666         fcnt_add(fip, -1);
  667         sigdelq(cp, NULL, 0);
  668 
  669         mutex_enter(&pidlock);
  670         upcount_dec(crgetruid(cp->p_cred), crgetzoneid(cp->p_cred));
  671         mutex_exit(&pidlock);
  672 
  673         /*
  674          * single threaded, so no locking needed here
  675          */
  676         crfree(cp->p_cred);
  677 
  678         kmem_free(fip->fi_list, fip->fi_nfiles * sizeof (uf_entry_t));
  679 
              /*
               * The current-directory, root-directory and cwd-string holds are
               * released through curproc's u-area, not cp's.
               * NOTE(review): presumably getproc() took these holds on the
               * objects the forking process references, so the two are the
               * same -- confirm against getproc().
               */
  680         VN_RELE(PTOU(curproc)->u_cdir);
  681         if (PTOU(curproc)->u_rdir)
  682                 VN_RELE(PTOU(curproc)->u_rdir);
  683         if (cp->p_exec)
  684                 VN_RELE(cp->p_exec);
  685         if (cp->p_execdir)
  686                 VN_RELE(cp->p_execdir);
  687         if (PTOU(curproc)->u_cwd)
  688                 refstr_rele(PTOU(curproc)->u_cwd);
  689         if (PROC_IS_BRANDED(cp)) {
  690                 brand_clearbrand(cp, B_TRUE);
  691         }
  692 }
  693 
  694 /*
  695  * Clean up the lwps already created for this child process.
  696  * The fork failed while duplicating all the lwps of the parent
  697  * and those lwps already created must be freed.
  698  * This process is invisible to the rest of the system,
  699  * so we don't need to hold p->p_lock to protect the list.
       *
       * p is the failed child (cfork() passes cp).  Each pass of the loop
       * takes the head of p->p_tlist, unlinks it, backs out the task,
       * project and zone lwp counts, frees brand and door data, calls
       * lwp_ctmpl_clear(), removes the thread from the all-threads list
       * under pidlock, undoes its lgroup accounting and frees the thread.
       * The loop ends when p->p_tlist is NULL.
  700  */
  701 static void
  702 forklwp_fail(proc_t *p)
  703 {
  704         kthread_t *t;
  705         task_t *tk;
  706         int branded = 0;
  707 
  708         if (PROC_IS_BRANDED(p))
  709                 branded = 1;
  710 
  711         while ((t = p->p_tlist) != NULL) {
  712                 /*
  713                  * First remove the lwp from the process's p_tlist.
                       * When t is the only element (t == t->t_forw) the two
                       * relinking assignments below leave t's own links unchanged.
  714                  */
  715                 if (t != t->t_forw)
  716                         p->p_tlist = t->t_forw;
  717                 else
  718                         p->p_tlist = NULL;
  719                 p->p_lwpcnt--;
  720                 t->t_forw->t_back = t->t_back;
  721                 t->t_back->t_forw = t->t_forw;
  722 
                      /* Back out the lwp counts; all three are updated under zone_nlwps_lock. */
  723                 tk = p->p_task;
  724                 mutex_enter(&p->p_zone->zone_nlwps_lock);
  725                 tk->tk_nlwps--;
  726                 tk->tk_proj->kpj_nlwps--;
  727                 p->p_zone->zone_nlwps--;
  728                 mutex_exit(&p->p_zone->zone_nlwps_lock);
  729 
  730                 ASSERT(t->t_schedctl == NULL);
  731 
  732                 if (branded)
  733                         BROP(p)->b_freelwp(ttolwp(t));
  734 
  735                 if (t->t_door != NULL) {
  736                         kmem_free(t->t_door, sizeof (door_data_t));
  737                         t->t_door = NULL;
  738                 }
  739                 lwp_ctmpl_clear(ttolwp(t));
  740 
  741                 /*
  742                  * Remove the thread from the all threads list.
  743                  * We need to hold pidlock for this.
  744                  */
  745                 mutex_enter(&pidlock);
  746                 t->t_next->t_prev = t->t_prev;
  747                 t->t_prev->t_next = t->t_next;
  748                 CL_EXIT(t);     /* tell the scheduler that we're exiting */
  749                 cv_broadcast(&t->t_joincv);     /* tell anyone in thread_join */
  750                 mutex_exit(&pidlock);
  751 
  752                 /*
  753                  * Let the lgroup load averages know that this thread isn't
  754                  * going to show up (i.e. un-do what was done on behalf of
  755                  * this thread by the earlier lgrp_move_thread()).
  756                  */
  757                 kpreempt_disable();
  758                 lgrp_move_thread(t, NULL, 1);
  759                 kpreempt_enable();
  760 
  761                 /*
  762                  * The thread was created TS_STOPPED.
  763                  * We change it to TS_FREE to avoid an
  764                  * ASSERT() panic in thread_free().
  765                  */
  766                 t->t_state = TS_FREE;
  767                 thread_rele(t);
  768                 thread_free(t);
  769         }
  770 }
  771 
  772 extern struct as kas;
  773 
  774 /*
  775  * fork a kernel process.
       *
       * Obtains a new process from getproc() and gives it one LWP, created
       * TS_STOPPED with entry point pc, argument arg and priority pri.  When
       * CLASS_KERNEL(cid) is true the LWP comes from lwp_kernel_create();
       * otherwise the process is first moved into a newly created task, the
       * LWP comes from lwp_create() (which is passed cid), and a process
       * contract is created and returned through *ct when ct is non-NULL.
       * pid is passed through to getproc().
       *
       * Returns 0 on success, or EAGAIN if getproc() or lwp_create() fails.
  776  */
  777 int
  778 newproc(void (*pc)(), caddr_t arg, id_t cid, int pri, struct contract **ct,
  779     pid_t pid)
  780 {
  781         proc_t *p;
  782         struct user *up;
  783         kthread_t *t;
  784         cont_process_t *ctp = NULL;
  785         rctl_entity_p_t e;
  786 
              /* sysdccid is never accepted; with syscid, ct must be NULL. */
  787         ASSERT(cid != sysdccid);
  788         ASSERT(cid != syscid || ct == NULL);
  789         if (CLASS_KERNEL(cid)) {
  790                 rctl_alloc_gp_t *init_gp;
  791                 rctl_set_t *init_set;
  792 
  793                 ASSERT(pid != 1);
  794 
  795                 if (getproc(&p, pid, GETPROC_KERNEL) < 0)
  796                         return (EAGAIN);
  797 
  798                 /*
  799                  * Release the hold on the p_exec and p_execdir, these
  800                  * were acquired in getproc()
  801                  */
  802                 if (p->p_execdir != NULL)
  803                         VN_RELE(p->p_execdir);
  804                 if (p->p_exec != NULL)
  805                         VN_RELE(p->p_exec);
  806                 p->p_flag |= SNOWAIT;
  807                 p->p_exec = NULL;
  808                 p->p_execdir = NULL;
  809 
                      /* Allocate the rctl set and its prealloc group before p_lock. */
  810                 init_set = rctl_set_create();
  811                 init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
  812 
  813                 /*
  814                  * kernel processes do not inherit /proc tracing flags.
  815                  */
  816                 sigemptyset(&p->p_sigmask);
  817                 premptyset(&p->p_fltmask);
  818                 up = PTOU(p);
  819                 up->u_systrap = 0;
  820                 premptyset(&(up->u_entrymask));
  821                 premptyset(&(up->u_exitmask));
                      /* rctl_set_init() is called with the new process's p_lock held. */
  822                 mutex_enter(&p->p_lock);
  823                 e.rcep_p.proc = p;
  824                 e.rcep_t = RCENTITY_PROCESS;
  825                 p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
  826                     init_gp);
  827                 mutex_exit(&p->p_lock);
  828 
  829                 rctl_prealloc_destroy(init_gp);
  830 
  831                 t = lwp_kernel_create(p, pc, arg, TS_STOPPED, pri);
  832         } else {
  833                 rctl_alloc_gp_t *init_gp, *default_gp;
  834                 rctl_set_t *init_set;
  835                 task_t *tk, *tk_old;
  836                 klwp_t *lwp;
  837 
  838                 if (getproc(&p, pid, GETPROC_USER) < 0)
  839                         return (EAGAIN);
  840                 /*
  841                  * init creates a new task, distinct from the task
  842                  * containing kernel "processes".
  843                  */
  844                 tk = task_create(0, p->p_zone);
                      /*
                       * Count the new task in its project and this process in
                       * the new task, under the zone's zone_nlwps_lock.
                       */
  845                 mutex_enter(&tk->tk_zone->zone_nlwps_lock);
  846                 tk->tk_proj->kpj_ntasks++;
  847                 tk->tk_nprocs++;
  848                 mutex_exit(&tk->tk_zone->zone_nlwps_lock);
  849 
                      /* All rctl preallocation happens before pidlock/p_lock. */
  850                 default_gp = rctl_rlimit_set_prealloc(RLIM_NLIMITS);
  851                 init_gp = rctl_set_init_prealloc(RCENTITY_PROCESS);
  852                 init_set = rctl_set_create();
  853 
  854                 mutex_enter(&pidlock);
  855                 mutex_enter(&p->p_lock);
  856                 tk_old = p->p_task;     /* switch to new task */
  857 
  858                 task_detach(p);
  859                 task_begin(tk, p);
  860                 mutex_exit(&pidlock);
  861 
                      /* The process has left tk_old; drop its count there. */
  862                 mutex_enter(&tk_old->tk_zone->zone_nlwps_lock);
  863                 tk_old->tk_nprocs--;
  864                 mutex_exit(&tk_old->tk_zone->zone_nlwps_lock);
  865 
                      /* p_lock is still held across the rctl initialization. */
  866                 e.rcep_p.proc = p;
  867                 e.rcep_t = RCENTITY_PROCESS;
  868                 p->p_rctls = rctl_set_init(RCENTITY_PROCESS, p, &e, init_set,
  869                     init_gp);
  870                 rctlproc_default_init(p, default_gp);
  871                 mutex_exit(&p->p_lock);
  872 
  873                 task_rele(tk_old);
  874                 rctl_prealloc_destroy(default_gp);
  875                 rctl_prealloc_destroy(init_gp);
  876 
  877                 if ((lwp = lwp_create(pc, arg, 0, p, TS_STOPPED, pri,
  878                     &curthread->t_hold, cid, 1)) == NULL) {
                              /*
                               * LWP creation failed: tear the process back
                               * down (task membership, pool reference, pid)
                               * and report EAGAIN.  Note that this tk shadows
                               * the outer tk; it is reloaded from p->p_task.
                               */
  879                         task_t *tk;
  880                         fork_fail(p);
  881                         mutex_enter(&pidlock);
  882                         mutex_enter(&p->p_lock);
  883                         tk = p->p_task;
  884                         task_detach(p);
  885                         ASSERT(p->p_pool->pool_ref > 0);
  886                         atomic_add_32(&p->p_pool->pool_ref, -1);
  887                         mutex_exit(&p->p_lock);
  888                         pid_exit(p, tk);
  889                         mutex_exit(&pidlock);
  890                         task_rele(tk);
  891 
  892                         return (EAGAIN);
  893                 }
  894                 t = lwptot(lwp);
  895 
                      /* Create the process contract; hand it back only if asked. */
  896                 ctp = contract_process_fork(sys_process_tmpl, p, curproc,
  897                     B_FALSE);
  898                 ASSERT(ctp != NULL);
  899                 if (ct != NULL)
  900                         *ct = &ctp->conp_contract;
  901         }
  902 
              /*
               * Common tail: the sole LWP must have thread id 1.  Under pidlock
               * the process joins its parent's process group and is marked
               * SRUN; with p_lock also held, TP_HOLDLWP is cleared and
               * lwp_create_done() is called on the thread.
               */
  903         ASSERT3U(t->t_tid, ==, 1);
  904         p->p_lwpid = 1;
  905         mutex_enter(&pidlock);
  906         pgjoin(p, p->p_parent->p_pgidp);
  907         p->p_stat = SRUN;
  908         mutex_enter(&p->p_lock);
  909         t->t_proc_flag &= ~TP_HOLDLWP;
  910         lwp_create_done(t);
  911         mutex_exit(&p->p_lock);
  912         mutex_exit(&pidlock);
  913         return (0);
  914 }
  915 
  916 /*
  917  * create a child proc struct.
       *
       * Allocates a proc_t from process_cache and initializes it as a child
       * of p0 (when GETPROC_KERNEL is set in flags) or of curproc.  pid is
       * passed to pid_allocate().  The new process is charged to the parent's
       * task, project and zone process counts before allocation; the bad:
       * path reverses that charge and releases everything acquired so far.
       *
       * On success the new proc_t is stored in *cpp and 0 is returned.
       * Returns -1 on failure.  Failures that reach the punish: label first
       * sleep in delay() to throttle runaway forkers; the two checks at the
       * top of the function return -1 immediately.
  918  */
  919 static int
  920 getproc(proc_t **cpp, pid_t pid, uint_t flags)
  921 {
  922         proc_t          *pp, *cp;
  923         pid_t           newpid;
  924         struct user     *uarea;
  925         extern uint_t   nproc;
  926         struct cred     *cr;
  927         uid_t           ruid;
  928         zoneid_t        zoneid;
  929         task_t          *task;
  930         kproject_t      *proj;
  931         zone_t          *zone;
  932         int             rctlfail = 0;
  933 
  934         if (!page_mem_avail(tune.t_minarmem))
  935                 return (-1);
  936         if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)
  937                 return (-1);    /* no point in starting new processes */
  938 
              /* Kernel processes are parented to p0; all others to the caller. */
  939         pp = (flags & GETPROC_KERNEL) ? &p0 : curproc;
  940         task = pp->p_task;
  941         proj = task->tk_proj;
  942         zone = pp->p_zone;
  943 
              /*
               * Test the task, project and zone nprocs resource controls.
               * The tests are skipped when the parent's project is proj0p.
               */
  944         mutex_enter(&pp->p_lock);
  945         mutex_enter(&zone->zone_nlwps_lock);
  946         if (proj != proj0p) {
  947                 if (task->tk_nprocs >= task->tk_nprocs_ctl)
  948                         if (rctl_test(rc_task_nprocs, task->tk_rctls,
  949                             pp, 1, 0) & RCT_DENY)
  950                                 rctlfail = 1;
  951 
  952                 if (proj->kpj_nprocs >= proj->kpj_nprocs_ctl)
  953                         if (rctl_test(rc_project_nprocs, proj->kpj_rctls,
  954                             pp, 1, 0) & RCT_DENY)
  955                                 rctlfail = 1;
  956 
  957                 if (zone->zone_nprocs >= zone->zone_nprocs_ctl)
  958                         if (rctl_test(rc_zone_nprocs, zone->zone_rctls,
  959                             pp, 1, 0) & RCT_DENY)
  960                                 rctlfail = 1;
  961 
  962                 if (rctlfail) {
  963                         mutex_exit(&zone->zone_nlwps_lock);
  964                         mutex_exit(&pp->p_lock);
  965                         goto punish;
  966                 }
  967         }
              /* Charge the new process up front; bad: undoes these counts. */
  968         task->tk_nprocs++;
  969         proj->kpj_nprocs++;
  970         zone->zone_nprocs++;
  971         mutex_exit(&zone->zone_nlwps_lock);
  972         mutex_exit(&pp->p_lock);
  973 
  974         cp = kmem_cache_alloc(process_cache, KM_SLEEP);
  975         bzero(cp, sizeof (proc_t));
  976 
  977         /*
  978          * Make proc entry for child process
  979          */
  980         mutex_init(&cp->p_splock, NULL, MUTEX_DEFAULT, NULL);
  981         mutex_init(&cp->p_crlock, NULL, MUTEX_DEFAULT, NULL);
  982         mutex_init(&cp->p_pflock, NULL, MUTEX_DEFAULT, NULL);
  983 #if defined(__x86)
  984         mutex_init(&cp->p_ldtlock, NULL, MUTEX_DEFAULT, NULL);
  985 #endif
  986         mutex_init(&cp->p_maplock, NULL, MUTEX_DEFAULT, NULL);
  987         cp->p_stat = SIDL;
  988         cp->p_mstart = gethrtime();
  989         cp->p_as = &kas;
  990         /*
  991          * p_zone must be set before we call pid_allocate since the process
  992          * will be visible after that and code such as prfind_zone will
  993          * look at the p_zone field.
  994          */
  995         cp->p_zone = pp->p_zone;
  996         cp->p_t1_lgrpid = LGRP_NONE;
  997         cp->p_tr_lgrpid = LGRP_NONE;
  998 
  999         if ((newpid = pid_allocate(cp, pid, PID_ALLOC_PROC)) == -1) {
 1000                 if (nproc == v.v_proc) {
 1001                         CPU_STATS_ADDQ(CPU, sys, procovf, 1);
 1002                         cmn_err(CE_WARN, "out of processes");
 1003                 }
 1004                 goto bad;
 1005         }
 1006 
              /* Snapshot the parent's executable vnodes under its p_lock. */
 1007         mutex_enter(&pp->p_lock);
 1008         cp->p_exec = pp->p_exec;
 1009         cp->p_execdir = pp->p_execdir;
 1010         mutex_exit(&pp->p_lock);
 1011 
 1012         if (cp->p_exec) {
 1013                 VN_HOLD(cp->p_exec);
 1014                 /*
 1015                  * Each VOP_OPEN() must be paired with a corresponding
 1016                  * VOP_CLOSE(). In this case, the executable will be
 1017                  * closed for the child in either proc_exit() or gexec().
 1018                  */
 1019                 if (VOP_OPEN(&cp->p_exec, FREAD, CRED(), NULL) != 0) {
 1020                         VN_RELE(cp->p_exec);
 1021                         cp->p_exec = NULLVP;
 1022                         cp->p_execdir = NULLVP;
 1023                         goto bad;
 1024                 }
 1025         }
 1026         if (cp->p_execdir)
 1027                 VN_HOLD(cp->p_execdir);
 1028 
 1029         /*
 1030          * If not privileged make sure that this user hasn't exceeded
 1031          * v.v_maxup processes, and that users collectively haven't
 1032          * exceeded v.v_maxupttl processes.
 1033          */
 1034         mutex_enter(&pidlock);
 1035         ASSERT(nproc < v.v_proc);       /* otherwise how'd we get our pid? */
 1036         cr = CRED();
 1037         ruid = crgetruid(cr);
 1038         zoneid = crgetzoneid(cr);
 1039         if (nproc >= v.v_maxup &&       /* short-circuit; usually false */
 1040             (nproc >= v.v_maxupttl ||
 1041             upcount_get(ruid, zoneid) >= v.v_maxup) &&
 1042             secpolicy_newproc(cr) != 0) {
 1043                 mutex_exit(&pidlock);
 1044                 zcmn_err(zoneid, CE_NOTE,
 1045                     "out of per-user processes for uid %d", ruid);
                      /* bad: releases the p_exec/p_execdir holds taken above. */
 1046                 goto bad;
 1047         }
 1048 
 1049         /*
 1050          * Everything is cool, put the new proc on the active process list.
 1051          * It is already on the pid list and in /proc.
 1052          * Increment the per uid process count (upcount).
 1053          */
 1054         nproc++;
 1055         upcount_inc(ruid, zoneid);
 1056 
 1057         cp->p_next = practive;
 1058         practive->p_prev = cp;
 1059         practive = cp;
 1060 
 1061         cp->p_ignore = pp->p_ignore;
 1062         cp->p_siginfo = pp->p_siginfo;
 1063         cp->p_flag = pp->p_flag & (SJCTL|SNOWAIT|SNOCD);
 1064         cp->p_sessp = pp->p_sessp;
 1065         sess_hold(pp);
 1066         cp->p_brand = pp->p_brand;
 1067         if (PROC_IS_BRANDED(pp))
 1068                 BROP(pp)->b_copy_procdata(cp, pp);
 1069         cp->p_bssbase = pp->p_bssbase;
 1070         cp->p_brkbase = pp->p_brkbase;
 1071         cp->p_brksize = pp->p_brksize;
 1072         cp->p_brkpageszc = pp->p_brkpageszc;
 1073         cp->p_stksize = pp->p_stksize;
 1074         cp->p_stkpageszc = pp->p_stkpageszc;
 1075         cp->p_stkprot = pp->p_stkprot;
 1076         cp->p_datprot = pp->p_datprot;
 1077         cp->p_usrstack = pp->p_usrstack;
 1078         cp->p_model = pp->p_model;
 1079         cp->p_ppid = pp->p_pid;
 1080         cp->p_ancpid = pp->p_pid;
 1081         cp->p_portcnt = pp->p_portcnt;
 1082 
 1083         /*
 1084          * Initialize watchpoint structures
 1085          */
 1086         avl_create(&cp->p_warea, wa_compare, sizeof (struct watched_area),
 1087             offsetof(struct watched_area, wa_link));
 1088 
 1089         /*
 1090          * Initialize immediate resource control values.
 1091          */
 1092         cp->p_stk_ctl = pp->p_stk_ctl;
 1093         cp->p_fsz_ctl = pp->p_fsz_ctl;
 1094         cp->p_vmem_ctl = pp->p_vmem_ctl;
 1095         cp->p_fno_ctl = pp->p_fno_ctl;
 1096 
 1097         /*
 1098          * Link up to parent-child-sibling chain.  No need to lock
 1099          * in general since only a call to freeproc() (done by the
 1100          * same parent as newproc()) diddles with the child chain.
 1101          */
 1102         cp->p_sibling = pp->p_child;
 1103         if (pp->p_child)
 1104                 pp->p_child->p_psibling = cp;
 1105 
 1106         cp->p_parent = pp;
 1107         pp->p_child = cp;
 1108 
 1109         cp->p_child_ns = NULL;
 1110         cp->p_sibling_ns = NULL;
 1111 
 1112         cp->p_nextorph = pp->p_orphan;
 1113         cp->p_nextofkin = pp;
 1114         pp->p_orphan = cp;
 1115 
 1116         /*
 1117          * Inherit profiling state; do not inherit REALPROF profiling state.
 1118          */
 1119         cp->p_prof = pp->p_prof;
 1120         cp->p_rprof_cyclic = CYCLIC_NONE;
 1121 
 1122         /*
 1123          * Inherit pool pointer from the parent.  Kernel processes are
 1124          * always bound to the default pool.
 1125          */
 1126         mutex_enter(&pp->p_lock);
 1127         if (flags & GETPROC_KERNEL) {
 1128                 cp->p_pool = pool_default;
 1129                 cp->p_flag |= SSYS;
 1130         } else {
 1131                 cp->p_pool = pp->p_pool;
 1132         }
 1133         atomic_add_32(&cp->p_pool->pool_ref, 1);
 1134         mutex_exit(&pp->p_lock);
 1135 
 1136         /*
 1137          * Add the child process to the current task.  Kernel processes
 1138          * are always attached to task0.
 1139          */
 1140         mutex_enter(&cp->p_lock);
 1141         if (flags & GETPROC_KERNEL)
 1142                 task_attach(task0p, cp);
 1143         else
 1144                 task_attach(pp->p_task, cp);
 1145         mutex_exit(&cp->p_lock);
 1146         mutex_exit(&pidlock);
 1147 
 1148         avl_create(&cp->p_ct_held, contract_compar, sizeof (contract_t),
 1149             offsetof(contract_t, ct_ctlist));
 1150 
 1151         /*
 1152          * Duplicate any audit information kept in the process table
 1153          */
 1154         if (audit_active)       /* copy audit data to cp */
 1155                 audit_newproc(cp);
 1156 
              /* The child shares the caller's credentials; take a hold on them. */
 1157         crhold(cp->p_cred = cr);
 1158 
 1159         /*
 1160          * Bump up the counts on the file structures pointed at by the
 1161          * parent's file table since the child will point at them too.
 1162          */
 1163         fcnt_add(P_FINFO(pp), 1);
 1164 
 1165         if (PTOU(pp)->u_cdir) {
 1166                 VN_HOLD(PTOU(pp)->u_cdir);
 1167         } else {
 1168                 ASSERT(pp == &p0);
 1169                 /*
 1170                  * We must be at or before vfs_mountroot(); it will take care of
 1171                  * assigning our current directory.
 1172                  */
 1173         }
 1174         if (PTOU(pp)->u_rdir)
 1175                 VN_HOLD(PTOU(pp)->u_rdir);
 1176         if (PTOU(pp)->u_cwd)
 1177                 refstr_hold(PTOU(pp)->u_cwd);
 1178 
 1179         /*
 1180          * copy the parent's uarea.
 1181          */
 1182         uarea = PTOU(cp);
 1183         bcopy(PTOU(pp), uarea, sizeof (*uarea));
 1184         flist_fork(P_FINFO(pp), P_FINFO(cp));
 1185 
 1186         gethrestime(&uarea->u_start);
 1187         uarea->u_ticks = ddi_get_lbolt();
 1188         uarea->u_mem = rm_asrss(pp->p_as);
 1189         uarea->u_acflag = AFORK;
 1190 
 1191         /*
 1192          * If inherit-on-fork, copy /proc tracing flags to child.
 1193          */
 1194         if ((pp->p_proc_flag & P_PR_FORK) != 0) {
 1195                 cp->p_proc_flag |= pp->p_proc_flag & (P_PR_TRACE|P_PR_FORK);
 1196                 cp->p_sigmask = pp->p_sigmask;
 1197                 cp->p_fltmask = pp->p_fltmask;
 1198         } else {
 1199                 sigemptyset(&cp->p_sigmask);
 1200                 premptyset(&cp->p_fltmask);
 1201                 uarea->u_systrap = 0;
 1202                 premptyset(&uarea->u_entrymask);
 1203                 premptyset(&uarea->u_exitmask);
 1204         }
 1205         /*
 1206          * If microstate accounting is being inherited, mark child
 1207          */
 1208         if ((pp->p_flag & SMSFORK) != 0)
 1209                 cp->p_flag |= pp->p_flag & (SMSFORK|SMSACCT);
 1210 
 1211         /*
 1212          * Inherit fixalignment flag from the parent
 1213          */
 1214         cp->p_fixalignment = pp->p_fixalignment;
 1215 
 1216         *cpp = cp;
 1217         return (0);
 1218 
 1219 bad:
 1220         ASSERT(MUTEX_NOT_HELD(&pidlock));
 1221 
              /*
               * Release the holds taken on the parent's executable and its
               * directory.  cp was zeroed on allocation and both fields are
               * reset to NULLVP when VOP_OPEN() fails, so a non-NULL p_exec
               * here has been both held and opened (the per-user process
               * limit failure arrives in this state) and must be closed
               * before it is released.
               */
              if (cp->p_exec != NULL) {
                      (void) VOP_CLOSE(cp->p_exec, FREAD, 1, (offset_t)0,
                          CRED(), NULL);
                      VN_RELE(cp->p_exec);
              }
              if (cp->p_execdir != NULL)
                      VN_RELE(cp->p_execdir);

              /* Destroy every mutex that was initialized for cp above. */
              mutex_destroy(&cp->p_splock);
 1222         mutex_destroy(&cp->p_crlock);
 1223         mutex_destroy(&cp->p_pflock);
 1224 #if defined(__x86)
 1225         mutex_destroy(&cp->p_ldtlock);
 1226 #endif
              mutex_destroy(&cp->p_maplock);
 1227         if (newpid != -1) {
 1228                 proc_entry_free(cp->p_pidp);
 1229                 (void) pid_rele(cp->p_pidp);
 1230         }
 1231         kmem_cache_free(process_cache, cp);
 1232 
              /* Reverse the up-front charge to the task, project and zone. */
 1233         mutex_enter(&zone->zone_nlwps_lock);
 1234         task->tk_nprocs--;
 1235         proj->kpj_nprocs--;
 1236         zone->zone_nprocs--;
 1237         mutex_exit(&zone->zone_nlwps_lock);
 1238 
 1239 punish:
 1240         /*
 1241          * We most likely got into this situation because some process is
 1242          * forking out of control.  As punishment, put it to sleep for a
 1243          * bit so it can't eat the machine alive.  Sleep interval is chosen
 1244          * to allow no more than one fork failure per cpu per clock tick
 1245          * on average (yes, I just made this up).  This has two desirable
 1246          * properties: (1) it sets a constant limit on the fork failure
 1247          * rate, and (2) the busier the system is, the harsher the penalty
 1248          * for abusing it becomes.
 1249          */
 1250         INCR_COUNT(&fork_fail_pending, &pidlock);
 1251         delay(fork_fail_pending / ncpus + 1);
 1252         DECR_COUNT(&fork_fail_pending, &pidlock);
 1253 
 1254         return (-1); /* out of memory or proc slots */
 1255 }
 1256 
 1257 /*
 1258  * Release virtual memory.
 1259  * In the case of vfork(), the child was given exclusive access to its
 1260  * parent's address space.  The parent is waiting in vfwait() for the
 1261  * child to release its exclusive claim via relvm().
       *
       * Operates on curproc and takes no arguments.  If SVFORK is set, the
       * process switches to &kas, copies its break/stack sizes and p_segacct
       * back to the parent, clears the parent's SVFWAIT and signals the
       * parent's p_cv.  Otherwise, if the process is not already on &kas,
       * it switches p_as to &kas and frees the old address space.
 1262  */
 1263 void
 1264 relvm()
 1265 {
 1266         proc_t *p = curproc;
 1267 
              /* The caller must be down to at most one LWP. */
 1268         ASSERT((unsigned)p->p_lwpcnt <= 1);
 1269 
 1270         prrelvm();      /* inform /proc */
 1271 
 1272         if (p->p_flag & SVFORK) {
 1273                 proc_t *pp = p->p_parent;
 1274                 /*
 1275                  * The child process is either exec'ing or exit'ing.
 1276                  * The child is now separated from the parent's address
 1277                  * space.  The parent process is made dispatchable.
 1278                  *
 1279                  * This is a delicate locking maneuver, involving
 1280                  * both the parent's p_lock and the child's p_lock.
 1281                  * As soon as the SVFORK flag is turned off, the
 1282                  * parent is free to run, but it must not run until
 1283                  * we wake it up using its p_cv because it might
 1284                  * exit and we would be referencing invalid memory.
 1285                  * Therefore, we hold the parent with its p_lock
 1286                  * while protecting our p_flags with our own p_lock.
 1287                  */
 1288 try_again:
 1289                 mutex_enter(&p->p_lock);        /* grab child's lock first */
 1290                 prbarrier(p);           /* make sure /proc is blocked out */
 1291                 mutex_enter(&pp->p_lock);
 1292 
 1293                 /*
 1294                  * Check if parent is locked by /proc.
 1295                  */
 1296                 if (pp->p_proc_flag & P_PR_LOCK) {
 1297                         /*
 1298                          * Delay until /proc is done with the parent.
 1299                          * We must drop our (the child's) p->p_lock, wait
 1300                          * via prbarrier() on the parent, then start over.
 1301                          */
 1302                         mutex_exit(&p->p_lock);
 1303                         prbarrier(pp);
 1304                         mutex_exit(&pp->p_lock);
 1305                         goto try_again;
 1306                 }
                      /*
                       * Both p_locks are held from here to the end of this
                       * branch.  Preemption is disabled only around the
                       * switch to &kas and the hat notification.
                       */
 1307                 p->p_flag &= ~SVFORK;
 1308                 kpreempt_disable();
 1309                 p->p_as = &kas;
 1310 
 1311                 /*
 1312                  * notify hat of change in thread's address space
 1313                  */
 1314                 hat_thread_exit(curthread);
 1315                 kpreempt_enable();
 1316 
 1317                 /*
 1318                  * child sizes are copied back to parent because
 1319                  * child may have grown.
 1320                  */
 1321                 pp->p_brkbase = p->p_brkbase;
 1322                 pp->p_brksize = p->p_brksize;
 1323                 pp->p_stksize = p->p_stksize;
 1324 
 1325                 /*
 1326                  * Copy back the shm accounting information
 1327                  * to the parent process.
 1328                  */
 1329                 pp->p_segacct = p->p_segacct;
 1330                 p->p_segacct = NULL;
 1331 
 1332                 /*
 1333                  * The parent is no longer waiting for the vfork()d child.
 1334                  * Restore the parent's watched pages, if any.  This is
 1335                  * safe because we know the parent is not locked by /proc
 1336                  */
 1337                 pp->p_flag &= ~SVFWAIT;
 1338                 if (avl_numnodes(&pp->p_wpage) != 0) {
                              /*
                               * Hand the saved tree to the address space, then
                               * re-create pp->p_wpage as an empty tree.
                               */
 1339                         pp->p_as->a_wpage = pp->p_wpage;
 1340                         avl_create(&pp->p_wpage, wp_compare,
 1341                             sizeof (struct watched_page),
 1342                             offsetof(struct watched_page, wp_link));
 1343                 }
                      /* Wake the parent, then drop its lock before our own. */
 1344                 cv_signal(&pp->p_cv);
 1345                 mutex_exit(&pp->p_lock);
 1346                 mutex_exit(&p->p_lock);
 1347         } else {
                      /* Nothing to release if the process is already on &kas. */
 1348                 if (p->p_as != &kas) {
 1349                         struct as *as;
 1350 
 1351                         if (p->p_segacct)
 1352                                 shmexit(p);
 1353 
 1354                         /*
 1355                          * We grab p_lock for the benefit of /proc
 1356                          */
 1357                         kpreempt_disable();
 1358                         mutex_enter(&p->p_lock);
 1359                         prbarrier(p);   /* make sure /proc is blocked out */
 1360                         as = p->p_as;
 1361                         p->p_as = &kas;
 1362                         mutex_exit(&p->p_lock);
 1363 
 1364                         /*
 1365                          * notify hat of change in thread's address space
 1366                          */
 1367                         hat_thread_exit(curthread);
 1368                         kpreempt_enable();
 1369 
                              /*
                               * The old address space is freed only after
                               * p_as points at &kas and preemption has been
                               * re-enabled.
                               */
 1370                         as_free(as);
 1371                         p->p_tr_lgrpid = LGRP_NONE;
 1372                 }
 1373         }
 1374 }
 1375 
 1376 /*
 1377  * Wait for child to exec or exit.
 1378  * Called by parent of vfork'ed process.
 1379  * See important comments in relvm(), above.
       *
       * pid is the process id of the child; it is looked up again with
       * prfind() on every pass through the loop.  The function returns once
       * the child can no longer be found, is no longer our child, or no
       * longer has SVFORK set.  Before returning it re-applies the parent's
       * watchpoints if any are active and calls continuelwps() on the parent.
 1380  */
 1381 void
 1382 vfwait(pid_t pid)
 1383 {
              /* Set once cv_wait_sig() has returned 0; later waits use cv_wait(). */
 1384         int signalled = 0;
 1385         proc_t *pp = ttoproc(curthread);
 1386         proc_t *cp;
 1387 
 1388         /*
 1389          * Wait for child to exec or exit.
 1390          */
 1391         for (;;) {
 1392                 mutex_enter(&pidlock);
 1393                 cp = prfind(pid);
 1394                 if (cp == NULL || cp->p_parent != pp) {
 1395                         /*
 1396                          * Child has exit()ed.
 1397                          */
 1398                         mutex_exit(&pidlock);
 1399                         break;
 1400                 }
 1401                 /*
 1402                  * Grab the child's p_lock before releasing pidlock.
 1403                  * Otherwise, the child could exit and we would be
 1404                  * referencing invalid memory.
 1405                  */
 1406                 mutex_enter(&cp->p_lock);
 1407                 mutex_exit(&pidlock);
 1408                 if (!(cp->p_flag & SVFORK)) {
 1409                         /*
 1410                          * Child has exec()ed or is exit()ing.
 1411                          */
 1412                         mutex_exit(&cp->p_lock);
 1413                         break;
 1414                 }
                      /*
                       * Take our own p_lock before dropping the child's, so
                       * the SVFORK check above and the wait below are not
                       * separated by an unlocked window.
                       */
 1415                 mutex_enter(&pp->p_lock);
 1416                 mutex_exit(&cp->p_lock);
 1417                 /*
 1418                  * We might be waked up spuriously from the cv_wait().
 1419                  * We have to do the whole operation over again to be
 1420                  * sure the child's SVFORK flag really is turned off.
 1421                  * We cannot make reference to the child because it can
 1422                  * exit before we return and we would be referencing
 1423                  * invalid memory.
 1424                  *
 1425                  * Because this is potentially a very long-term wait,
 1426                  * we call cv_wait_sig() (for its jobcontrol and /proc
 1427                  * side-effects) unless there is a current signal, in
 1428                  * which case we use cv_wait() because we cannot return
 1429                  * from this function until the child has released the
 1430                  * address space.  Calling cv_wait_sig() with a current
 1431                  * signal would lead to an indefinite loop here because
 1432                  * cv_wait_sig() returns immediately in this case.
 1433                  */
 1434                 if (signalled)
 1435                         cv_wait(&pp->p_cv, &pp->p_lock);
 1436                 else
 1437                         signalled = !cv_wait_sig(&pp->p_cv, &pp->p_lock);
 1438                 mutex_exit(&pp->p_lock);
 1439         }
 1440 
 1441         /* restore watchpoints to parent */
 1442         if (pr_watch_active(pp)) {
 1443                 struct as *as = pp->p_as;
                      /* as_setwatch() is called with the as lock held as writer. */
 1444                 AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
 1445                 as_setwatch(as);
 1446                 AS_LOCK_EXIT(as, &as->a_lock);
 1447         }
 1448 
              /* Resume the parent's LWPs, with p_lock held and /proc blocked out. */
 1449         mutex_enter(&pp->p_lock);
 1450         prbarrier(pp);  /* barrier against /proc locking */
 1451         continuelwps(pp);
 1452         mutex_exit(&pp->p_lock);
 1453 }

Cache object: 01f4523d0ae5d5683289e14a3a25c4d2


[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]


This page is part of the FreeBSD/Linux Kernel Cross-Reference, and was automatically generated using a modified version of the LXR engine.