kern_exec.c

Version: - FREEBSD - FREEBSD-13-STABLE - FREEBSD-13-0 - FREEBSD-12-STABLE - FREEBSD-12-0 - FREEBSD-11-STABLE - FREEBSD-11-0 - FREEBSD-10-STABLE - FREEBSD-10-0 - FREEBSD-9-STABLE - FREEBSD-9-0 - FREEBSD-8-STABLE - FREEBSD-8-0 - FREEBSD-7-STABLE - FREEBSD-7-0 - FREEBSD-6-STABLE - FREEBSD-6-0 - FREEBSD-5-STABLE - FREEBSD-5-0 - FREEBSD-4-STABLE - FREEBSD-3-STABLE - FREEBSD22 - l41 - OPENBSD - linux-2.6 - MK84 - PLAN9 - xnu-8792
SearchContext: - none - 3 - 10
    1 /*-
    2  * Copyright (c) 1993, David Greenman
    3  * All rights reserved.
    4  *
    5  * Redistribution and use in source and binary forms, with or without
    6  * modification, are permitted provided that the following conditions
    7  * are met:
    8  * 1. Redistributions of source code must retain the above copyright
    9  *    notice, this list of conditions and the following disclaimer.
   10  * 2. Redistributions in binary form must reproduce the above copyright
   11  *    notice, this list of conditions and the following disclaimer in the
   12  *    documentation and/or other materials provided with the distribution.
   13  *
   14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
   15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
   17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
   18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
   19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
   20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
   23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   24  * SUCH DAMAGE.
   25  */
   26 
   27 #include <sys/cdefs.h>
   28 __FBSDID("$FreeBSD: releng/10.2/sys/kern/kern_exec.c 283359 2015-05-24 07:32:02Z kib $");
   29 
   30 #include "opt_capsicum.h"
   31 #include "opt_hwpmc_hooks.h"
   32 #include "opt_kdtrace.h"
   33 #include "opt_ktrace.h"
   34 #include "opt_vm.h"
   35 
   36 #include <sys/param.h>
   37 #include <sys/capsicum.h>
   38 #include <sys/systm.h>
   39 #include <sys/capsicum.h>
   40 #include <sys/eventhandler.h>
   41 #include <sys/lock.h>
   42 #include <sys/mutex.h>
   43 #include <sys/sysproto.h>
   44 #include <sys/signalvar.h>
   45 #include <sys/kernel.h>
   46 #include <sys/mount.h>
   47 #include <sys/filedesc.h>
   48 #include <sys/fcntl.h>
   49 #include <sys/acct.h>
   50 #include <sys/exec.h>
   51 #include <sys/imgact.h>
   52 #include <sys/imgact_elf.h>
   53 #include <sys/wait.h>
   54 #include <sys/malloc.h>
   55 #include <sys/priv.h>
   56 #include <sys/proc.h>
   57 #include <sys/pioctl.h>
   58 #include <sys/namei.h>
   59 #include <sys/resourcevar.h>
   60 #include <sys/rwlock.h>
   61 #include <sys/sched.h>
   62 #include <sys/sdt.h>
   63 #include <sys/sf_buf.h>
   64 #include <sys/syscallsubr.h>
   65 #include <sys/sysent.h>
   66 #include <sys/shm.h>
   67 #include <sys/sysctl.h>
   68 #include <sys/vnode.h>
   69 #include <sys/stat.h>
   70 #ifdef KTRACE
   71 #include <sys/ktrace.h>
   72 #endif
   73 
   74 #include <vm/vm.h>
   75 #include <vm/vm_param.h>
   76 #include <vm/pmap.h>
   77 #include <vm/vm_page.h>
   78 #include <vm/vm_map.h>
   79 #include <vm/vm_kern.h>
   80 #include <vm/vm_extern.h>
   81 #include <vm/vm_object.h>
   82 #include <vm/vm_pager.h>
   83 
   84 #ifdef  HWPMC_HOOKS
   85 #include <sys/pmckern.h>
   86 #endif
   87 
   88 #include <machine/reg.h>
   89 
   90 #include <security/audit/audit.h>
   91 #include <security/mac/mac_framework.h>
   92 
   93 #ifdef KDTRACE_HOOKS
   94 #include <sys/dtrace_bsd.h>
   95 dtrace_execexit_func_t  dtrace_fasttrap_exec;
   96 #endif
   97 
   98 SDT_PROVIDER_DECLARE(proc);
   99 SDT_PROBE_DEFINE1(proc, kernel, , exec, "char *");
  100 SDT_PROBE_DEFINE1(proc, kernel, , exec__failure, "int");
  101 SDT_PROBE_DEFINE1(proc, kernel, , exec__success, "char *");
  102 
  103 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
  104 
  105 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
  106 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
  107 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
  108 static int do_execve(struct thread *td, struct image_args *args,
  109     struct mac *mac_p);
  110 
  111 /* XXX This should be vm_size_t. */
  112 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
  113     NULL, 0, sysctl_kern_ps_strings, "LU", "");
  114 
  115 /* XXX This should be vm_size_t. */
  116 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD|
  117     CTLFLAG_CAPRD, NULL, 0, sysctl_kern_usrstack, "LU", "");
  118 
  119 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
  120     NULL, 0, sysctl_kern_stackprot, "I", "");
  121 
  122 u_long ps_arg_cache_limit = PAGE_SIZE / 16;
  123 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 
  124     &ps_arg_cache_limit, 0, "");
  125 
  126 static int disallow_high_osrel;
  127 SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW,
  128     &disallow_high_osrel, 0,
  129     "Disallow execution of binaries built for higher version of the world");
  130 
  131 static int map_at_zero = 0;
  132 TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero);
  133 SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0,
  134     "Permit processes to map an object at virtual address 0.");
  135 
  136 static int
  137 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
  138 {
  139         struct proc *p;
  140         int error;
  141 
  142         p = curproc;
  143 #ifdef SCTL_MASK32
  144         if (req->flags & SCTL_MASK32) {
  145                 unsigned int val;
  146                 val = (unsigned int)p->p_sysent->sv_psstrings;
  147                 error = SYSCTL_OUT(req, &val, sizeof(val));
  148         } else
  149 #endif
  150                 error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
  151                    sizeof(p->p_sysent->sv_psstrings));
  152         return error;
  153 }
  154 
  155 static int
  156 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
  157 {
  158         struct proc *p;
  159         int error;
  160 
  161         p = curproc;
  162 #ifdef SCTL_MASK32
  163         if (req->flags & SCTL_MASK32) {
  164                 unsigned int val;
  165                 val = (unsigned int)p->p_sysent->sv_usrstack;
  166                 error = SYSCTL_OUT(req, &val, sizeof(val));
  167         } else
  168 #endif
  169                 error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
  170                     sizeof(p->p_sysent->sv_usrstack));
  171         return error;
  172 }
  173 
  174 static int
  175 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
  176 {
  177         struct proc *p;
  178 
  179         p = curproc;
  180         return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
  181             sizeof(p->p_sysent->sv_stackprot)));
  182 }
  183 
/*
 * Table of image activators.  Each of the items is a pointer to a
 * `const struct execsw', hence the double pointer here.  do_execve()
 * walks the table from index 0, calling each entry's ex_imgact hook,
 * until an activator returns something other than -1 or a NULL entry
 * is reached.
 */
static const struct execsw **execsw;
  189 
  190 #ifndef _SYS_SYSPROTO_H_
  191 struct execve_args {
  192         char    *fname; 
  193         char    **argv;
  194         char    **envv; 
  195 };
  196 #endif
  197 
  198 int
  199 sys_execve(struct thread *td, struct execve_args *uap)
  200 {
  201         struct image_args args;
  202         struct vmspace *oldvmspace;
  203         int error;
  204 
  205         error = pre_execve(td, &oldvmspace);
  206         if (error != 0)
  207                 return (error);
  208         error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
  209             uap->argv, uap->envv);
  210         if (error == 0)
  211                 error = kern_execve(td, &args, NULL);
  212         post_execve(td, error, oldvmspace);
  213         return (error);
  214 }
  215 
  216 #ifndef _SYS_SYSPROTO_H_
  217 struct fexecve_args {
  218         int     fd;
  219         char    **argv;
  220         char    **envv;
  221 }
  222 #endif
  223 int
  224 sys_fexecve(struct thread *td, struct fexecve_args *uap)
  225 {
  226         struct image_args args;
  227         struct vmspace *oldvmspace;
  228         int error;
  229 
  230         error = pre_execve(td, &oldvmspace);
  231         if (error != 0)
  232                 return (error);
  233         error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
  234             uap->argv, uap->envv);
  235         if (error == 0) {
  236                 args.fd = uap->fd;
  237                 error = kern_execve(td, &args, NULL);
  238         }
  239         post_execve(td, error, oldvmspace);
  240         return (error);
  241 }
  242 
  243 #ifndef _SYS_SYSPROTO_H_
  244 struct __mac_execve_args {
  245         char    *fname;
  246         char    **argv;
  247         char    **envv;
  248         struct mac      *mac_p;
  249 };
  250 #endif
  251 
  252 int
  253 sys___mac_execve(struct thread *td, struct __mac_execve_args *uap)
  254 {
  255 #ifdef MAC
  256         struct image_args args;
  257         struct vmspace *oldvmspace;
  258         int error;
  259 
  260         error = pre_execve(td, &oldvmspace);
  261         if (error != 0)
  262                 return (error);
  263         error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
  264             uap->argv, uap->envv);
  265         if (error == 0)
  266                 error = kern_execve(td, &args, uap->mac_p);
  267         post_execve(td, error, oldvmspace);
  268         return (error);
  269 #else
  270         return (ENOSYS);
  271 #endif
  272 }
  273 
  274 int
  275 pre_execve(struct thread *td, struct vmspace **oldvmspace)
  276 {
  277         struct proc *p;
  278         int error;
  279 
  280         KASSERT(td == curthread, ("non-current thread %p", td));
  281         error = 0;
  282         p = td->td_proc;
  283         if ((p->p_flag & P_HADTHREADS) != 0) {
  284                 PROC_LOCK(p);
  285                 if (thread_single(p, SINGLE_BOUNDARY) != 0)
  286                         error = ERESTART;
  287                 PROC_UNLOCK(p);
  288         }
  289         KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0,
  290             ("nested execve"));
  291         *oldvmspace = p->p_vmspace;
  292         return (error);
  293 }
  294 
  295 void
  296 post_execve(struct thread *td, int error, struct vmspace *oldvmspace)
  297 {
  298         struct proc *p;
  299 
  300         KASSERT(td == curthread, ("non-current thread %p", td));
  301         p = td->td_proc;
  302         if ((p->p_flag & P_HADTHREADS) != 0) {
  303                 PROC_LOCK(p);
  304                 /*
  305                  * If success, we upgrade to SINGLE_EXIT state to
  306                  * force other threads to suicide.
  307                  */
  308                 if (error == 0)
  309                         thread_single(p, SINGLE_EXIT);
  310                 else
  311                         thread_single_end(p, SINGLE_BOUNDARY);
  312                 PROC_UNLOCK(p);
  313         }
  314         if ((td->td_pflags & TDP_EXECVMSPC) != 0) {
  315                 KASSERT(p->p_vmspace != oldvmspace,
  316                     ("oldvmspace still used"));
  317                 vmspace_free(oldvmspace);
  318                 td->td_pflags &= ~TDP_EXECVMSPC;
  319         }
  320 }
  321 
  322 /*
  323  * XXX: kern_execve has the astonishing property of not always returning to
  324  * the caller.  If sufficiently bad things happen during the call to
  325  * do_execve(), it can end up calling exit1(); as a result, callers must
  326  * avoid doing anything which they might need to undo (e.g., allocating
  327  * memory).
  328  */
  329 int
  330 kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
  331 {
  332 
  333         AUDIT_ARG_ARGV(args->begin_argv, args->argc,
  334             args->begin_envv - args->begin_argv);
  335         AUDIT_ARG_ENVV(args->begin_envv, args->envc,
  336             args->endp - args->begin_envv);
  337         return (do_execve(td, args, mac_p));
  338 }
  339 
  340 /*
  341  * In-kernel implementation of execve().  All arguments are assumed to be
  342  * userspace pointers from the passed thread.
  343  */
  344 static int
  345 do_execve(td, args, mac_p)
  346         struct thread *td;
  347         struct image_args *args;
  348         struct mac *mac_p;
  349 {
  350         struct proc *p = td->td_proc;
  351         struct nameidata nd;
  352         struct ucred *newcred = NULL, *oldcred;
  353         struct uidinfo *euip = NULL;
  354         register_t *stack_base;
  355         int error, i;
  356         struct image_params image_params, *imgp;
  357         struct vattr attr;
  358         int (*img_first)(struct image_params *);
  359         struct pargs *oldargs = NULL, *newargs = NULL;
  360         struct sigacts *oldsigacts, *newsigacts;
  361 #ifdef KTRACE
  362         struct vnode *tracevp = NULL;
  363         struct ucred *tracecred = NULL;
  364 #endif
  365         struct vnode *textvp = NULL, *binvp = NULL;
  366         cap_rights_t rights;
  367         int credential_changing;
  368         int textset;
  369 #ifdef MAC
  370         struct label *interpvplabel = NULL;
  371         int will_transition;
  372 #endif
  373 #ifdef HWPMC_HOOKS
  374         struct pmckern_procexec pe;
  375 #endif
  376         static const char fexecv_proc_title[] = "(fexecv)";
  377 
  378         imgp = &image_params;
  379 
  380         /*
  381          * Lock the process and set the P_INEXEC flag to indicate that
  382          * it should be left alone until we're done here.  This is
  383          * necessary to avoid race conditions - e.g. in ptrace() -
  384          * that might allow a local user to illicitly obtain elevated
  385          * privileges.
  386          */
  387         PROC_LOCK(p);
  388         KASSERT((p->p_flag & P_INEXEC) == 0,
  389             ("%s(): process already has P_INEXEC flag", __func__));
  390         p->p_flag |= P_INEXEC;
  391         PROC_UNLOCK(p);
  392 
  393         /*
  394          * Initialize part of the common data
  395          */
  396         bzero(imgp, sizeof(*imgp));
  397         imgp->proc = p;
  398         imgp->attr = &attr;
  399         imgp->args = args;
  400 
  401 #ifdef MAC
  402         error = mac_execve_enter(imgp, mac_p);
  403         if (error)
  404                 goto exec_fail;
  405 #endif
  406 
  407         /*
  408          * Translate the file name. namei() returns a vnode pointer
  409          *      in ni_vp among other things.
  410          *
  411          * XXXAUDIT: It would be desirable to also audit the name of the
  412          * interpreter if this is an interpreted binary.
  413          */
  414         if (args->fname != NULL) {
  415                 NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
  416                     | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
  417         }
  418 
  419         SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0 );
  420 
  421 interpret:
  422         if (args->fname != NULL) {
  423 #ifdef CAPABILITY_MODE
  424                 /*
  425                  * While capability mode can't reach this point via direct
  426                  * path arguments to execve(), we also don't allow
  427                  * interpreters to be used in capability mode (for now).
  428                  * Catch indirect lookups and return a permissions error.
  429                  */
  430                 if (IN_CAPABILITY_MODE(td)) {
  431                         error = ECAPMODE;
  432                         goto exec_fail;
  433                 }
  434 #endif
  435                 error = namei(&nd);
  436                 if (error)
  437                         goto exec_fail;
  438 
  439                 binvp  = nd.ni_vp;
  440                 imgp->vp = binvp;
  441         } else {
  442                 AUDIT_ARG_FD(args->fd);
  443                 /*
  444                  * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
  445                  */
  446                 error = fgetvp_exec(td, args->fd,
  447                     cap_rights_init(&rights, CAP_FEXECVE), &binvp);
  448                 if (error)
  449                         goto exec_fail;
  450                 vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY);
  451                 AUDIT_ARG_VNODE1(binvp);
  452                 imgp->vp = binvp;
  453         }
  454 
  455         /*
  456          * Check file permissions (also 'opens' file)
  457          */
  458         error = exec_check_permissions(imgp);
  459         if (error)
  460                 goto exec_fail_dealloc;
  461 
  462         imgp->object = imgp->vp->v_object;
  463         if (imgp->object != NULL)
  464                 vm_object_reference(imgp->object);
  465 
  466         /*
  467          * Set VV_TEXT now so no one can write to the executable while we're
  468          * activating it.
  469          *
  470          * Remember if this was set before and unset it in case this is not
  471          * actually an executable image.
  472          */
  473         textset = VOP_IS_TEXT(imgp->vp);
  474         VOP_SET_TEXT(imgp->vp);
  475 
  476         error = exec_map_first_page(imgp);
  477         if (error)
  478                 goto exec_fail_dealloc;
  479 
  480         imgp->proc->p_osrel = 0;
  481         /*
  482          *      If the current process has a special image activator it
  483          *      wants to try first, call it.   For example, emulating shell
  484          *      scripts differently.
  485          */
  486         error = -1;
  487         if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
  488                 error = img_first(imgp);
  489 
  490         /*
  491          *      Loop through the list of image activators, calling each one.
  492          *      An activator returns -1 if there is no match, 0 on success,
  493          *      and an error otherwise.
  494          */
  495         for (i = 0; error == -1 && execsw[i]; ++i) {
  496                 if (execsw[i]->ex_imgact == NULL ||
  497                     execsw[i]->ex_imgact == img_first) {
  498                         continue;
  499                 }
  500                 error = (*execsw[i]->ex_imgact)(imgp);
  501         }
  502 
  503         if (error) {
  504                 if (error == -1) {
  505                         if (textset == 0)
  506                                 VOP_UNSET_TEXT(imgp->vp);
  507                         error = ENOEXEC;
  508                 }
  509                 goto exec_fail_dealloc;
  510         }
  511 
  512         /*
  513          * Special interpreter operation, cleanup and loop up to try to
  514          * activate the interpreter.
  515          */
  516         if (imgp->interpreted) {
  517                 exec_unmap_first_page(imgp);
  518                 /*
  519                  * VV_TEXT needs to be unset for scripts.  There is a short
  520                  * period before we determine that something is a script where
  521                  * VV_TEXT will be set. The vnode lock is held over this
  522                  * entire period so nothing should illegitimately be blocked.
  523                  */
  524                 VOP_UNSET_TEXT(imgp->vp);
  525                 /* free name buffer and old vnode */
  526                 if (args->fname != NULL)
  527                         NDFREE(&nd, NDF_ONLY_PNBUF);
  528 #ifdef MAC
  529                 mac_execve_interpreter_enter(binvp, &interpvplabel);
  530 #endif
  531                 if (imgp->opened) {
  532                         VOP_CLOSE(binvp, FREAD, td->td_ucred, td);
  533                         imgp->opened = 0;
  534                 }
  535                 vput(binvp);
  536                 vm_object_deallocate(imgp->object);
  537                 imgp->object = NULL;
  538                 /* set new name to that of the interpreter */
  539                 NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
  540                     UIO_SYSSPACE, imgp->interpreter_name, td);
  541                 args->fname = imgp->interpreter_name;
  542                 goto interpret;
  543         }
  544 
  545         /*
  546          * NB: We unlock the vnode here because it is believed that none
  547          * of the sv_copyout_strings/sv_fixup operations require the vnode.
  548          */
  549         VOP_UNLOCK(imgp->vp, 0);
  550 
  551         /*
  552          * Do the best to calculate the full path to the image file.
  553          */
  554         if (imgp->auxargs != NULL &&
  555             ((args->fname != NULL && args->fname[0] == '/') ||
  556              vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0))
  557                 imgp->execpath = args->fname;
  558 
  559         if (disallow_high_osrel &&
  560             P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) {
  561                 error = ENOEXEC;
  562                 uprintf("Osrel %d for image %s too high\n", p->p_osrel,
  563                     imgp->execpath != NULL ? imgp->execpath : "<unresolved>");
  564                 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
  565                 goto exec_fail_dealloc;
  566         }
  567 
  568         /*
  569          * Copy out strings (args and env) and initialize stack base
  570          */
  571         if (p->p_sysent->sv_copyout_strings)
  572                 stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
  573         else
  574                 stack_base = exec_copyout_strings(imgp);
  575 
  576         /*
  577          * If custom stack fixup routine present for this process
  578          * let it do the stack setup.
  579          * Else stuff argument count as first item on stack
  580          */
  581         if (p->p_sysent->sv_fixup != NULL)
  582                 (*p->p_sysent->sv_fixup)(&stack_base, imgp);
  583         else
  584                 suword(--stack_base, imgp->args->argc);
  585 
  586         /*
  587          * For security and other reasons, the file descriptor table cannot
  588          * be shared after an exec.
  589          */
  590         fdunshare(td);
  591         /* close files on exec */
  592         fdcloseexec(td);
  593 
  594         /*
  595          * Malloc things before we need locks.
  596          */
  597         i = imgp->args->begin_envv - imgp->args->begin_argv;
  598         /* Cache arguments if they fit inside our allowance */
  599         if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
  600                 newargs = pargs_alloc(i);
  601                 bcopy(imgp->args->begin_argv, newargs->ar_args, i);
  602         }
  603 
  604         vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
  605 
  606         /* Get a reference to the vnode prior to locking the proc */
  607         VREF(binvp);
  608 
  609         /*
  610          * For security and other reasons, signal handlers cannot
  611          * be shared after an exec. The new process gets a copy of the old
  612          * handlers. In execsigs(), the new process will have its signals
  613          * reset.
  614          */
  615         if (sigacts_shared(p->p_sigacts)) {
  616                 oldsigacts = p->p_sigacts;
  617                 newsigacts = sigacts_alloc();
  618                 sigacts_copy(newsigacts, oldsigacts);
  619         } else {
  620                 oldsigacts = NULL;
  621                 newsigacts = NULL; /* satisfy gcc */
  622         }
  623 
  624         PROC_LOCK(p);
  625         if (oldsigacts)
  626                 p->p_sigacts = newsigacts;
  627         oldcred = p->p_ucred;
  628         /* Stop profiling */
  629         stopprofclock(p);
  630 
  631         /* reset caught signals */
  632         execsigs(p);
  633 
  634         /* name this process - nameiexec(p, ndp) */
  635         bzero(p->p_comm, sizeof(p->p_comm));
  636         if (args->fname)
  637                 bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
  638                     min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
  639         else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0)
  640                 bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
  641         bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
  642 #ifdef KTR
  643         sched_clear_tdname(td);
  644 #endif
  645 
  646         /*
  647          * mark as execed, wakeup the process that vforked (if any) and tell
  648          * it that it now has its own resources back
  649          */
  650         p->p_flag |= P_EXEC;
  651         if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0)
  652                 p->p_flag2 &= ~P2_NOTRACE;
  653         if (p->p_flag & P_PPWAIT) {
  654                 p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
  655                 cv_broadcast(&p->p_pwait);
  656         }
  657 
  658         /*
  659          * Implement image setuid/setgid.
  660          *
  661          * Don't honor setuid/setgid if the filesystem prohibits it or if
  662          * the process is being traced.
  663          *
  664          * We disable setuid/setgid/etc in capability mode on the basis
  665          * that most setugid applications are not written with that
  666          * environment in mind, and will therefore almost certainly operate
  667          * incorrectly. In principle there's no reason that setugid
  668          * applications might not be useful in capability mode, so we may want
  669          * to reconsider this conservative design choice in the future.
  670          *
  671          * XXXMAC: For the time being, use NOSUID to also prohibit
  672          * transitions on the file system.
  673          */
  674         credential_changing = 0;
  675         credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid !=
  676             attr.va_uid;
  677         credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid !=
  678             attr.va_gid;
  679 #ifdef MAC
  680         will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
  681             interpvplabel, imgp);
  682         credential_changing |= will_transition;
  683 #endif
  684 
  685         if (credential_changing &&
  686 #ifdef CAPABILITY_MODE
  687             ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
  688 #endif
  689             (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
  690             (p->p_flag & P_TRACED) == 0) {
  691                 /*
  692                  * Turn off syscall tracing for set-id programs, except for
  693                  * root.  Record any set-id flags first to make sure that
  694                  * we do not regain any tracing during a possible block.
  695                  */
  696                 setsugid(p);
  697 
  698 #ifdef KTRACE
  699                 if (p->p_tracecred != NULL &&
  700                     priv_check_cred(p->p_tracecred, PRIV_DEBUG_DIFFCRED, 0))
  701                         ktrprocexec(p, &tracecred, &tracevp);
  702 #endif
  703                 /*
  704                  * Close any file descriptors 0..2 that reference procfs,
  705                  * then make sure file descriptors 0..2 are in use.
  706                  *
  707                  * setugidsafety() may call closef() and then pfind()
  708                  * which may grab the process lock.
  709                  * fdcheckstd() may call falloc() which may block to
  710                  * allocate memory, so temporarily drop the process lock.
  711                  */
  712                 PROC_UNLOCK(p);
  713                 VOP_UNLOCK(imgp->vp, 0);
  714                 setugidsafety(td);
  715                 error = fdcheckstd(td);
  716                 if (error != 0)
  717                         goto done1;
  718                 newcred = crdup(oldcred);
  719                 euip = uifind(attr.va_uid);
  720                 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
  721                 PROC_LOCK(p);
  722                 /*
  723                  * Set the new credentials.
  724                  */
  725                 if (attr.va_mode & S_ISUID)
  726                         change_euid(newcred, euip);
  727                 if (attr.va_mode & S_ISGID)
  728                         change_egid(newcred, attr.va_gid);
  729 #ifdef MAC
  730                 if (will_transition) {
  731                         mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
  732                             interpvplabel, imgp);
  733                 }
  734 #endif
  735                 /*
  736                  * Implement correct POSIX saved-id behavior.
  737                  *
  738                  * XXXMAC: Note that the current logic will save the
  739                  * uid and gid if a MAC domain transition occurs, even
  740                  * though maybe it shouldn't.
  741                  */
  742                 change_svuid(newcred, newcred->cr_uid);
  743                 change_svgid(newcred, newcred->cr_gid);
  744                 p->p_ucred = newcred;
  745         } else {
  746                 if (oldcred->cr_uid == oldcred->cr_ruid &&
  747                     oldcred->cr_gid == oldcred->cr_rgid)
  748                         p->p_flag &= ~P_SUGID;
  749                 /*
  750                  * Implement correct POSIX saved-id behavior.
  751                  *
  752                  * XXX: It's not clear that the existing behavior is
  753                  * POSIX-compliant.  A number of sources indicate that the
  754                  * saved uid/gid should only be updated if the new ruid is
  755                  * not equal to the old ruid, or the new euid is not equal
  756                  * to the old euid and the new euid is not equal to the old
  757                  * ruid.  The FreeBSD code always updates the saved uid/gid.
  758                  * Also, this code uses the new (replaced) euid and egid as
  759                  * the source, which may or may not be the right ones to use.
  760                  */
  761                 if (oldcred->cr_svuid != oldcred->cr_uid ||
  762                     oldcred->cr_svgid != oldcred->cr_gid) {
  763                         PROC_UNLOCK(p);
  764                         VOP_UNLOCK(imgp->vp, 0);
  765                         newcred = crdup(oldcred);
  766                         vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
  767                         PROC_LOCK(p);
  768                         change_svuid(newcred, newcred->cr_uid);
  769                         change_svgid(newcred, newcred->cr_gid);
  770                         p->p_ucred = newcred;
  771                 }
  772         }
  773 
  774         /*
  775          * Store the vp for use in procfs.  This vnode was referenced prior
  776          * to locking the proc lock.
  777          */
  778         textvp = p->p_textvp;
  779         p->p_textvp = binvp;
  780 
  781 #ifdef KDTRACE_HOOKS
  782         /*
  783          * Tell the DTrace fasttrap provider about the exec if it
  784          * has declared an interest.
  785          */
  786         if (dtrace_fasttrap_exec)
  787                 dtrace_fasttrap_exec(p);
  788 #endif
  789 
  790         /*
  791          * Notify others that we exec'd, and clear the P_INEXEC flag
  792          * as we're now a bona fide freshly-execed process.
  793          */
  794         KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
  795         p->p_flag &= ~P_INEXEC;
  796 
  797         /* clear "fork but no exec" flag, as we _are_ execing */
  798         p->p_acflag &= ~AFORK;
  799 
  800         /*
  801          * Free any previous argument cache and replace it with
  802          * the new argument cache, if any.
  803          */
  804         oldargs = p->p_args;
  805         p->p_args = newargs;
  806         newargs = NULL;
  807 
  808 #ifdef  HWPMC_HOOKS
  809         /*
  810          * Check if system-wide sampling is in effect or if the
  811          * current process is using PMCs.  If so, do exec() time
  812          * processing.  This processing needs to happen AFTER the
  813          * P_INEXEC flag is cleared.
  814          *
  815          * The proc lock needs to be released before taking the PMC
  816          * SX.
  817          */
  818         if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
  819                 PROC_UNLOCK(p);
  820                 VOP_UNLOCK(imgp->vp, 0);
  821                 pe.pm_credentialschanged = credential_changing;
  822                 pe.pm_entryaddr = imgp->entry_addr;
  823 
  824                 PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
  825                 vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
  826         } else
  827                 PROC_UNLOCK(p);
  828 #else  /* !HWPMC_HOOKS */
  829         PROC_UNLOCK(p);
  830 #endif
  831 
  832         /* Set values passed into the program in registers. */
  833         if (p->p_sysent->sv_setregs)
  834                 (*p->p_sysent->sv_setregs)(td, imgp, 
  835                     (u_long)(uintptr_t)stack_base);
  836         else
  837                 exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base);
  838 
  839         vfs_mark_atime(imgp->vp, td->td_ucred);
  840 
  841         SDT_PROBE(proc, kernel, , exec__success, args->fname, 0, 0, 0, 0);
  842 
  843         VOP_UNLOCK(imgp->vp, 0);
  844 done1:
  845         /*
  846          * Free any resources malloc'd earlier that we didn't use.
  847          */
  848         if (euip != NULL)
  849                 uifree(euip);
  850         if (newcred != NULL)
  851                 crfree(oldcred);
  852 
  853         /*
  854          * Handle deferred decrement of ref counts.
  855          */
  856         if (textvp != NULL)
  857                 vrele(textvp);
  858         if (binvp && error != 0)
  859                 vrele(binvp);
  860 #ifdef KTRACE
  861         if (tracevp != NULL)
  862                 vrele(tracevp);
  863         if (tracecred != NULL)
  864                 crfree(tracecred);
  865 #endif
  866         vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
  867         pargs_drop(oldargs);
  868         pargs_drop(newargs);
  869         if (oldsigacts != NULL)
  870                 sigacts_free(oldsigacts);
  871 
  872 exec_fail_dealloc:
  873 
  874         /*
  875          * free various allocated resources
  876          */
  877         if (imgp->firstpage != NULL)
  878                 exec_unmap_first_page(imgp);
  879 
  880         if (imgp->vp != NULL) {
  881                 if (args->fname)
  882                         NDFREE(&nd, NDF_ONLY_PNBUF);
  883                 if (imgp->opened)
  884                         VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
  885                 vput(imgp->vp);
  886         }
  887 
  888         if (imgp->object != NULL)
  889                 vm_object_deallocate(imgp->object);
  890 
  891         free(imgp->freepath, M_TEMP);
  892 
  893         if (error == 0) {
  894                 PROC_LOCK(p);
  895                 td->td_dbgflags |= TDB_EXEC;
  896                 PROC_UNLOCK(p);
  897 
  898                 /*
  899                  * Stop the process here if its stop event mask has
  900                  * the S_EXEC bit set.
  901                  */
  902                 STOPEVENT(p, S_EXEC, 0);
  903                 goto done2;
  904         }
  905 
  906 exec_fail:
  907         /* we're done here, clear P_INEXEC */
  908         PROC_LOCK(p);
  909         p->p_flag &= ~P_INEXEC;
  910         PROC_UNLOCK(p);
  911 
  912         SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
  913 
  914 done2:
  915 #ifdef MAC
  916         mac_execve_exit(imgp);
  917         mac_execve_interpreter_exit(interpvplabel);
  918 #endif
  919         exec_free_args(args);
  920 
  921         if (error && imgp->vmspace_destroyed) {
  922                 /* sorry, no more process anymore. exit gracefully */
  923                 exit1(td, W_EXITCODE(0, SIGABRT));
  924                 /* NOT REACHED */
  925         }
  926 
  927 #ifdef KTRACE
  928         if (error == 0)
  929                 ktrprocctor(p);
  930 #endif
  931 
  932         return (error);
  933 }
  934 
  935 int
  936 exec_map_first_page(imgp)
  937         struct image_params *imgp;
  938 {
  939         int rv, i;
  940         int initial_pagein;
  941         vm_page_t ma[VM_INITIAL_PAGEIN];
  942         vm_object_t object;
  943 
  944         if (imgp->firstpage != NULL)
  945                 exec_unmap_first_page(imgp);
  946 
  947         object = imgp->vp->v_object;
  948         if (object == NULL)
  949                 return (EACCES);
  950         VM_OBJECT_WLOCK(object);
  951 #if VM_NRESERVLEVEL > 0
  952         if ((object->flags & OBJ_COLORED) == 0) {
  953                 object->flags |= OBJ_COLORED;
  954                 object->pg_color = 0;
  955         }
  956 #endif
  957         ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL);
  958         if (ma[0]->valid != VM_PAGE_BITS_ALL) {
  959                 initial_pagein = VM_INITIAL_PAGEIN;
  960                 if (initial_pagein > object->size)
  961                         initial_pagein = object->size;
  962                 for (i = 1; i < initial_pagein; i++) {
  963                         if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
  964                                 if (ma[i]->valid)
  965                                         break;
  966                                 if (vm_page_tryxbusy(ma[i]))
  967                                         break;
  968                         } else {
  969                                 ma[i] = vm_page_alloc(object, i,
  970                                     VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
  971                                 if (ma[i] == NULL)
  972                                         break;
  973                         }
  974                 }
  975                 initial_pagein = i;
  976                 rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
  977                 ma[0] = vm_page_lookup(object, 0);
  978                 if ((rv != VM_PAGER_OK) || (ma[0] == NULL)) {
  979                         if (ma[0] != NULL) {
  980                                 vm_page_lock(ma[0]);
  981                                 vm_page_free(ma[0]);
  982                                 vm_page_unlock(ma[0]);
  983                         }
  984                         VM_OBJECT_WUNLOCK(object);
  985                         return (EIO);
  986                 }
  987         }
  988         vm_page_xunbusy(ma[0]);
  989         vm_page_lock(ma[0]);
  990         vm_page_hold(ma[0]);
  991         vm_page_activate(ma[0]);
  992         vm_page_unlock(ma[0]);
  993         VM_OBJECT_WUNLOCK(object);
  994 
  995         imgp->firstpage = sf_buf_alloc(ma[0], 0);
  996         imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
  997 
  998         return (0);
  999 }
 1000 
 1001 void
 1002 exec_unmap_first_page(imgp)
 1003         struct image_params *imgp;
 1004 {
 1005         vm_page_t m;
 1006 
 1007         if (imgp->firstpage != NULL) {
 1008                 m = sf_buf_page(imgp->firstpage);
 1009                 sf_buf_free(imgp->firstpage);
 1010                 imgp->firstpage = NULL;
 1011                 vm_page_lock(m);
 1012                 vm_page_unhold(m);
 1013                 vm_page_unlock(m);
 1014         }
 1015 }
 1016 
 1017 /*
 1018  * Destroy old address space, and allocate a new stack
 1019  *      The new stack is only SGROWSIZ large because it is grown
 1020  *      automatically in trap.c.
 1021  */
 1022 int
 1023 exec_new_vmspace(imgp, sv)
 1024         struct image_params *imgp;
 1025         struct sysentvec *sv;
 1026 {
 1027         int error;
 1028         struct proc *p = imgp->proc;
 1029         struct vmspace *vmspace = p->p_vmspace;
 1030         vm_object_t obj;
 1031         struct rlimit rlim_stack;
 1032         vm_offset_t sv_minuser, stack_addr;
 1033         vm_map_t map;
 1034         u_long ssiz;
 1035 
 1036         imgp->vmspace_destroyed = 1;
 1037         imgp->sysent = sv;
 1038 
 1039         /* May be called with Giant held */
 1040         EVENTHANDLER_INVOKE(process_exec, p, imgp);
 1041 
 1042         /*
 1043          * Blow away entire process VM, if address space not shared,
 1044          * otherwise, create a new VM space so that other threads are
 1045          * not disrupted
 1046          */
 1047         map = &vmspace->vm_map;
 1048         if (map_at_zero)
 1049                 sv_minuser = sv->sv_minuser;
 1050         else
 1051                 sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
 1052         if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
 1053             vm_map_max(map) == sv->sv_maxuser) {
 1054                 shmexit(vmspace);
 1055                 pmap_remove_pages(vmspace_pmap(vmspace));
 1056                 vm_map_remove(map, vm_map_min(map), vm_map_max(map));
 1057         } else {
 1058                 error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
 1059                 if (error)
 1060                         return (error);
 1061                 vmspace = p->p_vmspace;
 1062                 map = &vmspace->vm_map;
 1063         }
 1064 
 1065         /* Map a shared page */
 1066         obj = sv->sv_shared_page_obj;
 1067         if (obj != NULL) {
 1068                 vm_object_reference(obj);
 1069                 error = vm_map_fixed(map, obj, 0,
 1070                     sv->sv_shared_page_base, sv->sv_shared_page_len,
 1071                     VM_PROT_READ | VM_PROT_EXECUTE,
 1072                     VM_PROT_READ | VM_PROT_EXECUTE,
 1073                     MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
 1074                 if (error) {
 1075                         vm_object_deallocate(obj);
 1076                         return (error);
 1077                 }
 1078         }
 1079 
 1080         /* Allocate a new stack */
 1081         if (imgp->stack_sz != 0) {
 1082                 ssiz = trunc_page(imgp->stack_sz);
 1083                 PROC_LOCK(p);
 1084                 lim_rlimit(p, RLIMIT_STACK, &rlim_stack);
 1085                 PROC_UNLOCK(p);
 1086                 if (ssiz > rlim_stack.rlim_max)
 1087                         ssiz = rlim_stack.rlim_max;
 1088                 if (ssiz > rlim_stack.rlim_cur) {
 1089                         rlim_stack.rlim_cur = ssiz;
 1090                         kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
 1091                 }
 1092         } else if (sv->sv_maxssiz != NULL) {
 1093                 ssiz = *sv->sv_maxssiz;
 1094         } else {
 1095                 ssiz = maxssiz;
 1096         }
 1097         stack_addr = sv->sv_usrstack - ssiz;
 1098         error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 1099             obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
 1100                 sv->sv_stackprot,
 1101             VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 1102         if (error)
 1103                 return (error);
 1104 
 1105 #ifdef __ia64__
 1106         /* Allocate a new register stack */
 1107         stack_addr = IA64_BACKINGSTORE;
 1108         error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 1109             sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
 1110         if (error)
 1111                 return (error);
 1112 #endif
 1113 
 1114         /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
 1115          * VM_STACK case, but they are still used to monitor the size of the
 1116          * process stack so we can check the stack rlimit.
 1117          */
 1118         vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
 1119         vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
 1120 
 1121         return (0);
 1122 }
 1123 
 1124 /*
 1125  * Copy out argument and environment strings from the old process address
 1126  * space into the temporary string buffer.
 1127  */
 1128 int
 1129 exec_copyin_args(struct image_args *args, char *fname,
 1130     enum uio_seg segflg, char **argv, char **envv)
 1131 {
 1132         u_long argp, envp;
 1133         int error;
 1134         size_t length;
 1135 
 1136         bzero(args, sizeof(*args));
 1137         if (argv == NULL)
 1138                 return (EFAULT);
 1139 
 1140         /*
 1141          * Allocate demand-paged memory for the file name, argument, and
 1142          * environment strings.
 1143          */
 1144         error = exec_alloc_args(args);
 1145         if (error != 0)
 1146                 return (error);
 1147 
 1148         /*
 1149          * Copy the file name.
 1150          */
 1151         if (fname != NULL) {
 1152                 args->fname = args->buf;
 1153                 error = (segflg == UIO_SYSSPACE) ?
 1154                     copystr(fname, args->fname, PATH_MAX, &length) :
 1155                     copyinstr(fname, args->fname, PATH_MAX, &length);
 1156                 if (error != 0)
 1157                         goto err_exit;
 1158         } else
 1159                 length = 0;
 1160 
 1161         args->begin_argv = args->buf + length;
 1162         args->endp = args->begin_argv;
 1163         args->stringspace = ARG_MAX;
 1164 
 1165         /*
 1166          * extract arguments first
 1167          */
 1168         for (;;) {
 1169                 error = fueword(argv++, &argp);
 1170                 if (error == -1) {
 1171                         error = EFAULT;
 1172                         goto err_exit;
 1173                 }
 1174                 if (argp == 0)
 1175                         break;
 1176                 error = copyinstr((void *)(uintptr_t)argp, args->endp,
 1177                     args->stringspace, &length);
 1178                 if (error != 0) {
 1179                         if (error == ENAMETOOLONG) 
 1180                                 error = E2BIG;
 1181                         goto err_exit;
 1182                 }
 1183                 args->stringspace -= length;
 1184                 args->endp += length;
 1185                 args->argc++;
 1186         }
 1187 
 1188         args->begin_envv = args->endp;
 1189 
 1190         /*
 1191          * extract environment strings
 1192          */
 1193         if (envv) {
 1194                 for (;;) {
 1195                         error = fueword(envv++, &envp);
 1196                         if (error == -1) {
 1197                                 error = EFAULT;
 1198                                 goto err_exit;
 1199                         }
 1200                         if (envp == 0)
 1201                                 break;
 1202                         error = copyinstr((void *)(uintptr_t)envp,
 1203                             args->endp, args->stringspace, &length);
 1204                         if (error != 0) {
 1205                                 if (error == ENAMETOOLONG)
 1206                                         error = E2BIG;
 1207                                 goto err_exit;
 1208                         }
 1209                         args->stringspace -= length;
 1210                         args->endp += length;
 1211                         args->envc++;
 1212                 }
 1213         }
 1214 
 1215         return (0);
 1216 
 1217 err_exit:
 1218         exec_free_args(args);
 1219         return (error);
 1220 }
 1221 
 1222 /*
 1223  * Allocate temporary demand-paged, zero-filled memory for the file name,
 1224  * argument, and environment strings.  Returns zero if the allocation succeeds
 1225  * and ENOMEM otherwise.
 1226  */
 1227 int
 1228 exec_alloc_args(struct image_args *args)
 1229 {
 1230 
 1231         args->buf = (char *)kmap_alloc_wait(exec_map, PATH_MAX + ARG_MAX);
 1232         return (args->buf != NULL ? 0 : ENOMEM);
 1233 }
 1234 
 1235 void
 1236 exec_free_args(struct image_args *args)
 1237 {
 1238 
 1239         if (args->buf != NULL) {
 1240                 kmap_free_wakeup(exec_map, (vm_offset_t)args->buf,
 1241                     PATH_MAX + ARG_MAX);
 1242                 args->buf = NULL;
 1243         }
 1244         if (args->fname_buf != NULL) {
 1245                 free(args->fname_buf, M_TEMP);
 1246                 args->fname_buf = NULL;
 1247         }
 1248 }
 1249 
 1250 /*
 1251  * Copy strings out to the new process address space, constructing new arg
 1252  * and env vector tables. Return a pointer to the base so that it can be used
 1253  * as the initial stack pointer.
 1254  */
 1255 register_t *
 1256 exec_copyout_strings(imgp)
 1257         struct image_params *imgp;
 1258 {
 1259         int argc, envc;
 1260         char **vectp;
 1261         char *stringp;
 1262         uintptr_t destp;
 1263         register_t *stack_base;
 1264         struct ps_strings *arginfo;
 1265         struct proc *p;
 1266         size_t execpath_len;
 1267         int szsigcode, szps;
 1268         char canary[sizeof(long) * 8];
 1269 
 1270         szps = sizeof(pagesizes[0]) * MAXPAGESIZES;
 1271         /*
 1272          * Calculate string base and vector table pointers.
 1273          * Also deal with signal trampoline code for this exec type.
 1274          */
 1275         if (imgp->execpath != NULL && imgp->auxargs != NULL)
 1276                 execpath_len = strlen(imgp->execpath) + 1;
 1277         else
 1278                 execpath_len = 0;
 1279         p = imgp->proc;
 1280         szsigcode = 0;
 1281         arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
 1282         if (p->p_sysent->sv_sigcode_base == 0) {
 1283                 if (p->p_sysent->sv_szsigcode != NULL)
 1284                         szsigcode = *(p->p_sysent->sv_szsigcode);
 1285         }
 1286         destp = (uintptr_t)arginfo;
 1287 
 1288         /*
 1289          * install sigcode
 1290          */
 1291         if (szsigcode != 0) {
 1292                 destp -= szsigcode;
 1293                 destp = rounddown2(destp, sizeof(void *));
 1294                 copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode);
 1295         }
 1296 
 1297         /*
 1298          * Copy the image path for the rtld.
 1299          */
 1300         if (execpath_len != 0) {
 1301                 destp -= execpath_len;
 1302                 imgp->execpathp = destp;
 1303                 copyout(imgp->execpath, (void *)destp, execpath_len);
 1304         }
 1305 
 1306         /*
 1307          * Prepare the canary for SSP.
 1308          */
 1309         arc4rand(canary, sizeof(canary), 0);
 1310         destp -= sizeof(canary);
 1311         imgp->canary = destp;
 1312         copyout(canary, (void *)destp, sizeof(canary));
 1313         imgp->canarylen = sizeof(canary);
 1314 
 1315         /*
 1316          * Prepare the pagesizes array.
 1317          */
 1318         destp -= szps;
 1319         destp = rounddown2(destp, sizeof(void *));
 1320         imgp->pagesizes = destp;
 1321         copyout(pagesizes, (void *)destp, szps);
 1322         imgp->pagesizeslen = szps;
 1323 
 1324         destp -= ARG_MAX - imgp->args->stringspace;
 1325         destp = rounddown2(destp, sizeof(void *));
 1326 
 1327         /*
 1328          * If we have a valid auxargs ptr, prepare some room
 1329          * on the stack.
 1330          */
 1331         if (imgp->auxargs) {
 1332                 /*
 1333                  * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
 1334                  * lower compatibility.
 1335                  */
 1336                 imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
 1337                     (AT_COUNT * 2);
 1338                 /*
 1339                  * The '+ 2' is for the null pointers at the end of each of
 1340                  * the arg and env vector sets,and imgp->auxarg_size is room
 1341                  * for argument of Runtime loader.
 1342                  */
 1343                 vectp = (char **)(destp - (imgp->args->argc +
 1344                     imgp->args->envc + 2 + imgp->auxarg_size)
 1345                     * sizeof(char *));
 1346         } else {
 1347                 /*
 1348                  * The '+ 2' is for the null pointers at the end of each of
 1349                  * the arg and env vector sets
 1350                  */
 1351                 vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc
 1352                     + 2) * sizeof(char *));
 1353         }
 1354 
 1355         /*
 1356          * vectp also becomes our initial stack base
 1357          */
 1358         stack_base = (register_t *)vectp;
 1359 
 1360         stringp = imgp->args->begin_argv;
 1361         argc = imgp->args->argc;
 1362         envc = imgp->args->envc;
 1363 
 1364         /*
 1365          * Copy out strings - arguments and environment.
 1366          */
 1367         copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace);
 1368 
 1369         /*
 1370          * Fill in "ps_strings" struct for ps, w, etc.
 1371          */
 1372         suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
 1373         suword32(&arginfo->ps_nargvstr, argc);
 1374 
 1375         /*
 1376          * Fill in argument portion of vector table.
 1377          */
 1378         for (; argc > 0; --argc) {
 1379                 suword(vectp++, (long)(intptr_t)destp);
 1380                 while (*stringp++ != 0)
 1381                         destp++;
 1382                 destp++;
 1383         }
 1384 
 1385         /* a null vector table pointer separates the argp's from the envp's */
 1386         suword(vectp++, 0);
 1387 
 1388         suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
 1389         suword32(&arginfo->ps_nenvstr, envc);
 1390 
 1391         /*
 1392          * Fill in environment portion of vector table.
 1393          */
 1394         for (; envc > 0; --envc) {
 1395                 suword(vectp++, (long)(intptr_t)destp);
 1396                 while (*stringp++ != 0)
 1397                         destp++;
 1398                 destp++;
 1399         }
 1400 
 1401         /* end of vector table is a null pointer */
 1402         suword(vectp, 0);
 1403 
 1404         return (stack_base);
 1405 }
 1406 
 1407 /*
 1408  * Check permissions of file to execute.
 1409  *      Called with imgp->vp locked.
 1410  *      Return 0 for success or error code on failure.
 1411  */
 1412 int
 1413 exec_check_permissions(imgp)
 1414         struct image_params *imgp;
 1415 {
 1416         struct vnode *vp = imgp->vp;
 1417         struct vattr *attr = imgp->attr;
 1418         struct thread *td;
 1419         int error, writecount;
 1420 
 1421         td = curthread;
 1422 
 1423         /* Get file attributes */
 1424         error = VOP_GETATTR(vp, attr, td->td_ucred);
 1425         if (error)
 1426                 return (error);
 1427 
 1428 #ifdef MAC
 1429         error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
 1430         if (error)
 1431                 return (error);
 1432 #endif
 1433 
 1434         /*
 1435          * 1) Check if file execution is disabled for the filesystem that
 1436          *    this file resides on.
 1437          * 2) Ensure that at least one execute bit is on. Otherwise, a
 1438          *    privileged user will always succeed, and we don't want this
 1439          *    to happen unless the file really is executable.
 1440          * 3) Ensure that the file is a regular file.
 1441          */
 1442         if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
 1443             (attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 ||
 1444             (attr->va_type != VREG))
 1445                 return (EACCES);
 1446 
 1447         /*
 1448          * Zero length files can't be exec'd
 1449          */
 1450         if (attr->va_size == 0)
 1451                 return (ENOEXEC);
 1452 
 1453         /*
 1454          *  Check for execute permission to file based on current credentials.
 1455          */
 1456         error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 1457         if (error)
 1458                 return (error);
 1459 
 1460         /*
 1461          * Check number of open-for-writes on the file and deny execution
 1462          * if there are any.
 1463          */
 1464         error = VOP_GET_WRITECOUNT(vp, &writecount);
 1465         if (error != 0)
 1466                 return (error);
 1467         if (writecount != 0)
 1468                 return (ETXTBSY);
 1469 
 1470         /*
 1471          * Call filesystem specific open routine (which does nothing in the
 1472          * general case).
 1473          */
 1474         error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 1475         if (error == 0)
 1476                 imgp->opened = 1;
 1477         return (error);
 1478 }
 1479 
 1480 /*
 1481  * Exec handler registration
 1482  */
 1483 int
 1484 exec_register(execsw_arg)
 1485         const struct execsw *execsw_arg;
 1486 {
 1487         const struct execsw **es, **xs, **newexecsw;
 1488         int count = 2;  /* New slot and trailing NULL */
 1489 
 1490         if (execsw)
 1491                 for (es = execsw; *es; es++)
 1492                         count++;
 1493         newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
 1494         if (newexecsw == NULL)
 1495                 return (ENOMEM);
 1496         xs = newexecsw;
 1497         if (execsw)
 1498                 for (es = execsw; *es; es++)
 1499                         *xs++ = *es;
 1500         *xs++ = execsw_arg;
 1501         *xs = NULL;
 1502         if (execsw)
 1503                 free(execsw, M_TEMP);
 1504         execsw = newexecsw;
 1505         return (0);
 1506 }
 1507 
 1508 int
 1509 exec_unregister(execsw_arg)
 1510         const struct execsw *execsw_arg;
 1511 {
 1512         const struct execsw **es, **xs, **newexecsw;
 1513         int count = 1;
 1514 
 1515         if (execsw == NULL)
 1516                 panic("unregister with no handlers left?\n");
 1517 
 1518         for (es = execsw; *es; es++) {
 1519                 if (*es == execsw_arg)
 1520                         break;
 1521         }
 1522         if (*es == NULL)
 1523                 return (ENOENT);
 1524         for (es = execsw; *es; es++)
 1525                 if (*es != execsw_arg)
 1526                         count++;
 1527         newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
 1528         if (newexecsw == NULL)
 1529                 return (ENOMEM);
 1530         xs = newexecsw;
 1531         for (es = execsw; *es; es++)
 1532                 if (*es != execsw_arg)
 1533                         *xs++ = *es;
 1534         *xs = NULL;
 1535         if (execsw)
 1536                 free(execsw, M_TEMP);
 1537         execsw = newexecsw;
 1538         return (0);
 1539 }
Cache object: 53d7784b35fe9d31eb517a843086da44
[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ] [ list types ] [ track identifier ]
This page is part of the FreeBSD/Linux Kernel Cross-Reference and was automatically generated using a modified version of the LXR engine.
FreeBSD/Linux Kernel Cross Reference sys/kern/kern_exec.c

FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_exec.c