FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_exec.c
1 /*-
2 * Copyright (c) 1993, David Greenman
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD: releng/6.2/sys/kern/kern_exec.c 164286 2006-11-14 20:42:41Z cvs2svn $");
29
30 #include "opt_hwpmc_hooks.h"
31 #include "opt_ktrace.h"
32 #include "opt_mac.h"
33
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/eventhandler.h>
37 #include <sys/lock.h>
38 #include <sys/mutex.h>
39 #include <sys/sysproto.h>
40 #include <sys/signalvar.h>
41 #include <sys/kernel.h>
42 #include <sys/mac.h>
43 #include <sys/mount.h>
44 #include <sys/filedesc.h>
45 #include <sys/fcntl.h>
46 #include <sys/acct.h>
47 #include <sys/exec.h>
48 #include <sys/imgact.h>
49 #include <sys/imgact_elf.h>
50 #include <sys/wait.h>
51 #include <sys/malloc.h>
52 #include <sys/proc.h>
53 #include <sys/pioctl.h>
54 #include <sys/namei.h>
55 #include <sys/resourcevar.h>
56 #include <sys/sf_buf.h>
57 #include <sys/syscallsubr.h>
58 #include <sys/sysent.h>
59 #include <sys/shm.h>
60 #include <sys/sysctl.h>
61 #include <sys/vnode.h>
62 #ifdef KTRACE
63 #include <sys/ktrace.h>
64 #endif
65
66 #include <vm/vm.h>
67 #include <vm/vm_param.h>
68 #include <vm/pmap.h>
69 #include <vm/vm_page.h>
70 #include <vm/vm_map.h>
71 #include <vm/vm_kern.h>
72 #include <vm/vm_extern.h>
73 #include <vm/vm_object.h>
74 #include <vm/vm_pager.h>
75
76 #ifdef HWPMC_HOOKS
77 #include <sys/pmckern.h>
78 #endif
79
80 #include <machine/reg.h>
81
82 #include <security/audit/audit.h>
83
84 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
85
86 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
87 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
88 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
89 static int do_execve(struct thread *td, struct image_args *args,
90 struct mac *mac_p);
91
92 /* XXX This should be vm_size_t. */
93 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
94 NULL, 0, sysctl_kern_ps_strings, "LU", "");
95
96 /* XXX This should be vm_size_t. */
97 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
98 NULL, 0, sysctl_kern_usrstack, "LU", "");
99
100 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
101 NULL, 0, sysctl_kern_stackprot, "I", "");
102
103 u_long ps_arg_cache_limit = PAGE_SIZE / 16;
104 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
105 &ps_arg_cache_limit, 0, "");
106
107 static int
108 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
109 {
110 struct proc *p;
111 int error;
112
113 p = curproc;
114 #ifdef SCTL_MASK32
115 if (req->flags & SCTL_MASK32) {
116 unsigned int val;
117 val = (unsigned int)p->p_sysent->sv_psstrings;
118 error = SYSCTL_OUT(req, &val, sizeof(val));
119 } else
120 #endif
121 error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
122 sizeof(p->p_sysent->sv_psstrings));
123 return error;
124 }
125
126 static int
127 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
128 {
129 struct proc *p;
130 int error;
131
132 p = curproc;
133 #ifdef SCTL_MASK32
134 if (req->flags & SCTL_MASK32) {
135 unsigned int val;
136 val = (unsigned int)p->p_sysent->sv_usrstack;
137 error = SYSCTL_OUT(req, &val, sizeof(val));
138 } else
139 #endif
140 error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
141 sizeof(p->p_sysent->sv_usrstack));
142 return error;
143 }
144
145 static int
146 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
147 {
148 struct proc *p;
149
150 p = curproc;
151 return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
152 sizeof(p->p_sysent->sv_stackprot)));
153 }
154
155 /*
156 * Each of the items is a pointer to a `const struct execsw', hence the
157 * double pointer here.
158 */
159 static const struct execsw **execsw;
160
161 #ifndef _SYS_SYSPROTO_H_
162 struct execve_args {
163 char *fname;
164 char **argv;
165 char **envv;
166 };
167 #endif
168
169 /*
170 * MPSAFE
171 */
172 int
173 execve(td, uap)
174 struct thread *td;
175 struct execve_args /* {
176 char *fname;
177 char **argv;
178 char **envv;
179 } */ *uap;
180 {
181 int error;
182 struct image_args args;
183
184 error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
185 uap->argv, uap->envv);
186
187 if (error == 0)
188 error = kern_execve(td, &args, NULL);
189
190 exec_free_args(&args);
191
192 return (error);
193 }
194
195 #ifndef _SYS_SYSPROTO_H_
196 struct __mac_execve_args {
197 char *fname;
198 char **argv;
199 char **envv;
200 struct mac *mac_p;
201 };
202 #endif
203
204 /*
205 * MPSAFE
206 */
207 int
208 __mac_execve(td, uap)
209 struct thread *td;
210 struct __mac_execve_args /* {
211 char *fname;
212 char **argv;
213 char **envv;
214 struct mac *mac_p;
215 } */ *uap;
216 {
217 #ifdef MAC
218 int error;
219 struct image_args args;
220
221 error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
222 uap->argv, uap->envv);
223
224 if (error == 0)
225 error = kern_execve(td, &args, uap->mac_p);
226
227 exec_free_args(&args);
228
229 return (error);
230 #else
231 return (ENOSYS);
232 #endif
233 }
234
/*
 * XXX: kern_execve has the astonishing property of not always
 * returning to the caller.  If sufficiently bad things happen during
 * the call to do_execve(), it can end up calling exit1(); as a result,
 * callers must avoid doing anything which they might need to undo
 * (e.g., allocating memory).
 *
 * Brackets do_execve() with the single-threading protocol required to
 * replace the address space of a (possibly multithreaded) process.
 * Returns 0 on success, ERESTART if single-threading could not be
 * entered, or an errno from do_execve().
 */
int
kern_execve(td, args, mac_p)
	struct thread *td;
	struct image_args *args;
	struct mac *mac_p;
{
	struct proc *p = td->td_proc;
	int error;

	/* Record the argument and environment vectors for auditing. */
	AUDIT_ARG(argv, args->begin_argv, args->argc,
	    args->begin_envv - args->begin_argv);
	AUDIT_ARG(envv, args->begin_envv, args->envc,
	    args->endp - args->begin_envv);
	/*
	 * A process that has ever been multithreaded must be brought to
	 * single-thread mode before its VM space may be replaced.
	 * thread_single() can fail (e.g. if the process is exiting), in
	 * which case the caller is asked to retry the whole syscall.
	 */
	if (p->p_flag & P_HADTHREADS) {
		PROC_LOCK(p);
		if (thread_single(SINGLE_BOUNDARY)) {
			PROC_UNLOCK(p);
			return (ERESTART);	/* Try again later. */
		}
		PROC_UNLOCK(p);
	}

	error = do_execve(td, args, mac_p);

	if (p->p_flag & P_HADTHREADS) {
		PROC_LOCK(p);
		/*
		 * If success, we upgrade to SINGLE_EXIT state to
		 * force other threads to suicide.
		 */
		if (error == 0)
			thread_single(SINGLE_EXIT);
		else
			thread_single_end();
		PROC_UNLOCK(p);
	}

	return (error);
}
281
/*
 * In-kernel implementation of execve().  All arguments are assumed to be
 * userspace pointers from the passed thread.
 *
 * Locates and validates the image, runs the image activators (looping
 * through interpreters for scripts), builds the new user stack, swaps
 * credentials for set-id/MAC-transition images, and finally sets the
 * initial register state.  If the old address space has already been
 * destroyed when a failure occurs, the process cannot be resumed and is
 * terminated via exit1() — this function then does not return.
 *
 * MPSAFE
 */
static int
do_execve(td, args, mac_p)
	struct thread *td;
	struct image_args *args;
	struct mac *mac_p;
{
	struct proc *p = td->td_proc;
	struct nameidata nd, *ndp;
	struct ucred *newcred = NULL, *oldcred;
	struct uidinfo *euip;
	register_t *stack_base;
	int error, len, i;
	struct image_params image_params, *imgp;
	struct vattr attr;
	int (*img_first)(struct image_params *);
	struct pargs *oldargs = NULL, *newargs = NULL;
	struct sigacts *oldsigacts, *newsigacts;
#ifdef KTRACE
	struct vnode *tracevp = NULL;
	struct ucred *tracecred = NULL;
#endif
	struct vnode *textvp = NULL;
	int credential_changing;
	int vfslocked;
	int textset;
#ifdef MAC
	struct label *interplabel = NULL;
	int will_transition;
#endif
#ifdef HWPMC_HOOKS
	struct pmckern_procexec pe;
#endif

	vfslocked = 0;
	imgp = &image_params;

	/*
	 * Lock the process and set the P_INEXEC flag to indicate that
	 * it should be left alone until we're done here.  This is
	 * necessary to avoid race conditions - e.g. in ptrace() -
	 * that might allow a local user to illicitly obtain elevated
	 * privileges.
	 */
	PROC_LOCK(p);
	KASSERT((p->p_flag & P_INEXEC) == 0,
	    ("%s(): process already has P_INEXEC flag", __func__));
	p->p_flag |= P_INEXEC;
	PROC_UNLOCK(p);

	/*
	 * Initialize part of the common data
	 */
	imgp->proc = p;
	imgp->execlabel = NULL;
	imgp->attr = &attr;
	imgp->entry_addr = 0;
	imgp->vmspace_destroyed = 0;
	imgp->interpreted = 0;
	/*
	 * The interpreter path scratch area lives past the argument
	 * (ARG_MAX) and fname (PATH_MAX) regions of the args buffer
	 * allocated by exec_copyin_args().
	 */
	imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX;
	imgp->auxargs = NULL;
	imgp->vp = NULL;
	imgp->object = NULL;
	imgp->firstpage = NULL;
	imgp->ps_strings = 0;
	imgp->auxarg_size = 0;
	imgp->args = args;

#ifdef MAC
	error = mac_execve_enter(imgp, mac_p);
	if (error)
		goto exec_fail;
#endif

	imgp->image_header = NULL;

	/*
	 * Translate the file name.  namei() returns a vnode pointer
	 * in ni_vp among other things.
	 *
	 * XXXAUDIT: It would be desirable to also audit the name of the
	 * interpreter if this is an interpreted binary.
	 */
	ndp = &nd;
	NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE |
	    AUDITVNODE1, UIO_SYSSPACE, args->fname, td);

interpret:
	error = namei(ndp);
	if (error)
		goto exec_fail;

	vfslocked = NDHASGIANT(ndp);
	imgp->vp = ndp->ni_vp;

	/*
	 * Check file permissions (also 'opens' file)
	 */
	error = exec_check_permissions(imgp);
	if (error)
		goto exec_fail_dealloc;

	imgp->object = imgp->vp->v_object;
	if (imgp->object != NULL)
		vm_object_reference(imgp->object);

	/*
	 * Set VV_TEXT now so no one can write to the executable while we're
	 * activating it.
	 *
	 * Remember if this was set before and unset it in case this is not
	 * actually an executable image.
	 */
	textset = imgp->vp->v_vflag & VV_TEXT;
	imgp->vp->v_vflag |= VV_TEXT;

	error = exec_map_first_page(imgp);
	if (error)
		goto exec_fail_dealloc;

	/*
	 * If the current process has a special image activator it
	 * wants to try first, call it.   For example, emulating shell
	 * scripts differently.
	 */
	error = -1;
	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
		error = img_first(imgp);

	/*
	 * Loop through the list of image activators, calling each one.
	 * An activator returns -1 if there is no match, 0 on success,
	 * and an error otherwise.
	 */
	for (i = 0; error == -1 && execsw[i]; ++i) {
		if (execsw[i]->ex_imgact == NULL ||
		    execsw[i]->ex_imgact == img_first) {
			continue;
		}
		error = (*execsw[i]->ex_imgact)(imgp);
	}

	if (error) {
		if (error == -1) {
			/* No activator claimed the image: not executable. */
			if (textset == 0)
				imgp->vp->v_vflag &= ~VV_TEXT;
			error = ENOEXEC;
		}
		goto exec_fail_dealloc;
	}

	/*
	 * Special interpreter operation, cleanup and loop up to try to
	 * activate the interpreter.
	 */
	if (imgp->interpreted) {
		exec_unmap_first_page(imgp);
		/*
		 * VV_TEXT needs to be unset for scripts.  There is a short
		 * period before we determine that something is a script where
		 * VV_TEXT will be set. The vnode lock is held over this
		 * entire period so nothing should illegitimately be blocked.
		 */
		imgp->vp->v_vflag &= ~VV_TEXT;
		/* free name buffer and old vnode */
		NDFREE(ndp, NDF_ONLY_PNBUF);
#ifdef MAC
		interplabel = mac_vnode_label_alloc();
		mac_copy_vnode_label(ndp->ni_vp->v_label, interplabel);
#endif
		vput(ndp->ni_vp);
		vm_object_deallocate(imgp->object);
		imgp->object = NULL;
		VFS_UNLOCK_GIANT(vfslocked);
		vfslocked = 0;
		/* set new name to that of the interpreter */
		NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
		    UIO_SYSSPACE, imgp->interpreter_name, td);
		goto interpret;
	}

	/*
	 * Copy out strings (args and env) and initialize stack base
	 */
	if (p->p_sysent->sv_copyout_strings)
		stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
	else
		stack_base = exec_copyout_strings(imgp);

	/*
	 * If custom stack fixup routine present for this process
	 * let it do the stack setup.
	 * Else stuff argument count as first item on stack
	 */
	if (p->p_sysent->sv_fixup != NULL)
		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
	else
		suword(--stack_base, imgp->args->argc);

	/*
	 * For security and other reasons, the file descriptor table cannot
	 * be shared after an exec.
	 */
	fdunshare(p, td);

	/*
	 * Malloc things before we need locks.
	 */
	newcred = crget();
	euip = uifind(attr.va_uid);
	i = imgp->args->begin_envv - imgp->args->begin_argv;
	/* Cache arguments if they fit inside our allowance */
	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
		newargs = pargs_alloc(i);
		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
	}

	/* close files on exec */
	/* fdcloseexec() may sleep, so drop the vnode lock around it. */
	VOP_UNLOCK(imgp->vp, 0, td);
	fdcloseexec(td);
	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);

	/* Get a reference to the vnode prior to locking the proc */
	VREF(ndp->ni_vp);

	/*
	 * For security and other reasons, signal handlers cannot
	 * be shared after an exec. The new process gets a copy of the old
	 * handlers. In execsigs(), the new process will have its signals
	 * reset.
	 */
	PROC_LOCK(p);
	if (sigacts_shared(p->p_sigacts)) {
		oldsigacts = p->p_sigacts;
		PROC_UNLOCK(p);
		newsigacts = sigacts_alloc();
		sigacts_copy(newsigacts, oldsigacts);
		PROC_LOCK(p);
		p->p_sigacts = newsigacts;
	} else
		oldsigacts = NULL;

	/* Stop profiling */
	stopprofclock(p);

	/* reset caught signals */
	execsigs(p);

	/* name this process - nameiexec(p, ndp) */
	len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
	bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
	p->p_comm[len] = 0;

	/*
	 * mark as execed, wakeup the process that vforked (if any) and tell
	 * it that it now has its own resources back
	 */
	p->p_flag |= P_EXEC;
	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
		p->p_flag &= ~P_PPWAIT;
		wakeup(p->p_pptr);
	}

	/*
	 * Implement image setuid/setgid.
	 *
	 * Don't honor setuid/setgid if the filesystem prohibits it or if
	 * the process is being traced.
	 *
	 * XXXMAC: For the time being, use NOSUID to also prohibit
	 * transitions on the file system.
	 */
	oldcred = p->p_ucred;
	credential_changing = 0;
	credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid !=
	    attr.va_uid;
	credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid !=
	    attr.va_gid;
#ifdef MAC
	will_transition = mac_execve_will_transition(oldcred, imgp->vp,
	    interplabel, imgp);
	credential_changing |= will_transition;
#endif

	if (credential_changing &&
	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
	    (p->p_flag & P_TRACED) == 0) {
		/*
		 * Turn off syscall tracing for set-id programs, except for
		 * root.  Record any set-id flags first to make sure that
		 * we do not regain any tracing during a possible block.
		 */
		setsugid(p);
#ifdef KTRACE
		if (p->p_tracevp != NULL && suser_cred(oldcred, SUSER_ALLOWJAIL)) {
			mtx_lock(&ktrace_mtx);
			p->p_traceflag = 0;
			tracevp = p->p_tracevp;
			p->p_tracevp = NULL;
			tracecred = p->p_tracecred;
			p->p_tracecred = NULL;
			mtx_unlock(&ktrace_mtx);
		}
#endif
		/*
		 * Close any file descriptors 0..2 that reference procfs,
		 * then make sure file descriptors 0..2 are in use.
		 *
		 * setugidsafety() may call closef() and then pfind()
		 * which may grab the process lock.
		 * fdcheckstd() may call falloc() which may block to
		 * allocate memory, so temporarily drop the process lock.
		 */
		PROC_UNLOCK(p);
		setugidsafety(td);
		VOP_UNLOCK(imgp->vp, 0, td);
		error = fdcheckstd(td);
		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
		if (error != 0)
			goto done1;
		PROC_LOCK(p);
		/*
		 * Set the new credentials.
		 */
		crcopy(newcred, oldcred);
		if (attr.va_mode & VSUID)
			change_euid(newcred, euip);
		if (attr.va_mode & VSGID)
			change_egid(newcred, attr.va_gid);
#ifdef MAC
		if (will_transition) {
			mac_execve_transition(oldcred, newcred, imgp->vp,
			    interplabel, imgp);
		}
#endif
		/*
		 * Implement correct POSIX saved-id behavior.
		 *
		 * XXXMAC: Note that the current logic will save the
		 * uid and gid if a MAC domain transition occurs, even
		 * though maybe it shouldn't.
		 */
		change_svuid(newcred, newcred->cr_uid);
		change_svgid(newcred, newcred->cr_gid);
		p->p_ucred = newcred;
		/* newcred is now installed; don't free it in cleanup. */
		newcred = NULL;
	} else {
		if (oldcred->cr_uid == oldcred->cr_ruid &&
		    oldcred->cr_gid == oldcred->cr_rgid)
			p->p_flag &= ~P_SUGID;
		/*
		 * Implement correct POSIX saved-id behavior.
		 *
		 * XXX: It's not clear that the existing behavior is
		 * POSIX-compliant.  A number of sources indicate that the
		 * saved uid/gid should only be updated if the new ruid is
		 * not equal to the old ruid, or the new euid is not equal
		 * to the old euid and the new euid is not equal to the old
		 * ruid.  The FreeBSD code always updates the saved uid/gid.
		 * Also, this code uses the new (replaced) euid and egid as
		 * the source, which may or may not be the right ones to use.
		 */
		if (oldcred->cr_svuid != oldcred->cr_uid ||
		    oldcred->cr_svgid != oldcred->cr_gid) {
			crcopy(newcred, oldcred);
			change_svuid(newcred, newcred->cr_uid);
			change_svgid(newcred, newcred->cr_gid);
			p->p_ucred = newcred;
			newcred = NULL;
		}
	}

	/*
	 * Store the vp for use in procfs.  This vnode was referenced prior
	 * to locking the proc lock.
	 */
	textvp = p->p_textvp;
	p->p_textvp = ndp->ni_vp;

	/*
	 * Notify others that we exec'd, and clear the P_INEXEC flag
	 * as we're now a bona fide freshly-execed process.
	 */
	KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
	p->p_flag &= ~P_INEXEC;

	/*
	 * If tracing the process, trap to debugger so breakpoints
	 * can be set before the program executes.
	 * Use tdsignal to deliver signal to current thread, use
	 * psignal may cause the signal to be delivered to wrong thread
	 * because that thread will exit, remember we are going to enter
	 * single thread mode.
	 */
	if (p->p_flag & P_TRACED)
		tdsignal(td, SIGTRAP, SIGTARGET_TD);

	/* clear "fork but no exec" flag, as we _are_ execing */
	p->p_acflag &= ~AFORK;

	/*
	 * Free any previous argument cache and replace it with
	 * the new argument cache, if any.
	 */
	oldargs = p->p_args;
	p->p_args = newargs;
	newargs = NULL;

#ifdef	HWPMC_HOOKS
	/*
	 * Check if system-wide sampling is in effect or if the
	 * current process is using PMCs.  If so, do exec() time
	 * processing.  This processing needs to happen AFTER the
	 * P_INEXEC flag is cleared.
	 *
	 * The proc lock needs to be released before taking the PMC
	 * SX.
	 */
	if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
		PROC_UNLOCK(p);
		pe.pm_credentialschanged = credential_changing;
		pe.pm_entryaddr = imgp->entry_addr;

		PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
	} else
		PROC_UNLOCK(p);
#else  /* !HWPMC_HOOKS */
	PROC_UNLOCK(p);
#endif

	/* Set values passed into the program in registers. */
	if (p->p_sysent->sv_setregs)
		(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
	else
		exec_setregs(td, imgp->entry_addr,
		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);

	vfs_mark_atime(imgp->vp, td);

done1:
	/*
	 * Free any resources malloc'd earlier that we didn't use.
	 */
	uifree(euip);
	if (newcred == NULL)
		crfree(oldcred);
	else
		crfree(newcred);
	VOP_UNLOCK(imgp->vp, 0, td);
	/*
	 * Handle deferred decrement of ref counts.
	 */
	if (textvp != NULL) {
		int tvfslocked;

		tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
		vrele(textvp);
		VFS_UNLOCK_GIANT(tvfslocked);
	}
	if (ndp->ni_vp && error != 0)
		vrele(ndp->ni_vp);
#ifdef KTRACE
	if (tracevp != NULL)
		vrele(tracevp);
	if (tracecred != NULL)
		crfree(tracecred);
#endif
	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
	if (oldargs != NULL)
		pargs_drop(oldargs);
	if (newargs != NULL)
		pargs_drop(newargs);
	if (oldsigacts != NULL)
		sigacts_free(oldsigacts);

exec_fail_dealloc:

	/*
	 * free various allocated resources
	 */
	if (imgp->firstpage != NULL)
		exec_unmap_first_page(imgp);

	if (imgp->vp != NULL) {
		NDFREE(ndp, NDF_ONLY_PNBUF);
		vput(imgp->vp);
	}

	if (imgp->object != NULL)
		vm_object_deallocate(imgp->object);

	if (error == 0) {
		/*
		 * Stop the process here if its stop event mask has
		 * the S_EXEC bit set.
		 */
		STOPEVENT(p, S_EXEC, 0);
		goto done2;
	}

exec_fail:
	/* we're done here, clear P_INEXEC */
	PROC_LOCK(p);
	p->p_flag &= ~P_INEXEC;
	PROC_UNLOCK(p);

	if (imgp->vmspace_destroyed) {
		/* sorry, no more process anymore. exit gracefully */
#ifdef MAC
		mac_execve_exit(imgp);
		if (interplabel != NULL)
			mac_vnode_label_free(interplabel);
#endif
		VFS_UNLOCK_GIANT(vfslocked);
		exec_free_args(args);
		exit1(td, W_EXITCODE(0, SIGABRT));
		/* NOT REACHED */
		error = 0;
	}
done2:
#ifdef MAC
	mac_execve_exit(imgp);
	if (interplabel != NULL)
		mac_vnode_label_free(interplabel);
#endif
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
816
/*
 * Map the first page of the executable into the kernel via an sf_buf so
 * that the image activators can examine the header.  If the first page
 * is not yet valid, page it in (clustering up to VM_INITIAL_PAGEIN pages
 * of read-ahead).  On success, imgp->firstpage and imgp->image_header
 * are set and the page is held; release with exec_unmap_first_page().
 *
 * Returns 0 on success, EACCES if the vnode has no backing VM object,
 * or EIO if the pager could not produce a valid first page.
 */
int
exec_map_first_page(imgp)
	struct image_params *imgp;
{
	int rv, i;
	int initial_pagein;
	vm_page_t ma[VM_INITIAL_PAGEIN];
	vm_object_t object;

	/* Drop any previously-mapped first page (e.g. from a script pass). */
	if (imgp->firstpage != NULL)
		exec_unmap_first_page(imgp);

	object = imgp->vp->v_object;
	if (object == NULL)
		return (EACCES);
	VM_OBJECT_LOCK(object);
	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
	if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
		/*
		 * Gather additional in-range pages for a clustered pagein,
		 * stopping at the first page that is already valid, busy,
		 * or cannot be allocated.
		 */
		initial_pagein = VM_INITIAL_PAGEIN;
		if (initial_pagein > object->size)
			initial_pagein = object->size;
		for (i = 1; i < initial_pagein; i++) {
			if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
				if (ma[i]->valid)
					break;
				vm_page_lock_queues();
				if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) {
					vm_page_unlock_queues();
					break;
				}
				vm_page_busy(ma[i]);
				vm_page_unlock_queues();
			} else {
				ma[i] = vm_page_alloc(object, i,
				    VM_ALLOC_NORMAL);
				if (ma[i] == NULL)
					break;
			}
		}
		initial_pagein = i;
		rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
		/* The pager may have replaced the page; look it up again. */
		ma[0] = vm_page_lookup(object, 0);
		if ((rv != VM_PAGER_OK) || (ma[0] == NULL) ||
		    (ma[0]->valid == 0)) {
			if (ma[0]) {
				vm_page_lock_queues();
				pmap_remove_all(ma[0]);
				vm_page_free(ma[0]);
				vm_page_unlock_queues();
			}
			VM_OBJECT_UNLOCK(object);
			return (EIO);
		}
	}
	vm_page_lock_queues();
	/* Hold the page so it cannot be freed while mapped via the sf_buf. */
	vm_page_hold(ma[0]);
	vm_page_wakeup(ma[0]);
	vm_page_unlock_queues();
	VM_OBJECT_UNLOCK(object);

	imgp->firstpage = sf_buf_alloc(ma[0], 0);
	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);

	return (0);
}
882
883 void
884 exec_unmap_first_page(imgp)
885 struct image_params *imgp;
886 {
887 vm_page_t m;
888
889 if (imgp->firstpage != NULL) {
890 m = sf_buf_page(imgp->firstpage);
891 sf_buf_free(imgp->firstpage);
892 imgp->firstpage = NULL;
893 vm_page_lock_queues();
894 vm_page_unhold(m);
895 vm_page_unlock_queues();
896 }
897 }
898
/*
 * Destroy old address space, and allocate a new stack
 *	The new stack is only SGROWSIZ large because it is grown
 *	automatically in trap.c.
 *
 * Once this has run, imgp->vmspace_destroyed is set and a later failure
 * can no longer return to the original program image.  Returns 0 on
 * success or an errno from vm_map_stack().
 */
int
exec_new_vmspace(imgp, sv)
	struct image_params *imgp;
	struct sysentvec *sv;
{
	int error;
	struct proc *p = imgp->proc;
	struct vmspace *vmspace = p->p_vmspace;
	vm_offset_t stack_addr;
	vm_map_t map;

	imgp->vmspace_destroyed = 1;

	/* Called with Giant held, do not depend on it! */
	EVENTHANDLER_INVOKE(process_exec, p);

	/*
	 * Here is as good a place as any to do any resource limit cleanups.
	 * This is needed if a 64 bit binary exec's a 32 bit binary - the
	 * data size limit may need to be changed to a value that makes
	 * sense for the 32 bit binary.
	 */
	if (sv->sv_fixlimits != NULL)
		sv->sv_fixlimits(imgp);

	/*
	 * Blow away entire process VM, if address space not shared,
	 * otherwise, create a new VM space so that other threads are
	 * not disrupted
	 */
	map = &vmspace->vm_map;
	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser &&
	    vm_map_max(map) == sv->sv_maxuser) {
		/* Sole user and matching bounds: reuse the vmspace in place. */
		shmexit(vmspace);
		pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map),
		    vm_map_max(map));
		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
	} else {
		vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
		vmspace = p->p_vmspace;
		map = &vmspace->vm_map;
	}

	/* Allocate a new stack */
	stack_addr = sv->sv_usrstack - maxssiz;
	error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
	if (error)
		return (error);

#ifdef __ia64__
	/* Allocate a new register stack */
	stack_addr = IA64_BACKINGSTORE;
	error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
	if (error)
		return (error);
#endif

	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
	 * VM_STACK case, but they are still used to monitor the size of the
	 * process stack so we can check the stack rlimit.
	 */
	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz;

	return (0);
}
972
/*
 * Copy out argument and environment strings from the old process
 *	address space into the temporary string buffer.
 *
 * Buffer layout: argument/environment strings at args->buf (ARG_MAX
 * bytes), the executable path at args->buf + ARG_MAX (PATH_MAX bytes),
 * and MAXSHELLCMDLEN of interpreter-name scratch beyond that.
 *
 * NOTE(review): error returns leave args->buf allocated; callers are
 * expected to call exec_free_args() unconditionally (as execve() does).
 *
 * Returns 0 on success; EFAULT on a bad user pointer, E2BIG if the
 * strings exceed ARG_MAX, ENOMEM if the exec_map buffer cannot be
 * obtained, or another copyin error.
 */
int
exec_copyin_args(struct image_args *args, char *fname,
    enum uio_seg segflg, char **argv, char **envv)
{
	char *argp, *envp;
	int error;
	size_t length;

	error = 0;

	bzero(args, sizeof(*args));
	if (argv == NULL)
		return (EFAULT);
	/*
	 * Allocate temporary demand zeroed space for argument and
	 *	environment strings:
	 *
	 *	o ARG_MAX for argument and environment;
	 *	o MAXSHELLCMDLEN for the name of interpreters.
	 */
	args->buf = (char *) kmem_alloc_wait(exec_map,
	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
	if (args->buf == NULL)
		return (ENOMEM);
	args->begin_argv = args->buf;
	args->endp = args->begin_argv;
	args->stringspace = ARG_MAX;

	args->fname = args->buf + ARG_MAX;

	/*
	 * Copy the file name.
	 */
	error = (segflg == UIO_SYSSPACE) ?
	    copystr(fname, args->fname, PATH_MAX, &length) :
	    copyinstr(fname, args->fname, PATH_MAX, &length);
	if (error != 0)
		return (error);

	/*
	 * extract arguments first
	 */
	/* fuword() returns -1 on fault; a NULL entry terminates the vector. */
	while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
		if (argp == (caddr_t) -1)
			return (EFAULT);
		if ((error = copyinstr(argp, args->endp,
		    args->stringspace, &length))) {
			if (error == ENAMETOOLONG)
				return (E2BIG);
			return (error);
		}
		/* length includes the NUL terminator. */
		args->stringspace -= length;
		args->endp += length;
		args->argc++;
	}

	/* Environment strings are packed immediately after the arguments. */
	args->begin_envv = args->endp;

	/*
	 * extract environment strings
	 */
	if (envv) {
		while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
			if (envp == (caddr_t)-1)
				return (EFAULT);
			if ((error = copyinstr(envp, args->endp,
			    args->stringspace, &length))) {
				if (error == ENAMETOOLONG)
					return (E2BIG);
				return (error);
			}
			args->stringspace -= length;
			args->endp += length;
			args->envc++;
		}
	}

	return (0);
}
1056
1057 void
1058 exec_free_args(struct image_args *args)
1059 {
1060
1061 if (args->buf) {
1062 kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
1063 PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
1064 args->buf = NULL;
1065 }
1066 }
1067
/*
 * Copy strings out to the new process address space, constructing
 *	new arg and env vector tables. Return a pointer to the base
 *	so that it can be used as the initial stack pointer.
 *
 * Resulting user-stack layout (high to low): ps_strings, signal
 * trampoline, SPARE_USRSPACE, the string data, then the NULL-terminated
 * argv and envp pointer vectors (whose start is the returned stack
 * base).  All user-space stores go through suword()/copyout().
 */
register_t *
exec_copyout_strings(imgp)
	struct image_params *imgp;
{
	int argc, envc;
	char **vectp;
	char *stringp, *destp;
	register_t *stack_base;
	struct ps_strings *arginfo;
	struct proc *p;
	int szsigcode;

	/*
	 * Calculate string base and vector table pointers.
	 * Also deal with signal trampoline code for this exec type.
	 */
	p = imgp->proc;
	szsigcode = 0;
	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
	if (p->p_sysent->sv_szsigcode != NULL)
		szsigcode = *(p->p_sysent->sv_szsigcode);
	/* Leave room for the trampoline, spare space, and string data. */
	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));

	/*
	 * install sigcode
	 */
	if (szsigcode)
		copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
		    szsigcode), szsigcode);

	/*
	 * If we have a valid auxargs ptr, prepare some room
	 * on the stack.
	 */
	if (imgp->auxargs) {
		/*
		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
		 * lower compatibility.
		 */
		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
		    (AT_COUNT * 2);
		/*
		 * The '+ 2' is for the null pointers at the end of each of
		 * the arg and env vector sets,and imgp->auxarg_size is room
		 * for argument of Runtime loader.
		 */
		vectp = (char **)(destp - (imgp->args->argc +
		    imgp->args->envc + 2 + imgp->auxarg_size) *
		    sizeof(char *));

	} else {
		/*
		 * The '+ 2' is for the null pointers at the end of each of
		 * the arg and env vector sets
		 */
		vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
		    sizeof(char *));
	}

	/*
	 * vectp also becomes our initial stack base
	 */
	stack_base = (register_t *)vectp;

	stringp = imgp->args->begin_argv;
	argc = imgp->args->argc;
	envc = imgp->args->envc;

	/*
	 * Copy out strings - arguments and environment.
	 */
	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);

	/*
	 * Fill in "ps_strings" struct for ps, w, etc.
	 */
	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
	suword(&arginfo->ps_nargvstr, argc);

	/*
	 * Fill in argument portion of vector table.
	 */
	/* Walk the NUL-separated strings, recording each one's user address. */
	for (; argc > 0; --argc) {
		suword(vectp++, (long)(intptr_t)destp);
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* a null vector table pointer separates the argp's from the envp's */
	suword(vectp++, 0);

	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
	suword(&arginfo->ps_nenvstr, envc);

	/*
	 * Fill in environment portion of vector table.
	 */
	for (; envc > 0; --envc) {
		suword(vectp++, (long)(intptr_t)destp);
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* end of vector table is a null pointer */
	suword(vectp, 0);

	return (stack_base);
}
1184
/*
 * Check permissions of file to execute.
 *	Called with imgp->vp locked.
 *	Return 0 for success or error code on failure.
 *
 * Fills in imgp->attr as a side effect, and performs the filesystem-
 * specific VOP_OPEN() on success.
 */
int
exec_check_permissions(imgp)
	struct image_params *imgp;
{
	struct vnode *vp = imgp->vp;
	struct vattr *attr = imgp->attr;
	struct thread *td;
	int error;

	td = curthread;			/* XXXKSE */

	/* Get file attributes */
	error = VOP_GETATTR(vp, attr, td->td_ucred, td);
	if (error)
		return (error);

#ifdef MAC
	/* Let the MAC framework veto the exec before any other checks. */
	error = mac_check_vnode_exec(td->td_ucred, imgp->vp, imgp);
	if (error)
		return (error);
#endif

	/*
	 * 1) Check if file execution is disabled for the filesystem that this
	 *	file resides on.
	 * 2) Insure that at least one execute bit is on - otherwise root
	 *	will always succeed, and we don't want to happen unless the
	 *	file really is executable.
	 * 3) Insure that the file is a regular file.
	 */
	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
	    ((attr->va_mode & 0111) == 0) ||
	    (attr->va_type != VREG))
		return (EACCES);

	/*
	 * Zero length files can't be exec'd
	 */
	if (attr->va_size == 0)
		return (ENOEXEC);

	/*
	 *  Check for execute permission to file based on current credentials.
	 */
	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
	if (error)
		return (error);

	/*
	 * Check number of open-for-writes on the file and deny execution
	 * if there are any.
	 */
	if (vp->v_writecount)
		return (ETXTBSY);

	/*
	 * Call filesystem specific open routine (which does nothing in the
	 * general case).
	 */
	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, -1);
	return (error);
}
1252
1253 /*
1254 * Exec handler registration
1255 */
1256 int
1257 exec_register(execsw_arg)
1258 const struct execsw *execsw_arg;
1259 {
1260 const struct execsw **es, **xs, **newexecsw;
1261 int count = 2; /* New slot and trailing NULL */
1262
1263 if (execsw)
1264 for (es = execsw; *es; es++)
1265 count++;
1266 newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
1267 if (newexecsw == NULL)
1268 return (ENOMEM);
1269 xs = newexecsw;
1270 if (execsw)
1271 for (es = execsw; *es; es++)
1272 *xs++ = *es;
1273 *xs++ = execsw_arg;
1274 *xs = NULL;
1275 if (execsw)
1276 free(execsw, M_TEMP);
1277 execsw = newexecsw;
1278 return (0);
1279 }
1280
1281 int
1282 exec_unregister(execsw_arg)
1283 const struct execsw *execsw_arg;
1284 {
1285 const struct execsw **es, **xs, **newexecsw;
1286 int count = 1;
1287
1288 if (execsw == NULL)
1289 panic("unregister with no handlers left?\n");
1290
1291 for (es = execsw; *es; es++) {
1292 if (*es == execsw_arg)
1293 break;
1294 }
1295 if (*es == NULL)
1296 return (ENOENT);
1297 for (es = execsw; *es; es++)
1298 if (*es != execsw_arg)
1299 count++;
1300 newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
1301 if (newexecsw == NULL)
1302 return (ENOMEM);
1303 xs = newexecsw;
1304 for (es = execsw; *es; es++)
1305 if (*es != execsw_arg)
1306 *xs++ = *es;
1307 *xs = NULL;
1308 if (execsw)
1309 free(execsw, M_TEMP);
1310 execsw = newexecsw;
1311 return (0);
1312 }
Cache object: ad0d4f0722cf49ecf3bcb09eb925b9bd
|