FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_fork.c
1 /* $OpenBSD: kern_fork.c,v 1.245 2023/01/07 05:24:58 guenther Exp $ */
2 /* $NetBSD: kern_fork.c,v 1.29 1996/02/09 18:59:34 christos Exp $ */
3
4 /*
5 * Copyright (c) 1982, 1986, 1989, 1991, 1993
6 * The Regents of the University of California. All rights reserved.
7 * (c) UNIX System Laboratories, Inc.
8 * All or some portions of this file are derived from material licensed
9 * to the University of California by American Telephone and Telegraph
10 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
11 * the permission of UNIX System Laboratories, Inc.
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution.
21 * 3. Neither the name of the University nor the names of its contributors
22 * may be used to endorse or promote products derived from this software
23 * without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * SUCH DAMAGE.
36 *
37 * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
38 */
39
40 #include <sys/param.h>
41 #include <sys/systm.h>
42 #include <sys/filedesc.h>
43 #include <sys/malloc.h>
44 #include <sys/mount.h>
45 #include <sys/proc.h>
46 #include <sys/resourcevar.h>
47 #include <sys/signalvar.h>
48 #include <sys/vnode.h>
49 #include <sys/vmmeter.h>
50 #include <sys/acct.h>
51 #include <sys/ktrace.h>
52 #include <sys/sched.h>
53 #include <sys/sysctl.h>
54 #include <sys/pool.h>
55 #include <sys/mman.h>
56 #include <sys/ptrace.h>
57 #include <sys/atomic.h>
58 #include <sys/unistd.h>
59
60 #include <sys/syscallargs.h>
61
62 #include <uvm/uvm.h>
63 #include <machine/tcb.h>
64
65 int nprocesses = 1; /* process 0 */
66 int nthreads = 1; /* proc 0 */
67 struct forkstat forkstat;
68
69 void fork_return(void *);
70 pid_t alloctid(void);
71 pid_t allocpid(void);
72 int ispidtaken(pid_t);
73
74 void unveil_copy(struct process *parent, struct process *child);
75
76 struct proc *thread_new(struct proc *_parent, vaddr_t _uaddr);
77 struct process *process_new(struct proc *, struct process *, int);
78 int fork_check_maxthread(uid_t _uid);
79
80 void
81 fork_return(void *arg)
82 {
83 struct proc *p = (struct proc *)arg;
84
85 if (p->p_p->ps_flags & PS_TRACED)
86 psignal(p, SIGTRAP);
87
88 child_return(p);
89 }
90
91 int
92 sys_fork(struct proc *p, void *v, register_t *retval)
93 {
94 void (*func)(void *) = child_return;
95 int flags;
96
97 flags = FORK_FORK;
98 if (p->p_p->ps_ptmask & PTRACE_FORK) {
99 flags |= FORK_PTRACE;
100 func = fork_return;
101 }
102 return fork1(p, flags, func, NULL, retval, NULL);
103 }
104
105 int
106 sys_vfork(struct proc *p, void *v, register_t *retval)
107 {
108 return fork1(p, FORK_VFORK|FORK_PPWAIT, child_return, NULL,
109 retval, NULL);
110 }
111
112 int
113 sys___tfork(struct proc *p, void *v, register_t *retval)
114 {
115 struct sys___tfork_args /* {
116 syscallarg(const struct __tfork) *param;
117 syscallarg(size_t) psize;
118 } */ *uap = v;
119 size_t psize = SCARG(uap, psize);
120 struct __tfork param = { 0 };
121 int error;
122
123 if (psize == 0 || psize > sizeof(param))
124 return EINVAL;
125 if ((error = copyin(SCARG(uap, param), ¶m, psize)))
126 return error;
127 #ifdef KTRACE
128 if (KTRPOINT(p, KTR_STRUCT))
129 ktrstruct(p, "tfork", ¶m, sizeof(param));
130 #endif
131 #ifdef TCB_INVALID
132 if (TCB_INVALID(param.tf_tcb))
133 return EINVAL;
134 #endif /* TCB_INVALID */
135
136 return thread_fork(p, param.tf_stack, param.tf_tcb, param.tf_tid,
137 retval);
138 }
139
140 /*
141 * Allocate and initialize a thread (proc) structure, given the parent thread.
142 */
143 struct proc *
144 thread_new(struct proc *parent, vaddr_t uaddr)
145 {
146 struct proc *p;
147
148 p = pool_get(&proc_pool, PR_WAITOK);
149 p->p_stat = SIDL; /* protect against others */
150 p->p_runpri = 0;
151 p->p_flag = 0;
152
153 /*
154 * Make a proc table entry for the new process.
155 * Start by zeroing the section of proc that is zero-initialized,
156 * then copy the section that is copied directly from the parent.
157 */
158 memset(&p->p_startzero, 0,
159 (caddr_t)&p->p_endzero - (caddr_t)&p->p_startzero);
160 memcpy(&p->p_startcopy, &parent->p_startcopy,
161 (caddr_t)&p->p_endcopy - (caddr_t)&p->p_startcopy);
162 crhold(p->p_ucred);
163 p->p_addr = (struct user *)uaddr;
164
165 /*
166 * Initialize the timeouts.
167 */
168 timeout_set(&p->p_sleep_to, endtsleep, p);
169
170 return p;
171 }
172
173 /*
174 * Initialize common bits of a process structure, given the initial thread.
175 */
176 void
177 process_initialize(struct process *pr, struct proc *p)
178 {
179 /* initialize the thread links */
180 pr->ps_mainproc = p;
181 TAILQ_INIT(&pr->ps_threads);
182 TAILQ_INSERT_TAIL(&pr->ps_threads, p, p_thr_link);
183 pr->ps_refcnt = 1;
184 p->p_p = pr;
185
186 /* give the process the same creds as the initial thread */
187 pr->ps_ucred = p->p_ucred;
188 crhold(pr->ps_ucred);
189 /* new thread and new process */
190 KASSERT(p->p_ucred->cr_refcnt.r_refs >= 2);
191
192 LIST_INIT(&pr->ps_children);
193 LIST_INIT(&pr->ps_orphans);
194 LIST_INIT(&pr->ps_ftlist);
195 LIST_INIT(&pr->ps_sigiolst);
196 TAILQ_INIT(&pr->ps_tslpqueue);
197
198 rw_init(&pr->ps_lock, "pslock");
199 mtx_init(&pr->ps_mtx, IPL_HIGH);
200
201 timeout_set_flags(&pr->ps_realit_to, realitexpire, pr,
202 KCLOCK_UPTIME, 0);
203 timeout_set(&pr->ps_rucheck_to, rucheck, pr);
204 }
205
206
207 /*
208 * Allocate and initialize a new process.
209 */
210 struct process *
211 process_new(struct proc *p, struct process *parent, int flags)
212 {
213 struct process *pr;
214
215 pr = pool_get(&process_pool, PR_WAITOK);
216
217 /*
218 * Make a process structure for the new process.
219 * Start by zeroing the section of proc that is zero-initialized,
220 * then copy the section that is copied directly from the parent.
221 */
222 memset(&pr->ps_startzero, 0,
223 (caddr_t)&pr->ps_endzero - (caddr_t)&pr->ps_startzero);
224 memcpy(&pr->ps_startcopy, &parent->ps_startcopy,
225 (caddr_t)&pr->ps_endcopy - (caddr_t)&pr->ps_startcopy);
226
227 process_initialize(pr, p);
228 pr->ps_pid = allocpid();
229 lim_fork(parent, pr);
230
231 /* post-copy fixups */
232 pr->ps_pptr = parent;
233 pr->ps_ppid = parent->ps_pid;
234
235 /* bump references to the text vnode (for sysctl) */
236 pr->ps_textvp = parent->ps_textvp;
237 if (pr->ps_textvp)
238 vref(pr->ps_textvp);
239
240 /* copy unveil if unveil is active */
241 unveil_copy(parent, pr);
242
243 pr->ps_flags = parent->ps_flags &
244 (PS_SUGID | PS_SUGIDEXEC | PS_PLEDGE | PS_EXECPLEDGE | PS_WXNEEDED);
245 if (parent->ps_session->s_ttyvp != NULL)
246 pr->ps_flags |= parent->ps_flags & PS_CONTROLT;
247
248 /*
249 * Duplicate sub-structures as needed.
250 * Increase reference counts on shared objects.
251 */
252 if (flags & FORK_SHAREFILES)
253 pr->ps_fd = fdshare(parent);
254 else
255 pr->ps_fd = fdcopy(parent);
256 pr->ps_sigacts = sigactsinit(parent);
257 if (flags & FORK_SHAREVM)
258 pr->ps_vmspace = uvmspace_share(parent);
259 else
260 pr->ps_vmspace = uvmspace_fork(parent);
261
262 if (parent->ps_flags & PS_PROFIL)
263 startprofclock(pr);
264 if (flags & FORK_PTRACE)
265 pr->ps_flags |= parent->ps_flags & PS_TRACED;
266 if (flags & FORK_NOZOMBIE)
267 pr->ps_flags |= PS_NOZOMBIE;
268 if (flags & FORK_SYSTEM)
269 pr->ps_flags |= PS_SYSTEM;
270
271 /* mark as embryo to protect against others */
272 pr->ps_flags |= PS_EMBRYO;
273
274 /* Force visibility of all of the above changes */
275 membar_producer();
276
277 /* it's sufficiently inited to be globally visible */
278 LIST_INSERT_HEAD(&allprocess, pr, ps_list);
279
280 return pr;
281 }
282
283 /* print the 'table full' message once per 10 seconds */
284 struct timeval fork_tfmrate = { 10, 0 };
285
286 int
287 fork_check_maxthread(uid_t uid)
288 {
289 /*
290 * Although process entries are dynamically created, we still keep
291 * a global limit on the maximum number we will create. We reserve
292 * the last 5 processes to root. The variable nprocesses is the
293 * current number of processes, maxprocess is the limit. Similar
294 * rules for threads (struct proc): we reserve the last 5 to root;
295 * the variable nthreads is the current number of procs, maxthread is
296 * the limit.
297 */
298 if ((nthreads >= maxthread - 5 && uid != 0) || nthreads >= maxthread) {
299 static struct timeval lasttfm;
300
301 if (ratecheck(&lasttfm, &fork_tfmrate))
302 tablefull("thread");
303 return EAGAIN;
304 }
305 nthreads++;
306
307 return 0;
308 }
309
310 static inline void
311 fork_thread_start(struct proc *p, struct proc *parent, int flags)
312 {
313 struct cpu_info *ci;
314 int s;
315
316 SCHED_LOCK(s);
317 ci = sched_choosecpu_fork(parent, flags);
318 setrunqueue(ci, p, p->p_usrpri);
319 SCHED_UNLOCK(s);
320 }
321
322 int
323 fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
324 register_t *retval, struct proc **rnewprocp)
325 {
326 struct process *curpr = curp->p_p;
327 struct process *pr;
328 struct proc *p;
329 uid_t uid = curp->p_ucred->cr_ruid;
330 struct vmspace *vm;
331 int count;
332 vaddr_t uaddr;
333 int error;
334 struct ptrace_state *newptstat = NULL;
335
336 KASSERT((flags & ~(FORK_FORK | FORK_VFORK | FORK_PPWAIT | FORK_PTRACE
337 | FORK_IDLE | FORK_SHAREVM | FORK_SHAREFILES | FORK_NOZOMBIE
338 | FORK_SYSTEM)) == 0);
339 KASSERT(func != NULL);
340
341 if ((error = fork_check_maxthread(uid)))
342 return error;
343
344 if ((nprocesses >= maxprocess - 5 && uid != 0) ||
345 nprocesses >= maxprocess) {
346 static struct timeval lasttfm;
347
348 if (ratecheck(&lasttfm, &fork_tfmrate))
349 tablefull("process");
350 nthreads--;
351 return EAGAIN;
352 }
353 nprocesses++;
354
355 /*
356 * Increment the count of processes running with this uid.
357 * Don't allow a nonprivileged user to exceed their current limit.
358 */
359 count = chgproccnt(uid, 1);
360 if (uid != 0 && count > lim_cur(RLIMIT_NPROC)) {
361 (void)chgproccnt(uid, -1);
362 nprocesses--;
363 nthreads--;
364 return EAGAIN;
365 }
366
367 uaddr = uvm_uarea_alloc();
368 if (uaddr == 0) {
369 (void)chgproccnt(uid, -1);
370 nprocesses--;
371 nthreads--;
372 return (ENOMEM);
373 }
374
375 /*
376 * From now on, we're committed to the fork and cannot fail.
377 */
378 p = thread_new(curp, uaddr);
379 pr = process_new(p, curpr, flags);
380
381 p->p_fd = pr->ps_fd;
382 p->p_vmspace = pr->ps_vmspace;
383 if (pr->ps_flags & PS_SYSTEM)
384 atomic_setbits_int(&p->p_flag, P_SYSTEM);
385
386 if (flags & FORK_PPWAIT) {
387 atomic_setbits_int(&pr->ps_flags, PS_PPWAIT);
388 atomic_setbits_int(&curpr->ps_flags, PS_ISPWAIT);
389 }
390
391 #ifdef KTRACE
392 /*
393 * Copy traceflag and tracefile if enabled.
394 * If not inherited, these were zeroed above.
395 */
396 if (curpr->ps_traceflag & KTRFAC_INHERIT)
397 ktrsettrace(pr, curpr->ps_traceflag, curpr->ps_tracevp,
398 curpr->ps_tracecred);
399 #endif
400
401 /*
402 * Finish creating the child thread. cpu_fork() will copy
403 * and update the pcb and make the child ready to run. If
404 * this is a normal user fork, the child will exit directly
405 * to user mode via child_return() on its first time slice
406 * and will not return here. If this is a kernel thread,
407 * the specified entry point will be executed.
408 */
409 cpu_fork(curp, p, NULL, NULL, func, arg ? arg : p);
410
411 vm = pr->ps_vmspace;
412
413 if (flags & FORK_FORK) {
414 forkstat.cntfork++;
415 forkstat.sizfork += vm->vm_dsize + vm->vm_ssize;
416 } else if (flags & FORK_VFORK) {
417 forkstat.cntvfork++;
418 forkstat.sizvfork += vm->vm_dsize + vm->vm_ssize;
419 } else {
420 forkstat.cntkthread++;
421 }
422
423 if (pr->ps_flags & PS_TRACED && flags & FORK_FORK)
424 newptstat = malloc(sizeof(*newptstat), M_SUBPROC, M_WAITOK);
425
426 p->p_tid = alloctid();
427
428 LIST_INSERT_HEAD(&allproc, p, p_list);
429 LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, p_hash);
430 LIST_INSERT_HEAD(PIDHASH(pr->ps_pid), pr, ps_hash);
431 LIST_INSERT_AFTER(curpr, pr, ps_pglist);
432 LIST_INSERT_HEAD(&curpr->ps_children, pr, ps_sibling);
433
434 if (pr->ps_flags & PS_TRACED) {
435 pr->ps_oppid = curpr->ps_pid;
436 process_reparent(pr, curpr->ps_pptr);
437
438 /*
439 * Set ptrace status.
440 */
441 if (newptstat != NULL) {
442 pr->ps_ptstat = newptstat;
443 newptstat = NULL;
444 curpr->ps_ptstat->pe_report_event = PTRACE_FORK;
445 pr->ps_ptstat->pe_report_event = PTRACE_FORK;
446 curpr->ps_ptstat->pe_other_pid = pr->ps_pid;
447 pr->ps_ptstat->pe_other_pid = curpr->ps_pid;
448 }
449 }
450
451 /*
452 * For new processes, set accounting bits and mark as complete.
453 */
454 nanouptime(&pr->ps_start);
455 pr->ps_acflag = AFORK;
456 atomic_clearbits_int(&pr->ps_flags, PS_EMBRYO);
457
458 if ((flags & FORK_IDLE) == 0)
459 fork_thread_start(p, curp, flags);
460 else
461 p->p_cpu = arg;
462
463 free(newptstat, M_SUBPROC, sizeof(*newptstat));
464
465 /*
466 * Notify any interested parties about the new process.
467 */
468 KNOTE(&curpr->ps_klist, NOTE_FORK | pr->ps_pid);
469
470 /*
471 * Update stats now that we know the fork was successful.
472 */
473 uvmexp.forks++;
474 if (flags & FORK_PPWAIT)
475 uvmexp.forks_ppwait++;
476 if (flags & FORK_SHAREVM)
477 uvmexp.forks_sharevm++;
478
479 /*
480 * Pass a pointer to the new process to the caller.
481 */
482 if (rnewprocp != NULL)
483 *rnewprocp = p;
484
485 /*
486 * Preserve synchronization semantics of vfork. If waiting for
487 * child to exec or exit, set PS_PPWAIT on child and PS_ISPWAIT
488 * on ourselves, and sleep on our process for the latter flag
489 * to go away.
490 * XXX Need to stop other rthreads in the parent
491 */
492 if (flags & FORK_PPWAIT)
493 while (curpr->ps_flags & PS_ISPWAIT)
494 tsleep_nsec(curpr, PWAIT, "ppwait", INFSLP);
495
496 /*
497 * If we're tracing the child, alert the parent too.
498 */
499 if ((flags & FORK_PTRACE) && (curpr->ps_flags & PS_TRACED))
500 psignal(curp, SIGTRAP);
501
502 /*
503 * Return child pid to parent process
504 */
505 if (retval != NULL)
506 *retval = pr->ps_pid;
507 return (0);
508 }
509
510 int
511 thread_fork(struct proc *curp, void *stack, void *tcb, pid_t *tidptr,
512 register_t *retval)
513 {
514 struct process *pr = curp->p_p;
515 struct proc *p;
516 pid_t tid;
517 vaddr_t uaddr;
518 int s, error;
519
520 if (stack == NULL)
521 return EINVAL;
522
523 if ((error = fork_check_maxthread(curp->p_ucred->cr_ruid)))
524 return error;
525
526 uaddr = uvm_uarea_alloc();
527 if (uaddr == 0) {
528 nthreads--;
529 return ENOMEM;
530 }
531
532 /*
533 * From now on, we're committed to the fork and cannot fail.
534 */
535 p = thread_new(curp, uaddr);
536 atomic_setbits_int(&p->p_flag, P_THREAD);
537 sigstkinit(&p->p_sigstk);
538 memset(p->p_name, 0, sizeof p->p_name);
539
540 /* other links */
541 p->p_p = pr;
542 pr->ps_refcnt++;
543
544 /* local copies */
545 p->p_fd = pr->ps_fd;
546 p->p_vmspace = pr->ps_vmspace;
547
548 /*
549 * Finish creating the child thread. cpu_fork() will copy
550 * and update the pcb and make the child ready to run. The
551 * child will exit directly to user mode via child_return()
552 * on its first time slice and will not return here.
553 */
554 cpu_fork(curp, p, stack, tcb, child_return, p);
555
556 p->p_tid = alloctid();
557
558 LIST_INSERT_HEAD(&allproc, p, p_list);
559 LIST_INSERT_HEAD(TIDHASH(p->p_tid), p, p_hash);
560
561 SCHED_LOCK(s);
562 TAILQ_INSERT_TAIL(&pr->ps_threads, p, p_thr_link);
563 /*
564 * if somebody else wants to take us to single threaded mode,
565 * count ourselves in.
566 */
567 if (pr->ps_single) {
568 atomic_inc_int(&pr->ps_singlecount);
569 atomic_setbits_int(&p->p_flag, P_SUSPSINGLE);
570 }
571 SCHED_UNLOCK(s);
572
573 /*
574 * Return tid to parent thread and copy it out to userspace
575 */
576 *retval = tid = p->p_tid + THREAD_PID_OFFSET;
577 if (tidptr != NULL) {
578 if (copyout(&tid, tidptr, sizeof(tid)))
579 psignal(curp, SIGSEGV);
580 }
581
582 fork_thread_start(p, curp, 0);
583
584 /*
585 * Update stats now that we know the fork was successful.
586 */
587 forkstat.cnttfork++;
588 uvmexp.forks++;
589 uvmexp.forks_sharevm++;
590
591 return 0;
592 }
593
594
595 /* Find an unused tid */
596 pid_t
597 alloctid(void)
598 {
599 pid_t tid;
600
601 do {
602 /* (0 .. TID_MASK+1] */
603 tid = 1 + (arc4random() & TID_MASK);
604 } while (tfind(tid) != NULL);
605
606 return (tid);
607 }
608
609 /*
610 * Checks for current use of a pid, either as a pid or pgid.
611 */
612 pid_t oldpids[128];
613 int
614 ispidtaken(pid_t pid)
615 {
616 uint32_t i;
617
618 for (i = 0; i < nitems(oldpids); i++)
619 if (pid == oldpids[i])
620 return (1);
621
622 if (prfind(pid) != NULL)
623 return (1);
624 if (pgfind(pid) != NULL)
625 return (1);
626 if (zombiefind(pid) != NULL)
627 return (1);
628 return (0);
629 }
630
631 /* Find an unused pid */
632 pid_t
633 allocpid(void)
634 {
635 static int first = 1;
636 pid_t pid;
637
638 /* The first PID allocated is always 1. */
639 if (first) {
640 first = 0;
641 return 1;
642 }
643
644 /*
645 * All subsequent PIDs are chosen randomly. We need to
646 * find an unused PID in the range [2, PID_MAX].
647 */
648 do {
649 pid = 2 + arc4random_uniform(PID_MAX - 1);
650 } while (ispidtaken(pid));
651 return pid;
652 }
653
654 void
655 freepid(pid_t pid)
656 {
657 static uint32_t idx;
658
659 oldpids[idx++ % nitems(oldpids)] = pid;
660 }
661
#if defined(MULTIPROCESSOR)
/*
 * XXX Slight hack: makes a newly-formed process take the kernel lock as
 * XXX soon as it starts running.
 *
 * Entered with the scheduler lock held (asserted).  The scheduler lock is
 * released and the interrupt priority level dropped with spl0() before
 * the kernel lock is taken; the kernel lock must not be held on entry to
 * that last step (asserted).
 */
void
proc_trampoline_mp(void)
{
	/* Hand back the scheduler lock we were started with. */
	SCHED_ASSERT_LOCKED();
	__mp_unlock(&sched_lock);
	spl0();

	/* Neither lock may be held at this point. */
	SCHED_ASSERT_UNLOCKED();
	KERNEL_ASSERT_UNLOCKED();

	KERNEL_LOCK();
}
#endif
Cache object: 952f9c643dbb46285c1d753ceeef14a7
|