FreeBSD/Linux Kernel Cross Reference
sys/kern/kern_fork.c
/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
 * $FreeBSD$
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/jail.h>

#include <vm/vm.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <sys/vmmeter.h>
#include <sys/user.h>
static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");

/*
 * These are the structures used to create a callout list for things to do
 * when forking a process.
 */
struct forklist {
	forklist_fn function;
	TAILQ_ENTRY(forklist) next;
};

TAILQ_HEAD(forklist_head, forklist);
static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);

#ifndef _SYS_SYSPROTO_H_
struct fork_args {
	int	dummy;
};
#endif

int	forksleep;		/* Place for fork1() to sleep on. */

/* ARGSUSED */
int
fork(p, uap)
	struct proc *p;
	struct fork_args *uap;
{
	int error;
	struct proc *p2;

	error = fork1(p, RFFDG | RFPROC, &p2);
	if (error == 0) {
		p->p_retval[0] = p2->p_pid;
		p->p_retval[1] = 0;
	}
	return error;
}

/* ARGSUSED */
int
vfork(p, uap)
	struct proc *p;
	struct vfork_args *uap;
{
	int error;
	struct proc *p2;

	error = fork1(p, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
	if (error == 0) {
		p->p_retval[0] = p2->p_pid;
		p->p_retval[1] = 0;
	}
	return error;
}

int
rfork(p, uap)
	struct proc *p;
	struct rfork_args *uap;
{
	int error;
	struct proc *p2;

	/* Don't allow kernel-only flags. */
	if ((uap->flags & RFKERNELONLY) != 0)
		return (EINVAL);

	error = fork1(p, uap->flags, &p2);
	if (error == 0) {
		p->p_retval[0] = p2 ? p2->p_pid : 0;
		p->p_retval[1] = 0;
	}
	return error;
}
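
/*
 * All three system calls above are thin wrappers around fork1(); the
 * traditional semantics fall out of the rfork(2) flag sets:
 *
 *	fork()  == rfork(RFFDG | RFPROC)
 *	vfork() == rfork(RFFDG | RFPROC | RFPPWAIT | RFMEM)
 *
 * i.e. vfork() additionally shares the address space with the parent
 * (RFMEM) and suspends the parent until the child execs or exits
 * (RFPPWAIT).
 */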

int nprocs = 1;			/* process 0 */
static int nextpid = 0;

/*
 * Random component to nextpid generation.  We mix in a random factor to make
 * it a little harder to predict.  We sanity check the modulus value to avoid
 * doing it in critical paths.  Don't let it be too small or we pointlessly
 * waste entropy, and don't let it be impossibly large.  Using a modulus that
 * is too big causes a LOT more process table scans and slows down fork
 * processing as the pidchecked caching is defeated.
 */
static int randompid = 0;

static int
sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
{
	int error, pid;

	pid = randompid;
	error = sysctl_handle_int(oidp, &pid, 0, req);
	if (error || !req->newptr)
		return (error);
	if (pid < 0 || pid > PID_MAX - 100)	/* out of range */
		pid = PID_MAX - 100;
	else if (pid < 2)			/* NOP */
		pid = 0;
	else if (pid < 100)			/* Make it reasonable */
		pid = 100;
	randompid = pid;
	return (error);
}

SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
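
/*
 * Illustrative use (an administrator's session, not part of this file):
 * "sysctl kern.randompid=100" makes each pid allocation below advance
 * nextpid by an extra arc4random() % 100, at the cost of more pid-scan
 * work.  Values are clamped by the handler above: anything below 2
 * disables the feature, 2..99 is raised to 100, and values above
 * PID_MAX - 100 are pulled back into range.
 */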

int
fork1(p1, flags, procp)
	struct proc *p1;
	int flags;
	struct proc **procp;
{
	struct proc *p2, *pptr;
	uid_t uid;
	struct proc *newproc;
	int ok, s;
	static int curfail = 0, pidchecked = 0;
	static struct timeval lastfail;
	struct forklist *ep;
	struct filedesc_to_leader *fdtol;

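	/* RFFDG (copy the fd table) and RFCFDG (close all fds) are mutually exclusive. */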
	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
		return (EINVAL);

	/*
	 * Here we don't create a new process, but we divorce
	 * certain parts of a process from itself.
	 */
	if ((flags & RFPROC) == 0) {

		vm_fork(p1, 0, flags);

		/*
		 * Close all file descriptors.
		 */
		if (flags & RFCFDG) {
			struct filedesc *fdtmp;
			fdtmp = fdinit(p1);
			fdfree(p1);
			p1->p_fd = fdtmp;
		}

		/*
		 * Unshare file descriptors (from parent).
		 */
		if (flags & RFFDG) {
			if (p1->p_fd->fd_refcnt > 1) {
				struct filedesc *newfd;
				newfd = fdcopy(p1);
				fdfree(p1);
				p1->p_fd = newfd;
			}
		}
		*procp = NULL;
		return (0);
	}
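
	/*
	 * Userland sketch of the no-RFPROC path above (illustrative, not
	 * part of this file): a process can reset its own descriptor
	 * table, with no child created, via
	 *
	 *	if (rfork(RFCFDG) == -1)
	 *		err(1, "rfork");
	 */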

	/*
	 * Although process entries are dynamically created, we still keep
	 * a global limit on the maximum number we will create.  Don't allow
	 * a nonprivileged user to use the last ten processes; don't let root
	 * exceed the limit.  The variable nprocs is the current number of
	 * processes, maxproc is the limit.
	 */
	uid = p1->p_cred->p_ruid;
	if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) {
		if (ppsratecheck(&lastfail, &curfail, 1))
			printf("maxproc limit exceeded by uid %d, please see tuning(7) and login.conf(5).\n", uid);
		tsleep(&forksleep, PUSER, "fork", hz / 2);
		return (EAGAIN);
	}
	/*
	 * Increment the nprocs resource before blocking can occur.  There
	 * are hard limits on the number of processes that can run.
	 */
	nprocs++;

	/*
	 * Increment the count of procs running with this uid.  Don't allow
	 * a nonprivileged user to exceed their current limit.
	 */
	ok = chgproccnt(p1->p_cred->p_uidinfo, 1,
	    (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
	if (!ok) {
		/*
		 * Back out the process count.
		 */
		nprocs--;
		if (ppsratecheck(&lastfail, &curfail, 1))
			printf("maxproc limit exceeded by uid %d, please see tuning(7) and login.conf(5).\n", uid);
		tsleep(&forksleep, PUSER, "fork", hz / 2);
		return (EAGAIN);
	}

	/* Allocate new proc. */
	newproc = zalloc(proc_zone);

	/*
	 * Set up linkage for kernel-based threading.
	 */
	if ((flags & RFTHREAD) != 0) {
		newproc->p_peers = p1->p_peers;
		p1->p_peers = newproc;
		newproc->p_leader = p1->p_leader;
	} else {
		newproc->p_peers = 0;
		newproc->p_leader = newproc;
	}

	newproc->p_wakeup = 0;

	newproc->p_vmspace = NULL;

	/*
	 * Find an unused process ID.  We remember a range of unused IDs
	 * ready to use (from nextpid+1 through pidchecked-1).
	 */
	nextpid++;
	if (randompid)
		nextpid += arc4random() % randompid;
retry:
	/*
	 * If the process ID prototype has wrapped around,
	 * restart somewhat above 0, as the low-numbered procs
	 * tend to include daemons that don't exit.
	 */
	if (nextpid >= PID_MAX) {
		nextpid = nextpid % PID_MAX;
		if (nextpid < 100)
			nextpid += 100;
		pidchecked = 0;
	}
	if (nextpid >= pidchecked) {
		int doingzomb = 0;

		pidchecked = PID_MAX;
		/*
		 * Scan the active and zombie procs to check whether this pid
		 * is in use.  Remember the lowest pid that's greater
		 * than nextpid, so we can avoid checking for a while.
		 */
		p2 = LIST_FIRST(&allproc);
again:
		for (; p2 != 0; p2 = LIST_NEXT(p2, p_list)) {
			while (p2->p_pid == nextpid ||
			    p2->p_pgrp->pg_id == nextpid ||
			    p2->p_session->s_sid == nextpid) {
				nextpid++;
				if (nextpid >= pidchecked)
					goto retry;
			}
			if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
				pidchecked = p2->p_pid;
			if (p2->p_pgrp->pg_id > nextpid &&
			    pidchecked > p2->p_pgrp->pg_id)
				pidchecked = p2->p_pgrp->pg_id;
			if (p2->p_session->s_sid > nextpid &&
			    pidchecked > p2->p_session->s_sid)
				pidchecked = p2->p_session->s_sid;
		}
		if (!doingzomb) {
			doingzomb = 1;
			p2 = LIST_FIRST(&zombproc);
			goto again;
		}
	}
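
	/*
	 * At this point nextpid is not in use as a pid, process group id,
	 * or session id, and every id strictly between nextpid and
	 * pidchecked is known to be free, so subsequent forks can skip
	 * the list scans until nextpid catches up with pidchecked.
	 */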

	p2 = newproc;
	p2->p_stat = SIDL;			/* protect against others */
	p2->p_pid = nextpid;
	LIST_INSERT_HEAD(&allproc, p2, p_list);
	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);

	/*
	 * Make a proc table entry for the new process.
	 * Start by zeroing the section of proc that is zero-initialized,
	 * then copy the section that is copied directly from the parent.
	 */
	bzero(&p2->p_startzero,
	    (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	    (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));

	p2->p_aioinfo = NULL;

	/*
	 * Duplicate sub-structures as needed.
	 * Increase reference counts on shared objects.
	 * The p_stats and p_sigacts substructs are set in vm_fork.
	 */
	p2->p_flag = P_INMEM;
	if (p1->p_flag & P_PROFIL)
		startprofclock(p2);
	MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
	    M_SUBPROC, M_WAITOK);
	bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
	p2->p_cred->p_refcnt = 1;
	crhold(p1->p_ucred);
	uihold(p1->p_cred->p_uidinfo);

	if (p2->p_prison) {
		p2->p_prison->pr_ref++;
		p2->p_flag |= P_JAILED;
	}

	if (p2->p_args)
		p2->p_args->ar_ref++;

	if (flags & RFSIGSHARE) {
		p2->p_procsig = p1->p_procsig;
		p2->p_procsig->ps_refcnt++;
		if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
			struct sigacts *newsigacts;
			int s;

			/* Create the shared sigacts structure. */
			MALLOC(newsigacts, struct sigacts *,
			    sizeof(struct sigacts), M_SUBPROC, M_WAITOK);
			s = splhigh();
			/*
			 * Set p_sigacts to the new shared structure.
			 * Note that this is updating p1->p_sigacts at the
			 * same time, since p_sigacts is just a pointer to
			 * the shared p_procsig->ps_sigacts.
			 */
			p2->p_sigacts = newsigacts;
			bcopy(&p1->p_addr->u_sigacts, p2->p_sigacts,
			    sizeof(*p2->p_sigacts));
			splx(s);
		}
	} else {
		MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig),
		    M_SUBPROC, M_WAITOK);
		bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig));
		p2->p_procsig->ps_refcnt = 1;
		p2->p_sigacts = NULL;	/* finished in vm_fork() */
	}
	if (flags & RFLINUXTHPN)
		p2->p_sigparent = SIGUSR1;
	else
		p2->p_sigparent = SIGCHLD;

	/* bump references to the text vnode (for procfs) */
	p2->p_textvp = p1->p_textvp;
	if (p2->p_textvp)
		VREF(p2->p_textvp);

	if (flags & RFCFDG) {
		p2->p_fd = fdinit(p1);
		fdtol = NULL;
	} else if (flags & RFFDG) {
		p2->p_fd = fdcopy(p1);
		fdtol = NULL;
	} else {
		p2->p_fd = fdshare(p1);
		if (p1->p_fdtol == NULL)
			p1->p_fdtol = filedesc_to_leader_alloc(NULL,
			    p1->p_leader);
		if ((flags & RFTHREAD) != 0) {
			/*
			 * Shared file descriptor table and
			 * shared process leaders.
			 */
			fdtol = p1->p_fdtol;
			fdtol->fdl_refcount++;
		} else {
			/*
			 * Shared file descriptor table, and
			 * different process leaders.
			 */
			fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p2);
		}
	}
	p2->p_fdtol = fdtol;

	/*
	 * If p_limit is still copy-on-write, bump the refcnt;
	 * otherwise get a copy that won't be modified.
	 * (If PL_SHAREMOD is clear, the structure is shared
	 * copy-on-write.)
	 */
	if (p1->p_limit->p_lflags & PL_SHAREMOD)
		p2->p_limit = limcopy(p1->p_limit);
	else {
		p2->p_limit = p1->p_limit;
		p2->p_limit->p_refcnt++;
	}

	/*
	 * Preserve some more flags in subprocess.  P_PROFIL has already
	 * been preserved.
	 */
	p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK);
	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
		p2->p_flag |= P_CONTROLT;
	if (flags & RFPPWAIT)
		p2->p_flag |= P_PPWAIT;

	LIST_INSERT_AFTER(p1, p2, p_pglist);

	/*
	 * Attach the new process to its parent.
	 *
	 * If RFNOWAIT is set, the newly created process becomes a child
	 * of init.  This effectively disassociates the child from the
	 * parent.
	 */
	if (flags & RFNOWAIT)
		pptr = initproc;
	else
		pptr = p1;
	p2->p_pptr = pptr;
	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
	LIST_INIT(&p2->p_children);

#ifdef KTRACE
	/*
	 * Copy traceflag and tracefile if enabled.  If not inherited,
	 * these were zeroed above, but we still could have a trace race
	 * so make sure p2's p_tracep is NULL.
	 */
	if ((p1->p_traceflag & KTRFAC_INHERIT) && p2->p_tracep == NULL) {
		p2->p_traceflag = p1->p_traceflag;
		if ((p2->p_tracep = p1->p_tracep) != NULL)
			VREF(p2->p_tracep);
	}
#endif

	/*
	 * Set the priority of the child to be that of the parent.
	 */
	p2->p_estcpu = p1->p_estcpu;

	/*
	 * This begins the section where we must prevent the parent
	 * from being swapped.
	 */
	PHOLD(p1);

	/*
	 * Finish creating the child process.  It will return via a
	 * different execution path later (i.e., directly into user mode).
	 */
	vm_fork(p1, p2, flags);

	if (flags == (RFFDG | RFPROC)) {
		cnt.v_forks++;
		cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
	} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
		cnt.v_vforks++;
		cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
	} else if (p1 == &proc0) {
		cnt.v_kthreads++;
		cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
	} else {
		cnt.v_rforks++;
		cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
	}
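
	/*
	 * (These counters feed the fork statistics that vmstat(8) reports
	 * with its -f flag: forks, vforks, rforks, and the pages of
	 * virtual memory involved in each.)
	 */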

	/*
	 * Both processes are set up; now check if any loadable modules want
	 * to adjust anything.
	 *   What if they have an error? XXX
	 */
	TAILQ_FOREACH(ep, &fork_list, next) {
		(*ep->function)(p1, p2, flags);
	}

	/*
	 * Make child runnable and add to run queue.
	 */
	microtime(&(p2->p_stats->p_start));
	p2->p_acflag = AFORK;
	s = splhigh();
	p2->p_stat = SRUN;
	setrunqueue(p2);
	splx(s);

	/*
	 * Now can be swapped.
	 */
	PRELE(p1);

	/*
	 * Tell any interested parties about the new process.
	 */
	KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);

	/*
	 * Preserve synchronization semantics of vfork.  If waiting for
	 * child to exec or exit, set P_PPWAIT on child, and sleep on our
	 * proc (in case of exit).
	 */
	while (p2->p_flag & P_PPWAIT)
		tsleep(p1, PWAIT, "ppwait", 0);
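
	/*
	 * (The child side clears P_PPWAIT and wakes the parent from its
	 * exec and exit paths; that is what ends this sleep loop.)
	 */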

	/*
	 * Return child proc pointer to parent.
	 */
	*procp = p2;
	return (0);
}

/*
 * The next two functions are general routines to handle adding/deleting
 * items on the fork callout list.
 *
 * at_fork():
 * Take the arguments given and put them onto the fork callout list,
 * but first make sure the entry is not already there.
 * Returns 0 on success or a standard error number.
 */

int
at_fork(function)
	forklist_fn function;
{
	struct forklist *ep;

#ifdef INVARIANTS
	/* let the programmer know if he's been stupid */
	if (rm_at_fork(function))
		printf("WARNING: fork callout entry (%p) already present\n",
		    function);
#endif
	ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT);
	if (ep == NULL)
		return (ENOMEM);
	ep->function = function;
	TAILQ_INSERT_TAIL(&fork_list, ep, next);
	return (0);
}
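
/*
 * Usage sketch (hypothetical module code, not part of this file):
 *
 *	static void
 *	mymod_fork_hook(struct proc *p1, struct proc *p2, int flags)
 *	{
 *		... per-child setup for p2 ...
 *	}
 *
 * A module would call at_fork(mymod_fork_hook) at load time, typically
 * from its MOD_LOAD event handler, and rm_at_fork(mymod_fork_hook) on
 * unload.
 */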

/*
 * Scan the fork callout list for the given item and remove it.
 * Returns the number of items removed (0 or 1).
 */

int
rm_at_fork(function)
	forklist_fn function;
{
	struct forklist *ep;

	TAILQ_FOREACH(ep, &fork_list, next) {
		if (ep->function == function) {
			TAILQ_REMOVE(&fork_list, ep, next);
			free(ep, M_ATFORK);
			return (1);
		}
	}
	return (0);
}