Fix CPU stats percentages formatting
[dragonfly.git] / sys / kern / kern_fork.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 1982, 1986, 1989, 1991, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
39 * $FreeBSD: src/sys/kern/kern_fork.c,v 1.72.2.13 2003/06/06 20:21:32 tegge Exp $
40 * $DragonFly: src/sys/kern/kern_fork.c,v 1.14 2003/07/26 18:12:44 dillon Exp $
41 */
42
43#include "opt_ktrace.h"
44
45#include <sys/param.h>
46#include <sys/systm.h>
47#include <sys/sysproto.h>
48#include <sys/filedesc.h>
49#include <sys/kernel.h>
50#include <sys/sysctl.h>
51#include <sys/malloc.h>
52#include <sys/proc.h>
53#include <sys/resourcevar.h>
54#include <sys/vnode.h>
55#include <sys/acct.h>
56#include <sys/ktrace.h>
57#include <sys/unistd.h>
58#include <sys/jail.h>
59
60#include <vm/vm.h>
61#include <sys/lock.h>
62#include <vm/pmap.h>
63#include <vm/vm_map.h>
64#include <vm/vm_extern.h>
65#include <vm/vm_zone.h>
66
67#include <sys/vmmeter.h>
68#include <sys/user.h>
69
70static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");
71
72/*
73 * These are the stuctures used to create a callout list for things to do
74 * when forking a process
75 */
76struct forklist {
77 forklist_fn function;
78 TAILQ_ENTRY(forklist) next;
79};
80
81TAILQ_HEAD(forklist_head, forklist);
82static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);
83
84int forksleep; /* Place for fork1() to sleep on. */
85
86/* ARGSUSED */
87int
88fork(struct fork_args *uap)
89{
90 struct proc *p = curproc;
91 struct proc *p2;
92 int error;
93
94 error = fork1(p, RFFDG | RFPROC, &p2);
95 if (error == 0) {
96 start_forked_proc(p, p2);
97 uap->lmsg.u.ms_fds[0] = p2->p_pid;
98 uap->lmsg.u.ms_fds[1] = 0;
99 }
100 return error;
101}
102
103/* ARGSUSED */
104int
105vfork(struct vfork_args *uap)
106{
107 struct proc *p = curproc;
108 struct proc *p2;
109 int error;
110
111 error = fork1(p, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
112 if (error == 0) {
113 start_forked_proc(p, p2);
114 uap->lmsg.u.ms_fds[0] = p2->p_pid;
115 uap->lmsg.u.ms_fds[1] = 0;
116 }
117 return error;
118}
119
120int
121rfork(struct rfork_args *uap)
122{
123 struct proc *p = curproc;
124 struct proc *p2;
125 int error;
126
127 error = fork1(p, uap->flags, &p2);
128 if (error == 0) {
129 start_forked_proc(p, p2);
130 uap->lmsg.u.ms_fds[0] = p2 ? p2->p_pid : 0;
131 uap->lmsg.u.ms_fds[1] = 0;
132 }
133 return error;
134}
135
136
int nprocs = 1;			/* process 0 */
static int nextpid = 0;		/* pid search position used by fork1() */

/*
 * Modulus of the random increment fork1() adds to nextpid, so that pids
 * are a little harder to predict.  Zero disables the random component.
 *
 * The value is range-checked when it is set (see sysctl_kern_randompid())
 * so that the check stays out of the fork path.  It must not be too small,
 * which would pointlessly waste randomness entropy, nor impossibly large:
 * a modulus that is too big causes a LOT more process table scans and
 * slows down fork processing because the pidchecked caching is defeated.
 */
static int randompid = 0;
149
150static int
151sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
152{
153 int error, pid;
154
155 pid = randompid;
156 error = sysctl_handle_int(oidp, &pid, 0, req);
157 if (error || !req->newptr)
158 return (error);
159 if (pid < 0 || pid > PID_MAX - 100) /* out of range */
160 pid = PID_MAX - 100;
161 else if (pid < 2) /* NOP */
162 pid = 0;
163 else if (pid < 100) /* Make it reasonable */
164 pid = 100;
165 randompid = pid;
166 return (error);
167}
168
169SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
170 0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
171
172int
173fork1(p1, flags, procp)
174 struct proc *p1;
175 int flags;
176 struct proc **procp;
177{
178 struct proc *p2, *pptr;
179 uid_t uid;
180 struct proc *newproc;
181 int ok;
182 static int pidchecked = 0;
183 struct forklist *ep;
184 struct filedesc_to_leader *fdtol;
185
186 if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
187 return (EINVAL);
188
189 /*
190 * Here we don't create a new process, but we divorce
191 * certain parts of a process from itself.
192 */
193 if ((flags & RFPROC) == 0) {
194
195 vm_fork(p1, 0, flags);
196
197 /*
198 * Close all file descriptors.
199 */
200 if (flags & RFCFDG) {
201 struct filedesc *fdtmp;
202 fdtmp = fdinit(p1);
203 fdfree(p1);
204 p1->p_fd = fdtmp;
205 }
206
207 /*
208 * Unshare file descriptors (from parent.)
209 */
210 if (flags & RFFDG) {
211 if (p1->p_fd->fd_refcnt > 1) {
212 struct filedesc *newfd;
213 newfd = fdcopy(p1);
214 fdfree(p1);
215 p1->p_fd = newfd;
216 }
217 }
218 *procp = NULL;
219 return (0);
220 }
221
222 /*
223 * Although process entries are dynamically created, we still keep
224 * a global limit on the maximum number we will create. Don't allow
225 * a nonprivileged user to use the last ten processes; don't let root
226 * exceed the limit. The variable nprocs is the current number of
227 * processes, maxproc is the limit.
228 */
229 uid = p1->p_ucred->cr_ruid;
230 if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) {
231 tsleep(&forksleep, 0, "fork", hz / 2);
232 return (EAGAIN);
233 }
234 /*
235 * Increment the nprocs resource before blocking can occur. There
236 * are hard-limits as to the number of processes that can run.
237 */
238 nprocs++;
239
240 /*
241 * Increment the count of procs running with this uid. Don't allow
242 * a nonprivileged user to exceed their current limit.
243 */
244 ok = chgproccnt(p1->p_ucred->cr_ruidinfo, 1,
245 (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
246 if (!ok) {
247 /*
248 * Back out the process count
249 */
250 nprocs--;
251 tsleep(&forksleep, 0, "fork", hz / 2);
252 return (EAGAIN);
253 }
254
255 /* Allocate new proc. */
256 newproc = zalloc(proc_zone);
257
258 /*
259 * Setup linkage for kernel based threading
260 */
261 if((flags & RFTHREAD) != 0) {
262 newproc->p_peers = p1->p_peers;
263 p1->p_peers = newproc;
264 newproc->p_leader = p1->p_leader;
265 } else {
266 newproc->p_peers = 0;
267 newproc->p_leader = newproc;
268 }
269
270 newproc->p_wakeup = 0;
271 newproc->p_vmspace = NULL;
272
273 /*
274 * Find an unused process ID. We remember a range of unused IDs
275 * ready to use (from nextpid+1 through pidchecked-1).
276 */
277 nextpid++;
278 if (randompid)
279 nextpid += arc4random() % randompid;
280retry:
281 /*
282 * If the process ID prototype has wrapped around,
283 * restart somewhat above 0, as the low-numbered procs
284 * tend to include daemons that don't exit.
285 */
286 if (nextpid >= PID_MAX) {
287 nextpid = nextpid % PID_MAX;
288 if (nextpid < 100)
289 nextpid += 100;
290 pidchecked = 0;
291 }
292 if (nextpid >= pidchecked) {
293 int doingzomb = 0;
294
295 pidchecked = PID_MAX;
296 /*
297 * Scan the active and zombie procs to check whether this pid
298 * is in use. Remember the lowest pid that's greater
299 * than nextpid, so we can avoid checking for a while.
300 */
301 p2 = LIST_FIRST(&allproc);
302again:
303 for (; p2 != 0; p2 = LIST_NEXT(p2, p_list)) {
304 while (p2->p_pid == nextpid ||
305 p2->p_pgrp->pg_id == nextpid ||
306 p2->p_session->s_sid == nextpid) {
307 nextpid++;
308 if (nextpid >= pidchecked)
309 goto retry;
310 }
311 if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
312 pidchecked = p2->p_pid;
313 if (p2->p_pgrp->pg_id > nextpid &&
314 pidchecked > p2->p_pgrp->pg_id)
315 pidchecked = p2->p_pgrp->pg_id;
316 if (p2->p_session->s_sid > nextpid &&
317 pidchecked > p2->p_session->s_sid)
318 pidchecked = p2->p_session->s_sid;
319 }
320 if (!doingzomb) {
321 doingzomb = 1;
322 p2 = LIST_FIRST(&zombproc);
323 goto again;
324 }
325 }
326
327 p2 = newproc;
328 p2->p_stat = SIDL; /* protect against others */
329 p2->p_pid = nextpid;
330 LIST_INSERT_HEAD(&allproc, p2, p_list);
331 LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
332
333 /*
334 * Make a proc table entry for the new process.
335 * Start by zeroing the section of proc that is zero-initialized,
336 * then copy the section that is copied directly from the parent.
337 */
338 bzero(&p2->p_startzero,
339 (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
340 bcopy(&p1->p_startcopy, &p2->p_startcopy,
341 (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
342
343 p2->p_aioinfo = NULL;
344
345 /*
346 * Duplicate sub-structures as needed.
347 * Increase reference counts on shared objects.
348 * The p_stats and p_sigacts substructs are set in vm_fork.
349 *
350 * P_CP_RELEASED indicates that the process is starting out in
351 * the kernel (in the fork trampoline). The flag will be converted
352 * to P_CURPROC when the new process calls userret() and attempts
353 * to return to userland
354 */
355 p2->p_flag = P_INMEM | P_CP_RELEASED;
356 if (p1->p_flag & P_PROFIL)
357 startprofclock(p2);
358 p2->p_ucred = crhold(p1->p_ucred);
359
360 if (p2->p_ucred->cr_prison) {
361 p2->p_ucred->cr_prison->pr_ref++;
362 p2->p_flag |= P_JAILED;
363 }
364
365 if (p2->p_args)
366 p2->p_args->ar_ref++;
367
368 if (flags & RFSIGSHARE) {
369 p2->p_procsig = p1->p_procsig;
370 p2->p_procsig->ps_refcnt++;
371 if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
372 struct sigacts *newsigacts;
373 int s;
374
375 /* Create the shared sigacts structure */
376 MALLOC(newsigacts, struct sigacts *,
377 sizeof(struct sigacts), M_SUBPROC, M_WAITOK);
378 s = splhigh();
379 /*
380 * Set p_sigacts to the new shared structure.
381 * Note that this is updating p1->p_sigacts at the
382 * same time, since p_sigacts is just a pointer to
383 * the shared p_procsig->ps_sigacts.
384 */
385 p2->p_sigacts = newsigacts;
386 bcopy(&p1->p_addr->u_sigacts, p2->p_sigacts,
387 sizeof(*p2->p_sigacts));
388 *p2->p_sigacts = p1->p_addr->u_sigacts;
389 splx(s);
390 }
391 } else {
392 MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig),
393 M_SUBPROC, M_WAITOK);
394 bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig));
395 p2->p_procsig->ps_refcnt = 1;
396 p2->p_sigacts = NULL; /* finished in vm_fork() */
397 }
398 if (flags & RFLINUXTHPN)
399 p2->p_sigparent = SIGUSR1;
400 else
401 p2->p_sigparent = SIGCHLD;
402
403 /* bump references to the text vnode (for procfs) */
404 p2->p_textvp = p1->p_textvp;
405 if (p2->p_textvp)
406 VREF(p2->p_textvp);
407
408 if (flags & RFCFDG) {
409 p2->p_fd = fdinit(p1);
410 fdtol = NULL;
411 } else if (flags & RFFDG) {
412 p2->p_fd = fdcopy(p1);
413 fdtol = NULL;
414 } else {
415 p2->p_fd = fdshare(p1);
416 if (p1->p_fdtol == NULL)
417 p1->p_fdtol =
418 filedesc_to_leader_alloc(NULL,
419 p1->p_leader);
420 if ((flags & RFTHREAD) != 0) {
421 /*
422 * Shared file descriptor table and
423 * shared process leaders.
424 */
425 fdtol = p1->p_fdtol;
426 fdtol->fdl_refcount++;
427 } else {
428 /*
429 * Shared file descriptor table, and
430 * different process leaders
431 */
432 fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
433 p2);
434 }
435 }
436 p2->p_fdtol = fdtol;
437
438 /*
439 * If p_limit is still copy-on-write, bump refcnt,
440 * otherwise get a copy that won't be modified.
441 * (If PL_SHAREMOD is clear, the structure is shared
442 * copy-on-write.)
443 */
444 if (p1->p_limit->p_lflags & PL_SHAREMOD)
445 p2->p_limit = limcopy(p1->p_limit);
446 else {
447 p2->p_limit = p1->p_limit;
448 p2->p_limit->p_refcnt++;
449 }
450
451 /*
452 * Preserve some more flags in subprocess. P_PROFIL has already
453 * been preserved.
454 */
455 p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK);
456 if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
457 p2->p_flag |= P_CONTROLT;
458 if (flags & RFPPWAIT)
459 p2->p_flag |= P_PPWAIT;
460
461 LIST_INSERT_AFTER(p1, p2, p_pglist);
462
463 /*
464 * Attach the new process to its parent.
465 *
466 * If RFNOWAIT is set, the newly created process becomes a child
467 * of init. This effectively disassociates the child from the
468 * parent.
469 */
470 if (flags & RFNOWAIT)
471 pptr = initproc;
472 else
473 pptr = p1;
474 p2->p_pptr = pptr;
475 LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
476 LIST_INIT(&p2->p_children);
477
478#ifdef KTRACE
479 /*
480 * Copy traceflag and tracefile if enabled. If not inherited,
481 * these were zeroed above but we still could have a trace race
482 * so make sure p2's p_tracep is NULL.
483 */
484 if ((p1->p_traceflag & KTRFAC_INHERIT) && p2->p_tracep == NULL) {
485 p2->p_traceflag = p1->p_traceflag;
486 if ((p2->p_tracep = p1->p_tracep) != NULL)
487 VREF(p2->p_tracep);
488 }
489#endif
490
491 /*
492 * set priority of child to be that of parent
493 */
494 p2->p_estcpu = p1->p_estcpu;
495
496 /*
497 * This begins the section where we must prevent the parent
498 * from being swapped.
499 */
500 PHOLD(p1);
501
502 /*
503 * Finish creating the child process. It will return via a different
504 * execution path later. (ie: directly into user mode)
505 */
506 vm_fork(p1, p2, flags);
507
508 if (flags == (RFFDG | RFPROC)) {
509 mycpu->gd_cnt.v_forks++;
510 mycpu->gd_cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
511 } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
512 mycpu->gd_cnt.v_vforks++;
513 mycpu->gd_cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
514 } else if (p1 == &proc0) {
515 mycpu->gd_cnt.v_kthreads++;
516 mycpu->gd_cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
517 } else {
518 mycpu->gd_cnt.v_rforks++;
519 mycpu->gd_cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
520 }
521
522 /*
523 * Both processes are set up, now check if any loadable modules want
524 * to adjust anything.
525 * What if they have an error? XXX
526 */
527 TAILQ_FOREACH(ep, &fork_list, next) {
528 (*ep->function)(p1, p2, flags);
529 }
530
531 /*
532 * Make child runnable and add to run queue.
533 */
534 microtime(&(p2->p_stats->p_start));
535 p2->p_acflag = AFORK;
536
537 /*
538 * tell any interested parties about the new process
539 */
540 KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);
541
542 /*
543 * Return child proc pointer to parent.
544 */
545 *procp = p2;
546 return (0);
547}
548
549/*
550 * The next two functionms are general routines to handle adding/deleting
551 * items on the fork callout list.
552 *
553 * at_fork():
554 * Take the arguments given and put them onto the fork callout list,
555 * However first make sure that it's not already there.
556 * Returns 0 on success or a standard error number.
557 */
558
559int
560at_fork(function)
561 forklist_fn function;
562{
563 struct forklist *ep;
564
565#ifdef INVARIANTS
566 /* let the programmer know if he's been stupid */
567 if (rm_at_fork(function))
568 printf("WARNING: fork callout entry (%p) already present\n",
569 function);
570#endif
571 ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT);
572 if (ep == NULL)
573 return (ENOMEM);
574 ep->function = function;
575 TAILQ_INSERT_TAIL(&fork_list, ep, next);
576 return (0);
577}
578
579/*
580 * Scan the exit callout list for the given item and remove it..
581 * Returns the number of items removed (0 or 1)
582 */
583
584int
585rm_at_fork(function)
586 forklist_fn function;
587{
588 struct forklist *ep;
589
590 TAILQ_FOREACH(ep, &fork_list, next) {
591 if (ep->function == function) {
592 TAILQ_REMOVE(&fork_list, ep, next);
593 free(ep, M_ATFORK);
594 return(1);
595 }
596 }
597 return (0);
598}
599
600/*
601 * Add a forked process to the run queue after any remaining setup, such
602 * as setting the fork handler, has been completed.
603 */
604
605void
606start_forked_proc(struct proc *p1, struct proc *p2)
607{
608 /*
609 * Move from SIDL to RUN queue, and activate the process's thread.
610 * Activation of the thread effectively makes the process "a"
611 * current process, so we do not setrunqueue().
612 */
613 KASSERT(p2->p_stat == SIDL,
614 ("cannot start forked process, bad status: %p", p2));
615 (void) splhigh();
616 p2->p_stat = SRUN;
617 setrunqueue(p2);
618 (void) spl0();
619
620 /*
621 * Now can be swapped.
622 */
623 PRELE(p1);
624
625 /*
626 * Preserve synchronization semantics of vfork. If waiting for
627 * child to exec or exit, set P_PPWAIT on child, and sleep on our
628 * proc (in case of exit).
629 */
630 while (p2->p_flag & P_PPWAIT)
631 tsleep(p1, 0, "ppwait", 0);
632}
633