kernel - Remove dsched
[dragonfly.git] / sys / kern / kern_fork.c

/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * @(#)kern_fork.c	8.6 (Berkeley) 4/8/94
 * $FreeBSD: src/sys/kern/kern_fork.c,v 1.72.2.14 2003/06/26 04:15:10 silby Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/jail.h>

#include <vm/vm.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>

#include <sys/vmmeter.h>
#include <sys/refcount.h>
#include <sys/thread2.h>
#include <sys/signal2.h>
#include <sys/spinlock2.h>

#include <sys/dsched.h>

static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");
static MALLOC_DEFINE(M_REAPER, "reaper", "process reapers");

/*
 * These are the structures used to create a callout list for things to do
 * when forking a process.
 */
struct forklist {
	forklist_fn function;
	TAILQ_ENTRY(forklist) next;
};

TAILQ_HEAD(forklist_head, forklist);
static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);

static struct lwp *lwp_fork(struct lwp *, struct proc *, int flags);

int forksleep;			/* Place for fork1() to sleep on. */

/*
 * Red-Black tree support for LWPs
 */

static int
rb_lwp_compare(struct lwp *lp1, struct lwp *lp2)
{
	if (lp1->lwp_tid < lp2->lwp_tid)
		return(-1);
	if (lp1->lwp_tid > lp2->lwp_tid)
		return(1);
	return(0);
}

RB_GENERATE2(lwp_rb_tree, lwp, u.lwp_rbnode, rb_lwp_compare, lwpid_t, lwp_tid);

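/*
 * Each process indexes its lwps by TID in this per-process red-black
 * tree (p_lwp_tree); lwp_fork() inserts new lwps and the lwp_create
 * failure path removes them.  The RB_GENERATE2 form also emits a
 * direct lookup-by-TID routine keyed on lwp_tid.
 */
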
/*
 * fork() system call
 */
int
sys_fork(struct fork_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p2;
	int error;

	error = fork1(lp, RFFDG | RFPROC | RFPGLOCK, &p2);
	if (error == 0) {
		PHOLD(p2);
		start_forked_proc(lp, p2);
		uap->sysmsg_fds[0] = p2->p_pid;
		uap->sysmsg_fds[1] = 0;
		PRELE(p2);
	}
	return error;
}

/*
 * vfork() system call
 */
int
sys_vfork(struct vfork_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p2;
	int error;

	error = fork1(lp, RFFDG | RFPROC | RFPPWAIT | RFMEM | RFPGLOCK, &p2);
	if (error == 0) {
		PHOLD(p2);
		start_forked_proc(lp, p2);
		uap->sysmsg_fds[0] = p2->p_pid;
		uap->sysmsg_fds[1] = 0;
		PRELE(p2);
	}
	return error;
}

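/*
 * For both fork() and vfork() the parent receives the child pid in
 * sysmsg_fds[0] and 0 in sysmsg_fds[1].  The child does not return
 * through these functions at all: its userland return path is set up
 * by cpu_fork() (via lwp_fork()) and it begins running when
 * start_forked_proc() places it on the run queue.
 */
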
/*
 * Handle rforks.  An rfork may (1) operate on the current process without
 * creating a new one, (2) create a new process that shares the current
 * process's vmspace, signals, and/or descriptors, or (3) create a new
 * process that does not share these things (normal fork).
 *
 * Note that we only call start_forked_proc() if a new process is actually
 * created.
 *
 * rfork { int flags }
 */
int
sys_rfork(struct rfork_args *uap)
{
	struct lwp *lp = curthread->td_lwp;
	struct proc *p2;
	int error;

	if ((uap->flags & RFKERNELONLY) != 0)
		return (EINVAL);

	error = fork1(lp, uap->flags | RFPGLOCK, &p2);
	if (error == 0) {
		if (p2) {
			PHOLD(p2);
			start_forked_proc(lp, p2);
			uap->sysmsg_fds[0] = p2->p_pid;
			uap->sysmsg_fds[1] = 0;
			PRELE(p2);
		} else {
			uap->sysmsg_fds[0] = 0;
			uap->sysmsg_fds[1] = 0;
		}
	}
	return error;
}

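/*
 * Summary of the rfork-style flags used by these system calls and by
 * fork1() below (see <sys/unistd.h> for the authoritative list):
 *
 *	RFPROC		create a new process; without it, fork1() only
 *			divorces parts of the current process
 *	RFFDG		copy the file descriptor table (exclusive of RFCFDG)
 *	RFCFDG		start with a clean descriptor table
 *	RFMEM		share the vmspace (vfork)
 *	RFPPWAIT	parent sleeps until the child releases its
 *			resources (vfork)
 *	RFSIGSHARE	share signal actions
 *	RFTHREAD	share peers/process leader
 *	RFNOWAIT	reparent the child to the reaper
 *	RFPGLOCK	interlock against process group signals during
 *			the fork
 */
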
/*
 * Low level thread create used by pthreads.
 */
int
sys_lwp_create(struct lwp_create_args *uap)
{
	struct proc *p = curproc;
	struct lwp *lp;
	struct lwp_params params;
	int error;

	error = copyin(uap->params, &params, sizeof(params));
	if (error)
		goto fail2;

	lwkt_gettoken(&p->p_token);
	plimit_lwp_fork(p);	/* force exclusive access */
	lp = lwp_fork(curthread->td_lwp, p, RFPROC);
	error = cpu_prepare_lwp(lp, &params);
	if (error)
		goto fail;
	if (params.lwp_tid1 != NULL &&
	    (error = copyout(&lp->lwp_tid, params.lwp_tid1, sizeof(lp->lwp_tid))))
		goto fail;
	if (params.lwp_tid2 != NULL &&
	    (error = copyout(&lp->lwp_tid, params.lwp_tid2, sizeof(lp->lwp_tid))))
		goto fail;

	/*
	 * Now schedule the new lwp.
	 */
	p->p_usched->resetpriority(lp);
	crit_enter();
	lp->lwp_stat = LSRUN;
	p->p_usched->setrunqueue(lp);
	crit_exit();
	lwkt_reltoken(&p->p_token);

	return (0);

fail:
	lwp_rb_tree_RB_REMOVE(&p->p_lwp_tree, lp);
	--p->p_nthreads;
	/* lwp_dispose expects an exited lwp, and a held proc */
	atomic_set_int(&lp->lwp_mpflags, LWP_MP_WEXIT);
	lp->lwp_thread->td_flags |= TDF_EXITING;
	lwkt_remove_tdallq(lp->lwp_thread);
	PHOLD(p);
	biosched_done(lp->lwp_thread);
	dsched_exit_thread(lp->lwp_thread);
	lwp_dispose(lp);
	lwkt_reltoken(&p->p_token);
fail2:
	return (error);
}

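/*
 * fork1() below is the common backend for fork(), vfork() and rfork().
 * It creates the new process (or, when RFPROC is not set, merely
 * divorces parts of the current one) but does not make it runnable;
 * callers that create a process must follow up with start_forked_proc().
 */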
int nprocs = 1;		/* process 0 */

int
fork1(struct lwp *lp1, int flags, struct proc **procp)
{
	struct proc *p1 = lp1->lwp_proc;
	struct proc *p2;
	struct proc *pptr;
	struct pgrp *p1grp;
	struct pgrp *plkgrp;
	struct sysreaper *reap;
	uid_t uid;
	int ok, error;
	static int curfail = 0;
	static struct timeval lastfail;
	struct forklist *ep;
	struct filedesc_to_leader *fdtol;

	if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
		return (EINVAL);

	lwkt_gettoken(&p1->p_token);
	plkgrp = NULL;
	p2 = NULL;

	/*
	 * Here we don't create a new process, but we divorce
	 * certain parts of a process from itself.
	 */
	if ((flags & RFPROC) == 0) {
		/*
		 * This kind of stunt does not work anymore if
		 * there are native threads (lwps) running
		 */
		if (p1->p_nthreads != 1) {
			error = EINVAL;
			goto done;
		}

		vm_fork(p1, 0, flags);

		/*
		 * Close all file descriptors.
		 */
		if (flags & RFCFDG) {
			struct filedesc *fdtmp;
			fdtmp = fdinit(p1);
			fdfree(p1, fdtmp);
		}

		/*
		 * Unshare file descriptors (from parent.)
		 */
		if (flags & RFFDG) {
			if (p1->p_fd->fd_refcnt > 1) {
				struct filedesc *newfd;
				error = fdcopy(p1, &newfd);
				if (error != 0) {
					error = ENOMEM;
					goto done;
				}
				fdfree(p1, newfd);
			}
		}
		*procp = NULL;
		error = 0;
		goto done;
	}

	/*
	 * Interlock against process group signal delivery.  If signals
	 * are pending after the interlock is obtained we have to restart
	 * the system call to process the signals.  If we don't the child
	 * can miss a pgsignal (such as ^C) sent during the fork.
	 *
	 * We can't use CURSIG() here because it will process any STOPs
	 * and cause the process group lock to be held indefinitely.  If
	 * a STOP occurs, the fork will be restarted after the CONT.
	 */
	p1grp = p1->p_pgrp;
	if ((flags & RFPGLOCK) && (plkgrp = p1->p_pgrp) != NULL) {
		pgref(plkgrp);
		lockmgr(&plkgrp->pg_lock, LK_SHARED);
		if (CURSIG_NOBLOCK(lp1)) {
			error = ERESTART;
			goto done;
		}
	}

	/*
	 * Although process entries are dynamically created, we still keep
	 * a global limit on the maximum number we will create.  Don't allow
	 * a nonprivileged user to use the last ten processes; don't let root
	 * exceed the limit. The variable nprocs is the current number of
	 * processes, maxproc is the limit.
	 */
	uid = lp1->lwp_thread->td_ucred->cr_ruid;
	if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) {
		if (ppsratecheck(&lastfail, &curfail, 1))
			kprintf("maxproc limit exceeded by uid %d, please "
				"see tuning(7) and login.conf(5).\n", uid);
		tsleep(&forksleep, 0, "fork", hz / 2);
		error = EAGAIN;
		goto done;
	}

	/*
	 * Increment the nprocs resource before blocking can occur.  There
	 * are hard-limits as to the number of processes that can run.
	 */
	atomic_add_int(&nprocs, 1);

	/*
	 * Increment the count of procs running with this uid.  Don't allow
	 * a nonprivileged user to exceed their current limit.
	 */
	ok = chgproccnt(lp1->lwp_thread->td_ucred->cr_ruidinfo, 1,
		(uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
	if (!ok) {
		/*
		 * Back out the process count
		 */
		atomic_add_int(&nprocs, -1);
		if (ppsratecheck(&lastfail, &curfail, 1))
			kprintf("maxproc limit exceeded by uid %d, please "
				"see tuning(7) and login.conf(5).\n", uid);
		tsleep(&forksleep, 0, "fork", hz / 2);
		error = EAGAIN;
		goto done;
	}

	/*
	 * Allocate a new process, don't get fancy: zero the structure.
	 */
	p2 = kmalloc(sizeof(struct proc), M_PROC, M_WAITOK|M_ZERO);

	/*
	 * Core initialization.  SIDL is a safety state that protects the
	 * partially initialized process once it starts getting hooked
	 * into system structures and becomes addressable.
	 *
	 * We must be sure to acquire p2->p_token as well, we must hold it
	 * once the process is on the allproc list to avoid things such
	 * as competing modifications to p_flags.
	 */
	mycpu->gd_forkid += ncpus;
	p2->p_forkid = mycpu->gd_forkid + mycpu->gd_cpuid;
	p2->p_lasttid = -1;	/* first tid will be 0 */
	p2->p_stat = SIDL;

	/*
	 * NOTE: Process 0 will not have a reaper, but process 1 (init) and
	 *	 all other processes always will.
	 */
	if ((reap = p1->p_reaper) != NULL) {
		reaper_hold(reap);
		p2->p_reaper = reap;
	} else {
		p2->p_reaper = NULL;
	}

	RB_INIT(&p2->p_lwp_tree);
	spin_init(&p2->p_spin, "procfork1");
	lwkt_token_init(&p2->p_token, "proc");
	lwkt_gettoken(&p2->p_token);

	/*
	 * Setup linkage for kernel based threading XXX lwp.  Also add the
	 * process to the allproclist.
	 *
	 * The process structure is addressable after this point.
	 */
	if (flags & RFTHREAD) {
		p2->p_peers = p1->p_peers;
		p1->p_peers = p2;
		p2->p_leader = p1->p_leader;
	} else {
		p2->p_leader = p2;
	}
	proc_add_allproc(p2);

	/*
	 * Initialize the section which is copied verbatim from the parent.
	 */
	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	      ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));

	/*
	 * Duplicate sub-structures as needed.  Increase reference counts
	 * on shared objects.
	 *
	 * NOTE: because we are now on the allproc list it is possible for
	 *	 other consumers to gain temporary references to p2
	 *	 (p2->p_lock can change).
	 */
	if (p1->p_flags & P_PROFIL)
		startprofclock(p2);
	p2->p_ucred = crhold(lp1->lwp_thread->td_ucred);

	if (jailed(p2->p_ucred))
		p2->p_flags |= P_JAILED;

	if (p2->p_args)
		refcount_acquire(&p2->p_args->ar_ref);

	p2->p_usched = p1->p_usched;
	/* XXX: verify copy of the secondary iosched stuff */
	dsched_enter_proc(p2);

	if (flags & RFSIGSHARE) {
		p2->p_sigacts = p1->p_sigacts;
		refcount_acquire(&p2->p_sigacts->ps_refcnt);
	} else {
		p2->p_sigacts = kmalloc(sizeof(*p2->p_sigacts),
					M_SUBPROC, M_WAITOK);
		bcopy(p1->p_sigacts, p2->p_sigacts, sizeof(*p2->p_sigacts));
		refcount_init(&p2->p_sigacts->ps_refcnt, 1);
	}
	if (flags & RFLINUXTHPN)
		p2->p_sigparent = SIGUSR1;
	else
		p2->p_sigparent = SIGCHLD;

	/* bump references to the text vnode (for procfs) */
	p2->p_textvp = p1->p_textvp;
	if (p2->p_textvp)
		vref(p2->p_textvp);

	/* copy namecache handle to the text file */
	if (p1->p_textnch.mount)
		cache_copy(&p1->p_textnch, &p2->p_textnch);

	/*
	 * Handle file descriptors
	 */
	if (flags & RFCFDG) {
		p2->p_fd = fdinit(p1);
		fdtol = NULL;
	} else if (flags & RFFDG) {
		error = fdcopy(p1, &p2->p_fd);
		if (error != 0) {
			error = ENOMEM;
			goto done;
		}
		fdtol = NULL;
	} else {
		p2->p_fd = fdshare(p1);
		if (p1->p_fdtol == NULL) {
			p1->p_fdtol = filedesc_to_leader_alloc(NULL,
							       p1->p_leader);
		}
		if ((flags & RFTHREAD) != 0) {
			/*
			 * Shared file descriptor table and
			 * shared process leaders.
			 */
			fdtol = p1->p_fdtol;
			fdtol->fdl_refcount++;
		} else {
			/*
			 * Shared file descriptor table, and
			 * different process leaders
			 */
			fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p2);
		}
	}
	p2->p_fdtol = fdtol;
	p2->p_limit = plimit_fork(p1);

	/*
	 * Preserve some more flags in subprocess.  P_PROFIL has already
	 * been preserved.
	 */
	p2->p_flags |= p1->p_flags & P_SUGID;
	if (p1->p_session->s_ttyvp != NULL && (p1->p_flags & P_CONTROLT))
		p2->p_flags |= P_CONTROLT;
	if (flags & RFPPWAIT) {
		p2->p_flags |= P_PPWAIT;
		if (p1->p_upmap)
			p1->p_upmap->invfork = 1;
	}

	/*
	 * Inherit the virtual kernel structure (allows a virtual kernel
	 * to fork to simulate multiple cpus).
	 */
	if (p1->p_vkernel)
		vkernel_inherit(p1, p2);

	/*
	 * Once we are on a pglist we may receive signals.  XXX we might
	 * race a ^C being sent to the process group by not receiving it
	 * at all prior to this line.
	 */
	pgref(p1grp);
	lwkt_gettoken(&p1grp->pg_token);
	LIST_INSERT_AFTER(p1, p2, p_pglist);
	lwkt_reltoken(&p1grp->pg_token);

	/*
	 * Attach the new process to its parent.
	 *
	 * If RFNOWAIT is set, the newly created process becomes a child
	 * of the reaper (typically init).  This effectively disassociates
	 * the child from the parent.
	 *
	 * Temporarily hold pptr for the RFNOWAIT case to avoid ripouts.
	 */
	if (flags & RFNOWAIT) {
		pptr = reaper_get(reap);
		if (pptr == NULL) {
			pptr = initproc;
			PHOLD(pptr);
		}
	} else {
		pptr = p1;
	}
	p2->p_pptr = pptr;
	LIST_INIT(&p2->p_children);

	lwkt_gettoken(&pptr->p_token);
	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
	lwkt_reltoken(&pptr->p_token);

	if (flags & RFNOWAIT)
		PRELE(pptr);

	varsymset_init(&p2->p_varsymset, &p1->p_varsymset);
	callout_init_mp(&p2->p_ithandle);

#ifdef KTRACE
	/*
	 * Copy traceflag and tracefile if enabled.  If not inherited,
	 * these were zeroed above but we still could have a trace race
	 * so make sure p2's p_tracenode is NULL.
	 */
	if ((p1->p_traceflag & KTRFAC_INHERIT) && p2->p_tracenode == NULL) {
		p2->p_traceflag = p1->p_traceflag;
		p2->p_tracenode = ktrinherit(p1->p_tracenode);
	}
#endif

	/*
	 * This begins the section where we must prevent the parent
	 * from being swapped.
	 *
	 * Gets PRELE'd in the caller in start_forked_proc().
	 */
	PHOLD(p1);

	vm_fork(p1, p2, flags);

	/*
	 * Create the first lwp associated with the new proc.
	 * It will return via a different execution path later, directly
	 * into userland, after it was put on the runq by
	 * start_forked_proc().
	 */
	lwp_fork(lp1, p2, flags);

	if (flags == (RFFDG | RFPROC | RFPGLOCK)) {
		mycpu->gd_cnt.v_forks++;
		mycpu->gd_cnt.v_forkpages += p2->p_vmspace->vm_dsize +
					     p2->p_vmspace->vm_ssize;
	} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM | RFPGLOCK)) {
		mycpu->gd_cnt.v_vforks++;
		mycpu->gd_cnt.v_vforkpages += p2->p_vmspace->vm_dsize +
					      p2->p_vmspace->vm_ssize;
	} else if (p1 == &proc0) {
		mycpu->gd_cnt.v_kthreads++;
		mycpu->gd_cnt.v_kthreadpages += p2->p_vmspace->vm_dsize +
						p2->p_vmspace->vm_ssize;
	} else {
		mycpu->gd_cnt.v_rforks++;
		mycpu->gd_cnt.v_rforkpages += p2->p_vmspace->vm_dsize +
					      p2->p_vmspace->vm_ssize;
	}

	/*
	 * Both processes are set up, now check if any loadable modules want
	 * to adjust anything.
	 *   What if they have an error? XXX
	 */
	TAILQ_FOREACH(ep, &fork_list, next) {
		(*ep->function)(p1, p2, flags);
	}

	/*
	 * Set the start time.  Note that the process is not runnable.  The
	 * caller is responsible for making it runnable.
	 */
	microtime(&p2->p_start);
	p2->p_acflag = AFORK;

	/*
	 * tell any interested parties about the new process
	 */
	KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);

	/*
	 * Return child proc pointer to parent.
	 */
	*procp = p2;
	error = 0;
done:
	if (p2)
		lwkt_reltoken(&p2->p_token);
	lwkt_reltoken(&p1->p_token);
	if (plkgrp) {
		lockmgr(&plkgrp->pg_lock, LK_RELEASE);
		pgrel(plkgrp);
	}
	return (error);
}

static struct lwp *
lwp_fork(struct lwp *origlp, struct proc *destproc, int flags)
{
	globaldata_t gd = mycpu;
	struct lwp *lp;
	struct thread *td;

	lp = kmalloc(sizeof(struct lwp), M_LWP, M_WAITOK|M_ZERO);

	lp->lwp_proc = destproc;
	lp->lwp_vmspace = destproc->p_vmspace;
	lp->lwp_stat = LSRUN;
	bcopy(&origlp->lwp_startcopy, &lp->lwp_startcopy,
	    (unsigned) ((caddr_t)&lp->lwp_endcopy -
			(caddr_t)&lp->lwp_startcopy));
	lp->lwp_flags |= origlp->lwp_flags & LWP_ALTSTACK;
	/*
	 * Set cpbase to the last timeout that occurred (not the upcoming
	 * timeout).
	 *
	 * A critical section is required since a timer IPI can update
	 * scheduler specific data.
	 */
	crit_enter();
	lp->lwp_cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
	destproc->p_usched->heuristic_forking(origlp, lp);
	crit_exit();
	CPUMASK_ANDMASK(lp->lwp_cpumask, usched_mastermask);
	lwkt_token_init(&lp->lwp_token, "lwp_token");
	spin_init(&lp->lwp_spin, "lwptoken");

	/*
	 * Assign the thread to the current cpu to begin with so we
	 * can manipulate it.
	 */
	td = lwkt_alloc_thread(NULL, LWKT_THREAD_STACK, gd->gd_cpuid, 0);
	lp->lwp_thread = td;
	td->td_ucred = crhold(destproc->p_ucred);
	td->td_proc = destproc;
	td->td_lwp = lp;
	td->td_switch = cpu_heavy_switch;
#ifdef NO_LWKT_SPLIT_USERPRI
	lwkt_setpri(td, TDPRI_USER_NORM);
#else
	lwkt_setpri(td, TDPRI_KERN_USER);
#endif
	lwkt_set_comm(td, "%s", destproc->p_comm);

	/*
	 * cpu_fork will copy and update the pcb, set up the kernel stack,
	 * and make the child ready to run.
	 */
	cpu_fork(origlp, lp, flags);
	kqueue_init(&lp->lwp_kqueue, destproc->p_fd);

	/*
	 * Assign a TID to the lp.  Loop until the insert succeeds (returns
	 * NULL).
	 */
	lp->lwp_tid = destproc->p_lasttid;
	do {
		if (++lp->lwp_tid < 0)
			lp->lwp_tid = 1;
	} while (lwp_rb_tree_RB_INSERT(&destproc->p_lwp_tree, lp) != NULL);
	destproc->p_lasttid = lp->lwp_tid;
	destproc->p_nthreads++;

	/*
	 * This flag is set and never cleared.  It means that the process
	 * was threaded at some point.  Used to improve exit performance.
	 */
	destproc->p_flags |= P_MAYBETHREADED;

	return (lp);
}

/*
 * The next two functions are general routines to handle adding/deleting
 * items on the fork callout list.
 *
 * at_fork():
 * Take the arguments given and put them onto the fork callout list,
 * however first make sure that it's not already there.
 * Returns 0 on success or a standard error number.
 */
int
at_fork(forklist_fn function)
{
	struct forklist *ep;

#ifdef INVARIANTS
	/* let the programmer know if he's been stupid */
	if (rm_at_fork(function)) {
		kprintf("WARNING: fork callout entry (%p) already present\n",
			function);
	}
#endif
	ep = kmalloc(sizeof(*ep), M_ATFORK, M_WAITOK|M_ZERO);
	ep->function = function;
	TAILQ_INSERT_TAIL(&fork_list, ep, next);
	return (0);
}

/*
 * Scan the fork callout list for the given item and remove it.
 * Returns the number of items removed (0 or 1)
 */
int
rm_at_fork(forklist_fn function)
{
	struct forklist *ep;

	TAILQ_FOREACH(ep, &fork_list, next) {
		if (ep->function == function) {
			TAILQ_REMOVE(&fork_list, ep, next);
			kfree(ep, M_ATFORK);
			return(1);
		}
	}
	return (0);
}
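
/*
 * Usage sketch (illustrative; example_fork_hook is hypothetical): a
 * module wanting per-fork notification registers a forklist_fn, which
 * fork1() invokes as (*function)(p1, p2, flags) once both processes
 * are set up:
 *
 *	static void
 *	example_fork_hook(struct proc *p1, struct proc *p2, int flags)
 *	{
 *		...
 *	}
 *
 *	at_fork(example_fork_hook);	(typically at module load)
 *	rm_at_fork(example_fork_hook);	(at module unload)
 */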

/*
 * Add a forked process to the run queue after any remaining setup, such
 * as setting the fork handler, has been completed.
 *
 * p2 is held by the caller.
 */
void
start_forked_proc(struct lwp *lp1, struct proc *p2)
{
	struct lwp *lp2 = ONLY_LWP_IN_PROC(p2);
	int pflags;

	/*
	 * Move from SIDL to RUN queue, and activate the process's thread.
	 * Activation of the thread effectively makes the process "a"
	 * current process, so we do not setrunqueue().
	 *
	 * YYY setrunqueue works here but we should clean up the trampoline
	 * code so we just schedule the LWKT thread and let the trampoline
	 * deal with the userland scheduler on return to userland.
	 */
	KASSERT(p2->p_stat == SIDL,
	    ("cannot start forked process, bad status: %p", p2));
	p2->p_usched->resetpriority(lp2);
	crit_enter();
	p2->p_stat = SACTIVE;
	lp2->lwp_stat = LSRUN;
	p2->p_usched->setrunqueue(lp2);
	crit_exit();

	/*
	 * Now can be swapped.
	 */
	PRELE(lp1->lwp_proc);

	/*
	 * Preserve synchronization semantics of vfork.  P_PPWAIT is set in
	 * the child until it has retired the parent's resources.  The parent
	 * must wait for the flag to be cleared by the child.
	 *
	 * Interlock the flag/tsleep with atomic ops to avoid unnecessary
	 * p_token conflicts.
	 *
	 * XXX Is this use of an atomic op on a field that is not normally
	 *     manipulated with atomic ops ok?
	 */
	while ((pflags = p2->p_flags) & P_PPWAIT) {
		cpu_ccfence();
		tsleep_interlock(lp1->lwp_proc, 0);
		if (atomic_cmpset_int(&p2->p_flags, pflags, pflags))
			tsleep(lp1->lwp_proc, PINTERLOCKED, "ppwait", 0);
	}
}

/*
 * procctl (idtype_t idtype, id_t id, int cmd, void *arg)
 */
int
sys_procctl(struct procctl_args *uap)
{
	struct proc *p = curproc;
	struct proc *p2;
	struct sysreaper *reap;
	union reaper_info udata;
	int error;

	if (uap->idtype != P_PID || uap->id != (id_t)p->p_pid)
		return EINVAL;

	switch(uap->cmd) {
	case PROC_REAP_ACQUIRE:
		lwkt_gettoken(&p->p_token);
		reap = kmalloc(sizeof(*reap), M_REAPER, M_WAITOK|M_ZERO);
		if (p->p_reaper == NULL || p->p_reaper->p != p) {
			reaper_init(p, reap);
			error = 0;
		} else {
			kfree(reap, M_REAPER);
			error = EALREADY;
		}
		lwkt_reltoken(&p->p_token);
		break;
	case PROC_REAP_RELEASE:
		lwkt_gettoken(&p->p_token);
release_again:
		reap = p->p_reaper;
		KKASSERT(reap != NULL);
		if (reap->p == p) {
			reaper_hold(reap);	/* in case of thread race */
			lockmgr(&reap->lock, LK_EXCLUSIVE);
			if (reap->p != p) {
				lockmgr(&reap->lock, LK_RELEASE);
				reaper_drop(reap);
				goto release_again;
			}
			reap->p = NULL;
			p->p_reaper = reap->parent;
			if (p->p_reaper)
				reaper_hold(p->p_reaper);
			lockmgr(&reap->lock, LK_RELEASE);
			reaper_drop(reap);	/* our ref */
			reaper_drop(reap);	/* old p_reaper ref */
			error = 0;
		} else {
			error = ENOTCONN;
		}
		lwkt_reltoken(&p->p_token);
		break;
	case PROC_REAP_STATUS:
		bzero(&udata, sizeof(udata));
		lwkt_gettoken_shared(&p->p_token);
		if ((reap = p->p_reaper) != NULL && reap->p == p) {
			udata.status.flags = reap->flags;
			udata.status.refs = reap->refs - 1; /* minus ours */
		}
		p2 = LIST_FIRST(&p->p_children);
		udata.status.pid_head = p2 ? p2->p_pid : -1;
		lwkt_reltoken(&p->p_token);

		if (uap->data) {
			error = copyout(&udata, uap->data,
					sizeof(udata.status));
		} else {
			error = 0;
		}
		break;
	default:
		error = EINVAL;
		break;
	}
	return error;
}

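/*
 * Usage sketch (illustrative userland calls, not part of this file):
 * a process makes itself the reaper for its descendants, queries the
 * state, and later releases the role.  Only the calling process may be
 * targeted (P_PID with its own pid):
 *
 *	union reaper_info info;
 *
 *	procctl(P_PID, getpid(), PROC_REAP_ACQUIRE, NULL);
 *	procctl(P_PID, getpid(), PROC_REAP_STATUS, &info);
 *	procctl(P_PID, getpid(), PROC_REAP_RELEASE, NULL);
 */
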
/*
 * Bump ref on reaper, preventing destruction
 */
void
reaper_hold(struct sysreaper *reap)
{
	KKASSERT(reap->refs > 0);
	refcount_acquire(&reap->refs);
}

/*
 * Drop ref on reaper, destroy the structure on the 1->0
 * transition and loop on the parent.
 */
void
reaper_drop(struct sysreaper *next)
{
	struct sysreaper *reap;

	while ((reap = next) != NULL) {
		if (refcount_release(&reap->refs)) {
			next = reap->parent;
			KKASSERT(reap->p == NULL);
			reap->parent = NULL;
			kfree(reap, M_REAPER);
		} else {
			next = NULL;
		}
	}
}

/*
 * Initialize a static or newly allocated reaper structure
 */
void
reaper_init(struct proc *p, struct sysreaper *reap)
{
	reap->parent = p->p_reaper;
	reap->p = p;
	if (p == initproc) {
		reap->flags = REAPER_STAT_OWNED | REAPER_STAT_REALINIT;
		reap->refs = 2;
	} else {
		reap->flags = REAPER_STAT_OWNED;
		reap->refs = 1;
	}
	lockinit(&reap->lock, "subrp", 0, 0);
	cpu_sfence();
	p->p_reaper = reap;
}

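/*
 * Reapers form a tree linked by their parent pointers; a process's
 * p_reaper points at the nearest enclosing reaper.  reaper_get() walks
 * this tree upward to find a live owner, pruning dead interior nodes
 * (reap->p == NULL) as it goes, while reaper_drop() frees a node on
 * the last reference and continues on the parent.
 */
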
/*
 * Called with p->p_token held during exit.
 *
 * This is a bit simpler than RELEASE because there are no threads remaining
 * to race.  We only release if we own the reaper; the exit code will handle
 * the final p_reaper release.
 */
struct sysreaper *
reaper_exit(struct proc *p)
{
	struct sysreaper *reap;

	/*
	 * Release acquired reaper
	 */
	if ((reap = p->p_reaper) != NULL && reap->p == p) {
		lockmgr(&reap->lock, LK_EXCLUSIVE);
		p->p_reaper = reap->parent;
		if (p->p_reaper)
			reaper_hold(p->p_reaper);
		reap->p = NULL;
		lockmgr(&reap->lock, LK_RELEASE);
		reaper_drop(reap);
	}

	/*
	 * Return and clear reaper (caller is holding p_token for us)
	 * (reap->p does not equal p).  Caller must drop it.
	 */
	if ((reap = p->p_reaper) != NULL) {
		p->p_reaper = NULL;
	}
	return reap;
}

/*
 * Return a held (PHOLD) process representing the reaper for process (p).
 * NULL should not normally be returned.  Caller should PRELE() the returned
 * reaper process when finished.
 *
 * Remove dead internal nodes while we are at it.
 *
 * Process (p)'s token must be held on call.
 * The returned process's token is NOT acquired by this routine.
 */
struct proc *
reaper_get(struct sysreaper *reap)
{
	struct sysreaper *next;
	struct proc *reproc;

	if (reap == NULL)
		return NULL;

	/*
	 * Extra hold for loop
	 */
	reaper_hold(reap);

	while (reap) {
		lockmgr(&reap->lock, LK_SHARED);
		if (reap->p) {
			/*
			 * Probable reaper
			 */
			if (reap->p) {
				reproc = reap->p;
				PHOLD(reproc);
				lockmgr(&reap->lock, LK_RELEASE);
				reaper_drop(reap);
				return reproc;
			}

			/*
			 * Raced, try again
			 */
			lockmgr(&reap->lock, LK_RELEASE);
			continue;
		}

		/*
		 * Traverse upwards in the reaper topology, destroy
		 * dead internal nodes when possible.
		 *
		 * NOTE: Our ref on next means that a dead node should
		 *	 have 2 (ours and reap->parent's).
		 */
		next = reap->parent;
		while (next) {
			reaper_hold(next);
			if (next->refs == 2 && next->p == NULL) {
				lockmgr(&reap->lock, LK_RELEASE);
				lockmgr(&reap->lock, LK_EXCLUSIVE);
				if (next->refs == 2 &&
				    reap->parent == next &&
				    next->p == NULL) {
					/*
					 * reap->parent inherits ref from next.
					 */
					reap->parent = next->parent;
					next->parent = NULL;
					reaper_drop(next);	/* ours */
					reaper_drop(next);	/* old parent */
					next = reap->parent;
					continue;	/* possible chain */
				}
			}
			break;
		}
		lockmgr(&reap->lock, LK_RELEASE);
		reaper_drop(reap);
		reap = next;
	}
	return NULL;
}