Commit | Line | Data |
---|---|---|
984263bc MD |
1 | /* |
2 | * Copyright (c) 1982, 1986, 1989, 1991, 1993 | |
3 | * The Regents of the University of California. All rights reserved. | |
4 | * (c) UNIX System Laboratories, Inc. | |
5 | * All or some portions of this file are derived from material licensed | |
6 | * to the University of California by American Telephone and Telegraph | |
7 | * Co. or Unix System Laboratories, Inc. and are reproduced herein with | |
8 | * the permission of UNIX System Laboratories, Inc. | |
9 | * | |
10 | * Redistribution and use in source and binary forms, with or without | |
11 | * modification, are permitted provided that the following conditions | |
12 | * are met: | |
13 | * 1. Redistributions of source code must retain the above copyright | |
14 | * notice, this list of conditions and the following disclaimer. | |
15 | * 2. Redistributions in binary form must reproduce the above copyright | |
16 | * notice, this list of conditions and the following disclaimer in the | |
17 | * documentation and/or other materials provided with the distribution. | |
dc71b7ab | 18 | * 3. Neither the name of the University nor the names of its contributors |
984263bc MD |
19 | * may be used to endorse or promote products derived from this software |
20 | * without specific prior written permission. | |
21 | * | |
22 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
23 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
24 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
25 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
26 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
27 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
28 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
29 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
30 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
31 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
32 | * SUCH DAMAGE. | |
33 | * | |
34 | * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 | |
5bc7cd8d | 35 | * $FreeBSD: src/sys/kern/kern_fork.c,v 1.72.2.14 2003/06/26 04:15:10 silby Exp $ |
984263bc MD |
36 | */ |
37 | ||
38 | #include "opt_ktrace.h" | |
39 | ||
40 | #include <sys/param.h> | |
41 | #include <sys/systm.h> | |
80d831e1 | 42 | #include <sys/sysmsg.h> |
984263bc MD |
43 | #include <sys/filedesc.h> |
44 | #include <sys/kernel.h> | |
45 | #include <sys/sysctl.h> | |
46 | #include <sys/malloc.h> | |
47 | #include <sys/proc.h> | |
48 | #include <sys/resourcevar.h> | |
49 | #include <sys/vnode.h> | |
50 | #include <sys/acct.h> | |
51 | #include <sys/ktrace.h> | |
dfc1fc13 EN |
52 | #include <sys/unistd.h> |
53 | #include <sys/jail.h> | |
0d9899e7 | 54 | #include <sys/lwp.h> |
984263bc MD |
55 | |
56 | #include <vm/vm.h> | |
57 | #include <sys/lock.h> | |
58 | #include <vm/pmap.h> | |
59 | #include <vm/vm_map.h> | |
60 | #include <vm/vm_extern.h> | |
984263bc MD |
61 | |
62 | #include <sys/vmmeter.h> | |
19bfc8ab | 63 | #include <sys/refcount.h> |
e43a034f | 64 | #include <sys/thread2.h> |
b1b4e5a6 | 65 | #include <sys/signal2.h> |
8f1f6170 | 66 | #include <sys/spinlock2.h> |
984263bc | 67 | |
8c72e3d5 AH |
68 | #include <sys/dsched.h> |
69 | ||
984263bc | 70 | static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback"); |
6e2a912c | 71 | static MALLOC_DEFINE(M_REAPER, "reaper", "process reapers"); |
984263bc MD |
72 | |
73 | /* | |
74 | * These are the stuctures used to create a callout list for things to do | |
75 | * when forking a process | |
76 | */ | |
77 | struct forklist { | |
78 | forklist_fn function; | |
79 | TAILQ_ENTRY(forklist) next; | |
80 | }; | |
81 | ||
82 | TAILQ_HEAD(forklist_head, forklist); | |
83 | static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list); | |
84 | ||
8e99ae46 | 85 | static struct lwp *lwp_fork1(struct lwp *, struct proc *, int flags, |
1eb8c611 | 86 | const cpumask_t *mask); |
8e99ae46 MD |
87 | static void lwp_fork2(struct lwp *lp1, struct proc *destproc, |
88 | struct lwp *lp2, int flags); | |
1eb8c611 SZ |
89 | static int lwp_create1(struct lwp_params *params, |
90 | const cpumask_t *mask); | |
a73d7792 | 91 | static struct lock reaper_lock = LOCK_INITIALIZER("reapgl", 0, 0); |
13d13d89 | 92 | |
984263bc MD |
93 | int forksleep; /* Place for fork1() to sleep on. */ |
94 | ||
3e291793 MD |
95 | /* |
96 | * Red-Black tree support for LWPs | |
97 | */ | |
98 | ||
99 | static int | |
100 | rb_lwp_compare(struct lwp *lp1, struct lwp *lp2) | |
101 | { | |
102 | if (lp1->lwp_tid < lp2->lwp_tid) | |
103 | return(-1); | |
104 | if (lp1->lwp_tid > lp2->lwp_tid) | |
105 | return(1); | |
106 | return(0); | |
107 | } | |
108 | ||
109 | RB_GENERATE2(lwp_rb_tree, lwp, u.lwp_rbnode, rb_lwp_compare, lwpid_t, lwp_tid); | |
110 | ||
31efdff0 MD |
111 | /* |
112 | * When forking, memory underpinning umtx-supported mutexes may be set | |
113 | * COW causing the physical address to change. We must wakeup any threads | |
114 | * blocked on the physical address to allow them to re-resolve their VM. | |
afd7f124 MD |
115 | * |
116 | * (caller is holding p->p_token) | |
31efdff0 MD |
117 | */ |
118 | static void | |
119 | wake_umtx_threads(struct proc *p1) | |
120 | { | |
121 | struct lwp *lp; | |
122 | struct thread *td; | |
123 | ||
124 | RB_FOREACH(lp, lwp_rb_tree, &p1->p_lwp_tree) { | |
125 | td = lp->lwp_thread; | |
126 | if (td && (td->td_flags & TDF_TSLEEPQ) && | |
127 | (td->td_wdomain & PDOMAIN_MASK) == PDOMAIN_UMTX) { | |
128 | wakeup_domain(td->td_wchan, PDOMAIN_UMTX); | |
129 | } | |
130 | } | |
131 | } | |
132 | ||
3919ced0 | 133 | /* |
51818c08 | 134 | * fork() system call |
3919ced0 | 135 | */ |
984263bc | 136 | int |
80d831e1 | 137 | sys_fork(struct sysmsg *sysmsg, const struct fork_args *uap) |
984263bc | 138 | { |
553ea3c8 | 139 | struct lwp *lp = curthread->td_lwp; |
984263bc | 140 | struct proc *p2; |
41c20dac | 141 | int error; |
984263bc | 142 | |
167e6ecb | 143 | error = fork1(lp, RFFDG | RFPROC | RFPGLOCK, &p2); |
984263bc | 144 | if (error == 0) { |
de7ac1d6 | 145 | PHOLD(p2); |
553ea3c8 | 146 | start_forked_proc(lp, p2); |
80d831e1 MD |
147 | sysmsg->sysmsg_fds[0] = p2->p_pid; |
148 | sysmsg->sysmsg_fds[1] = 0; | |
de7ac1d6 | 149 | PRELE(p2); |
984263bc MD |
150 | } |
151 | return error; | |
152 | } | |
153 | ||
3919ced0 | 154 | /* |
51818c08 | 155 | * vfork() system call |
3919ced0 | 156 | */ |
984263bc | 157 | int |
80d831e1 | 158 | sys_vfork(struct sysmsg *sysmsg, const struct vfork_args *uap) |
984263bc | 159 | { |
553ea3c8 | 160 | struct lwp *lp = curthread->td_lwp; |
984263bc | 161 | struct proc *p2; |
41c20dac | 162 | int error; |
984263bc | 163 | |
167e6ecb | 164 | error = fork1(lp, RFFDG | RFPROC | RFPPWAIT | RFMEM | RFPGLOCK, &p2); |
984263bc | 165 | if (error == 0) { |
de7ac1d6 | 166 | PHOLD(p2); |
553ea3c8 | 167 | start_forked_proc(lp, p2); |
80d831e1 MD |
168 | sysmsg->sysmsg_fds[0] = p2->p_pid; |
169 | sysmsg->sysmsg_fds[1] = 0; | |
de7ac1d6 | 170 | PRELE(p2); |
984263bc MD |
171 | } |
172 | return error; | |
173 | } | |
174 | ||
f61c1ff1 MD |
175 | /* |
176 | * Handle rforks. An rfork may (1) operate on the current process without | |
177 | * creating a new one, (2) create a new process that shares the current process's | |
178 | * vmspace, signals, and/or descriptors, or (3) create a new process that does | |
179 | * not share these things (normal fork). | |
180 | * | |
181 | * Note that we only call start_forked_proc() if a new process is actually | |
182 | * created. | |
183 | * | |
184 | * rfork { int flags } | |
185 | */ | |
984263bc | 186 | int |
80d831e1 | 187 | sys_rfork(struct sysmsg *sysmsg, const struct rfork_args *uap) |
984263bc | 188 | { |
553ea3c8 | 189 | struct lwp *lp = curthread->td_lwp; |
984263bc | 190 | struct proc *p2; |
41c20dac | 191 | int error; |
984263bc | 192 | |
6654fbcb MD |
193 | if ((uap->flags & RFKERNELONLY) != 0) |
194 | return (EINVAL); | |
195 | ||
167e6ecb | 196 | error = fork1(lp, uap->flags | RFPGLOCK, &p2); |
984263bc | 197 | if (error == 0) { |
de7ac1d6 MD |
198 | if (p2) { |
199 | PHOLD(p2); | |
553ea3c8 | 200 | start_forked_proc(lp, p2); |
80d831e1 MD |
201 | sysmsg->sysmsg_fds[0] = p2->p_pid; |
202 | sysmsg->sysmsg_fds[1] = 0; | |
de7ac1d6 MD |
203 | PRELE(p2); |
204 | } else { | |
80d831e1 MD |
205 | sysmsg->sysmsg_fds[0] = 0; |
206 | sysmsg->sysmsg_fds[1] = 0; | |
de7ac1d6 | 207 | } |
984263bc MD |
208 | } |
209 | return error; | |
210 | } | |
211 | ||
1eb8c611 SZ |
212 | static int |
213 | lwp_create1(struct lwp_params *uprm, const cpumask_t *umask) | |
91bd9c1e SS |
214 | { |
215 | struct proc *p = curproc; | |
216 | struct lwp *lp; | |
217 | struct lwp_params params; | |
1eb8c611 | 218 | cpumask_t *mask = NULL, mask0; |
91bd9c1e SS |
219 | int error; |
220 | ||
1eb8c611 | 221 | error = copyin(uprm, ¶ms, sizeof(params)); |
91bd9c1e SS |
222 | if (error) |
223 | goto fail2; | |
224 | ||
1eb8c611 SZ |
225 | if (umask != NULL) { |
226 | error = copyin(umask, &mask0, sizeof(mask0)); | |
227 | if (error) | |
228 | goto fail2; | |
229 | CPUMASK_ANDMASK(mask0, smp_active_mask); | |
230 | if (CPUMASK_TESTNZERO(mask0)) | |
231 | mask = &mask0; | |
232 | } | |
233 | ||
b5c4d81f | 234 | lwkt_gettoken(&p->p_token); |
8f1f6170 | 235 | plimit_lwp_fork(p); /* force exclusive access */ |
8e99ae46 MD |
236 | lp = lwp_fork1(curthread->td_lwp, p, RFPROC | RFMEM, mask); |
237 | lwp_fork2(curthread->td_lwp, p, lp, RFPROC | RFMEM); | |
91bd9c1e | 238 | error = cpu_prepare_lwp(lp, ¶ms); |
7b925b71 MD |
239 | if (error) |
240 | goto fail; | |
b44473af SW |
241 | if (params.lwp_tid1 != NULL && |
242 | (error = copyout(&lp->lwp_tid, params.lwp_tid1, sizeof(lp->lwp_tid)))) | |
91bd9c1e | 243 | goto fail; |
b44473af SW |
244 | if (params.lwp_tid2 != NULL && |
245 | (error = copyout(&lp->lwp_tid, params.lwp_tid2, sizeof(lp->lwp_tid)))) | |
91bd9c1e SS |
246 | goto fail; |
247 | ||
248 | /* | |
8f1f6170 | 249 | * Now schedule the new lwp. |
91bd9c1e SS |
250 | */ |
251 | p->p_usched->resetpriority(lp); | |
252 | crit_enter(); | |
253 | lp->lwp_stat = LSRUN; | |
254 | p->p_usched->setrunqueue(lp); | |
255 | crit_exit(); | |
b5c4d81f | 256 | lwkt_reltoken(&p->p_token); |
91bd9c1e SS |
257 | |
258 | return (0); | |
259 | ||
260 | fail: | |
6214ede1 SZ |
261 | /* |
262 | * Make sure no one is using this lwp, before it is removed from | |
263 | * the tree. If we didn't wait it here, lwp tree iteration with | |
264 | * blocking operation would be broken. | |
265 | */ | |
266 | while (lp->lwp_lock > 0) | |
267 | tsleep(lp, 0, "lwpfail", 1); | |
3e291793 | 268 | lwp_rb_tree_RB_REMOVE(&p->p_lwp_tree, lp); |
0b26dde3 | 269 | --p->p_nthreads; |
e3161323 | 270 | /* lwp_dispose expects an exited lwp, and a held proc */ |
4643740a | 271 | atomic_set_int(&lp->lwp_mpflags, LWP_MP_WEXIT); |
e3161323 | 272 | lp->lwp_thread->td_flags |= TDF_EXITING; |
7b925b71 | 273 | lwkt_remove_tdallq(lp->lwp_thread); |
e3161323 | 274 | PHOLD(p); |
18af8f55 MD |
275 | biosched_done(lp->lwp_thread); |
276 | dsched_exit_thread(lp->lwp_thread); | |
91bd9c1e | 277 | lwp_dispose(lp); |
b5c4d81f | 278 | lwkt_reltoken(&p->p_token); |
91bd9c1e SS |
279 | fail2: |
280 | return (error); | |
281 | } | |
984263bc | 282 | |
1eb8c611 SZ |
283 | /* |
284 | * Low level thread create used by pthreads. | |
285 | */ | |
286 | int | |
80d831e1 | 287 | sys_lwp_create(struct sysmsg *sysmsg, const struct lwp_create_args *uap) |
1eb8c611 SZ |
288 | { |
289 | ||
290 | return (lwp_create1(uap->params, NULL)); | |
291 | } | |
292 | ||
293 | int | |
80d831e1 | 294 | sys_lwp_create2(struct sysmsg *sysmsg, const struct lwp_create2_args *uap) |
1eb8c611 SZ |
295 | { |
296 | ||
297 | return (lwp_create1(uap->params, uap->mask)); | |
298 | } | |
299 | ||
984263bc | 300 | int nprocs = 1; /* process 0 */ |
984263bc MD |
301 | |
/*
 * fork1() - common backend for the fork(), vfork() and rfork() system
 * calls in this file.
 *
 * lp1   - lwp performing the fork; lp1->lwp_proc is the parent (p1).
 * flags - RF* flags selecting what is shared, copied or reset.
 * procp - on success receives the new process, or NULL when RFPROC was
 *         clear and only the calling process was modified.
 *
 * Returns 0 on success or an errno value:
 *   EINVAL   - RFFDG and RFCFDG are both set, or RFPROC is clear while
 *              the process has p_nthreads != 1.
 *   ERESTART - RFPGLOCK is set and CURSIG_NOBLOCK() reports a pending
 *              signal once the process group lock is held.
 *   EAGAIN   - the global (maxproc) or per-uid process limit was hit;
 *              returned after a tsleep() on forksleep with a timeout
 *              of hz / 2.
 *   ENOMEM   - fdcopy() failed.
 *
 * p1->p_token is held for the whole call.  When a child is allocated
 * its p_token is acquired as well and released at the "done" label.
 * On success the parent has been PHOLD()en.
 *
 * NOTE(review): *procp is only written on the success paths.
 */
302 | int | |
553ea3c8 | 303 | fork1(struct lwp *lp1, int flags, struct proc **procp) |
984263bc | 304 | { |
553ea3c8 | 305 | struct proc *p1 = lp1->lwp_proc; |
de7ac1d6 MD |
306 | struct proc *p2; |
307 | struct proc *pptr; | |
58c2553a MD |
308 | struct pgrp *p1grp; |
309 | struct pgrp *plkgrp; | |
8e99ae46 | 310 | struct lwp *lp2; |
f0d55ae9 | 311 | struct sysreaper *reap; |
984263bc | 312 | uid_t uid; |
167e6ecb | 313 | int ok, error; |
51e64ff2 | 314 | static int curfail = 0; |
5bc7cd8d | 315 | static struct timeval lastfail; |
984263bc MD |
316 | struct forklist *ep; |
317 | struct filedesc_to_leader *fdtol; | |
318 | ||
319 | if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) | |
320 | return (EINVAL); | |
321 | ||
b5c4d81f | 322 | lwkt_gettoken(&p1->p_token); |
58c2553a | 323 | plkgrp = NULL; |
de7ac1d6 | 324 | p2 = NULL; |
b5c4d81f | 325 | 
984263bc MD |
326 | /* |
327 | * Here we don't create a new process, but we divorce | |
328 | * certain parts of a process from itself. | |
329 | */ | |
330 | if ((flags & RFPROC) == 0) { | |
13d13d89 SS |
331 | /* |
332 | * This kind of stunt does not work anymore if | |
333 | * there are native threads (lwps) running | |
334 | */ | |
b5c4d81f MD |
335 | if (p1->p_nthreads != 1) { |
336 | error = EINVAL; | |
337 | goto done; | |
338 | } | |
13d13d89 | 339 | 
4aa6d05c | 340 | vm_fork(p1, NULL, NULL, flags); |
afd7f124 MD |
341 | if ((flags & RFMEM) == 0) |
342 | wake_umtx_threads(p1); | |
984263bc MD |
343 | |
344 | /* | |
345 | * Close all file descriptors. | |
346 | */ | |
347 | if (flags & RFCFDG) { | |
348 | struct filedesc *fdtmp; | |
349 | fdtmp = fdinit(p1); | |
0a4a9c77 | 350 | fdfree(p1, fdtmp); |
984263bc MD |
351 | } |
352 | ||
353 | /* | |
354 | * Unshare file descriptors (from parent.) | |
355 | */ | |
356 | if (flags & RFFDG) { | |
357 | if (p1->p_fd->fd_refcnt > 1) { | |
358 | struct filedesc *newfd; | |
2994659f VS |
359 | error = fdcopy(p1, &newfd); |
360 | if (error != 0) { | |
361 | error = ENOMEM; | |
362 | goto done; | |
363 | } | |
0a4a9c77 | 364 | fdfree(p1, newfd); |
984263bc MD |
365 | } |
366 | } | |
367 | *procp = NULL; | |
b5c4d81f MD |
368 | error = 0; |
369 | goto done; | |
984263bc MD |
370 | } |
371 | ||
167e6ecb MD |
372 | /* |
373 | * Interlock against process group signal delivery. If signals | |
374 | * are pending after the interlock is obtained we have to restart | |
375 | * the system call to process the signals. If we don't the child | |
376 | * can miss a pgsignal (such as ^C) sent during the fork. | |
377 | * | |
378 | * We can't use CURSIG() here because it will process any STOPs | |
379 | * and cause the process group lock to be held indefinitely. If | |
380 | * a STOP occurs, the fork will be restarted after the CONT. | |
381 | */ | |
58c2553a MD |
382 | p1grp = p1->p_pgrp; |
383 | if ((flags & RFPGLOCK) && (plkgrp = p1->p_pgrp) != NULL) { | |
384 | pgref(plkgrp); | |
385 | lockmgr(&plkgrp->pg_lock, LK_SHARED); | |
f6e73860 | 386 | if (CURSIG_NOBLOCK(lp1)) { |
167e6ecb MD |
387 | error = ERESTART; |
388 | goto done; | |
389 | } | |
390 | } | |
391 | ||
984263bc MD |
392 | /* |
393 | * Although process entries are dynamically created, we still keep | |
394 | * a global limit on the maximum number we will create. Don't allow | |
395 | * a nonprivileged user to use the last ten processes; don't let root | |
396 | * exceed the limit. The variable nprocs is the current number of | |
397 | * processes, maxproc is the limit. | |
398 | */ | |
9910d07b | 399 | uid = lp1->lwp_thread->td_ucred->cr_ruid; |
984263bc | 400 | if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) { |
5bc7cd8d | 401 | if (ppsratecheck(&lastfail, &curfail, 1)) |
6ea70f76 | 402 | kprintf("maxproc limit exceeded by uid %d, please " |
5bc7cd8d | 403 | "see tuning(7) and login.conf(5).\n", uid); |
377d4740 | 404 | tsleep(&forksleep, 0, "fork", hz / 2); |
167e6ecb MD |
405 | error = EAGAIN; |
406 | goto done; | |
984263bc | 407 | } |
8c2bce60 | 408 | 
984263bc MD |
409 | /* |
410 | * Increment the nprocs resource before blocking can occur. There | |
411 | * are hard-limits as to the number of processes that can run. | |
412 | */ | |
8c2bce60 | 413 | atomic_add_int(&nprocs, 1); |
984263bc MD |
414 | |
415 | /* | |
e7e1189f MD |
416 | * Increment the count of procs running with this uid. This also |
417 | * applies to root. | |
984263bc | 418 | */ |
9910d07b | 419 | ok = chgproccnt(lp1->lwp_thread->td_ucred->cr_ruidinfo, 1, |
e7e1189f | 420 | plimit_getadjvalue(RLIMIT_NPROC)); |
984263bc MD |
421 | if (!ok) { |
422 | /* | |
423 | * Back out the process count | |
424 | */ | |
8c2bce60 | 425 | atomic_add_int(&nprocs, -1); |
e7e1189f MD |
426 | if (ppsratecheck(&lastfail, &curfail, 1)) { |
427 | kprintf("maxproc limit of %jd " | |
428 | "exceeded by \"%s\" uid %d, " | |
429 | "please see tuning(7) and login.conf(5).\n", | |
430 | plimit_getadjvalue(RLIMIT_NPROC), | |
431 | p1->p_comm, | |
432 | uid); | |
433 | } | |
377d4740 | 434 | tsleep(&forksleep, 0, "fork", hz / 2); |
167e6ecb MD |
435 | error = EAGAIN; |
436 | goto done; | |
984263bc MD |
437 | } |
438 | ||
de7ac1d6 MD |
439 | /* |
440 | * Allocate a new process, don't get fancy: zero the structure. | |
441 | */ | |
37733243 | 442 | p2 = kmalloc(sizeof(struct proc), M_PROC, M_WAITOK|M_ZERO); |
984263bc MD |
443 | |
444 | /* | |
de7ac1d6 MD |
445 | * Core initialization. SIDL is a safety state that protects the |
446 | * partially initialized process once it starts getting hooked | |
447 | * into system structures and becomes addressable. | |
448 | * | |
449 | * We must be sure to acquire p2->p_token as well, we must hold it | |
450 | * once the process is on the allproc list to avoid things such | |
451 | * as competing modifications to p_flags. | |
984263bc | 452 | */ |
0adbcbd6 MD |
453 | mycpu->gd_forkid += ncpus; |
454 | p2->p_forkid = mycpu->gd_forkid + mycpu->gd_cpuid; | |
526c5c2b | 455 | p2->p_lasttid = 0; /* first tid will be 1 */ |
de7ac1d6 | 456 | p2->p_stat = SIDL; |
984263bc | 457 | |
6e2a912c MD |
458 | /* |
459 | * NOTE: Process 0 will not have a reaper, but process 1 (init) and | |
460 | * all other processes always will. | |
461 | */ | |
f0d55ae9 MD |
462 | if ((reap = p1->p_reaper) != NULL) { |
463 | reaper_hold(reap); | |
464 | p2->p_reaper = reap; | |
465 | } else { | |
466 | p2->p_reaper = NULL; | |
467 | } | |
6e2a912c | 468 | 
3e291793 | 469 | RB_INIT(&p2->p_lwp_tree); |
ba87a4ab | 470 | spin_init(&p2->p_spin, "procfork1"); |
8c2bce60 | 471 | lwkt_token_init(&p2->p_token, "proc"); |
de7ac1d6 | 472 | lwkt_gettoken(&p2->p_token); |
d6299163 MD |
473 | p2->p_uidpcpu = kmalloc(sizeof(*p2->p_uidpcpu) * ncpus, |
474 | M_SUBPROC, M_WAITOK | M_ZERO); | |
ef09c3ed | 475 | 
984263bc | 476 | /* |
de7ac1d6 MD |
477 | * Setup linkage for kernel based threading XXX lwp. Also add the |
478 | * process to the allproclist. | |
479 | * | |
480 | * The process structure is addressable after this point. | |
984263bc | 481 | */ |
de7ac1d6 MD |
482 | if (flags & RFTHREAD) { |
483 | p2->p_peers = p1->p_peers; | |
484 | p1->p_peers = p2; | |
485 | p2->p_leader = p1->p_leader; | |
486 | } else { | |
487 | p2->p_leader = p2; | |
488 | } | |
51e64ff2 | 489 | proc_add_allproc(p2); |
984263bc MD |
490 | |
491 | /* | |
de7ac1d6 | 492 | * Initialize the section which is copied verbatim from the parent. |
984263bc | 493 | */ |
984263bc | 494 | bcopy(&p1->p_startcopy, &p2->p_startcopy, |
de7ac1d6 | 495 | ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy)); |
984263bc | 496 | 
984263bc | 497 | /* |
88072e3b MD |
498 | * Duplicate sub-structures as needed. Increase reference counts |
499 | * on shared objects. | |
500 | * | |
501 | * NOTE: because we are now on the allproc list it is possible for | |
502 | * other consumers to gain temporary references to p2 | |
503 | * (p2->p_lock can change). | |
984263bc | 504 | */ |
4643740a | 505 | if (p1->p_flags & P_PROFIL) |
984263bc | 506 | startprofclock(p2); |
9910d07b | 507 | p2->p_ucred = crhold(lp1->lwp_thread->td_ucred); |
984263bc | 508 | 
b40e316c | 509 | if (jailed(p2->p_ucred)) |
4643740a | 510 | p2->p_flags |= P_JAILED; |
984263bc MD |
511 | |
512 | if (p2->p_args) | |
19bfc8ab | 513 | refcount_acquire(&p2->p_args->ar_ref); |
984263bc | 514 | 
13d13d89 | 515 | p2->p_usched = p1->p_usched; |
8c72e3d5 | 516 | /* XXX: verify copy of the secondary iosched stuff */ |
3573cf7b | 517 | dsched_enter_proc(p2); |
13d13d89 | 518 | 
984263bc | 519 | if (flags & RFSIGSHARE) { |
b1b4e5a6 | 520 | p2->p_sigacts = p1->p_sigacts; |
6fa9e71a | 521 | refcount_acquire(&p2->p_sigacts->ps_refcnt); |
984263bc | 522 | } else { |
6fa9e71a MD |
523 | p2->p_sigacts = kmalloc(sizeof(*p2->p_sigacts), |
524 | M_SUBPROC, M_WAITOK); | |
b1b4e5a6 | 525 | bcopy(p1->p_sigacts, p2->p_sigacts, sizeof(*p2->p_sigacts)); |
6fa9e71a | 526 | refcount_init(&p2->p_sigacts->ps_refcnt, 1); |
984263bc MD |
527 | } |
528 | if (flags & RFLINUXTHPN) | |
529 | p2->p_sigparent = SIGUSR1; | |
530 | else | |
531 | p2->p_sigparent = SIGCHLD; | |
532 | ||
533 | /* bump references to the text vnode (for procfs) */ | |
534 | p2->p_textvp = p1->p_textvp; | |
535 | if (p2->p_textvp) | |
597aea93 | 536 | vref(p2->p_textvp); |
984263bc | 537 | 
8ba5f7ef AH |
538 | /* copy namecache handle to the text file */ |
539 | if (p1->p_textnch.mount) | |
540 | cache_copy(&p1->p_textnch, &p2->p_textnch); | |
541 | ||
0daa37a5 MD |
542 | /* |
543 | * Handle file descriptors | |
544 | */ | |
984263bc MD |
545 | if (flags & RFCFDG) { |
546 | p2->p_fd = fdinit(p1); | |
547 | fdtol = NULL; | |
548 | } else if (flags & RFFDG) { | |
2994659f VS |
/*
 * NOTE(review): on this failure p2 has already been added to the
 * allproc list and nprocs / chgproccnt have already been bumped,
 * yet the "done" path below only drops tokens and the pgrp lock.
 * Confirm that the partially built child is cleaned up elsewhere.
 */
549 | error = fdcopy(p1, &p2->p_fd); |
550 | if (error != 0) { | |
551 | error = ENOMEM; | |
552 | goto done; | |
553 | } | |
984263bc MD |
554 | fdtol = NULL; |
555 | } else { | |
556 | p2->p_fd = fdshare(p1); | |
b5c4d81f | 557 | if (p1->p_fdtol == NULL) { |
de7ac1d6 MD |
558 | p1->p_fdtol = filedesc_to_leader_alloc(NULL, |
559 | p1->p_leader); | |
b5c4d81f | 560 | } |
984263bc MD |
561 | if ((flags & RFTHREAD) != 0) { |
562 | /* | |
563 | * Shared file descriptor table and | |
564 | * shared process leaders. | |
565 | */ | |
566 | fdtol = p1->p_fdtol; | |
567 | fdtol->fdl_refcount++; | |
568 | } else { | |
569 | /* | |
570 | * Shared file descriptor table, and | |
571 | * different process leaders | |
572 | */ | |
98a7f915 | 573 | fdtol = filedesc_to_leader_alloc(p1->p_fdtol, p2); |
984263bc MD |
574 | } |
575 | } | |
576 | p2->p_fdtol = fdtol; | |
8f1f6170 | 577 | p2->p_limit = plimit_fork(p1); |
984263bc | 578 | 
e7e1189f MD |
579 | /* |
580 | * Adjust depth for resource downscaling | |
581 | */ | |
582 | if ((p2->p_depth & 31) != 31) | |
583 | ++p2->p_depth; | |
584 | ||
984263bc MD |
585 | /* |
586 | * Preserve some more flags in subprocess. P_PROFIL has already | |
587 | * been preserved. | |
588 | */ | |
4643740a | 589 | p2->p_flags |= p1->p_flags & P_SUGID; |
de7ac1d6 | 590 | if (p1->p_session->s_ttyvp != NULL && (p1->p_flags & P_CONTROLT)) |
4643740a | 591 | p2->p_flags |= P_CONTROLT; |
87116512 | 592 | if (flags & RFPPWAIT) { |
4643740a | 593 | p2->p_flags |= P_PPWAIT; |
87116512 | 594 | if (p1->p_upmap) |
2eca01a4 | 595 | atomic_add_int(&p1->p_upmap->invfork, 1); |
87116512 MD |
596 | } |
597 | ||
0daa37a5 MD |
598 | /* |
599 | * Inherit the virtual kernel structure (allows a virtual kernel | |
600 | * to fork to simulate multiple cpus). | |
601 | */ | |
4a22e893 MD |
602 | if (p1->p_vkernel) |
603 | vkernel_inherit(p1, p2); | |
0daa37a5 | 604 | 
5fd012e0 MD |
605 | /* |
606 | * Once we are on a pglist we may receive signals. XXX we might | |
607 | * race a ^C being sent to the process group by not receiving it | |
608 | * at all prior to this line. | |
609 | */ | |
58c2553a MD |
610 | pgref(p1grp); |
611 | lwkt_gettoken(&p1grp->pg_token); | |
984263bc | 612 | LIST_INSERT_AFTER(p1, p2, p_pglist); |
58c2553a | 613 | lwkt_reltoken(&p1grp->pg_token); |
984263bc MD |
614 | |
615 | /* | |
616 | * Attach the new process to its parent. | |
617 | * | |
618 | * If RFNOWAIT is set, the newly created process becomes a child | |
f0d55ae9 MD |
619 | * of the reaper (typically init). This effectively disassociates |
620 | * the child from the parent. | |
621 | * | |
622 | * Temporarily hold pptr for the RFNOWAIT case to avoid ripouts. | |
984263bc | 623 | */ |
f0d55ae9 | 624 | if (flags & RFNOWAIT) { |
2cb91021 MD |
625 | pptr = reaper_get(reap); |
626 | if (pptr == NULL) { | |
f0d55ae9 MD |
627 | pptr = initproc; |
628 | PHOLD(pptr); | |
629 | } | |
630 | } else { | |
984263bc | 631 | pptr = p1; |
f0d55ae9 | 632 | } |
984263bc | 633 | p2->p_pptr = pptr; |
39b9b6cd | 634 | p2->p_ppid = pptr->p_pid; |
984263bc | 635 | LIST_INIT(&p2->p_children); |
b5c4d81f MD |
636 | |
637 | lwkt_gettoken(&pptr->p_token); | |
638 | LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); | |
639 | lwkt_reltoken(&pptr->p_token); | |
640 | ||
f0d55ae9 MD |
641 | if (flags & RFNOWAIT) |
642 | PRELE(pptr); | |
643 | ||
98a7f915 | 644 | varsymset_init(&p2->p_varsymset, &p1->p_varsymset); |
8c2bce60 | 645 | callout_init_mp(&p2->p_ithandle); |
984263bc MD |
646 | |
647 | #ifdef KTRACE | |
648 | /* | |
649 | * Copy traceflag and tracefile if enabled. If not inherited, | |
650 | * these were zeroed above but we still could have a trace race | |
29f58392 | 651 | * so make sure p2's p_tracenode is NULL. |
984263bc | 652 | */ |
29f58392 | 653 | if ((p1->p_traceflag & KTRFAC_INHERIT) && p2->p_tracenode == NULL) { |
984263bc | 654 | p2->p_traceflag = p1->p_traceflag; |
29f58392 | 655 | p2->p_tracenode = ktrinherit(p1->p_tracenode); |
984263bc MD |
656 | } |
657 | #endif | |
658 | ||
984263bc MD |
659 | /* |
660 | * This begins the section where we must prevent the parent | |
c52f5180 MD |
661 | * from being messed with too heavily while we run through the |
662 | * fork operation. | |
13d13d89 SS |
663 | * |
664 | * Gets PRELE'd in the caller in start_forked_proc(). | |
c52f5180 MD |
665 | * |
666 | * Create the first lwp associated with the new proc. It will | |
667 | * return via a different execution path later, directly into | |
668 | * userland, after it was put on the runq by start_forked_proc(). | |
984263bc MD |
669 | */ |
670 | PHOLD(p1); | |
671 | ||
8e99ae46 | 672 | lp2 = lwp_fork1(lp1, p2, flags, NULL); |
4aa6d05c | 673 | vm_fork(p1, p2, lp2, flags); |
afd7f124 MD |
674 | if ((flags & RFMEM) == 0) |
675 | wake_umtx_threads(p1); | |
8e99ae46 | 676 | lwp_fork2(lp1, p2, lp2, flags); |
984263bc | 677 | 
6b72a0c2 | 678 | if (flags == (RFFDG | RFPROC | RFPGLOCK)) { |
12e4aaff | 679 | mycpu->gd_cnt.v_forks++; |
4b566556 MD |
680 | mycpu->gd_cnt.v_forkpages += btoc(p2->p_vmspace->vm_dsize) + |
681 | btoc(p2->p_vmspace->vm_ssize); | |
6b72a0c2 | 682 | } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM | RFPGLOCK)) { |
12e4aaff | 683 | mycpu->gd_cnt.v_vforks++; |
4b566556 MD |
684 | mycpu->gd_cnt.v_vforkpages += btoc(p2->p_vmspace->vm_dsize) + |
685 | btoc(p2->p_vmspace->vm_ssize); | |
984263bc | 686 | } else if (p1 == &proc0) { |
12e4aaff | 687 | mycpu->gd_cnt.v_kthreads++; |
4b566556 MD |
688 | mycpu->gd_cnt.v_kthreadpages += btoc(p2->p_vmspace->vm_dsize) + |
689 | btoc(p2->p_vmspace->vm_ssize); | |
984263bc | 690 | } else { |
12e4aaff | 691 | mycpu->gd_cnt.v_rforks++; |
4b566556 MD |
692 | mycpu->gd_cnt.v_rforkpages += btoc(p2->p_vmspace->vm_dsize) + |
693 | btoc(p2->p_vmspace->vm_ssize); | |
984263bc MD |
694 | } |
695 | ||
696 | /* | |
697 | * Both processes are set up, now check if any loadable modules want | |
698 | * to adjust anything. | |
699 | * What if they have an error? XXX | |
700 | */ | |
701 | TAILQ_FOREACH(ep, &fork_list, next) { | |
702 | (*ep->function)(p1, p2, flags); | |
703 | } | |
704 | ||
705 | /* | |
a77ac49d MD |
706 | * Set the start time. Note that the process is not runnable. The |
707 | * caller is responsible for making it runnable. | |
984263bc | 708 | */ |
d9fa5f67 | 709 | microtime(&p2->p_start); |
984263bc | 710 | p2->p_acflag = AFORK; |
984263bc MD |
711 | |
712 | /* | |
713 | * tell any interested parties about the new process | |
714 | */ | |
715 | KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); | |
716 | ||
984263bc MD |
717 | /* |
718 | * Return child proc pointer to parent. | |
719 | */ | |
720 | *procp = p2; | |
b5c4d81f | 721 | error = 0; |
167e6ecb | 722 | done: |
de7ac1d6 MD |
723 | if (p2) |
724 | lwkt_reltoken(&p2->p_token); | |
b5c4d81f | 725 | lwkt_reltoken(&p1->p_token); |
58c2553a MD |
726 | if (plkgrp) { |
727 | lockmgr(&plkgrp->pg_lock, LK_RELEASE); | |
728 | pgrel(plkgrp); | |
729 | } | |
167e6ecb | 730 | return (error); |
984263bc MD |
731 | } |
732 | ||
c52f5180 MD |
733 | /* |
734 | * The first part of lwp_fork*() allocates enough of the new lwp that | |
735 | * vm_fork() can use it to deal with /dev/lpmap mappings. | |
736 | */ | |
13d13d89 | 737 | static struct lwp * |
8e99ae46 | 738 | lwp_fork1(struct lwp *lp1, struct proc *destproc, int flags, |
e3c330f0 | 739 | const cpumask_t *mask) |
13d13d89 | 740 | { |
8e99ae46 MD |
741 | struct lwp *lp2; |
742 | ||
743 | lp2 = kmalloc(sizeof(struct lwp), M_LWP, M_WAITOK|M_ZERO); | |
744 | lp2->lwp_proc = destproc; | |
745 | lp2->lwp_stat = LSRUN; | |
746 | bcopy(&lp1->lwp_startcopy, &lp2->lwp_startcopy, | |
747 | (unsigned) ((caddr_t)&lp2->lwp_endcopy - | |
748 | (caddr_t)&lp2->lwp_startcopy)); | |
749 | if (mask != NULL) | |
750 | lp2->lwp_cpumask = *mask; | |
13d13d89 | 751 | |
8e99ae46 MD |
752 | lwkt_token_init(&lp2->lwp_token, "lwp_token"); |
753 | TAILQ_INIT(&lp2->lwp_lpmap_backing_list); | |
754 | spin_init(&lp2->lwp_spin, "lwptoken"); | |
3e291793 | 755 | |
8e99ae46 MD |
756 | /* |
757 | * Use the same TID for the first thread in the new process after | |
758 | * a fork or vfork. This is needed to keep pthreads and /dev/lpmap | |
759 | * sane. In particular a consequence of implementing the per-thread | |
760 | * /dev/lpmap map code makes this mandatory. | |
761 | * | |
762 | * NOTE: exec*() will reset the TID to 1 to keep things sane in that | |
763 | * department too. | |
c52f5180 MD |
764 | * |
765 | * NOTE: In the case of lwp_create(), this TID represents a conflict | |
766 | * which will be resolved in lwp_fork2(), but in the case of | |
767 | * a fork(), the TID has to be correct or vm_fork() will not | |
768 | * keep the correct lpmap. | |
8e99ae46 | 769 | */ |
c52f5180 | 770 | lp2->lwp_tid = lp1->lwp_tid; |
8e99ae46 MD |
771 | |
772 | return lp2; | |
773 | } | |
774 | ||
c52f5180 MD |
775 | /* |
776 | * The second part of lwp_fork*() | |
777 | */ | |
8e99ae46 MD |
778 | static void |
779 | lwp_fork2(struct lwp *lp1, struct proc *destproc, struct lwp *lp2, int flags) | |
780 | { | |
781 | globaldata_t gd = mycpu; | |
782 | struct thread *td2; | |
783 | ||
784 | lp2->lwp_vmspace = destproc->p_vmspace; | |
ea5bffb9 MD |
785 | |
786 | /* | |
787 | * Reset the sigaltstack if memory is shared, otherwise inherit | |
788 | * it. | |
789 | */ | |
790 | if (flags & RFMEM) { | |
8e99ae46 MD |
791 | lp2->lwp_sigstk.ss_flags = SS_DISABLE; |
792 | lp2->lwp_sigstk.ss_size = 0; | |
793 | lp2->lwp_sigstk.ss_sp = NULL; | |
794 | lp2->lwp_flags &= ~LWP_ALTSTACK; | |
ea5bffb9 | 795 | } else { |
8e99ae46 | 796 | lp2->lwp_flags |= lp1->lwp_flags & LWP_ALTSTACK; |
ea5bffb9 MD |
797 | } |
798 | ||
13d13d89 SS |
799 | /* |
800 | * Set cpbase to the last timeout that occurred (not the upcoming | |
801 | * timeout). | |
802 | * | |
803 | * A critical section is required since a timer IPI can update | |
804 | * scheduler specific data. | |
805 | */ | |
806 | crit_enter(); | |
8e99ae46 MD |
807 | lp2->lwp_cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic; |
808 | destproc->p_usched->heuristic_forking(lp1, lp2); | |
13d13d89 | 809 | crit_exit(); |
8e99ae46 | 810 | CPUMASK_ANDMASK(lp2->lwp_cpumask, usched_mastermask); |
13d13d89 | 811 | |
d2d8515b MD |
812 | /* |
813 | * Assign the thread to the current cpu to begin with so we | |
814 | * can manipulate it. | |
815 | */ | |
8e99ae46 MD |
816 | td2 = lwkt_alloc_thread(NULL, LWKT_THREAD_STACK, gd->gd_cpuid, 0); |
817 | lp2->lwp_thread = td2; | |
818 | td2->td_wakefromcpu = gd->gd_cpuid; | |
819 | td2->td_ucred = crhold(destproc->p_ucred); | |
820 | td2->td_proc = destproc; | |
821 | td2->td_lwp = lp2; | |
822 | td2->td_switch = cpu_heavy_switch; | |
e3e6be1f | 823 | #ifdef NO_LWKT_SPLIT_USERPRI |
8e99ae46 | 824 | lwkt_setpri(td2, TDPRI_USER_NORM); |
e3e6be1f | 825 | #else |
8e99ae46 | 826 | lwkt_setpri(td2, TDPRI_KERN_USER); |
d992c377 | 827 | #endif |
8e99ae46 | 828 | lwkt_set_comm(td2, "%s", destproc->p_comm); |
13d13d89 SS |
829 | |
830 | /* | |
831 | * cpu_fork will copy and update the pcb, set up the kernel stack, | |
832 | * and make the child ready to run. | |
833 | */ | |
8e99ae46 MD |
834 | cpu_fork(lp1, lp2, flags); |
835 | kqueue_init(&lp2->lwp_kqueue, destproc->p_fd); | |
0d78b86e | 836 | |
c52f5180 MD |
837 | /* |
838 | * Associate the new thread with destproc, after we've set most of | |
839 | * it up and gotten its related td2 installed. Otherwise we can | |
840 | * race other random kernel code that iterates LWPs and expects the | |
841 | * thread to be assigned. | |
842 | * | |
843 | * Leave 2 bits open so the pthreads library can optimize locks | |
844 | * by combining the TID with a few Lock-related flags. | |
845 | */ | |
846 | while (lwp_rb_tree_RB_INSERT(&destproc->p_lwp_tree, lp2) != NULL) { | |
847 | ++lp2->lwp_tid; | |
848 | if (lp2->lwp_tid == 0 || lp2->lwp_tid == 0x3FFFFFFF) | |
849 | lp2->lwp_tid = 1; | |
850 | } | |
851 | ||
852 | destproc->p_lasttid = lp2->lwp_tid; | |
853 | destproc->p_nthreads++; | |
854 | ||
51818c08 MD |
855 | /* |
856 | * This flag is set and never cleared. It means that the process | |
857 | * was threaded at some point. Used to improve exit performance. | |
858 | */ | |
e3c330f0 | 859 | pmap_maybethreaded(&destproc->p_vmspace->vm_pmap); |
51818c08 MD |
860 | destproc->p_flags |= P_MAYBETHREADED; |
861 | ||
64b5a8a5 MD |
862 | /* |
863 | * If the original lp had a lpmap and a non-zero blockallsigs | |
864 | * count, give the lp for the forked process the same count. | |
865 | * | |
866 | * This makes the user code and expectations less confusing | |
867 | * in terms of unwinding locks and also allows userland to start | |
868 | * the forked process with signals blocked via the blockallsigs() | |
869 | * mechanism if desired. | |
64b5a8a5 | 870 | */ |
8e99ae46 MD |
871 | if (lp1->lwp_lpmap && |
872 | (lp1->lwp_lpmap->blockallsigs & 0x7FFFFFFF)) { | |
873 | lwp_usermap(lp2, 0); | |
874 | if (lp2->lwp_lpmap) { | |
875 | lp2->lwp_lpmap->blockallsigs = | |
876 | lp1->lwp_lpmap->blockallsigs; | |
64b5a8a5 MD |
877 | } |
878 | } | |
13d13d89 SS |
879 | } |
880 | ||
984263bc MD |
881 | /* |
882 | * The next two functionms are general routines to handle adding/deleting | |
883 | * items on the fork callout list. | |
884 | * | |
885 | * at_fork(): | |
886 | * Take the arguments given and put them onto the fork callout list, | |
887 | * However first make sure that it's not already there. | |
888 | * Returns 0 on success or a standard error number. | |
889 | */ | |
984263bc | 890 | int |
303c76d5 | 891 | at_fork(forklist_fn function) |
984263bc MD |
892 | { |
893 | struct forklist *ep; | |
894 | ||
895 | #ifdef INVARIANTS | |
896 | /* let the programmer know if he's been stupid */ | |
303c76d5 | 897 | if (rm_at_fork(function)) { |
6ea70f76 | 898 | kprintf("WARNING: fork callout entry (%p) already present\n", |
984263bc | 899 | function); |
303c76d5 | 900 | } |
984263bc | 901 | #endif |
efda3bd0 | 902 | ep = kmalloc(sizeof(*ep), M_ATFORK, M_WAITOK|M_ZERO); |
984263bc MD |
903 | ep->function = function; |
904 | TAILQ_INSERT_TAIL(&fork_list, ep, next); | |
905 | return (0); | |
906 | } | |
907 | ||
908 | /* | |
909 | * Scan the exit callout list for the given item and remove it.. | |
910 | * Returns the number of items removed (0 or 1) | |
911 | */ | |
984263bc | 912 | int |
303c76d5 | 913 | rm_at_fork(forklist_fn function) |
984263bc MD |
914 | { |
915 | struct forklist *ep; | |
916 | ||
917 | TAILQ_FOREACH(ep, &fork_list, next) { | |
918 | if (ep->function == function) { | |
919 | TAILQ_REMOVE(&fork_list, ep, next); | |
efda3bd0 | 920 | kfree(ep, M_ATFORK); |
984263bc MD |
921 | return(1); |
922 | } | |
923 | } | |
924 | return (0); | |
925 | } | |
7d0bac62 MD |
926 | |
927 | /* | |
928 | * Add a forked process to the run queue after any remaining setup, such | |
929 | * as setting the fork handler, has been completed. | |
de7ac1d6 MD |
930 | * |
931 | * p2 is held by the caller. | |
7d0bac62 | 932 | */ |
7d0bac62 | 933 | void |
553ea3c8 | 934 | start_forked_proc(struct lwp *lp1, struct proc *p2) |
7d0bac62 | 935 | { |
08f2f1bb | 936 | struct lwp *lp2 = ONLY_LWP_IN_PROC(p2); |
51818c08 | 937 | int pflags; |
553ea3c8 | 938 | |
7d0bac62 | 939 | /* |
26a0694b MD |
940 | * Move from SIDL to RUN queue, and activate the process's thread. |
941 | * Activation of the thread effectively makes the process "a" | |
942 | * current process, so we do not setrunqueue(). | |
8ec60c3f MD |
943 | * |
944 | * YYY setrunqueue works here but we should clean up the trampoline | |
945 | * code so we just schedule the LWKT thread and let the trampoline | |
946 | * deal with the userland scheduler on return to userland. | |
7d0bac62 | 947 | */ |
553ea3c8 | 948 | KASSERT(p2->p_stat == SIDL, |
7d0bac62 | 949 | ("cannot start forked process, bad status: %p", p2)); |
553ea3c8 | 950 | p2->p_usched->resetpriority(lp2); |
e43a034f | 951 | crit_enter(); |
164b8401 SS |
952 | p2->p_stat = SACTIVE; |
953 | lp2->lwp_stat = LSRUN; | |
553ea3c8 | 954 | p2->p_usched->setrunqueue(lp2); |
e43a034f | 955 | crit_exit(); |
7d0bac62 MD |
956 | |
957 | /* | |
958 | * Now can be swapped. | |
959 | */ | |
553ea3c8 | 960 | PRELE(lp1->lwp_proc); |
7d0bac62 MD |
961 | |
962 | /* | |
51818c08 MD |
963 | * Preserve synchronization semantics of vfork. P_PPWAIT is set in |
964 | * the child until it has retired the parent's resources. The parent | |
965 | * must wait for the flag to be cleared by the child. | |
966 | * | |
967 | * Interlock the flag/tsleep with atomic ops to avoid unnecessary | |
968 | * p_token conflicts. | |
ee934fe9 | 969 | * |
51818c08 MD |
970 | * XXX Is this use of an atomic op on a field that is not normally |
971 | * manipulated with atomic ops ok? | |
7d0bac62 | 972 | */ |
51818c08 MD |
973 | while ((pflags = p2->p_flags) & P_PPWAIT) { |
974 | cpu_ccfence(); | |
975 | tsleep_interlock(lp1->lwp_proc, 0); | |
976 | if (atomic_cmpset_int(&p2->p_flags, pflags, pflags)) | |
977 | tsleep(lp1->lwp_proc, PINTERLOCKED, "ppwait", 0); | |
2734d278 | 978 | } |
7d0bac62 | 979 | } |
6e2a912c MD |
980 | |
981 | /* | |
fc3bc286 | 982 | * procctl (idtype_t idtype, id_t id, int cmd, void *arg) |
6e2a912c MD |
983 | */ |
984 | int | |
80d831e1 | 985 | sys_procctl(struct sysmsg *sysmsg, const struct procctl_args *uap) |
6e2a912c MD |
986 | { |
987 | struct proc *p = curproc; | |
988 | struct proc *p2; | |
989 | struct sysreaper *reap; | |
990 | union reaper_info udata; | |
991 | int error; | |
992 | ||
acdf1ee6 MD |
993 | if (uap->idtype != P_PID) |
994 | return EINVAL; | |
995 | if (uap->id != 0 && uap->id != (id_t)p->p_pid) | |
fc3bc286 MD |
996 | return EINVAL; |
997 | ||
998 | switch(uap->cmd) { | |
999 | case PROC_REAP_ACQUIRE: | |
6e2a912c MD |
1000 | lwkt_gettoken(&p->p_token); |
1001 | reap = kmalloc(sizeof(*reap), M_REAPER, M_WAITOK|M_ZERO); | |
1002 | if (p->p_reaper == NULL || p->p_reaper->p != p) { | |
1003 | reaper_init(p, reap); | |
1004 | error = 0; | |
1005 | } else { | |
1006 | kfree(reap, M_REAPER); | |
1007 | error = EALREADY; | |
1008 | } | |
1009 | lwkt_reltoken(&p->p_token); | |
1010 | break; | |
fc3bc286 | 1011 | case PROC_REAP_RELEASE: |
6e2a912c MD |
1012 | lwkt_gettoken(&p->p_token); |
1013 | release_again: | |
1014 | reap = p->p_reaper; | |
1015 | KKASSERT(reap != NULL); | |
1016 | if (reap->p == p) { | |
1017 | reaper_hold(reap); /* in case of thread race */ | |
1018 | lockmgr(&reap->lock, LK_EXCLUSIVE); | |
1019 | if (reap->p != p) { | |
1020 | lockmgr(&reap->lock, LK_RELEASE); | |
1021 | reaper_drop(reap); | |
1022 | goto release_again; | |
1023 | } | |
1024 | reap->p = NULL; | |
1025 | p->p_reaper = reap->parent; | |
1026 | if (p->p_reaper) | |
1027 | reaper_hold(p->p_reaper); | |
1028 | lockmgr(&reap->lock, LK_RELEASE); | |
1029 | reaper_drop(reap); /* our ref */ | |
1030 | reaper_drop(reap); /* old p_reaper ref */ | |
1031 | error = 0; | |
1032 | } else { | |
1033 | error = ENOTCONN; | |
1034 | } | |
1035 | lwkt_reltoken(&p->p_token); | |
1036 | break; | |
fc3bc286 | 1037 | case PROC_REAP_STATUS: |
6e2a912c MD |
1038 | bzero(&udata, sizeof(udata)); |
1039 | lwkt_gettoken_shared(&p->p_token); | |
1040 | if ((reap = p->p_reaper) != NULL && reap->p == p) { | |
1041 | udata.status.flags = reap->flags; | |
f614cb6d | 1042 | udata.status.refs = reap->refs - 1; /* minus ours */ |
6e2a912c | 1043 | } |
f614cb6d MD |
1044 | p2 = LIST_FIRST(&p->p_children); |
1045 | udata.status.pid_head = p2 ? p2->p_pid : -1; | |
6e2a912c | 1046 | lwkt_reltoken(&p->p_token); |
f614cb6d | 1047 | |
6e2a912c MD |
1048 | if (uap->data) { |
1049 | error = copyout(&udata, uap->data, | |
1050 | sizeof(udata.status)); | |
1051 | } else { | |
1052 | error = 0; | |
1053 | } | |
1054 | break; | |
acdf1ee6 MD |
1055 | case PROC_PDEATHSIG_CTL: |
1056 | error = EINVAL; | |
1057 | if (uap->data) { | |
1058 | int dsig = 0; | |
1059 | ||
1060 | error = copyin(uap->data, &dsig, sizeof(dsig)); | |
1061 | if (error == 0 && dsig >= 0 && dsig <= _SIG_MAXSIG) | |
1062 | p->p_deathsig = dsig; | |
1063 | } | |
1064 | break; | |
1065 | case PROC_PDEATHSIG_STATUS: | |
1066 | error = EINVAL; | |
1067 | if (uap->data) { | |
1068 | error = copyout(&p->p_deathsig, uap->data, | |
1069 | sizeof(p->p_deathsig)); | |
1070 | } | |
1071 | break; | |
6e2a912c MD |
1072 | default: |
1073 | error = EINVAL; | |
1074 | break; | |
1075 | } | |
1076 | return error; | |
1077 | } | |
1078 | ||
1079 | /* | |
1080 | * Bump ref on reaper, preventing destruction | |
1081 | */ | |
1082 | void | |
1083 | reaper_hold(struct sysreaper *reap) | |
1084 | { | |
1085 | KKASSERT(reap->refs > 0); | |
1086 | refcount_acquire(&reap->refs); | |
1087 | } | |
1088 | ||
1089 | /* | |
1090 | * Drop ref on reaper, destroy the structure on the 1->0 | |
1091 | * transition and loop on the parent. | |
1092 | */ | |
1093 | void | |
1094 | reaper_drop(struct sysreaper *next) | |
1095 | { | |
1096 | struct sysreaper *reap; | |
1097 | ||
1098 | while ((reap = next) != NULL) { | |
1099 | if (refcount_release(&reap->refs)) { | |
1100 | next = reap->parent; | |
1101 | KKASSERT(reap->p == NULL); | |
a73d7792 | 1102 | lockmgr(&reaper_lock, LK_EXCLUSIVE); |
6e2a912c MD |
1103 | reap->parent = NULL; |
1104 | kfree(reap, M_REAPER); | |
a73d7792 | 1105 | lockmgr(&reaper_lock, LK_RELEASE); |
6e2a912c MD |
1106 | } else { |
1107 | next = NULL; | |
1108 | } | |
1109 | } | |
1110 | } | |
1111 | ||
1112 | /* | |
1113 | * Initialize a static or newly allocated reaper structure | |
1114 | */ | |
1115 | void | |
1116 | reaper_init(struct proc *p, struct sysreaper *reap) | |
1117 | { | |
1118 | reap->parent = p->p_reaper; | |
1119 | reap->p = p; | |
1120 | if (p == initproc) { | |
1121 | reap->flags = REAPER_STAT_OWNED | REAPER_STAT_REALINIT; | |
1122 | reap->refs = 2; | |
1123 | } else { | |
1124 | reap->flags = REAPER_STAT_OWNED; | |
1125 | reap->refs = 1; | |
1126 | } | |
1127 | lockinit(&reap->lock, "subrp", 0, 0); | |
1128 | cpu_sfence(); | |
1129 | p->p_reaper = reap; | |
1130 | } | |
1131 | ||
1132 | /* | |
1133 | * Called with p->p_token held during exit. | |
1134 | * | |
1135 | * This is a bit simpler than RELEASE because there are no threads remaining | |
1136 | * to race. We only release if we own the reaper, the exit code will handle | |
1137 | * the final p_reaper release. | |
1138 | */ | |
1139 | struct sysreaper * | |
1140 | reaper_exit(struct proc *p) | |
1141 | { | |
1142 | struct sysreaper *reap; | |
1143 | ||
1144 | /* | |
1145 | * Release acquired reaper | |
1146 | */ | |
1147 | if ((reap = p->p_reaper) != NULL && reap->p == p) { | |
1148 | lockmgr(&reap->lock, LK_EXCLUSIVE); | |
1149 | p->p_reaper = reap->parent; | |
1150 | if (p->p_reaper) | |
1151 | reaper_hold(p->p_reaper); | |
1152 | reap->p = NULL; | |
1153 | lockmgr(&reap->lock, LK_RELEASE); | |
1154 | reaper_drop(reap); | |
1155 | } | |
1156 | ||
1157 | /* | |
1158 | * Return and clear reaper (caller is holding p_token for us) | |
1159 | * (reap->p does not equal p). Caller must drop it. | |
1160 | */ | |
1161 | if ((reap = p->p_reaper) != NULL) { | |
1162 | p->p_reaper = NULL; | |
1163 | } | |
1164 | return reap; | |
1165 | } | |
1166 | ||
1167 | /* | |
1168 | * Return a held (PHOLD) process representing the reaper for process (p). | |
1169 | * NULL should not normally be returned. Caller should PRELE() the returned | |
1170 | * reaper process when finished. | |
1171 | * | |
1172 | * Remove dead internal nodes while we are at it. | |
1173 | * | |
1174 | * Process (p)'s token must be held on call. | |
1175 | * The returned process's token is NOT acquired by this routine. | |
1176 | */ | |
1177 | struct proc * | |
1178 | reaper_get(struct sysreaper *reap) | |
1179 | { | |
1180 | struct sysreaper *next; | |
1181 | struct proc *reproc; | |
1182 | ||
1183 | if (reap == NULL) | |
1184 | return NULL; | |
1185 | ||
1186 | /* | |
1187 | * Extra hold for loop | |
1188 | */ | |
1189 | reaper_hold(reap); | |
1190 | ||
1191 | while (reap) { | |
1192 | lockmgr(&reap->lock, LK_SHARED); | |
1193 | if (reap->p) { | |
1194 | /* | |
1195 | * Probable reaper | |
1196 | */ | |
1197 | if (reap->p) { | |
1198 | reproc = reap->p; | |
1199 | PHOLD(reproc); | |
1200 | lockmgr(&reap->lock, LK_RELEASE); | |
1201 | reaper_drop(reap); | |
1202 | return reproc; | |
1203 | } | |
1204 | ||
1205 | /* | |
1206 | * Raced, try again | |
1207 | */ | |
1208 | lockmgr(&reap->lock, LK_RELEASE); | |
1209 | continue; | |
1210 | } | |
1211 | ||
1212 | /* | |
1213 | * Traverse upwards in the reaper topology, destroy | |
1214 | * dead internal nodes when possible. | |
1215 | * | |
1216 | * NOTE: Our ref on next means that a dead node should | |
1217 | * have 2 (ours and reap->parent's). | |
1218 | */ | |
1219 | next = reap->parent; | |
1220 | while (next) { | |
1221 | reaper_hold(next); | |
1222 | if (next->refs == 2 && next->p == NULL) { | |
1223 | lockmgr(&reap->lock, LK_RELEASE); | |
1224 | lockmgr(&reap->lock, LK_EXCLUSIVE); | |
1225 | if (next->refs == 2 && | |
1226 | reap->parent == next && | |
1227 | next->p == NULL) { | |
1228 | /* | |
1229 | * reap->parent inherits ref from next. | |
1230 | */ | |
1231 | reap->parent = next->parent; | |
1232 | next->parent = NULL; | |
1233 | reaper_drop(next); /* ours */ | |
1234 | reaper_drop(next); /* old parent */ | |
1235 | next = reap->parent; | |
1236 | continue; /* possible chain */ | |
1237 | } | |
1238 | } | |
1239 | break; | |
1240 | } | |
1241 | lockmgr(&reap->lock, LK_RELEASE); | |
1242 | reaper_drop(reap); | |
1243 | reap = next; | |
1244 | } | |
1245 | return NULL; | |
1246 | } | |
a73d7792 MD |
1247 | |
1248 | /* | |
1249 | * Test that the sender is allowed to send a signal to the target. | |
1250 | * The sender process is assumed to have a stable reaper. The | |
1251 | * target can be e.g. from a scan callback. | |
1252 | * | |
1253 | * Target cannot be the reaper process itself unless reaper_ok is specified, | |
1254 | * or sender == target. | |
1255 | */ | |
1256 | int | |
1257 | reaper_sigtest(struct proc *sender, struct proc *target, int reaper_ok) | |
1258 | { | |
1259 | struct sysreaper *sreap; | |
1260 | struct sysreaper *reap; | |
1261 | int r; | |
1262 | ||
1263 | sreap = sender->p_reaper; | |
1264 | if (sreap == NULL) | |
1265 | return 1; | |
1266 | ||
1267 | if (sreap == target->p_reaper) { | |
1268 | if (sreap->p == target && sreap->p != sender && reaper_ok == 0) | |
1269 | return 0; | |
1270 | return 1; | |
1271 | } | |
1272 | lockmgr(&reaper_lock, LK_SHARED); | |
1273 | r = 0; | |
1274 | for (reap = target->p_reaper; reap; reap = reap->parent) { | |
1275 | if (sreap == reap) { | |
1276 | if (sreap->p != target || reaper_ok) | |
1277 | r = 1; | |
1278 | break; | |
1279 | } | |
1280 | } | |
1281 | lockmgr(&reaper_lock, LK_RELEASE); | |
1282 | ||
1283 | return r; | |
1284 | } |