gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1982, 1986, 1989, 1991, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	* (c) UNIX System Laboratories, Inc.
	5	* All or some portions of this file are derived from material licensed
	6	* to the University of California by American Telephone and Telegraph
	7	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	8	* the permission of UNIX System Laboratories, Inc.
	9	*
	10	* Redistribution and use in source and binary forms, with or without
	11	* modification, are permitted provided that the following conditions
	12	* are met:
	13	* 1. Redistributions of source code must retain the above copyright
	14	* notice, this list of conditions and the following disclaimer.
	15	* 2. Redistributions in binary form must reproduce the above copyright
	16	* notice, this list of conditions and the following disclaimer in the
	17	* documentation and/or other materials provided with the distribution.
	18	* 3. All advertising materials mentioning features or use of this software
	19	* must display the following acknowledgement:
	20	* This product includes software developed by the University of
	21	* California, Berkeley and its contributors.
	22	* 4. Neither the name of the University nor the names of its contributors
	23	* may be used to endorse or promote products derived from this software
	24	* without specific prior written permission.
	25	*
	26	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	27	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	28	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	29	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	30	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	31	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	32	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	33	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	34	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	35	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	36	* SUCH DAMAGE.
	37	*
	38	* @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
	39	* $FreeBSD: src/sys/kern/kern_fork.c,v 1.72.2.13 2003/06/06 20:21:32 tegge Exp $
	40	* $DragonFly: src/sys/kern/kern_fork.c,v 1.14 2003/07/26 18:12:44 dillon Exp $
	41	*/
	42
	43	#include "opt_ktrace.h"
	44
	45	#include <sys/param.h>
	46	#include <sys/systm.h>
	47	#include <sys/sysproto.h>
	48	#include <sys/filedesc.h>
	49	#include <sys/kernel.h>
	50	#include <sys/sysctl.h>
	51	#include <sys/malloc.h>
	52	#include <sys/proc.h>
	53	#include <sys/resourcevar.h>
	54	#include <sys/vnode.h>
	55	#include <sys/acct.h>
	56	#include <sys/ktrace.h>
	57	#include <sys/unistd.h>
	58	#include <sys/jail.h>
	59
	60	#include <vm/vm.h>
	61	#include <sys/lock.h>
	62	#include <vm/pmap.h>
	63	#include <vm/vm_map.h>
	64	#include <vm/vm_extern.h>
	65	#include <vm/vm_zone.h>
	66
	67	#include <sys/vmmeter.h>
	68	#include <sys/user.h>
	69
	70	static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");
	71
	72	/*
	73	* These are the stuctures used to create a callout list for things to do
	74	* when forking a process
	75	*/
	76	struct forklist {
	77	forklist_fn function;
	78	TAILQ_ENTRY(forklist) next;
	79	};
	80
	81	TAILQ_HEAD(forklist_head, forklist);
	82	static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);
	83
	84	int forksleep; /* Place for fork1() to sleep on. */
	85
	86	/* ARGSUSED */
	87	int
	88	fork(struct fork_args *uap)
	89	{
	90	struct proc *p = curproc;
	91	struct proc *p2;
	92	int error;
	93
	94	error = fork1(p, RFFDG \| RFPROC, &p2);
	95	if (error == 0) {
	96	start_forked_proc(p, p2);
	97	uap->lmsg.u.ms_fds[0] = p2->p_pid;
	98	uap->lmsg.u.ms_fds[1] = 0;
	99	}
	100	return error;
	101	}
	102
	103	/* ARGSUSED */
	104	int
	105	vfork(struct vfork_args *uap)
	106	{
	107	struct proc *p = curproc;
	108	struct proc *p2;
	109	int error;
	110
	111	error = fork1(p, RFFDG \| RFPROC \| RFPPWAIT \| RFMEM, &p2);
	112	if (error == 0) {
	113	start_forked_proc(p, p2);
	114	uap->lmsg.u.ms_fds[0] = p2->p_pid;
	115	uap->lmsg.u.ms_fds[1] = 0;
	116	}
	117	return error;
	118	}
	119
	120	int
	121	rfork(struct rfork_args *uap)
	122	{
	123	struct proc *p = curproc;
	124	struct proc *p2;
	125	int error;
	126
	127	error = fork1(p, uap->flags, &p2);
	128	if (error == 0) {
	129	start_forked_proc(p, p2);
	130	uap->lmsg.u.ms_fds[0] = p2 ? p2->p_pid : 0;
	131	uap->lmsg.u.ms_fds[1] = 0;
	132	}
	133	return error;
	134	}
	135
	136
	/* Count of existing processes; starts at one to account for process 0. */
	int nprocs = 1;

	/* Last pid candidate handed out; fork1() advances it on every fork. */
	static int nextpid;

	/*
	 * Modulus for the random increment that fork1() adds to nextpid so that
	 * pids are a little harder to predict.  The value is sanity checked when
	 * it is set rather than in the fork path.  It must not be too small, or
	 * randomness is wasted for no benefit, and must not be impossibly large:
	 * an oversized modulus defeats the pidchecked cache, which causes many
	 * more process table scans and slows down fork processing.  Zero turns
	 * the random component off.
	 */
	static int randompid;
	149
	150	static int
	151	sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
	152	{
	153	int error, pid;
	154
	155	pid = randompid;
	156	error = sysctl_handle_int(oidp, &pid, 0, req);
	157	if (error \|\| !req->newptr)
	158	return (error);
	159	if (pid < 0 \|\| pid > PID_MAX - 100) /* out of range */
	160	pid = PID_MAX - 100;
	161	else if (pid < 2) /* NOP */
	162	pid = 0;
	163	else if (pid < 100) /* Make it reasonable */
	164	pid = 100;
	165	randompid = pid;
	166	return (error);
	167	}
	168
	169	SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT\|CTLFLAG_RW,
	170	0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
	171
	172	int
	173	fork1(p1, flags, procp)
	174	struct proc *p1;
	175	int flags;
	176	struct proc **procp;
	177	{
	178	struct proc p2, pptr;
	179	uid_t uid;
	180	struct proc *newproc;
	181	int ok;
	182	static int pidchecked = 0;
	183	struct forklist *ep;
	184	struct filedesc_to_leader *fdtol;
	185
	186	if ((flags & (RFFDG\|RFCFDG)) == (RFFDG\|RFCFDG))
	187	return (EINVAL);
	188
	189	/*
	190	* Here we don't create a new process, but we divorce
	191	* certain parts of a process from itself.
	192	*/
	193	if ((flags & RFPROC) == 0) {
	194
	195	vm_fork(p1, 0, flags);
	196
	197	/*
	198	* Close all file descriptors.
	199	*/
	200	if (flags & RFCFDG) {
	201	struct filedesc *fdtmp;
	202	fdtmp = fdinit(p1);
	203	fdfree(p1);
	204	p1->p_fd = fdtmp;
	205	}
	206
	207	/*
	208	* Unshare file descriptors (from parent.)
	209	*/
	210	if (flags & RFFDG) {
	211	if (p1->p_fd->fd_refcnt > 1) {
	212	struct filedesc *newfd;
	213	newfd = fdcopy(p1);
	214	fdfree(p1);
	215	p1->p_fd = newfd;
	216	}
	217	}
	218	*procp = NULL;
	219	return (0);
	220	}
	221
	222	/*
	223	* Although process entries are dynamically created, we still keep
	224	* a global limit on the maximum number we will create. Don't allow
	225	* a nonprivileged user to use the last ten processes; don't let root
	226	* exceed the limit. The variable nprocs is the current number of
	227	* processes, maxproc is the limit.
	228	*/
	229	uid = p1->p_ucred->cr_ruid;
	230	if ((nprocs >= maxproc - 10 && uid != 0) \|\| nprocs >= maxproc) {
	231	tsleep(&forksleep, 0, "fork", hz / 2);
	232	return (EAGAIN);
	233	}
	234	/*
	235	* Increment the nprocs resource before blocking can occur. There
	236	* are hard-limits as to the number of processes that can run.
	237	*/
	238	nprocs++;
	239
	240	/*
	241	* Increment the count of procs running with this uid. Don't allow
	242	* a nonprivileged user to exceed their current limit.
	243	*/
	244	ok = chgproccnt(p1->p_ucred->cr_ruidinfo, 1,
	245	(uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
	246	if (!ok) {
	247	/*
	248	* Back out the process count
	249	*/
	250	nprocs--;
	251	tsleep(&forksleep, 0, "fork", hz / 2);
	252	return (EAGAIN);
	253	}
	254
	255	/* Allocate new proc. */
	256	newproc = zalloc(proc_zone);
	257
	258	/*
	259	* Setup linkage for kernel based threading
	260	*/
	261	if((flags & RFTHREAD) != 0) {
	262	newproc->p_peers = p1->p_peers;
	263	p1->p_peers = newproc;
	264	newproc->p_leader = p1->p_leader;
	265	} else {
	266	newproc->p_peers = 0;
	267	newproc->p_leader = newproc;
	268	}
	269
	270	newproc->p_wakeup = 0;
	271	newproc->p_vmspace = NULL;
	272
	273	/*
	274	* Find an unused process ID. We remember a range of unused IDs
	275	* ready to use (from nextpid+1 through pidchecked-1).
	276	*/
	277	nextpid++;
	278	if (randompid)
	279	nextpid += arc4random() % randompid;
	280	retry:
	281	/*
	282	* If the process ID prototype has wrapped around,
	283	* restart somewhat above 0, as the low-numbered procs
	284	* tend to include daemons that don't exit.
	285	*/
	286	if (nextpid >= PID_MAX) {
	287	nextpid = nextpid % PID_MAX;
	288	if (nextpid < 100)
	289	nextpid += 100;
	290	pidchecked = 0;
	291	}
	292	if (nextpid >= pidchecked) {
	293	int doingzomb = 0;
	294
	295	pidchecked = PID_MAX;
	296	/*
	297	* Scan the active and zombie procs to check whether this pid
	298	* is in use. Remember the lowest pid that's greater
	299	* than nextpid, so we can avoid checking for a while.
	300	*/
	301	p2 = LIST_FIRST(&allproc);
	302	again:
	303	for (; p2 != 0; p2 = LIST_NEXT(p2, p_list)) {
	304	while (p2->p_pid == nextpid \|\|
	305	p2->p_pgrp->pg_id == nextpid \|\|
	306	p2->p_session->s_sid == nextpid) {
	307	nextpid++;
	308	if (nextpid >= pidchecked)
	309	goto retry;
	310	}
	311	if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
	312	pidchecked = p2->p_pid;
	313	if (p2->p_pgrp->pg_id > nextpid &&
	314	pidchecked > p2->p_pgrp->pg_id)
	315	pidchecked = p2->p_pgrp->pg_id;
	316	if (p2->p_session->s_sid > nextpid &&
	317	pidchecked > p2->p_session->s_sid)
	318	pidchecked = p2->p_session->s_sid;
	319	}
	320	if (!doingzomb) {
	321	doingzomb = 1;
	322	p2 = LIST_FIRST(&zombproc);
	323	goto again;
	324	}
	325	}
	326
	327	p2 = newproc;
	328	p2->p_stat = SIDL; /* protect against others */
	329	p2->p_pid = nextpid;
	330	LIST_INSERT_HEAD(&allproc, p2, p_list);
	331	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
	332
	333	/*
	334	* Make a proc table entry for the new process.
	335	* Start by zeroing the section of proc that is zero-initialized,
	336	* then copy the section that is copied directly from the parent.
	337	*/
	338	bzero(&p2->p_startzero,
	339	(unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
	340	bcopy(&p1->p_startcopy, &p2->p_startcopy,
	341	(unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
	342
	343	p2->p_aioinfo = NULL;
	344
	345	/*
	346	* Duplicate sub-structures as needed.
	347	* Increase reference counts on shared objects.
	348	* The p_stats and p_sigacts substructs are set in vm_fork.
	349	*
	350	* P_CP_RELEASED indicates that the process is starting out in
	351	* the kernel (in the fork trampoline). The flag will be converted
	352	* to P_CURPROC when the new process calls userret() and attempts
	353	* to return to userland
	354	*/
	355	p2->p_flag = P_INMEM \| P_CP_RELEASED;
	356	if (p1->p_flag & P_PROFIL)
	357	startprofclock(p2);
	358	p2->p_ucred = crhold(p1->p_ucred);
	359
	360	if (p2->p_ucred->cr_prison) {
	361	p2->p_ucred->cr_prison->pr_ref++;
	362	p2->p_flag \|= P_JAILED;
	363	}
	364
	365	if (p2->p_args)
	366	p2->p_args->ar_ref++;
	367
	368	if (flags & RFSIGSHARE) {
	369	p2->p_procsig = p1->p_procsig;
	370	p2->p_procsig->ps_refcnt++;
	371	if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
	372	struct sigacts *newsigacts;
	373	int s;
	374
	375	/* Create the shared sigacts structure */
	376	MALLOC(newsigacts, struct sigacts *,
	377	sizeof(struct sigacts), M_SUBPROC, M_WAITOK);
	378	s = splhigh();
	379	/*
	380	* Set p_sigacts to the new shared structure.
	381	* Note that this is updating p1->p_sigacts at the
	382	* same time, since p_sigacts is just a pointer to
	383	* the shared p_procsig->ps_sigacts.
	384	*/
	385	p2->p_sigacts = newsigacts;
	386	bcopy(&p1->p_addr->u_sigacts, p2->p_sigacts,
	387	sizeof(*p2->p_sigacts));
	388	*p2->p_sigacts = p1->p_addr->u_sigacts;
	389	splx(s);
	390	}
	391	} else {
	392	MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig),
	393	M_SUBPROC, M_WAITOK);
	394	bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig));
	395	p2->p_procsig->ps_refcnt = 1;
	396	p2->p_sigacts = NULL; /* finished in vm_fork() */
	397	}
	398	if (flags & RFLINUXTHPN)
	399	p2->p_sigparent = SIGUSR1;
	400	else
	401	p2->p_sigparent = SIGCHLD;
	402
	403	/* bump references to the text vnode (for procfs) */
	404	p2->p_textvp = p1->p_textvp;
	405	if (p2->p_textvp)
	406	VREF(p2->p_textvp);
	407
	408	if (flags & RFCFDG) {
	409	p2->p_fd = fdinit(p1);
	410	fdtol = NULL;
	411	} else if (flags & RFFDG) {
	412	p2->p_fd = fdcopy(p1);
	413	fdtol = NULL;
	414	} else {
	415	p2->p_fd = fdshare(p1);
	416	if (p1->p_fdtol == NULL)
	417	p1->p_fdtol =
	418	filedesc_to_leader_alloc(NULL,
	419	p1->p_leader);
	420	if ((flags & RFTHREAD) != 0) {
	421	/*
	422	* Shared file descriptor table and
	423	* shared process leaders.
	424	*/
	425	fdtol = p1->p_fdtol;
	426	fdtol->fdl_refcount++;
	427	} else {
	428	/*
	429	* Shared file descriptor table, and
	430	* different process leaders
	431	*/
	432	fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
	433	p2);
	434	}
	435	}
	436	p2->p_fdtol = fdtol;
	437
	438	/*
	439	* If p_limit is still copy-on-write, bump refcnt,
	440	* otherwise get a copy that won't be modified.
	441	* (If PL_SHAREMOD is clear, the structure is shared
	442	* copy-on-write.)
	443	*/
	444	if (p1->p_limit->p_lflags & PL_SHAREMOD)
	445	p2->p_limit = limcopy(p1->p_limit);
	446	else {
	447	p2->p_limit = p1->p_limit;
	448	p2->p_limit->p_refcnt++;
	449	}
	450
	451	/*
	452	* Preserve some more flags in subprocess. P_PROFIL has already
	453	* been preserved.
	454	*/
	455	p2->p_flag \|= p1->p_flag & (P_SUGID \| P_ALTSTACK);
	456	if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
	457	p2->p_flag \|= P_CONTROLT;
	458	if (flags & RFPPWAIT)
	459	p2->p_flag \|= P_PPWAIT;
	460
	461	LIST_INSERT_AFTER(p1, p2, p_pglist);
	462
	463	/*
	464	* Attach the new process to its parent.
	465	*
	466	* If RFNOWAIT is set, the newly created process becomes a child
	467	* of init. This effectively disassociates the child from the
	468	* parent.
	469	*/
	470	if (flags & RFNOWAIT)
	471	pptr = initproc;
	472	else
	473	pptr = p1;
	474	p2->p_pptr = pptr;
	475	LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
	476	LIST_INIT(&p2->p_children);
	477
	478	#ifdef KTRACE
	479	/*
	480	* Copy traceflag and tracefile if enabled. If not inherited,
	481	* these were zeroed above but we still could have a trace race
	482	* so make sure p2's p_tracep is NULL.
	483	*/
	484	if ((p1->p_traceflag & KTRFAC_INHERIT) && p2->p_tracep == NULL) {
	485	p2->p_traceflag = p1->p_traceflag;
	486	if ((p2->p_tracep = p1->p_tracep) != NULL)
	487	VREF(p2->p_tracep);
	488	}
	489	#endif
	490
	491	/*
	492	* set priority of child to be that of parent
	493	*/
	494	p2->p_estcpu = p1->p_estcpu;
	495
	496	/*
	497	* This begins the section where we must prevent the parent
	498	* from being swapped.
	499	*/
	500	PHOLD(p1);
	501
	502	/*
	503	* Finish creating the child process. It will return via a different
	504	* execution path later. (ie: directly into user mode)
	505	*/
	506	vm_fork(p1, p2, flags);
	507
	508	if (flags == (RFFDG \| RFPROC)) {
	509	mycpu->gd_cnt.v_forks++;
	510	mycpu->gd_cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
	511	} else if (flags == (RFFDG \| RFPROC \| RFPPWAIT \| RFMEM)) {
	512	mycpu->gd_cnt.v_vforks++;
	513	mycpu->gd_cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
	514	} else if (p1 == &proc0) {
	515	mycpu->gd_cnt.v_kthreads++;
	516	mycpu->gd_cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
	517	} else {
	518	mycpu->gd_cnt.v_rforks++;
	519	mycpu->gd_cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
	520	}
	521
	522	/*
	523	* Both processes are set up, now check if any loadable modules want
	524	* to adjust anything.
	525	* What if they have an error? XXX
	526	*/
	527	TAILQ_FOREACH(ep, &fork_list, next) {
	528	(*ep->function)(p1, p2, flags);
	529	}
	530
	531	/*
	532	* Make child runnable and add to run queue.
	533	*/
	534	microtime(&(p2->p_stats->p_start));
	535	p2->p_acflag = AFORK;
	536
	537	/*
	538	* tell any interested parties about the new process
	539	*/
	540	KNOTE(&p1->p_klist, NOTE_FORK \| p2->p_pid);
	541
	542	/*
	543	* Return child proc pointer to parent.
	544	*/
	545	*procp = p2;
	546	return (0);
	547	}
	548
	549	/*
	550	* The next two functionms are general routines to handle adding/deleting
	551	* items on the fork callout list.
	552	*
	553	* at_fork():
	554	* Take the arguments given and put them onto the fork callout list,
	555	* However first make sure that it's not already there.
	556	* Returns 0 on success or a standard error number.
	557	*/
	558
	559	int
	560	at_fork(function)
	561	forklist_fn function;
	562	{
	563	struct forklist *ep;
	564
	565	#ifdef INVARIANTS
	566	/* let the programmer know if he's been stupid */
	567	if (rm_at_fork(function))
	568	printf("WARNING: fork callout entry (%p) already present\n",
	569	function);
	570	#endif
	571	ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT);
	572	if (ep == NULL)
	573	return (ENOMEM);
	574	ep->function = function;
	575	TAILQ_INSERT_TAIL(&fork_list, ep, next);
	576	return (0);
	577	}
	578
	579	/*
	580	* Scan the exit callout list for the given item and remove it..
	581	* Returns the number of items removed (0 or 1)
	582	*/
	583
	584	int
	585	rm_at_fork(function)
	586	forklist_fn function;
	587	{
	588	struct forklist *ep;
	589
	590	TAILQ_FOREACH(ep, &fork_list, next) {
	591	if (ep->function == function) {
	592	TAILQ_REMOVE(&fork_list, ep, next);
	593	free(ep, M_ATFORK);
	594	return(1);
	595	}
	596	}
	597	return (0);
	598	}
	599
	600	/*
	601	* Add a forked process to the run queue after any remaining setup, such
	602	* as setting the fork handler, has been completed.
	603	*/
	604
	605	void
	606	start_forked_proc(struct proc p1, struct proc p2)
	607	{
	608	/*
	609	* Move from SIDL to RUN queue, and activate the process's thread.
	610	* Activation of the thread effectively makes the process "a"
	611	* current process, so we do not setrunqueue().
	612	*/
	613	KASSERT(p2->p_stat == SIDL,
	614	("cannot start forked process, bad status: %p", p2));
	615	(void) splhigh();
	616	p2->p_stat = SRUN;
	617	setrunqueue(p2);
	618	(void) spl0();
	619
	620	/*
	621	* Now can be swapped.
	622	*/
	623	PRELE(p1);
	624
	625	/*
	626	* Preserve synchronization semantics of vfork. If waiting for
	627	* child to exec or exit, set P_PPWAIT on child, and sleep on our
	628	* proc (in case of exit).
	629	*/
	630	while (p2->p_flag & P_PPWAIT)
	631	tsleep(p1, 0, "ppwait", 0);
	632	}
	633