/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
 * $FreeBSD: src/sys/kern/kern_fork.c,v 1.72.2.13 2003/06/06 20:21:32 tegge Exp $
 * $DragonFly: src/sys/kern/kern_fork.c,v 1.5 2003/06/18 18:30:08 dillon Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/jail.h>

#include <vm/vm.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <sys/vmmeter.h>
#include <sys/user.h>

static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");
/*
 * These are the structures used to create a callout list of things to do
 * when forking a process.
 */
struct forklist {
        forklist_fn function;
        TAILQ_ENTRY(forklist) next;
};

TAILQ_HEAD(forklist_head, forklist);
static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);

#ifndef _SYS_SYSPROTO_H_
struct fork_args {
        int dummy;
};
#endif

int forksleep;          /* Place for fork1() to sleep on. */

/* ARGSUSED */
int
fork(p, uap)
        struct proc *p;
        struct fork_args *uap;
{
        int error;
        struct proc *p2;

        error = fork1(p, RFFDG | RFPROC, &p2);
        if (error == 0) {
                p->p_retval[0] = p2->p_pid;
                p->p_retval[1] = 0;
        }
        return error;
}

/* ARGSUSED */
int
vfork(p, uap)
        struct proc *p;
        struct vfork_args *uap;
{
        int error;
        struct proc *p2;

        error = fork1(p, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
        if (error == 0) {
                p->p_retval[0] = p2->p_pid;
                p->p_retval[1] = 0;
        }
        return error;
}

int
rfork(p, uap)
        struct proc *p;
        struct rfork_args *uap;
{
        int error;
        struct proc *p2;

        error = fork1(p, uap->flags, &p2);
        if (error == 0) {
                p->p_retval[0] = p2 ? p2->p_pid : 0;
                p->p_retval[1] = 0;
        }
        return error;
}
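
/*
 * Note: fork() and vfork() above are thin wrappers around fork1().  In
 * terms of rfork(2) flags they correspond to:
 *
 *	fork()	== rfork(RFFDG | RFPROC)
 *	vfork()	== rfork(RFFDG | RFPROC | RFPPWAIT | RFMEM)
 *
 * RFFDG copies the file descriptor table, RFPROC creates the new process,
 * RFMEM shares the parent's address space, and RFPPWAIT puts the parent
 * to sleep until the child execs or exits.
 */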

int nprocs = 1;         /* process 0 */
static int nextpid = 0;

/*
 * Random component to nextpid generation.  We mix in a random factor to make
 * it a little harder to predict.  We sanity check the modulus value to avoid
 * doing it in critical paths.  Don't let it be too small or we pointlessly
 * waste randomness entropy, and don't let it be impossibly large.  Using a
 * modulus that is too big causes a LOT more process table scans and slows
 * down fork processing as the pidchecked caching is defeated.
 */
static int randompid = 0;

static int
sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
{
        int error, pid;

        pid = randompid;
        error = sysctl_handle_int(oidp, &pid, 0, req);
        if (error || !req->newptr)
                return (error);
        if (pid < 0 || pid > PID_MAX - 100)     /* out of range */
                pid = PID_MAX - 100;
        else if (pid < 2)                       /* NOP */
                pid = 0;
        else if (pid < 100)                     /* Make it reasonable */
                pid = 100;
        randompid = pid;
        return (error);
}

SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
    0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
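
/*
 * Example: randomized PID selection can be enabled at run time with
 * something like "sysctl kern.randompid=100" (the value is illustrative).
 * The handler above clamps the modulus to a sane range, and fork1() below
 * then advances nextpid by a random amount less than the modulus on each
 * fork.
 */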

int
fork1(p1, flags, procp)
        struct proc *p1;
        int flags;
        struct proc **procp;
{
        struct proc *p2, *pptr;
        uid_t uid;
        struct proc *newproc;
        int ok;
        static int pidchecked = 0;
        struct forklist *ep;
        struct filedesc_to_leader *fdtol;

        if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
                return (EINVAL);

        /*
         * Here we don't create a new process, but we divorce
         * certain parts of a process from itself.
         */
        if ((flags & RFPROC) == 0) {

                vm_fork(p1, 0, flags);

                /*
                 * Close all file descriptors.
                 */
                if (flags & RFCFDG) {
                        struct filedesc *fdtmp;
                        fdtmp = fdinit(p1);
                        fdfree(p1);
                        p1->p_fd = fdtmp;
                }

                /*
                 * Unshare file descriptors (from parent.)
                 */
                if (flags & RFFDG) {
                        if (p1->p_fd->fd_refcnt > 1) {
                                struct filedesc *newfd;
                                newfd = fdcopy(p1);
                                fdfree(p1);
                                p1->p_fd = newfd;
                        }
                }
                *procp = NULL;
                return (0);
        }
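
        /*
         * Example: a caller that only wants to unshare its descriptor
         * table in place, without creating a child, can use rfork(RFFDG)
         * with RFPROC clear; the branch above copies the table and returns
         * with *procp set to NULL.
         */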

        /*
         * Although process entries are dynamically created, we still keep
         * a global limit on the maximum number we will create.  Don't allow
         * a nonprivileged user to use the last ten processes; don't let root
         * exceed the limit.  The variable nprocs is the current number of
         * processes, maxproc is the limit.
         */
        uid = p1->p_cred->p_ruid;
        if ((nprocs >= maxproc - 10 && uid != 0) || nprocs >= maxproc) {
                tsleep(&forksleep, PUSER, "fork", hz / 2);
                return (EAGAIN);
        }
        /*
         * Increment the nprocs resource before blocking can occur.  There
         * are hard-limits as to the number of processes that can run.
         */
        nprocs++;

        /*
         * Increment the count of procs running with this uid.  Don't allow
         * a nonprivileged user to exceed their current limit.
         */
        ok = chgproccnt(p1->p_cred->p_uidinfo, 1,
                (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0);
        if (!ok) {
                /*
                 * Back out the process count
                 */
                nprocs--;
                tsleep(&forksleep, PUSER, "fork", hz / 2);
                return (EAGAIN);
        }

        /* Allocate new proc. */
        newproc = zalloc(proc_zone);

        /*
         * Setup linkage for kernel based threading
         */
        if ((flags & RFTHREAD) != 0) {
                newproc->p_peers = p1->p_peers;
                p1->p_peers = newproc;
                newproc->p_leader = p1->p_leader;
        } else {
                newproc->p_peers = 0;
                newproc->p_leader = newproc;
        }

        newproc->p_wakeup = 0;
        newproc->p_vmspace = NULL;

        /*
         * Find an unused process ID.  We remember a range of unused IDs
         * ready to use (from nextpid+1 through pidchecked-1).
         */
        nextpid++;
        if (randompid)
                nextpid += arc4random() % randompid;
retry:
        /*
         * If the process ID prototype has wrapped around,
         * restart somewhat above 0, as the low-numbered procs
         * tend to include daemons that don't exit.
         */
        if (nextpid >= PID_MAX) {
                nextpid = nextpid % PID_MAX;
                if (nextpid < 100)
                        nextpid += 100;
                pidchecked = 0;
        }
        if (nextpid >= pidchecked) {
                int doingzomb = 0;

                pidchecked = PID_MAX;
                /*
                 * Scan the active and zombie procs to check whether this pid
                 * is in use.  Remember the lowest pid that's greater
                 * than nextpid, so we can avoid checking for a while.
                 */
                p2 = LIST_FIRST(&allproc);
again:
                for (; p2 != 0; p2 = LIST_NEXT(p2, p_list)) {
                        while (p2->p_pid == nextpid ||
                            p2->p_pgrp->pg_id == nextpid ||
                            p2->p_session->s_sid == nextpid) {
                                nextpid++;
                                if (nextpid >= pidchecked)
                                        goto retry;
                        }
                        if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
                                pidchecked = p2->p_pid;
                        if (p2->p_pgrp->pg_id > nextpid &&
                            pidchecked > p2->p_pgrp->pg_id)
                                pidchecked = p2->p_pgrp->pg_id;
                        if (p2->p_session->s_sid > nextpid &&
                            pidchecked > p2->p_session->s_sid)
                                pidchecked = p2->p_session->s_sid;
                }
                if (!doingzomb) {
                        doingzomb = 1;
                        p2 = LIST_FIRST(&zombproc);
                        goto again;
                }
        }
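
        /*
         * Worked example: if nextpid has advanced to 612 and the scan finds
         * the smallest in-use pid/pgid/sid above 612 to be 650, pidchecked
         * is left at 650, so pids 613-649 can be handed out on subsequent
         * forks without rescanning the process lists.
         */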

        p2 = newproc;
        p2->p_stat = SIDL;                      /* protect against others */
        p2->p_pid = nextpid;
        LIST_INSERT_HEAD(&allproc, p2, p_list);
        LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);

        /*
         * Make a proc table entry for the new process.
         * Start by zeroing the section of proc that is zero-initialized,
         * then copy the section that is copied directly from the parent.
         */
        bzero(&p2->p_startzero,
            (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero));
        bcopy(&p1->p_startcopy, &p2->p_startcopy,
            (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));

        p2->p_aioinfo = NULL;

        /*
         * Duplicate sub-structures as needed.
         * Increase reference counts on shared objects.
         * The p_stats and p_sigacts substructs are set in vm_fork.
         */
        p2->p_flag = P_INMEM;
        if (p1->p_flag & P_PROFIL)
                startprofclock(p2);
        MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred),
            M_SUBPROC, M_WAITOK);
        bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred));
        p2->p_cred->p_refcnt = 1;
        crhold(p1->p_ucred);
        uihold(p1->p_cred->p_uidinfo);

        if (p2->p_prison) {
                p2->p_prison->pr_ref++;
                p2->p_flag |= P_JAILED;
        }

        if (p2->p_args)
                p2->p_args->ar_ref++;

        if (flags & RFSIGSHARE) {
                p2->p_procsig = p1->p_procsig;
                p2->p_procsig->ps_refcnt++;
                if (p1->p_sigacts == &p1->p_addr->u_sigacts) {
                        struct sigacts *newsigacts;
                        int s;

                        /* Create the shared sigacts structure */
                        MALLOC(newsigacts, struct sigacts *,
                            sizeof(struct sigacts), M_SUBPROC, M_WAITOK);
                        s = splhigh();
                        /*
                         * Set p_sigacts to the new shared structure.
                         * Note that this is updating p1->p_sigacts at the
                         * same time, since p_sigacts is just a pointer to
                         * the shared p_procsig->ps_sigacts.
                         */
                        p2->p_sigacts = newsigacts;
                        *p2->p_sigacts = p1->p_addr->u_sigacts;
                        splx(s);
                }
        } else {
                MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig),
                    M_SUBPROC, M_WAITOK);
                bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig));
                p2->p_procsig->ps_refcnt = 1;
                p2->p_sigacts = NULL;   /* finished in vm_fork() */
        }
        if (flags & RFLINUXTHPN)
                p2->p_sigparent = SIGUSR1;
        else
                p2->p_sigparent = SIGCHLD;

        /* bump references to the text vnode (for procfs) */
        p2->p_textvp = p1->p_textvp;
        if (p2->p_textvp)
                VREF(p2->p_textvp);

        if (flags & RFCFDG) {
                p2->p_fd = fdinit(p1);
                fdtol = NULL;
        } else if (flags & RFFDG) {
                p2->p_fd = fdcopy(p1);
                fdtol = NULL;
        } else {
                p2->p_fd = fdshare(p1);
                if (p1->p_fdtol == NULL)
                        p1->p_fdtol =
                                filedesc_to_leader_alloc(NULL,
                                                         p1->p_leader);
                if ((flags & RFTHREAD) != 0) {
                        /*
                         * Shared file descriptor table and
                         * shared process leaders.
                         */
                        fdtol = p1->p_fdtol;
                        fdtol->fdl_refcount++;
                } else {
                        /*
                         * Shared file descriptor table, and
                         * different process leaders
                         */
                        fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
                                                         p2);
                }
        }
        p2->p_fdtol = fdtol;

        /*
         * If p_limit is still copy-on-write, bump refcnt,
         * otherwise get a copy that won't be modified.
         * (If PL_SHAREMOD is clear, the structure is shared
         * copy-on-write.)
         */
        if (p1->p_limit->p_lflags & PL_SHAREMOD)
                p2->p_limit = limcopy(p1->p_limit);
        else {
                p2->p_limit = p1->p_limit;
                p2->p_limit->p_refcnt++;
        }

        /*
         * Preserve some more flags in subprocess.  P_PROFIL has already
         * been preserved.
         */
        p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK);
        if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
                p2->p_flag |= P_CONTROLT;
        if (flags & RFPPWAIT)
                p2->p_flag |= P_PPWAIT;

        LIST_INSERT_AFTER(p1, p2, p_pglist);

        /*
         * Attach the new process to its parent.
         *
         * If RFNOWAIT is set, the newly created process becomes a child
         * of init.  This effectively disassociates the child from the
         * parent.
         */
        if (flags & RFNOWAIT)
                pptr = initproc;
        else
                pptr = p1;
        p2->p_pptr = pptr;
        LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
        LIST_INIT(&p2->p_children);

#ifdef KTRACE
        /*
         * Copy traceflag and tracefile if enabled.  If not inherited,
         * these were zeroed above but we still could have a trace race
         * so make sure p2's p_tracep is NULL.
         */
        if ((p1->p_traceflag & KTRFAC_INHERIT) && p2->p_tracep == NULL) {
                p2->p_traceflag = p1->p_traceflag;
                if ((p2->p_tracep = p1->p_tracep) != NULL)
                        VREF(p2->p_tracep);
        }
#endif

        /*
         * set priority of child to be that of parent
         */
        p2->p_estcpu = p1->p_estcpu;

        /*
         * This begins the section where we must prevent the parent
         * from being swapped.
         */
        PHOLD(p1);

        /*
         * Finish creating the child process.  It will return via a different
         * execution path later.  (ie: directly into user mode)
         */
        vm_fork(p1, p2, flags);

        if (flags == (RFFDG | RFPROC)) {
                cnt.v_forks++;
                cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
        } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
                cnt.v_vforks++;
                cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
        } else if (p1 == &proc0) {
                cnt.v_kthreads++;
                cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
        } else {
                cnt.v_rforks++;
                cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize;
        }

        /*
         * Both processes are set up, now check if any loadable modules want
         * to adjust anything.
         *   What if they have an error? XXX
         */
        TAILQ_FOREACH(ep, &fork_list, next) {
                (*ep->function)(p1, p2, flags);
        }

        /*
         * Make child runnable and add to run queue.
         */
        microtime(&(p2->p_stats->p_start));
        p2->p_acflag = AFORK;
        (void) splhigh();
        p2->p_stat = SRUN;
        setrunqueue(p2);
        (void) spl0();

        /*
         * Now can be swapped.
         */
        PRELE(p1);

        /*
         * tell any interested parties about the new process
         */
        KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid);

        /*
         * Preserve synchronization semantics of vfork.  If waiting for
         * child to exec or exit, set P_PPWAIT on child, and sleep on our
         * proc (in case of exit).
         */
        while (p2->p_flag & P_PPWAIT)
                tsleep(p1, PWAIT, "ppwait", 0);
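
        /*
         * (P_PPWAIT is cleared, and the parent woken up, when the child
         * execs or exits; see the corresponding code in the exec and exit
         * paths.)
         */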

        /*
         * Return child proc pointer to parent.
         */
        *procp = p2;
        return (0);
}

/*
 * The next two functions are general routines to handle adding/deleting
 * items on the fork callout list.
 *
 * at_fork():
 * Take the arguments given and put them onto the fork callout list,
 * but first make sure that it's not already there.
 * Returns 0 on success or a standard error number.
 */

int
at_fork(function)
        forklist_fn function;
{
        struct forklist *ep;

#ifdef INVARIANTS
        /* let the programmer know if he's been stupid */
        if (rm_at_fork(function))
                printf("WARNING: fork callout entry (%p) already present\n",
                    function);
#endif
        ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT);
        if (ep == NULL)
                return (ENOMEM);
        ep->function = function;
        TAILQ_INSERT_TAIL(&fork_list, ep, next);
        return (0);
}
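
/*
 * Illustrative (hypothetical module code): a loadable module could hook
 * process creation by registering a callback whose signature matches
 * forklist_fn, for example:
 *
 *	static void
 *	mymod_fork_hook(struct proc *p1, struct proc *p2, int flags)
 *	{
 *		(set up per-process module state here)
 *	}
 *
 * calling at_fork(mymod_fork_hook) at load time and
 * rm_at_fork(mymod_fork_hook) at unload time.  The hook names here are
 * invented for illustration only.
 */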

/*
 * Scan the fork callout list for the given item and remove it.
 * Returns the number of items removed (0 or 1).
 */

int
rm_at_fork(function)
        forklist_fn function;
{
        struct forklist *ep;

        TAILQ_FOREACH(ep, &fork_list, next) {
                if (ep->function == function) {
                        TAILQ_REMOVE(&fork_list, ep, next);
                        free(ep, M_ATFORK);
                        return (1);
                }
        }
        return (0);
}