gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1993, David Greenman
	3	* All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	* 1. Redistributions of source code must retain the above copyright
	9	* notice, this list of conditions and the following disclaimer.
	10	* 2. Redistributions in binary form must reproduce the above copyright
	11	* notice, this list of conditions and the following disclaimer in the
	12	* documentation and/or other materials provided with the distribution.
	13	*
	14	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
	15	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	17	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
	18	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	19	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	20	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	21	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	22	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	23	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	24	* SUCH DAMAGE.
	25	*
	26	* $FreeBSD: src/sys/kern/kern_exec.c,v 1.107.2.15 2002/07/30 15:40:46 nectar Exp $
	27	* $DragonFly: src/sys/kern/kern_exec.c,v 1.64 2008/10/26 04:29:19 sephe Exp $
	28	*/
	29
	30	#include <sys/param.h>
	31	#include <sys/systm.h>
	32	#include <sys/sysproto.h>
	33	#include <sys/kernel.h>
	34	#include <sys/mount.h>
	35	#include <sys/filedesc.h>
	36	#include <sys/fcntl.h>
	37	#include <sys/acct.h>
	38	#include <sys/exec.h>
	39	#include <sys/imgact.h>
	40	#include <sys/imgact_elf.h>
	41	#include <sys/kern_syscall.h>
	42	#include <sys/wait.h>
	43	#include <sys/malloc.h>
	44	#include <sys/proc.h>
	45	#include <sys/priv.h>
	46	#include <sys/ktrace.h>
	47	#include <sys/signalvar.h>
	48	#include <sys/pioctl.h>
	49	#include <sys/nlookup.h>
	50	#include <sys/sfbuf.h>
	51	#include <sys/sysent.h>
	52	#include <sys/shm.h>
	53	#include <sys/sysctl.h>
	54	#include <sys/vnode.h>
	55	#include <sys/vmmeter.h>
	56	#include <sys/aio.h>
	57	#include <sys/libkern.h>
	58
	59	#include <vm/vm.h>
	60	#include <vm/vm_param.h>
	61	#include <sys/lock.h>
	62	#include <vm/pmap.h>
	63	#include <vm/vm_page.h>
	64	#include <vm/vm_map.h>
	65	#include <vm/vm_kern.h>
	66	#include <vm/vm_extern.h>
	67	#include <vm/vm_object.h>
	68	#include <vm/vnode_pager.h>
	69	#include <vm/vm_pager.h>
	70
	71	#include <sys/user.h>
	72	#include <sys/reg.h>
	73
	74	#include <sys/thread2.h>
	75
	76	MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
	77	MALLOC_DEFINE(M_EXECARGS, "exec-args", "Exec arguments");
	78
	79	static register_t exec_copyout_strings (struct image_params );
	80
	81	/* XXX This should be vm_size_t. */
	82	static u_long ps_strings = PS_STRINGS;
	83	SYSCTL_ULONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings, 0, "");
	84
	85	/* XXX This should be vm_size_t. */
	86	static u_long usrstack = USRSTACK;
	87	SYSCTL_ULONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, 0, "");
	88
	89	u_long ps_arg_cache_limit = PAGE_SIZE / 16;
	90	SYSCTL_LONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
	91	&ps_arg_cache_limit, 0, "");
	92
	93	int ps_argsopen = 1;
	94	SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, "");
	95
	96	static int ktrace_suid = 0;
	97	SYSCTL_INT(_kern, OID_AUTO, ktrace_suid, CTLFLAG_RW, &ktrace_suid, 0, "");
	98
	99	void print_execve_args(struct image_args *args);
	100	int debug_execve_args = 0;
	101	SYSCTL_INT(_kern, OID_AUTO, debug_execve_args, CTLFLAG_RW, &debug_execve_args,
	102	0, "");
	103
	104	/*
	105	* Exec arguments object cache
	106	*/
	107	static struct objcache *exec_objcache;
	108
	109	static
	110	void
	111	exec_objcache_init(void *arg __unused)
	112	{
	113	int cluster_limit;
	114
	115	cluster_limit = 16; /* up to this many objects */
	116	exec_objcache = objcache_create_mbacked(
	117	M_EXECARGS, PATH_MAX + ARG_MAX,
	118	&cluster_limit,
	119	2, /* minimal magazine capacity */
	120	NULL, NULL, NULL);
	121	}
	122	SYSINIT(exec_objcache, SI_BOOT2_MACHDEP, SI_ORDER_ANY, exec_objcache_init, 0);
	123
	124	/*
	125	* stackgap_random specifies if the stackgap should have a random size added
	126	* to it. It must be a power of 2. If non-zero, the stack gap will be
	127	* calculated as: ALIGN(karc4random() & (stackgap_random - 1)).
	128	*/
	129	static int stackgap_random = 1024;
	130	static int
	131	sysctl_kern_stackgap(SYSCTL_HANDLER_ARGS)
	132	{
	133	int error, new_val;
	134	new_val = stackgap_random;
	135	error = sysctl_handle_int(oidp, &new_val, 0, req);
	136	if (error != 0 \|\| req->newptr == NULL)
	137	return (error);
	138	if ((new_val < 0) \|\| (new_val > 16 * PAGE_SIZE) \|\| ! powerof2(new_val))
	139	return (EINVAL);
	140	stackgap_random = new_val;
	141
	142	return(0);
	143	}
	144
	145	SYSCTL_PROC(_kern, OID_AUTO, stackgap_random, CTLFLAG_RW\|CTLTYPE_UINT,
	146	0, 0, sysctl_kern_stackgap, "IU", "Max random stack gap (power of 2)");
	147
	148	void
	149	print_execve_args(struct image_args *args)
	150	{
	151	char *cp;
	152	int ndx;
	153
	154	cp = args->begin_argv;
	155	for (ndx = 0; ndx < args->argc; ndx++) {
	156	kprintf("\targv[%d]: %s\n", ndx, cp);
	157	while (*cp++ != '\0');
	158	}
	159	for (ndx = 0; ndx < args->envc; ndx++) {
	160	kprintf("\tenvv[%d]: %s\n", ndx, cp);
	161	while (*cp++ != '\0');
	162	}
	163	}
	164
	165	/*
	166	* Each of the items is a pointer to a `const struct execsw', hence the
	167	* double pointer here.
	168	*/
	169	static const struct execsw **execsw;
	170
	171	/*
	172	* Replace current vmspace with a new binary.
	173	* Returns 0 on success, > 0 on recoverable error (use as errno).
	174	* Returns -1 on lethal error which demands killing of the current
	175	* process!
	176	*/
	177	int
	178	kern_execve(struct nlookupdata nd, struct image_args args)
	179	{
	180	struct thread *td = curthread;
	181	struct lwp *lp = td->td_lwp;
	182	struct proc *p = td->td_proc;
	183	register_t *stack_base;
	184	int error, len, i;
	185	struct image_params image_params, *imgp;
	186	struct vattr attr;
	187	int (img_first) (struct image_params );
	188
	189	if (debug_execve_args) {
	190	kprintf("%s()\n", __func__);
	191	print_execve_args(args);
	192	}
	193
	194	KKASSERT(p);
	195	imgp = &image_params;
	196
	197	/*
	198	* NOTE: P_INEXEC is handled by exec_new_vmspace() now. We make
	199	* no modifications to the process at all until we get there.
	200	*
	201	* Note that multiple threads may be trying to exec at the same
	202	* time. exec_new_vmspace() handles that too.
	203	*/
	204
	205	/*
	206	* Initialize part of the common data
	207	*/
	208	imgp->proc = p;
	209	imgp->args = args;
	210	imgp->attr = &attr;
	211	imgp->entry_addr = 0;
	212	imgp->resident = 0;
	213	imgp->vmspace_destroyed = 0;
	214	imgp->interpreted = 0;
	215	imgp->interpreter_name[0] = 0;
	216	imgp->auxargs = NULL;
	217	imgp->vp = NULL;
	218	imgp->firstpage = NULL;
	219	imgp->ps_strings = 0;
	220	imgp->image_header = NULL;
	221
	222	interpret:
	223
	224	/*
	225	* Translate the file name to a vnode. Unlock the cache entry to
	226	* improve parallelism for programs exec'd in parallel.
	227	*/
	228	if ((error = nlookup(nd)) != 0)
	229	goto exec_fail;
	230	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &imgp->vp);
	231	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
	232	nd->nl_flags &= ~NLC_NCPISLOCKED;
	233	cache_unlock(&nd->nl_nch);
	234	if (error)
	235	goto exec_fail;
	236
	237	/*
	238	* Check file permissions (also 'opens' file)
	239	*/
	240	error = exec_check_permissions(imgp);
	241	if (error) {
	242	vn_unlock(imgp->vp);
	243	goto exec_fail_dealloc;
	244	}
	245
	246	error = exec_map_first_page(imgp);
	247	vn_unlock(imgp->vp);
	248	if (error)
	249	goto exec_fail_dealloc;
	250
	251	if (debug_execve_args && imgp->interpreted) {
	252	kprintf(" target is interpreted -- recursive pass\n");
	253	kprintf(" interpreter: %s\n", imgp->interpreter_name);
	254	print_execve_args(args);
	255	}
	256
	257	/*
	258	* If the current process has a special image activator it
	259	* wants to try first, call it. For example, emulating shell
	260	* scripts differently.
	261	*/
	262	error = -1;
	263	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
	264	error = img_first(imgp);
	265
	266	/*
	267	* If the vnode has a registered vmspace, exec the vmspace
	268	*/
	269	if (error == -1 && imgp->vp->v_resident) {
	270	error = exec_resident_imgact(imgp);
	271	}
	272
	273	/*
	274	* Loop through the list of image activators, calling each one.
	275	* An activator returns -1 if there is no match, 0 on success,
	276	* and an error otherwise.
	277	*/
	278	for (i = 0; error == -1 && execsw[i]; ++i) {
	279	if (execsw[i]->ex_imgact == NULL \|\|
	280	execsw[i]->ex_imgact == img_first) {
	281	continue;
	282	}
	283	error = (*execsw[i]->ex_imgact)(imgp);
	284	}
	285
	286	if (error) {
	287	if (error == -1)
	288	error = ENOEXEC;
	289	goto exec_fail_dealloc;
	290	}
	291
	292	/*
	293	* Special interpreter operation, cleanup and loop up to try to
	294	* activate the interpreter.
	295	*/
	296	if (imgp->interpreted) {
	297	exec_unmap_first_page(imgp);
	298	nlookup_done(nd);
	299	vrele(imgp->vp);
	300	imgp->vp = NULL;
	301	error = nlookup_init(nd, imgp->interpreter_name, UIO_SYSSPACE,
	302	NLC_FOLLOW);
	303	if (error)
	304	goto exec_fail;
	305	goto interpret;
	306	}
	307
	308	/*
	309	* Copy out strings (args and env) and initialize stack base
	310	*/
	311	stack_base = exec_copyout_strings(imgp);
	312	p->p_vmspace->vm_minsaddr = (char *)stack_base;
	313
	314	/*
	315	* If custom stack fixup routine present for this process
	316	* let it do the stack setup. If we are running a resident
	317	* image there is no auxinfo or other image activator context
	318	* so don't try to add fixups to the stack.
	319	*
	320	* Else stuff argument count as first item on stack
	321	*/
	322	if (p->p_sysent->sv_fixup && imgp->resident == 0)
	323	(*p->p_sysent->sv_fixup)(&stack_base, imgp);
	324	else
	325	suword(--stack_base, imgp->args->argc);
	326
	327	/*
	328	* For security and other reasons, the file descriptor table cannot
	329	* be shared after an exec.
	330	*/
	331	if (p->p_fd->fd_refcnt > 1) {
	332	struct filedesc *tmp;
	333
	334	tmp = fdcopy(p);
	335	fdfree(p, tmp);
	336	}
	337
	338	/*
	339	* For security and other reasons, signal handlers cannot
	340	* be shared after an exec. The new proces gets a copy of the old
	341	* handlers. In execsigs(), the new process will have its signals
	342	* reset.
	343	*/
	344	if (p->p_sigacts->ps_refcnt > 1) {
	345	struct sigacts *newsigacts;
	346
	347	newsigacts = (struct sigacts )kmalloc(sizeof(newsigacts),
	348	M_SUBPROC, M_WAITOK);
	349	bcopy(p->p_sigacts, newsigacts, sizeof(*newsigacts));
	350	p->p_sigacts->ps_refcnt--;
	351	p->p_sigacts = newsigacts;
	352	p->p_sigacts->ps_refcnt = 1;
	353	}
	354
	355	/*
	356	* For security and other reasons virtual kernels cannot be
	357	* inherited by an exec. This also allows a virtual kernel
	358	* to fork/exec unrelated applications.
	359	*/
	360	if (p->p_vkernel)
	361	vkernel_exit(p);
	362
	363	/* Stop profiling */
	364	stopprofclock(p);
	365
	366	/* close files on exec */
	367	fdcloseexec(p);
	368
	369	/* reset caught signals */
	370	execsigs(p);
	371
	372	/* name this process - nameiexec(p, ndp) */
	373	len = min(nd->nl_nch.ncp->nc_nlen, MAXCOMLEN);
	374	bcopy(nd->nl_nch.ncp->nc_name, p->p_comm, len);
	375	p->p_comm[len] = 0;
	376	bcopy(p->p_comm, lp->lwp_thread->td_comm, MAXCOMLEN+1);
	377
	378	/*
	379	* mark as execed, wakeup the process that vforked (if any) and tell
	380	* it that it now has its own resources back
	381	*/
	382	p->p_flag \|= P_EXEC;
	383	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
	384	p->p_flag &= ~P_PPWAIT;
	385	wakeup((caddr_t)p->p_pptr);
	386	}
	387
	388	/*
	389	* Implement image setuid/setgid.
	390	*
	391	* Don't honor setuid/setgid if the filesystem prohibits it or if
	392	* the process is being traced.
	393	*/
	394	if ((((attr.va_mode & VSUID) && p->p_ucred->cr_uid != attr.va_uid) \|\|
	395	((attr.va_mode & VSGID) && p->p_ucred->cr_gid != attr.va_gid)) &&
	396	(imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
	397	(p->p_flag & P_TRACED) == 0) {
	398	/*
	399	* Turn off syscall tracing for set-id programs, except for
	400	* root. Record any set-id flags first to make sure that
	401	* we do not regain any tracing during a possible block.
	402	*/
	403	setsugid();
	404	if (p->p_tracenode && ktrace_suid == 0 &&
	405	priv_check(td, PRIV_ROOT) != 0) {
	406	ktrdestroy(&p->p_tracenode);
	407	p->p_traceflag = 0;
	408	}
	409	/* Close any file descriptors 0..2 that reference procfs */
	410	setugidsafety(p);
	411	/* Make sure file descriptors 0..2 are in use. */
	412	error = fdcheckstd(p);
	413	if (error != 0)
	414	goto exec_fail_dealloc;
	415	/*
	416	* Set the new credentials.
	417	*/
	418	cratom(&p->p_ucred);
	419	if (attr.va_mode & VSUID)
	420	change_euid(attr.va_uid);
	421	if (attr.va_mode & VSGID)
	422	p->p_ucred->cr_gid = attr.va_gid;
	423
	424	/*
	425	* Clear local varsym variables
	426	*/
	427	varsymset_clean(&p->p_varsymset);
	428	} else {
	429	if (p->p_ucred->cr_uid == p->p_ucred->cr_ruid &&
	430	p->p_ucred->cr_gid == p->p_ucred->cr_rgid)
	431	p->p_flag &= ~P_SUGID;
	432	}
	433
	434	/*
	435	* Implement correct POSIX saved-id behavior.
	436	*/
	437	if (p->p_ucred->cr_svuid != p->p_ucred->cr_uid \|\|
	438	p->p_ucred->cr_svgid != p->p_ucred->cr_gid) {
	439	cratom(&p->p_ucred);
	440	p->p_ucred->cr_svuid = p->p_ucred->cr_uid;
	441	p->p_ucred->cr_svgid = p->p_ucred->cr_gid;
	442	}
	443
	444	/*
	445	* Store the vp for use in procfs
	446	*/
	447	if (p->p_textvp) /* release old reference */
	448	vrele(p->p_textvp);
	449	p->p_textvp = imgp->vp;
	450	vref(p->p_textvp);
	451
	452	/*
	453	* Notify others that we exec'd, and clear the P_INEXEC flag
	454	* as we're now a bona fide freshly-execed process.
	455	*/
	456	KNOTE(&p->p_klist, NOTE_EXEC);
	457	p->p_flag &= ~P_INEXEC;
	458
	459	/*
	460	* If tracing the process, trap to debugger so breakpoints
	461	* can be set before the program executes.
	462	*/
	463	STOPEVENT(p, S_EXEC, 0);
	464
	465	if (p->p_flag & P_TRACED)
	466	ksignal(p, SIGTRAP);
	467
	468	/* clear "fork but no exec" flag, as we _are_ execing */
	469	p->p_acflag &= ~AFORK;
	470
	471	/* Set values passed into the program in registers. */
	472	exec_setregs(imgp->entry_addr, (u_long)(uintptr_t)stack_base,
	473	imgp->ps_strings);
	474
	475	/* Set the access time on the vnode */
	476	vn_mark_atime(imgp->vp, td);
	477
	478	/* Free any previous argument cache */
	479	if (p->p_args && --p->p_args->ar_ref == 0)
	480	FREE(p->p_args, M_PARGS);
	481	p->p_args = NULL;
	482
	483	/* Cache arguments if they fit inside our allowance */
	484	i = imgp->args->begin_envv - imgp->args->begin_argv;
	485	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
	486	MALLOC(p->p_args, struct pargs *, sizeof(struct pargs) + i,
	487	M_PARGS, M_WAITOK);
	488	p->p_args->ar_ref = 1;
	489	p->p_args->ar_length = i;
	490	bcopy(imgp->args->begin_argv, p->p_args->ar_args, i);
	491	}
	492
	493	exec_fail_dealloc:
	494
	495	/*
	496	* free various allocated resources
	497	*/
	498	if (imgp->firstpage)
	499	exec_unmap_first_page(imgp);
	500
	501	if (imgp->vp) {
	502	vrele(imgp->vp);
	503	imgp->vp = NULL;
	504	}
	505
	506	if (error == 0) {
	507	++mycpu->gd_cnt.v_exec;
	508	return (0);
	509	}
	510
	511	exec_fail:
	512	/*
	513	* we're done here, clear P_INEXEC if we were the ones that
	514	* set it. Otherwise if vmspace_destroyed is still set we
	515	* raced another thread and that thread is responsible for
	516	* clearing it.
	517	*/
	518	if (imgp->vmspace_destroyed & 2)
	519	p->p_flag &= ~P_INEXEC;
	520	if (imgp->vmspace_destroyed) {
	521	/*
	522	* Sorry, no more process anymore. exit gracefully.
	523	* However we can't die right here, because our
	524	* caller might have to clean up, so indicate a
	525	* lethal error by returning -1.
	526	*/
	527	return(-1);
	528	} else {
	529	return(error);
	530	}
	531	}
	532
	533	/*
	534	* execve() system call.
	535	*/
	536	int
	537	sys_execve(struct execve_args *uap)
	538	{
	539	struct nlookupdata nd;
	540	struct image_args args;
	541	int error;
	542
	543	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
	544	bzero(&args, sizeof(args));
	545	if (error == 0) {
	546	error = exec_copyin_args(&args, uap->fname, PATH_USERSPACE,
	547	uap->argv, uap->envv);
	548	}
	549	if (error == 0)
	550	error = kern_execve(&nd, &args);
	551	nlookup_done(&nd);
	552	exec_free_args(&args);
	553
	554	if (error < 0) {
	555	/* We hit a lethal error condition. Let's die now. */
	556	exit1(W_EXITCODE(0, SIGABRT));
	557	/* NOTREACHED */
	558	}
	559
	560	/*
	561	* The syscall result is returned in registers to the new program.
	562	* Linux will register %edx as an atexit function and we must be
	563	* sure to set it to 0. XXX
	564	*/
	565	if (error == 0)
	566	uap->sysmsg_result64 = 0;
	567
	568	return (error);
	569	}
	570
	571	int
	572	exec_map_first_page(struct image_params *imgp)
	573	{
	574	int rv, i;
	575	int initial_pagein;
	576	vm_page_t ma[VM_INITIAL_PAGEIN];
	577	vm_page_t m;
	578	vm_object_t object;
	579
	580	if (imgp->firstpage)
	581	exec_unmap_first_page(imgp);
	582
	583	/*
	584	* The file has to be mappable.
	585	*/
	586	if ((object = imgp->vp->v_object) == NULL)
	587	return (EIO);
	588
	589	/*
	590	* We shouldn't need protection for vm_page_grab() but we certainly
	591	* need it for the lookup loop below (lookup/busy race), since
	592	* an interrupt can unbusy and free the page before our busy check.
	593	*/
	594	crit_enter();
	595	m = vm_page_grab(object, 0, VM_ALLOC_NORMAL \| VM_ALLOC_RETRY);
	596
	597	if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
	598	ma[0] = m;
	599	initial_pagein = VM_INITIAL_PAGEIN;
	600	if (initial_pagein > object->size)
	601	initial_pagein = object->size;
	602	for (i = 1; i < initial_pagein; i++) {
	603	if ((m = vm_page_lookup(object, i)) != NULL) {
	604	if ((m->flags & PG_BUSY) \|\| m->busy)
	605	break;
	606	if (m->valid)
	607	break;
	608	vm_page_busy(m);
	609	} else {
	610	m = vm_page_alloc(object, i, VM_ALLOC_NORMAL);
	611	if (m == NULL)
	612	break;
	613	}
	614	ma[i] = m;
	615	}
	616	initial_pagein = i;
	617
	618	/*
	619	* get_pages unbusies all the requested pages except the
	620	* primary page (at index 0 in this case). The primary
	621	* page may have been wired during the pagein (e.g. by
	622	* the buffer cache) so vnode_pager_freepage() must be
	623	* used to properly release it.
	624	*/
	625	rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
	626	m = vm_page_lookup(object, 0);
	627
	628	if (rv != VM_PAGER_OK \|\| m == NULL \|\| m->valid == 0) {
	629	if (m) {
	630	vm_page_protect(m, VM_PROT_NONE);
	631	vnode_pager_freepage(m);
	632	}
	633	crit_exit();
	634	return EIO;
	635	}
	636	}
	637	vm_page_hold(m);
	638	vm_page_wakeup(m); /* unbusy the page */
	639	crit_exit();
	640
	641	imgp->firstpage = sf_buf_alloc(m, SFB_CPUPRIVATE);
	642	imgp->image_header = (void *)sf_buf_kva(imgp->firstpage);
	643
	644	return 0;
	645	}
	646
	647	void
	648	exec_unmap_first_page(struct image_params *imgp)
	649	{
	650	vm_page_t m;
	651
	652	crit_enter();
	653	if (imgp->firstpage != NULL) {
	654	m = sf_buf_page(imgp->firstpage);
	655	sf_buf_free(imgp->firstpage);
	656	imgp->firstpage = NULL;
	657	imgp->image_header = NULL;
	658	vm_page_unhold(m);
	659	}
	660	crit_exit();
	661	}
	662
	663	/*
	664	* Destroy old address space, and allocate a new stack
	665	* The new stack is only SGROWSIZ large because it is grown
	666	* automatically in trap.c.
	667	*
	668	* This is the point of no return.
	669	*/
	670	int
	671	exec_new_vmspace(struct image_params imgp, struct vmspace vmcopy)
	672	{
	673	struct vmspace *vmspace = imgp->proc->p_vmspace;
	674	vm_offset_t stack_addr = USRSTACK - maxssiz;
	675	struct proc *p;
	676	vm_map_t map;
	677	int error;
	678
	679	/*
	680	* Indicate that we cannot gracefully error out any more, kill
	681	* any other threads present, and set P_INEXEC to indicate that
	682	* we are now messing with the process structure proper.
	683	*
	684	* If killalllwps() races return an error which coupled with
	685	* vmspace_destroyed will cause us to exit. This is what we
	686	* want since another thread is patiently waiting for us to exit
	687	* in that case.
	688	*/
	689	p = curproc;
	690	imgp->vmspace_destroyed = 1;
	691
	692	if (curthread->td_proc->p_nthreads > 1) {
	693	error = killalllwps(1);
	694	if (error)
	695	return (error);
	696	}
	697	imgp->vmspace_destroyed \|= 2; /* we are responsible for P_INEXEC */
	698	p->p_flag \|= P_INEXEC;
	699
	700	/*
	701	* Prevent a pending AIO from modifying the new address space.
	702	*/
	703	aio_proc_rundown(imgp->proc);
	704
	705	/*
	706	* Blow away entire process VM, if address space not shared,
	707	* otherwise, create a new VM space so that other threads are
	708	* not disrupted. If we are execing a resident vmspace we
	709	* create a duplicate of it and remap the stack.
	710	*
	711	* The exitingcnt test is not strictly necessary but has been
	712	* included for code sanity (to make the code more deterministic).
	713	*/
	714	map = &vmspace->vm_map;
	715	if (vmcopy) {
	716	vmspace_exec(imgp->proc, vmcopy);
	717	vmspace = imgp->proc->p_vmspace;
	718	pmap_remove_pages(vmspace_pmap(vmspace), stack_addr, USRSTACK);
	719	map = &vmspace->vm_map;
	720	} else if (vmspace->vm_sysref.refcnt == 1 &&
	721	vmspace->vm_exitingcnt == 0) {
	722	shmexit(vmspace);
	723	if (vmspace->vm_upcalls)
	724	upc_release(vmspace, ONLY_LWP_IN_PROC(imgp->proc));
	725	pmap_remove_pages(vmspace_pmap(vmspace),
	726	0, VM_MAX_USER_ADDRESS);
	727	vm_map_remove(map, 0, VM_MAX_USER_ADDRESS);
	728	} else {
	729	vmspace_exec(imgp->proc, NULL);
	730	vmspace = imgp->proc->p_vmspace;
	731	map = &vmspace->vm_map;
	732	}
	733
	734	/* Allocate a new stack */
	735	error = vm_map_stack(&vmspace->vm_map, stack_addr, (vm_size_t)maxssiz,
	736	0, VM_PROT_ALL, VM_PROT_ALL, 0);
	737	if (error)
	738	return (error);
	739
	740	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
	741	* VM_STACK case, but they are still used to monitor the size of the
	742	* process stack so we can check the stack rlimit.
	743	*/
	744	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
	745	vmspace->vm_maxsaddr = (char *)USRSTACK - maxssiz;
	746
	747	return(0);
	748	}
	749
	750	/*
	751	* Copy out argument and environment strings from the old process
	752	* address space into the temporary string buffer.
	753	*/
	754	int
	755	exec_copyin_args(struct image_args args, char fname,
	756	enum exec_path_segflg segflg, char argv, char envv)
	757	{
	758	char argp, envp;
	759	int error = 0;
	760	size_t length;
	761
	762	args->buf = objcache_get(exec_objcache, M_WAITOK);
	763	if (args->buf == NULL)
	764	return (ENOMEM);
	765	args->begin_argv = args->buf;
	766	args->endp = args->begin_argv;
	767	args->space = ARG_MAX;
	768
	769	args->fname = args->buf + ARG_MAX;
	770
	771	/*
	772	* Copy the file name.
	773	*/
	774	if (segflg == PATH_SYSSPACE) {
	775	error = copystr(fname, args->fname, PATH_MAX, &length);
	776	} else if (segflg == PATH_USERSPACE) {
	777	error = copyinstr(fname, args->fname, PATH_MAX, &length);
	778	}
	779
	780	/*
	781	* Extract argument strings. argv may not be NULL. The argv
	782	* array is terminated by a NULL entry. We special-case the
	783	* situation where argv[0] is NULL by passing { filename, NULL }
	784	* to the new program to guarentee that the interpreter knows what
	785	* file to open in case we exec an interpreted file. Note that
	786	* a NULL argv[0] terminates the argv[] array.
	787	*
	788	* XXX the special-casing of argv[0] is historical and needs to be
	789	* revisited.
	790	*/
	791	if (argv == NULL)
	792	error = EFAULT;
	793	if (error == 0) {
	794	while ((argp = (caddr_t)(intptr_t)fuword(argv++)) != NULL) {
	795	if (argp == (caddr_t)-1) {
	796	error = EFAULT;
	797	break;
	798	}
	799	error = copyinstr(argp, args->endp,
	800	args->space, &length);
	801	if (error) {
	802	if (error == ENAMETOOLONG)
	803	error = E2BIG;
	804	break;
	805	}
	806	args->space -= length;
	807	args->endp += length;
	808	args->argc++;
	809	}
	810	if (args->argc == 0 && error == 0) {
	811	length = strlen(args->fname) + 1;
	812	if (length > args->space) {
	813	error = E2BIG;
	814	} else {
	815	bcopy(args->fname, args->endp, length);
	816	args->space -= length;
	817	args->endp += length;
	818	args->argc++;
	819	}
	820	}
	821	}
	822
	823	args->begin_envv = args->endp;
	824
	825	/*
	826	* extract environment strings. envv may be NULL.
	827	*/
	828	if (envv && error == 0) {
	829	while ((envp = (caddr_t) (intptr_t) fuword(envv++))) {
	830	if (envp == (caddr_t) -1) {
	831	error = EFAULT;
	832	break;
	833	}
	834	error = copyinstr(envp, args->endp, args->space,
	835	&length);
	836	if (error) {
	837	if (error == ENAMETOOLONG)
	838	error = E2BIG;
	839	break;
	840	}
	841	args->space -= length;
	842	args->endp += length;
	843	args->envc++;
	844	}
	845	}
	846	return (error);
	847	}
	848
	849	void
	850	exec_free_args(struct image_args *args)
	851	{
	852	if (args->buf) {
	853	objcache_put(exec_objcache, args->buf);
	854	args->buf = NULL;
	855	}
	856	}
	857
	858	/*
	859	* Copy strings out to the new process address space, constructing
	860	* new arg and env vector tables. Return a pointer to the base
	861	* so that it can be used as the initial stack pointer.
	862	*/
	863	register_t *
	864	exec_copyout_strings(struct image_params *imgp)
	865	{
	866	int argc, envc, sgap;
	867	char **vectp;
	868	char stringp, destp;
	869	register_t *stack_base;
	870	struct ps_strings *arginfo;
	871	int szsigcode;
	872
	873	/*
	874	* Calculate string base and vector table pointers.
	875	* Also deal with signal trampoline code for this exec type.
	876	*/
	877	arginfo = (struct ps_strings *)PS_STRINGS;
	878	szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
	879	if (stackgap_random != 0)
	880	sgap = ALIGN(karc4random() & (stackgap_random - 1));
	881	else
	882	sgap = 0;
	883	destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - sgap -
	884	roundup((ARG_MAX - imgp->args->space), sizeof(char *));
	885
	886	/*
	887	* install sigcode
	888	*/
	889	if (szsigcode)
	890	copyout(imgp->proc->p_sysent->sv_sigcode,
	891	((caddr_t)arginfo - szsigcode), szsigcode);
	892
	893	/*
	894	* If we have a valid auxargs ptr, prepare some room
	895	* on the stack.
	896	*
	897	* The '+ 2' is for the null pointers at the end of each of the
	898	* arg and env vector sets, and 'AT_COUNT*2' is room for the
	899	* ELF Auxargs data.
	900	*/
	901	if (imgp->auxargs) {
	902	vectp = (char **)(destp - (imgp->args->argc +
	903	imgp->args->envc + 2 + AT_COUNT * 2) * sizeof(char*));
	904	} else {
	905	vectp = (char **)(destp - (imgp->args->argc +
	906	imgp->args->envc + 2) * sizeof(char*));
	907	}
	908
	909	/*
	910	* NOTE: don't bother aligning the stack here for GCC 2.x, it will
	911	* be done in crt1.o. Note that GCC 3.x aligns the stack in main.
	912	*/
	913
	914	/*
	915	* vectp also becomes our initial stack base
	916	*/
	917	stack_base = (register_t *)vectp;
	918
	919	stringp = imgp->args->begin_argv;
	920	argc = imgp->args->argc;
	921	envc = imgp->args->envc;
	922
	923	/*
	924	* Copy out strings - arguments and environment.
	925	*/
	926	copyout(stringp, destp, ARG_MAX - imgp->args->space);
	927
	928	/*
	929	* Fill in "ps_strings" struct for ps, w, etc.
	930	*/
	931	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
	932	suword(&arginfo->ps_nargvstr, argc);
	933
	934	/*
	935	* Fill in argument portion of vector table.
	936	*/
	937	for (; argc > 0; --argc) {
	938	suword(vectp++, (long)(intptr_t)destp);
	939	while (*stringp++ != 0)
	940	destp++;
	941	destp++;
	942	}
	943
	944	/* a null vector table pointer separates the argp's from the envp's */
	945	suword(vectp++, 0);
	946
	947	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
	948	suword(&arginfo->ps_nenvstr, envc);
	949
	950	/*
	951	* Fill in environment portion of vector table.
	952	*/
	953	for (; envc > 0; --envc) {
	954	suword(vectp++, (long)(intptr_t)destp);
	955	while (*stringp++ != 0)
	956	destp++;
	957	destp++;
	958	}
	959
	960	/* end of vector table is a null pointer */
	961	suword(vectp, 0);
	962
	963	return (stack_base);
	964	}
	965
	966	/*
	967	* Check permissions of file to execute.
	968	* Return 0 for success or error code on failure.
	969	*/
	970	int
	971	exec_check_permissions(struct image_params *imgp)
	972	{
	973	struct proc *p = imgp->proc;
	974	struct vnode *vp = imgp->vp;
	975	struct vattr *attr = imgp->attr;
	976	int error;
	977
	978	/* Get file attributes */
	979	error = VOP_GETATTR(vp, attr);
	980	if (error)
	981	return (error);
	982
	983	/*
	984	* 1) Check if file execution is disabled for the filesystem that this
	985	* file resides on.
	986	* 2) Insure that at least one execute bit is on - otherwise root
	987	* will always succeed, and we don't want to happen unless the
	988	* file really is executable.
	989	* 3) Insure that the file is a regular file.
	990	*/
	991	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) \|\|
	992	((attr->va_mode & 0111) == 0) \|\|
	993	(attr->va_type != VREG)) {
	994	return (EACCES);
	995	}
	996
	997	/*
	998	* Zero length files can't be exec'd
	999	*/
	1000	if (attr->va_size == 0)
	1001	return (ENOEXEC);
	1002
	1003	/*
	1004	* Check for execute permission to file based on current credentials.
	1005	*/
	1006	error = VOP_EACCESS(vp, VEXEC, p->p_ucred);
	1007	if (error)
	1008	return (error);
	1009
	1010	/*
	1011	* Check number of open-for-writes on the file and deny execution
	1012	* if there are any.
	1013	*/
	1014	if (vp->v_writecount)
	1015	return (ETXTBSY);
	1016
	1017	/*
	1018	* Call filesystem specific open routine, which allows us to read,
	1019	* write, and mmap the file. Without the VOP_OPEN we can only
	1020	* stat the file.
	1021	*/
	1022	error = VOP_OPEN(vp, FREAD, p->p_ucred, NULL);
	1023	if (error)
	1024	return (error);
	1025
	1026	return (0);
	1027	}
	1028
	1029	/*
	1030	* Exec handler registration
	1031	*/
	1032	int
	1033	exec_register(const struct execsw *execsw_arg)
	1034	{
	1035	const struct execsw es, xs, **newexecsw;
	1036	int count = 2; /* New slot and trailing NULL */
	1037
	1038	if (execsw)
	1039	for (es = execsw; *es; es++)
	1040	count++;
	1041	newexecsw = kmalloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	1042	xs = newexecsw;
	1043	if (execsw)
	1044	for (es = execsw; *es; es++)
	1045	xs++ = es;
	1046	*xs++ = execsw_arg;
	1047	*xs = NULL;
	1048	if (execsw)
	1049	kfree(execsw, M_TEMP);
	1050	execsw = newexecsw;
	1051	return 0;
	1052	}
	1053
	1054	int
	1055	exec_unregister(const struct execsw *execsw_arg)
	1056	{
	1057	const struct execsw es, xs, **newexecsw;
	1058	int count = 1;
	1059
	1060	if (execsw == NULL)
	1061	panic("unregister with no handlers left?");
	1062
	1063	for (es = execsw; *es; es++) {
	1064	if (*es == execsw_arg)
	1065	break;
	1066	}
	1067	if (*es == NULL)
	1068	return ENOENT;
	1069	for (es = execsw; *es; es++)
	1070	if (*es != execsw_arg)
	1071	count++;
	1072	newexecsw = kmalloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	1073	xs = newexecsw;
	1074	for (es = execsw; *es; es++)
	1075	if (*es != execsw_arg)
	1076	xs++ = es;
	1077	*xs = NULL;
	1078	if (execsw)
	1079	kfree(execsw, M_TEMP);
	1080	execsw = newexecsw;
	1081	return 0;
	1082	}