gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1993, David Greenman
	3	* All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	* 1. Redistributions of source code must retain the above copyright
	9	* notice, this list of conditions and the following disclaimer.
	10	* 2. Redistributions in binary form must reproduce the above copyright
	11	* notice, this list of conditions and the following disclaimer in the
	12	* documentation and/or other materials provided with the distribution.
	13	*
	14	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
	15	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	17	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
	18	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	19	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	20	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	21	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	22	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	23	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	24	* SUCH DAMAGE.
	25	*
	26	* $FreeBSD: src/sys/kern/kern_exec.c,v 1.107.2.15 2002/07/30 15:40:46 nectar Exp $
	27	* $DragonFly: src/sys/kern/kern_exec.c,v 1.64 2008/10/26 04:29:19 sephe Exp $
	28	*/
	29
	30	#include <sys/param.h>
	31	#include <sys/systm.h>
	32	#include <sys/sysproto.h>
	33	#include <sys/kernel.h>
	34	#include <sys/mount.h>
	35	#include <sys/filedesc.h>
	36	#include <sys/fcntl.h>
	37	#include <sys/acct.h>
	38	#include <sys/exec.h>
	39	#include <sys/imgact.h>
	40	#include <sys/imgact_elf.h>
	41	#include <sys/kern_syscall.h>
	42	#include <sys/wait.h>
	43	#include <sys/malloc.h>
	44	#include <sys/proc.h>
	45	#include <sys/priv.h>
	46	#include <sys/ktrace.h>
	47	#include <sys/signalvar.h>
	48	#include <sys/pioctl.h>
	49	#include <sys/nlookup.h>
	50	#include <sys/sfbuf.h>
	51	#include <sys/sysent.h>
	52	#include <sys/shm.h>
	53	#include <sys/sysctl.h>
	54	#include <sys/vnode.h>
	55	#include <sys/vmmeter.h>
	56	#include <sys/aio.h>
	57	#include <sys/libkern.h>
	58
	59	#include <vm/vm.h>
	60	#include <vm/vm_param.h>
	61	#include <sys/lock.h>
	62	#include <vm/pmap.h>
	63	#include <vm/vm_page.h>
	64	#include <vm/vm_map.h>
	65	#include <vm/vm_kern.h>
	66	#include <vm/vm_extern.h>
	67	#include <vm/vm_object.h>
	68	#include <vm/vnode_pager.h>
	69	#include <vm/vm_pager.h>
	70
	71	#include <sys/user.h>
	72	#include <sys/reg.h>
	73
	74	#include <sys/thread2.h>
	75
	76	MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
	77	MALLOC_DEFINE(M_EXECARGS, "exec-args", "Exec arguments");
	78
	79	static register_t exec_copyout_strings (struct image_params );
	80
	81	/* XXX This should be vm_size_t. */
	82	static u_long ps_strings = PS_STRINGS;
	83	SYSCTL_ULONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings, 0, "");
	84
	85	/* XXX This should be vm_size_t. */
	86	static u_long usrstack = USRSTACK;
	87	SYSCTL_ULONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, 0, "");
	88
	89	u_long ps_arg_cache_limit = PAGE_SIZE / 16;
	90	SYSCTL_LONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
	91	&ps_arg_cache_limit, 0, "");
	92
	93	int ps_argsopen = 1;
	94	SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, "");
	95
	96	void print_execve_args(struct image_args *args);
	97	int debug_execve_args = 0;
	98	SYSCTL_INT(_kern, OID_AUTO, debug_execve_args, CTLFLAG_RW, &debug_execve_args,
	99	0, "");
	100
	101	/*
	102	* Exec arguments object cache
	103	*/
	104	static struct objcache *exec_objcache;
	105
	106	static
	107	void
	108	exec_objcache_init(void *arg __unused)
	109	{
	110	int cluster_limit;
	111
	112	cluster_limit = 16; /* up to this many objects */
	113	exec_objcache = objcache_create_mbacked(
	114	M_EXECARGS, PATH_MAX + ARG_MAX,
	115	&cluster_limit,
	116	2, /* minimal magazine capacity */
	117	NULL, NULL, NULL);
	118	}
	119	SYSINIT(exec_objcache, SI_BOOT2_MACHDEP, SI_ORDER_ANY, exec_objcache_init, 0);
	120
	121	/*
	122	* stackgap_random specifies if the stackgap should have a random size added
	123	* to it. It must be a power of 2. If non-zero, the stack gap will be
	124	* calculated as: ALIGN(karc4random() & (stackgap_random - 1)).
	125	*/
	126	static int stackgap_random = 1024;
	127	static int
	128	sysctl_kern_stackgap(SYSCTL_HANDLER_ARGS)
	129	{
	130	int error, new_val;
	131	new_val = stackgap_random;
	132	error = sysctl_handle_int(oidp, &new_val, 0, req);
	133	if (error != 0 \|\| req->newptr == NULL)
	134	return (error);
	135	if ((new_val < 0) \|\| (new_val > 16 * PAGE_SIZE) \|\| ! powerof2(new_val))
	136	return (EINVAL);
	137	stackgap_random = new_val;
	138
	139	return(0);
	140	}
	141
	142	SYSCTL_PROC(_kern, OID_AUTO, stackgap_random, CTLFLAG_RW\|CTLTYPE_UINT,
	143	0, 0, sysctl_kern_stackgap, "IU", "Max random stack gap (power of 2)");
	144
	145	void
	146	print_execve_args(struct image_args *args)
	147	{
	148	char *cp;
	149	int ndx;
	150
	151	cp = args->begin_argv;
	152	for (ndx = 0; ndx < args->argc; ndx++) {
	153	kprintf("\targv[%d]: %s\n", ndx, cp);
	154	while (*cp++ != '\0');
	155	}
	156	for (ndx = 0; ndx < args->envc; ndx++) {
	157	kprintf("\tenvv[%d]: %s\n", ndx, cp);
	158	while (*cp++ != '\0');
	159	}
	160	}
	161
	162	/*
	163	* NULL-terminated table of registered image activators.  Each item is a
	164	* pointer to a `const struct execsw', hence the double pointer here.
	165	*/
	166	static const struct execsw **execsw;
	167
	168	/*
	169	* Replace current vmspace with a new binary.
	170	* Returns 0 on success, > 0 on recoverable error (use as errno).
	171	* Returns -1 on lethal error which demands killing of the current
	172	* process!
	173	*/
	174	int
	175	kern_execve(struct nlookupdata nd, struct image_args args)
	176	{
	177	struct thread *td = curthread;
	178	struct lwp *lp = td->td_lwp;
	179	struct proc *p = td->td_proc;
	180	register_t *stack_base;
	181	int error, len, i;
	182	struct image_params image_params, *imgp;
	183	struct vattr attr;
	184	int (img_first) (struct image_params );
	185
	186	if (debug_execve_args) {
	187	kprintf("%s()\n", __func__);
	188	print_execve_args(args);
	189	}
	190
	191	KKASSERT(p);
	192	imgp = &image_params;
	193
	194	/*
	195	* NOTE: P_INEXEC is handled by exec_new_vmspace() now. We make
	196	* no modifications to the process at all until we get there.
	197	*
	198	* Note that multiple threads may be trying to exec at the same
	199	* time. exec_new_vmspace() handles that too.
	200	*/
	201
	202	/*
	203	* Initialize part of the common data
	204	*/
	205	imgp->proc = p;
	206	imgp->args = args;
	207	imgp->attr = &attr;
	208	imgp->entry_addr = 0;
	209	imgp->resident = 0;
	210	imgp->vmspace_destroyed = 0;
	211	imgp->interpreted = 0;
	212	imgp->interpreter_name[0] = 0;
	213	imgp->auxargs = NULL;
	214	imgp->vp = NULL;
	215	imgp->firstpage = NULL;
	216	imgp->ps_strings = 0;
	217	imgp->image_header = NULL;
	218
	219	interpret:
	220
	221	/*
	222	* Translate the file name to a vnode. Unlock the cache entry to
	223	* improve parallelism for programs exec'd in parallel.
	224	*/
	225	if ((error = nlookup(nd)) != 0)
	226	goto exec_fail;
	227	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &imgp->vp);
	228	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
	229	nd->nl_flags &= ~NLC_NCPISLOCKED;
	230	cache_unlock(&nd->nl_nch);
	231	if (error)
	232	goto exec_fail;
	233
	234	/*
	235	* Check file permissions (also 'opens' file)
	236	*/
	237	error = exec_check_permissions(imgp);
	238	if (error) {
	239	vn_unlock(imgp->vp);
	240	goto exec_fail_dealloc;
	241	}
	242
	243	error = exec_map_first_page(imgp);
	244	vn_unlock(imgp->vp);
	245	if (error)
	246	goto exec_fail_dealloc;
	247
	248	if (debug_execve_args && imgp->interpreted) {
	249	kprintf(" target is interpreted -- recursive pass\n");
	250	kprintf(" interpreter: %s\n", imgp->interpreter_name);
	251	print_execve_args(args);
	252	}
	253
	254	/*
	255	* If the current process has a special image activator it
	256	* wants to try first, call it. For example, emulating shell
	257	* scripts differently.
	258	*/
	259	error = -1;
	260	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
	261	error = img_first(imgp);
	262
	263	/*
	264	* If the vnode has a registered vmspace, exec the vmspace
	265	*/
	266	if (error == -1 && imgp->vp->v_resident) {
	267	error = exec_resident_imgact(imgp);
	268	}
	269
	270	/*
	271	* Loop through the list of image activators, calling each one.
	272	* An activator returns -1 if there is no match, 0 on success,
	273	* and an error otherwise.
	274	*/
	275	for (i = 0; error == -1 && execsw[i]; ++i) {
	276	if (execsw[i]->ex_imgact == NULL \|\|
	277	execsw[i]->ex_imgact == img_first) {
	278	continue;
	279	}
	280	error = (*execsw[i]->ex_imgact)(imgp);
	281	}
	282
	283	if (error) {
	284	if (error == -1)
	285	error = ENOEXEC;
	286	goto exec_fail_dealloc;
	287	}
	288
	289	/*
	290	* Special interpreter operation, cleanup and loop up to try to
	291	* activate the interpreter.
	292	*/
	293	if (imgp->interpreted) {
	294	exec_unmap_first_page(imgp);
	295	nlookup_done(nd);
	296	vrele(imgp->vp);
	297	imgp->vp = NULL;
	298	error = nlookup_init(nd, imgp->interpreter_name, UIO_SYSSPACE,
	299	NLC_FOLLOW);
	300	if (error)
	301	goto exec_fail;
	302	goto interpret;
	303	}
	304
	305	/*
	306	* Copy out strings (args and env) and initialize stack base
	307	*/
	308	stack_base = exec_copyout_strings(imgp);
	309	p->p_vmspace->vm_minsaddr = (char *)stack_base;
	310
	311	/*
	312	* If custom stack fixup routine present for this process
	313	* let it do the stack setup. If we are running a resident
	314	* image there is no auxinfo or other image activator context
	315	* so don't try to add fixups to the stack.
	316	*
	317	* Else stuff argument count as first item on stack
	318	*/
	319	if (p->p_sysent->sv_fixup && imgp->resident == 0)
	320	(*p->p_sysent->sv_fixup)(&stack_base, imgp);
	321	else
	322	suword(--stack_base, imgp->args->argc);
	323
	324	/*
	325	* For security and other reasons, the file descriptor table cannot
	326	* be shared after an exec.
	327	*/
	328	if (p->p_fd->fd_refcnt > 1) {
	329	struct filedesc *tmp;
	330
	331	tmp = fdcopy(p);
	332	fdfree(p);
	333	p->p_fd = tmp;
	334	}
	335
	336	/*
	337	* For security and other reasons, signal handlers cannot
	338	* be shared after an exec. The new proces gets a copy of the old
	339	* handlers. In execsigs(), the new process will have its signals
	340	* reset.
	341	*/
	342	if (p->p_sigacts->ps_refcnt > 1) {
	343	struct sigacts *newsigacts;
	344
	345	newsigacts = (struct sigacts )kmalloc(sizeof(newsigacts),
	346	M_SUBPROC, M_WAITOK);
	347	bcopy(p->p_sigacts, newsigacts, sizeof(*newsigacts));
	348	p->p_sigacts->ps_refcnt--;
	349	p->p_sigacts = newsigacts;
	350	p->p_sigacts->ps_refcnt = 1;
	351	}
	352
	353	/*
	354	* For security and other reasons virtual kernels cannot be
	355	* inherited by an exec. This also allows a virtual kernel
	356	* to fork/exec unrelated applications.
	357	*/
	358	if (p->p_vkernel)
	359	vkernel_exit(p);
	360
	361	/* Stop profiling */
	362	stopprofclock(p);
	363
	364	/* close files on exec */
	365	fdcloseexec(p);
	366
	367	/* reset caught signals */
	368	execsigs(p);
	369
	370	/* name this process - nameiexec(p, ndp) */
	371	len = min(nd->nl_nch.ncp->nc_nlen, MAXCOMLEN);
	372	bcopy(nd->nl_nch.ncp->nc_name, p->p_comm, len);
	373	p->p_comm[len] = 0;
	374	bcopy(p->p_comm, lp->lwp_thread->td_comm, MAXCOMLEN+1);
	375
	376	/*
	377	* mark as execed, wakeup the process that vforked (if any) and tell
	378	* it that it now has its own resources back
	379	*/
	380	p->p_flag \|= P_EXEC;
	381	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
	382	p->p_flag &= ~P_PPWAIT;
	383	wakeup((caddr_t)p->p_pptr);
	384	}
	385
	386	/*
	387	* Implement image setuid/setgid.
	388	*
	389	* Don't honor setuid/setgid if the filesystem prohibits it or if
	390	* the process is being traced.
	391	*/
	392	if ((((attr.va_mode & VSUID) && p->p_ucred->cr_uid != attr.va_uid) \|\|
	393	((attr.va_mode & VSGID) && p->p_ucred->cr_gid != attr.va_gid)) &&
	394	(imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
	395	(p->p_flag & P_TRACED) == 0) {
	396	/*
	397	* Turn off syscall tracing for set-id programs, except for
	398	* root. Record any set-id flags first to make sure that
	399	* we do not regain any tracing during a possible block.
	400	*/
	401	setsugid();
	402	if (p->p_tracenode && priv_check(td, PRIV_ROOT) != 0) {
	403	ktrdestroy(&p->p_tracenode);
	404	p->p_traceflag = 0;
	405	}
	406	/* Close any file descriptors 0..2 that reference procfs */
	407	setugidsafety(p);
	408	/* Make sure file descriptors 0..2 are in use. */
	409	error = fdcheckstd(p);
	410	if (error != 0)
	411	goto exec_fail_dealloc;
	412	/*
	413	* Set the new credentials.
	414	*/
	415	cratom(&p->p_ucred);
	416	if (attr.va_mode & VSUID)
	417	change_euid(attr.va_uid);
	418	if (attr.va_mode & VSGID)
	419	p->p_ucred->cr_gid = attr.va_gid;
	420
	421	/*
	422	* Clear local varsym variables
	423	*/
	424	varsymset_clean(&p->p_varsymset);
	425	} else {
	426	if (p->p_ucred->cr_uid == p->p_ucred->cr_ruid &&
	427	p->p_ucred->cr_gid == p->p_ucred->cr_rgid)
	428	p->p_flag &= ~P_SUGID;
	429	}
	430
	431	/*
	432	* Implement correct POSIX saved-id behavior.
	433	*/
	434	if (p->p_ucred->cr_svuid != p->p_ucred->cr_uid \|\|
	435	p->p_ucred->cr_svgid != p->p_ucred->cr_gid) {
	436	cratom(&p->p_ucred);
	437	p->p_ucred->cr_svuid = p->p_ucred->cr_uid;
	438	p->p_ucred->cr_svgid = p->p_ucred->cr_gid;
	439	}
	440
	441	/*
	442	* Store the vp for use in procfs
	443	*/
	444	if (p->p_textvp) /* release old reference */
	445	vrele(p->p_textvp);
	446	p->p_textvp = imgp->vp;
	447	vref(p->p_textvp);
	448
	449	/*
	450	* Notify others that we exec'd, and clear the P_INEXEC flag
	451	* as we're now a bona fide freshly-execed process.
	452	*/
	453	KNOTE(&p->p_klist, NOTE_EXEC);
	454	p->p_flag &= ~P_INEXEC;
	455
	456	/*
	457	* If tracing the process, trap to debugger so breakpoints
	458	* can be set before the program executes.
	459	*/
	460	STOPEVENT(p, S_EXEC, 0);
	461
	462	if (p->p_flag & P_TRACED)
	463	ksignal(p, SIGTRAP);
	464
	465	/* clear "fork but no exec" flag, as we _are_ execing */
	466	p->p_acflag &= ~AFORK;
	467
	468	/* Set values passed into the program in registers. */
	469	exec_setregs(imgp->entry_addr, (u_long)(uintptr_t)stack_base,
	470	imgp->ps_strings);
	471
	472	/* Set the access time on the vnode */
	473	vn_mark_atime(imgp->vp, td);
	474
	475	/* Free any previous argument cache */
	476	if (p->p_args && --p->p_args->ar_ref == 0)
	477	FREE(p->p_args, M_PARGS);
	478	p->p_args = NULL;
	479
	480	/* Cache arguments if they fit inside our allowance */
	481	i = imgp->args->begin_envv - imgp->args->begin_argv;
	482	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
	483	MALLOC(p->p_args, struct pargs *, sizeof(struct pargs) + i,
	484	M_PARGS, M_WAITOK);
	485	p->p_args->ar_ref = 1;
	486	p->p_args->ar_length = i;
	487	bcopy(imgp->args->begin_argv, p->p_args->ar_args, i);
	488	}
	489
	490	exec_fail_dealloc:
	491
	492	/*
	493	* free various allocated resources
	494	*/
	495	if (imgp->firstpage)
	496	exec_unmap_first_page(imgp);
	497
	498	if (imgp->vp) {
	499	vrele(imgp->vp);
	500	imgp->vp = NULL;
	501	}
	502
	503	if (error == 0) {
	504	++mycpu->gd_cnt.v_exec;
	505	return (0);
	506	}
	507
	508	exec_fail:
	509	/*
	510	* we're done here, clear P_INEXEC if we were the ones that
	511	* set it. Otherwise if vmspace_destroyed is still set we
	512	* raced another thread and that thread is responsible for
	513	* clearing it.
	514	*/
	515	if (imgp->vmspace_destroyed & 2)
	516	p->p_flag &= ~P_INEXEC;
	517	if (imgp->vmspace_destroyed) {
	518	/*
	519	* Sorry, no more process anymore. exit gracefully.
	520	* However we can't die right here, because our
	521	* caller might have to clean up, so indicate a
	522	* lethal error by returning -1.
	523	*/
	524	return(-1);
	525	} else {
	526	return(error);
	527	}
	528	}
	529
	530	/*
	531	* execve() system call.
	532	*/
	533	int
	534	sys_execve(struct execve_args *uap)
	535	{
	536	struct nlookupdata nd;
	537	struct image_args args;
	538	int error;
	539
	540	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
	541	if (error == 0) {
	542	error = exec_copyin_args(&args, uap->fname, PATH_USERSPACE,
	543	uap->argv, uap->envv);
	544	}
	545	if (error == 0)
	546	error = kern_execve(&nd, &args);
	547	nlookup_done(&nd);
	548	exec_free_args(&args);
	549
	550	if (error < 0) {
	551	/* We hit a lethal error condition. Let's die now. */
	552	exit1(W_EXITCODE(0, SIGABRT));
	553	/* NOTREACHED */
	554	}
	555
	556	/*
	557	* The syscall result is returned in registers to the new program.
	558	* Linux will register %edx as an atexit function and we must be
	559	* sure to set it to 0. XXX
	560	*/
	561	if (error == 0)
	562	uap->sysmsg_result64 = 0;
	563
	564	return (error);
	565	}
	566
	567	int
	568	exec_map_first_page(struct image_params *imgp)
	569	{
	570	int rv, i;
	571	int initial_pagein;
	572	vm_page_t ma[VM_INITIAL_PAGEIN];
	573	vm_page_t m;
	574	vm_object_t object;
	575
	576	if (imgp->firstpage)
	577	exec_unmap_first_page(imgp);
	578
	579	/*
	580	* The file has to be mappable.
	581	*/
	582	if ((object = imgp->vp->v_object) == NULL)
	583	return (EIO);
	584
	585	/*
	586	* We shouldn't need protection for vm_page_grab() but we certainly
	587	* need it for the lookup loop below (lookup/busy race), since
	588	* an interrupt can unbusy and free the page before our busy check.
	589	*/
	590	crit_enter();
	591	m = vm_page_grab(object, 0, VM_ALLOC_NORMAL \| VM_ALLOC_RETRY);
	592
	593	if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
	594	ma[0] = m;
	595	initial_pagein = VM_INITIAL_PAGEIN;
	596	if (initial_pagein > object->size)
	597	initial_pagein = object->size;
	598	for (i = 1; i < initial_pagein; i++) {
	599	if ((m = vm_page_lookup(object, i)) != NULL) {
	600	if ((m->flags & PG_BUSY) \|\| m->busy)
	601	break;
	602	if (m->valid)
	603	break;
	604	vm_page_busy(m);
	605	} else {
	606	m = vm_page_alloc(object, i, VM_ALLOC_NORMAL);
	607	if (m == NULL)
	608	break;
	609	}
	610	ma[i] = m;
	611	}
	612	initial_pagein = i;
	613
	614	/*
	615	* get_pages unbusies all the requested pages except the
	616	* primary page (at index 0 in this case). The primary
	617	* page may have been wired during the pagein (e.g. by
	618	* the buffer cache) so vnode_pager_freepage() must be
	619	* used to properly release it.
	620	*/
	621	rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
	622	m = vm_page_lookup(object, 0);
	623
	624	if (rv != VM_PAGER_OK \|\| m == NULL \|\| m->valid == 0) {
	625	if (m) {
	626	vm_page_protect(m, VM_PROT_NONE);
	627	vnode_pager_freepage(m);
	628	}
	629	crit_exit();
	630	return EIO;
	631	}
	632	}
	633	vm_page_hold(m);
	634	vm_page_wakeup(m); /* unbusy the page */
	635	crit_exit();
	636
	637	imgp->firstpage = sf_buf_alloc(m, SFB_CPUPRIVATE);
	638	imgp->image_header = (void *)sf_buf_kva(imgp->firstpage);
	639
	640	return 0;
	641	}
	642
	643	void
	644	exec_unmap_first_page(struct image_params *imgp)
	645	{
	646	vm_page_t m;
	647
	648	crit_enter();
	649	if (imgp->firstpage != NULL) {
	650	m = sf_buf_page(imgp->firstpage);
	651	sf_buf_free(imgp->firstpage);
	652	imgp->firstpage = NULL;
	653	imgp->image_header = NULL;
	654	vm_page_unhold(m);
	655	}
	656	crit_exit();
	657	}
	658
	659	/*
	660	* Destroy old address space, and allocate a new stack
	661	* The new stack is only SGROWSIZ large because it is grown
	662	* automatically in trap.c.
	663	*
	664	* This is the point of no return.
	665	*/
	666	int
	667	exec_new_vmspace(struct image_params imgp, struct vmspace vmcopy)
	668	{
	669	struct vmspace *vmspace = imgp->proc->p_vmspace;
	670	vm_offset_t stack_addr = USRSTACK - maxssiz;
	671	struct proc *p;
	672	vm_map_t map;
	673	int error;
	674
	675	/*
	676	* Indicate that we cannot gracefully error out any more, kill
	677	* any other threads present, and set P_INEXEC to indicate that
	678	* we are now messing with the process structure proper.
	679	*
	680	* If killalllwps() races return an error which coupled with
	681	* vmspace_destroyed will cause us to exit. This is what we
	682	* want since another thread is patiently waiting for us to exit
	683	* in that case.
	684	*/
	685	p = curproc;
	686	imgp->vmspace_destroyed = 1;
	687
	688	if (curthread->td_proc->p_nthreads > 1) {
	689	error = killalllwps(1);
	690	if (error)
	691	return (error);
	692	}
	693	imgp->vmspace_destroyed \|= 2; /* we are responsible for P_INEXEC */
	694	p->p_flag \|= P_INEXEC;
	695
	696	/*
	697	* Prevent a pending AIO from modifying the new address space.
	698	*/
	699	aio_proc_rundown(imgp->proc);
	700
	701	/*
	702	* Blow away entire process VM, if address space not shared,
	703	* otherwise, create a new VM space so that other threads are
	704	* not disrupted. If we are execing a resident vmspace we
	705	* create a duplicate of it and remap the stack.
	706	*
	707	* The exitingcnt test is not strictly necessary but has been
	708	* included for code sanity (to make the code more deterministic).
	709	*/
	710	map = &vmspace->vm_map;
	711	if (vmcopy) {
	712	vmspace_exec(imgp->proc, vmcopy);
	713	vmspace = imgp->proc->p_vmspace;
	714	pmap_remove_pages(vmspace_pmap(vmspace), stack_addr, USRSTACK);
	715	map = &vmspace->vm_map;
	716	} else if (vmspace->vm_sysref.refcnt == 1 &&
	717	vmspace->vm_exitingcnt == 0) {
	718	shmexit(vmspace);
	719	if (vmspace->vm_upcalls)
	720	upc_release(vmspace, ONLY_LWP_IN_PROC(imgp->proc));
	721	pmap_remove_pages(vmspace_pmap(vmspace),
	722	0, VM_MAX_USER_ADDRESS);
	723	vm_map_remove(map, 0, VM_MAX_USER_ADDRESS);
	724	} else {
	725	vmspace_exec(imgp->proc, NULL);
	726	vmspace = imgp->proc->p_vmspace;
	727	map = &vmspace->vm_map;
	728	}
	729
	730	/* Allocate a new stack */
	731	error = vm_map_stack(&vmspace->vm_map, stack_addr, (vm_size_t)maxssiz,
	732	0, VM_PROT_ALL, VM_PROT_ALL, 0);
	733	if (error)
	734	return (error);
	735
	736	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
	737	* VM_STACK case, but they are still used to monitor the size of the
	738	* process stack so we can check the stack rlimit.
	739	*/
	740	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
	741	vmspace->vm_maxsaddr = (char *)USRSTACK - maxssiz;
	742
	743	return(0);
	744	}
	745
	746	/*
	747	* Copy out argument and environment strings from the old process
	748	* address space into the temporary string buffer.
	749	*/
	750	int
	751	exec_copyin_args(struct image_args args, char fname,
	752	enum exec_path_segflg segflg, char argv, char envv)
	753	{
	754	char argp, envp;
	755	int error = 0;
	756	size_t length;
	757
	758	bzero(args, sizeof(*args));
	759
	760	args->buf = objcache_get(exec_objcache, M_WAITOK);
	761	if (args->buf == NULL)
	762	return (ENOMEM);
	763	args->begin_argv = args->buf;
	764	args->endp = args->begin_argv;
	765	args->space = ARG_MAX;
	766
	767	args->fname = args->buf + ARG_MAX;
	768
	769	/*
	770	* Copy the file name.
	771	*/
	772	if (segflg == PATH_SYSSPACE) {
	773	error = copystr(fname, args->fname, PATH_MAX, &length);
	774	} else if (segflg == PATH_USERSPACE) {
	775	error = copyinstr(fname, args->fname, PATH_MAX, &length);
	776	}
	777
	778	/*
	779	* Extract argument strings. argv may not be NULL. The argv
	780	* array is terminated by a NULL entry. We special-case the
	781	* situation where argv[0] is NULL by passing { filename, NULL }
	782	* to the new program to guarentee that the interpreter knows what
	783	* file to open in case we exec an interpreted file. Note that
	784	* a NULL argv[0] terminates the argv[] array.
	785	*
	786	* XXX the special-casing of argv[0] is historical and needs to be
	787	* revisited.
	788	*/
	789	if (argv == NULL)
	790	error = EFAULT;
	791	if (error == 0) {
	792	while ((argp = (caddr_t)(intptr_t)fuword(argv++)) != NULL) {
	793	if (argp == (caddr_t)-1) {
	794	error = EFAULT;
	795	break;
	796	}
	797	error = copyinstr(argp, args->endp,
	798	args->space, &length);
	799	if (error) {
	800	if (error == ENAMETOOLONG)
	801	error = E2BIG;
	802	break;
	803	}
	804	args->space -= length;
	805	args->endp += length;
	806	args->argc++;
	807	}
	808	if (args->argc == 0 && error == 0) {
	809	length = strlen(args->fname) + 1;
	810	if (length > args->space) {
	811	error = E2BIG;
	812	} else {
	813	bcopy(args->fname, args->endp, length);
	814	args->space -= length;
	815	args->endp += length;
	816	args->argc++;
	817	}
	818	}
	819	}
	820
	821	args->begin_envv = args->endp;
	822
	823	/*
	824	* extract environment strings. envv may be NULL.
	825	*/
	826	if (envv && error == 0) {
	827	while ((envp = (caddr_t) (intptr_t) fuword(envv++))) {
	828	if (envp == (caddr_t) -1) {
	829	error = EFAULT;
	830	break;
	831	}
	832	error = copyinstr(envp, args->endp, args->space,
	833	&length);
	834	if (error) {
	835	if (error == ENAMETOOLONG)
	836	error = E2BIG;
	837	break;
	838	}
	839	args->space -= length;
	840	args->endp += length;
	841	args->envc++;
	842	}
	843	}
	844	return (error);
	845	}
	846
	847	void
	848	exec_free_args(struct image_args *args)
	849	{
	850	if (args->buf) {
	851	objcache_put(exec_objcache, args->buf);
	852	args->buf = NULL;
	853	}
	854	}
	855
	856	/*
	857	* Copy strings out to the new process address space, constructing
	858	* new arg and env vector tables. Return a pointer to the base
	859	* so that it can be used as the initial stack pointer.
	860	*/
	861	register_t *
	862	exec_copyout_strings(struct image_params *imgp)
	863	{
	864	int argc, envc, sgap;
	865	char **vectp;
	866	char stringp, destp;
	867	register_t *stack_base;
	868	struct ps_strings *arginfo;
	869	int szsigcode;
	870
	871	/*
	872	* Calculate string base and vector table pointers.
	873	* Also deal with signal trampoline code for this exec type.
	874	*/
	875	arginfo = (struct ps_strings *)PS_STRINGS;
	876	szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
	877	if (stackgap_random != 0)
	878	sgap = ALIGN(karc4random() & (stackgap_random - 1));
	879	else
	880	sgap = 0;
	881	destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - sgap -
	882	roundup((ARG_MAX - imgp->args->space), sizeof(char *));
	883
	884	/*
	885	* install sigcode
	886	*/
	887	if (szsigcode)
	888	copyout(imgp->proc->p_sysent->sv_sigcode,
	889	((caddr_t)arginfo - szsigcode), szsigcode);
	890
	891	/*
	892	* If we have a valid auxargs ptr, prepare some room
	893	* on the stack.
	894	*
	895	* The '+ 2' is for the null pointers at the end of each of the
	896	* arg and env vector sets, and 'AT_COUNT*2' is room for the
	897	* ELF Auxargs data.
	898	*/
	899	if (imgp->auxargs) {
	900	vectp = (char **)(destp - (imgp->args->argc +
	901	imgp->args->envc + 2 + AT_COUNT * 2) * sizeof(char*));
	902	} else {
	903	vectp = (char **)(destp - (imgp->args->argc +
	904	imgp->args->envc + 2) * sizeof(char*));
	905	}
	906
	907	/*
	908	* NOTE: don't bother aligning the stack here for GCC 2.x, it will
	909	* be done in crt1.o. Note that GCC 3.x aligns the stack in main.
	910	*/
	911
	912	/*
	913	* vectp also becomes our initial stack base
	914	*/
	915	stack_base = (register_t *)vectp;
	916
	917	stringp = imgp->args->begin_argv;
	918	argc = imgp->args->argc;
	919	envc = imgp->args->envc;
	920
	921	/*
	922	* Copy out strings - arguments and environment.
	923	*/
	924	copyout(stringp, destp, ARG_MAX - imgp->args->space);
	925
	926	/*
	927	* Fill in "ps_strings" struct for ps, w, etc.
	928	*/
	929	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
	930	suword(&arginfo->ps_nargvstr, argc);
	931
	932	/*
	933	* Fill in argument portion of vector table.
	934	*/
	935	for (; argc > 0; --argc) {
	936	suword(vectp++, (long)(intptr_t)destp);
	937	while (*stringp++ != 0)
	938	destp++;
	939	destp++;
	940	}
	941
	942	/* a null vector table pointer separates the argp's from the envp's */
	943	suword(vectp++, 0);
	944
	945	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
	946	suword(&arginfo->ps_nenvstr, envc);
	947
	948	/*
	949	* Fill in environment portion of vector table.
	950	*/
	951	for (; envc > 0; --envc) {
	952	suword(vectp++, (long)(intptr_t)destp);
	953	while (*stringp++ != 0)
	954	destp++;
	955	destp++;
	956	}
	957
	958	/* end of vector table is a null pointer */
	959	suword(vectp, 0);
	960
	961	return (stack_base);
	962	}
	963
	964	/*
	965	* Check permissions of file to execute.
	966	* Return 0 for success or error code on failure.
	967	*/
	968	int
	969	exec_check_permissions(struct image_params *imgp)
	970	{
	971	struct proc *p = imgp->proc;
	972	struct vnode *vp = imgp->vp;
	973	struct vattr *attr = imgp->attr;
	974	int error;
	975
	976	/* Get file attributes */
	977	error = VOP_GETATTR(vp, attr);
	978	if (error)
	979	return (error);
	980
	981	/*
	982	* 1) Check if file execution is disabled for the filesystem that this
	983	* file resides on.
	984	* 2) Insure that at least one execute bit is on - otherwise root
	985	* will always succeed, and we don't want to happen unless the
	986	* file really is executable.
	987	* 3) Insure that the file is a regular file.
	988	*/
	989	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) \|\|
	990	((attr->va_mode & 0111) == 0) \|\|
	991	(attr->va_type != VREG)) {
	992	return (EACCES);
	993	}
	994
	995	/*
	996	* Zero length files can't be exec'd
	997	*/
	998	if (attr->va_size == 0)
	999	return (ENOEXEC);
	1000
	1001	/*
	1002	* Check for execute permission to file based on current credentials.
	1003	*/
	1004	error = VOP_ACCESS(vp, VEXEC, p->p_ucred);
	1005	if (error)
	1006	return (error);
	1007
	1008	/*
	1009	* Check number of open-for-writes on the file and deny execution
	1010	* if there are any.
	1011	*/
	1012	if (vp->v_writecount)
	1013	return (ETXTBSY);
	1014
	1015	/*
	1016	* Call filesystem specific open routine, which allows us to read,
	1017	* write, and mmap the file. Without the VOP_OPEN we can only
	1018	* stat the file.
	1019	*/
	1020	error = VOP_OPEN(vp, FREAD, p->p_ucred, NULL);
	1021	if (error)
	1022	return (error);
	1023
	1024	return (0);
	1025	}
	1026
	1027	/*
	1028	* Exec handler registration
	1029	*/
	1030	int
	1031	exec_register(const struct execsw *execsw_arg)
	1032	{
	1033	const struct execsw es, xs, **newexecsw;
	1034	int count = 2; /* New slot and trailing NULL */
	1035
	1036	if (execsw)
	1037	for (es = execsw; *es; es++)
	1038	count++;
	1039	newexecsw = kmalloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	1040	xs = newexecsw;
	1041	if (execsw)
	1042	for (es = execsw; *es; es++)
	1043	xs++ = es;
	1044	*xs++ = execsw_arg;
	1045	*xs = NULL;
	1046	if (execsw)
	1047	kfree(execsw, M_TEMP);
	1048	execsw = newexecsw;
	1049	return 0;
	1050	}
	1051
	1052	int
	1053	exec_unregister(const struct execsw *execsw_arg)
	1054	{
	1055	const struct execsw es, xs, **newexecsw;
	1056	int count = 1;
	1057
	1058	if (execsw == NULL)
	1059	panic("unregister with no handlers left?");
	1060
	1061	for (es = execsw; *es; es++) {
	1062	if (*es == execsw_arg)
	1063	break;
	1064	}
	1065	if (*es == NULL)
	1066	return ENOENT;
	1067	for (es = execsw; *es; es++)
	1068	if (*es != execsw_arg)
	1069	count++;
	1070	newexecsw = kmalloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	1071	xs = newexecsw;
	1072	for (es = execsw; *es; es++)
	1073	if (*es != execsw_arg)
	1074	xs++ = es;
	1075	*xs = NULL;
	1076	if (execsw)
	1077	kfree(execsw, M_TEMP);
	1078	execsw = newexecsw;
	1079	return 0;
	1080	}