gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1993, David Greenman
	3	* All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	* 1. Redistributions of source code must retain the above copyright
	9	* notice, this list of conditions and the following disclaimer.
	10	* 2. Redistributions in binary form must reproduce the above copyright
	11	* notice, this list of conditions and the following disclaimer in the
	12	* documentation and/or other materials provided with the distribution.
	13	*
	14	* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
	15	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	17	* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
	18	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	19	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	20	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	21	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	22	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	23	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	24	* SUCH DAMAGE.
	25	*
	26	* $FreeBSD: src/sys/kern/kern_exec.c,v 1.107.2.15 2002/07/30 15:40:46 nectar Exp $
	27	*/
	28
	29	#include <sys/param.h>
	30	#include <sys/systm.h>
	31	#include <sys/sysproto.h>
	32	#include <sys/kernel.h>
	33	#include <sys/mount.h>
	34	#include <sys/filedesc.h>
	35	#include <sys/fcntl.h>
	36	#include <sys/acct.h>
	37	#include <sys/exec.h>
	38	#include <sys/imgact.h>
	39	#include <sys/imgact_elf.h>
	40	#include <sys/kern_syscall.h>
	41	#include <sys/wait.h>
	42	#include <sys/malloc.h>
	43	#include <sys/proc.h>
	44	#include <sys/priv.h>
	45	#include <sys/ktrace.h>
	46	#include <sys/signalvar.h>
	47	#include <sys/pioctl.h>
	48	#include <sys/nlookup.h>
	49	#include <sys/sysent.h>
	50	#include <sys/shm.h>
	51	#include <sys/sysctl.h>
	52	#include <sys/vnode.h>
	53	#include <sys/vmmeter.h>
	54	#include <sys/libkern.h>
	55
	56	#include <cpu/lwbuf.h>
	57
	58	#include <vm/vm.h>
	59	#include <vm/vm_param.h>
	60	#include <sys/lock.h>
	61	#include <vm/pmap.h>
	62	#include <vm/vm_page.h>
	63	#include <vm/vm_map.h>
	64	#include <vm/vm_kern.h>
	65	#include <vm/vm_extern.h>
	66	#include <vm/vm_object.h>
	67	#include <vm/vnode_pager.h>
	68	#include <vm/vm_pager.h>
	69
	70	#include <sys/user.h>
	71	#include <sys/reg.h>
	72
	73	#include <sys/refcount.h>
	74	#include <sys/thread2.h>
	75	#include <sys/mplock2.h>
	76
	77	MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
	78	MALLOC_DEFINE(M_EXECARGS, "exec-args", "Exec arguments");
	79
	80	static register_t exec_copyout_strings (struct image_params );
	81
	82	/* XXX This should be vm_size_t. */
	83	static u_long ps_strings = PS_STRINGS;
	84	SYSCTL_ULONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings, 0, "");
	85
	86	/* XXX This should be vm_size_t. */
	87	static u_long usrstack = USRSTACK;
	88	SYSCTL_ULONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, 0, "");
	89
	90	u_long ps_arg_cache_limit = PAGE_SIZE / 16;
	91	SYSCTL_LONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
	92	&ps_arg_cache_limit, 0, "");
	93
	94	int ps_argsopen = 1;
	95	SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, "");
	96
	97	static int ktrace_suid = 0;
	98	SYSCTL_INT(_kern, OID_AUTO, ktrace_suid, CTLFLAG_RW, &ktrace_suid, 0, "");
	99
	100	void print_execve_args(struct image_args *args);
	101	int debug_execve_args = 0;
	102	SYSCTL_INT(_kern, OID_AUTO, debug_execve_args, CTLFLAG_RW, &debug_execve_args,
	103	0, "");
	104
	105	/*
	106	* Exec arguments object cache
	107	*/
	108	static struct objcache *exec_objcache;
	109
	110	static
	111	void
	112	exec_objcache_init(void *arg __unused)
	113	{
	114	int cluster_limit;
	115	size_t limsize;
	116
	117	/*
	118	* Maximum number of concurrent execs. This can be limiting on
	119	* systems with a lot of cpu cores but it also eats a significant
	120	* amount of memory.
	121	*/
	122	cluster_limit = (ncpus < 16) ? 16 : ncpus;
	123	limsize = kmem_lim_size();
	124	if (limsize > 7 * 1024)
	125	cluster_limit *= 2;
	126	if (limsize > 15 * 1024)
	127	cluster_limit *= 2;
	128
	129	exec_objcache = objcache_create_mbacked(
	130	M_EXECARGS, PATH_MAX + ARG_MAX,
	131	&cluster_limit, 8,
	132	NULL, NULL, NULL);
	133	}
	134	SYSINIT(exec_objcache, SI_BOOT2_MACHDEP, SI_ORDER_ANY, exec_objcache_init, 0);
	135
	136	/*
	137	* stackgap_random specifies if the stackgap should have a random size added
	138	* to it. It must be a power of 2. If non-zero, the stack gap will be
	139	* calculated as: ALIGN(karc4random() & (stackgap_random - 1)).
	140	*/
	141	static int stackgap_random = 1024;
	142	static int
	143	sysctl_kern_stackgap(SYSCTL_HANDLER_ARGS)
	144	{
	145	int error, new_val;
	146	new_val = stackgap_random;
	147	error = sysctl_handle_int(oidp, &new_val, 0, req);
	148	if (error != 0 \|\| req->newptr == NULL)
	149	return (error);
	150	if ((new_val < 0) \|\| (new_val > 16 * PAGE_SIZE) \|\| ! powerof2(new_val))
	151	return (EINVAL);
	152	stackgap_random = new_val;
	153
	154	return(0);
	155	}
	156
	157	SYSCTL_PROC(_kern, OID_AUTO, stackgap_random, CTLFLAG_RW\|CTLTYPE_UINT,
	158	0, 0, sysctl_kern_stackgap, "IU", "Max random stack gap (power of 2)");
	159
	160	void
	161	print_execve_args(struct image_args *args)
	162	{
	163	char *cp;
	164	int ndx;
	165
	166	cp = args->begin_argv;
	167	for (ndx = 0; ndx < args->argc; ndx++) {
	168	kprintf("\targv[%d]: %s\n", ndx, cp);
	169	while (*cp++ != '\0');
	170	}
	171	for (ndx = 0; ndx < args->envc; ndx++) {
	172	kprintf("\tenvv[%d]: %s\n", ndx, cp);
	173	while (*cp++ != '\0');
	174	}
	175	}
	176
	177	/*
	178	* Each of the items is a pointer to a `const struct execsw', hence the
	179	* double pointer here.
	180	*/
	181	static const struct execsw **execsw;
	182
	183	/*
	184	* Replace current vmspace with a new binary.
	185	* Returns 0 on success, > 0 on recoverable error (use as errno).
	186	* Returns -1 on lethal error which demands killing of the current
	187	* process!
	188	*/
	189	int
	190	kern_execve(struct nlookupdata nd, struct image_args args)
	191	{
	192	struct thread *td = curthread;
	193	struct lwp *lp = td->td_lwp;
	194	struct proc *p = td->td_proc;
	195	struct vnode *ovp;
	196	register_t *stack_base;
	197	struct pargs *pa;
	198	struct sigacts *ops;
	199	struct sigacts *nps;
	200	int error, len, i;
	201	struct image_params image_params, *imgp;
	202	struct vattr attr;
	203	int (img_first) (struct image_params );
	204
	205	if (debug_execve_args) {
	206	kprintf("%s()\n", __func__);
	207	print_execve_args(args);
	208	}
	209
	210	KKASSERT(p);
	211	lwkt_gettoken(&p->p_token);
	212	imgp = &image_params;
	213
	214	/*
	215	* NOTE: P_INEXEC is handled by exec_new_vmspace() now. We make
	216	* no modifications to the process at all until we get there.
	217	*
	218	* Note that multiple threads may be trying to exec at the same
	219	* time. exec_new_vmspace() handles that too.
	220	*/
	221
	222	/*
	223	* Initialize part of the common data
	224	*/
	225	imgp->proc = p;
	226	imgp->args = args;
	227	imgp->attr = &attr;
	228	imgp->entry_addr = 0;
	229	imgp->resident = 0;
	230	imgp->vmspace_destroyed = 0;
	231	imgp->interpreted = 0;
	232	imgp->interpreter_name[0] = 0;
	233	imgp->auxargs = NULL;
	234	imgp->vp = NULL;
	235	imgp->firstpage = NULL;
	236	imgp->ps_strings = 0;
	237	imgp->execpath = imgp->freepath = NULL;
	238	imgp->execpathp = 0;
	239	imgp->image_header = NULL;
	240
	241	interpret:
	242
	243	/*
	244	* Translate the file name to a vnode. Unlock the cache entry to
	245	* improve parallelism for programs exec'd in parallel.
	246	*/
	247	if ((error = nlookup(nd)) != 0)
	248	goto exec_fail;
	249	error = cache_vget(&nd->nl_nch, nd->nl_cred, LK_EXCLUSIVE, &imgp->vp);
	250	KKASSERT(nd->nl_flags & NLC_NCPISLOCKED);
	251	nd->nl_flags &= ~NLC_NCPISLOCKED;
	252	cache_unlock(&nd->nl_nch);
	253	if (error)
	254	goto exec_fail;
	255
	256	/*
	257	* Check file permissions (also 'opens' file).
	258	* Include also the top level mount in the check.
	259	*/
	260	error = exec_check_permissions(imgp, nd->nl_nch.mount);
	261	if (error) {
	262	vn_unlock(imgp->vp);
	263	goto exec_fail_dealloc;
	264	}
	265
	266	error = exec_map_first_page(imgp);
	267	vn_unlock(imgp->vp);
	268	if (error)
	269	goto exec_fail_dealloc;
	270
	271	imgp->proc->p_osrel = 0;
	272
	273	if (debug_execve_args && imgp->interpreted) {
	274	kprintf(" target is interpreted -- recursive pass\n");
	275	kprintf(" interpreter: %s\n", imgp->interpreter_name);
	276	print_execve_args(args);
	277	}
	278
	279	/*
	280	* If the current process has a special image activator it
	281	* wants to try first, call it. For example, emulating shell
	282	* scripts differently.
	283	*/
	284	error = -1;
	285	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
	286	error = img_first(imgp);
	287
	288	/*
	289	* If the vnode has a registered vmspace, exec the vmspace
	290	*/
	291	if (error == -1 && imgp->vp->v_resident) {
	292	error = exec_resident_imgact(imgp);
	293	}
	294
	295	/*
	296	* Loop through the list of image activators, calling each one.
	297	* An activator returns -1 if there is no match, 0 on success,
	298	* and an error otherwise.
	299	*/
	300	for (i = 0; error == -1 && execsw[i]; ++i) {
	301	if (execsw[i]->ex_imgact == NULL \|\|
	302	execsw[i]->ex_imgact == img_first) {
	303	continue;
	304	}
	305	error = (*execsw[i]->ex_imgact)(imgp);
	306	}
	307
	308	if (error) {
	309	if (error == -1)
	310	error = ENOEXEC;
	311	goto exec_fail_dealloc;
	312	}
	313
	314	/*
	315	* Special interpreter operation, cleanup and loop up to try to
	316	* activate the interpreter.
	317	*/
	318	if (imgp->interpreted) {
	319	exec_unmap_first_page(imgp);
	320	nlookup_done(nd);
	321	vrele(imgp->vp);
	322	imgp->vp = NULL;
	323	error = nlookup_init(nd, imgp->interpreter_name, UIO_SYSSPACE,
	324	NLC_FOLLOW);
	325	if (error)
	326	goto exec_fail;
	327	goto interpret;
	328	}
	329
	330	/*
	331	* Do the best to calculate the full path to the image file
	332	*/
	333	if (imgp->auxargs != NULL &&
	334	((args->fname != NULL && args->fname[0] == '/') \|\|
	335	vn_fullpath(imgp->proc,
	336	imgp->vp,
	337	&imgp->execpath,
	338	&imgp->freepath,
	339	0) != 0))
	340	imgp->execpath = args->fname;
	341
	342	/*
	343	* Copy out strings (args and env) and initialize stack base
	344	*/
	345	stack_base = exec_copyout_strings(imgp);
	346	p->p_vmspace->vm_minsaddr = (char *)stack_base;
	347
	348	/*
	349	* If custom stack fixup routine present for this process
	350	* let it do the stack setup. If we are running a resident
	351	* image there is no auxinfo or other image activator context
	352	* so don't try to add fixups to the stack.
	353	*
	354	* Else stuff argument count as first item on stack
	355	*/
	356	if (p->p_sysent->sv_fixup && imgp->resident == 0)
	357	(*p->p_sysent->sv_fixup)(&stack_base, imgp);
	358	else
	359	suword(--stack_base, imgp->args->argc);
	360
	361	/*
	362	* For security and other reasons, the file descriptor table cannot
	363	* be shared after an exec.
	364	*/
	365	if (p->p_fd->fd_refcnt > 1) {
	366	struct filedesc *tmp;
	367
	368	error = fdcopy(p, &tmp);
	369	if (error != 0)
	370	goto exec_fail;
	371	fdfree(p, tmp);
	372	}
	373
	374	/*
	375	* For security and other reasons, signal handlers cannot
	376	* be shared after an exec. The new proces gets a copy of the old
	377	* handlers. In execsigs(), the new process will have its signals
	378	* reset.
	379	*/
	380	ops = p->p_sigacts;
	381	if (ops->ps_refcnt > 1) {
	382	nps = kmalloc(sizeof(*nps), M_SUBPROC, M_WAITOK);
	383	bcopy(ops, nps, sizeof(*nps));
	384	refcount_init(&nps->ps_refcnt, 1);
	385	p->p_sigacts = nps;
	386	if (refcount_release(&ops->ps_refcnt)) {
	387	kfree(ops, M_SUBPROC);
	388	ops = NULL;
	389	}
	390	}
	391
	392	/*
	393	* For security and other reasons virtual kernels cannot be
	394	* inherited by an exec. This also allows a virtual kernel
	395	* to fork/exec unrelated applications.
	396	*/
	397	if (p->p_vkernel)
	398	vkernel_exit(p);
	399
	400	/* Stop profiling */
	401	stopprofclock(p);
	402
	403	/* close files on exec */
	404	fdcloseexec(p);
	405
	406	/* reset caught signals */
	407	execsigs(p);
	408
	409	/* name this process - nameiexec(p, ndp) */
	410	len = min(nd->nl_nch.ncp->nc_nlen, MAXCOMLEN);
	411	bcopy(nd->nl_nch.ncp->nc_name, p->p_comm, len);
	412	p->p_comm[len] = 0;
	413	bcopy(p->p_comm, lp->lwp_thread->td_comm, MAXCOMLEN+1);
	414
	415	/*
	416	* mark as execed, wakeup the process that vforked (if any) and tell
	417	* it that it now has its own resources back
	418	*/
	419	p->p_flags \|= P_EXEC;
	420	if (p->p_pptr && (p->p_flags & P_PPWAIT)) {
	421	p->p_flags &= ~P_PPWAIT;
	422	wakeup((caddr_t)p->p_pptr);
	423	}
	424
	425	/*
	426	* Implement image setuid/setgid.
	427	*
	428	* Don't honor setuid/setgid if the filesystem prohibits it or if
	429	* the process is being traced.
	430	*/
	431	if ((((attr.va_mode & VSUID) && p->p_ucred->cr_uid != attr.va_uid) \|\|
	432	((attr.va_mode & VSGID) && p->p_ucred->cr_gid != attr.va_gid)) &&
	433	(imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
	434	(p->p_flags & P_TRACED) == 0) {
	435	/*
	436	* Turn off syscall tracing for set-id programs, except for
	437	* root. Record any set-id flags first to make sure that
	438	* we do not regain any tracing during a possible block.
	439	*/
	440	setsugid();
	441	if (p->p_tracenode && ktrace_suid == 0 &&
	442	priv_check(td, PRIV_ROOT) != 0) {
	443	ktrdestroy(&p->p_tracenode);
	444	p->p_traceflag = 0;
	445	}
	446	/* Close any file descriptors 0..2 that reference procfs */
	447	setugidsafety(p);
	448	/* Make sure file descriptors 0..2 are in use. */
	449	error = fdcheckstd(lp);
	450	if (error != 0)
	451	goto exec_fail_dealloc;
	452	/*
	453	* Set the new credentials.
	454	*/
	455	cratom(&p->p_ucred);
	456	if (attr.va_mode & VSUID)
	457	change_euid(attr.va_uid);
	458	if (attr.va_mode & VSGID)
	459	p->p_ucred->cr_gid = attr.va_gid;
	460
	461	/*
	462	* Clear local varsym variables
	463	*/
	464	varsymset_clean(&p->p_varsymset);
	465	} else {
	466	if (p->p_ucred->cr_uid == p->p_ucred->cr_ruid &&
	467	p->p_ucred->cr_gid == p->p_ucred->cr_rgid)
	468	p->p_flags &= ~P_SUGID;
	469	}
	470
	471	/*
	472	* Implement correct POSIX saved-id behavior.
	473	*/
	474	if (p->p_ucred->cr_svuid != p->p_ucred->cr_uid \|\|
	475	p->p_ucred->cr_svgid != p->p_ucred->cr_gid) {
	476	cratom(&p->p_ucred);
	477	p->p_ucred->cr_svuid = p->p_ucred->cr_uid;
	478	p->p_ucred->cr_svgid = p->p_ucred->cr_gid;
	479	}
	480
	481	/*
	482	* Store the vp for use in procfs. Be sure to keep p_textvp
	483	* consistent if we block during the switch-over.
	484	*/
	485	ovp = p->p_textvp;
	486	vref(imgp->vp); /* ref new vp */
	487	p->p_textvp = imgp->vp;
	488	if (ovp) /* release old vp */
	489	vrele(ovp);
	490
	491	/* Release old namecache handle to text file */
	492	if (p->p_textnch.ncp)
	493	cache_drop(&p->p_textnch);
	494
	495	if (nd->nl_nch.mount)
	496	cache_copy(&nd->nl_nch, &p->p_textnch);
	497
	498	/*
	499	* Notify others that we exec'd, and clear the P_INEXEC flag
	500	* as we're now a bona fide freshly-execed process.
	501	*/
	502	KNOTE(&p->p_klist, NOTE_EXEC);
	503	p->p_flags &= ~P_INEXEC;
	504
	505	/*
	506	* If tracing the process, trap to debugger so breakpoints
	507	* can be set before the program executes.
	508	*/
	509	STOPEVENT(p, S_EXEC, 0);
	510
	511	if (p->p_flags & P_TRACED)
	512	ksignal(p, SIGTRAP);
	513
	514	/* clear "fork but no exec" flag, as we _are_ execing */
	515	p->p_acflag &= ~AFORK;
	516
	517	/* Set values passed into the program in registers. */
	518	exec_setregs(imgp->entry_addr, (u_long)(uintptr_t)stack_base,
	519	imgp->ps_strings);
	520
	521	/* Set the access time on the vnode */
	522	vn_mark_atime(imgp->vp, td);
	523
	524	/*
	525	* Free any previous argument cache
	526	*/
	527	pa = p->p_args;
	528	p->p_args = NULL;
	529	if (pa && refcount_release(&pa->ar_ref)) {
	530	kfree(pa, M_PARGS);
	531	pa = NULL;
	532	}
	533
	534	/*
	535	* Cache arguments if they fit inside our allowance
	536	*/
	537	i = imgp->args->begin_envv - imgp->args->begin_argv;
	538	if (sizeof(struct pargs) + i <= ps_arg_cache_limit) {
	539	pa = kmalloc(sizeof(struct pargs) + i, M_PARGS, M_WAITOK);
	540	refcount_init(&pa->ar_ref, 1);
	541	pa->ar_length = i;
	542	bcopy(imgp->args->begin_argv, pa->ar_args, i);
	543	KKASSERT(p->p_args == NULL);
	544	p->p_args = pa;
	545	}
	546
	547	exec_fail_dealloc:
	548
	549	/*
	550	* free various allocated resources
	551	*/
	552	if (imgp->firstpage)
	553	exec_unmap_first_page(imgp);
	554
	555	if (imgp->vp) {
	556	vrele(imgp->vp);
	557	imgp->vp = NULL;
	558	}
	559
	560	if (imgp->freepath)
	561	kfree(imgp->freepath, M_TEMP);
	562
	563	if (error == 0) {
	564	++mycpu->gd_cnt.v_exec;
	565	lwkt_reltoken(&p->p_token);
	566	return (0);
	567	}
	568
	569	exec_fail:
	570	/*
	571	* we're done here, clear P_INEXEC if we were the ones that
	572	* set it. Otherwise if vmspace_destroyed is still set we
	573	* raced another thread and that thread is responsible for
	574	* clearing it.
	575	*/
	576	if (imgp->vmspace_destroyed & 2)
	577	p->p_flags &= ~P_INEXEC;
	578	lwkt_reltoken(&p->p_token);
	579	if (imgp->vmspace_destroyed) {
	580	/*
	581	* Sorry, no more process anymore. exit gracefully.
	582	* However we can't die right here, because our
	583	* caller might have to clean up, so indicate a
	584	* lethal error by returning -1.
	585	*/
	586	return(-1);
	587	} else {
	588	return(error);
	589	}
	590	}
	591
	592	/*
	593	* execve() system call.
	594	*/
	595	int
	596	sys_execve(struct execve_args *uap)
	597	{
	598	struct nlookupdata nd;
	599	struct image_args args;
	600	int error;
	601
	602	bzero(&args, sizeof(args));
	603
	604	error = nlookup_init(&nd, uap->fname, UIO_USERSPACE, NLC_FOLLOW);
	605	if (error == 0) {
	606	error = exec_copyin_args(&args, uap->fname, PATH_USERSPACE,
	607	uap->argv, uap->envv);
	608	}
	609	if (error == 0)
	610	error = kern_execve(&nd, &args);
	611	nlookup_done(&nd);
	612	exec_free_args(&args);
	613
	614	if (error < 0) {
	615	/* We hit a lethal error condition. Let's die now. */
	616	exit1(W_EXITCODE(0, SIGABRT));
	617	/* NOTREACHED */
	618	}
	619
	620	/*
	621	* The syscall result is returned in registers to the new program.
	622	* Linux will register %edx as an atexit function and we must be
	623	* sure to set it to 0. XXX
	624	*/
	625	if (error == 0)
	626	uap->sysmsg_result64 = 0;
	627
	628	return (error);
	629	}
	630
	631	int
	632	exec_map_page(struct image_params *imgp, vm_pindex_t pageno,
	633	struct lwbuf plwb, const char pdata)
	634	{
	635	int rv;
	636	vm_page_t ma;
	637	vm_page_t m;
	638	vm_object_t object;
	639
	640	/*
	641	* The file has to be mappable.
	642	*/
	643	if ((object = imgp->vp->v_object) == NULL)
	644	return (EIO);
	645
	646	if (pageno >= object->size)
	647	return (EIO);
	648
	649	vm_object_hold(object);
	650	m = vm_page_grab(object, pageno, VM_ALLOC_NORMAL \| VM_ALLOC_RETRY);
	651	while ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
	652	ma = m;
	653
	654	/*
	655	* get_pages unbusies all the requested pages except the
	656	* primary page (at index 0 in this case). The primary
	657	* page may have been wired during the pagein (e.g. by
	658	* the buffer cache) so vnode_pager_freepage() must be
	659	* used to properly release it.
	660	*/
	661	rv = vm_pager_get_page(object, &ma, 1);
	662	m = vm_page_lookup(object, pageno);
	663
	664	if (rv != VM_PAGER_OK \|\| m == NULL \|\| m->valid == 0) {
	665	if (m) {
	666	vm_page_protect(m, VM_PROT_NONE);
	667	vnode_pager_freepage(m);
	668	}
	669	vm_object_drop(object);
	670	return EIO;
	671	}
	672	}
	673	vm_page_hold(m);
	674	vm_page_wakeup(m); /* unbusy the page */
	675	vm_object_drop(object);
	676
	677	plwb = lwbuf_alloc(m, plwb);
	678	pdata = (void )lwbuf_kva(*plwb);
	679
	680	return (0);
	681	}
	682
	683	/*
	684	* Map the first page of an executable image.
	685	*
	686	* NOTE: If the mapping fails we have to NULL-out firstpage which may
	687	* still be pointing to our supplied lwp structure.
	688	*/
	689	int
	690	exec_map_first_page(struct image_params *imgp)
	691	{
	692	int err;
	693
	694	if (imgp->firstpage)
	695	exec_unmap_first_page(imgp);
	696
	697	imgp->firstpage = &imgp->firstpage_cache;
	698	err = exec_map_page(imgp, 0, &imgp->firstpage, &imgp->image_header);
	699
	700	if (err) {
	701	imgp->firstpage = NULL;
	702	return err;
	703	}
	704
	705	return 0;
	706	}
	707
	708	void
	709	exec_unmap_page(struct lwbuf *lwb)
	710	{
	711	vm_page_t m;
	712
	713	crit_enter();
	714	if (lwb != NULL) {
	715	m = lwbuf_page(lwb);
	716	lwbuf_free(lwb);
	717	vm_page_unhold(m);
	718	}
	719	crit_exit();
	720	}
	721
	722	void
	723	exec_unmap_first_page(struct image_params *imgp)
	724	{
	725	exec_unmap_page(imgp->firstpage);
	726	imgp->firstpage = NULL;
	727	imgp->image_header = NULL;
	728	}
	729
	730	/*
	731	* Destroy old address space, and allocate a new stack
	732	* The new stack is only SGROWSIZ large because it is grown
	733	* automatically in trap.c.
	734	*
	735	* This is the point of no return.
	736	*/
	737	int
	738	exec_new_vmspace(struct image_params imgp, struct vmspace vmcopy)
	739	{
	740	struct vmspace *vmspace = imgp->proc->p_vmspace;
	741	vm_offset_t stack_addr = USRSTACK - maxssiz;
	742	struct proc *p;
	743	vm_map_t map;
	744	int error;
	745
	746	/*
	747	* Indicate that we cannot gracefully error out any more, kill
	748	* any other threads present, and set P_INEXEC to indicate that
	749	* we are now messing with the process structure proper.
	750	*
	751	* If killalllwps() races return an error which coupled with
	752	* vmspace_destroyed will cause us to exit. This is what we
	753	* want since another thread is patiently waiting for us to exit
	754	* in that case.
	755	*/
	756	p = curproc;
	757	imgp->vmspace_destroyed = 1;
	758
	759	if (curthread->td_proc->p_nthreads > 1) {
	760	error = killalllwps(1);
	761	if (error)
	762	return (error);
	763	}
	764	imgp->vmspace_destroyed \|= 2; /* we are responsible for P_INEXEC */
	765	p->p_flags \|= P_INEXEC;
	766
	767	/*
	768	* After setting P_INEXEC wait for any remaining references to
	769	* the process (p) to go away.
	770	*
	771	* In particular, a vfork/exec sequence will replace p->p_vmspace
	772	* and we must interlock anyone trying to access the space (aka
	773	* procfs or sys_process.c calling procfs_domem()).
	774	*
	775	* If P_PPWAIT is set the parent vfork()'d and has a PHOLD() on us.
	776	*/
	777	PSTALL(p, "exec1", ((p->p_flags & P_PPWAIT) ? 1 : 0));
	778
	779	/*
	780	* Blow away entire process VM, if address space not shared,
	781	* otherwise, create a new VM space so that other threads are
	782	* not disrupted. If we are execing a resident vmspace we
	783	* create a duplicate of it and remap the stack.
	784	*/
	785	map = &vmspace->vm_map;
	786	if (vmcopy) {
	787	vmspace_exec(imgp->proc, vmcopy);
	788	vmspace = imgp->proc->p_vmspace;
	789	pmap_remove_pages(vmspace_pmap(vmspace), stack_addr, USRSTACK);
	790	map = &vmspace->vm_map;
	791	} else if (vmspace->vm_sysref.refcnt == 1) {
	792	shmexit(vmspace);
	793	if (vmspace->vm_upcalls)
	794	upc_release(vmspace, ONLY_LWP_IN_PROC(imgp->proc));
	795	pmap_remove_pages(vmspace_pmap(vmspace),
	796	0, VM_MAX_USER_ADDRESS);
	797	vm_map_remove(map, 0, VM_MAX_USER_ADDRESS);
	798	} else {
	799	vmspace_exec(imgp->proc, NULL);
	800	vmspace = imgp->proc->p_vmspace;
	801	map = &vmspace->vm_map;
	802	}
	803
	804	/* Allocate a new stack */
	805	error = vm_map_stack(&vmspace->vm_map, stack_addr, (vm_size_t)maxssiz,
	806	0, VM_PROT_ALL, VM_PROT_ALL, 0);
	807	if (error)
	808	return (error);
	809
	810	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
	811	* VM_STACK case, but they are still used to monitor the size of the
	812	* process stack so we can check the stack rlimit.
	813	*/
	814	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
	815	vmspace->vm_maxsaddr = (char *)USRSTACK - maxssiz;
	816
	817	return(0);
	818	}
	819
	820	/*
	821	* Copy out argument and environment strings from the old process
	822	* address space into the temporary string buffer.
	823	*/
	824	int
	825	exec_copyin_args(struct image_args args, char fname,
	826	enum exec_path_segflg segflg, char argv, char envv)
	827	{
	828	char argp, envp;
	829	int error = 0;
	830	size_t length;
	831
	832	args->buf = objcache_get(exec_objcache, M_WAITOK);
	833	if (args->buf == NULL)
	834	return (ENOMEM);
	835	args->begin_argv = args->buf;
	836	args->endp = args->begin_argv;
	837	args->space = ARG_MAX;
	838
	839	args->fname = args->buf + ARG_MAX;
	840
	841	/*
	842	* Copy the file name.
	843	*/
	844	if (segflg == PATH_SYSSPACE) {
	845	error = copystr(fname, args->fname, PATH_MAX, &length);
	846	} else if (segflg == PATH_USERSPACE) {
	847	error = copyinstr(fname, args->fname, PATH_MAX, &length);
	848	}
	849
	850	/*
	851	* Extract argument strings. argv may not be NULL. The argv
	852	* array is terminated by a NULL entry. We special-case the
	853	* situation where argv[0] is NULL by passing { filename, NULL }
	854	* to the new program to guarentee that the interpreter knows what
	855	* file to open in case we exec an interpreted file. Note that
	856	* a NULL argv[0] terminates the argv[] array.
	857	*
	858	* XXX the special-casing of argv[0] is historical and needs to be
	859	* revisited.
	860	*/
	861	if (argv == NULL)
	862	error = EFAULT;
	863	if (error == 0) {
	864	while ((argp = (caddr_t)(intptr_t)fuword(argv++)) != NULL) {
	865	if (argp == (caddr_t)-1) {
	866	error = EFAULT;
	867	break;
	868	}
	869	error = copyinstr(argp, args->endp,
	870	args->space, &length);
	871	if (error) {
	872	if (error == ENAMETOOLONG)
	873	error = E2BIG;
	874	break;
	875	}
	876	args->space -= length;
	877	args->endp += length;
	878	args->argc++;
	879	}
	880	if (args->argc == 0 && error == 0) {
	881	length = strlen(args->fname) + 1;
	882	if (length > args->space) {
	883	error = E2BIG;
	884	} else {
	885	bcopy(args->fname, args->endp, length);
	886	args->space -= length;
	887	args->endp += length;
	888	args->argc++;
	889	}
	890	}
	891	}
	892
	893	args->begin_envv = args->endp;
	894
	895	/*
	896	* extract environment strings. envv may be NULL.
	897	*/
	898	if (envv && error == 0) {
	899	while ((envp = (caddr_t) (intptr_t) fuword(envv++))) {
	900	if (envp == (caddr_t) -1) {
	901	error = EFAULT;
	902	break;
	903	}
	904	error = copyinstr(envp, args->endp, args->space,
	905	&length);
	906	if (error) {
	907	if (error == ENAMETOOLONG)
	908	error = E2BIG;
	909	break;
	910	}
	911	args->space -= length;
	912	args->endp += length;
	913	args->envc++;
	914	}
	915	}
	916	return (error);
	917	}
	918
	919	void
	920	exec_free_args(struct image_args *args)
	921	{
	922	if (args->buf) {
	923	objcache_put(exec_objcache, args->buf);
	924	args->buf = NULL;
	925	}
	926	}
	927
	928	/*
	929	* Copy strings out to the new process address space, constructing
	930	* new arg and env vector tables. Return a pointer to the base
	931	* so that it can be used as the initial stack pointer.
	932	*/
	933	register_t *
	934	exec_copyout_strings(struct image_params *imgp)
	935	{
	936	int argc, envc, sgap;
	937	char **vectp;
	938	char stringp, destp;
	939	register_t *stack_base;
	940	struct ps_strings *arginfo;
	941	size_t execpath_len;
	942	int szsigcode;
	943
	944	/*
	945	* Calculate string base and vector table pointers.
	946	* Also deal with signal trampoline code for this exec type.
	947	*/
	948	if (imgp->execpath != NULL && imgp->auxargs != NULL)
	949	execpath_len = strlen(imgp->execpath) + 1;
	950	else
	951	execpath_len = 0;
	952	arginfo = (struct ps_strings *)PS_STRINGS;
	953	szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
	954	if (stackgap_random != 0)
	955	sgap = ALIGN(karc4random() & (stackgap_random - 1));
	956	else
	957	sgap = 0;
	958	destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - sgap -
	959	roundup(execpath_len, sizeof(char *)) -
	960	roundup((ARG_MAX - imgp->args->space), sizeof(char *));
	961
	962	/*
	963	* install sigcode
	964	*/
	965	if (szsigcode)
	966	copyout(imgp->proc->p_sysent->sv_sigcode,
	967	((caddr_t)arginfo - szsigcode), szsigcode);
	968
	969	/*
	970	* Copy the image path for the rtld
	971	*/
	972	if (execpath_len != 0) {
	973	imgp->execpathp = (uintptr_t)arginfo
	974	- szsigcode
	975	- execpath_len;
	976	copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len);
	977	}
	978
	979	/*
	980	* If we have a valid auxargs ptr, prepare some room
	981	* on the stack.
	982	*
	983	* The '+ 2' is for the null pointers at the end of each of the
	984	* arg and env vector sets, and 'AT_COUNT*2' is room for the
	985	* ELF Auxargs data.
	986	*/
	987	if (imgp->auxargs) {
	988	vectp = (char **)(destp - (imgp->args->argc +
	989	imgp->args->envc + 2 + (AT_COUNT * 2) + execpath_len) *
	990	sizeof(char*));
	991	} else {
	992	vectp = (char **)(destp - (imgp->args->argc +
	993	imgp->args->envc + 2) * sizeof(char*));
	994	}
	995
	996	/*
	997	* NOTE: don't bother aligning the stack here for GCC 2.x, it will
	998	* be done in crt1.o. Note that GCC 3.x aligns the stack in main.
	999	*/
	1000
	1001	/*
	1002	* vectp also becomes our initial stack base
	1003	*/
	1004	stack_base = (register_t *)vectp;
	1005
	1006	stringp = imgp->args->begin_argv;
	1007	argc = imgp->args->argc;
	1008	envc = imgp->args->envc;
	1009
	1010	/*
	1011	* Copy out strings - arguments and environment.
	1012	*/
	1013	copyout(stringp, destp, ARG_MAX - imgp->args->space);
	1014
	1015	/*
	1016	* Fill in "ps_strings" struct for ps, w, etc.
	1017	*/
	1018	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
	1019	suword32(&arginfo->ps_nargvstr, argc);
	1020
	1021	/*
	1022	* Fill in argument portion of vector table.
	1023	*/
	1024	for (; argc > 0; --argc) {
	1025	suword(vectp++, (long)(intptr_t)destp);
	1026	while (*stringp++ != 0)
	1027	destp++;
	1028	destp++;
	1029	}
	1030
	1031	/* a null vector table pointer separates the argp's from the envp's */
	1032	suword(vectp++, 0);
	1033
	1034	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
	1035	suword32(&arginfo->ps_nenvstr, envc);
	1036
	1037	/*
	1038	* Fill in environment portion of vector table.
	1039	*/
	1040	for (; envc > 0; --envc) {
	1041	suword(vectp++, (long)(intptr_t)destp);
	1042	while (*stringp++ != 0)
	1043	destp++;
	1044	destp++;
	1045	}
	1046
	1047	/* end of vector table is a null pointer */
	1048	suword(vectp, 0);
	1049
	1050	return (stack_base);
	1051	}
	1052
	1053	/*
	1054	* Check permissions of file to execute.
	1055	* Return 0 for success or error code on failure.
	1056	*/
	1057	int
	1058	exec_check_permissions(struct image_params imgp, struct mount topmnt)
	1059	{
	1060	struct proc *p = imgp->proc;
	1061	struct vnode *vp = imgp->vp;
	1062	struct vattr *attr = imgp->attr;
	1063	int error;
	1064
	1065	/* Get file attributes */
	1066	error = VOP_GETATTR(vp, attr);
	1067	if (error)
	1068	return (error);
	1069
	1070	/*
	1071	* 1) Check if file execution is disabled for the filesystem that this
	1072	* file resides on.
	1073	* 2) Insure that at least one execute bit is on - otherwise root
	1074	* will always succeed, and we don't want to happen unless the
	1075	* file really is executable.
	1076	* 3) Insure that the file is a regular file.
	1077	*/
	1078	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) \|\|
	1079	((topmnt != NULL) && (topmnt->mnt_flag & MNT_NOEXEC)) \|\|
	1080	((attr->va_mode & 0111) == 0) \|\|
	1081	(attr->va_type != VREG)) {
	1082	return (EACCES);
	1083	}
	1084
	1085	/*
	1086	* Zero length files can't be exec'd
	1087	*/
	1088	if (attr->va_size == 0)
	1089	return (ENOEXEC);
	1090
	1091	/*
	1092	* Check for execute permission to file based on current credentials.
	1093	*/
	1094	error = VOP_EACCESS(vp, VEXEC, p->p_ucred);
	1095	if (error)
	1096	return (error);
	1097
	1098	/*
	1099	* Check number of open-for-writes on the file and deny execution
	1100	* if there are any.
	1101	*/
	1102	if (vp->v_writecount)
	1103	return (ETXTBSY);
	1104
	1105	/*
	1106	* Call filesystem specific open routine, which allows us to read,
	1107	* write, and mmap the file. Without the VOP_OPEN we can only
	1108	* stat the file.
	1109	*/
	1110	error = VOP_OPEN(vp, FREAD, p->p_ucred, NULL);
	1111	if (error)
	1112	return (error);
	1113
	1114	return (0);
	1115	}
	1116
	1117	/*
	1118	* Exec handler registration
	1119	*/
	1120	int
	1121	exec_register(const struct execsw *execsw_arg)
	1122	{
	1123	const struct execsw es, xs, **newexecsw;
	1124	int count = 2; /* New slot and trailing NULL */
	1125
	1126	if (execsw)
	1127	for (es = execsw; *es; es++)
	1128	count++;
	1129	newexecsw = kmalloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	1130	xs = newexecsw;
	1131	if (execsw)
	1132	for (es = execsw; *es; es++)
	1133	xs++ = es;
	1134	*xs++ = execsw_arg;
	1135	*xs = NULL;
	1136	if (execsw)
	1137	kfree(execsw, M_TEMP);
	1138	execsw = newexecsw;
	1139	return 0;
	1140	}
	1141
	1142	int
	1143	exec_unregister(const struct execsw *execsw_arg)
	1144	{
	1145	const struct execsw es, xs, **newexecsw;
	1146	int count = 1;
	1147
	1148	if (execsw == NULL)
	1149	panic("unregister with no handlers left?");
	1150
	1151	for (es = execsw; *es; es++) {
	1152	if (*es == execsw_arg)
	1153	break;
	1154	}
	1155	if (*es == NULL)
	1156	return ENOENT;
	1157	for (es = execsw; *es; es++)
	1158	if (*es != execsw_arg)
	1159	count++;
	1160	newexecsw = kmalloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	1161	xs = newexecsw;
	1162	for (es = execsw; *es; es++)
	1163	if (*es != execsw_arg)
	1164	xs++ = es;
	1165	*xs = NULL;
	1166	if (execsw)
	1167	kfree(execsw, M_TEMP);
	1168	execsw = newexecsw;
	1169	return 0;
	1170	}