gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2005 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Jeffrey Hsu.
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	*
	35	* Copyright (c) 1982, 1986, 1989, 1991, 1993
	36	* The Regents of the University of California. All rights reserved.
	37	* (c) UNIX System Laboratories, Inc.
	38	* All or some portions of this file are derived from material licensed
	39	* to the University of California by American Telephone and Telegraph
	40	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	41	* the permission of UNIX System Laboratories, Inc.
	42	*
	43	* Redistribution and use in source and binary forms, with or without
	44	* modification, are permitted provided that the following conditions
	45	* are met:
	46	* 1. Redistributions of source code must retain the above copyright
	47	* notice, this list of conditions and the following disclaimer.
	48	* 2. Redistributions in binary form must reproduce the above copyright
	49	* notice, this list of conditions and the following disclaimer in the
	50	* documentation and/or other materials provided with the distribution.
	51	* 3. All advertising materials mentioning features or use of this software
	52	* must display the following acknowledgement:
	53	* This product includes software developed by the University of
	54	* California, Berkeley and its contributors.
	55	* 4. Neither the name of the University nor the names of its contributors
	56	* may be used to endorse or promote products derived from this software
	57	* without specific prior written permission.
	58	*
	59	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	60	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	61	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	62	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	63	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	64	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	65	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	66	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	67	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	68	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	69	* SUCH DAMAGE.
	70	*
	71	* @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
	72	* $FreeBSD: src/sys/kern/kern_descrip.c,v 1.81.2.19 2004/02/28 00:43:31 tegge Exp $
	73	*/
	74
	75	#include "opt_compat.h"
	76	#include <sys/param.h>
	77	#include <sys/systm.h>
	78	#include <sys/malloc.h>
	79	#include <sys/sysproto.h>
	80	#include <sys/conf.h>
	81	#include <sys/device.h>
	82	#include <sys/file.h>
	83	#include <sys/filedesc.h>
	84	#include <sys/kernel.h>
	85	#include <sys/sysctl.h>
	86	#include <sys/vnode.h>
	87	#include <sys/proc.h>
	88	#include <sys/nlookup.h>
	89	#include <sys/file.h>
	90	#include <sys/stat.h>
	91	#include <sys/filio.h>
	92	#include <sys/fcntl.h>
	93	#include <sys/unistd.h>
	94	#include <sys/resourcevar.h>
	95	#include <sys/event.h>
	96	#include <sys/kern_syscall.h>
	97	#include <sys/kcore.h>
	98	#include <sys/kinfo.h>
	99	#include <sys/un.h>
	100
	101	#include <vm/vm.h>
	102	#include <vm/vm_extern.h>
	103
	104	#include <sys/thread2.h>
	105	#include <sys/file2.h>
	106	#include <sys/spinlock2.h>
	107	#include <sys/mplock2.h>
	108
	/*
	 * Forward declarations for file-local helpers.
	 *
	 * The pointer declarators follow how the helpers are used in this
	 * file: fsetfd_locked() is called with a filedesc pointer and a
	 * (possibly NULL) file pointer, and the result of funsetfd_locked()
	 * is stored in a struct file pointer and compared against NULL.
	 */
	static void fsetfd_locked(struct filedesc *fdp, struct file *fp, int fd);
	static void fdreserve_locked(struct filedesc *fdp, int fd0, int incr);
	static struct file *funsetfd_locked(struct filedesc *fdp, int fd);
	static void ffree(struct file *fp);
	113
	/*
	 * Malloc types used by this file.  M_FILEDESC backs the descriptor
	 * tables allocated in fdgrow_locked(); M_SIGIO backs the sigio
	 * structures allocated in fsetown() and released in funsetown().
	 */
	114	static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
	115	static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader",
	116	"file desc to leader structures");
	117	MALLOC_DEFINE(M_FILE, "file", "Open file structure");
	118	static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
	119
	/*
	 * Rate limiter handed to krateprintf() by fdalloc() for its
	 * "ran out of file descriptors" warning.
	 */
	120	static struct krate krate_uidinfo = { .freq = 1 };
	121
	/* Open routine installed as .d_open of fildesc_ops below. */
	122	static d_open_t fdopen;
	123	#define NUMFDESC 64
	124
	125	#define CDEV_MAJOR 22
	/* Device operations vector for the "FD" device. */
	126	static struct dev_ops fildesc_ops = {
	127	{ "FD", 0, 0 },
	128	.d_open = fdopen,
	129	};
	130
	131	/*
	132	* Descriptor management.
	133	*/
	134	static struct filelist filehead = LIST_HEAD_INITIALIZER(&filehead);
	135	static struct spinlock filehead_spin = SPINLOCK_INITIALIZER(&filehead_spin);
	136	static int nfiles; /* actual number of open files */
	/* Defined outside this file. */
	137	extern int cmask;
	138
	139	/*
	140	* Fixup fd_freefile and fd_lastfile after a descriptor has been cleared.
	141	*
	142	* MPSAFE - must be called with fdp->fd_spin exclusively held
	143	*/
	144	static __inline
	145	void
	146	fdfixup_locked(struct filedesc *fdp, int fd)
	147	{
	148	if (fd < fdp->fd_freefile) {
	149	fdp->fd_freefile = fd;
	150	}
	151	while (fdp->fd_lastfile >= 0 &&
	152	fdp->fd_files[fdp->fd_lastfile].fp == NULL &&
	153	fdp->fd_files[fdp->fd_lastfile].reserved == 0
	154	) {
	155	--fdp->fd_lastfile;
	156	}
	157	}
	158
	159	/*
	160	* System calls on descriptors.
	161	*
	162	* MPSAFE
	163	*/
	164	int
	165	sys_getdtablesize(struct getdtablesize_args *uap)
	166	{
	167	struct proc *p = curproc;
	168	struct plimit *limit = p->p_limit;
	169	int dtsize;
	170
	171	spin_lock(&limit->p_spin);
	172	if (limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur > INT_MAX)
	173	dtsize = INT_MAX;
	174	else
	175	dtsize = (int)limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur;
	176	spin_unlock(&limit->p_spin);
	177
	178	if (dtsize > maxfilesperproc)
	179	dtsize = maxfilesperproc;
	180	if (dtsize < minfilesperproc)
	181	dtsize = minfilesperproc;
	182	if (p->p_ucred->cr_uid && dtsize > maxfilesperuser)
	183	dtsize = maxfilesperuser;
	184	uap->sysmsg_result = dtsize;
	185	return (0);
	186	}
	187
	188	/*
	189	* Duplicate a file descriptor to a particular value.
	190	*
	191	* note: keep in mind that a potential race condition exists when closing
	192	* descriptors from a shared descriptor table (via rfork).
	193	*
	194	* MPSAFE
	195	*/
	196	int
	197	sys_dup2(struct dup2_args *uap)
	198	{
	199	int error;
	200	int fd = 0;
	201
	202	error = kern_dup(DUP_FIXED, uap->from, uap->to, &fd);
	203	uap->sysmsg_fds[0] = fd;
	204
	205	return (error);
	206	}
	207
	208	/*
	209	* Duplicate a file descriptor.
	210	*
	211	* MPSAFE
	212	*/
	213	int
	214	sys_dup(struct dup_args *uap)
	215	{
	216	int error;
	217	int fd = 0;
	218
	219	error = kern_dup(DUP_VARIABLE, uap->fd, 0, &fd);
	220	uap->sysmsg_fds[0] = fd;
	221
	222	return (error);
	223	}
	224
	225	/*
	226	* MPALMOSTSAFE - acquires mplock for fp operations
	227	*/
	228	int
	229	kern_fcntl(int fd, int cmd, union fcntl_dat dat, struct ucred cred)
	230	{
	231	struct thread *td = curthread;
	232	struct proc *p = td->td_proc;
	233	struct file *fp;
	234	struct vnode *vp;
	235	u_int newmin;
	236	u_int oflags;
	237	u_int nflags;
	238	int tmp, error, flg = F_POSIX;
	239
	240	KKASSERT(p);
	241
	242	/*
	243	* Operations on file descriptors that do not require a file pointer.
	244	*/
	245	switch (cmd) {
	246	case F_GETFD:
	247	error = fgetfdflags(p->p_fd, fd, &tmp);
	248	if (error == 0)
	249	dat->fc_cloexec = (tmp & UF_EXCLOSE) ? FD_CLOEXEC : 0;
	250	return (error);
	251
	252	case F_SETFD:
	253	if (dat->fc_cloexec & FD_CLOEXEC)
	254	error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
	255	else
	256	error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
	257	return (error);
	258	case F_DUPFD:
	259	newmin = dat->fc_fd;
	260	error = kern_dup(DUP_VARIABLE, fd, newmin, &dat->fc_fd);
	261	return (error);
	262	default:
	263	break;
	264	}
	265
	266	/*
	267	* Operations on file pointers
	268	*/
	269	if ((fp = holdfp(p->p_fd, fd, -1)) == NULL)
	270	return (EBADF);
	271
	272	get_mplock();
	273	switch (cmd) {
	274	case F_GETFL:
	275	dat->fc_flags = OFLAGS(fp->f_flag);
	276	error = 0;
	277	break;
	278
	279	case F_SETFL:
	280	oflags = fp->f_flag;
	281	nflags = FFLAGS(dat->fc_flags & ~O_ACCMODE) & FCNTLFLAGS;
	282	nflags \|= oflags & ~FCNTLFLAGS;
	283
	284	error = 0;
	285	if (((nflags ^ oflags) & O_APPEND) && (oflags & FAPPENDONLY))
	286	error = EINVAL;
	287	if (error == 0 && ((nflags ^ oflags) & FASYNC)) {
	288	tmp = nflags & FASYNC;
	289	error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp,
	290	cred, NULL);
	291	}
	292	if (error == 0)
	293	fp->f_flag = nflags;
	294	break;
	295
	296	case F_GETOWN:
	297	error = fo_ioctl(fp, FIOGETOWN, (caddr_t)&dat->fc_owner,
	298	cred, NULL);
	299	break;
	300
	301	case F_SETOWN:
	302	error = fo_ioctl(fp, FIOSETOWN, (caddr_t)&dat->fc_owner,
	303	cred, NULL);
	304	break;
	305
	306	case F_SETLKW:
	307	flg \|= F_WAIT;
	308	/* Fall into F_SETLK */
	309
	310	case F_SETLK:
	311	if (fp->f_type != DTYPE_VNODE) {
	312	error = EBADF;
	313	break;
	314	}
	315	vp = (struct vnode *)fp->f_data;
	316
	317	/*
	318	* copyin/lockop may block
	319	*/
	320	if (dat->fc_flock.l_whence == SEEK_CUR)
	321	dat->fc_flock.l_start += fp->f_offset;
	322
	323	switch (dat->fc_flock.l_type) {
	324	case F_RDLCK:
	325	if ((fp->f_flag & FREAD) == 0) {
	326	error = EBADF;
	327	break;
	328	}
	329	p->p_leader->p_flag \|= P_ADVLOCK;
	330	error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
	331	&dat->fc_flock, flg);
	332	break;
	333	case F_WRLCK:
	334	if ((fp->f_flag & FWRITE) == 0) {
	335	error = EBADF;
	336	break;
	337	}
	338	p->p_leader->p_flag \|= P_ADVLOCK;
	339	error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
	340	&dat->fc_flock, flg);
	341	break;
	342	case F_UNLCK:
	343	error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
	344	&dat->fc_flock, F_POSIX);
	345	break;
	346	default:
	347	error = EINVAL;
	348	break;
	349	}
	350
	351	/*
	352	* It is possible to race a close() on the descriptor while
	353	* we were blocked getting the lock. If this occurs the
	354	* close might not have caught the lock.
	355	*/
	356	if (checkfdclosed(p->p_fd, fd, fp)) {
	357	dat->fc_flock.l_whence = SEEK_SET;
	358	dat->fc_flock.l_start = 0;
	359	dat->fc_flock.l_len = 0;
	360	dat->fc_flock.l_type = F_UNLCK;
	361	(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
	362	F_UNLCK, &dat->fc_flock, F_POSIX);
	363	}
	364	break;
	365
	366	case F_GETLK:
	367	if (fp->f_type != DTYPE_VNODE) {
	368	error = EBADF;
	369	break;
	370	}
	371	vp = (struct vnode *)fp->f_data;
	372	/*
	373	* copyin/lockop may block
	374	*/
	375	if (dat->fc_flock.l_type != F_RDLCK &&
	376	dat->fc_flock.l_type != F_WRLCK &&
	377	dat->fc_flock.l_type != F_UNLCK) {
	378	error = EINVAL;
	379	break;
	380	}
	381	if (dat->fc_flock.l_whence == SEEK_CUR)
	382	dat->fc_flock.l_start += fp->f_offset;
	383	error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK,
	384	&dat->fc_flock, F_POSIX);
	385	break;
	386	default:
	387	error = EINVAL;
	388	break;
	389	}
	390	rel_mplock();
	391
	392	fdrop(fp);
	393	return (error);
	394	}
	395
	396	/*
	397	* The file control system call.
	398	*
	399	* MPSAFE
	400	*/
	401	int
	402	sys_fcntl(struct fcntl_args *uap)
	403	{
	404	union fcntl_dat dat;
	405	int error;
	406
	407	switch (uap->cmd) {
	408	case F_DUPFD:
	409	dat.fc_fd = uap->arg;
	410	break;
	411	case F_SETFD:
	412	dat.fc_cloexec = uap->arg;
	413	break;
	414	case F_SETFL:
	415	dat.fc_flags = uap->arg;
	416	break;
	417	case F_SETOWN:
	418	dat.fc_owner = uap->arg;
	419	break;
	420	case F_SETLKW:
	421	case F_SETLK:
	422	case F_GETLK:
	423	error = copyin((caddr_t)uap->arg, &dat.fc_flock,
	424	sizeof(struct flock));
	425	if (error)
	426	return (error);
	427	break;
	428	}
	429
	430	error = kern_fcntl(uap->fd, uap->cmd, &dat, curthread->td_ucred);
	431
	432	if (error == 0) {
	433	switch (uap->cmd) {
	434	case F_DUPFD:
	435	uap->sysmsg_result = dat.fc_fd;
	436	break;
	437	case F_GETFD:
	438	uap->sysmsg_result = dat.fc_cloexec;
	439	break;
	440	case F_GETFL:
	441	uap->sysmsg_result = dat.fc_flags;
	442	break;
	443	case F_GETOWN:
	444	uap->sysmsg_result = dat.fc_owner;
	445	case F_GETLK:
	446	error = copyout(&dat.fc_flock, (caddr_t)uap->arg,
	447	sizeof(struct flock));
	448	break;
	449	}
	450	}
	451
	452	return (error);
	453	}
	454
	455	/*
	456	* Common code for dup, dup2, and fcntl(F_DUPFD).
	457	*
	458	* The type flag can be either DUP_FIXED or DUP_VARIABLE. DUP_FIXED tells
	459	* kern_dup() to destructively dup over an existing file descriptor if new
	460	* is already open. DUP_VARIABLE tells kern_dup() to find the lowest
	461	* unused file descriptor that is greater than or equal to new.
	462	*
	463	* MPSAFE
	464	*/
	465	int
	466	kern_dup(enum dup_type type, int old, int new, int *res)
	467	{
	468	struct thread *td = curthread;
	469	struct proc *p = td->td_proc;
	470	struct filedesc *fdp = p->p_fd;
	471	struct file *fp;
	472	struct file *delfp;
	473	int oldflags;
	474	int holdleaders;
	475	int dtsize;
	476	int error, newfd;
	477
	478	/*
	479	* Verify that we have a valid descriptor to dup from and
	480	* possibly to dup to.
	481	*
	482	* NOTE: maxfilesperuser is not applicable to dup()
	483	*/
	484	retry:
	485	if (p->p_rlimit[RLIMIT_NOFILE].rlim_cur > INT_MAX)
	486	dtsize = INT_MAX;
	487	else
	488	dtsize = (int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur;
	489	if (dtsize > maxfilesperproc)
	490	dtsize = maxfilesperproc;
	491	if (dtsize < minfilesperproc)
	492	dtsize = minfilesperproc;
	493
	494	if (new < 0 \|\| new > dtsize)
	495	return (EINVAL);
	496
	497	spin_lock(&fdp->fd_spin);
	498	if ((unsigned)old >= fdp->fd_nfiles \|\| fdp->fd_files[old].fp == NULL) {
	499	spin_unlock(&fdp->fd_spin);
	500	return (EBADF);
	501	}
	502	if (type == DUP_FIXED && old == new) {
	503	*res = new;
	504	spin_unlock(&fdp->fd_spin);
	505	return (0);
	506	}
	507	fp = fdp->fd_files[old].fp;
	508	oldflags = fdp->fd_files[old].fileflags;
	509	fhold(fp); /* MPSAFE - can be called with a spinlock held */
	510
	511	/*
	512	* Allocate a new descriptor if DUP_VARIABLE, or expand the table
	513	* if the requested descriptor is beyond the current table size.
	514	*
	515	* This can block. Retry if the source descriptor no longer matches
	516	* or if our expectation in the expansion case races.
	517	*
	518	* If we are not expanding or allocating a new decriptor, then reset
	519	* the target descriptor to a reserved state so we have a uniform
	520	* setup for the next code block.
	521	*/
	522	if (type == DUP_VARIABLE \|\| new >= fdp->fd_nfiles) {
	523	spin_unlock(&fdp->fd_spin);
	524	error = fdalloc(p, new, &newfd);
	525	spin_lock(&fdp->fd_spin);
	526	if (error) {
	527	spin_unlock(&fdp->fd_spin);
	528	fdrop(fp);
	529	return (error);
	530	}
	531	/*
	532	* Check for ripout
	533	*/
	534	if (old >= fdp->fd_nfiles \|\| fdp->fd_files[old].fp != fp) {
	535	fsetfd_locked(fdp, NULL, newfd);
	536	spin_unlock(&fdp->fd_spin);
	537	fdrop(fp);
	538	goto retry;
	539	}
	540	/*
	541	* Check for expansion race
	542	*/
	543	if (type != DUP_VARIABLE && new != newfd) {
	544	fsetfd_locked(fdp, NULL, newfd);
	545	spin_unlock(&fdp->fd_spin);
	546	fdrop(fp);
	547	goto retry;
	548	}
	549	/*
	550	* Check for ripout, newfd reused old (this case probably
	551	* can't occur).
	552	*/
	553	if (old == newfd) {
	554	fsetfd_locked(fdp, NULL, newfd);
	555	spin_unlock(&fdp->fd_spin);
	556	fdrop(fp);
	557	goto retry;
	558	}
	559	new = newfd;
	560	delfp = NULL;
	561	} else {
	562	if (fdp->fd_files[new].reserved) {
	563	spin_unlock(&fdp->fd_spin);
	564	fdrop(fp);
	565	kprintf("Warning: dup(): target descriptor %d is reserved, waiting for it to be resolved\n", new);
	566	tsleep(fdp, 0, "fdres", hz);
	567	goto retry;
	568	}
	569
	570	/*
	571	* If the target descriptor was never allocated we have
	572	* to allocate it. If it was we have to clean out the
	573	* old descriptor. delfp inherits the ref from the
	574	* descriptor table.
	575	*/
	576	delfp = fdp->fd_files[new].fp;
	577	fdp->fd_files[new].fp = NULL;
	578	fdp->fd_files[new].reserved = 1;
	579	if (delfp == NULL) {
	580	fdreserve_locked(fdp, new, 1);
	581	if (new > fdp->fd_lastfile)
	582	fdp->fd_lastfile = new;
	583	}
	584
	585	}
	586
	587	/*
	588	* NOTE: still holding an exclusive spinlock
	589	*/
	590
	591	/*
	592	* If a descriptor is being overwritten we may hve to tell
	593	* fdfree() to sleep to ensure that all relevant process
	594	* leaders can be traversed in closef().
	595	*/
	596	if (delfp != NULL && p->p_fdtol != NULL) {
	597	fdp->fd_holdleaderscount++;
	598	holdleaders = 1;
	599	} else {
	600	holdleaders = 0;
	601	}
	602	KASSERT(delfp == NULL \|\| type == DUP_FIXED,
	603	("dup() picked an open file"));
	604
	605	/*
	606	* Duplicate the source descriptor, update lastfile. If the new
	607	* descriptor was not allocated and we aren't replacing an existing
	608	* descriptor we have to mark the descriptor as being in use.
	609	*
	610	* The fd_files[] array inherits fp's hold reference.
	611	*/
	612	fsetfd_locked(fdp, fp, new);
	613	fdp->fd_files[new].fileflags = oldflags & ~UF_EXCLOSE;
	614	spin_unlock(&fdp->fd_spin);
	615	fdrop(fp);
	616	*res = new;
	617
	618	/*
	619	* If we dup'd over a valid file, we now own the reference to it
	620	* and must dispose of it using closef() semantics (as if a
	621	* close() were performed on it).
	622	*/
	623	if (delfp) {
	624	if (SLIST_FIRST(&delfp->f_klist))
	625	knote_fdclose(delfp, fdp, new);
	626	closef(delfp, p);
	627	if (holdleaders) {
	628	spin_lock(&fdp->fd_spin);
	629	fdp->fd_holdleaderscount--;
	630	if (fdp->fd_holdleaderscount == 0 &&
	631	fdp->fd_holdleaderswakeup != 0) {
	632	fdp->fd_holdleaderswakeup = 0;
	633	spin_unlock(&fdp->fd_spin);
	634	wakeup(&fdp->fd_holdleaderscount);
	635	} else {
	636	spin_unlock(&fdp->fd_spin);
	637	}
	638	}
	639	}
	640	return (0);
	641	}
	642
	643	/*
	644	* If sigio is on the list associated with a process or process group,
	645	* disable signalling from the device, remove sigio from the list and
	646	* free sigio.
	647	*/
	648	void
	649	funsetown(struct sigio *sigio)
	650	{
	651	if (sigio == NULL)
	652	return;
	653	crit_enter();
	654	*(sigio->sio_myref) = NULL;
	655	crit_exit();
	656	if (sigio->sio_pgid < 0) {
	657	SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
	658	sigio, sio_pgsigio);
	659	} else /* if ((sigiop)->sio_pgid > 0) / {
	660	SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
	661	sigio, sio_pgsigio);
	662	}
	663	crfree(sigio->sio_ucred);
	664	kfree(sigio, M_SIGIO);
	665	}
	666
	/*
	 * Free a list of sigio structures.
	 *
	 * Calls funsetown() on the current list head until SLIST_FIRST()
	 * reports the list empty.
	 */
	void
	funsetownlst(struct sigiolst *sigiolst)
	{
		struct sigio *head;

		for (head = SLIST_FIRST(sigiolst); head != NULL;
		     head = SLIST_FIRST(sigiolst)) {
			funsetown(head);
		}
	}
	676
	677	/*
	678	* This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
	679	*
	680	* After permission checking, add a sigio structure to the sigio list for
	681	* the process or process group.
	682	*/
	683	int
	684	fsetown(pid_t pgid, struct sigio **sigiop)
	685	{
	686	struct proc *proc;
	687	struct pgrp *pgrp;
	688	struct sigio *sigio;
	689
	690	if (pgid == 0) {
	691	funsetown(*sigiop);
	692	return (0);
	693	}
	694	if (pgid > 0) {
	695	proc = pfind(pgid);
	696	if (proc == NULL)
	697	return (ESRCH);
	698
	699	/*
	700	* Policy - Don't allow a process to FSETOWN a process
	701	* in another session.
	702	*
	703	* Remove this test to allow maximum flexibility or
	704	* restrict FSETOWN to the current process or process
	705	* group for maximum safety.
	706	*/
	707	if (proc->p_session != curproc->p_session)
	708	return (EPERM);
	709
	710	pgrp = NULL;
	711	} else /* if (pgid < 0) */ {
	712	pgrp = pgfind(-pgid);
	713	if (pgrp == NULL)
	714	return (ESRCH);
	715
	716	/*
	717	* Policy - Don't allow a process to FSETOWN a process
	718	* in another session.
	719	*
	720	* Remove this test to allow maximum flexibility or
	721	* restrict FSETOWN to the current process or process
	722	* group for maximum safety.
	723	*/
	724	if (pgrp->pg_session != curproc->p_session)
	725	return (EPERM);
	726
	727	proc = NULL;
	728	}
	729	funsetown(*sigiop);
	730	sigio = kmalloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
	731	if (pgid > 0) {
	732	SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
	733	sigio->sio_proc = proc;
	734	} else {
	735	SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
	736	sigio->sio_pgrp = pgrp;
	737	}
	738	sigio->sio_pgid = pgid;
	739	sigio->sio_ucred = crhold(curthread->td_ucred);
	740	/* It would be convenient if p_ruid was in ucred. */
	741	sigio->sio_ruid = sigio->sio_ucred->cr_ruid;
	742	sigio->sio_myref = sigiop;
	743	crit_enter();
	744	*sigiop = sigio;
	745	crit_exit();
	746	return (0);
	747	}
	748
	749	/*
	750	* This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
	751	*/
	752	pid_t
	753	fgetown(struct sigio *sigio)
	754	{
	755	return (sigio != NULL ? sigio->sio_pgid : 0);
	756	}
	757
	758	/*
	759	* Close many file descriptors.
	760	*
	761	* MPSAFE
	762	*/
	763	int
	764	sys_closefrom(struct closefrom_args *uap)
	765	{
	766	return(kern_closefrom(uap->fd));
	767	}
	768
	769	/*
	770	* Close all file descriptors greater then or equal to fd
	771	*
	772	* MPSAFE
	773	*/
	774	int
	775	kern_closefrom(int fd)
	776	{
	777	struct thread *td = curthread;
	778	struct proc *p = td->td_proc;
	779	struct filedesc *fdp;
	780
	781	KKASSERT(p);
	782	fdp = p->p_fd;
	783
	784	if (fd < 0)
	785	return (EINVAL);
	786
	787	/*
	788	* NOTE: This function will skip unassociated descriptors and
	789	* reserved descriptors that have not yet been assigned.
	790	* fd_lastfile can change as a side effect of kern_close().
	791	*/
	792	spin_lock(&fdp->fd_spin);
	793	while (fd <= fdp->fd_lastfile) {
	794	if (fdp->fd_files[fd].fp != NULL) {
	795	spin_unlock(&fdp->fd_spin);
	796	/* ok if this races another close */
	797	if (kern_close(fd) == EINTR)
	798	return (EINTR);
	799	spin_lock(&fdp->fd_spin);
	800	}
	801	++fd;
	802	}
	803	spin_unlock(&fdp->fd_spin);
	804	return (0);
	805	}
	806
	807	/*
	808	* Close a file descriptor.
	809	*
	810	* MPSAFE
	811	*/
	812	int
	813	sys_close(struct close_args *uap)
	814	{
	815	return(kern_close(uap->fd));
	816	}
	817
	818	/*
	819	* MPALMOSTSAFE - acquires mplock around knote_fdclose() calls
	820	*/
	821	int
	822	kern_close(int fd)
	823	{
	824	struct thread *td = curthread;
	825	struct proc *p = td->td_proc;
	826	struct filedesc *fdp;
	827	struct file *fp;
	828	int error;
	829	int holdleaders;
	830
	831	KKASSERT(p);
	832	fdp = p->p_fd;
	833
	834	spin_lock(&fdp->fd_spin);
	835	if ((fp = funsetfd_locked(fdp, fd)) == NULL) {
	836	spin_unlock(&fdp->fd_spin);
	837	return (EBADF);
	838	}
	839	holdleaders = 0;
	840	if (p->p_fdtol != NULL) {
	841	/*
	842	* Ask fdfree() to sleep to ensure that all relevant
	843	* process leaders can be traversed in closef().
	844	*/
	845	fdp->fd_holdleaderscount++;
	846	holdleaders = 1;
	847	}
	848
	849	/*
	850	* we now hold the fp reference that used to be owned by the descriptor
	851	* array.
	852	*/
	853	spin_unlock(&fdp->fd_spin);
	854	if (SLIST_FIRST(&fp->f_klist))
	855	knote_fdclose(fp, fdp, fd);
	856	error = closef(fp, p);
	857	if (holdleaders) {
	858	spin_lock(&fdp->fd_spin);
	859	fdp->fd_holdleaderscount--;
	860	if (fdp->fd_holdleaderscount == 0 &&
	861	fdp->fd_holdleaderswakeup != 0) {
	862	fdp->fd_holdleaderswakeup = 0;
	863	spin_unlock(&fdp->fd_spin);
	864	wakeup(&fdp->fd_holdleaderscount);
	865	} else {
	866	spin_unlock(&fdp->fd_spin);
	867	}
	868	}
	869	return (error);
	870	}
	871
	872	/*
	873	* shutdown_args(int fd, int how)
	874	*/
	875	int
	876	kern_shutdown(int fd, int how)
	877	{
	878	struct thread *td = curthread;
	879	struct proc *p = td->td_proc;
	880	struct file *fp;
	881	int error;
	882
	883	KKASSERT(p);
	884
	885	if ((fp = holdfp(p->p_fd, fd, -1)) == NULL)
	886	return (EBADF);
	887	error = fo_shutdown(fp, how);
	888	fdrop(fp);
	889
	890	return (error);
	891	}
	892
	893	/*
	894	* MPALMOSTSAFE
	895	*/
	896	int
	897	sys_shutdown(struct shutdown_args *uap)
	898	{
	899	int error;
	900
	901	get_mplock();
	902	error = kern_shutdown(uap->s, uap->how);
	903	rel_mplock();
	904
	905	return (error);
	906	}
	907
	908	/*
	909	* MPSAFE
	910	*/
	911	int
	912	kern_fstat(int fd, struct stat *ub)
	913	{
	914	struct thread *td = curthread;
	915	struct proc *p = td->td_proc;
	916	struct file *fp;
	917	int error;
	918
	919	KKASSERT(p);
	920
	921	if ((fp = holdfp(p->p_fd, fd, -1)) == NULL)
	922	return (EBADF);
	923	error = fo_stat(fp, ub, td->td_ucred);
	924	fdrop(fp);
	925
	926	return (error);
	927	}
	928
	929	/*
	930	* Return status information about a file descriptor.
	931	*
	932	* MPSAFE
	933	*/
	934	int
	935	sys_fstat(struct fstat_args *uap)
	936	{
	937	struct stat st;
	938	int error;
	939
	940	error = kern_fstat(uap->fd, &st);
	941
	942	if (error == 0)
	943	error = copyout(&st, uap->sb, sizeof(st));
	944	return (error);
	945	}
	946
	947	/*
	948	* Return pathconf information about a file descriptor.
	949	*
	950	* MPALMOSTSAFE
	951	*/
	952	int
	953	sys_fpathconf(struct fpathconf_args *uap)
	954	{
	955	struct thread *td = curthread;
	956	struct proc *p = td->td_proc;
	957	struct file *fp;
	958	struct vnode *vp;
	959	int error = 0;
	960
	961	if ((fp = holdfp(p->p_fd, uap->fd, -1)) == NULL)
	962	return (EBADF);
	963
	964	switch (fp->f_type) {
	965	case DTYPE_PIPE:
	966	case DTYPE_SOCKET:
	967	if (uap->name != _PC_PIPE_BUF) {
	968	error = EINVAL;
	969	} else {
	970	uap->sysmsg_result = PIPE_BUF;
	971	error = 0;
	972	}
	973	break;
	974	case DTYPE_FIFO:
	975	case DTYPE_VNODE:
	976	vp = (struct vnode *)fp->f_data;
	977	get_mplock();
	978	error = VOP_PATHCONF(vp, uap->name, &uap->sysmsg_reg);
	979	rel_mplock();
	980	break;
	981	default:
	982	error = EOPNOTSUPP;
	983	break;
	984	}
	985	fdrop(fp);
	986	return(error);
	987	}
	988
	/*
	 * Read-only debug counter, incremented by fdgrow_locked() each time
	 * it installs a larger descriptor table.
	 */
	989	static int fdexpand;
	990	SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0,
	991	"Number of times a file table has been expanded");
	992
	993	/*
	994	* Grow the file table so it can hold through descriptor (want).
	995	*
	996	* The fdp's spinlock must be held exclusively on entry and may be held
	997	* exclusively on return. The spinlock may be cycled by the routine.
	998	*
	999	* MPSAFE
	1000	*/
	1001	static void
	1002	fdgrow_locked(struct filedesc *fdp, int want)
	1003	{
	1004	struct fdnode *newfiles;
	1005	struct fdnode *oldfiles;
	1006	int nf, extra;
	1007
	1008	nf = fdp->fd_nfiles;
	1009	do {
	1010	/* nf has to be of the form 2^n - 1 */
	1011	nf = 2 * nf + 1;
	1012	} while (nf <= want);
	1013
	1014	spin_unlock(&fdp->fd_spin);
	1015	newfiles = kmalloc(nf * sizeof(struct fdnode), M_FILEDESC, M_WAITOK);
	1016	spin_lock(&fdp->fd_spin);
	1017
	1018	/*
	1019	* We could have raced another extend while we were not holding
	1020	* the spinlock.
	1021	*/
	1022	if (fdp->fd_nfiles >= nf) {
	1023	spin_unlock(&fdp->fd_spin);
	1024	kfree(newfiles, M_FILEDESC);
	1025	spin_lock(&fdp->fd_spin);
	1026	return;
	1027	}
	1028	/*
	1029	* Copy the existing ofile and ofileflags arrays
	1030	* and zero the new portion of each array.
	1031	*/
	1032	extra = nf - fdp->fd_nfiles;
	1033	bcopy(fdp->fd_files, newfiles, fdp->fd_nfiles * sizeof(struct fdnode));
	1034	bzero(&newfiles[fdp->fd_nfiles], extra * sizeof(struct fdnode));
	1035
	1036	oldfiles = fdp->fd_files;
	1037	fdp->fd_files = newfiles;
	1038	fdp->fd_nfiles = nf;
	1039
	1040	if (oldfiles != fdp->fd_builtin_files) {
	1041	spin_unlock(&fdp->fd_spin);
	1042	kfree(oldfiles, M_FILEDESC);
	1043	spin_lock(&fdp->fd_spin);
	1044	}
	1045	fdexpand++;
	1046	}
	1047
	/*
	 * Number of nodes in right subtree, including the root.
	 *
	 * Computed as n XOR right_ancestor(n), i.e. n ^ (n | (n + 1)).
	 * Examples: 0 -> 1, 1 -> 2, 2 -> 1, 3 -> 4, 7 -> 8.
	 */
	static __inline int
	right_subtree_size(int n)
	{
		return (n ^ (n | (n + 1)));
	}
	1056
	/*
	 * Bigger ancestor.
	 *
	 * Sets the lowest clear bit of n: n | (n + 1).
	 * Examples: 0 -> 1, 1 -> 3, 2 -> 3, 4 -> 5, 6 -> 7.
	 */
	static __inline int
	right_ancestor(int n)
	{
		return (n | (n + 1));
	}
	1065
	/*
	 * Smaller ancestor.
	 *
	 * Clears the trailing run of one bits in n and subtracts one.  The
	 * result is -1 when n has no smaller ancestor (e.g. n = 0, 1, 3, 7).
	 * Examples: 2 -> 1, 4 -> 3, 5 -> 3, 6 -> 5, 8 -> 7.
	 */
	static __inline int
	left_ancestor(int n)
	{
		int cleared = n & (n + 1);

		return (cleared - 1);
	}
	1074
	1075	/*
	1076	* Traverse the in-place binary tree buttom-up adjusting the allocation
	1077	* count so scans can determine where free descriptors are located.
	1078	*
	1079	* MPSAFE - caller must be holding an exclusive spinlock on fdp
	1080	*/
	1081	static
	1082	void
	1083	fdreserve_locked(struct filedesc *fdp, int fd, int incr)
	1084	{
	1085	while (fd >= 0) {
	1086	fdp->fd_files[fd].allocated += incr;
	1087	KKASSERT(fdp->fd_files[fd].allocated >= 0);
	1088	fd = left_ancestor(fd);
	1089	}
	1090	}
	1091
	1092	/*
	1093	* Reserve a file descriptor for the process. If no error occurs, the
	1094	* caller MUST at some point call fsetfd() or assign a file pointer
	1095	* or dispose of the reservation.
	1096	*
	1097	* MPSAFE
	1098	*/
	1099	int
	1100	fdalloc(struct proc p, int want, int result)
	1101	{
	1102	struct filedesc *fdp = p->p_fd;
	1103	struct uidinfo *uip;
	1104	int fd, rsize, rsum, node, lim;
	1105
	1106	/*
	1107	* Check dtable size limit
	1108	*/
	1109	spin_lock(&p->p_limit->p_spin);
	1110	if (p->p_rlimit[RLIMIT_NOFILE].rlim_cur > INT_MAX)
	1111	lim = INT_MAX;
	1112	else
	1113	lim = (int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur;
	1114	spin_unlock(&p->p_limit->p_spin);
	1115
	1116	if (lim > maxfilesperproc)
	1117	lim = maxfilesperproc;
	1118	if (lim < minfilesperproc)
	1119	lim = minfilesperproc;
	1120	if (want >= lim)
	1121	return (EMFILE);
	1122
	1123	/*
	1124	* Check that the user has not run out of descriptors (non-root only).
	1125	* As a safety measure the dtable is allowed to have at least
	1126	* minfilesperproc open fds regardless of the maxfilesperuser limit.
	1127	*/
	1128	if (p->p_ucred->cr_uid && fdp->fd_nfiles >= minfilesperproc) {
	1129	uip = p->p_ucred->cr_uidinfo;
	1130	if (uip->ui_openfiles > maxfilesperuser) {
	1131	krateprintf(&krate_uidinfo,
	1132	"Warning: user %d pid %d (%s) ran out of "
	1133	"file descriptors (%d/%d)\n",
	1134	p->p_ucred->cr_uid, (int)p->p_pid,
	1135	p->p_comm,
	1136	uip->ui_openfiles, maxfilesperuser);
	1137	return(ENFILE);
	1138	}
	1139	}
	1140
	1141	/*
	1142	* Grow the dtable if necessary
	1143	*/
	1144	spin_lock(&fdp->fd_spin);
	1145	if (want >= fdp->fd_nfiles)
	1146	fdgrow_locked(fdp, want);
	1147
	1148	/*
	1149	* Search for a free descriptor starting at the higher
	1150	* of want or fd_freefile. If that fails, consider
	1151	* expanding the ofile array.
	1152	*
	1153	* NOTE! the 'allocated' field is a cumulative recursive allocation
	1154	* count. If we happen to see a value of 0 then we can shortcut
	1155	* our search. Otherwise we run through through the tree going
	1156	* down branches we know have free descriptor(s) until we hit a
	1157	* leaf node. The leaf node will be free but will not necessarily
	1158	* have an allocated field of 0.
	1159	*/
	1160	retry:
	1161	/* move up the tree looking for a subtree with a free node */
	1162	for (fd = max(want, fdp->fd_freefile); fd < min(fdp->fd_nfiles, lim);
	1163	fd = right_ancestor(fd)) {
	1164	if (fdp->fd_files[fd].allocated == 0)
	1165	goto found;
	1166
	1167	rsize = right_subtree_size(fd);
	1168	if (fdp->fd_files[fd].allocated == rsize)
	1169	continue; /* right subtree full */
	1170
	1171	/*
	1172	* Free fd is in the right subtree of the tree rooted at fd.
	1173	* Call that subtree R. Look for the smallest (leftmost)
	1174	* subtree of R with an unallocated fd: continue moving
	1175	* down the left branch until encountering a full left
	1176	* subtree, then move to the right.
	1177	*/
	1178	for (rsum = 0, rsize /= 2; rsize > 0; rsize /= 2) {
	1179	node = fd + rsize;
	1180	rsum += fdp->fd_files[node].allocated;
	1181	if (fdp->fd_files[fd].allocated == rsum + rsize) {
	1182	fd = node; /* move to the right */
	1183	if (fdp->fd_files[node].allocated == 0)
	1184	goto found;
	1185	rsum = 0;
	1186	}
	1187	}
	1188	goto found;
	1189	}
	1190
	1191	/*
	1192	* No space in current array. Expand?
	1193	*/
	1194	if (fdp->fd_nfiles >= lim) {
	1195	spin_unlock(&fdp->fd_spin);
	1196	return (EMFILE);
	1197	}
	1198	fdgrow_locked(fdp, want);
	1199	goto retry;
	1200
	1201	found:
	1202	KKASSERT(fd < fdp->fd_nfiles);
	1203	if (fd > fdp->fd_lastfile)
	1204	fdp->fd_lastfile = fd;
	1205	if (want <= fdp->fd_freefile)
	1206	fdp->fd_freefile = fd;
	1207	*result = fd;
	1208	KKASSERT(fdp->fd_files[fd].fp == NULL);
	1209	KKASSERT(fdp->fd_files[fd].reserved == 0);
	1210	fdp->fd_files[fd].fileflags = 0;
	1211	fdp->fd_files[fd].reserved = 1;
	1212	fdreserve_locked(fdp, fd, 1);
	1213	spin_unlock(&fdp->fd_spin);
	1214	return (0);
	1215	}
	1216
	1217	/*
	1218	* Check to see whether n user file descriptors
	1219	* are available to the process p.
	1220	*
	1221	* MPSAFE
	1222	*/
	1223	int
	1224	fdavail(struct proc *p, int n)
	1225	{
	1226	struct filedesc *fdp = p->p_fd;
	1227	struct fdnode *fdnode;
	1228	int i, lim, last;
	1229
	1230	spin_lock(&p->p_limit->p_spin);
	1231	if (p->p_rlimit[RLIMIT_NOFILE].rlim_cur > INT_MAX)
	1232	lim = INT_MAX;
	1233	else
	1234	lim = (int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur;
	1235	spin_unlock(&p->p_limit->p_spin);
	1236
	1237	if (lim > maxfilesperproc)
	1238	lim = maxfilesperproc;
	1239	if (lim < minfilesperproc)
	1240	lim = minfilesperproc;
	1241
	1242	spin_lock(&fdp->fd_spin);
	1243	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) {
	1244	spin_unlock(&fdp->fd_spin);
	1245	return (1);
	1246	}
	1247	last = min(fdp->fd_nfiles, lim);
	1248	fdnode = &fdp->fd_files[fdp->fd_freefile];
	1249	for (i = last - fdp->fd_freefile; --i >= 0; ++fdnode) {
	1250	if (fdnode->fp == NULL && --n <= 0) {
	1251	spin_unlock(&fdp->fd_spin);
	1252	return (1);
	1253	}
	1254	}
	1255	spin_unlock(&fdp->fd_spin);
	1256	return (0);
	1257	}
	1258
	1259	/*
	1260	* Revoke open descriptors referencing (f_data, f_type)
	1261	*
	1262	* Any revoke executed within a prison is only able to
	1263	* revoke descriptors for processes within that prison.
	1264	*
	1265	* Returns 0 on success or an error code.
	1266	*/
	1267	struct fdrevoke_info {
	1268	void *data;
	1269	short type;
	1270	short unused;
	1271	int count;
	1272	int intransit;
	1273	struct ucred *cred;
	1274	struct file *nfp;
	1275	};
	1276
	1277	static int fdrevoke_check_callback(struct file fp, void vinfo);
	1278	static int fdrevoke_proc_callback(struct proc p, void vinfo);
	1279
	1280	int
	1281	fdrevoke(void f_data, short f_type, struct ucred cred)
	1282	{
	1283	struct fdrevoke_info info;
	1284	int error;
	1285
	1286	bzero(&info, sizeof(info));
	1287	info.data = f_data;
	1288	info.type = f_type;
	1289	info.cred = cred;
	1290	error = falloc(NULL, &info.nfp, NULL);
	1291	if (error)
	1292	return (error);
	1293
	1294	/*
	1295	* Scan the file pointer table once. dups do not dup file pointers,
	1296	* only descriptors, so there is no leak. Set FREVOKED on the fps
	1297	* being revoked.
	1298	*/
	1299	allfiles_scan_exclusive(fdrevoke_check_callback, &info);
	1300
	1301	/*
	1302	* If any fps were marked track down the related descriptors
	1303	* and close them. Any dup()s at this point will notice
	1304	* the FREVOKED already set in the fp and do the right thing.
	1305	*
	1306	* Any fps with non-zero msgcounts (aka sent over a unix-domain
	1307	* socket) bumped the intransit counter and will require a
	1308	* scan. Races against fps leaving the socket are closed by
	1309	* the socket code checking for FREVOKED.
	1310	*/
	1311	if (info.count)
	1312	allproc_scan(fdrevoke_proc_callback, &info);
	1313	if (info.intransit)
	1314	unp_revoke_gc(info.nfp);
	1315	fdrop(info.nfp);
	1316	return(0);
	1317	}
	1318
	1319	/*
	1320	* Locate matching file pointers directly.
	1321	*
	1322	* WARNING: allfiles_scan_exclusive() holds a spinlock through these calls!
	1323	*/
	1324	static int
	1325	fdrevoke_check_callback(struct file fp, void vinfo)
	1326	{
	1327	struct fdrevoke_info *info = vinfo;
	1328
	1329	/*
	1330	* File pointers already flagged for revokation are skipped.
	1331	*/
	1332	if (fp->f_flag & FREVOKED)
	1333	return(0);
	1334
	1335	/*
	1336	* If revoking from a prison file pointers created outside of
	1337	* that prison, or file pointers without creds, cannot be revoked.
	1338	*/
	1339	if (info->cred->cr_prison &&
	1340	(fp->f_cred == NULL \|\|
	1341	info->cred->cr_prison != fp->f_cred->cr_prison)) {
	1342	return(0);
	1343	}
	1344
	1345	/*
	1346	* If the file pointer matches then mark it for revocation. The
	1347	* flag is currently only used by unp_revoke_gc().
	1348	*
	1349	* info->count is a heuristic and can race in a SMP environment.
	1350	*/
	1351	if (info->data == fp->f_data && info->type == fp->f_type) {
	1352	atomic_set_int(&fp->f_flag, FREVOKED);
	1353	info->count += fp->f_count;
	1354	if (fp->f_msgcount)
	1355	++info->intransit;
	1356	}
	1357	return(0);
	1358	}
	1359
	1360	/*
	1361	* Locate matching file pointers via process descriptor tables.
	1362	*/
	1363	static int
	1364	fdrevoke_proc_callback(struct proc p, void vinfo)
	1365	{
	1366	struct fdrevoke_info *info = vinfo;
	1367	struct filedesc *fdp;
	1368	struct file *fp;
	1369	int n;
	1370
	1371	if (p->p_stat == SIDL \|\| p->p_stat == SZOMB)
	1372	return(0);
	1373	if (info->cred->cr_prison &&
	1374	info->cred->cr_prison != p->p_ucred->cr_prison) {
	1375	return(0);
	1376	}
	1377
	1378	/*
	1379	* If the controlling terminal of the process matches the
	1380	* vnode being revoked we clear the controlling terminal.
	1381	*
	1382	* The normal spec_close() may not catch this because it
	1383	* uses curproc instead of p.
	1384	*/
	1385	if (p->p_session && info->type == DTYPE_VNODE &&
	1386	info->data == p->p_session->s_ttyvp) {
	1387	p->p_session->s_ttyvp = NULL;
	1388	vrele(info->data);
	1389	}
	1390
	1391	/*
	1392	* Softref the fdp to prevent it from being destroyed
	1393	*/
	1394	spin_lock(&p->p_spin);
	1395	if ((fdp = p->p_fd) == NULL) {
	1396	spin_unlock(&p->p_spin);
	1397	return(0);
	1398	}
	1399	atomic_add_int(&fdp->fd_softrefs, 1);
	1400	spin_unlock(&p->p_spin);
	1401
	1402	/*
	1403	* Locate and close any matching file descriptors.
	1404	*/
	1405	spin_lock(&fdp->fd_spin);
	1406	for (n = 0; n < fdp->fd_nfiles; ++n) {
	1407	if ((fp = fdp->fd_files[n].fp) == NULL)
	1408	continue;
	1409	if (fp->f_flag & FREVOKED) {
	1410	fhold(info->nfp);
	1411	fdp->fd_files[n].fp = info->nfp;
	1412	spin_unlock(&fdp->fd_spin);
	1413	knote_fdclose(fp, fdp, n); /* XXX */
	1414	closef(fp, p);
	1415	spin_lock(&fdp->fd_spin);
	1416	--info->count;
	1417	}
	1418	}
	1419	spin_unlock(&fdp->fd_spin);
	1420	atomic_subtract_int(&fdp->fd_softrefs, 1);
	1421	return(0);
	1422	}
	1423
	1424	/*
	1425	* falloc:
	1426	* Create a new open file structure and reserve a file decriptor
	1427	* for the process that refers to it.
	1428	*
	1429	* Root creds are checked using lp, or assumed if lp is NULL. If
	1430	* resultfd is non-NULL then lp must also be non-NULL. No file
	1431	* descriptor is reserved (and no process context is needed) if
	1432	* resultfd is NULL.
	1433	*
	1434	* A file pointer with a refcount of 1 is returned. Note that the
	1435	* file pointer is NOT associated with the descriptor. If falloc
	1436	* returns success, fsetfd() MUST be called to either associate the
	1437	* file pointer or clear the reservation.
	1438	*
	1439	* MPSAFE
	1440	*/
	1441	int
	1442	falloc(struct lwp lp, struct file resultfp, int resultfd)
	1443	{
	1444	static struct timeval lastfail;
	1445	static int curfail;
	1446	struct file *fp;
	1447	struct ucred *cred = lp ? lp->lwp_thread->td_ucred : proc0.p_ucred;
	1448	int error;
	1449
	1450	fp = NULL;
	1451
	1452	/*
	1453	* Handle filetable full issues and root overfill.
	1454	*/
	1455	if (nfiles >= maxfiles - maxfilesrootres &&
	1456	(cred->cr_ruid != 0 \|\| nfiles >= maxfiles)) {
	1457	if (ppsratecheck(&lastfail, &curfail, 1)) {
	1458	kprintf("kern.maxfiles limit exceeded by uid %d, "
	1459	"please see tuning(7).\n",
	1460	cred->cr_ruid);
	1461	}
	1462	error = ENFILE;
	1463	goto done;
	1464	}
	1465
	1466	/*
	1467	* Allocate a new file descriptor.
	1468	*/
	1469	fp = kmalloc(sizeof(struct file), M_FILE, M_WAITOK \| M_ZERO);
	1470	spin_init(&fp->f_spin);
	1471	SLIST_INIT(&fp->f_klist);
	1472	fp->f_count = 1;
	1473	fp->f_ops = &badfileops;
	1474	fp->f_seqcount = 1;
	1475	fsetcred(fp, cred);
	1476	spin_lock(&filehead_spin);
	1477	nfiles++;
	1478	LIST_INSERT_HEAD(&filehead, fp, f_list);
	1479	spin_unlock(&filehead_spin);
	1480	if (resultfd) {
	1481	if ((error = fdalloc(lp->lwp_proc, 0, resultfd)) != 0) {
	1482	fdrop(fp);
	1483	fp = NULL;
	1484	}
	1485	} else {
	1486	error = 0;
	1487	}
	1488	done:
	1489	*resultfp = fp;
	1490	return (error);
	1491	}
	1492
	1493	/*
	1494	* Check for races against a file descriptor by determining that the
	1495	* file pointer is still associated with the specified file descriptor,
	1496	* and a close is not currently in progress.
	1497	*
	1498	* MPSAFE
	1499	*/
	1500	int
	1501	checkfdclosed(struct filedesc fdp, int fd, struct file fp)
	1502	{
	1503	int error;
	1504
	1505	spin_lock(&fdp->fd_spin);
	1506	if ((unsigned)fd >= fdp->fd_nfiles \|\| fp != fdp->fd_files[fd].fp)
	1507	error = EBADF;
	1508	else
	1509	error = 0;
	1510	spin_unlock(&fdp->fd_spin);
	1511	return (error);
	1512	}
	1513
	1514	/*
	1515	* Associate a file pointer with a previously reserved file descriptor.
	1516	* This function always succeeds.
	1517	*
	1518	* If fp is NULL, the file descriptor is returned to the pool.
	1519	*/
	1520
	1521	/*
	1522	* MPSAFE (exclusive spinlock must be held on call)
	1523	*/
	1524	static void
	1525	fsetfd_locked(struct filedesc fdp, struct file fp, int fd)
	1526	{
	1527	KKASSERT((unsigned)fd < fdp->fd_nfiles);
	1528	KKASSERT(fdp->fd_files[fd].reserved != 0);
	1529	if (fp) {
	1530	fhold(fp);
	1531	fdp->fd_files[fd].fp = fp;
	1532	fdp->fd_files[fd].reserved = 0;
	1533	} else {
	1534	fdp->fd_files[fd].reserved = 0;
	1535	fdreserve_locked(fdp, fd, -1);
	1536	fdfixup_locked(fdp, fd);
	1537	}
	1538	}
	1539
	1540	/*
	1541	* MPSAFE
	1542	*/
	1543	void
	1544	fsetfd(struct filedesc fdp, struct file fp, int fd)
	1545	{
	1546	spin_lock(&fdp->fd_spin);
	1547	fsetfd_locked(fdp, fp, fd);
	1548	spin_unlock(&fdp->fd_spin);
	1549	}
	1550
	1551	/*
	1552	* MPSAFE (exclusive spinlock must be held on call)
	1553	*/
	1554	static
	1555	struct file *
	1556	funsetfd_locked(struct filedesc *fdp, int fd)
	1557	{
	1558	struct file *fp;
	1559
	1560	if ((unsigned)fd >= fdp->fd_nfiles)
	1561	return (NULL);
	1562	if ((fp = fdp->fd_files[fd].fp) == NULL)
	1563	return (NULL);
	1564	fdp->fd_files[fd].fp = NULL;
	1565	fdp->fd_files[fd].fileflags = 0;
	1566
	1567	fdreserve_locked(fdp, fd, -1);
	1568	fdfixup_locked(fdp, fd);
	1569	return(fp);
	1570	}
	1571
	1572	/*
	1573	* MPSAFE
	1574	*/
	1575	int
	1576	fgetfdflags(struct filedesc fdp, int fd, int flagsp)
	1577	{
	1578	int error;
	1579
	1580	spin_lock(&fdp->fd_spin);
	1581	if (((u_int)fd) >= fdp->fd_nfiles) {
	1582	error = EBADF;
	1583	} else if (fdp->fd_files[fd].fp == NULL) {
	1584	error = EBADF;
	1585	} else {
	1586	*flagsp = fdp->fd_files[fd].fileflags;
	1587	error = 0;
	1588	}
	1589	spin_unlock(&fdp->fd_spin);
	1590	return (error);
	1591	}
	1592
	1593	/*
	1594	* MPSAFE
	1595	*/
	1596	int
	1597	fsetfdflags(struct filedesc *fdp, int fd, int add_flags)
	1598	{
	1599	int error;
	1600
	1601	spin_lock(&fdp->fd_spin);
	1602	if (((u_int)fd) >= fdp->fd_nfiles) {
	1603	error = EBADF;
	1604	} else if (fdp->fd_files[fd].fp == NULL) {
	1605	error = EBADF;
	1606	} else {
	1607	fdp->fd_files[fd].fileflags \|= add_flags;
	1608	error = 0;
	1609	}
	1610	spin_unlock(&fdp->fd_spin);
	1611	return (error);
	1612	}
	1613
	1614	/*
	1615	* MPSAFE
	1616	*/
	1617	int
	1618	fclrfdflags(struct filedesc *fdp, int fd, int rem_flags)
	1619	{
	1620	int error;
	1621
	1622	spin_lock(&fdp->fd_spin);
	1623	if (((u_int)fd) >= fdp->fd_nfiles) {
	1624	error = EBADF;
	1625	} else if (fdp->fd_files[fd].fp == NULL) {
	1626	error = EBADF;
	1627	} else {
	1628	fdp->fd_files[fd].fileflags &= ~rem_flags;
	1629	error = 0;
	1630	}
	1631	spin_unlock(&fdp->fd_spin);
	1632	return (error);
	1633	}
	1634
	1635	/*
	1636	* Set/Change/Clear the creds for a fp and synchronize the uidinfo.
	1637	*/
	1638	void
	1639	fsetcred(struct file fp, struct ucred ncr)
	1640	{
	1641	struct ucred *ocr;
	1642	struct uidinfo *uip;
	1643
	1644	ocr = fp->f_cred;
	1645	if (ocr == NULL \|\| ncr == NULL \|\| ocr->cr_uidinfo != ncr->cr_uidinfo) {
	1646	if (ocr) {
	1647	uip = ocr->cr_uidinfo;
	1648	atomic_add_int(&uip->ui_openfiles, -1);
	1649	}
	1650	if (ncr) {
	1651	uip = ncr->cr_uidinfo;
	1652	atomic_add_int(&uip->ui_openfiles, 1);
	1653	}
	1654	}
	1655	if (ncr)
	1656	crhold(ncr);
	1657	fp->f_cred = ncr;
	1658	if (ocr)
	1659	crfree(ocr);
	1660	}
	1661
	1662	/*
	1663	* Free a file descriptor.
	1664	*/
	1665	static
	1666	void
	1667	ffree(struct file *fp)
	1668	{
	1669	KASSERT((fp->f_count == 0), ("ffree: fp_fcount not 0!"));
	1670	spin_lock(&filehead_spin);
	1671	LIST_REMOVE(fp, f_list);
	1672	nfiles--;
	1673	spin_unlock(&filehead_spin);
	1674	fsetcred(fp, NULL);
	1675	if (fp->f_nchandle.ncp)
	1676	cache_drop(&fp->f_nchandle);
	1677	kfree(fp, M_FILE);
	1678	}
	1679
	1680	/*
	1681	* called from init_main, initialize filedesc0 for proc0.
	1682	*/
	1683	void
	1684	fdinit_bootstrap(struct proc p0, struct filedesc fdp0, int cmask)
	1685	{
	1686	p0->p_fd = fdp0;
	1687	p0->p_fdtol = NULL;
	1688	fdp0->fd_refcnt = 1;
	1689	fdp0->fd_cmask = cmask;
	1690	fdp0->fd_files = fdp0->fd_builtin_files;
	1691	fdp0->fd_nfiles = NDFILE;
	1692	fdp0->fd_lastfile = -1;
	1693	spin_init(&fdp0->fd_spin);
	1694	}
	1695
	1696	/*
	1697	* Build a new filedesc structure.
	1698	*
	1699	* NOT MPSAFE (vref)
	1700	*/
	1701	struct filedesc *
	1702	fdinit(struct proc *p)
	1703	{
	1704	struct filedesc *newfdp;
	1705	struct filedesc *fdp = p->p_fd;
	1706
	1707	newfdp = kmalloc(sizeof(struct filedesc), M_FILEDESC, M_WAITOK\|M_ZERO);
	1708	spin_lock(&fdp->fd_spin);
	1709	if (fdp->fd_cdir) {
	1710	newfdp->fd_cdir = fdp->fd_cdir;
	1711	vref(newfdp->fd_cdir);
	1712	cache_copy(&fdp->fd_ncdir, &newfdp->fd_ncdir);
	1713	}
	1714
	1715	/*
	1716	* rdir may not be set in e.g. proc0 or anything vm_fork'd off of
	1717	* proc0, but should unconditionally exist in other processes.
	1718	*/
	1719	if (fdp->fd_rdir) {
	1720	newfdp->fd_rdir = fdp->fd_rdir;
	1721	vref(newfdp->fd_rdir);
	1722	cache_copy(&fdp->fd_nrdir, &newfdp->fd_nrdir);
	1723	}
	1724	if (fdp->fd_jdir) {
	1725	newfdp->fd_jdir = fdp->fd_jdir;
	1726	vref(newfdp->fd_jdir);
	1727	cache_copy(&fdp->fd_njdir, &newfdp->fd_njdir);
	1728	}
	1729	spin_unlock(&fdp->fd_spin);
	1730
	1731	/* Create the file descriptor table. */
	1732	newfdp->fd_refcnt = 1;
	1733	newfdp->fd_cmask = cmask;
	1734	newfdp->fd_files = newfdp->fd_builtin_files;
	1735	newfdp->fd_nfiles = NDFILE;
	1736	newfdp->fd_lastfile = -1;
	1737	spin_init(&newfdp->fd_spin);
	1738
	1739	return (newfdp);
	1740	}
	1741
	1742	/*
	1743	* Share a filedesc structure.
	1744	*
	1745	* MPSAFE
	1746	*/
	1747	struct filedesc *
	1748	fdshare(struct proc *p)
	1749	{
	1750	struct filedesc *fdp;
	1751
	1752	fdp = p->p_fd;
	1753	spin_lock(&fdp->fd_spin);
	1754	fdp->fd_refcnt++;
	1755	spin_unlock(&fdp->fd_spin);
	1756	return (fdp);
	1757	}
	1758
	1759	/*
	1760	* Copy a filedesc structure.
	1761	*
	1762	* MPSAFE
	1763	*/
	1764	struct filedesc *
	1765	fdcopy(struct proc *p)
	1766	{
	1767	struct filedesc *fdp = p->p_fd;
	1768	struct filedesc *newfdp;
	1769	struct fdnode *fdnode;
	1770	int i;
	1771	int ni;
	1772
	1773	/*
	1774	* Certain daemons might not have file descriptors.
	1775	*/
	1776	if (fdp == NULL)
	1777	return (NULL);
	1778
	1779	/*
	1780	* Allocate the new filedesc and fd_files[] array. This can race
	1781	* with operations by other threads on the fdp so we have to be
	1782	* careful.
	1783	*/
	1784	newfdp = kmalloc(sizeof(struct filedesc), M_FILEDESC, M_WAITOK \| M_ZERO);
	1785	again:
	1786	spin_lock(&fdp->fd_spin);
	1787	if (fdp->fd_lastfile < NDFILE) {
	1788	newfdp->fd_files = newfdp->fd_builtin_files;
	1789	i = NDFILE;
	1790	} else {
	1791	/*
	1792	* We have to allocate (N^2-1) entries for our in-place
	1793	* binary tree. Allow the table to shrink.
	1794	*/
	1795	i = fdp->fd_nfiles;
	1796	ni = (i - 1) / 2;
	1797	while (ni > fdp->fd_lastfile && ni > NDFILE) {
	1798	i = ni;
	1799	ni = (i - 1) / 2;
	1800	}
	1801	spin_unlock(&fdp->fd_spin);
	1802	newfdp->fd_files = kmalloc(i * sizeof(struct fdnode),
	1803	M_FILEDESC, M_WAITOK \| M_ZERO);
	1804
	1805	/*
	1806	* Check for race, retry
	1807	*/
	1808	spin_lock(&fdp->fd_spin);
	1809	if (i <= fdp->fd_lastfile) {
	1810	spin_unlock(&fdp->fd_spin);
	1811	kfree(newfdp->fd_files, M_FILEDESC);
	1812	goto again;
	1813	}
	1814	}
	1815
	1816	/*
	1817	* Dup the remaining fields. vref() and cache_hold() can be
	1818	* safely called while holding the read spinlock on fdp.
	1819	*
	1820	* The read spinlock on fdp is still being held.
	1821	*
	1822	* NOTE: vref and cache_hold calls for the case where the vnode
	1823	* or cache entry already has at least one ref may be called
	1824	* while holding spin locks.
	1825	*/
	1826	if ((newfdp->fd_cdir = fdp->fd_cdir) != NULL) {
	1827	vref(newfdp->fd_cdir);
	1828	cache_copy(&fdp->fd_ncdir, &newfdp->fd_ncdir);
	1829	}
	1830	/*
	1831	* We must check for fd_rdir here, at least for now because
	1832	* the init process is created before we have access to the
	1833	* rootvode to take a reference to it.
	1834	*/
	1835	if ((newfdp->fd_rdir = fdp->fd_rdir) != NULL) {
	1836	vref(newfdp->fd_rdir);
	1837	cache_copy(&fdp->fd_nrdir, &newfdp->fd_nrdir);
	1838	}
	1839	if ((newfdp->fd_jdir = fdp->fd_jdir) != NULL) {
	1840	vref(newfdp->fd_jdir);
	1841	cache_copy(&fdp->fd_njdir, &newfdp->fd_njdir);
	1842	}
	1843	newfdp->fd_refcnt = 1;
	1844	newfdp->fd_nfiles = i;
	1845	newfdp->fd_lastfile = fdp->fd_lastfile;
	1846	newfdp->fd_freefile = fdp->fd_freefile;
	1847	newfdp->fd_cmask = fdp->fd_cmask;
	1848	spin_init(&newfdp->fd_spin);
	1849
	1850	/*
	1851	* Copy the descriptor table through (i). This also copies the
	1852	* allocation state. Then go through and ref the file pointers
	1853	* and clean up any KQ descriptors.
	1854	*
	1855	* kq descriptors cannot be copied. Since we haven't ref'd the
	1856	* copied files yet we can ignore the return value from funsetfd().
	1857	*
	1858	* The read spinlock on fdp is still being held.
	1859	*/
	1860	bcopy(fdp->fd_files, newfdp->fd_files, i * sizeof(struct fdnode));
	1861	for (i = 0 ; i < newfdp->fd_nfiles; ++i) {
	1862	fdnode = &newfdp->fd_files[i];
	1863	if (fdnode->reserved) {
	1864	fdreserve_locked(newfdp, i, -1);
	1865	fdnode->reserved = 0;
	1866	fdfixup_locked(newfdp, i);
	1867	} else if (fdnode->fp) {
	1868	if (fdnode->fp->f_type == DTYPE_KQUEUE) {
	1869	(void)funsetfd_locked(newfdp, i);
	1870	} else {
	1871	fhold(fdnode->fp);
	1872	}
	1873	}
	1874	}
	1875	spin_unlock(&fdp->fd_spin);
	1876	return (newfdp);
	1877	}
	1878
	1879	/*
	1880	* Release a filedesc structure.
	1881	*
	1882	* NOT MPSAFE (MPSAFE for refs > 1, but the final cleanup code is not MPSAFE)
	1883	*/
	1884	void
	1885	fdfree(struct proc p, struct filedesc repl)
	1886	{
	1887	struct filedesc *fdp;
	1888	struct fdnode *fdnode;
	1889	int i;
	1890	struct filedesc_to_leader *fdtol;
	1891	struct file *fp;
	1892	struct vnode *vp;
	1893	struct flock lf;
	1894
	1895	/*
	1896	* Certain daemons might not have file descriptors.
	1897	*/
	1898	fdp = p->p_fd;
	1899	if (fdp == NULL) {
	1900	p->p_fd = repl;
	1901	return;
	1902	}
	1903
	1904	/*
	1905	* Severe messing around to follow.
	1906	*/
	1907	spin_lock(&fdp->fd_spin);
	1908
	1909	/* Check for special need to clear POSIX style locks */
	1910	fdtol = p->p_fdtol;
	1911	if (fdtol != NULL) {
	1912	KASSERT(fdtol->fdl_refcount > 0,
	1913	("filedesc_to_refcount botch: fdl_refcount=%d",
	1914	fdtol->fdl_refcount));
	1915	if (fdtol->fdl_refcount == 1 &&
	1916	(p->p_leader->p_flag & P_ADVLOCK) != 0) {
	1917	for (i = 0; i <= fdp->fd_lastfile; ++i) {
	1918	fdnode = &fdp->fd_files[i];
	1919	if (fdnode->fp == NULL \|\|
	1920	fdnode->fp->f_type != DTYPE_VNODE) {
	1921	continue;
	1922	}
	1923	fp = fdnode->fp;
	1924	fhold(fp);
	1925	spin_unlock(&fdp->fd_spin);
	1926
	1927	lf.l_whence = SEEK_SET;
	1928	lf.l_start = 0;
	1929	lf.l_len = 0;
	1930	lf.l_type = F_UNLCK;
	1931	vp = (struct vnode *)fp->f_data;
	1932	(void) VOP_ADVLOCK(vp,
	1933	(caddr_t)p->p_leader,
	1934	F_UNLCK,
	1935	&lf,
	1936	F_POSIX);
	1937	fdrop(fp);
	1938	spin_lock(&fdp->fd_spin);
	1939	}
	1940	}
	1941	retry:
	1942	if (fdtol->fdl_refcount == 1) {
	1943	if (fdp->fd_holdleaderscount > 0 &&
	1944	(p->p_leader->p_flag & P_ADVLOCK) != 0) {
	1945	/*
	1946	* close() or do_dup() has cleared a reference
	1947	* in a shared file descriptor table.
	1948	*/
	1949	fdp->fd_holdleaderswakeup = 1;
	1950	ssleep(&fdp->fd_holdleaderscount,
	1951	&fdp->fd_spin, 0, "fdlhold", 0);
	1952	goto retry;
	1953	}
	1954	if (fdtol->fdl_holdcount > 0) {
	1955	/*
	1956	* Ensure that fdtol->fdl_leader
	1957	* remains valid in closef().
	1958	*/
	1959	fdtol->fdl_wakeup = 1;
	1960	ssleep(fdtol, &fdp->fd_spin, 0, "fdlhold", 0);
	1961	goto retry;
	1962	}
	1963	}
	1964	fdtol->fdl_refcount--;
	1965	if (fdtol->fdl_refcount == 0 &&
	1966	fdtol->fdl_holdcount == 0) {
	1967	fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
	1968	fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
	1969	} else {
	1970	fdtol = NULL;
	1971	}
	1972	p->p_fdtol = NULL;
	1973	if (fdtol != NULL) {
	1974	spin_unlock(&fdp->fd_spin);
	1975	kfree(fdtol, M_FILEDESC_TO_LEADER);
	1976	spin_lock(&fdp->fd_spin);
	1977	}
	1978	}
	1979	if (--fdp->fd_refcnt > 0) {
	1980	spin_unlock(&fdp->fd_spin);
	1981	spin_lock(&p->p_spin);
	1982	p->p_fd = repl;
	1983	spin_unlock(&p->p_spin);
	1984	return;
	1985	}
	1986
	1987	/*
	1988	* Even though we are the last reference to the structure allproc
	1989	* scans may still reference the structure. Maintain proper
	1990	* locks until we can replace p->p_fd.
	1991	*
	1992	* Also note that kqueue's closef still needs to reference the
	1993	* fdp via p->p_fd, so we have to close the descriptors before
	1994	* we replace p->p_fd.
	1995	*/
	1996	for (i = 0; i <= fdp->fd_lastfile; ++i) {
	1997	if (fdp->fd_files[i].fp) {
	1998	fp = funsetfd_locked(fdp, i);
	1999	if (fp) {
	2000	spin_unlock(&fdp->fd_spin);
	2001	if (SLIST_FIRST(&fp->f_klist))
	2002	knote_fdclose(fp, fdp, i);
	2003	closef(fp, p);
	2004	spin_lock(&fdp->fd_spin);
	2005	}
	2006	}
	2007	}
	2008	spin_unlock(&fdp->fd_spin);
	2009
	2010	/*
	2011	* Interlock against an allproc scan operations (typically frevoke).
	2012	*/
	2013	spin_lock(&p->p_spin);
	2014	p->p_fd = repl;
	2015	spin_unlock(&p->p_spin);
	2016
	2017	/*
	2018	* Wait for any softrefs to go away. This race rarely occurs so
	2019	* we can use a non-critical-path style poll/sleep loop. The
	2020	* race only occurs against allproc scans.
	2021	*
	2022	* No new softrefs can occur with the fdp disconnected from the
	2023	* process.
	2024	*/
	2025	if (fdp->fd_softrefs) {
	2026	kprintf("pid %d: Warning, fdp race avoided\n", p->p_pid);
	2027	while (fdp->fd_softrefs)
	2028	tsleep(&fdp->fd_softrefs, 0, "fdsoft", 1);
	2029	}
	2030
	2031	if (fdp->fd_files != fdp->fd_builtin_files)
	2032	kfree(fdp->fd_files, M_FILEDESC);
	2033	if (fdp->fd_cdir) {
	2034	cache_drop(&fdp->fd_ncdir);
	2035	vrele(fdp->fd_cdir);
	2036	}
	2037	if (fdp->fd_rdir) {
	2038	cache_drop(&fdp->fd_nrdir);
	2039	vrele(fdp->fd_rdir);
	2040	}
	2041	if (fdp->fd_jdir) {
	2042	cache_drop(&fdp->fd_njdir);
	2043	vrele(fdp->fd_jdir);
	2044	}
	2045	kfree(fdp, M_FILEDESC);
	2046	}
	2047
	2048	/*
	2049	* Retrieve and reference the file pointer associated with a descriptor.
	2050	*
	2051	* MPSAFE
	2052	*/
	2053	struct file *
	2054	holdfp(struct filedesc *fdp, int fd, int flag)
	2055	{
	2056	struct file* fp;
	2057
	2058	spin_lock(&fdp->fd_spin);
	2059	if (((u_int)fd) >= fdp->fd_nfiles) {
	2060	fp = NULL;
	2061	goto done;
	2062	}
	2063	if ((fp = fdp->fd_files[fd].fp) == NULL)
	2064	goto done;
	2065	if ((fp->f_flag & flag) == 0 && flag != -1) {
	2066	fp = NULL;
	2067	goto done;
	2068	}
	2069	fhold(fp);
	2070	done:
	2071	spin_unlock(&fdp->fd_spin);
	2072	return (fp);
	2073	}
	2074
	2075	/*
	2076	* holdsock() - load the struct file pointer associated
	2077	* with a socket into *fpp. If an error occurs, non-zero
	2078	* will be returned and *fpp will be set to NULL.
	2079	*
	2080	* MPSAFE
	2081	*/
	2082	int
	2083	holdsock(struct filedesc fdp, int fd, struct file *fpp)
	2084	{
	2085	struct file *fp;
	2086	int error;
	2087
	2088	spin_lock(&fdp->fd_spin);
	2089	if ((unsigned)fd >= fdp->fd_nfiles) {
	2090	error = EBADF;
	2091	fp = NULL;
	2092	goto done;
	2093	}
	2094	if ((fp = fdp->fd_files[fd].fp) == NULL) {
	2095	error = EBADF;
	2096	goto done;
	2097	}
	2098	if (fp->f_type != DTYPE_SOCKET) {
	2099	error = ENOTSOCK;
	2100	goto done;
	2101	}
	2102	fhold(fp);
	2103	error = 0;
	2104	done:
	2105	spin_unlock(&fdp->fd_spin);
	2106	*fpp = fp;
	2107	return (error);
	2108	}
	2109
	2110	/*
	2111	* Convert a user file descriptor to a held file pointer.
	2112	*
	2113	* MPSAFE
	2114	*/
	2115	int
	2116	holdvnode(struct filedesc fdp, int fd, struct file *fpp)
	2117	{
	2118	struct file *fp;
	2119	int error;
	2120
	2121	spin_lock(&fdp->fd_spin);
	2122	if ((unsigned)fd >= fdp->fd_nfiles) {
	2123	error = EBADF;
	2124	fp = NULL;
	2125	goto done;
	2126	}
	2127	if ((fp = fdp->fd_files[fd].fp) == NULL) {
	2128	error = EBADF;
	2129	goto done;
	2130	}
	2131	if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
	2132	fp = NULL;
	2133	error = EINVAL;
	2134	goto done;
	2135	}
	2136	fhold(fp);
	2137	error = 0;
	2138	done:
	2139	spin_unlock(&fdp->fd_spin);
	2140	*fpp = fp;
	2141	return (error);
	2142	}
	2143
	2144	/*
	2145	* For setugid programs, we don't want to people to use that setugidness
	2146	* to generate error messages which write to a file which otherwise would
	2147	* otherwise be off-limits to the process.
	2148	*
	2149	* This is a gross hack to plug the hole. A better solution would involve
	2150	* a special vop or other form of generalized access control mechanism. We
	2151	* go ahead and just reject all procfs file systems accesses as dangerous.
	2152	*
	2153	* Since setugidsafety calls this only for fd 0, 1 and 2, this check is
	2154	* sufficient. We also don't for check setugidness since we know we are.
	2155	*/
	2156	static int
	2157	is_unsafe(struct file *fp)
	2158	{
	2159	if (fp->f_type == DTYPE_VNODE &&
	2160	((struct vnode *)(fp->f_data))->v_tag == VT_PROCFS)
	2161	return (1);
	2162	return (0);
	2163	}
	2164
	2165	/*
	2166	* Make this setguid thing safe, if at all possible.
	2167	*
	2168	* NOT MPSAFE - scans fdp without spinlocks, calls knote_fdclose()
	2169	*/
	2170	void
	2171	setugidsafety(struct proc *p)
	2172	{
	2173	struct filedesc *fdp = p->p_fd;
	2174	int i;
	2175
	2176	/* Certain daemons might not have file descriptors. */
	2177	if (fdp == NULL)
	2178	return;
	2179
	2180	/*
	2181	* note: fdp->fd_files may be reallocated out from under us while
	2182	* we are blocked in a close. Be careful!
	2183	*/
	2184	for (i = 0; i <= fdp->fd_lastfile; i++) {
	2185	if (i > 2)
	2186	break;
	2187	if (fdp->fd_files[i].fp && is_unsafe(fdp->fd_files[i].fp)) {
	2188	struct file *fp;
	2189
	2190	/*
	2191	* NULL-out descriptor prior to close to avoid
	2192	* a race while close blocks.
	2193	*/
	2194	if ((fp = funsetfd_locked(fdp, i)) != NULL) {
	2195	knote_fdclose(fp, fdp, i);
	2196	closef(fp, p);
	2197	}
	2198	}
	2199	}
	2200	}
	2201
	2202	/*
	2203	* Close any files on exec?
	2204	*
	2205	* NOT MPSAFE - scans fdp without spinlocks, calls knote_fdclose()
	2206	*/
	2207	void
	2208	fdcloseexec(struct proc *p)
	2209	{
	2210	struct filedesc *fdp = p->p_fd;
	2211	int i;
	2212
	2213	/* Certain daemons might not have file descriptors. */
	2214	if (fdp == NULL)
	2215	return;
	2216
	2217	/*
	2218	* We cannot cache fd_files since operations may block and rip
	2219	* them out from under us.
	2220	*/
	2221	for (i = 0; i <= fdp->fd_lastfile; i++) {
	2222	if (fdp->fd_files[i].fp != NULL &&
	2223	(fdp->fd_files[i].fileflags & UF_EXCLOSE)) {
	2224	struct file *fp;
	2225
	2226	/*
	2227	* NULL-out descriptor prior to close to avoid
	2228	* a race while close blocks.
	2229	*/
	2230	if ((fp = funsetfd_locked(fdp, i)) != NULL) {
	2231	knote_fdclose(fp, fdp, i);
	2232	closef(fp, p);
	2233	}
	2234	}
	2235	}
	2236	}
	2237
	2238	/*
	2239	* It is unsafe for set[ug]id processes to be started with file
	2240	* descriptors 0..2 closed, as these descriptors are given implicit
	2241	* significance in the Standard C library. fdcheckstd() will create a
	2242	* descriptor referencing /dev/null for each of stdin, stdout, and
	2243	* stderr that is not already open.
	2244	*
	2245	* NOT MPSAFE - calls falloc, vn_open, etc
	2246	*/
	2247	int
	2248	fdcheckstd(struct lwp *lp)
	2249	{
	2250	struct nlookupdata nd;
	2251	struct filedesc *fdp;
	2252	struct file *fp;
	2253	int retval;
	2254	int i, error, flags, devnull;
	2255
	2256	fdp = lp->lwp_proc->p_fd;
	2257	if (fdp == NULL)
	2258	return (0);
	2259	devnull = -1;
	2260	error = 0;
	2261	for (i = 0; i < 3; i++) {
	2262	if (fdp->fd_files[i].fp != NULL)
	2263	continue;
	2264	if (devnull < 0) {
	2265	if ((error = falloc(lp, &fp, &devnull)) != 0)
	2266	break;
	2267
	2268	error = nlookup_init(&nd, "/dev/null", UIO_SYSSPACE,
	2269	NLC_FOLLOW\|NLC_LOCKVP);
	2270	flags = FREAD \| FWRITE;
	2271	if (error == 0)
	2272	error = vn_open(&nd, fp, flags, 0);
	2273	if (error == 0)
	2274	fsetfd(fdp, fp, devnull);
	2275	else
	2276	fsetfd(fdp, NULL, devnull);
	2277	fdrop(fp);
	2278	nlookup_done(&nd);
	2279	if (error)
	2280	break;
	2281	KKASSERT(i == devnull);
	2282	} else {
	2283	error = kern_dup(DUP_FIXED, devnull, i, &retval);
	2284	if (error != 0)
	2285	break;
	2286	}
	2287	}
	2288	return (error);
	2289	}
	2290
	2291	/*
	2292	* Internal form of close.
	2293	* Decrement reference count on file structure.
	2294	* Note: td and/or p may be NULL when closing a file
	2295	* that was being passed in a message.
	2296	*
	2297	* MPALMOSTSAFE - acquires mplock for VOP operations
	2298	*/
	2299	int
	2300	closef(struct file fp, struct proc p)
	2301	{
	2302	struct vnode *vp;
	2303	struct flock lf;
	2304	struct filedesc_to_leader *fdtol;
	2305
	2306	if (fp == NULL)
	2307	return (0);
	2308
	2309	/*
	2310	* POSIX record locking dictates that any close releases ALL
	2311	* locks owned by this process. This is handled by setting
	2312	* a flag in the unlock to free ONLY locks obeying POSIX
	2313	* semantics, and not to free BSD-style file locks.
	2314	* If the descriptor was in a message, POSIX-style locks
	2315	* aren't passed with the descriptor.
	2316	*/
	2317	if (p != NULL && fp->f_type == DTYPE_VNODE &&
	2318	(((struct vnode *)fp->f_data)->v_flag & VMAYHAVELOCKS)
	2319	) {
	2320	get_mplock();
	2321	if ((p->p_leader->p_flag & P_ADVLOCK) != 0) {
	2322	lf.l_whence = SEEK_SET;
	2323	lf.l_start = 0;
	2324	lf.l_len = 0;
	2325	lf.l_type = F_UNLCK;
	2326	vp = (struct vnode *)fp->f_data;
	2327	(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
	2328	&lf, F_POSIX);
	2329	}
	2330	fdtol = p->p_fdtol;
	2331	if (fdtol != NULL) {
	2332	/*
	2333	* Handle special case where file descriptor table
	2334	* is shared between multiple process leaders.
	2335	*/
	2336	for (fdtol = fdtol->fdl_next;
	2337	fdtol != p->p_fdtol;
	2338	fdtol = fdtol->fdl_next) {
	2339	if ((fdtol->fdl_leader->p_flag &
	2340	P_ADVLOCK) == 0)
	2341	continue;
	2342	fdtol->fdl_holdcount++;
	2343	lf.l_whence = SEEK_SET;
	2344	lf.l_start = 0;
	2345	lf.l_len = 0;
	2346	lf.l_type = F_UNLCK;
	2347	vp = (struct vnode *)fp->f_data;
	2348	(void) VOP_ADVLOCK(vp,
	2349	(caddr_t)fdtol->fdl_leader,
	2350	F_UNLCK, &lf, F_POSIX);
	2351	fdtol->fdl_holdcount--;
	2352	if (fdtol->fdl_holdcount == 0 &&
	2353	fdtol->fdl_wakeup != 0) {
	2354	fdtol->fdl_wakeup = 0;
	2355	wakeup(fdtol);
	2356	}
	2357	}
	2358	}
	2359	rel_mplock();
	2360	}
	2361	return (fdrop(fp));
	2362	}
	2363
	2364	/*
	2365	* MPSAFE
	2366	*
	2367	* fhold() can only be called if f_count is already at least 1 (i.e. the
	2368	* caller of fhold() already has a reference to the file pointer in some
	2369	* manner or other).
	2370	*
	2371	* f_count is not spin-locked. Instead, atomic ops are used for
	2372	* incrementing, decrementing, and handling the 1->0 transition.
	2373	*/
	2374	void
	2375	fhold(struct file *fp)
	2376	{
	2377	atomic_add_int(&fp->f_count, 1);
	2378	}
	2379
	2380	/*
	2381	* fdrop() - drop a reference to a descriptor
	2382	*
	2383	* MPALMOSTSAFE - acquires mplock for final close sequence
	2384	*/
	2385	int
	2386	fdrop(struct file *fp)
	2387	{
	2388	struct flock lf;
	2389	struct vnode *vp;
	2390	int error;
	2391
	2392	/*
	2393	* A combined fetch and subtract is needed to properly detect
	2394	* 1->0 transitions, otherwise two cpus dropping from a ref
	2395	* count of 2 might both try to run the 1->0 code.
	2396	*/
	2397	if (atomic_fetchadd_int(&fp->f_count, -1) > 1)
	2398	return (0);
	2399
	2400	KKASSERT(SLIST_FIRST(&fp->f_klist) == NULL);
	2401	get_mplock();
	2402
	2403	/*
	2404	* The last reference has gone away, we own the fp structure free
	2405	* and clear.
	2406	*/
	2407	if (fp->f_count < 0)
	2408	panic("fdrop: count < 0");
	2409	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE &&
	2410	(((struct vnode *)fp->f_data)->v_flag & VMAYHAVELOCKS)
	2411	) {
	2412	lf.l_whence = SEEK_SET;
	2413	lf.l_start = 0;
	2414	lf.l_len = 0;
	2415	lf.l_type = F_UNLCK;
	2416	vp = (struct vnode *)fp->f_data;
	2417	(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0);
	2418	}
	2419	if (fp->f_ops != &badfileops)
	2420	error = fo_close(fp);
	2421	else
	2422	error = 0;
	2423	ffree(fp);
	2424	rel_mplock();
	2425	return (error);
	2426	}
	2427
	2428	/*
	2429	* Apply an advisory lock on a file descriptor.
	2430	*
	2431	* Just attempt to get a record lock of the requested type on
	2432	* the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
	2433	*
	2434	* MPALMOSTSAFE
	2435	*/
	2436	int
	2437	sys_flock(struct flock_args *uap)
	2438	{
	2439	struct proc *p = curproc;
	2440	struct file *fp;
	2441	struct vnode *vp;
	2442	struct flock lf;
	2443	int error;
	2444
	2445	if ((fp = holdfp(p->p_fd, uap->fd, -1)) == NULL)
	2446	return (EBADF);
	2447	get_mplock();
	2448	if (fp->f_type != DTYPE_VNODE) {
	2449	error = EOPNOTSUPP;
	2450	goto done;
	2451	}
	2452	vp = (struct vnode *)fp->f_data;
	2453	lf.l_whence = SEEK_SET;
	2454	lf.l_start = 0;
	2455	lf.l_len = 0;
	2456	if (uap->how & LOCK_UN) {
	2457	lf.l_type = F_UNLCK;
	2458	fp->f_flag &= ~FHASLOCK;
	2459	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0);
	2460	goto done;
	2461	}
	2462	if (uap->how & LOCK_EX)
	2463	lf.l_type = F_WRLCK;
	2464	else if (uap->how & LOCK_SH)
	2465	lf.l_type = F_RDLCK;
	2466	else {
	2467	error = EBADF;
	2468	goto done;
	2469	}
	2470	fp->f_flag \|= FHASLOCK;
	2471	if (uap->how & LOCK_NB)
	2472	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 0);
	2473	else
	2474	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_WAIT);
	2475	done:
	2476	rel_mplock();
	2477	fdrop(fp);
	2478	return (error);
	2479	}
	2480
	2481	/*
	2482	* File Descriptor pseudo-device driver (/dev/fd/).
	2483	*
	2484	* Opening minor device N dup()s the file (if any) connected to file
	2485	* descriptor N belonging to the calling process. Note that this driver
	2486	* consists of only the ``open()'' routine, because all subsequent
	2487	* references to this file will be direct to the other driver.
	2488	*/
	2489	static int
	2490	fdopen(struct dev_open_args *ap)
	2491	{
	2492	thread_t td = curthread;
	2493
	2494	KKASSERT(td->td_lwp != NULL);
	2495
	2496	/*
	2497	* XXX Kludge: set curlwp->lwp_dupfd to contain the value of the
	2498	* the file descriptor being sought for duplication. The error
	2499	* return ensures that the vnode for this device will be released
	2500	* by vn_open. Open will detect this special error and take the
	2501	* actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
	2502	* will simply report the error.
	2503	*/
	2504	td->td_lwp->lwp_dupfd = minor(ap->a_head.a_dev);
	2505	return (ENODEV);
	2506	}
	2507
	2508	/*
	2509	* The caller has reserved the file descriptor dfd for us. On success we
	2510	* must fsetfd() it. On failure the caller will clean it up.
	2511	*
	2512	* MPSAFE
	2513	*/
	2514	int
	2515	dupfdopen(struct filedesc *fdp, int dfd, int sfd, int mode, int error)
	2516	{
	2517	struct file *wfp;
	2518	struct file *xfp;
	2519	int werror;
	2520
	2521	if ((wfp = holdfp(fdp, sfd, -1)) == NULL)
	2522	return (EBADF);
	2523
	2524	/*
	2525	* Close a revoke/dup race. Duping a descriptor marked as revoked
	2526	* will dup a dummy descriptor instead of the real one.
	2527	*/
	2528	if (wfp->f_flag & FREVOKED) {
	2529	kprintf("Warning: attempt to dup() a revoked descriptor\n");
	2530	fdrop(wfp);
	2531	wfp = NULL;
	2532	werror = falloc(NULL, &wfp, NULL);
	2533	if (werror)
	2534	return (werror);
	2535	}
	2536
	2537	/*
	2538	* There are two cases of interest here.
	2539	*
	2540	* For ENODEV simply dup sfd to file descriptor dfd and return.
	2541	*
	2542	* For ENXIO steal away the file structure from sfd and store it
	2543	* dfd. sfd is effectively closed by this operation.
	2544	*
	2545	* Any other error code is just returned.
	2546	*/
	2547	switch (error) {
	2548	case ENODEV:
	2549	/*
	2550	* Check that the mode the file is being opened for is a
	2551	* subset of the mode of the existing descriptor.
	2552	*/
	2553	if (((mode & (FREAD\|FWRITE)) \| wfp->f_flag) != wfp->f_flag) {
	2554	error = EACCES;
	2555	break;
	2556	}
	2557	spin_lock(&fdp->fd_spin);
	2558	fdp->fd_files[dfd].fileflags = fdp->fd_files[sfd].fileflags;
	2559	fsetfd_locked(fdp, wfp, dfd);
	2560	spin_unlock(&fdp->fd_spin);
	2561	error = 0;
	2562	break;
	2563	case ENXIO:
	2564	/*
	2565	* Steal away the file pointer from dfd, and stuff it into indx.
	2566	*/
	2567	spin_lock(&fdp->fd_spin);
	2568	fdp->fd_files[dfd].fileflags = fdp->fd_files[sfd].fileflags;
	2569	fsetfd(fdp, wfp, dfd);
	2570	if ((xfp = funsetfd_locked(fdp, sfd)) != NULL) {
	2571	spin_unlock(&fdp->fd_spin);
	2572	fdrop(xfp);
	2573	} else {
	2574	spin_unlock(&fdp->fd_spin);
	2575	}
	2576	error = 0;
	2577	break;
	2578	default:
	2579	break;
	2580	}
	2581	fdrop(wfp);
	2582	return (error);
	2583	}
	2584
	2585	/*
	2586	* NOT MPSAFE - I think these refer to a common file descriptor table
	2587	* and we need to spinlock that to link fdtol in.
	2588	*/
	2589	struct filedesc_to_leader *
	2590	filedesc_to_leader_alloc(struct filedesc_to_leader *old,
	2591	struct proc *leader)
	2592	{
	2593	struct filedesc_to_leader *fdtol;
	2594
	2595	fdtol = kmalloc(sizeof(struct filedesc_to_leader),
	2596	M_FILEDESC_TO_LEADER, M_WAITOK);
	2597	fdtol->fdl_refcount = 1;
	2598	fdtol->fdl_holdcount = 0;
	2599	fdtol->fdl_wakeup = 0;
	2600	fdtol->fdl_leader = leader;
	2601	if (old != NULL) {
	2602	fdtol->fdl_next = old->fdl_next;
	2603	fdtol->fdl_prev = old;
	2604	old->fdl_next = fdtol;
	2605	fdtol->fdl_next->fdl_prev = fdtol;
	2606	} else {
	2607	fdtol->fdl_next = fdtol;
	2608	fdtol->fdl_prev = fdtol;
	2609	}
	2610	return fdtol;
	2611	}
	2612
	2613	/*
	2614	* Scan all file pointers in the system. The callback is made with
	2615	* the master list spinlock held exclusively.
	2616	*
	2617	* MPSAFE
	2618	*/
	2619	void
	2620	allfiles_scan_exclusive(int (callback)(struct file , void ), void data)
	2621	{
	2622	struct file *fp;
	2623	int res;
	2624
	2625	spin_lock(&filehead_spin);
	2626	LIST_FOREACH(fp, &filehead, f_list) {
	2627	res = callback(fp, data);
	2628	if (res < 0)
	2629	break;
	2630	}
	2631	spin_unlock(&filehead_spin);
	2632	}
	2633
	2634	/*
	2635	* Get file structures.
	2636	*
	2637	* NOT MPSAFE - process list scan, SYSCTL_OUT (probably not mpsafe)
	2638	*/
	2639
	2640	struct sysctl_kern_file_info {
	2641	int count;
	2642	int error;
	2643	struct sysctl_req *req;
	2644	};
	2645
	2646	static int sysctl_kern_file_callback(struct proc p, void data);
	2647
	2648	static int
	2649	sysctl_kern_file(SYSCTL_HANDLER_ARGS)
	2650	{
	2651	struct sysctl_kern_file_info info;
	2652
	2653	/*
	2654	* Note: because the number of file descriptors is calculated
	2655	* in different ways for sizing vs returning the data,
	2656	* there is information leakage from the first loop. However,
	2657	* it is of a similar order of magnitude to the leakage from
	2658	* global system statistics such as kern.openfiles.
	2659	*
	2660	* When just doing a count, note that we cannot just count
	2661	* the elements and add f_count via the filehead list because
	2662	* threaded processes share their descriptor table and f_count might
	2663	* still be '1' in that case.
	2664	*
	2665	* Since the SYSCTL op can block, we must hold the process to
	2666	* prevent it being ripped out from under us either in the
	2667	* file descriptor loop or in the greater LIST_FOREACH. The
	2668	* process may be in varying states of disrepair. If the process
	2669	* is in SZOMB we may have caught it just as it is being removed
	2670	* from the allproc list, we must skip it in that case to maintain
	2671	* an unbroken chain through the allproc list.
	2672	*/
	2673	info.count = 0;
	2674	info.error = 0;
	2675	info.req = req;
	2676	allproc_scan(sysctl_kern_file_callback, &info);
	2677
	2678	/*
	2679	* When just calculating the size, overestimate a bit to try to
	2680	* prevent system activity from causing the buffer-fill call
	2681	* to fail later on.
	2682	*/
	2683	if (req->oldptr == NULL) {
	2684	info.count = (info.count + 16) + (info.count / 10);
	2685	info.error = SYSCTL_OUT(req, NULL,
	2686	info.count * sizeof(struct kinfo_file));
	2687	}
	2688	return (info.error);
	2689	}
	2690
	2691	static int
	2692	sysctl_kern_file_callback(struct proc p, void data)
	2693	{
	2694	struct sysctl_kern_file_info *info = data;
	2695	struct kinfo_file kf;
	2696	struct filedesc *fdp;
	2697	struct file *fp;
	2698	uid_t uid;
	2699	int n;
	2700
	2701	if (p->p_stat == SIDL \|\| p->p_stat == SZOMB)
	2702	return(0);
	2703	if (!PRISON_CHECK(info->req->td->td_ucred, p->p_ucred) != 0)
	2704	return(0);
	2705
	2706	/*
	2707	* Softref the fdp to prevent it from being destroyed
	2708	*/
	2709	spin_lock(&p->p_spin);
	2710	if ((fdp = p->p_fd) == NULL) {
	2711	spin_unlock(&p->p_spin);
	2712	return(0);
	2713	}
	2714	atomic_add_int(&fdp->fd_softrefs, 1);
	2715	spin_unlock(&p->p_spin);
	2716
	2717	/*
	2718	* The fdp's own spinlock prevents the contents from being
	2719	* modified.
	2720	*/
	2721	spin_lock(&fdp->fd_spin);
	2722	for (n = 0; n < fdp->fd_nfiles; ++n) {
	2723	if ((fp = fdp->fd_files[n].fp) == NULL)
	2724	continue;
	2725	if (info->req->oldptr == NULL) {
	2726	++info->count;
	2727	} else {
	2728	uid = p->p_ucred ? p->p_ucred->cr_uid : -1;
	2729	kcore_make_file(&kf, fp, p->p_pid, uid, n);
	2730	spin_unlock(&fdp->fd_spin);
	2731	info->error = SYSCTL_OUT(info->req, &kf, sizeof(kf));
	2732	spin_lock(&fdp->fd_spin);
	2733	if (info->error)
	2734	break;
	2735	}
	2736	}
	2737	spin_unlock(&fdp->fd_spin);
	2738	atomic_subtract_int(&fdp->fd_softrefs, 1);
	2739	if (info->error)
	2740	return(-1);
	2741	return(0);
	2742	}
	2743
	2744	SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE\|CTLFLAG_RD,
	2745	0, 0, sysctl_kern_file, "S,file", "Entire file table");
	2746
	2747	SYSCTL_INT(_kern, OID_AUTO, minfilesperproc, CTLFLAG_RW,
	2748	&minfilesperproc, 0, "Minimum files allowed open per process");
	2749	SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
	2750	&maxfilesperproc, 0, "Maximum files allowed open per process");
	2751	SYSCTL_INT(_kern, OID_AUTO, maxfilesperuser, CTLFLAG_RW,
	2752	&maxfilesperuser, 0, "Maximum files allowed open per user");
	2753
	2754	SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
	2755	&maxfiles, 0, "Maximum number of files");
	2756
	2757	SYSCTL_INT(_kern, OID_AUTO, maxfilesrootres, CTLFLAG_RW,
	2758	&maxfilesrootres, 0, "Descriptors reserved for root use");
	2759
	2760	SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
	2761	&nfiles, 0, "System-wide number of open files");
	2762
	2763	static void
	2764	fildesc_drvinit(void *unused)
	2765	{
	2766	int fd;
	2767
	2768	for (fd = 0; fd < NUMFDESC; fd++) {
	2769	make_dev(&fildesc_ops, fd,
	2770	UID_BIN, GID_BIN, 0666, "fd/%d", fd);
	2771	}
	2772
	2773	make_dev(&fildesc_ops, 0, UID_ROOT, GID_WHEEL, 0666, "stdin");
	2774	make_dev(&fildesc_ops, 1, UID_ROOT, GID_WHEEL, 0666, "stdout");
	2775	make_dev(&fildesc_ops, 2, UID_ROOT, GID_WHEEL, 0666, "stderr");
	2776	}
	2777
	2778	/*
	2779	* MPSAFE
	2780	*/
	2781	struct fileops badfileops = {
	2782	.fo_read = badfo_readwrite,
	2783	.fo_write = badfo_readwrite,
	2784	.fo_ioctl = badfo_ioctl,
	2785	.fo_poll = badfo_poll,
	2786	.fo_kqfilter = badfo_kqfilter,
	2787	.fo_stat = badfo_stat,
	2788	.fo_close = badfo_close,
	2789	.fo_shutdown = badfo_shutdown
	2790	};
	2791
	2792	int
	2793	badfo_readwrite(
	2794	struct file *fp,
	2795	struct uio *uio,
	2796	struct ucred *cred,
	2797	int flags
	2798	) {
	2799	return (EBADF);
	2800	}
	2801
	2802	int
	2803	badfo_ioctl(struct file *fp, u_long com, caddr_t data,
	2804	struct ucred cred, struct sysmsg msgv)
	2805	{
	2806	return (EBADF);
	2807	}
	2808
	2809	int
	2810	badfo_poll(struct file fp, int events, struct ucred cred)
	2811	{
	2812	return (0);
	2813	}
	2814
	2815	/*
	2816	* Must return an error to prevent registration, typically
	2817	* due to a revoked descriptor (file_filtops assigned).
	2818	*/
	2819	int
	2820	badfo_kqfilter(struct file fp, struct knote kn)
	2821	{
	2822	return (EOPNOTSUPP);
	2823	}
	2824
	2825	/*
	2826	* MPSAFE
	2827	*/
	2828	int
	2829	badfo_stat(struct file fp, struct stat sb, struct ucred *cred)
	2830	{
	2831	return (EBADF);
	2832	}
	2833
	2834	/*
	2835	* MPSAFE
	2836	*/
	2837	int
	2838	badfo_close(struct file *fp)
	2839	{
	2840	return (EBADF);
	2841	}
	2842
	2843	/*
	2844	* MPSAFE
	2845	*/
	2846	int
	2847	badfo_shutdown(struct file *fp, int how)
	2848	{
	2849	return (EBADF);
	2850	}
	2851
	2852	/*
	2853	* MPSAFE
	2854	*/
	2855	int
	2856	nofo_shutdown(struct file *fp, int how)
	2857	{
	2858	return (EOPNOTSUPP);
	2859	}
	2860
	2861	SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,
	2862	fildesc_drvinit,NULL)