gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2005 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Jeffrey Hsu.
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	*
	35	* Copyright (c) 1982, 1986, 1989, 1991, 1993
	36	* The Regents of the University of California. All rights reserved.
	37	* (c) UNIX System Laboratories, Inc.
	38	* All or some portions of this file are derived from material licensed
	39	* to the University of California by American Telephone and Telegraph
	40	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
	41	* the permission of UNIX System Laboratories, Inc.
	42	*
	43	* Redistribution and use in source and binary forms, with or without
	44	* modification, are permitted provided that the following conditions
	45	* are met:
	46	* 1. Redistributions of source code must retain the above copyright
	47	* notice, this list of conditions and the following disclaimer.
	48	* 2. Redistributions in binary form must reproduce the above copyright
	49	* notice, this list of conditions and the following disclaimer in the
	50	* documentation and/or other materials provided with the distribution.
	51	* 3. All advertising materials mentioning features or use of this software
	52	* must display the following acknowledgement:
	53	* This product includes software developed by the University of
	54	* California, Berkeley and its contributors.
	55	* 4. Neither the name of the University nor the names of its contributors
	56	* may be used to endorse or promote products derived from this software
	57	* without specific prior written permission.
	58	*
	59	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	60	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	61	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	62	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	63	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	64	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	65	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	66	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	67	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	68	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	69	* SUCH DAMAGE.
	70	*
	71	* @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
	72	* $FreeBSD: src/sys/kern/kern_descrip.c,v 1.81.2.19 2004/02/28 00:43:31 tegge Exp $
	73	* $DragonFly: src/sys/kern/kern_descrip.c,v 1.76 2006/12/23 00:35:03 swildner Exp $
	74	*/
	75
	76	#include "opt_compat.h"
	77	#include <sys/param.h>
	78	#include <sys/systm.h>
	79	#include <sys/malloc.h>
	80	#include <sys/sysproto.h>
	81	#include <sys/conf.h>
	82	#include <sys/device.h>
	83	#include <sys/filedesc.h>
	84	#include <sys/kernel.h>
	85	#include <sys/sysctl.h>
	86	#include <sys/vnode.h>
	87	#include <sys/proc.h>
	88	#include <sys/nlookup.h>
	89	#include <sys/file.h>
	90	#include <sys/stat.h>
	91	#include <sys/filio.h>
	92	#include <sys/fcntl.h>
	93	#include <sys/unistd.h>
	94	#include <sys/resourcevar.h>
	95	#include <sys/event.h>
	96	#include <sys/kern_syscall.h>
	97	#include <sys/kcore.h>
	98	#include <sys/kinfo.h>
	99
	100	#include <vm/vm.h>
	101	#include <vm/vm_extern.h>
	102
	103	#include <sys/thread2.h>
	104	#include <sys/file2.h>
	105	#include <sys/spinlock2.h>
	106
	107	static void fsetfd_locked(struct filedesc fdp, struct file fp, int fd);
	108	static void fdreserve_locked (struct filedesc *fdp, int fd0, int incr);
	109	static struct file funsetfd_locked (struct filedesc fdp, int fd);
	110	static int checkfpclosed(struct filedesc fdp, int fd, struct file fp);
	111	static void ffree(struct file *fp);
	112
	113	static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
	114	static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader",
	115	"file desc to leader structures");
	116	MALLOC_DEFINE(M_FILE, "file", "Open file structure");
	117	static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
	118
	119	static d_open_t fdopen;
	120	#define NUMFDESC 64
	121
	122	#define CDEV_MAJOR 22
	123	static struct dev_ops fildesc_ops = {
	124	{ "FD", CDEV_MAJOR, 0 },
	125	.d_open = fdopen,
	126	};
	127
	128	static int badfo_readwrite (struct file fp, struct uio uio,
	129	struct ucred *cred, int flags);
	130	static int badfo_ioctl (struct file *fp, u_long com, caddr_t data,
	131	struct ucred *cred);
	132	static int badfo_poll (struct file fp, int events, struct ucred cred);
	133	static int badfo_kqfilter (struct file fp, struct knote kn);
	134	static int badfo_stat (struct file fp, struct stat sb, struct ucred *cred);
	135	static int badfo_close (struct file *fp);
	136	static int badfo_shutdown (struct file *fp, int how);
	137
	138	/*
	139	* Descriptor management.
	140	*/
	141	static struct filelist filehead = LIST_HEAD_INITIALIZER(&filehead);
	142	static struct spinlock filehead_spin = SPINLOCK_INITIALIZER(&filehead_spin);
	143	static int nfiles; /* actual number of open files */
	144	extern int cmask;
	145
	146	/*
	147	* Fixup fd_freefile and fd_lastfile after a descriptor has been cleared.
	148	*
	149	* MPSAFE - must be called with fdp->fd_spin exclusively held
	150	*/
	151	static __inline
	152	void
	153	fdfixup_locked(struct filedesc *fdp, int fd)
	154	{
	155	if (fd < fdp->fd_freefile) {
	156	fdp->fd_freefile = fd;
	157	}
	158	while (fdp->fd_lastfile >= 0 &&
	159	fdp->fd_files[fdp->fd_lastfile].fp == NULL &&
	160	fdp->fd_files[fdp->fd_lastfile].reserved == 0
	161	) {
	162	--fdp->fd_lastfile;
	163	}
	164	}
	165
	166	/*
	167	* System calls on descriptors.
	168	*
	169	* MPSAFE
	170	*/
	171	int
	172	sys_getdtablesize(struct getdtablesize_args *uap)
	173	{
	174	struct proc *p = curproc;
	175	struct plimit *limit = p->p_limit;
	176
	177	spin_lock_rd(&limit->p_spin);
	178	uap->sysmsg_result =
	179	min((int)limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	180	spin_unlock_rd(&limit->p_spin);
	181	return (0);
	182	}
	183
	184	/*
	185	* Duplicate a file descriptor to a particular value.
	186	*
	187	* note: keep in mind that a potential race condition exists when closing
	188	* descriptors from a shared descriptor table (via rfork).
	189	*
	190	* MPSAFE
	191	*/
	192	int
	193	sys_dup2(struct dup2_args *uap)
	194	{
	195	int error;
	196
	197	error = kern_dup(DUP_FIXED, uap->from, uap->to, uap->sysmsg_fds);
	198
	199	return (error);
	200	}
	201
	202	/*
	203	* Duplicate a file descriptor.
	204	*
	205	* MPSAFE
	206	*/
	207	int
	208	sys_dup(struct dup_args *uap)
	209	{
	210	int error;
	211
	212	error = kern_dup(DUP_VARIABLE, uap->fd, 0, uap->sysmsg_fds);
	213
	214	return (error);
	215	}
	216
	217	/*
	218	* MPALMOSTSAFE - acquires mplock for fp operations
	219	*/
	220	int
	221	kern_fcntl(int fd, int cmd, union fcntl_dat dat, struct ucred cred)
	222	{
	223	struct thread *td = curthread;
	224	struct proc *p = td->td_proc;
	225	struct file *fp;
	226	struct vnode *vp;
	227	u_int newmin;
	228	u_int oflags;
	229	int tmp, error, flg = F_POSIX;
	230
	231	KKASSERT(p);
	232
	233	/*
	234	* Operations on file descriptors that do not require a file pointer.
	235	*/
	236	switch (cmd) {
	237	case F_GETFD:
	238	error = fgetfdflags(p->p_fd, fd, &tmp);
	239	if (error == 0)
	240	dat->fc_cloexec = (tmp & UF_EXCLOSE) ? FD_CLOEXEC : 0;
	241	return (error);
	242
	243	case F_SETFD:
	244	if (dat->fc_cloexec & FD_CLOEXEC)
	245	error = fsetfdflags(p->p_fd, fd, UF_EXCLOSE);
	246	else
	247	error = fclrfdflags(p->p_fd, fd, UF_EXCLOSE);
	248	return (error);
	249	case F_DUPFD:
	250	newmin = dat->fc_fd;
	251	error = kern_dup(DUP_VARIABLE, fd, newmin, &dat->fc_fd);
	252	return (error);
	253	default:
	254	break;
	255	}
	256
	257	/*
	258	* Operations on file pointers
	259	*/
	260	if ((fp = holdfp(p->p_fd, fd, -1)) == NULL)
	261	return (EBADF);
	262
	263	get_mplock();
	264	switch (cmd) {
	265	case F_GETFL:
	266	dat->fc_flags = OFLAGS(fp->f_flag);
	267	error = 0;
	268	break;
	269
	270	case F_SETFL:
	271	oflags = fp->f_flag & FCNTLFLAGS;
	272	fp->f_flag &= ~FCNTLFLAGS;
	273	fp->f_flag \|= FFLAGS(dat->fc_flags & ~O_ACCMODE) & FCNTLFLAGS;
	274	error = 0;
	275	if ((fp->f_flag ^ oflags) & FASYNC) {
	276	tmp = fp->f_flag & FASYNC;
	277	error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, cred);
	278	}
	279	if (error)
	280	fp->f_flag = (fp->f_flag & ~FCNTLFLAGS) \| oflags;
	281	break;
	282
	283	case F_GETOWN:
	284	error = fo_ioctl(fp, FIOGETOWN, (caddr_t)&dat->fc_owner, cred);
	285	break;
	286
	287	case F_SETOWN:
	288	error = fo_ioctl(fp, FIOSETOWN, (caddr_t)&dat->fc_owner, cred);
	289	break;
	290
	291	case F_SETLKW:
	292	flg \|= F_WAIT;
	293	/* Fall into F_SETLK */
	294
	295	case F_SETLK:
	296	if (fp->f_type != DTYPE_VNODE) {
	297	error = EBADF;
	298	break;
	299	}
	300	vp = (struct vnode *)fp->f_data;
	301
	302	/*
	303	* copyin/lockop may block
	304	*/
	305	if (dat->fc_flock.l_whence == SEEK_CUR)
	306	dat->fc_flock.l_start += fp->f_offset;
	307
	308	switch (dat->fc_flock.l_type) {
	309	case F_RDLCK:
	310	if ((fp->f_flag & FREAD) == 0) {
	311	error = EBADF;
	312	break;
	313	}
	314	p->p_leader->p_flag \|= P_ADVLOCK;
	315	error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
	316	&dat->fc_flock, flg);
	317	break;
	318	case F_WRLCK:
	319	if ((fp->f_flag & FWRITE) == 0) {
	320	error = EBADF;
	321	break;
	322	}
	323	p->p_leader->p_flag \|= P_ADVLOCK;
	324	error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
	325	&dat->fc_flock, flg);
	326	break;
	327	case F_UNLCK:
	328	error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
	329	&dat->fc_flock, F_POSIX);
	330	break;
	331	default:
	332	error = EINVAL;
	333	break;
	334	}
	335
	336	/*
	337	* It is possible to race a close() on the descriptor while
	338	* we were blocked getting the lock. If this occurs the
	339	* close might not have caught the lock.
	340	*/
	341	if (checkfpclosed(p->p_fd, fd, fp)) {
	342	dat->fc_flock.l_whence = SEEK_SET;
	343	dat->fc_flock.l_start = 0;
	344	dat->fc_flock.l_len = 0;
	345	dat->fc_flock.l_type = F_UNLCK;
	346	(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
	347	F_UNLCK, &dat->fc_flock, F_POSIX);
	348	}
	349	break;
	350
	351	case F_GETLK:
	352	if (fp->f_type != DTYPE_VNODE) {
	353	error = EBADF;
	354	break;
	355	}
	356	vp = (struct vnode *)fp->f_data;
	357	/*
	358	* copyin/lockop may block
	359	*/
	360	if (dat->fc_flock.l_type != F_RDLCK &&
	361	dat->fc_flock.l_type != F_WRLCK &&
	362	dat->fc_flock.l_type != F_UNLCK) {
	363	error = EINVAL;
	364	break;
	365	}
	366	if (dat->fc_flock.l_whence == SEEK_CUR)
	367	dat->fc_flock.l_start += fp->f_offset;
	368	error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK,
	369	&dat->fc_flock, F_POSIX);
	370	break;
	371	default:
	372	error = EINVAL;
	373	break;
	374	}
	375	rel_mplock();
	376
	377	fdrop(fp);
	378	return (error);
	379	}
	380
	381	/*
	382	* The file control system call.
	383	*
	384	* MPSAFE
	385	*/
	386	int
	387	sys_fcntl(struct fcntl_args *uap)
	388	{
	389	union fcntl_dat dat;
	390	int error;
	391
	392	switch (uap->cmd) {
	393	case F_DUPFD:
	394	dat.fc_fd = uap->arg;
	395	break;
	396	case F_SETFD:
	397	dat.fc_cloexec = uap->arg;
	398	break;
	399	case F_SETFL:
	400	dat.fc_flags = uap->arg;
	401	break;
	402	case F_SETOWN:
	403	dat.fc_owner = uap->arg;
	404	break;
	405	case F_SETLKW:
	406	case F_SETLK:
	407	case F_GETLK:
	408	error = copyin((caddr_t)uap->arg, &dat.fc_flock,
	409	sizeof(struct flock));
	410	if (error)
	411	return (error);
	412	break;
	413	}
	414
	415	error = kern_fcntl(uap->fd, uap->cmd, &dat, curproc->p_ucred);
	416
	417	if (error == 0) {
	418	switch (uap->cmd) {
	419	case F_DUPFD:
	420	uap->sysmsg_result = dat.fc_fd;
	421	break;
	422	case F_GETFD:
	423	uap->sysmsg_result = dat.fc_cloexec;
	424	break;
	425	case F_GETFL:
	426	uap->sysmsg_result = dat.fc_flags;
	427	break;
	428	case F_GETOWN:
	429	uap->sysmsg_result = dat.fc_owner;
	430	case F_GETLK:
	431	error = copyout(&dat.fc_flock, (caddr_t)uap->arg,
	432	sizeof(struct flock));
	433	break;
	434	}
	435	}
	436
	437	return (error);
	438	}
	439
	440	/*
	441	* Common code for dup, dup2, and fcntl(F_DUPFD).
	442	*
	443	* The type flag can be either DUP_FIXED or DUP_VARIABLE. DUP_FIXED tells
	444	* kern_dup() to destructively dup over an existing file descriptor if new
	445	* is already open. DUP_VARIABLE tells kern_dup() to find the lowest
	446	* unused file descriptor that is greater than or equal to new.
	447	*
	448	* MPSAFE
	449	*/
	450	int
	451	kern_dup(enum dup_type type, int old, int new, int *res)
	452	{
	453	struct thread *td = curthread;
	454	struct proc *p = td->td_proc;
	455	struct filedesc *fdp = p->p_fd;
	456	struct file *fp;
	457	struct file *delfp;
	458	int oldflags;
	459	int holdleaders;
	460	int error, newfd;
	461
	462	/*
	463	* Verify that we have a valid descriptor to dup from and
	464	* possibly to dup to.
	465	*/
	466	retry:
	467	spin_lock_wr(&fdp->fd_spin);
	468	if (new < 0 \|\| new > p->p_rlimit[RLIMIT_NOFILE].rlim_cur \|\|
	469	new >= maxfilesperproc) {
	470	spin_unlock_wr(&fdp->fd_spin);
	471	return (EINVAL);
	472	}
	473	if ((unsigned)old >= fdp->fd_nfiles \|\| fdp->fd_files[old].fp == NULL) {
	474	spin_unlock_wr(&fdp->fd_spin);
	475	return (EBADF);
	476	}
	477	if (type == DUP_FIXED && old == new) {
	478	*res = new;
	479	spin_unlock_wr(&fdp->fd_spin);
	480	return (0);
	481	}
	482	fp = fdp->fd_files[old].fp;
	483	oldflags = fdp->fd_files[old].fileflags;
	484	fhold(fp); /* MPSAFE - can be called with a spinlock held */
	485
	486	/*
	487	* Allocate a new descriptor if DUP_VARIABLE, or expand the table
	488	* if the requested descriptor is beyond the current table size.
	489	*
	490	* This can block. Retry if the source descriptor no longer matches
	491	* or if our expectation in the expansion case races.
	492	*
	493	* If we are not expanding or allocating a new decriptor, then reset
	494	* the target descriptor to a reserved state so we have a uniform
	495	* setup for the next code block.
	496	*/
	497	if (type == DUP_VARIABLE \|\| new >= fdp->fd_nfiles) {
	498	spin_unlock_wr(&fdp->fd_spin);
	499	error = fdalloc(p, new, &newfd);
	500	spin_lock_wr(&fdp->fd_spin);
	501	if (error) {
	502	spin_unlock_wr(&fdp->fd_spin);
	503	fdrop(fp);
	504	return (error);
	505	}
	506	/*
	507	* Check for ripout
	508	*/
	509	if (old >= fdp->fd_nfiles \|\| fdp->fd_files[old].fp != fp) {
	510	fsetfd_locked(fdp, NULL, newfd);
	511	spin_unlock_wr(&fdp->fd_spin);
	512	fdrop(fp);
	513	goto retry;
	514	}
	515	/*
	516	* Check for expansion race
	517	*/
	518	if (type != DUP_VARIABLE && new != newfd) {
	519	fsetfd_locked(fdp, NULL, newfd);
	520	spin_unlock_wr(&fdp->fd_spin);
	521	fdrop(fp);
	522	goto retry;
	523	}
	524	/*
	525	* Check for ripout, newfd reused old (this case probably
	526	* can't occur).
	527	*/
	528	if (old == newfd) {
	529	fsetfd_locked(fdp, NULL, newfd);
	530	spin_unlock_wr(&fdp->fd_spin);
	531	fdrop(fp);
	532	goto retry;
	533	}
	534	new = newfd;
	535	delfp = NULL;
	536	} else {
	537	if (fdp->fd_files[new].reserved) {
	538	spin_unlock_wr(&fdp->fd_spin);
	539	fdrop(fp);
	540	kprintf("Warning: dup(): target descriptor %d is reserved, waiting for it to be resolved\n", new);
	541	tsleep(fdp, 0, "fdres", hz);
	542	goto retry;
	543	}
	544
	545	/*
	546	* If the target descriptor was never allocated we have
	547	* to allocate it. If it was we have to clean out the
	548	* old descriptor. delfp inherits the ref from the
	549	* descriptor table.
	550	*/
	551	delfp = fdp->fd_files[new].fp;
	552	fdp->fd_files[new].fp = NULL;
	553	fdp->fd_files[new].reserved = 1;
	554	if (delfp == NULL) {
	555	fdreserve_locked(fdp, new, 1);
	556	if (new > fdp->fd_lastfile)
	557	fdp->fd_lastfile = new;
	558	}
	559
	560	}
	561
	562	/*
	563	* NOTE: still holding an exclusive spinlock
	564	*/
	565
	566	/*
	567	* If a descriptor is being overwritten we may hve to tell
	568	* fdfree() to sleep to ensure that all relevant process
	569	* leaders can be traversed in closef().
	570	*/
	571	if (delfp != NULL && p->p_fdtol != NULL) {
	572	fdp->fd_holdleaderscount++;
	573	holdleaders = 1;
	574	} else {
	575	holdleaders = 0;
	576	}
	577	KASSERT(delfp == NULL \|\| type == DUP_FIXED,
	578	("dup() picked an open file"));
	579
	580	/*
	581	* Duplicate the source descriptor, update lastfile. If the new
	582	* descriptor was not allocated and we aren't replacing an existing
	583	* descriptor we have to mark the descriptor as being in use.
	584	*
	585	* The fd_files[] array inherits fp's hold reference.
	586	*/
	587	fsetfd_locked(fdp, fp, new);
	588	fdp->fd_files[new].fileflags = oldflags & ~UF_EXCLOSE;
	589	spin_unlock_wr(&fdp->fd_spin);
	590	fdrop(fp);
	591	*res = new;
	592
	593	/*
	594	* If we dup'd over a valid file, we now own the reference to it
	595	* and must dispose of it using closef() semantics (as if a
	596	* close() were performed on it).
	597	*/
	598	if (delfp) {
	599	(void)closef(delfp, td);
	600	if (holdleaders) {
	601	spin_lock_wr(&fdp->fd_spin);
	602	fdp->fd_holdleaderscount--;
	603	if (fdp->fd_holdleaderscount == 0 &&
	604	fdp->fd_holdleaderswakeup != 0) {
	605	fdp->fd_holdleaderswakeup = 0;
	606	spin_unlock_wr(&fdp->fd_spin);
	607	wakeup(&fdp->fd_holdleaderscount);
	608	} else {
	609	spin_unlock_wr(&fdp->fd_spin);
	610	}
	611	}
	612	}
	613	return (0);
	614	}
	615
	616	/*
	617	* If sigio is on the list associated with a process or process group,
	618	* disable signalling from the device, remove sigio from the list and
	619	* free sigio.
	620	*/
	621	void
	622	funsetown(struct sigio *sigio)
	623	{
	624	if (sigio == NULL)
	625	return;
	626	crit_enter();
	627	*(sigio->sio_myref) = NULL;
	628	crit_exit();
	629	if (sigio->sio_pgid < 0) {
	630	SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
	631	sigio, sio_pgsigio);
	632	} else /* if ((sigiop)->sio_pgid > 0) / {
	633	SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
	634	sigio, sio_pgsigio);
	635	}
	636	crfree(sigio->sio_ucred);
	637	kfree(sigio, M_SIGIO);
	638	}
	639
	640	/* Free a list of sigio structures. */
	641	void
	642	funsetownlst(struct sigiolst *sigiolst)
	643	{
	644	struct sigio *sigio;
	645
	646	while ((sigio = SLIST_FIRST(sigiolst)) != NULL)
	647	funsetown(sigio);
	648	}
	649
	650	/*
	651	* This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
	652	*
	653	* After permission checking, add a sigio structure to the sigio list for
	654	* the process or process group.
	655	*/
	656	int
	657	fsetown(pid_t pgid, struct sigio **sigiop)
	658	{
	659	struct proc *proc;
	660	struct pgrp *pgrp;
	661	struct sigio *sigio;
	662
	663	if (pgid == 0) {
	664	funsetown(*sigiop);
	665	return (0);
	666	}
	667	if (pgid > 0) {
	668	proc = pfind(pgid);
	669	if (proc == NULL)
	670	return (ESRCH);
	671
	672	/*
	673	* Policy - Don't allow a process to FSETOWN a process
	674	* in another session.
	675	*
	676	* Remove this test to allow maximum flexibility or
	677	* restrict FSETOWN to the current process or process
	678	* group for maximum safety.
	679	*/
	680	if (proc->p_session != curproc->p_session)
	681	return (EPERM);
	682
	683	pgrp = NULL;
	684	} else /* if (pgid < 0) */ {
	685	pgrp = pgfind(-pgid);
	686	if (pgrp == NULL)
	687	return (ESRCH);
	688
	689	/*
	690	* Policy - Don't allow a process to FSETOWN a process
	691	* in another session.
	692	*
	693	* Remove this test to allow maximum flexibility or
	694	* restrict FSETOWN to the current process or process
	695	* group for maximum safety.
	696	*/
	697	if (pgrp->pg_session != curproc->p_session)
	698	return (EPERM);
	699
	700	proc = NULL;
	701	}
	702	funsetown(*sigiop);
	703	sigio = kmalloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
	704	if (pgid > 0) {
	705	SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
	706	sigio->sio_proc = proc;
	707	} else {
	708	SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
	709	sigio->sio_pgrp = pgrp;
	710	}
	711	sigio->sio_pgid = pgid;
	712	sigio->sio_ucred = crhold(curproc->p_ucred);
	713	/* It would be convenient if p_ruid was in ucred. */
	714	sigio->sio_ruid = curproc->p_ucred->cr_ruid;
	715	sigio->sio_myref = sigiop;
	716	crit_enter();
	717	*sigiop = sigio;
	718	crit_exit();
	719	return (0);
	720	}
	721
	722	/*
	723	* This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
	724	*/
	725	pid_t
	726	fgetown(struct sigio *sigio)
	727	{
	728	return (sigio != NULL ? sigio->sio_pgid : 0);
	729	}
	730
	731	/*
	732	* Close many file descriptors.
	733	*
	734	* MPSAFE
	735	*/
	736	int
	737	sys_closefrom(struct closefrom_args *uap)
	738	{
	739	return(kern_closefrom(uap->fd));
	740	}
	741
	742	/*
	743	* Close all file descriptors greater then or equal to fd
	744	*
	745	* MPSAFE
	746	*/
	747	int
	748	kern_closefrom(int fd)
	749	{
	750	struct thread *td = curthread;
	751	struct proc *p = td->td_proc;
	752	struct filedesc *fdp;
	753
	754	KKASSERT(p);
	755	fdp = p->p_fd;
	756
	757	if (fd < 0)
	758	return (EINVAL);
	759
	760	/*
	761	* NOTE: This function will skip unassociated descriptors and
	762	* reserved descriptors that have not yet been assigned.
	763	* fd_lastfile can change as a side effect of kern_close().
	764	*/
	765	spin_lock_wr(&fdp->fd_spin);
	766	while (fd <= fdp->fd_lastfile) {
	767	if (fdp->fd_files[fd].fp != NULL) {
	768	spin_unlock_wr(&fdp->fd_spin);
	769	/* ok if this races another close */
	770	if (kern_close(fd) == EINTR)
	771	return (EINTR);
	772	spin_lock_wr(&fdp->fd_spin);
	773	}
	774	++fd;
	775	}
	776	spin_unlock_wr(&fdp->fd_spin);
	777	return (0);
	778	}
	779
	780	/*
	781	* Close a file descriptor.
	782	*
	783	* MPSAFE
	784	*/
	785	int
	786	sys_close(struct close_args *uap)
	787	{
	788	return(kern_close(uap->fd));
	789	}
	790
	791	/*
	792	* MPALMOSTSAFE - acquires mplock around knote_fdclose() calls
	793	*/
	794	int
	795	kern_close(int fd)
	796	{
	797	struct thread *td = curthread;
	798	struct proc *p = td->td_proc;
	799	struct filedesc *fdp;
	800	struct file *fp;
	801	int error;
	802	int holdleaders;
	803
	804	KKASSERT(p);
	805	fdp = p->p_fd;
	806
	807	spin_lock_wr(&fdp->fd_spin);
	808	if ((fp = funsetfd_locked(fdp, fd)) == NULL) {
	809	spin_unlock_wr(&fdp->fd_spin);
	810	return (EBADF);
	811	}
	812	holdleaders = 0;
	813	if (p->p_fdtol != NULL) {
	814	/*
	815	* Ask fdfree() to sleep to ensure that all relevant
	816	* process leaders can be traversed in closef().
	817	*/
	818	fdp->fd_holdleaderscount++;
	819	holdleaders = 1;
	820	}
	821
	822	/*
	823	* we now hold the fp reference that used to be owned by the descriptor
	824	* array.
	825	*/
	826	spin_unlock_wr(&fdp->fd_spin);
	827	if (fd < fdp->fd_knlistsize) {
	828	get_mplock();
	829	if (fd < fdp->fd_knlistsize)
	830	knote_fdclose(p, fd);
	831	rel_mplock();
	832	}
	833	error = closef(fp, td);
	834	if (holdleaders) {
	835	spin_lock_wr(&fdp->fd_spin);
	836	fdp->fd_holdleaderscount--;
	837	if (fdp->fd_holdleaderscount == 0 &&
	838	fdp->fd_holdleaderswakeup != 0) {
	839	fdp->fd_holdleaderswakeup = 0;
	840	spin_unlock_wr(&fdp->fd_spin);
	841	wakeup(&fdp->fd_holdleaderscount);
	842	} else {
	843	spin_unlock_wr(&fdp->fd_spin);
	844	}
	845	}
	846	return (error);
	847	}
	848
	849	/*
	850	* shutdown_args(int fd, int how)
	851	*/
	852	int
	853	kern_shutdown(int fd, int how)
	854	{
	855	struct thread *td = curthread;
	856	struct proc *p = td->td_proc;
	857	struct file *fp;
	858	int error;
	859
	860	KKASSERT(p);
	861
	862	if ((fp = holdfp(p->p_fd, fd, -1)) == NULL)
	863	return (EBADF);
	864	error = fo_shutdown(fp, how);
	865	fdrop(fp);
	866
	867	return (error);
	868	}
	869
	870	int
	871	sys_shutdown(struct shutdown_args *uap)
	872	{
	873	int error;
	874
	875	error = kern_shutdown(uap->s, uap->how);
	876
	877	return (error);
	878	}
	879
	880	int
	881	kern_fstat(int fd, struct stat *ub)
	882	{
	883	struct thread *td = curthread;
	884	struct proc *p = td->td_proc;
	885	struct file *fp;
	886	int error;
	887
	888	KKASSERT(p);
	889
	890	if ((fp = holdfp(p->p_fd, fd, -1)) == NULL)
	891	return (EBADF);
	892	error = fo_stat(fp, ub, p->p_ucred);
	893	fdrop(fp);
	894
	895	return (error);
	896	}
	897
	898	/*
	899	* Return status information about a file descriptor.
	900	*/
	901	int
	902	sys_fstat(struct fstat_args *uap)
	903	{
	904	struct stat st;
	905	int error;
	906
	907	error = kern_fstat(uap->fd, &st);
	908
	909	if (error == 0)
	910	error = copyout(&st, uap->sb, sizeof(st));
	911	return (error);
	912	}
	913
	914	/*
	915	* Return pathconf information about a file descriptor.
	916	*/
	917	/* ARGSUSED */
	918	int
	919	sys_fpathconf(struct fpathconf_args *uap)
	920	{
	921	struct thread *td = curthread;
	922	struct proc *p = td->td_proc;
	923	struct file *fp;
	924	struct vnode *vp;
	925	int error = 0;
	926
	927	KKASSERT(p);
	928
	929	if ((fp = holdfp(p->p_fd, uap->fd, -1)) == NULL)
	930	return (EBADF);
	931
	932	switch (fp->f_type) {
	933	case DTYPE_PIPE:
	934	case DTYPE_SOCKET:
	935	if (uap->name != _PC_PIPE_BUF) {
	936	error = EINVAL;
	937	} else {
	938	uap->sysmsg_result = PIPE_BUF;
	939	error = 0;
	940	}
	941	break;
	942	case DTYPE_FIFO:
	943	case DTYPE_VNODE:
	944	vp = (struct vnode *)fp->f_data;
	945	error = VOP_PATHCONF(vp, uap->name, uap->sysmsg_fds);
	946	break;
	947	default:
	948	error = EOPNOTSUPP;
	949	break;
	950	}
	951	fdrop(fp);
	952	return(error);
	953	}
	954
	955	static int fdexpand;
	956	SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");
	957
	958	/*
	959	* Grow the file table so it can hold through descriptor (want).
	960	*
	961	* The fdp's spinlock must be held exclusively on entry and may be held
	962	* exclusively on return. The spinlock may be cycled by the routine.
	963	*
	964	* MPSAFE
	965	*/
	966	static void
	967	fdgrow_locked(struct filedesc *fdp, int want)
	968	{
	969	struct fdnode *newfiles;
	970	struct fdnode *oldfiles;
	971	int nf, extra;
	972
	973	nf = fdp->fd_nfiles;
	974	do {
	975	/* nf has to be of the form 2^n - 1 */
	976	nf = 2 * nf + 1;
	977	} while (nf <= want);
	978
	979	spin_unlock_wr(&fdp->fd_spin);
	980	newfiles = kmalloc(nf * sizeof(struct fdnode), M_FILEDESC, M_WAITOK);
	981	spin_lock_wr(&fdp->fd_spin);
	982
	983	/*
	984	* We could have raced another extend while we were not holding
	985	* the spinlock.
	986	*/
	987	if (fdp->fd_nfiles >= nf) {
	988	spin_unlock_wr(&fdp->fd_spin);
	989	kfree(newfiles, M_FILEDESC);
	990	spin_lock_wr(&fdp->fd_spin);
	991	return;
	992	}
	993	/*
	994	* Copy the existing ofile and ofileflags arrays
	995	* and zero the new portion of each array.
	996	*/
	997	extra = nf - fdp->fd_nfiles;
	998	bcopy(fdp->fd_files, newfiles, fdp->fd_nfiles * sizeof(struct fdnode));
	999	bzero(&newfiles[fdp->fd_nfiles], extra * sizeof(struct fdnode));
	1000
	1001	oldfiles = fdp->fd_files;
	1002	fdp->fd_files = newfiles;
	1003	fdp->fd_nfiles = nf;
	1004
	1005	if (oldfiles != fdp->fd_builtin_files) {
	1006	spin_unlock_wr(&fdp->fd_spin);
	1007	kfree(oldfiles, M_FILEDESC);
	1008	spin_lock_wr(&fdp->fd_spin);
	1009	}
	1010	fdexpand++;
	1011	}
	1012
	/*
	 * Number of nodes in right subtree, including the root.
	 *
	 * Computed as n XOR (n | (n + 1)), i.e. the value of the lowest clear
	 * bit of n: 0 -> 1, 1 -> 2, 2 -> 1, 3 -> 4.
	 */
	static __inline int
	right_subtree_size(int n)
	{
		return (n ^ (n | (n + 1)));
	}
	1021
	/*
	 * Bigger ancestor.
	 *
	 * Sets the lowest clear bit of n: 0 -> 1, 1 -> 3, 2 -> 3, 4 -> 5.
	 */
	static __inline int
	right_ancestor(int n)
	{
		return (n | (n + 1));
	}
	1030
	/*
	 * Smaller ancestor.
	 *
	 * Clears the trailing run of set bits in n and subtracts one; the
	 * result is -1 when n has no smaller ancestor (e.g. 0, 1, 3, 7).
	 */
	static __inline int
	left_ancestor(int n)
	{
		int trimmed = n & (n + 1);

		return (trimmed - 1);
	}
	1039
	1040	/*
	1041	* Traverse the in-place binary tree buttom-up adjusting the allocation
	1042	* count so scans can determine where free descriptors are located.
	1043	*
	1044	* MPSAFE - caller must be holding an exclusive spinlock on fdp
	1045	*/
	1046	static
	1047	void
	1048	fdreserve_locked(struct filedesc *fdp, int fd, int incr)
	1049	{
	1050	while (fd >= 0) {
	1051	fdp->fd_files[fd].allocated += incr;
	1052	KKASSERT(fdp->fd_files[fd].allocated >= 0);
	1053	fd = left_ancestor(fd);
	1054	}
	1055	}
	1056
	1057	/*
	1058	* Reserve a file descriptor for the process. If no error occurs, the
	1059	* caller MUST at some point call fsetfd() or assign a file pointer
	1060	* or dispose of the reservation.
	1061	*
	1062	* MPSAFE
	1063	*/
	1064	int
	1065	fdalloc(struct proc p, int want, int result)
	1066	{
	1067	struct filedesc *fdp = p->p_fd;
	1068	int fd, rsize, rsum, node, lim;
	1069
	1070	spin_lock_rd(&p->p_limit->p_spin);
	1071	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	1072	spin_unlock_rd(&p->p_limit->p_spin);
	1073	if (want >= lim)
	1074	return (EMFILE);
	1075	spin_lock_wr(&fdp->fd_spin);
	1076	if (want >= fdp->fd_nfiles)
	1077	fdgrow_locked(fdp, want);
	1078
	1079	/*
	1080	* Search for a free descriptor starting at the higher
	1081	* of want or fd_freefile. If that fails, consider
	1082	* expanding the ofile array.
	1083	*
	1084	* NOTE! the 'allocated' field is a cumulative recursive allocation
	1085	* count. If we happen to see a value of 0 then we can shortcut
	1086	* our search. Otherwise we run through through the tree going
	1087	* down branches we know have free descriptor(s) until we hit a
	1088	* leaf node. The leaf node will be free but will not necessarily
	1089	* have an allocated field of 0.
	1090	*/
	1091	retry:
	1092	/* move up the tree looking for a subtree with a free node */
	1093	for (fd = max(want, fdp->fd_freefile); fd < min(fdp->fd_nfiles, lim);
	1094	fd = right_ancestor(fd)) {
	1095	if (fdp->fd_files[fd].allocated == 0)
	1096	goto found;
	1097
	1098	rsize = right_subtree_size(fd);
	1099	if (fdp->fd_files[fd].allocated == rsize)
	1100	continue; /* right subtree full */
	1101
	1102	/*
	1103	* Free fd is in the right subtree of the tree rooted at fd.
	1104	* Call that subtree R. Look for the smallest (leftmost)
	1105	* subtree of R with an unallocated fd: continue moving
	1106	* down the left branch until encountering a full left
	1107	* subtree, then move to the right.
	1108	*/
	1109	for (rsum = 0, rsize /= 2; rsize > 0; rsize /= 2) {
	1110	node = fd + rsize;
	1111	rsum += fdp->fd_files[node].allocated;
	1112	if (fdp->fd_files[fd].allocated == rsum + rsize) {
	1113	fd = node; /* move to the right */
	1114	if (fdp->fd_files[node].allocated == 0)
	1115	goto found;
	1116	rsum = 0;
	1117	}
	1118	}
	1119	goto found;
	1120	}
	1121
	1122	/*
	1123	* No space in current array. Expand?
	1124	*/
	1125	if (fdp->fd_nfiles >= lim) {
	1126	spin_unlock_wr(&fdp->fd_spin);
	1127	return (EMFILE);
	1128	}
	1129	fdgrow_locked(fdp, want);
	1130	goto retry;
	1131
	1132	found:
	1133	KKASSERT(fd < fdp->fd_nfiles);
	1134	if (fd > fdp->fd_lastfile)
	1135	fdp->fd_lastfile = fd;
	1136	if (want <= fdp->fd_freefile)
	1137	fdp->fd_freefile = fd;
	1138	*result = fd;
	1139	KKASSERT(fdp->fd_files[fd].fp == NULL);
	1140	KKASSERT(fdp->fd_files[fd].reserved == 0);
	1141	fdp->fd_files[fd].fileflags = 0;
	1142	fdp->fd_files[fd].reserved = 1;
	1143	fdreserve_locked(fdp, fd, 1);
	1144	spin_unlock_wr(&fdp->fd_spin);
	1145	return (0);
	1146	}
	1147
	1148	/*
	1149	* Check to see whether n user file descriptors
	1150	* are available to the process p.
	1151	*
	1152	* MPSAFE
	1153	*/
	1154	int
	1155	fdavail(struct proc *p, int n)
	1156	{
	1157	struct filedesc *fdp = p->p_fd;
	1158	struct fdnode *fdnode;
	1159	int i, lim, last;
	1160
	1161	spin_lock_rd(&p->p_limit->p_spin);
	1162	lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
	1163	spin_unlock_rd(&p->p_limit->p_spin);
	1164
	1165	spin_lock_rd(&fdp->fd_spin);
	1166	if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) {
	1167	spin_unlock_rd(&fdp->fd_spin);
	1168	return (1);
	1169	}
	1170	last = min(fdp->fd_nfiles, lim);
	1171	fdnode = &fdp->fd_files[fdp->fd_freefile];
	1172	for (i = last - fdp->fd_freefile; --i >= 0; ++fdnode) {
	1173	if (fdnode->fp == NULL && --n <= 0) {
	1174	spin_unlock_rd(&fdp->fd_spin);
	1175	return (1);
	1176	}
	1177	}
	1178	spin_unlock_rd(&fdp->fd_spin);
	1179	return (0);
	1180	}
	1181
	1182	/*
	1183	* falloc:
	1184	* Create a new open file structure and reserve a file decriptor
	1185	* for the process that refers to it.
	1186	*
	1187	* Root creds are checked using p, or assumed if p is NULL. If
	1188	* resultfd is non-NULL then p must also be non-NULL. No file
	1189	* descriptor is reserved if resultfd is NULL.
	1190	*
	1191	* A file pointer with a refcount of 1 is returned. Note that the
	1192	* file pointer is NOT associated with the descriptor. If falloc
	1193	* returns success, fsetfd() MUST be called to either associate the
	1194	* file pointer or clear the reservation.
	1195	*
	1196	* MPSAFE
	1197	*/
	1198	int
	1199	falloc(struct proc p, struct file resultfp, int resultfd)
	1200	{
	1201	static struct timeval lastfail;
	1202	static int curfail;
	1203	struct file *fp;
	1204	int error;
	1205
	1206	fp = NULL;
	1207
	1208	/*
	1209	* Handle filetable full issues and root overfill.
	1210	*/
	1211	if (nfiles >= maxfiles - maxfilesrootres &&
	1212	((p && p->p_ucred->cr_ruid != 0) \|\| nfiles >= maxfiles)) {
	1213	if (ppsratecheck(&lastfail, &curfail, 1)) {
	1214	kprintf("kern.maxfiles limit exceeded by uid %d, please see tuning(7).\n",
	1215	(p ? p->p_ucred->cr_ruid : -1));
	1216	}
	1217	error = ENFILE;
	1218	goto done;
	1219	}
	1220
	1221	/*
	1222	* Allocate a new file descriptor.
	1223	*/
	1224	fp = kmalloc(sizeof(struct file), M_FILE, M_WAITOK \| M_ZERO);
	1225	spin_init(&fp->f_spin);
	1226	fp->f_count = 1;
	1227	fp->f_ops = &badfileops;
	1228	fp->f_seqcount = 1;
	1229	if (p)
	1230	fp->f_cred = crhold(p->p_ucred);
	1231	else
	1232	fp->f_cred = crhold(proc0.p_ucred);
	1233	spin_lock_wr(&filehead_spin);
	1234	nfiles++;
	1235	LIST_INSERT_HEAD(&filehead, fp, f_list);
	1236	spin_unlock_wr(&filehead_spin);
	1237	if (resultfd) {
	1238	if ((error = fdalloc(p, 0, resultfd)) != 0) {
	1239	fdrop(fp);
	1240	fp = NULL;
	1241	}
	1242	} else {
	1243	error = 0;
	1244	}
	1245	done:
	1246	*resultfp = fp;
	1247	return (error);
	1248	}
	1249
	1250	/*
	1251	* MPSAFE
	1252	*/
	1253	static
	1254	int
	1255	checkfpclosed(struct filedesc fdp, int fd, struct file fp)
	1256	{
	1257	int error;
	1258
	1259	spin_lock_rd(&fdp->fd_spin);
	1260	if ((unsigned) fd >= fdp->fd_nfiles \|\| fp != fdp->fd_files[fd].fp)
	1261	error = EBADF;
	1262	else
	1263	error = 0;
	1264	spin_unlock_rd(&fdp->fd_spin);
	1265	return (error);
	1266	}
	1267
	1268	/*
	1269	* Associate a file pointer with a previously reserved file descriptor.
	1270	* This function always succeeds.
	1271	*
	1272	* If fp is NULL, the file descriptor is returned to the pool.
	1273	*/
	1274
	1275	/*
	1276	* MPSAFE (exclusive spinlock must be held on call)
	1277	*/
	1278	static void
	1279	fsetfd_locked(struct filedesc fdp, struct file fp, int fd)
	1280	{
	1281	KKASSERT((unsigned)fd < fdp->fd_nfiles);
	1282	KKASSERT(fdp->fd_files[fd].reserved != 0);
	1283	if (fp) {
	1284	fhold(fp);
	1285	fdp->fd_files[fd].fp = fp;
	1286	fdp->fd_files[fd].reserved = 0;
	1287	if (fp->f_type == DTYPE_KQUEUE) {
	1288	if (fdp->fd_knlistsize < 0)
	1289	fdp->fd_knlistsize = 0;
	1290	}
	1291	} else {
	1292	fdp->fd_files[fd].reserved = 0;
	1293	fdreserve_locked(fdp, fd, -1);
	1294	fdfixup_locked(fdp, fd);
	1295	}
	1296	}
	1297
	1298	/*
	1299	* MPSAFE
	1300	*/
	1301	void
	1302	fsetfd(struct proc p, struct file fp, int fd)
	1303	{
	1304	struct filedesc *fdp = p->p_fd;
	1305
	1306	spin_lock_wr(&fdp->fd_spin);
	1307	fsetfd_locked(fdp, fp, fd);
	1308	spin_unlock_wr(&fdp->fd_spin);
	1309	}
	1310
	1311	/*
	1312	* MPSAFE (exclusive spinlock must be held on call)
	1313	*/
	1314	static
	1315	struct file *
	1316	funsetfd_locked(struct filedesc *fdp, int fd)
	1317	{
	1318	struct file *fp;
	1319
	1320	if ((unsigned)fd >= fdp->fd_nfiles)
	1321	return (NULL);
	1322	if ((fp = fdp->fd_files[fd].fp) == NULL)
	1323	return (NULL);
	1324	fdp->fd_files[fd].fp = NULL;
	1325	fdp->fd_files[fd].fileflags = 0;
	1326
	1327	fdreserve_locked(fdp, fd, -1);
	1328	fdfixup_locked(fdp, fd);
	1329	return(fp);
	1330	}
	1331
	1332	/*
	1333	* MPSAFE
	1334	*/
	1335	int
	1336	fgetfdflags(struct filedesc fdp, int fd, int flagsp)
	1337	{
	1338	int error;
	1339
	1340	spin_lock_rd(&fdp->fd_spin);
	1341	if (((u_int)fd) >= fdp->fd_nfiles) {
	1342	error = EBADF;
	1343	} else if (fdp->fd_files[fd].fp == NULL) {
	1344	error = EBADF;
	1345	} else {
	1346	*flagsp = fdp->fd_files[fd].fileflags;
	1347	error = 0;
	1348	}
	1349	spin_unlock_rd(&fdp->fd_spin);
	1350	return (error);
	1351	}
	1352
	1353	/*
	1354	* MPSAFE
	1355	*/
	1356	int
	1357	fsetfdflags(struct filedesc *fdp, int fd, int add_flags)
	1358	{
	1359	int error;
	1360
	1361	spin_lock_wr(&fdp->fd_spin);
	1362	if (((u_int)fd) >= fdp->fd_nfiles) {
	1363	error = EBADF;
	1364	} else if (fdp->fd_files[fd].fp == NULL) {
	1365	error = EBADF;
	1366	} else {
	1367	fdp->fd_files[fd].fileflags \|= add_flags;
	1368	error = 0;
	1369	}
	1370	spin_unlock_wr(&fdp->fd_spin);
	1371	return (error);
	1372	}
	1373
	1374	/*
	1375	* MPSAFE
	1376	*/
	1377	int
	1378	fclrfdflags(struct filedesc *fdp, int fd, int rem_flags)
	1379	{
	1380	int error;
	1381
	1382	spin_lock_wr(&fdp->fd_spin);
	1383	if (((u_int)fd) >= fdp->fd_nfiles) {
	1384	error = EBADF;
	1385	} else if (fdp->fd_files[fd].fp == NULL) {
	1386	error = EBADF;
	1387	} else {
	1388	fdp->fd_files[fd].fileflags &= ~rem_flags;
	1389	error = 0;
	1390	}
	1391	spin_unlock_wr(&fdp->fd_spin);
	1392	return (error);
	1393	}
	1394
	1395	void
	1396	fsetcred(struct file fp, struct ucred cr)
	1397	{
	1398	crhold(cr);
	1399	crfree(fp->f_cred);
	1400	fp->f_cred = cr;
	1401	}
	1402
	1403	/*
	1404	* Free a file descriptor.
	1405	*/
	1406	static
	1407	void
	1408	ffree(struct file *fp)
	1409	{
	1410	KASSERT((fp->f_count == 0), ("ffree: fp_fcount not 0!"));
	1411	spin_lock_wr(&filehead_spin);
	1412	LIST_REMOVE(fp, f_list);
	1413	nfiles--;
	1414	spin_unlock_wr(&filehead_spin);
	1415	crfree(fp->f_cred);
	1416	if (fp->f_nchandle.ncp)
	1417	cache_drop(&fp->f_nchandle);
	1418	kfree(fp, M_FILE);
	1419	}
	1420
	1421	/*
	1422	* called from init_main, initialize filedesc0 for proc0.
	1423	*/
	1424	void
	1425	fdinit_bootstrap(struct proc p0, struct filedesc fdp0, int cmask)
	1426	{
	1427	p0->p_fd = fdp0;
	1428	p0->p_fdtol = NULL;
	1429	fdp0->fd_refcnt = 1;
	1430	fdp0->fd_cmask = cmask;
	1431	fdp0->fd_files = fdp0->fd_builtin_files;
	1432	fdp0->fd_nfiles = NDFILE;
	1433	fdp0->fd_lastfile = -1;
	1434	spin_init(&fdp0->fd_spin);
	1435	}
	1436
	1437	/*
	1438	* Build a new filedesc structure.
	1439	*
	1440	* NOT MPSAFE (vref)
	1441	*/
	1442	struct filedesc *
	1443	fdinit(struct proc *p)
	1444	{
	1445	struct filedesc *newfdp;
	1446	struct filedesc *fdp = p->p_fd;
	1447
	1448	newfdp = kmalloc(sizeof(struct filedesc), M_FILEDESC, M_WAITOK\|M_ZERO);
	1449	spin_lock_rd(&fdp->fd_spin);
	1450	if (fdp->fd_cdir) {
	1451	newfdp->fd_cdir = fdp->fd_cdir;
	1452	vref(newfdp->fd_cdir);
	1453	cache_copy(&fdp->fd_ncdir, &newfdp->fd_ncdir);
	1454	}
	1455
	1456	/*
	1457	* rdir may not be set in e.g. proc0 or anything vm_fork'd off of
	1458	* proc0, but should unconditionally exist in other processes.
	1459	*/
	1460	if (fdp->fd_rdir) {
	1461	newfdp->fd_rdir = fdp->fd_rdir;
	1462	vref(newfdp->fd_rdir);
	1463	cache_copy(&fdp->fd_nrdir, &newfdp->fd_nrdir);
	1464	}
	1465	if (fdp->fd_jdir) {
	1466	newfdp->fd_jdir = fdp->fd_jdir;
	1467	vref(newfdp->fd_jdir);
	1468	cache_copy(&fdp->fd_njdir, &newfdp->fd_njdir);
	1469	}
	1470	spin_unlock_rd(&fdp->fd_spin);
	1471
	1472	/* Create the file descriptor table. */
	1473	newfdp->fd_refcnt = 1;
	1474	newfdp->fd_cmask = cmask;
	1475	newfdp->fd_files = newfdp->fd_builtin_files;
	1476	newfdp->fd_nfiles = NDFILE;
	1477	newfdp->fd_knlistsize = -1;
	1478	newfdp->fd_lastfile = -1;
	1479	spin_init(&newfdp->fd_spin);
	1480
	1481	return (newfdp);
	1482	}
	1483
	1484	/*
	1485	* Share a filedesc structure.
	1486	*
	1487	* MPSAFE
	1488	*/
	1489	struct filedesc *
	1490	fdshare(struct proc *p)
	1491	{
	1492	struct filedesc *fdp;
	1493
	1494	fdp = p->p_fd;
	1495	spin_lock_wr(&fdp->fd_spin);
	1496	fdp->fd_refcnt++;
	1497	spin_unlock_wr(&fdp->fd_spin);
	1498	return (fdp);
	1499	}
	1500
	1501	/*
	1502	* Copy a filedesc structure.
	1503	*
	1504	* MPSAFE
	1505	*/
	1506	struct filedesc *
	1507	fdcopy(struct proc *p)
	1508	{
	1509	struct filedesc *fdp = p->p_fd;
	1510	struct filedesc *newfdp;
	1511	struct fdnode *fdnode;
	1512	int i;
	1513	int ni;
	1514
	1515	/*
	1516	* Certain daemons might not have file descriptors.
	1517	*/
	1518	if (fdp == NULL)
	1519	return (NULL);
	1520
	1521	/*
	1522	* Allocate the new filedesc and fd_files[] array. This can race
	1523	* with operations by other threads on the fdp so we have to be
	1524	* careful.
	1525	*/
	1526	newfdp = kmalloc(sizeof(struct filedesc), M_FILEDESC, M_WAITOK \| M_ZERO);
	1527	again:
	1528	spin_lock_rd(&fdp->fd_spin);
	1529	if (fdp->fd_lastfile < NDFILE) {
	1530	newfdp->fd_files = newfdp->fd_builtin_files;
	1531	i = NDFILE;
	1532	} else {
	1533	/*
	1534	* We have to allocate (N^2-1) entries for our in-place
	1535	* binary tree. Allow the table to shrink.
	1536	*/
	1537	i = fdp->fd_nfiles;
	1538	ni = (i - 1) / 2;
	1539	while (ni > fdp->fd_lastfile && ni > NDFILE) {
	1540	i = ni;
	1541	ni = (i - 1) / 2;
	1542	}
	1543	spin_unlock_rd(&fdp->fd_spin);
	1544	newfdp->fd_files = kmalloc(i * sizeof(struct fdnode),
	1545	M_FILEDESC, M_WAITOK \| M_ZERO);
	1546
	1547	/*
	1548	* Check for race, retry
	1549	*/
	1550	spin_lock_rd(&fdp->fd_spin);
	1551	if (i <= fdp->fd_lastfile) {
	1552	spin_unlock_rd(&fdp->fd_spin);
	1553	kfree(newfdp->fd_files, M_FILEDESC);
	1554	goto again;
	1555	}
	1556	}
	1557
	1558	/*
	1559	* Dup the remaining fields. vref() and cache_hold() can be
	1560	* safely called while holding the read spinlock on fdp.
	1561	*
	1562	* The read spinlock on fdp is still being held.
	1563	*
	1564	* NOTE: vref and cache_hold calls for the case where the vnode
	1565	* or cache entry already has at least one ref may be called
	1566	* while holding spin locks.
	1567	*/
	1568	if ((newfdp->fd_cdir = fdp->fd_cdir) != NULL) {
	1569	vref(newfdp->fd_cdir);
	1570	cache_copy(&fdp->fd_ncdir, &newfdp->fd_ncdir);
	1571	}
	1572	/*
	1573	* We must check for fd_rdir here, at least for now because
	1574	* the init process is created before we have access to the
	1575	* rootvode to take a reference to it.
	1576	*/
	1577	if ((newfdp->fd_rdir = fdp->fd_rdir) != NULL) {
	1578	vref(newfdp->fd_rdir);
	1579	cache_copy(&fdp->fd_nrdir, &newfdp->fd_nrdir);
	1580	}
	1581	if ((newfdp->fd_jdir = fdp->fd_jdir) != NULL) {
	1582	vref(newfdp->fd_jdir);
	1583	cache_copy(&fdp->fd_njdir, &newfdp->fd_njdir);
	1584	}
	1585	newfdp->fd_refcnt = 1;
	1586	newfdp->fd_nfiles = i;
	1587	newfdp->fd_lastfile = fdp->fd_lastfile;
	1588	newfdp->fd_freefile = fdp->fd_freefile;
	1589	newfdp->fd_cmask = fdp->fd_cmask;
	1590	newfdp->fd_knlist = NULL;
	1591	newfdp->fd_knlistsize = -1;
	1592	newfdp->fd_knhash = NULL;
	1593	newfdp->fd_knhashmask = 0;
	1594	spin_init(&newfdp->fd_spin);
	1595
	1596	/*
	1597	* Copy the descriptor table through (i). This also copies the
	1598	* allocation state. Then go through and ref the file pointers
	1599	* and clean up any KQ descriptors.
	1600	*
	1601	* kq descriptors cannot be copied. Since we haven't ref'd the
	1602	* copied files yet we can ignore the return value from funsetfd().
	1603	*
	1604	* The read spinlock on fdp is still being held.
	1605	*/
	1606	bcopy(fdp->fd_files, newfdp->fd_files, i * sizeof(struct fdnode));
	1607	for (i = 0 ; i < newfdp->fd_nfiles; ++i) {
	1608	fdnode = &newfdp->fd_files[i];
	1609	if (fdnode->reserved) {
	1610	fdreserve_locked(newfdp, i, -1);
	1611	fdnode->reserved = 0;
	1612	fdfixup_locked(newfdp, i);
	1613	} else if (fdnode->fp) {
	1614	if (fdnode->fp->f_type == DTYPE_KQUEUE) {
	1615	(void)funsetfd_locked(newfdp, i);
	1616	} else {
	1617	fhold(fdnode->fp);
	1618	}
	1619	}
	1620	}
	1621	spin_unlock_rd(&fdp->fd_spin);
	1622	return (newfdp);
	1623	}
	1624
	1625	/*
	1626	* Release a filedesc structure.
	1627	*
	1628	* NOT MPSAFE (MPSAFE for refs > 1, but the final cleanup code is not MPSAFE)
	1629	*/
	1630	void
	1631	fdfree(struct proc *p)
	1632	{
	1633	/* Take any thread of p */
	1634	struct thread *td = LIST_FIRST(&p->p_lwps)->lwp_thread;
	1635	struct filedesc *fdp = p->p_fd;
	1636	struct fdnode *fdnode;
	1637	int i;
	1638	struct filedesc_to_leader *fdtol;
	1639	struct file *fp;
	1640	struct vnode *vp;
	1641	struct flock lf;
	1642
	1643	/* Certain daemons might not have file descriptors. */
	1644	if (fdp == NULL)
	1645	return;
	1646
	1647	/*
	1648	* Severe messing around to follow
	1649	*/
	1650	spin_lock_wr(&fdp->fd_spin);
	1651
	1652	/* Check for special need to clear POSIX style locks */
	1653	fdtol = p->p_fdtol;
	1654	if (fdtol != NULL) {
	1655	KASSERT(fdtol->fdl_refcount > 0,
	1656	("filedesc_to_refcount botch: fdl_refcount=%d",
	1657	fdtol->fdl_refcount));
	1658	if (fdtol->fdl_refcount == 1 &&
	1659	(p->p_leader->p_flag & P_ADVLOCK) != 0) {
	1660	for (i = 0; i <= fdp->fd_lastfile; ++i) {
	1661	fdnode = &fdp->fd_files[i];
	1662	if (fdnode->fp == NULL \|\|
	1663	fdnode->fp->f_type != DTYPE_VNODE) {
	1664	continue;
	1665	}
	1666	fp = fdnode->fp;
	1667	fhold(fp);
	1668	spin_unlock_wr(&fdp->fd_spin);
	1669
	1670	lf.l_whence = SEEK_SET;
	1671	lf.l_start = 0;
	1672	lf.l_len = 0;
	1673	lf.l_type = F_UNLCK;
	1674	vp = (struct vnode *)fp->f_data;
	1675	(void) VOP_ADVLOCK(vp,
	1676	(caddr_t)p->p_leader,
	1677	F_UNLCK,
	1678	&lf,
	1679	F_POSIX);
	1680	fdrop(fp);
	1681	spin_lock_wr(&fdp->fd_spin);
	1682	}
	1683	}
	1684	retry:
	1685	if (fdtol->fdl_refcount == 1) {
	1686	if (fdp->fd_holdleaderscount > 0 &&
	1687	(p->p_leader->p_flag & P_ADVLOCK) != 0) {
	1688	/*
	1689	* close() or do_dup() has cleared a reference
	1690	* in a shared file descriptor table.
	1691	*/
	1692	fdp->fd_holdleaderswakeup = 1;
	1693	msleep(&fdp->fd_holdleaderscount,
	1694	&fdp->fd_spin, 0, "fdlhold", 0);
	1695	goto retry;
	1696	}
	1697	if (fdtol->fdl_holdcount > 0) {
	1698	/*
	1699	* Ensure that fdtol->fdl_leader
	1700	* remains valid in closef().
	1701	*/
	1702	fdtol->fdl_wakeup = 1;
	1703	msleep(fdtol, &fdp->fd_spin, 0, "fdlhold", 0);
	1704	goto retry;
	1705	}
	1706	}
	1707	fdtol->fdl_refcount--;
	1708	if (fdtol->fdl_refcount == 0 &&
	1709	fdtol->fdl_holdcount == 0) {
	1710	fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
	1711	fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
	1712	} else {
	1713	fdtol = NULL;
	1714	}
	1715	p->p_fdtol = NULL;
	1716	if (fdtol != NULL) {
	1717	spin_unlock_wr(&fdp->fd_spin);
	1718	kfree(fdtol, M_FILEDESC_TO_LEADER);
	1719	spin_lock_wr(&fdp->fd_spin);
	1720	}
	1721	}
	1722	if (--fdp->fd_refcnt > 0) {
	1723	spin_unlock_wr(&fdp->fd_spin);
	1724	return;
	1725	}
	1726	spin_unlock_wr(&fdp->fd_spin);
	1727
	1728	/*
	1729	* we are the last reference to the structure, we can
	1730	* safely assume it will not change out from under us.
	1731	*/
	1732	for (i = 0; i <= fdp->fd_lastfile; ++i) {
	1733	if (fdp->fd_files[i].fp)
	1734	closef(fdp->fd_files[i].fp, td);
	1735	}
	1736	if (fdp->fd_files != fdp->fd_builtin_files)
	1737	kfree(fdp->fd_files, M_FILEDESC);
	1738	if (fdp->fd_cdir) {
	1739	cache_drop(&fdp->fd_ncdir);
	1740	vrele(fdp->fd_cdir);
	1741	}
	1742	if (fdp->fd_rdir) {
	1743	cache_drop(&fdp->fd_nrdir);
	1744	vrele(fdp->fd_rdir);
	1745	}
	1746	if (fdp->fd_jdir) {
	1747	cache_drop(&fdp->fd_njdir);
	1748	vrele(fdp->fd_jdir);
	1749	}
	1750	if (fdp->fd_knlist)
	1751	kfree(fdp->fd_knlist, M_KQUEUE);
	1752	if (fdp->fd_knhash)
	1753	kfree(fdp->fd_knhash, M_KQUEUE);
	1754	kfree(fdp, M_FILEDESC);
	1755	}
	1756
	1757	/*
	1758	* Retrieve and reference the file pointer associated with a descriptor.
	1759	*
	1760	* MPSAFE
	1761	*/
	1762	struct file *
	1763	holdfp(struct filedesc *fdp, int fd, int flag)
	1764	{
	1765	struct file* fp;
	1766
	1767	spin_lock_rd(&fdp->fd_spin);
	1768	if (((u_int)fd) >= fdp->fd_nfiles) {
	1769	fp = NULL;
	1770	goto done;
	1771	}
	1772	if ((fp = fdp->fd_files[fd].fp) == NULL)
	1773	goto done;
	1774	if ((fp->f_flag & flag) == 0 && flag != -1) {
	1775	fp = NULL;
	1776	goto done;
	1777	}
	1778	fhold(fp);
	1779	done:
	1780	spin_unlock_rd(&fdp->fd_spin);
	1781	return (fp);
	1782	}
	1783
	1784	/*
	1785	* holdsock() - load the struct file pointer associated
	1786	* with a socket into *fpp. If an error occurs, non-zero
	1787	* will be returned and *fpp will be set to NULL.
	1788	*
	1789	* MPSAFE
	1790	*/
	1791	int
	1792	holdsock(struct filedesc fdp, int fd, struct file *fpp)
	1793	{
	1794	struct file *fp;
	1795	int error;
	1796
	1797	spin_lock_rd(&fdp->fd_spin);
	1798	if ((unsigned)fd >= fdp->fd_nfiles) {
	1799	error = EBADF;
	1800	fp = NULL;
	1801	goto done;
	1802	}
	1803	if ((fp = fdp->fd_files[fd].fp) == NULL) {
	1804	error = EBADF;
	1805	goto done;
	1806	}
	1807	if (fp->f_type != DTYPE_SOCKET) {
	1808	error = ENOTSOCK;
	1809	goto done;
	1810	}
	1811	fhold(fp);
	1812	error = 0;
	1813	done:
	1814	spin_unlock_rd(&fdp->fd_spin);
	1815	*fpp = fp;
	1816	return (error);
	1817	}
	1818
	1819	/*
	1820	* Convert a user file descriptor to a held file pointer.
	1821	*
	1822	* MPSAFE
	1823	*/
	1824	int
	1825	holdvnode(struct filedesc fdp, int fd, struct file *fpp)
	1826	{
	1827	struct file *fp;
	1828	int error;
	1829
	1830	spin_lock_rd(&fdp->fd_spin);
	1831	if ((unsigned)fd >= fdp->fd_nfiles) {
	1832	error = EBADF;
	1833	fp = NULL;
	1834	goto done;
	1835	}
	1836	if ((fp = fdp->fd_files[fd].fp) == NULL) {
	1837	error = EBADF;
	1838	goto done;
	1839	}
	1840	if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) {
	1841	error = EINVAL;
	1842	goto done;
	1843	}
	1844	fhold(fp);
	1845	error = 0;
	1846	done:
	1847	spin_unlock_rd(&fdp->fd_spin);
	1848	*fpp = fp;
	1849	return (error);
	1850	}
	1851
	1852	/*
	1853	* For setugid programs, we don't want to people to use that setugidness
	1854	* to generate error messages which write to a file which otherwise would
	1855	* otherwise be off-limits to the process.
	1856	*
	1857	* This is a gross hack to plug the hole. A better solution would involve
	1858	* a special vop or other form of generalized access control mechanism. We
	1859	* go ahead and just reject all procfs file systems accesses as dangerous.
	1860	*
	1861	* Since setugidsafety calls this only for fd 0, 1 and 2, this check is
	1862	* sufficient. We also don't for check setugidness since we know we are.
	1863	*/
	1864	static int
	1865	is_unsafe(struct file *fp)
	1866	{
	1867	if (fp->f_type == DTYPE_VNODE &&
	1868	((struct vnode *)(fp->f_data))->v_tag == VT_PROCFS)
	1869	return (1);
	1870	return (0);
	1871	}
	1872
	1873	/*
	1874	* Make this setguid thing safe, if at all possible.
	1875	*
	1876	* NOT MPSAFE - scans fdp without spinlocks, calls knote_fdclose()
	1877	*/
	1878	void
	1879	setugidsafety(struct proc *p)
	1880	{
	1881	/* Take any thread of p */
	1882	struct thread *td = LIST_FIRST(&p->p_lwps)->lwp_thread;
	1883	struct filedesc *fdp = p->p_fd;
	1884	int i;
	1885
	1886	/* Certain daemons might not have file descriptors. */
	1887	if (fdp == NULL)
	1888	return;
	1889
	1890	/*
	1891	* note: fdp->fd_files may be reallocated out from under us while
	1892	* we are blocked in a close. Be careful!
	1893	*/
	1894	for (i = 0; i <= fdp->fd_lastfile; i++) {
	1895	if (i > 2)
	1896	break;
	1897	if (fdp->fd_files[i].fp && is_unsafe(fdp->fd_files[i].fp)) {
	1898	struct file *fp;
	1899
	1900	if (i < fdp->fd_knlistsize)
	1901	knote_fdclose(p, i);
	1902	/*
	1903	* NULL-out descriptor prior to close to avoid
	1904	* a race while close blocks.
	1905	*/
	1906	if ((fp = funsetfd_locked(fdp, i)) != NULL)
	1907	closef(fp, td);
	1908	}
	1909	}
	1910	}
	1911
	1912	/*
	1913	* Close any files on exec?
	1914	*
	1915	* NOT MPSAFE - scans fdp without spinlocks, calls knote_fdclose()
	1916	*/
	1917	void
	1918	fdcloseexec(struct proc *p)
	1919	{
	1920	/* Take any thread of p */
	1921	struct thread *td = LIST_FIRST(&p->p_lwps)->lwp_thread;
	1922	struct filedesc *fdp = p->p_fd;
	1923	int i;
	1924
	1925	/* Certain daemons might not have file descriptors. */
	1926	if (fdp == NULL)
	1927	return;
	1928
	1929	/*
	1930	* We cannot cache fd_files since operations may block and rip
	1931	* them out from under us.
	1932	*/
	1933	for (i = 0; i <= fdp->fd_lastfile; i++) {
	1934	if (fdp->fd_files[i].fp != NULL &&
	1935	(fdp->fd_files[i].fileflags & UF_EXCLOSE)) {
	1936	struct file *fp;
	1937
	1938	if (i < fdp->fd_knlistsize)
	1939	knote_fdclose(p, i);
	1940	/*
	1941	* NULL-out descriptor prior to close to avoid
	1942	* a race while close blocks.
	1943	*/
	1944	if ((fp = funsetfd_locked(fdp, i)) != NULL)
	1945	closef(fp, td);
	1946	}
	1947	}
	1948	}
	1949
	1950	/*
	1951	* It is unsafe for set[ug]id processes to be started with file
	1952	* descriptors 0..2 closed, as these descriptors are given implicit
	1953	* significance in the Standard C library. fdcheckstd() will create a
	1954	* descriptor referencing /dev/null for each of stdin, stdout, and
	1955	* stderr that is not already open.
	1956	*
	1957	* NOT MPSAFE - calls falloc, vn_open, etc
	1958	*/
	1959	int
	1960	fdcheckstd(struct proc *p)
	1961	{
	1962	struct nlookupdata nd;
	1963	struct filedesc *fdp;
	1964	struct file *fp;
	1965	register_t retval;
	1966	int i, error, flags, devnull;
	1967
	1968	fdp = p->p_fd;
	1969	if (fdp == NULL)
	1970	return (0);
	1971	devnull = -1;
	1972	error = 0;
	1973	for (i = 0; i < 3; i++) {
	1974	if (fdp->fd_files[i].fp != NULL)
	1975	continue;
	1976	if (devnull < 0) {
	1977	if ((error = falloc(p, &fp, &devnull)) != 0)
	1978	break;
	1979
	1980	error = nlookup_init(&nd, "/dev/null", UIO_SYSSPACE,
	1981	NLC_FOLLOW\|NLC_LOCKVP);
	1982	flags = FREAD \| FWRITE;
	1983	if (error == 0)
	1984	error = vn_open(&nd, fp, flags, 0);
	1985	if (error == 0)
	1986	fsetfd(p, fp, devnull);
	1987	else
	1988	fsetfd(p, NULL, devnull);
	1989	fdrop(fp);
	1990	nlookup_done(&nd);
	1991	if (error)
	1992	break;
	1993	KKASSERT(i == devnull);
	1994	} else {
	1995	error = kern_dup(DUP_FIXED, devnull, i, &retval);
	1996	if (error != 0)
	1997	break;
	1998	}
	1999	}
	2000	return (error);
	2001	}
	2002
	2003	/*
	2004	* Internal form of close.
	2005	* Decrement reference count on file structure.
	2006	* Note: td and/or p may be NULL when closing a file
	2007	* that was being passed in a message.
	2008	*
	2009	* MPALMOSTSAFE - acquires mplock for VOP operations
	2010	*/
	2011	int
	2012	closef(struct file fp, struct thread td)
	2013	{
	2014	struct vnode *vp;
	2015	struct flock lf;
	2016	struct filedesc_to_leader *fdtol;
	2017	struct proc *p;
	2018
	2019	if (fp == NULL)
	2020	return (0);
	2021	if (td == NULL) {
	2022	td = curthread;
	2023	p = NULL; /* allow no proc association */
	2024	} else {
	2025	p = td->td_proc; /* can also be NULL */
	2026	}
	2027	/*
	2028	* POSIX record locking dictates that any close releases ALL
	2029	* locks owned by this process. This is handled by setting
	2030	* a flag in the unlock to free ONLY locks obeying POSIX
	2031	* semantics, and not to free BSD-style file locks.
	2032	* If the descriptor was in a message, POSIX-style locks
	2033	* aren't passed with the descriptor.
	2034	*/
	2035	if (p != NULL && fp->f_type == DTYPE_VNODE &&
	2036	(((struct vnode *)fp->f_data)->v_flag & VMAYHAVELOCKS)
	2037	) {
	2038	get_mplock();
	2039	if ((p->p_leader->p_flag & P_ADVLOCK) != 0) {
	2040	lf.l_whence = SEEK_SET;
	2041	lf.l_start = 0;
	2042	lf.l_len = 0;
	2043	lf.l_type = F_UNLCK;
	2044	vp = (struct vnode *)fp->f_data;
	2045	(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
	2046	&lf, F_POSIX);
	2047	}
	2048	fdtol = p->p_fdtol;
	2049	if (fdtol != NULL) {
	2050	/*
	2051	* Handle special case where file descriptor table
	2052	* is shared between multiple process leaders.
	2053	*/
	2054	for (fdtol = fdtol->fdl_next;
	2055	fdtol != p->p_fdtol;
	2056	fdtol = fdtol->fdl_next) {
	2057	if ((fdtol->fdl_leader->p_flag &
	2058	P_ADVLOCK) == 0)
	2059	continue;
	2060	fdtol->fdl_holdcount++;
	2061	lf.l_whence = SEEK_SET;
	2062	lf.l_start = 0;
	2063	lf.l_len = 0;
	2064	lf.l_type = F_UNLCK;
	2065	vp = (struct vnode *)fp->f_data;
	2066	(void) VOP_ADVLOCK(vp,
	2067	(caddr_t)fdtol->fdl_leader,
	2068	F_UNLCK, &lf, F_POSIX);
	2069	fdtol->fdl_holdcount--;
	2070	if (fdtol->fdl_holdcount == 0 &&
	2071	fdtol->fdl_wakeup != 0) {
	2072	fdtol->fdl_wakeup = 0;
	2073	wakeup(fdtol);
	2074	}
	2075	}
	2076	}
	2077	rel_mplock();
	2078	}
	2079	return (fdrop(fp));
	2080	}
	2081
	2082	/*
	2083	* MPSAFE
	2084	*
	2085	* fhold() can only be called if f_count is already at least 1 (i.e. the
	2086	* caller of fhold() already has a reference to the file pointer in some
	2087	* manner or other).
	2088	*
	2089	* This is a rare case where callers are allowed to hold spinlocks, so
	2090	* we can't ourselves. Since we are not obtaining the fp spinlock,
	2091	* we have to use an atomic lock to interlock against fdrop().
	2092	*/
	2093	void
	2094	fhold(struct file *fp)
	2095	{
	2096	atomic_add_int(&fp->f_count, 1);
	2097	}
	2098
	2099	/*
	2100	* A spinlock is required to handle 1->0 transitions on f_count. We have
	2101	* to use atomic_sub_int so as not to race the atomic_add_int in fhold().
	2102	*
	2103	* MPALMOSTSAFE - acquires mplock for final close sequence
	2104	*/
	2105	int
	2106	fdrop(struct file *fp)
	2107	{
	2108	struct flock lf;
	2109	struct vnode *vp;
	2110	int error;
	2111
	2112	spin_lock_wr(&fp->f_spin);
	2113	atomic_subtract_int(&fp->f_count, 1);
	2114	if (fp->f_count > 0) {
	2115	spin_unlock_wr(&fp->f_spin);
	2116	return (0);
	2117	}
	2118	spin_unlock_wr(&fp->f_spin);
	2119
	2120	get_mplock();
	2121
	2122	/*
	2123	* The last reference has gone away, we own the fp structure free
	2124	* and clear.
	2125	*/
	2126	if (fp->f_count < 0)
	2127	panic("fdrop: count < 0");
	2128	if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE &&
	2129	(((struct vnode *)fp->f_data)->v_flag & VMAYHAVELOCKS)
	2130	) {
	2131	lf.l_whence = SEEK_SET;
	2132	lf.l_start = 0;
	2133	lf.l_len = 0;
	2134	lf.l_type = F_UNLCK;
	2135	vp = (struct vnode *)fp->f_data;
	2136	(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0);
	2137	}
	2138	if (fp->f_ops != &badfileops)
	2139	error = fo_close(fp);
	2140	else
	2141	error = 0;
	2142	ffree(fp);
	2143	rel_mplock();
	2144	return (error);
	2145	}
	2146
	2147	/*
	2148	* Apply an advisory lock on a file descriptor.
	2149	*
	2150	* Just attempt to get a record lock of the requested type on
	2151	* the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
	2152	*/
	2153	int
	2154	sys_flock(struct flock_args *uap)
	2155	{
	2156	struct proc *p = curproc;
	2157	struct file *fp;
	2158	struct vnode *vp;
	2159	struct flock lf;
	2160	int error;
	2161
	2162	if ((fp = holdfp(p->p_fd, uap->fd, -1)) == NULL)
	2163	return (EBADF);
	2164	if (fp->f_type != DTYPE_VNODE) {
	2165	error = EOPNOTSUPP;
	2166	goto done;
	2167	}
	2168	vp = (struct vnode *)fp->f_data;
	2169	lf.l_whence = SEEK_SET;
	2170	lf.l_start = 0;
	2171	lf.l_len = 0;
	2172	if (uap->how & LOCK_UN) {
	2173	lf.l_type = F_UNLCK;
	2174	fp->f_flag &= ~FHASLOCK;
	2175	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0);
	2176	goto done;
	2177	}
	2178	if (uap->how & LOCK_EX)
	2179	lf.l_type = F_WRLCK;
	2180	else if (uap->how & LOCK_SH)
	2181	lf.l_type = F_RDLCK;
	2182	else {
	2183	error = EBADF;
	2184	goto done;
	2185	}
	2186	fp->f_flag \|= FHASLOCK;
	2187	if (uap->how & LOCK_NB)
	2188	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, 0);
	2189	else
	2190	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_WAIT);
	2191	done:
	2192	fdrop(fp);
	2193	return (error);
	2194	}
	2195
	2196	/*
	2197	* File Descriptor pseudo-device driver (/dev/fd/).
	2198	*
	2199	* Opening minor device N dup()s the file (if any) connected to file
	2200	* descriptor N belonging to the calling process. Note that this driver
	2201	* consists of only the ``open()'' routine, because all subsequent
	2202	* references to this file will be direct to the other driver.
	2203	*/
	2204	/* ARGSUSED */
	2205	static int
	2206	fdopen(struct dev_open_args *ap)
	2207	{
	2208	thread_t td = curthread;
	2209
	2210	KKASSERT(td->td_lwp != NULL);
	2211
	2212	/*
	2213	* XXX Kludge: set curlwp->lwp_dupfd to contain the value of the
	2214	* the file descriptor being sought for duplication. The error
	2215	* return ensures that the vnode for this device will be released
	2216	* by vn_open. Open will detect this special error and take the
	2217	* actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
	2218	* will simply report the error.
	2219	*/
	2220	td->td_lwp->lwp_dupfd = minor(ap->a_head.a_dev);
	2221	return (ENODEV);
	2222	}
	2223
	2224	/*
	2225	* The caller has reserved the file descriptor dfd for us. On success we
	2226	* must fsetfd() it. On failure the caller will clean it up.
	2227	*
	2228	* NOT MPSAFE - isn't getting spinlocks, possibly other things
	2229	*/
	2230	int
	2231	dupfdopen(struct proc *p, int dfd, int sfd, int mode, int error)
	2232	{
	2233	struct filedesc *fdp = p->p_fd;
	2234	struct file *wfp;
	2235	struct file *xfp;
	2236
	2237	if ((wfp = holdfp(fdp, sfd, -1)) == NULL)
	2238	return (EBADF);
	2239
	2240	/*
	2241	* There are two cases of interest here.
	2242	*
	2243	* For ENODEV simply dup sfd to file descriptor dfd and return.
	2244	*
	2245	* For ENXIO steal away the file structure from sfd and store it
	2246	* dfd. sfd is effectively closed by this operation.
	2247	*
	2248	* Any other error code is just returned.
	2249	*/
	2250	switch (error) {
	2251	case ENODEV:
	2252	/*
	2253	* Check that the mode the file is being opened for is a
	2254	* subset of the mode of the existing descriptor.
	2255	*/
	2256	if (((mode & (FREAD\|FWRITE)) \| wfp->f_flag) != wfp->f_flag)
	2257	return (EACCES);
	2258	fdp->fd_files[dfd].fileflags = fdp->fd_files[sfd].fileflags;
	2259	fsetfd(p, wfp, dfd);
	2260	error = 0;
	2261	break;
	2262	case ENXIO:
	2263	/*
	2264	* Steal away the file pointer from dfd, and stuff it into indx.
	2265	*/
	2266	fdp->fd_files[dfd].fileflags = fdp->fd_files[sfd].fileflags;
	2267	fsetfd(p, wfp, dfd);
	2268	if ((xfp = funsetfd_locked(fdp, sfd)) != NULL)
	2269	fdrop(xfp);
	2270	KKASSERT(xfp == wfp); /* XXX MP RACE */
	2271	error = 0;
	2272	break;
	2273	default:
	2274	break;
	2275	}
	2276	fdrop(wfp);
	2277	return (error);
	2278	}
	2279
	2280	/*
	2281	* NOT MPSAFE - I think these refer to a common file descriptor table
	2282	* and we need to spinlock that to link fdtol in.
	2283	*/
	2284	struct filedesc_to_leader *
	2285	filedesc_to_leader_alloc(struct filedesc_to_leader *old,
	2286	struct proc *leader)
	2287	{
	2288	struct filedesc_to_leader *fdtol;
	2289
	2290	fdtol = kmalloc(sizeof(struct filedesc_to_leader),
	2291	M_FILEDESC_TO_LEADER, M_WAITOK);
	2292	fdtol->fdl_refcount = 1;
	2293	fdtol->fdl_holdcount = 0;
	2294	fdtol->fdl_wakeup = 0;
	2295	fdtol->fdl_leader = leader;
	2296	if (old != NULL) {
	2297	fdtol->fdl_next = old->fdl_next;
	2298	fdtol->fdl_prev = old;
	2299	old->fdl_next = fdtol;
	2300	fdtol->fdl_next->fdl_prev = fdtol;
	2301	} else {
	2302	fdtol->fdl_next = fdtol;
	2303	fdtol->fdl_prev = fdtol;
	2304	}
	2305	return fdtol;
	2306	}
	2307
	2308	/*
	2309	* Scan all file pointers in the system. The callback is made with
	2310	* both the master list spinlock held and the fp spinlock held,
	2311	* both exclusively.
	2312	*
	2313	* MPSAFE
	2314	*
	2315	* WARNING: both the filehead spinlock and the file pointer spinlock are
	2316	* held exclusively when the callback is made. The file pointer is not
	2317	* referenced.
	2318	*/
	2319	void
	2320	allfiles_scan_exclusive(int (callback)(struct file , void ), void data)
	2321	{
	2322	struct file *fp;
	2323	int res;
	2324
	2325	spin_lock_wr(&filehead_spin);
	2326	LIST_FOREACH(fp, &filehead, f_list) {
	2327	spin_lock_wr(&fp->f_spin);
	2328	res = callback(fp, data);
	2329	spin_unlock_wr(&fp->f_spin);
	2330	if (res < 0)
	2331	break;
	2332	}
	2333	spin_unlock_wr(&filehead_spin);
	2334	}
	2335
	2336	/*
	2337	* Get file structures.
	2338	*
	2339	* NOT MPSAFE - process list scan, SYSCTL_OUT (probably not mpsafe)
	2340	*/
	2341
	2342	struct sysctl_kern_file_info {
	2343	int count;
	2344	int error;
	2345	struct sysctl_req *req;
	2346	};
	2347
	2348	static int sysctl_kern_file_callback(struct proc p, void data);
	2349
	2350	static int
	2351	sysctl_kern_file(SYSCTL_HANDLER_ARGS)
	2352	{
	2353	struct sysctl_kern_file_info info;
	2354
	2355	/*
	2356	* Note: because the number of file descriptors is calculated
	2357	* in different ways for sizing vs returning the data,
	2358	* there is information leakage from the first loop. However,
	2359	* it is of a similar order of magnitude to the leakage from
	2360	* global system statistics such as kern.openfiles.
	2361	*
	2362	* When just doing a count, note that we cannot just count
	2363	* the elements and add f_count via the filehead list because
	2364	* threaded processes share their descriptor table and f_count might
	2365	* still be '1' in that case.
	2366	*
	2367	* Since the SYSCTL op can block, we must hold the process to
	2368	* prevent it being ripped out from under us either in the
	2369	* file descriptor loop or in the greater LIST_FOREACH. The
	2370	* process may be in varying states of disrepair. If the process
	2371	* is in SZOMB we may have caught it just as it is being removed
	2372	* from the allproc list, we must skip it in that case to maintain
	2373	* an unbroken chain through the allproc list.
	2374	*/
	2375	info.count = 0;
	2376	info.error = 0;
	2377	info.req = req;
	2378	allproc_scan(sysctl_kern_file_callback, &info);
	2379
	2380	/*
	2381	* When just calculating the size, overestimate a bit to try to
	2382	* prevent system activity from causing the buffer-fill call
	2383	* to fail later on.
	2384	*/
	2385	if (req->oldptr == NULL) {
	2386	info.count = (info.count + 16) + (info.count / 10);
	2387	info.error = SYSCTL_OUT(req, NULL,
	2388	info.count * sizeof(struct kinfo_file));
	2389	}
	2390	return (info.error);
	2391	}
	2392
	2393	static int
	2394	sysctl_kern_file_callback(struct proc p, void data)
	2395	{
	2396	struct sysctl_kern_file_info *info = data;
	2397	struct kinfo_file kf;
	2398	struct filedesc *fdp;
	2399	struct file *fp;
	2400	uid_t uid;
	2401	int n;
	2402
	2403	if (p->p_stat == SIDL \|\| (p->p_flag & P_ZOMBIE))
	2404	return(0);
	2405	if (!PRISON_CHECK(info->req->td->td_proc->p_ucred, p->p_ucred) != 0)
	2406	return(0);
	2407	if ((fdp = p->p_fd) == NULL)
	2408	return(0);
	2409	spin_lock_rd(&fdp->fd_spin);
	2410	for (n = 0; n < fdp->fd_nfiles; ++n) {
	2411	if ((fp = fdp->fd_files[n].fp) == NULL)
	2412	continue;
	2413	if (info->req->oldptr == NULL) {
	2414	++info->count;
	2415	} else {
	2416	uid = p->p_ucred ? p->p_ucred->cr_uid : -1;
	2417	kcore_make_file(&kf, fp, p->p_pid, uid, n);
	2418	spin_unlock_rd(&fdp->fd_spin);
	2419	info->error = SYSCTL_OUT(info->req, &kf, sizeof(kf));
	2420	spin_lock_rd(&fdp->fd_spin);
	2421	if (info->error)
	2422	break;
	2423	}
	2424	}
	2425	spin_unlock_rd(&fdp->fd_spin);
	2426	if (info->error)
	2427	return(-1);
	2428	return(0);
	2429	}
	2430
	2431	SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE\|CTLFLAG_RD,
	2432	0, 0, sysctl_kern_file, "S,file", "Entire file table");
	2433
	2434	SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
	2435	&maxfilesperproc, 0, "Maximum files allowed open per process");
	2436
	2437	SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
	2438	&maxfiles, 0, "Maximum number of files");
	2439
	2440	SYSCTL_INT(_kern, OID_AUTO, maxfilesrootres, CTLFLAG_RW,
	2441	&maxfilesrootres, 0, "Descriptors reserved for root use");
	2442
	2443	SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
	2444	&nfiles, 0, "System-wide number of open files");
	2445
	2446	static void
	2447	fildesc_drvinit(void *unused)
	2448	{
	2449	int fd;
	2450
	2451	dev_ops_add(&fildesc_ops, 0, 0);
	2452	for (fd = 0; fd < NUMFDESC; fd++) {
	2453	make_dev(&fildesc_ops, fd,
	2454	UID_BIN, GID_BIN, 0666, "fd/%d", fd);
	2455	}
	2456	make_dev(&fildesc_ops, 0, UID_ROOT, GID_WHEEL, 0666, "stdin");
	2457	make_dev(&fildesc_ops, 1, UID_ROOT, GID_WHEEL, 0666, "stdout");
	2458	make_dev(&fildesc_ops, 2, UID_ROOT, GID_WHEEL, 0666, "stderr");
	2459	}
	2460
	2461	/*
	2462	* MPSAFE
	2463	*/
	2464	struct fileops badfileops = {
	2465	.fo_read = badfo_readwrite,
	2466	.fo_write = badfo_readwrite,
	2467	.fo_ioctl = badfo_ioctl,
	2468	.fo_poll = badfo_poll,
	2469	.fo_kqfilter = badfo_kqfilter,
	2470	.fo_stat = badfo_stat,
	2471	.fo_close = badfo_close,
	2472	.fo_shutdown = badfo_shutdown
	2473	};
	2474
	2475	/*
	2476	* MPSAFE
	2477	*/
	2478	static int
	2479	badfo_readwrite(
	2480	struct file *fp,
	2481	struct uio *uio,
	2482	struct ucred *cred,
	2483	int flags
	2484	) {
	2485	return (EBADF);
	2486	}
	2487
	2488	/*
	2489	* MPSAFE
	2490	*/
	2491	static int
	2492	badfo_ioctl(struct file fp, u_long com, caddr_t data, struct ucred cred)
	2493	{
	2494	return (EBADF);
	2495	}
	2496
	2497	/*
	2498	* MPSAFE
	2499	*/
	2500	static int
	2501	badfo_poll(struct file fp, int events, struct ucred cred)
	2502	{
	2503	return (0);
	2504	}
	2505
	2506	/*
	2507	* MPSAFE
	2508	*/
	2509	static int
	2510	badfo_kqfilter(struct file fp, struct knote kn)
	2511	{
	2512	return (0);
	2513	}
	2514
	2515	static int
	2516	badfo_stat(struct file fp, struct stat sb, struct ucred *cred)
	2517	{
	2518	return (EBADF);
	2519	}
	2520
	2521	/*
	2522	* MPSAFE
	2523	*/
	2524	static int
	2525	badfo_close(struct file *fp)
	2526	{
	2527	return (EBADF);
	2528	}
	2529
	2530	/*
	2531	* MPSAFE
	2532	*/
	2533	static int
	2534	badfo_shutdown(struct file *fp, int how)
	2535	{
	2536	return (EBADF);
	2537	}
	2538
	2539	/*
	2540	* MPSAFE
	2541	*/
	2542	int
	2543	nofo_shutdown(struct file *fp, int how)
	2544	{
	2545	return (EOPNOTSUPP);
	2546	}
	2547
	2548	SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,
	2549	fildesc_drvinit,NULL)