Add kernel-layer support for chflags checks, remove (most) from the VFS layer.
[dragonfly.git] / sys / kern / vfs_vnops.c
1/*
2 * Copyright (c) 1982, 1986, 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. All advertising materials mentioning features or use of this software
19 * must display the following acknowledgement:
20 * This product includes software developed by the University of
21 * California, Berkeley and its contributors.
22 * 4. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
39 * $FreeBSD: src/sys/kern/vfs_vnops.c,v 1.87.2.13 2002/12/29 18:19:53 dillon Exp $
40 * $DragonFly: src/sys/kern/vfs_vnops.c,v 1.58 2008/06/28 17:59:49 dillon Exp $
41 */
42
43#include <sys/param.h>
44#include <sys/systm.h>
45#include <sys/fcntl.h>
46#include <sys/file.h>
47#include <sys/stat.h>
48#include <sys/proc.h>
49#include <sys/priv.h>
50#include <sys/mount.h>
51#include <sys/nlookup.h>
52#include <sys/vnode.h>
53#include <sys/buf.h>
54#include <sys/filio.h>
55#include <sys/ttycom.h>
56#include <sys/conf.h>
57#include <sys/syslog.h>
58
59static int vn_closefile (struct file *fp);
60static int vn_ioctl (struct file *fp, u_long com, caddr_t data,
61 struct ucred *cred);
62static int vn_read (struct file *fp, struct uio *uio,
63 struct ucred *cred, int flags);
64static int svn_read (struct file *fp, struct uio *uio,
65 struct ucred *cred, int flags);
66static int vn_poll (struct file *fp, int events, struct ucred *cred);
67static int vn_kqfilter (struct file *fp, struct knote *kn);
68static int vn_statfile (struct file *fp, struct stat *sb, struct ucred *cred);
69static int vn_write (struct file *fp, struct uio *uio,
70 struct ucred *cred, int flags);
71static int svn_write (struct file *fp, struct uio *uio,
72 struct ucred *cred, int flags);
73
74struct fileops vnode_fileops = {
75 .fo_read = vn_read,
76 .fo_write = vn_write,
77 .fo_ioctl = vn_ioctl,
78 .fo_poll = vn_poll,
79 .fo_kqfilter = vn_kqfilter,
80 .fo_stat = vn_statfile,
81 .fo_close = vn_closefile,
82 .fo_shutdown = nofo_shutdown
83};
84
85struct fileops specvnode_fileops = {
86 .fo_read = svn_read,
87 .fo_write = svn_write,
88 .fo_ioctl = vn_ioctl,
89 .fo_poll = vn_poll,
90 .fo_kqfilter = vn_kqfilter,
91 .fo_stat = vn_statfile,
92 .fo_close = vn_closefile,
93 .fo_shutdown = nofo_shutdown
94};
95
96/*
97 * Shortcut the device read/write. This avoids a lot of vnode junk.
98 * Basically the specfs vnops for read and write take the locked vnode,
99 * unlock it (because we can't hold the vnode locked while reading or writing
100 * a device which may block indefinitely), issue the device operation, then
101 * relock the vnode before returning, plus other junk. This bypasses all
102 * of that and just does the device operation.
103 */
104void
105vn_setspecops(struct file *fp)
106{
107 if (vfs_fastdev && fp->f_ops == &vnode_fileops) {
108 fp->f_ops = &specvnode_fileops;
109 }
110}
111
112/*
113 * Common code for vnode open operations. Check permissions, and call
114 * the VOP_OPEN or VOP_NCREATE routine.
115 *
116 * The caller is responsible for setting up nd with nlookup_init() and
117 * for cleaning it up with nlookup_done(), whether we return an error
118 * or not.
119 *
120 * On success nd->nl_open_vp will hold a referenced and, if requested,
121 * locked vnode. A locked vnode is requested via NLC_LOCKVP. If fp
122 * is non-NULL the vnode will be installed in the file pointer.
123 *
124 * NOTE: The vnode is referenced just once on return whether or not it
125 * is also installed in the file pointer.
126 */
127int
128vn_open(struct nlookupdata *nd, struct file *fp, int fmode, int cmode)
129{
130 struct vnode *vp;
131 struct ucred *cred = nd->nl_cred;
132 struct vattr vat;
133 struct vattr *vap = &vat;
134 int error;
135
136 /*
137 * Lookup the path and create or obtain the vnode. After a
138 * successful lookup a locked nd->nl_nch will be returned.
139 *
140 * The result of this section should be a locked vnode.
141 *
142 * XXX with only a little work we should be able to avoid locking
143 * the vnode if FWRITE, O_CREAT, and O_TRUNC are *not* set.
144 */
145 nd->nl_flags |= NLC_OPEN;
146 if (fmode & O_APPEND)
147 nd->nl_flags |= NLC_APPEND;
148 if (fmode & O_TRUNC)
149 nd->nl_flags |= NLC_TRUNCATE;
150 if (fmode & FREAD)
151 nd->nl_flags |= NLC_READ;
152 if (fmode & FWRITE)
153 nd->nl_flags |= NLC_WRITE;
154
155 if (fmode & O_CREAT) {
156 /*
157 * CONDITIONAL CREATE FILE CASE
158 *
159 * Setting NLC_CREATE causes a negative hit to store
160 * the negative hit ncp and not return an error. Then
161 * nc_error or nc_vp may be checked to see if the ncp
162 * represents a negative hit. NLC_CREATE also requires
163 * write permission on the governing directory or EPERM
164 * is returned.
165 */
166 if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
167 nd->nl_flags |= NLC_FOLLOW;
168 nd->nl_flags |= NLC_CREATE;
169 nd->nl_flags |= NLC_REFDVP;
170 bwillinode(1);
171 error = nlookup(nd);
172 } else {
173 /*
174 * NORMAL OPEN FILE CASE
175 */
176 error = nlookup(nd);
177 }
178
179 if (error)
180 return (error);
181
182 /*
183 * split case to allow us to re-resolve and retry the ncp in case
184 * we get ESTALE.
185 */
186again:
187 if (fmode & O_CREAT) {
188 if (nd->nl_nch.ncp->nc_vp == NULL) {
189 if ((error = ncp_writechk(&nd->nl_nch)) != 0)
190 return (error);
191 VATTR_NULL(vap);
192 vap->va_type = VREG;
193 vap->va_mode = cmode;
194 if (fmode & O_EXCL)
195 vap->va_vaflags |= VA_EXCLUSIVE;
196 error = VOP_NCREATE(&nd->nl_nch, nd->nl_dvp, &vp,
197 nd->nl_cred, vap);
198 if (error)
199 return (error);
200 fmode &= ~O_TRUNC;
201 /* locked vnode is returned */
202 } else {
203 if (fmode & O_EXCL) {
204 error = EEXIST;
205 } else {
206 error = cache_vget(&nd->nl_nch, cred,
207 LK_EXCLUSIVE, &vp);
208 }
209 if (error)
210 return (error);
211 fmode &= ~O_CREAT;
212 }
213 } else {
214 error = cache_vget(&nd->nl_nch, cred, LK_EXCLUSIVE, &vp);
215 if (error)
216 return (error);
217 }
218
219 /*
220 * We have a locked vnode and ncp now. Note that the ncp will
221 * be cleaned up by the caller if nd->nl_nch is left intact.
222 */
223 if (vp->v_type == VLNK) {
224 error = EMLINK;
225 goto bad;
226 }
227 if (vp->v_type == VSOCK) {
228 error = EOPNOTSUPP;
229 goto bad;
230 }
231 if ((fmode & O_CREAT) == 0) {
232 if (fmode & (FWRITE | O_TRUNC)) {
233 if (vp->v_type == VDIR) {
234 error = EISDIR;
235 goto bad;
236 }
237 error = vn_writechk(vp, &nd->nl_nch);
238 if (error) {
239 /*
240 * Special stale handling, re-resolve the
241 * vnode.
242 */
243 if (error == ESTALE) {
244 vput(vp);
245 vp = NULL;
246 cache_setunresolved(&nd->nl_nch);
247 error = cache_resolve(&nd->nl_nch, cred);
248 if (error == 0)
249 goto again;
250 }
251 goto bad;
252 }
253 }
254 }
255 if (fmode & O_TRUNC) {
256 vn_unlock(vp); /* XXX */
257 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX */
258 VATTR_NULL(vap);
259 vap->va_size = 0;
260 error = VOP_SETATTR(vp, vap, cred);
261 if (error)
262 goto bad;
263 }
264
265 /*
266 * Setup the fp so VOP_OPEN can override it. No descriptor has been
267 * associated with the fp yet so we own it clean.
268 *
269 * f_nchandle inherits nl_nch. This used to be necessary only for
270 * directories but now we do it unconditionally so f*() ops
271 * such as fchmod() can access the actual namespace that was
272 * used to open the file.
273 */
274 if (fp) {
275 if (nd->nl_flags & NLC_APPENDONLY)
276 fmode |= FAPPENDONLY;
277 fp->f_nchandle = nd->nl_nch;
278 cache_zero(&nd->nl_nch);
279 cache_unlock(&fp->f_nchandle);
280 }
281
282 /*
283 * Get rid of nl_nch. vn_open does not return it (it returns the
284 * vnode or the file pointer). Note: we can't leave nl_nch locked
285 * through the VOP_OPEN anyway since the VOP_OPEN may block, e.g.
286 * on /dev/ttyd0
287 */
288 if (nd->nl_nch.ncp)
289 cache_put(&nd->nl_nch);
290
291 error = VOP_OPEN(vp, fmode, cred, fp);
292 if (error) {
293 /*
294 * setting f_ops to &badfileops will prevent the descriptor
295 * code from trying to close and release the vnode, since
296 * the open failed we do not want to call close.
297 */
298 if (fp) {
299 fp->f_data = NULL;
300 fp->f_ops = &badfileops;
301 }
302 goto bad;
303 }
304
305#if 0
306 /*
307 * Assert that VREG files have been setup for vmio.
308 */
309 KASSERT(vp->v_type != VREG || vp->v_object != NULL,
310 ("vn_open: regular file was not VMIO enabled!"));
311#endif
312
313 /*
314 * Return the vnode. XXX needs some cleaning up. The vnode is
315 * only returned in the fp == NULL case.
316 */
317 if (fp == NULL) {
318 nd->nl_open_vp = vp;
319 nd->nl_vp_fmode = fmode;
320 if ((nd->nl_flags & NLC_LOCKVP) == 0)
321 vn_unlock(vp);
322 } else {
323 vput(vp);
324 }
325 return (0);
326bad:
327 if (vp)
328 vput(vp);
329 return (error);
330}
331
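/*
 * Illustrative sketch (not compiled): a typical kernel caller of
 * vn_open() with fp == NULL, following the contract described above.
 * The path, open flags and error policy below are example values only;
 * the caller always owns the nlookupdata and must nlookup_done() it.
 */
#if 0
static int
vn_open_example(void)
{
	struct nlookupdata nd;
	struct vnode *vp;
	int error;

	error = nlookup_init(&nd, "/tmp/example", UIO_SYSSPACE, NLC_FOLLOW);
	if (error == 0)
		error = vn_open(&nd, NULL, FREAD, 0);
	if (error == 0) {
		/* referenced, unlocked vnode (NLC_LOCKVP was not set) */
		vp = nd.nl_open_vp;
		nd.nl_open_vp = NULL;
		/* ... use vp, e.g. via vn_rdwr() ... */
		error = vn_close(vp, FREAD);
	}
	nlookup_done(&nd);
	return (error);
}
#endif
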
332int
333vn_opendisk(const char *devname, int fmode, struct vnode **vpp)
334{
335 struct vnode *vp;
336 int error;
337
338 if (strncmp(devname, "/dev/", 5) == 0)
339 devname += 5;
340 if ((vp = getsynthvnode(devname)) == NULL) {
341 error = ENODEV;
342 } else {
343 error = VOP_OPEN(vp, fmode, proc0.p_ucred, NULL);
344 vn_unlock(vp);
345 if (error) {
346 vrele(vp);
347 vp = NULL;
348 }
349 }
350 *vpp = vp;
351 return (error);
352}
353
354/*
355 * Check for write permissions on the specified vnode. nch may be NULL.
356 */
357int
358vn_writechk(struct vnode *vp, struct nchandle *nch)
359{
360 /*
361 * If there's shared text associated with
362 * the vnode, try to free it up once. If
363 * we fail, we can't allow writing.
364 */
365 if (vp->v_flag & VTEXT)
366 return (ETXTBSY);
367
368 /*
369 * If the vnode represents a regular file, check the mount
370 * point via the nch. This may be a different mount point
371 * then the one embedded in the vnode (e.g. nullfs).
372 *
373 * We can still write to non-regular files (e.g. devices)
374 * via read-only mounts.
375 */
376 if (nch && nch->ncp && vp->v_type == VREG)
377 return (ncp_writechk(nch));
378 return (0);
379}
380
381/*
382 * Check whether the underlying mount is read-only. The mount point
383 * referenced by the namecache may be different from the mount point
384 * used by the underlying vnode in the case of NULLFS, so a separate
385 * check is needed.
386 */
387int
388ncp_writechk(struct nchandle *nch)
389{
390 if (nch->mount && (nch->mount->mnt_flag & MNT_RDONLY))
391 return (EROFS);
392 return(0);
393}
394
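/*
 * For example, with a nullfs mount of /usr exported read-only at
 * /ro-usr, a write attempted through a /ro-usr namecache handle must
 * fail with EROFS even though the underlying /usr vnode belongs to a
 * read-write mount, which is why this check uses nch->mount rather
 * than vp->v_mount.
 */
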
395/*
396 * Vnode close call
397 */
398int
399vn_close(struct vnode *vp, int flags)
400{
401 int error;
402
403 error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
404 if (error == 0) {
405 error = VOP_CLOSE(vp, flags);
406 vn_unlock(vp);
407 }
408 vrele(vp);
409 return (error);
410}
411
412static __inline
413int
414sequential_heuristic(struct uio *uio, struct file *fp)
415{
416 /*
417 * Sequential heuristic - detect sequential operation
418 */
419 if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
420 uio->uio_offset == fp->f_nextoff) {
421 int tmpseq = fp->f_seqcount;
422 /*
423 * XXX we assume that the filesystem block size is
424 * the default. Not true, but still gives us a pretty
425 * good indicator of how sequential the read operations
426 * are.
427 */
428 tmpseq += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
429 if (tmpseq > IO_SEQMAX)
430 tmpseq = IO_SEQMAX;
431 fp->f_seqcount = tmpseq;
432 return(fp->f_seqcount << IO_SEQSHIFT);
433 }
434
435 /*
436 * Not sequential, quick draw-down of seqcount
437 */
438 if (fp->f_seqcount > 1)
439 fp->f_seqcount = 1;
440 else
441 fp->f_seqcount = 0;
442 return(0);
443}
444
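/*
 * Worked example of the heuristic above: a process issuing 64KB reads
 * at sequentially increasing offsets adds 64KB / BKVASIZE to
 * f_seqcount on each call (4 per call if BKVASIZE is 16KB), so the
 * count saturates at IO_SEQMAX after a handful of reads and the value
 * returned here, shifted by IO_SEQSHIFT, becomes a progressively
 * stronger read-ahead hint for the filesystem.
 */
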
445/*
446 * Package up an I/O request on a vnode into a uio and do it.
447 */
448int
449vn_rdwr(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
450 off_t offset, enum uio_seg segflg, int ioflg,
451 struct ucred *cred, int *aresid)
452{
453 struct uio auio;
454 struct iovec aiov;
455 struct ccms_lock ccms_lock;
456 int error;
457
458 if ((ioflg & IO_NODELOCKED) == 0)
459 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
460 auio.uio_iov = &aiov;
461 auio.uio_iovcnt = 1;
462 aiov.iov_base = base;
463 aiov.iov_len = len;
464 auio.uio_resid = len;
465 auio.uio_offset = offset;
466 auio.uio_segflg = segflg;
467 auio.uio_rw = rw;
468 auio.uio_td = curthread;
469 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, &auio);
470 if (rw == UIO_READ) {
471 error = VOP_READ(vp, &auio, ioflg, cred);
472 } else {
473 error = VOP_WRITE(vp, &auio, ioflg, cred);
474 }
475 ccms_lock_put(&vp->v_ccms, &ccms_lock);
476 if (aresid)
477 *aresid = auio.uio_resid;
478 else
479 if (auio.uio_resid && error == 0)
480 error = EIO;
481 if ((ioflg & IO_NODELOCKED) == 0)
482 vn_unlock(vp);
483 return (error);
484}
485
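/*
 * Illustrative sketch (not compiled): reading a 512-byte header from a
 * vnode into a kernel buffer with vn_rdwr().  The vnode is assumed to
 * be referenced but unlocked; since IO_NODELOCKED is not passed,
 * vn_rdwr() locks and unlocks it around the VOP_READ.  The buffer
 * size, offset and short-read policy are example values only.
 */
#if 0
static int
read_header_example(struct vnode *vp, struct ucred *cred)
{
	char buf[512];
	int resid;
	int error;

	error = vn_rdwr(UIO_READ, vp, buf, sizeof(buf), (off_t)0,
			UIO_SYSSPACE, 0, cred, &resid);
	if (error == 0 && resid != 0)
		error = EINVAL;		/* example policy: demand a full read */
	return (error);
}
#endif
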
486/*
487 * Package up an I/O request on a vnode into a uio and do it. The I/O
488 * request is split up into smaller chunks and we try to avoid saturating
489 * the buffer cache while potentially holding a vnode locked, so we
490 * call bwillread()/bwillwrite() before each vn_rdwr() chunk. We also call uio_yield()
491 * to give other processes a chance to lock the vnode (either other processes
492 * core'ing the same binary, or unrelated processes scanning the directory).
493 */
494int
495vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len,
496 off_t offset, enum uio_seg segflg, int ioflg,
497 struct ucred *cred, int *aresid)
498{
499 int error = 0;
500
501 do {
502 int chunk;
503
504 /*
505 * Force `offset' to a multiple of MAXBSIZE except possibly
506 * for the first chunk, so that filesystems only need to
507 * write full blocks except possibly for the first and last
508 * chunks.
509 */
510 chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
511
512 if (chunk > len)
513 chunk = len;
514 if (vp->v_type == VREG) {
515 switch(rw) {
516 case UIO_READ:
517 bwillread(chunk);
518 break;
519 case UIO_WRITE:
520 bwillwrite(chunk);
521 break;
522 }
523 }
524 error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
525 ioflg, cred, aresid);
526 len -= chunk; /* aresid calc already includes length */
527 if (error)
528 break;
529 offset += chunk;
530 base += chunk;
531 uio_yield();
532 } while (len);
533 if (aresid)
534 *aresid += len;
535 return (error);
536}
537
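/*
 * Example of the chunking above: if MAXBSIZE is 64KB and the transfer
 * starts at offset 1000, the first chunk is 64536 bytes (64KB - 1000),
 * so every subsequent chunk begins on a MAXBSIZE boundary and only the
 * first and last chunks can be partial blocks.
 */
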
538/*
539 * MPALMOSTSAFE - acquires mplock
540 */
541static int
542vn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
543{
544 struct ccms_lock ccms_lock;
545 struct vnode *vp;
546 int error, ioflag;
547
548 get_mplock();
549 KASSERT(uio->uio_td == curthread,
550 ("uio_td %p is not td %p", uio->uio_td, curthread));
551 vp = (struct vnode *)fp->f_data;
552
553 ioflag = 0;
554 if (flags & O_FBLOCKING) {
555 /* ioflag &= ~IO_NDELAY; */
556 } else if (flags & O_FNONBLOCKING) {
557 ioflag |= IO_NDELAY;
558 } else if (fp->f_flag & FNONBLOCK) {
559 ioflag |= IO_NDELAY;
560 }
561 if (flags & O_FBUFFERED) {
562 /* ioflag &= ~IO_DIRECT; */
563 } else if (flags & O_FUNBUFFERED) {
564 ioflag |= IO_DIRECT;
565 } else if (fp->f_flag & O_DIRECT) {
566 ioflag |= IO_DIRECT;
567 }
568 vn_lock(vp, LK_SHARED | LK_RETRY);
569 if ((flags & O_FOFFSET) == 0)
570 uio->uio_offset = fp->f_offset;
571 ioflag |= sequential_heuristic(uio, fp);
572
573 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
574 error = VOP_READ(vp, uio, ioflag, cred);
575 ccms_lock_put(&vp->v_ccms, &ccms_lock);
576 if ((flags & O_FOFFSET) == 0)
577 fp->f_offset = uio->uio_offset;
578 fp->f_nextoff = uio->uio_offset;
579 vn_unlock(vp);
580 rel_mplock();
581 return (error);
582}
583
584/*
585 * Device-optimized file table vnode read routine.
586 *
587 * This bypasses the VOP table and talks directly to the device. Most
588 * filesystems just route to specfs and can make this optimization.
589 *
590 * MPALMOSTSAFE - acquires mplock
591 */
592static int
593svn_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
594{
595 struct vnode *vp;
596 int ioflag;
597 int error;
598 cdev_t dev;
599
600 get_mplock();
601 KASSERT(uio->uio_td == curthread,
602 ("uio_td %p is not td %p", uio->uio_td, curthread));
603
604 vp = (struct vnode *)fp->f_data;
605 if (vp == NULL || vp->v_type == VBAD) {
606 error = EBADF;
607 goto done;
608 }
609
610 if ((dev = vp->v_rdev) == NULL) {
611 error = EBADF;
612 goto done;
613 }
614 reference_dev(dev);
615
616 if (uio->uio_resid == 0) {
617 error = 0;
618 goto done;
619 }
620 if ((flags & O_FOFFSET) == 0)
621 uio->uio_offset = fp->f_offset;
622
623 ioflag = 0;
624 if (flags & O_FBLOCKING) {
625 /* ioflag &= ~IO_NDELAY; */
626 } else if (flags & O_FNONBLOCKING) {
627 ioflag |= IO_NDELAY;
628 } else if (fp->f_flag & FNONBLOCK) {
629 ioflag |= IO_NDELAY;
630 }
631 if (flags & O_FBUFFERED) {
632 /* ioflag &= ~IO_DIRECT; */
633 } else if (flags & O_FUNBUFFERED) {
634 ioflag |= IO_DIRECT;
635 } else if (fp->f_flag & O_DIRECT) {
636 ioflag |= IO_DIRECT;
637 }
638 ioflag |= sequential_heuristic(uio, fp);
639
640 error = dev_dread(dev, uio, ioflag);
641
642 release_dev(dev);
643 if ((flags & O_FOFFSET) == 0)
644 fp->f_offset = uio->uio_offset;
645 fp->f_nextoff = uio->uio_offset;
646done:
647 rel_mplock();
648 return (error);
649}
650
651/*
652 * MPALMOSTSAFE - acquires mplock
653 */
654static int
655vn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
656{
657 struct ccms_lock ccms_lock;
658 struct vnode *vp;
659 int error, ioflag;
660
661 get_mplock();
662 KASSERT(uio->uio_td == curthread,
663 ("uio_td %p is not p %p", uio->uio_td, curthread));
664 vp = (struct vnode *)fp->f_data;
665#if 0
666 /* VOP_WRITE should handle this now */
667 if (vp->v_type == VREG || vp->v_type == VDATABASE)
668 bwillwrite();
669#endif
670 vp = (struct vnode *)fp->f_data; /* XXX needed? */
671
672 ioflag = IO_UNIT;
673 if (vp->v_type == VREG &&
674 ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
675 ioflag |= IO_APPEND;
676 }
677
678 if (flags & O_FBLOCKING) {
679 /* ioflag &= ~IO_NDELAY; */
680 } else if (flags & O_FNONBLOCKING) {
681 ioflag |= IO_NDELAY;
682 } else if (fp->f_flag & FNONBLOCK) {
683 ioflag |= IO_NDELAY;
684 }
685 if (flags & O_FBUFFERED) {
686 /* ioflag &= ~IO_DIRECT; */
687 } else if (flags & O_FUNBUFFERED) {
688 ioflag |= IO_DIRECT;
689 } else if (fp->f_flag & O_DIRECT) {
690 ioflag |= IO_DIRECT;
691 }
692 if (flags & O_FASYNCWRITE) {
693 /* ioflag &= ~IO_SYNC; */
694 } else if (flags & O_FSYNCWRITE) {
695 ioflag |= IO_SYNC;
696 } else if (fp->f_flag & O_FSYNC) {
697 ioflag |= IO_SYNC;
698 }
699
700 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
701 ioflag |= IO_SYNC;
702 vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
703 if ((flags & O_FOFFSET) == 0)
704 uio->uio_offset = fp->f_offset;
705 ioflag |= sequential_heuristic(uio, fp);
706 ccms_lock_get_uio(&vp->v_ccms, &ccms_lock, uio);
707 error = VOP_WRITE(vp, uio, ioflag, cred);
708 ccms_lock_put(&vp->v_ccms, &ccms_lock);
709 if ((flags & O_FOFFSET) == 0)
710 fp->f_offset = uio->uio_offset;
711 fp->f_nextoff = uio->uio_offset;
712 vn_unlock(vp);
713 rel_mplock();
714 return (error);
715}
716
717/*
718 * Device-optimized file table vnode write routine.
719 *
720 * This bypasses the VOP table and talks directly to the device. Most
721 * filesystems just route to specfs and can make this optimization.
722 *
723 * MPALMOSTSAFE - acquires mplock
724 */
725static int
726svn_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags)
727{
728 struct vnode *vp;
729 int ioflag;
730 int error;
731 cdev_t dev;
732
733 get_mplock();
734 KASSERT(uio->uio_td == curthread,
735 ("uio_td %p is not p %p", uio->uio_td, curthread));
736
737 vp = (struct vnode *)fp->f_data;
738 if (vp == NULL || vp->v_type == VBAD) {
739 error = EBADF;
740 goto done;
741 }
742 if (vp->v_type == VREG)
743 bwillwrite(uio->uio_resid);
744 vp = (struct vnode *)fp->f_data; /* XXX needed? */
745
746 if ((dev = vp->v_rdev) == NULL) {
747 error = EBADF;
748 goto done;
749 }
750 reference_dev(dev);
751
752 if ((flags & O_FOFFSET) == 0)
753 uio->uio_offset = fp->f_offset;
754
755 ioflag = IO_UNIT;
756 if (vp->v_type == VREG &&
757 ((fp->f_flag & O_APPEND) || (flags & O_FAPPEND))) {
758 ioflag |= IO_APPEND;
759 }
760
761 if (flags & O_FBLOCKING) {
762 /* ioflag &= ~IO_NDELAY; */
763 } else if (flags & O_FNONBLOCKING) {
764 ioflag |= IO_NDELAY;
765 } else if (fp->f_flag & FNONBLOCK) {
766 ioflag |= IO_NDELAY;
767 }
768 if (flags & O_FBUFFERED) {
769 /* ioflag &= ~IO_DIRECT; */
770 } else if (flags & O_FUNBUFFERED) {
771 ioflag |= IO_DIRECT;
772 } else if (fp->f_flag & O_DIRECT) {
773 ioflag |= IO_DIRECT;
774 }
775 if (flags & O_FASYNCWRITE) {
776 /* ioflag &= ~IO_SYNC; */
777 } else if (flags & O_FSYNCWRITE) {
778 ioflag |= IO_SYNC;
779 } else if (fp->f_flag & O_FSYNC) {
780 ioflag |= IO_SYNC;
781 }
782
783 if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))
784 ioflag |= IO_SYNC;
785 ioflag |= sequential_heuristic(uio, fp);
786
787 error = dev_dwrite(dev, uio, ioflag);
788
789 release_dev(dev);
790 if ((flags & O_FOFFSET) == 0)
791 fp->f_offset = uio->uio_offset;
792 fp->f_nextoff = uio->uio_offset;
793done:
794 rel_mplock();
795 return (error);
796}
797
798/*
799 * MPALMOSTSAFE - acquires mplock
800 */
801static int
802vn_statfile(struct file *fp, struct stat *sb, struct ucred *cred)
803{
804 struct vnode *vp;
805 int error;
806
807 get_mplock();
808 vp = (struct vnode *)fp->f_data;
809 error = vn_stat(vp, sb, cred);
810 rel_mplock();
811 return (error);
812}
813
814int
815vn_stat(struct vnode *vp, struct stat *sb, struct ucred *cred)
816{
817 struct vattr vattr;
818 struct vattr *vap;
819 int error;
820 u_short mode;
821 cdev_t dev;
822
823 vap = &vattr;
824 error = VOP_GETATTR(vp, vap);
825 if (error)
826 return (error);
827
828 /*
829 * Zero the spare stat fields
830 */
831 sb->st_lspare = 0;
832 sb->st_qspare = 0;
833
834 /*
835 * Copy from vattr table
836 */
837 if (vap->va_fsid != VNOVAL)
838 sb->st_dev = vap->va_fsid;
839 else
840 sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
841 sb->st_ino = vap->va_fileid;
842 mode = vap->va_mode;
843 switch (vap->va_type) {
844 case VREG:
845 mode |= S_IFREG;
846 break;
847 case VDATABASE:
848 mode |= S_IFDB;
849 break;
850 case VDIR:
851 mode |= S_IFDIR;
852 break;
853 case VBLK:
854 mode |= S_IFBLK;
855 break;
856 case VCHR:
857 mode |= S_IFCHR;
858 break;
859 case VLNK:
860 mode |= S_IFLNK;
861 /* This is a cosmetic change, symlinks do not have a mode. */
862 if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW)
863 sb->st_mode &= ~ACCESSPERMS; /* 0000 */
864 else
865 sb->st_mode |= ACCESSPERMS; /* 0777 */
866 break;
867 case VSOCK:
868 mode |= S_IFSOCK;
869 break;
870 case VFIFO:
871 mode |= S_IFIFO;
872 break;
873 default:
874 return (EBADF);
875 }
876 sb->st_mode = mode;
877 if (vap->va_nlink > (nlink_t)-1)
878 sb->st_nlink = (nlink_t)-1;
879 else
880 sb->st_nlink = vap->va_nlink;
881 sb->st_uid = vap->va_uid;
882 sb->st_gid = vap->va_gid;
883 sb->st_rdev = makeudev(vap->va_rmajor, vap->va_rminor);
884 sb->st_size = vap->va_size;
885 sb->st_atimespec = vap->va_atime;
886 sb->st_mtimespec = vap->va_mtime;
887 sb->st_ctimespec = vap->va_ctime;
888
889 /*
890 * A VCHR and VBLK device may track the last access and last modified
891 * time independently of the filesystem. This is particularly true
892 * because device read and write calls may bypass the filesystem.
893 */
894 if (vp->v_type == VCHR || vp->v_type == VBLK) {
895 dev = vp->v_rdev;
896 if (dev != NULL) {
897 if (dev->si_lastread) {
898 sb->st_atimespec.tv_sec = dev->si_lastread;
899 sb->st_atimespec.tv_nsec = 0;
900 }
901 if (dev->si_lastwrite) {
902 sb->st_mtimespec.tv_sec = dev->si_lastwrite;
903 sb->st_mtimespec.tv_nsec = 0;
904 }
905 }
906 }
907
908 /*
909 * According to www.opengroup.org, the meaning of st_blksize is
910 * "a filesystem-specific preferred I/O block size for this
911 * object. In some filesystem types, this may vary from file
912 * to file"
913 * Default to PAGE_SIZE after much discussion.
914 */
915
916 if (vap->va_type == VREG) {
917 sb->st_blksize = vap->va_blocksize;
918 } else if (vn_isdisk(vp, NULL)) {
919 /*
920 * XXX this is broken. If the device is not yet open (aka
921 * stat() call, aka v_rdev == NULL), how are we supposed
922 * to get a valid block size out of it?
923 */
924 dev = vp->v_rdev;
925 if (dev == NULL && vp->v_type == VCHR) {
926 dev = get_dev(vp->v_umajor, vp->v_uminor);
927 }
928 sb->st_blksize = dev->si_bsize_best;
929 if (sb->st_blksize < dev->si_bsize_phys)
930 sb->st_blksize = dev->si_bsize_phys;
931 if (sb->st_blksize < BLKDEV_IOSIZE)
932 sb->st_blksize = BLKDEV_IOSIZE;
933 } else {
934 sb->st_blksize = PAGE_SIZE;
935 }
936
937 sb->st_flags = vap->va_flags;
938
939 error = priv_check_cred(cred, PRIV_VFS_GENERATION, 0);
940 if (error)
941 sb->st_gen = 0;
942 else
943 sb->st_gen = (u_int32_t)vap->va_gen;
944
945 sb->st_blocks = vap->va_bytes / S_BLKSIZE;
946 sb->st_fsmid = vap->va_fsmid;
947 return (0);
948}
949
950/*
951 * MPALMOSTSAFE - acquires mplock
952 */
953static int
954vn_ioctl(struct file *fp, u_long com, caddr_t data, struct ucred *ucred)
955{
956 struct vnode *vp = ((struct vnode *)fp->f_data);
957 struct vnode *ovp;
958 struct vattr vattr;
959 int error;
960
961 get_mplock();
962
963 switch (vp->v_type) {
964 case VREG:
965 case VDIR:
966 if (com == FIONREAD) {
967 error = VOP_GETATTR(vp, &vattr);
968 if (error)
969 break;
970 *(int *)data = vattr.va_size - fp->f_offset;
971 error = 0;
972 break;
973 }
974 if (com == FIOASYNC) { /* XXX */
975 error = 0; /* XXX */
976 break;
977 }
978 /* fall into ... */
979 default:
980#if 0
981 return (ENOTTY);
982#endif
983 case VFIFO:
984 case VCHR:
985 case VBLK:
986 if (com == FIODTYPE) {
987 if (vp->v_type != VCHR && vp->v_type != VBLK) {
988 error = ENOTTY;
989 break;
990 }
991 *(int *)data = dev_dflags(vp->v_rdev) & D_TYPEMASK;
992 error = 0;
993 break;
994 }
995 error = VOP_IOCTL(vp, com, data, fp->f_flag, ucred);
996 if (error == 0 && com == TIOCSCTTY) {
997 struct proc *p = curthread->td_proc;
998 struct session *sess;
999
1000 if (p == NULL) {
1001 error = ENOTTY;
1002 break;
1003 }
1004
1005 sess = p->p_session;
1006 /* Do nothing if reassigning same control tty */
1007 if (sess->s_ttyvp == vp) {
1008 error = 0;
1009 break;
1010 }
1011
1012 /* Get rid of reference to old control tty */
1013 ovp = sess->s_ttyvp;
1014 vref(vp);
1015 sess->s_ttyvp = vp;
1016 if (ovp)
1017 vrele(ovp);
1018 }
1019 break;
1020 }
1021 rel_mplock();
1022 return (error);
1023}
1024
1025/*
1026 * MPALMOSTSAFE - acquires mplock
1027 */
1028static int
1029vn_poll(struct file *fp, int events, struct ucred *cred)
1030{
1031 int error;
1032
1033 get_mplock();
1034 error = VOP_POLL(((struct vnode *)fp->f_data), events, cred);
1035 rel_mplock();
1036 return (error);
1037}
1038
1039/*
1040 * Check that the vnode is still valid, and if so
1041 * acquire requested lock.
1042 */
1043int
1044#ifndef DEBUG_LOCKS
1045vn_lock(struct vnode *vp, int flags)
1046#else
1047debug_vn_lock(struct vnode *vp, int flags, const char *filename, int line)
1048#endif
1049{
1050 int error;
1051
1052 do {
1053#ifdef DEBUG_LOCKS
1054 vp->filename = filename;
1055 vp->line = line;
1056 error = debuglockmgr(&vp->v_lock, flags,
1057 "vn_lock", filename, line);
1058#else
1059 error = lockmgr(&vp->v_lock, flags);
1060#endif
1061 if (error == 0)
1062 break;
1063 } while (flags & LK_RETRY);
1064
1065 /*
1066 * Because we (had better!) have a ref on the vnode, once it
1067 * goes to VRECLAIMED state it will not be recycled until all
1068 * refs go away. So we can just check the flag.
1069 */
1070 if (error == 0 && (vp->v_flag & VRECLAIMED)) {
1071 lockmgr(&vp->v_lock, LK_RELEASE);
1072 error = ENOENT;
1073 }
1074 return (error);
1075}
1076
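/*
 * Illustrative sketch (not compiled): the usual pattern for locking a
 * vnode with vn_lock().  The caller must already hold a reference;
 * with LK_RETRY the only error is ENOENT, meaning the vnode was
 * reclaimed while we waited for the lock.
 */
#if 0
static void
vnode_lock_example(struct vnode *vp)
{
	vref(vp);
	if (vn_lock(vp, LK_EXCLUSIVE | LK_RETRY) == 0) {
		/* vp is locked and was not reclaimed; operate on it here */
		vn_unlock(vp);
	}
	vrele(vp);
}
#endif
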
1077void
1078vn_unlock(struct vnode *vp)
1079{
1080 lockmgr(&vp->v_lock, LK_RELEASE);
1081}
1082
1083int
1084vn_islocked(struct vnode *vp)
1085{
1086 return (lockstatus(&vp->v_lock, curthread));
1087}
1088
1089/*
1090 * MPALMOSTSAFE - acquires mplock
1091 */
1092static int
1093vn_closefile(struct file *fp)
1094{
1095 int error;
1096
1097 get_mplock();
1098 fp->f_ops = &badfileops;
1099 error = vn_close(((struct vnode *)fp->f_data), fp->f_flag);
1100 rel_mplock();
1101 return (error);
1102}
1103
1104/*
1105 * MPALMOSTSAFE - acquires mplock
1106 */
1107static int
1108vn_kqfilter(struct file *fp, struct knote *kn)
1109{
1110 int error;
1111
1112 get_mplock();
1113 error = VOP_KQFILTER(((struct vnode *)fp->f_data), kn);
1114 rel_mplock();
1115 return (error);
1116}