gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*-
	2	* Copyright (c) 1993
	3	* The Regents of the University of California. All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	* 1. Redistributions of source code must retain the above copyright
	9	* notice, this list of conditions and the following disclaimer.
	10	* 2. Redistributions in binary form must reproduce the above copyright
	11	* notice, this list of conditions and the following disclaimer in the
	12	* documentation and/or other materials provided with the distribution.
	13	* 3. All advertising materials mentioning features or use of this software
	14	* must display the following acknowledgement:
	15	* This product includes software developed by the University of
	16	* California, Berkeley and its contributors.
	17	* 4. Neither the name of the University nor the names of its contributors
	18	* may be used to endorse or promote products derived from this software
	19	* without specific prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	22	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	23	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	24	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	25	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	26	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	27	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	28	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	29	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	30	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	31	* SUCH DAMAGE.
	32	*
	33	* @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
	34	* $FreeBSD: src/sys/ufs/ufs/ufs_readwrite.c,v 1.65.2.14 2003/04/04 22:21:29 tegge Exp $
	35	* $DragonFly: src/sys/vfs/ufs/ufs_readwrite.c,v 1.26 2008/06/19 23:27:39 dillon Exp $
	36	*/
	37
	38	#define BLKSIZE(a, b, c) blksize(a, b, c)
	39	#define FS struct fs
	40	#define I_FS i_fs
	41
	42	#include <vm/vm.h>
	43	#include <vm/vm_object.h>
	44	#include <vm/vm_pager.h>
	45	#include <vm/vm_map.h>
	46	#include <vm/vnode_pager.h>
	47	#include <sys/event.h>
	48	#include <sys/vmmeter.h>
	49	#include <sys/sysctl.h>
	50	#include <vm/vm_page2.h>
	51
	52	#include "opt_directio.h"
	53
	54	#define VN_KNOTE(vp, b) \
	55	KNOTE((struct klist *)&vp->v_pollinfo.vpi_kqinfo.ki_note, (b))
	56
	57	#ifdef DIRECTIO
	58	extern int ffs_rawread(struct vnode vp, struct uio uio, int *workdone);
	59	#endif
	60
	61	SYSCTL_DECL(_vfs_ffs);
	62
	63	/*
	64	* Vnode op for reading.
	65	*
	66	* ffs_read(struct vnode a_vp, struct uio a_uio, int a_ioflag,
	67	* struct ucred *a_cred)
	68	*/
	69	/* ARGSUSED */
	70	int
	71	ffs_read(struct vop_read_args *ap)
	72	{
	73	struct vnode *vp;
	74	struct inode *ip;
	75	struct uio *uio;
	76	FS *fs;
	77	struct buf *bp;
	78	off_t bytesinfile;
	79	int xfersize, blkoffset;
	80	int error, orig_resid;
	81	u_short mode;
	82	int seqcount;
	83	int ioflag;
	84
	85	vp = ap->a_vp;
	86	seqcount = ap->a_ioflag >> 16;
	87	ip = VTOI(vp);
	88	mode = ip->i_mode;
	89	uio = ap->a_uio;
	90	ioflag = ap->a_ioflag;
	91	#ifdef DIRECTIO
	92	if ((ioflag & IO_DIRECT) != 0) {
	93	int workdone;
	94
	95	error = ffs_rawread(vp, uio, &workdone);
	96	if (error \|\| workdone)
	97	return error;
	98	}
	99	#endif
	100
	101	#ifdef DIAGNOSTIC
	102	if (uio->uio_rw != UIO_READ)
	103	panic("ffs_read: mode");
	104
	105	if (vp->v_type == VLNK) {
	106	if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
	107	panic("ffs_read: short symlink");
	108	} else if (vp->v_type != VREG && vp->v_type != VDIR)
	109	panic("ffs_read: type %d", vp->v_type);
	110	#endif
	111	fs = ip->I_FS;
	112	if ((uint64_t)uio->uio_offset > fs->fs_maxfilesize)
	113	return (EFBIG);
	114
	115	orig_resid = uio->uio_resid;
	116	if (orig_resid <= 0)
	117	return (0);
	118
	119	bytesinfile = ip->i_size - uio->uio_offset;
	120	if (bytesinfile <= 0) {
	121	if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
	122	ip->i_flag \|= IN_ACCESS;
	123	return 0;
	124	}
	125
	126	/*
	127	* Ok so we couldn't do it all in one vm trick...
	128	* so cycle around trying smaller bites..
	129	*/
	130	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
	131	if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
	132	break;
	133
	134	error = ffs_blkatoff_ra(vp, uio->uio_offset, NULL,
	135	&bp, seqcount);
	136	if (error)
	137	break;
	138
	139	/*
	140	* If IO_DIRECT then set B_DIRECT for the buffer. This
	141	* will cause us to attempt to release the buffer later on
	142	* and will cause the buffer cache to attempt to free the
	143	* underlying pages.
	144	*/
	145	if (ioflag & IO_DIRECT)
	146	bp->b_flags \|= B_DIRECT;
	147
	148	/*
	149	* We should only get non-zero b_resid when an I/O error
	150	* has occurred, which should cause us to break above.
	151	* However, if the short read did not cause an error,
	152	* then we want to ensure that we do not uiomove bad
	153	* or uninitialized data.
	154	*
	155	* XXX b_resid is only valid when an actual I/O has occured
	156	* and may be incorrect if the buffer is B_CACHE or if the
	157	* last op on the buffer was a failed write. This KASSERT
	158	* is a precursor to removing it from the UFS code.
	159	*/
	160	KASSERT(bp->b_resid == 0, ("bp->b_resid != 0"));
	161
	162	/*
	163	* Calculate how much data we can copy
	164	*/
	165	blkoffset = blkoff(fs, uio->uio_offset);
	166	xfersize = bp->b_bufsize - blkoffset;
	167	if (xfersize > uio->uio_resid)
	168	xfersize = uio->uio_resid;
	169	if (xfersize > bytesinfile)
	170	xfersize = bytesinfile;
	171	if (xfersize <= 0) {
	172	panic("ufs_readwrite: impossible xfersize: %d",
	173	xfersize);
	174	}
	175
	176	/*
	177	* otherwise use the general form
	178	*/
	179	error = uiomovebp(bp, bp->b_data + blkoffset, xfersize, uio);
	180
	181	if (error)
	182	break;
	183
	184	if ((ioflag & (IO_VMIO\|IO_DIRECT)) &&
	185	(LIST_FIRST(&bp->b_dep) == NULL)) {
	186	/*
	187	* If there are no dependencies, and it's VMIO,
	188	* then we don't need the buf, mark it available
	189	* for freeing. The VM has the data.
	190	*/
	191	bp->b_flags \|= B_RELBUF;
	192	brelse(bp);
	193	} else {
	194	/*
	195	* Otherwise let whoever
	196	* made the request take care of
	197	* freeing it. We just queue
	198	* it onto another list.
	199	*/
	200	bqrelse(bp);
	201	}
	202	}
	203
	204	/*
	205	* This can only happen in the case of an error
	206	* because the loop above resets bp to NULL on each iteration
	207	* and on normal completion has not set a new value into it.
	208	* so it must have come from a 'break' statement
	209	*/
	210	if (bp != NULL) {
	211	if ((ioflag & (IO_VMIO\|IO_DIRECT)) &&
	212	(LIST_FIRST(&bp->b_dep) == NULL)) {
	213	bp->b_flags \|= B_RELBUF;
	214	brelse(bp);
	215	} else {
	216	bqrelse(bp);
	217	}
	218	}
	219
	220	if ((error == 0 \|\| uio->uio_resid != orig_resid) &&
	221	(vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
	222	ip->i_flag \|= IN_ACCESS;
	223	return (error);
	224	}
	225
	226	/*
	227	* Vnode op for writing.
	228	*
	229	* ffs_write(struct vnode a_vp, struct uio a_uio, int a_ioflag,
	230	* struct ucred *a_cred)
	231	*/
	232	int
	233	ffs_write(struct vop_write_args *ap)
	234	{
	235	struct vnode *vp;
	236	struct uio *uio;
	237	struct inode *ip;
	238	FS *fs;
	239	struct buf *bp;
	240	ufs_daddr_t lbn;
	241	off_t osize;
	242	off_t nsize;
	243	int seqcount;
	244	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
	245	struct thread *td;
	246
	247	extended = 0;
	248	seqcount = ap->a_ioflag >> 16;
	249	ioflag = ap->a_ioflag;
	250	uio = ap->a_uio;
	251	vp = ap->a_vp;
	252	ip = VTOI(vp);
	253
	254	#ifdef DIAGNOSTIC
	255	if (uio->uio_rw != UIO_WRITE)
	256	panic("ffs_write: mode");
	257	#endif
	258
	259	switch (vp->v_type) {
	260	case VREG:
	261	if (ioflag & IO_APPEND)
	262	uio->uio_offset = ip->i_size;
	263	if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
	264	return (EPERM);
	265	/* FALLTHROUGH */
	266	case VLNK:
	267	break;
	268	case VDIR:
	269	panic("ffs_write: dir write");
	270	break;
	271	default:
	272	panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
	273	(int)uio->uio_offset,
	274	(int)uio->uio_resid
	275	);
	276	}
	277
	278	fs = ip->I_FS;
	279	if (uio->uio_offset < 0 \|\|
	280	(uint64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
	281	return (EFBIG);
	282	}
	283	/*
	284	* Maybe this should be above the vnode op call, but so long as
	285	* file servers have no limits, I don't think it matters.
	286	*/
	287	td = uio->uio_td;
	288	if (vp->v_type == VREG && td && td->td_proc &&
	289	uio->uio_offset + uio->uio_resid >
	290	td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
	291	lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
	292	return (EFBIG);
	293	}
	294
	295	resid = uio->uio_resid;
	296	osize = ip->i_size;
	297
	298	/*
	299	* NOTE! These B_ flags are actually balloc-only flags, not buffer
	300	* flags. They are similar to the BA_ flags in fbsd.
	301	*/
	302	if (seqcount > B_SEQMAX)
	303	flags = B_SEQMAX << B_SEQSHIFT;
	304	else
	305	flags = seqcount << B_SEQSHIFT;
	306	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
	307	flags \|= B_SYNC;
	308
	309	for (error = 0; uio->uio_resid > 0;) {
	310	lbn = lblkno(fs, uio->uio_offset);
	311	blkoffset = blkoff(fs, uio->uio_offset);
	312	xfersize = fs->fs_bsize - blkoffset;
	313	if (uio->uio_resid < xfersize)
	314	xfersize = uio->uio_resid;
	315
	316	if (uio->uio_offset + xfersize > ip->i_size) {
	317	nsize = uio->uio_offset + xfersize;
	318	nvnode_pager_setsize(vp, nsize,
	319	blkoffresize(fs, nsize), blkoff(fs, nsize));
	320	}
	321
	322	#if 0
	323	/*
	324	* If doing a dummy write to flush the buffer for a
	325	* putpages we must perform a read-before-write to
	326	* fill in any missing spots and clear any invalid
	327	* areas. Otherwise a multi-page buffer may not properly
	328	* flush.
	329	*
	330	* We must clear any invalid areas
	331	*/
	332	if (uio->uio_segflg == UIO_NOCOPY) {
	333	error = ffs_blkatoff(vp, uio->uio_offset, NULL, &bp);
	334	if (error)
	335	break;
	336	bqrelse(bp);
	337	}
	338	#endif
	339
	340	/*
	341	* We must clear invalid areas.
	342	*/
	343	if (xfersize < fs->fs_bsize \|\| uio->uio_segflg == UIO_NOCOPY)
	344	flags \|= B_CLRBUF;
	345	else
	346	flags &= ~B_CLRBUF;
	347	/* XXX is uio->uio_offset the right thing here? */
	348	error = VOP_BALLOC(vp, uio->uio_offset, xfersize,
	349	ap->a_cred, flags, &bp);
	350	if (error != 0)
	351	break;
	352	/*
	353	* If the buffer is not valid and we did not clear garbage
	354	* out above, we have to do so here even though the write
	355	* covers the entire buffer in order to avoid a mmap()/write
	356	* race where another process may see the garbage prior to
	357	* the uiomove() for a write replacing it.
	358	*/
	359	if ((bp->b_flags & B_CACHE) == 0 && (flags & B_CLRBUF) == 0)
	360	vfs_bio_clrbuf(bp);
	361	if (ioflag & IO_DIRECT)
	362	bp->b_flags \|= B_DIRECT;
	363	if ((ioflag & (IO_SYNC\|IO_INVAL)) == (IO_SYNC\|IO_INVAL))
	364	bp->b_flags \|= B_NOCACHE;
	365
	366	if (uio->uio_offset + xfersize > ip->i_size) {
	367	ip->i_size = uio->uio_offset + xfersize;
	368	extended = 1;
	369	}
	370
	371	size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
	372	if (size < xfersize)
	373	xfersize = size;
	374
	375	error = uiomovebp(bp, bp->b_data + blkoffset, xfersize, uio);
	376	if ((ioflag & (IO_VMIO\|IO_DIRECT)) &&
	377	(LIST_FIRST(&bp->b_dep) == NULL)) {
	378	bp->b_flags \|= B_RELBUF;
	379	}
	380
	381	/*
	382	* If IO_SYNC each buffer is written synchronously. Otherwise
	383	* if we have a severe page deficiency write the buffer
	384	* asynchronously. Otherwise try to cluster, and if that
	385	* doesn't do it then either do an async write (if O_DIRECT),
	386	* or a delayed write (if not).
	387	*/
	388
	389	if (ioflag & IO_SYNC) {
	390	(void)bwrite(bp);
	391	} else if (vm_page_count_severe() \|\|
	392	buf_dirty_count_severe() \|\|
	393	(ioflag & IO_ASYNC)) {
	394	bp->b_flags \|= B_CLUSTEROK;
	395	bawrite(bp);
	396	} else if (xfersize + blkoffset == fs->fs_bsize) {
	397	if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
	398	bp->b_flags \|= B_CLUSTEROK;
	399	cluster_write(bp, (off_t)ip->i_size, fs->fs_bsize, seqcount);
	400	} else {
	401	bawrite(bp);
	402	}
	403	} else if (ioflag & IO_DIRECT) {
	404	bp->b_flags \|= B_CLUSTEROK;
	405	bawrite(bp);
	406	} else {
	407	bp->b_flags \|= B_CLUSTEROK;
	408	bdwrite(bp);
	409	}
	410	if (error \|\| xfersize == 0)
	411	break;
	412	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
	413	}
	414	/*
	415	* If we successfully wrote any data, and we are not the superuser
	416	* we clear the setuid and setgid bits as a precaution against
	417	* tampering.
	418	*/
	419	if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
	420	ip->i_mode &= ~(ISUID \| ISGID);
	421	if (resid > uio->uio_resid)
	422	VN_KNOTE(vp, NOTE_WRITE \| (extended ? NOTE_EXTEND : 0));
	423	if (error) {
	424	if (ioflag & IO_UNIT) {
	425	(void)ffs_truncate(vp, osize, ioflag & IO_SYNC,
	426	ap->a_cred);
	427	uio->uio_offset -= resid - uio->uio_resid;
	428	uio->uio_resid = resid;
	429	}
	430	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) {
	431	error = ffs_update(vp, 1);
	432	}
	433
	434	return (error);
	435	}
	436