gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*-
	2	* Copyright (c) 1993
	3	* The Regents of the University of California. All rights reserved.
	4	*
	5	* Redistribution and use in source and binary forms, with or without
	6	* modification, are permitted provided that the following conditions
	7	* are met:
	8	* 1. Redistributions of source code must retain the above copyright
	9	* notice, this list of conditions and the following disclaimer.
	10	* 2. Redistributions in binary form must reproduce the above copyright
	11	* notice, this list of conditions and the following disclaimer in the
	12	* documentation and/or other materials provided with the distribution.
	13	* 3. All advertising materials mentioning features or use of this software
	14	* must display the following acknowledgement:
	15	* This product includes software developed by the University of
	16	* California, Berkeley and its contributors.
	17	* 4. Neither the name of the University nor the names of its contributors
	18	* may be used to endorse or promote products derived from this software
	19	* without specific prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	22	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	23	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	24	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	25	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	26	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	27	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	28	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	29	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	30	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	31	* SUCH DAMAGE.
	32	*
	33	* @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
	34	* $FreeBSD: src/sys/ufs/ufs/ufs_readwrite.c,v 1.65.2.14 2003/04/04 22:21:29 tegge Exp $
	35	* $DragonFly: src/sys/vfs/ufs/ufs_readwrite.c,v 1.20 2006/09/03 18:29:17 dillon Exp $
	36	*/
	37
/*
 * Local aliases used by the read/write code below:
 *   BLKSIZE(a, b, c) forwards its three arguments unchanged to blksize().
 *   FS               expands to the type "struct fs".
 *   I_FS             expands to the member name i_fs (used as ip->I_FS).
 */
#define BLKSIZE(a, b, c) blksize(a, b, c)
#define FS struct fs
#define I_FS i_fs
	41
	42	#include <vm/vm.h>
	43	#include <vm/vm_object.h>
	44	#include <vm/vm_pager.h>
	45	#include <vm/vm_map.h>
	46	#include <vm/vnode_pager.h>
	47	#include <sys/event.h>
	48	#include <sys/vmmeter.h>
	49	#include <vm/vm_page2.h>
	50
	51	#include "opt_directio.h"
	52
	53	#define VN_KNOTE(vp, b) \
	54	KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))
	55
	56	#ifdef DIRECTIO
	57	extern int ffs_rawread(struct vnode vp, struct uio uio, int *workdone);
	58	#endif
	59
	60	/*
	61	* Vnode op for reading.
	62	*
	63	* ffs_read(struct vnode a_vp, struct uio a_uio, int a_ioflag,
	64	* struct ucred *a_cred)
	65	*/
	66	/* ARGSUSED */
	67	int
	68	ffs_read(struct vop_read_args *ap)
	69	{
	70	struct vnode *vp;
	71	struct inode *ip;
	72	struct uio *uio;
	73	FS *fs;
	74	struct buf *bp;
	75	off_t bytesinfile;
	76	int xfersize, blkoffset;
	77	int error, orig_resid;
	78	u_short mode;
	79	int seqcount;
	80	int ioflag;
	81
	82	vp = ap->a_vp;
	83	seqcount = ap->a_ioflag >> 16;
	84	ip = VTOI(vp);
	85	mode = ip->i_mode;
	86	uio = ap->a_uio;
	87	ioflag = ap->a_ioflag;
	88	#ifdef DIRECTIO
	89	if ((ioflag & IO_DIRECT) != 0) {
	90	int workdone;
	91
	92	error = ffs_rawread(vp, uio, &workdone);
	93	if (error \|\| workdone)
	94	return error;
	95	}
	96	#endif
	97
	98	#ifdef DIAGNOSTIC
	99	if (uio->uio_rw != UIO_READ)
	100	panic("ffs_read: mode");
	101
	102	if (vp->v_type == VLNK) {
	103	if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
	104	panic("ffs_read: short symlink");
	105	} else if (vp->v_type != VREG && vp->v_type != VDIR)
	106	panic("ffs_read: type %d", vp->v_type);
	107	#endif
	108	fs = ip->I_FS;
	109	if ((uint64_t)uio->uio_offset > fs->fs_maxfilesize)
	110	return (EFBIG);
	111
	112	orig_resid = uio->uio_resid;
	113	if (orig_resid <= 0)
	114	return (0);
	115
	116	bytesinfile = ip->i_size - uio->uio_offset;
	117	if (bytesinfile <= 0) {
	118	if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
	119	ip->i_flag \|= IN_ACCESS;
	120	return 0;
	121	}
	122
	123	/*
	124	* Ok so we couldn't do it all in one vm trick...
	125	* so cycle around trying smaller bites..
	126	*/
	127	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
	128	if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
	129	break;
	130
	131	error = ffs_blkatoff_ra(vp, uio->uio_offset, NULL,
	132	&bp, seqcount);
	133	if (error)
	134	break;
	135
	136	/*
	137	* If IO_DIRECT then set B_DIRECT for the buffer. This
	138	* will cause us to attempt to release the buffer later on
	139	* and will cause the buffer cache to attempt to free the
	140	* underlying pages.
	141	*/
	142	if (ioflag & IO_DIRECT)
	143	bp->b_flags \|= B_DIRECT;
	144
	145	/*
	146	* We should only get non-zero b_resid when an I/O error
	147	* has occurred, which should cause us to break above.
	148	* However, if the short read did not cause an error,
	149	* then we want to ensure that we do not uiomove bad
	150	* or uninitialized data.
	151	*
	152	* XXX b_resid is only valid when an actual I/O has occured
	153	* and may be incorrect if the buffer is B_CACHE or if the
	154	* last op on the buffer was a failed write. This KASSERT
	155	* is a precursor to removing it from the UFS code.
	156	*/
	157	KASSERT(bp->b_resid == 0, ("bp->b_resid != 0"));
	158
	159	/*
	160	* Calculate how much data we can copy
	161	*/
	162	blkoffset = blkoff(fs, uio->uio_offset);
	163	xfersize = bp->b_bufsize - blkoffset;
	164	if (xfersize > uio->uio_resid)
	165	xfersize = uio->uio_resid;
	166	if (xfersize > bytesinfile)
	167	xfersize = bytesinfile;
	168	if (xfersize <= 0) {
	169	panic("ufs_readwrite: impossible xfersize: %d",
	170	xfersize);
	171	}
	172
	173	/*
	174	* otherwise use the general form
	175	*/
	176	error = uiomove((char *)bp->b_data + blkoffset,
	177	(int)xfersize, uio);
	178
	179	if (error)
	180	break;
	181
	182	if ((ioflag & (IO_VMIO\|IO_DIRECT)) &&
	183	(LIST_FIRST(&bp->b_dep) == NULL)) {
	184	/*
	185	* If there are no dependencies, and it's VMIO,
	186	* then we don't need the buf, mark it available
	187	* for freeing. The VM has the data.
	188	*/
	189	bp->b_flags \|= B_RELBUF;
	190	brelse(bp);
	191	} else {
	192	/*
	193	* Otherwise let whoever
	194	* made the request take care of
	195	* freeing it. We just queue
	196	* it onto another list.
	197	*/
	198	bqrelse(bp);
	199	}
	200	}
	201
	202	/*
	203	* This can only happen in the case of an error
	204	* because the loop above resets bp to NULL on each iteration
	205	* and on normal completion has not set a new value into it.
	206	* so it must have come from a 'break' statement
	207	*/
	208	if (bp != NULL) {
	209	if ((ioflag & (IO_VMIO\|IO_DIRECT)) &&
	210	(LIST_FIRST(&bp->b_dep) == NULL)) {
	211	bp->b_flags \|= B_RELBUF;
	212	brelse(bp);
	213	} else {
	214	bqrelse(bp);
	215	}
	216	}
	217
	218	if ((error == 0 \|\| uio->uio_resid != orig_resid) &&
	219	(vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
	220	ip->i_flag \|= IN_ACCESS;
	221	return (error);
	222	}
	223
	224	/*
	225	* Vnode op for writing.
	226	*
	227	* ffs_write(struct vnode a_vp, struct uio a_uio, int a_ioflag,
	228	* struct ucred *a_cred)
	229	*/
	230	int
	231	ffs_write(struct vop_write_args *ap)
	232	{
	233	struct vnode *vp;
	234	struct uio *uio;
	235	struct inode *ip;
	236	FS *fs;
	237	struct buf *bp;
	238	ufs_daddr_t lbn;
	239	off_t osize;
	240	int seqcount;
	241	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
	242	struct thread *td;
	243
	244	extended = 0;
	245	seqcount = ap->a_ioflag >> 16;
	246	ioflag = ap->a_ioflag;
	247	uio = ap->a_uio;
	248	vp = ap->a_vp;
	249	ip = VTOI(vp);
	250
	251	#ifdef DIAGNOSTIC
	252	if (uio->uio_rw != UIO_WRITE)
	253	panic("ffs_write: mode");
	254	#endif
	255
	256	switch (vp->v_type) {
	257	case VREG:
	258	if (ioflag & IO_APPEND)
	259	uio->uio_offset = ip->i_size;
	260	if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
	261	return (EPERM);
	262	/* FALLTHROUGH */
	263	case VLNK:
	264	break;
	265	case VDIR:
	266	panic("ffs_write: dir write");
	267	break;
	268	default:
	269	panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
	270	(int)uio->uio_offset,
	271	(int)uio->uio_resid
	272	);
	273	}
	274
	275	fs = ip->I_FS;
	276	if (uio->uio_offset < 0 \|\|
	277	(uint64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
	278	return (EFBIG);
	279	}
	280	/*
	281	* Maybe this should be above the vnode op call, but so long as
	282	* file servers have no limits, I don't think it matters.
	283	*/
	284	td = uio->uio_td;
	285	if (vp->v_type == VREG && td && td->td_proc &&
	286	uio->uio_offset + uio->uio_resid >
	287	td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
	288	ksignal(td->td_proc, SIGXFSZ);
	289	return (EFBIG);
	290	}
	291
	292	resid = uio->uio_resid;
	293	osize = ip->i_size;
	294
	295	/*
	296	* NOTE! These B_ flags are actually balloc-only flags, not buffer
	297	* flags. They are similar to the BA_ flags in fbsd.
	298	*/
	299	if (seqcount > B_SEQMAX)
	300	flags = B_SEQMAX << B_SEQSHIFT;
	301	else
	302	flags = seqcount << B_SEQSHIFT;
	303	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
	304	flags \|= B_SYNC;
	305
	306	for (error = 0; uio->uio_resid > 0;) {
	307	lbn = lblkno(fs, uio->uio_offset);
	308	blkoffset = blkoff(fs, uio->uio_offset);
	309	xfersize = fs->fs_bsize - blkoffset;
	310	if (uio->uio_resid < xfersize)
	311	xfersize = uio->uio_resid;
	312
	313	if (uio->uio_offset + xfersize > ip->i_size)
	314	vnode_pager_setsize(vp, uio->uio_offset + xfersize);
	315
	316	/*
	317	* We must perform a read-before-write if the transfer
	318	* size does not cover the entire buffer.
	319	*/
	320	if (fs->fs_bsize > xfersize)
	321	flags \|= B_CLRBUF;
	322	else
	323	flags &= ~B_CLRBUF;
	324	/* XXX is uio->uio_offset the right thing here? */
	325	error = VOP_BALLOC(vp, uio->uio_offset, xfersize,
	326	ap->a_cred, flags, &bp);
	327	if (error != 0)
	328	break;
	329	/*
	330	* If the buffer is not valid and we did not clear garbage
	331	* out above, we have to do so here even though the write
	332	* covers the entire buffer in order to avoid a mmap()/write
	333	* race where another process may see the garbage prior to
	334	* the uiomove() for a write replacing it.
	335	*/
	336	if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
	337	vfs_bio_clrbuf(bp);
	338	if (ioflag & IO_DIRECT)
	339	bp->b_flags \|= B_DIRECT;
	340	if (ioflag & IO_NOWDRAIN)
	341	bp->b_flags \|= B_NOWDRAIN;
	342	if ((ioflag & (IO_SYNC\|IO_INVAL)) == (IO_SYNC\|IO_INVAL))
	343	bp->b_flags \|= B_NOCACHE;
	344
	345	if (uio->uio_offset + xfersize > ip->i_size) {
	346	ip->i_size = uio->uio_offset + xfersize;
	347	extended = 1;
	348	}
	349
	350	size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
	351	if (size < xfersize)
	352	xfersize = size;
	353
	354	error =
	355	uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
	356	if ((ioflag & (IO_VMIO\|IO_DIRECT)) &&
	357	(LIST_FIRST(&bp->b_dep) == NULL)) {
	358	bp->b_flags \|= B_RELBUF;
	359	}
	360
	361	/*
	362	* If IO_SYNC each buffer is written synchronously. Otherwise
	363	* if we have a severe page deficiency write the buffer
	364	* asynchronously. Otherwise try to cluster, and if that
	365	* doesn't do it then either do an async write (if O_DIRECT),
	366	* or a delayed write (if not).
	367	*/
	368
	369	if (ioflag & IO_SYNC) {
	370	(void)bwrite(bp);
	371	} else if (vm_page_count_severe() \|\|
	372	buf_dirty_count_severe() \|\|
	373	(ioflag & IO_ASYNC)) {
	374	bp->b_flags \|= B_CLUSTEROK;
	375	bawrite(bp);
	376	} else if (xfersize + blkoffset == fs->fs_bsize) {
	377	if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
	378	bp->b_flags \|= B_CLUSTEROK;
	379	cluster_write(bp, (off_t)ip->i_size, seqcount);
	380	} else {
	381	bawrite(bp);
	382	}
	383	} else if (ioflag & IO_DIRECT) {
	384	bp->b_flags \|= B_CLUSTEROK;
	385	bawrite(bp);
	386	} else {
	387	bp->b_flags \|= B_CLUSTEROK;
	388	bdwrite(bp);
	389	}
	390	if (error \|\| xfersize == 0)
	391	break;
	392	ip->i_flag \|= IN_CHANGE \| IN_UPDATE;
	393	}
	394	/*
	395	* If we successfully wrote any data, and we are not the superuser
	396	* we clear the setuid and setgid bits as a precaution against
	397	* tampering.
	398	*/
	399	if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
	400	ip->i_mode &= ~(ISUID \| ISGID);
	401	if (resid > uio->uio_resid)
	402	VN_KNOTE(vp, NOTE_WRITE \| (extended ? NOTE_EXTEND : 0));
	403	if (error) {
	404	if (ioflag & IO_UNIT) {
	405	(void)ffs_truncate(vp, osize, ioflag & IO_SYNC,
	406	ap->a_cred);
	407	uio->uio_offset -= resid - uio->uio_resid;
	408	uio->uio_resid = resid;
	409	}
	410	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) {
	411	error = ffs_update(vp, 1);
	412	}
	413
	414	return (error);
	415	}
	416
	417
	418	/*
	419	* get page routine
	420	*/
	421	int
	422	ffs_getpages(struct vop_getpages_args *ap)
	423	{
	424	off_t foff, physoffset;
	425	int i, size, bsize;
	426	struct vnode dp, vp;
	427	vm_object_t obj;
	428	vm_pindex_t pindex, firstindex;
	429	vm_page_t mreq;
	430	int bbackwards, bforwards;
	431	int pbackwards, pforwards;
	432	int firstpage;
	433	off_t reqoffset;
	434	off_t doffset;
	435	int poff;
	436	int pcount;
	437	int rtval;
	438	int pagesperblock;
	439
	440
	441	pcount = round_page(ap->a_count) / PAGE_SIZE;
	442	mreq = ap->a_m[ap->a_reqpage];
	443	firstindex = ap->a_m[0]->pindex;
	444
	445	/*
	446	* if ANY DEV_BSIZE blocks are valid on a large filesystem block,
	447	* then the entire page is valid. Since the page may be mapped,
	448	* user programs might reference data beyond the actual end of file
	449	* occuring within the page. We have to zero that data.
	450	*/
	451	if (mreq->valid) {
	452	if (mreq->valid != VM_PAGE_BITS_ALL)
	453	vm_page_zero_invalid(mreq, TRUE);
	454	for (i = 0; i < pcount; i++) {
	455	if (i != ap->a_reqpage) {
	456	vm_page_free(ap->a_m[i]);
	457	}
	458	}
	459	return VM_PAGER_OK;
	460	}
	461
	462	vp = ap->a_vp;
	463	obj = vp->v_object;
	464	bsize = vp->v_mount->mnt_stat.f_iosize;
	465	pindex = mreq->pindex;
	466	foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
	467
	468	if (bsize < PAGE_SIZE)
	469	return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	470	ap->a_count,
	471	ap->a_reqpage);
	472
	473	/*
	474	* foff is the file offset of the required page
	475	* reqlblkno is the logical block that contains the page
	476	* poff is the bytes offset of the page in the logical block
	477	*/
	478	poff = (int)(foff % bsize);
	479	reqoffset = foff - poff;
	480
	481	if (VOP_BMAP(vp, reqoffset, &dp, &doffset,
	482	&bforwards, &bbackwards) \|\| (doffset == NOOFFSET)
	483	) {
	484	for (i = 0; i < pcount; i++) {
	485	if (i != ap->a_reqpage)
	486	vm_page_free(ap->a_m[i]);
	487	}
	488	if (doffset == NOOFFSET) {
	489	if ((mreq->flags & PG_ZERO) == 0)
	490	vm_page_zero_fill(mreq);
	491	vm_page_undirty(mreq);
	492	mreq->valid = VM_PAGE_BITS_ALL;
	493	return VM_PAGER_OK;
	494	} else {
	495	return VM_PAGER_ERROR;
	496	}
	497	}
	498
	499	physoffset = doffset + poff;
	500	pagesperblock = bsize / PAGE_SIZE;
	501
	502	/*
	503	* find the first page that is contiguous.
	504	*
	505	* bforwards and bbackwards are the number of contiguous bytes
	506	* available before and after the block offset. poff is the page
	507	* offset, in bytes, relative to the block offset.
	508	*
	509	* pforwards and pbackwards are the number of contiguous pages
	510	* relative to the requested page, non-inclusive of the requested
	511	* page (so a pbackwards and pforwards of 0 indicates just the
	512	* requested page).
	513	*/
	514	firstpage = 0;
	515	if (ap->a_count) {
	516	/*
	517	* Calculate pbackwards and clean up any requested
	518	* pages that are too far back.
	519	*/
	520	pbackwards = (poff + bbackwards) >> PAGE_SHIFT;
	521	if (ap->a_reqpage > pbackwards) {
	522	firstpage = ap->a_reqpage - pbackwards;
	523	for (i = 0; i < firstpage; i++)
	524	vm_page_free(ap->a_m[i]);
	525	}
	526
	527	/*
	528	* Calculate pforwards
	529	*/
	530	pforwards = (bforwards - poff - PAGE_SIZE) >> PAGE_SHIFT;
	531	if (pforwards < 0)
	532	pforwards = 0;
	533	if (pforwards < (pcount - (ap->a_reqpage + 1))) {
	534	for(i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
	535	vm_page_free(ap->a_m[i]);
	536	pcount = ap->a_reqpage + pforwards + 1;
	537	}
	538
	539	/*
	540	* Adjust pcount to be relative to firstpage. All pages prior
	541	* to firstpage in the array have been cleaned up.
	542	*/
	543	pcount -= firstpage;
	544	}
	545
	546	/*
	547	* calculate the size of the transfer
	548	*/
	549	size = pcount * PAGE_SIZE;
	550
	551	if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) > vp->v_filesize) {
	552	size = vp->v_filesize - IDX_TO_OFF(ap->a_m[firstpage]->pindex);
	553	}
	554
	555	physoffset -= foff;
	556	rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
	557	(ap->a_reqpage - firstpage), physoffset);
	558
	559	return (rtval);
	560	}
	561
	562	/*
	563	* put page routine
	564	*
	565	* XXX By default, wimp out... note that a_offset is ignored (and always
	566	* XXX has been).
	567	*/
	568	int
	569	ffs_putpages(struct vop_putpages_args *ap)
	570	{
	571	return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
	572	ap->a_sync, ap->a_rtvals);
	573	}