proc->thread stage 4: rework the VFS and DEVICE subsystems to take thread
[dragonfly.git] / sys / vfs / nfs / nfs_bio.c
1/*
2 * Copyright (c) 1989, 1993
3 * The Regents of the University of California. All rights reserved.
4 *
5 * This code is derived from software contributed to Berkeley by
6 * Rick Macklem at The University of Guelph.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 * 3. All advertising materials mentioning features or use of this software
17 * must display the following acknowledgement:
18 * This product includes software developed by the University of
19 * California, Berkeley and its contributors.
20 * 4. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 *
36 * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
37 * $FreeBSD: src/sys/nfs/nfs_bio.c,v 1.83.2.4 2002/12/29 18:19:53 dillon Exp $
38 * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.4 2003/06/25 03:56:07 dillon Exp $
39 */
40
41
42#include <sys/param.h>
43#include <sys/systm.h>
44#include <sys/resourcevar.h>
45#include <sys/signalvar.h>
46#include <sys/proc.h>
47#include <sys/buf.h>
48#include <sys/vnode.h>
49#include <sys/mount.h>
50#include <sys/kernel.h>
51
52#include <vm/vm.h>
53#include <vm/vm_extern.h>
54#include <vm/vm_page.h>
55#include <vm/vm_object.h>
56#include <vm/vm_pager.h>
57#include <vm/vnode_pager.h>
58
59#include <sys/buf2.h>
60
61#include <nfs/rpcv2.h>
62#include <nfs/nfsproto.h>
63#include <nfs/nfs.h>
64#include <nfs/nfsmount.h>
65#include <nfs/nqnfs.h>
66#include <nfs/nfsnode.h>
67
68static struct buf *nfs_getcacheblk __P((struct vnode *vp, daddr_t bn, int size,
69 struct thread *td));
70
71extern int nfs_numasync;
72extern int nfs_pbuf_freecnt;
73extern struct nfsstats nfsstats;
74
75/*
76 * Vnode op for VM getpages.
77 */
78int
79nfs_getpages(ap)
80 struct vop_getpages_args /* {
81 struct vnode *a_vp;
82 vm_page_t *a_m;
83 int a_count;
84 int a_reqpage;
85 vm_ooffset_t a_offset;
86 } */ *ap;
87{
88 struct thread *td = curthread; /* XXX */
89 struct proc *p = td->td_proc; /* XXX */
90 int i, error, nextoff, size, toff, count, npages;
91 struct uio uio;
92 struct iovec iov;
93 vm_offset_t kva;
94 struct buf *bp;
95 struct vnode *vp;
96 struct ucred *cred;
97 struct nfsmount *nmp;
98 vm_page_t *pages;
99
100 KKASSERT(p);
101
102 vp = ap->a_vp;
103 cred = p->p_ucred; /* XXX */
104 nmp = VFSTONFS(vp->v_mount);
105 pages = ap->a_m;
106 count = ap->a_count;
107
108 if (vp->v_object == NULL) {
109 printf("nfs_getpages: called with non-merged cache vnode??\n");
110 return VM_PAGER_ERROR;
111 }
112
113 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
114 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
115 (void)nfs_fsinfo(nmp, vp, cred, td);
116
117 npages = btoc(count);
118
119 /*
120 * If the requested page is partially valid, just return it and
121 * allow the pager to zero-out the blanks. Partially valid pages
122 * can only occur at the file EOF.
123 */
124
125 {
126 vm_page_t m = pages[ap->a_reqpage];
127
128 if (m->valid != 0) {
129 /* handled by vm_fault now */
130 /* vm_page_zero_invalid(m, TRUE); */
131 for (i = 0; i < npages; ++i) {
132 if (i != ap->a_reqpage)
133 vnode_pager_freepage(pages[i]);
134 }
135 return(0);
136 }
137 }
138
139 /*
140 * We use only the kva address for the buffer, but this is extremely
141 * convenient and fast.
142 */
143 bp = getpbuf(&nfs_pbuf_freecnt);
144
145 kva = (vm_offset_t) bp->b_data;
146 pmap_qenter(kva, pages, npages);
147
148 iov.iov_base = (caddr_t) kva;
149 iov.iov_len = count;
150 uio.uio_iov = &iov;
151 uio.uio_iovcnt = 1;
152 uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
153 uio.uio_resid = count;
154 uio.uio_segflg = UIO_SYSSPACE;
155 uio.uio_rw = UIO_READ;
156 uio.uio_td = td;
157
158 error = nfs_readrpc(vp, &uio, cred);
159 pmap_qremove(kva, npages);
160
161 relpbuf(bp, &nfs_pbuf_freecnt);
162
163 if (error && (uio.uio_resid == count)) {
164 printf("nfs_getpages: error %d\n", error);
165 for (i = 0; i < npages; ++i) {
166 if (i != ap->a_reqpage)
167 vnode_pager_freepage(pages[i]);
168 }
169 return VM_PAGER_ERROR;
170 }
171
172 /*
173 * Calculate the number of bytes read and validate only that number
174 * of bytes. Note that due to pending writes, size may be 0. This
175 * does not mean that the remaining data is invalid!
176 */
177
178 size = count - uio.uio_resid;
179
180 for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
181 vm_page_t m;
182 nextoff = toff + PAGE_SIZE;
183 m = pages[i];
184
185 m->flags &= ~PG_ZERO;
186
187 if (nextoff <= size) {
188 /*
189 * Read operation filled an entire page
190 */
191 m->valid = VM_PAGE_BITS_ALL;
192 vm_page_undirty(m);
193 } else if (size > toff) {
194 /*
195 * Read operation filled a partial page.
196 */
197 m->valid = 0;
198 vm_page_set_validclean(m, 0, size - toff);
199 /* handled by vm_fault now */
200 /* vm_page_zero_invalid(m, TRUE); */
201 } else {
202 /*
203 * Read operation was short. If no error occurred
204 * we may have hit a zero-fill section. We simply
205 * leave valid set to 0.
206 */
207 ;
208 }
209 if (i != ap->a_reqpage) {
210 /*
211 * Whether or not to leave the page activated is up in
212 * the air, but we should put the page on a page queue
213 * somewhere (it already is in the object). Result:
214 * It appears that empirical results show that
215 * deactivating pages is best.
216 */
217
218 /*
219 * Just in case someone was asking for this page we
220 * now tell them that it is ok to use.
221 */
222 if (!error) {
223 if (m->flags & PG_WANTED)
224 vm_page_activate(m);
225 else
226 vm_page_deactivate(m);
227 vm_page_wakeup(m);
228 } else {
229 vnode_pager_freepage(m);
230 }
231 }
232 }
233 return 0;
234}
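
/*
 * Illustrative sketch (not part of the original source): the "borrow a
 * pbuf for its KVA" idiom used by nfs_getpages() above and nfs_putpages()
 * below.  A physical buffer is taken only for its kernel virtual address
 * range, the VM pages are temporarily mapped into that range, and a plain
 * UIO_SYSSPACE uio is pointed at the mapping so the normal read RPC can be
 * used.  The helper name and its minimal error handling are hypothetical.
 */
static int
nfs_pages_read_sketch(struct vnode *vp, vm_page_t *pages, int npages,
	int count, struct ucred *cred, struct thread *td)
{
	struct buf *bp;
	vm_offset_t kva;
	struct iovec iov;
	struct uio uio;
	int error;

	bp = getpbuf(&nfs_pbuf_freecnt);	/* only the KVA is wanted */
	kva = (vm_offset_t) bp->b_data;
	pmap_qenter(kva, pages, npages);	/* map the pages into it */

	iov.iov_base = (caddr_t) kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

	error = nfs_readrpc(vp, &uio, cred);	/* RPC sees an ordinary uio */

	pmap_qremove(kva, npages);	/* always unmap and give the pbuf back */
	relpbuf(bp, &nfs_pbuf_freecnt);
	return (error);
}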
235
236/*
237 * Vnode op for VM putpages.
238 */
239int
240nfs_putpages(ap)
241 struct vop_putpages_args /* {
242 struct vnode *a_vp;
243 vm_page_t *a_m;
244 int a_count;
245 int a_sync;
246 int *a_rtvals;
247 vm_ooffset_t a_offset;
248 } */ *ap;
249{
250 struct thread *td = curthread;
251 struct uio uio;
252 struct iovec iov;
253 vm_offset_t kva;
254 struct buf *bp;
255 int iomode, must_commit, i, error, npages, count;
256 off_t offset;
257 int *rtvals;
258 struct vnode *vp;
259 struct ucred *cred;
260 struct nfsmount *nmp;
261 struct nfsnode *np;
262 vm_page_t *pages;
263
264 KKASSERT(td->td_proc);
265 cred = td->td_proc->p_ucred;
266
267 vp = ap->a_vp;
268 np = VTONFS(vp);
269 nmp = VFSTONFS(vp->v_mount);
270 pages = ap->a_m;
271 count = ap->a_count;
272 rtvals = ap->a_rtvals;
273 npages = btoc(count);
274 offset = IDX_TO_OFF(pages[0]->pindex);
275
276 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
277 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
278 (void)nfs_fsinfo(nmp, vp, cred, td);
279
280 for (i = 0; i < npages; i++) {
281 rtvals[i] = VM_PAGER_AGAIN;
282 }
283
284 /*
285 * When putting pages, do not extend file past EOF.
286 */
287
288 if (offset + count > np->n_size) {
289 count = np->n_size - offset;
290 if (count < 0)
291 count = 0;
292 }
293
294 /*
295 * We use only the kva address for the buffer, but this is extremely
296 * convenient and fast.
297 */
298 bp = getpbuf(&nfs_pbuf_freecnt);
299
300 kva = (vm_offset_t) bp->b_data;
301 pmap_qenter(kva, pages, npages);
302
303 iov.iov_base = (caddr_t) kva;
304 iov.iov_len = count;
305 uio.uio_iov = &iov;
306 uio.uio_iovcnt = 1;
307 uio.uio_offset = offset;
308 uio.uio_resid = count;
309 uio.uio_segflg = UIO_SYSSPACE;
310 uio.uio_rw = UIO_WRITE;
311 uio.uio_td = td;
312
313 if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
314 iomode = NFSV3WRITE_UNSTABLE;
315 else
316 iomode = NFSV3WRITE_FILESYNC;
317
318 error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);
319
320 pmap_qremove(kva, npages);
321 relpbuf(bp, &nfs_pbuf_freecnt);
322
323 if (!error) {
324 int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
325 for (i = 0; i < nwritten; i++) {
326 rtvals[i] = VM_PAGER_OK;
327 vm_page_undirty(pages[i]);
328 }
329 if (must_commit)
330 nfs_clearcommit(vp->v_mount);
331 }
332 return rtvals[0];
333}
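
/*
 * Illustrative sketch (hypothetical helper, not in the original source) of
 * how nfs_putpages() above reports results back to the VM pager: every
 * rtvals[] slot starts out VM_PAGER_AGAIN, and only pages fully covered by
 * the bytes the write RPC actually accepted are marked VM_PAGER_OK and
 * undirtied.  The pager uses rtvals[0] as the overall return value.
 */
static void
nfs_putpages_rtvals_sketch(vm_page_t *pages, int *rtvals, int npages,
	int count, int resid)
{
	int nwritten = round_page(count - resid) / PAGE_SIZE;
	int i;

	for (i = 0; i < npages; i++)
		rtvals[i] = VM_PAGER_AGAIN;	/* default: retry later */
	for (i = 0; i < nwritten; i++) {
		rtvals[i] = VM_PAGER_OK;	/* this page made it to the server */
		vm_page_undirty(pages[i]);
	}
}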
334
335/*
336 * Vnode op for read using bio
337 */
338int
339nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred)
340{
341 struct nfsnode *np = VTONFS(vp);
342 int biosize, i;
343 struct buf *bp = 0, *rabp;
344 struct vattr vattr;
345 struct thread *td;
346 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
347 daddr_t lbn, rabn;
348 int bcount;
349 int seqcount;
350 int nra, error = 0, n = 0, on = 0;
351
352#ifdef DIAGNOSTIC
353 if (uio->uio_rw != UIO_READ)
354 panic("nfs_read mode");
355#endif
356 if (uio->uio_resid == 0)
357 return (0);
358 if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */
359 return (EINVAL);
360 td = uio->uio_td;
361
362 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
363 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
364 (void)nfs_fsinfo(nmp, vp, cred, td);
365 if (vp->v_type != VDIR &&
366 (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
367 return (EFBIG);
368 biosize = vp->v_mount->mnt_stat.f_iosize;
369 seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
370 /*
371 * For nfs, cache consistency can only be maintained approximately.
372 * Although RFC1094 does not specify the criteria, the following is
373 * believed to be compatible with the reference port.
374 * For nqnfs, full cache consistency is maintained within the loop.
375 * For nfs:
376 * If the file's modify time on the server has changed since the
377 * last read rpc or you have written to the file,
378 * you may have lost data cache consistency with the
379 * server, so flush all of the file's data out of the cache.
380 * Then force a getattr rpc to ensure that you have up to date
381 * attributes.
382 * NB: This implies that cache data can be read when up to
383 * NFS_ATTRTIMEO seconds out of date. If you find that you need current
384 * attributes this could be forced by setting n_attrstamp to 0 before
385 * the VOP_GETATTR() call.
386 */
387 if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
388 if (np->n_flag & NMODIFIED) {
389 if (vp->v_type != VREG) {
390 if (vp->v_type != VDIR)
391 panic("nfs: bioread, not dir");
392 nfs_invaldir(vp);
393 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
394 if (error)
395 return (error);
396 }
397 np->n_attrstamp = 0;
398 error = VOP_GETATTR(vp, &vattr, cred, td);
399 if (error)
400 return (error);
401 np->n_mtime = vattr.va_mtime.tv_sec;
402 } else {
403 error = VOP_GETATTR(vp, &vattr, cred, td);
404 if (error)
405 return (error);
406 if (np->n_mtime != vattr.va_mtime.tv_sec) {
407 if (vp->v_type == VDIR)
408 nfs_invaldir(vp);
409 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
410 if (error)
411 return (error);
412 np->n_mtime = vattr.va_mtime.tv_sec;
413 }
414 }
415 }
416 do {
417
418 /*
419 * Get a valid lease. If cached data is stale, flush it.
420 */
421 if (nmp->nm_flag & NFSMNT_NQNFS) {
422 if (NQNFS_CKINVALID(vp, np, ND_READ)) {
423 do {
424 error = nqnfs_getlease(vp, ND_READ, cred, td);
425 } while (error == NQNFS_EXPIRED);
426 if (error)
427 return (error);
428 if (np->n_lrev != np->n_brev ||
429 (np->n_flag & NQNFSNONCACHE) ||
430 ((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
431 if (vp->v_type == VDIR)
432 nfs_invaldir(vp);
433 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
434 if (error)
435 return (error);
436 np->n_brev = np->n_lrev;
437 }
438 } else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
439 nfs_invaldir(vp);
440 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
441 if (error)
442 return (error);
443 }
444 }
445 if (np->n_flag & NQNFSNONCACHE) {
446 switch (vp->v_type) {
447 case VREG:
448 return (nfs_readrpc(vp, uio, cred));
449 case VLNK:
450 return (nfs_readlinkrpc(vp, uio, cred));
451 case VDIR:
452 break;
453 default:
454 printf(" NQNFSNONCACHE: type %x unexpected\n",
455 vp->v_type);
456 };
457 }
458 switch (vp->v_type) {
459 case VREG:
460 nfsstats.biocache_reads++;
461 lbn = uio->uio_offset / biosize;
462 on = uio->uio_offset & (biosize - 1);
463
464 /*
465 * Start the read ahead(s), as required.
466 */
467 if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
468 for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
469 (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
470 rabn = lbn + 1 + nra;
471 if (!incore(vp, rabn)) {
472 rabp = nfs_getcacheblk(vp, rabn, biosize, td);
473 if (!rabp)
474 return (EINTR);
475 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
476 rabp->b_flags |= (B_READ | B_ASYNC);
477 vfs_busy_pages(rabp, 0);
478 if (nfs_asyncio(rabp, cred, td)) {
479 rabp->b_flags |= B_INVAL|B_ERROR;
480 vfs_unbusy_pages(rabp);
481 brelse(rabp);
482 break;
483 }
484 } else {
485 brelse(rabp);
486 }
487 }
488 }
489 }
490
491 /*
492 * Obtain the buffer cache block. Figure out the buffer size
493 * when we are at EOF. If we are modifying the size of the
494 * buffer based on an EOF condition we need to hold
495 * nfs_rslock() through obtaining the buffer to prevent
496 * a potential writer-appender from messing with n_size.
497 * Otherwise we may accidentally truncate the buffer and
498 * lose dirty data.
499 *
500 * Note that bcount is *not* DEV_BSIZE aligned.
501 */
502
503again:
504 bcount = biosize;
505 if ((off_t)lbn * biosize >= np->n_size) {
506 bcount = 0;
507 } else if ((off_t)(lbn + 1) * biosize > np->n_size) {
508 bcount = np->n_size - (off_t)lbn * biosize;
509 }
510 if (bcount != biosize) {
511 switch(nfs_rslock(np, td)) {
512 case ENOLCK:
513 goto again;
514 /* not reached */
515 case EINTR:
516 case ERESTART:
517 return(EINTR);
518 /* not reached */
519 default:
520 break;
521 }
522 }
523
524 bp = nfs_getcacheblk(vp, lbn, bcount, td);
525
526 if (bcount != biosize)
527 nfs_rsunlock(np, td);
528 if (!bp)
529 return (EINTR);
530
531 /*
532 * If B_CACHE is not set, we must issue the read. If this
533 * fails, we return an error.
534 */
535
536 if ((bp->b_flags & B_CACHE) == 0) {
537 bp->b_flags |= B_READ;
538 vfs_busy_pages(bp, 0);
539 error = nfs_doio(bp, cred, td);
540 if (error) {
541 brelse(bp);
542 return (error);
543 }
544 }
545
546 /*
547 * on is the offset into the current bp. Figure out how many
548 * bytes we can copy out of the bp. Note that bcount is
549 * NOT DEV_BSIZE aligned.
550 *
551 * Then figure out how many bytes we can copy into the uio.
552 */
553
554 n = 0;
555 if (on < bcount)
556 n = min((unsigned)(bcount - on), uio->uio_resid);
557 break;
558 case VLNK:
559 nfsstats.biocache_readlinks++;
560 bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, td);
561 if (!bp)
562 return (EINTR);
563 if ((bp->b_flags & B_CACHE) == 0) {
564 bp->b_flags |= B_READ;
565 vfs_busy_pages(bp, 0);
566 error = nfs_doio(bp, cred, td);
567 if (error) {
568 bp->b_flags |= B_ERROR;
569 brelse(bp);
570 return (error);
571 }
572 }
573 n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
574 on = 0;
575 break;
576 case VDIR:
577 nfsstats.biocache_readdirs++;
578 if (np->n_direofoffset
579 && uio->uio_offset >= np->n_direofoffset) {
580 return (0);
581 }
582 lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
583 on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
584 bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, td);
585 if (!bp)
586 return (EINTR);
587 if ((bp->b_flags & B_CACHE) == 0) {
588 bp->b_flags |= B_READ;
589 vfs_busy_pages(bp, 0);
590 error = nfs_doio(bp, cred, td);
591 if (error) {
592 brelse(bp);
593 }
594 while (error == NFSERR_BAD_COOKIE) {
595 printf("got bad cookie vp %p bp %p\n", vp, bp);
596 nfs_invaldir(vp);
597 error = nfs_vinvalbuf(vp, 0, cred, td, 1);
598 /*
599 * Yuck! The directory has been modified on the
600 * server. The only way to get the block is by
601 * reading from the beginning to get all the
602 * offset cookies.
603 *
604 * Leave the last bp intact unless there is an error.
605 * Loop back up to the while if the error is another
606 * NFSERR_BAD_COOKIE (double yuch!).
607 */
608 for (i = 0; i <= lbn && !error; i++) {
609 if (np->n_direofoffset
610 && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
611 return (0);
612 bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, td);
613 if (!bp)
614 return (EINTR);
615 if ((bp->b_flags & B_CACHE) == 0) {
616 bp->b_flags |= B_READ;
617 vfs_busy_pages(bp, 0);
618 error = nfs_doio(bp, cred, td);
619 /*
620 * no error + B_INVAL == directory EOF,
621 * use the block.
622 */
623 if (error == 0 && (bp->b_flags & B_INVAL))
624 break;
625 }
626 /*
627 * An error will throw away the block and the
628 * for loop will break out. If no error and this
629 * is not the block we want, we throw away the
630 * block and go for the next one via the for loop.
631 */
632 if (error || i < lbn)
633 brelse(bp);
634 }
635 }
636 /*
637 * The above while is repeated if we hit another cookie
638 * error. If we hit an error and it wasn't a cookie error,
639 * we give up.
640 */
641 if (error)
642 return (error);
643 }
644
645 /*
646 * If not eof and read aheads are enabled, start one.
647 * (You need the current block first, so that you have the
648 * directory offset cookie of the next block.)
649 */
650 if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
651 (bp->b_flags & B_INVAL) == 0 &&
652 (np->n_direofoffset == 0 ||
653 (lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
654 !(np->n_flag & NQNFSNONCACHE) &&
655 !incore(vp, lbn + 1)) {
656 rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, td);
657 if (rabp) {
658 if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
659 rabp->b_flags |= (B_READ | B_ASYNC);
660 vfs_busy_pages(rabp, 0);
661 if (nfs_asyncio(rabp, cred, td)) {
662 rabp->b_flags |= B_INVAL|B_ERROR;
663 vfs_unbusy_pages(rabp);
664 brelse(rabp);
665 }
666 } else {
667 brelse(rabp);
668 }
669 }
670 }
671 /*
672 * Unlike VREG files, whose buffer size ( bp->b_bcount ) is
673 * chopped for the EOF condition, we cannot tell how large
674 * NFS directories are going to be until we hit EOF. So
675 * an NFS directory buffer is *not* chopped to its EOF. Now,
676 * it just so happens that b_resid will effectively chop it
677 * to EOF. *BUT* this information is lost if the buffer goes
678 * away and is reconstituted into a B_CACHE state ( due to
679 * being VMIO ) later. So we keep track of the directory eof
680 * in np->n_direofoffset and chop it off as an extra step
681 * right here.
682 */
683 n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
684 if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
685 n = np->n_direofoffset - uio->uio_offset;
686 break;
687 default:
688 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
689 break;
690 };
691
692 if (n > 0) {
693 error = uiomove(bp->b_data + on, (int)n, uio);
694 }
695 switch (vp->v_type) {
696 case VREG:
697 break;
698 case VLNK:
699 n = 0;
700 break;
701 case VDIR:
702 /*
703 * Invalidate buffer if caching is disabled, forcing a
704 * re-read from the remote later.
705 */
706 if (np->n_flag & NQNFSNONCACHE)
707 bp->b_flags |= B_INVAL;
708 break;
709 default:
710 printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
711 }
712 brelse(bp);
713 } while (error == 0 && uio->uio_resid > 0 && n > 0);
714 return (error);
715}
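
/*
 * Illustrative sketch of the consistency policy described in the large
 * comment near the top of nfs_bioread(): zero n_attrstamp to force a fresh
 * GETATTR, then throw away the cached data if the server-side modify time
 * no longer matches the one we remembered.  The helper name is
 * hypothetical; the real checks are done inline above.
 */
static int
nfs_check_consistency_sketch(struct vnode *vp, struct ucred *cred,
	struct thread *td)
{
	struct nfsnode *np = VTONFS(vp);
	struct vattr vattr;
	int error;

	np->n_attrstamp = 0;	/* force the attributes to be refetched */
	error = VOP_GETATTR(vp, &vattr, cred, td);
	if (error)
		return (error);
	if (np->n_mtime != vattr.va_mtime.tv_sec) {
		/* server copy changed: dump cached buffers, resync our mtime */
		if (vp->v_type == VDIR)
			nfs_invaldir(vp);
		error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
		if (error)
			return (error);
		np->n_mtime = vattr.va_mtime.tv_sec;
	}
	return (0);
}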
716
717/*
718 * Vnode op for write using bio
719 */
720int
721nfs_write(ap)
722 struct vop_write_args /* {
723 struct vnode *a_vp;
724 struct uio *a_uio;
725 int a_ioflag;
726 struct ucred *a_cred;
727 } */ *ap;
728{
729 int biosize;
730 struct uio *uio = ap->a_uio;
731 struct thread *td = uio->uio_td;
732 struct vnode *vp = ap->a_vp;
733 struct nfsnode *np = VTONFS(vp);
734 struct ucred *cred = ap->a_cred;
735 int ioflag = ap->a_ioflag;
736 struct buf *bp;
737 struct vattr vattr;
738 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
739 daddr_t lbn;
740 int bcount;
741 int n, on, error = 0, iomode, must_commit;
742 int haverslock = 0;
743
744#ifdef DIAGNOSTIC
745 if (uio->uio_rw != UIO_WRITE)
746 panic("nfs_write mode");
747 if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
748 panic("nfs_write proc");
749#endif
750 if (vp->v_type != VREG)
751 return (EIO);
752 if (np->n_flag & NWRITEERR) {
753 np->n_flag &= ~NWRITEERR;
754 return (np->n_error);
755 }
756 if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
757 (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
758 (void)nfs_fsinfo(nmp, vp, cred, td);
759
760 /*
761 * Synchronously flush pending buffers if we are in synchronous
762 * mode or if we are appending.
763 */
764 if (ioflag & (IO_APPEND | IO_SYNC)) {
765 if (np->n_flag & NMODIFIED) {
766 np->n_attrstamp = 0;
767 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
768 if (error)
769 return (error);
770 }
771 }
772
773 /*
774 * If IO_APPEND then load uio_offset. We restart here if we cannot
775 * get the append lock.
776 */
777restart:
778 if (ioflag & IO_APPEND) {
779 np->n_attrstamp = 0;
780 error = VOP_GETATTR(vp, &vattr, cred, td);
781 if (error)
782 return (error);
783 uio->uio_offset = np->n_size;
784 }
785
786 if (uio->uio_offset < 0)
787 return (EINVAL);
788 if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
789 return (EFBIG);
790 if (uio->uio_resid == 0)
791 return (0);
792
793 /*
794 * We need to obtain the rslock if we intend to modify np->n_size
795 * in order to guarantee the append point with multiple contending
796 * writers, to guarantee that no other appenders modify n_size
797 * while we are trying to obtain a truncated buffer (i.e. to avoid
798 * accidentally truncating data written by another appender due to
799 * the race), and to ensure that the buffer is populated prior to
800 * our extending of the file. We hold rslock through the entire
801 * operation.
802 *
803 * Note that we do not synchronize the case where someone truncates
804 * the file while we are appending to it because attempting to lock
805 * this case may deadlock other parts of the system unexpectedly.
806 */
807 if ((ioflag & IO_APPEND) ||
808 uio->uio_offset + uio->uio_resid > np->n_size) {
809 switch(nfs_rslock(np, td)) {
810 case ENOLCK:
811 goto restart;
812 /* not reached */
813 case EINTR:
814 case ERESTART:
815 return(EINTR);
816 /* not reached */
817 default:
818 break;
819 }
820 haverslock = 1;
821 }
822
823 /*
824 * Maybe this should be above the vnode op call, but so long as
825 * file servers have no limits, I don't think it matters
826 */
827 if (td->td_proc && uio->uio_offset + uio->uio_resid >
828 td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
829 psignal(td->td_proc, SIGXFSZ);
830 if (haverslock)
831 nfs_rsunlock(np, td);
832 return (EFBIG);
833 }
834
835 biosize = vp->v_mount->mnt_stat.f_iosize;
836
837 do {
838 /*
839 * Check for a valid write lease.
840 */
841 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
842 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
843 do {
844 error = nqnfs_getlease(vp, ND_WRITE, cred, td);
845 } while (error == NQNFS_EXPIRED);
846 if (error)
847 break;
848 if (np->n_lrev != np->n_brev ||
849 (np->n_flag & NQNFSNONCACHE)) {
850 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
851 if (error)
852 break;
853 np->n_brev = np->n_lrev;
854 }
855 }
856 if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
857 iomode = NFSV3WRITE_FILESYNC;
858 error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
859 if (must_commit)
860 nfs_clearcommit(vp->v_mount);
861 break;
862 }
863 nfsstats.biocache_writes++;
864 lbn = uio->uio_offset / biosize;
865 on = uio->uio_offset & (biosize-1);
866 n = min((unsigned)(biosize - on), uio->uio_resid);
867again:
868 /*
869 * Handle direct append and file extension cases, calculate
870 * unaligned buffer size.
871 */
872
873 if (uio->uio_offset == np->n_size && n) {
874 /*
875 * Get the buffer (in its pre-append state to maintain
876 * B_CACHE if it was previously set). Resize the
877 * nfsnode after we have locked the buffer to prevent
878 * readers from reading garbage.
879 */
880 bcount = on;
881 bp = nfs_getcacheblk(vp, lbn, bcount, td);
882
883 if (bp != NULL) {
884 long save;
885
886 np->n_size = uio->uio_offset + n;
887 np->n_flag |= NMODIFIED;
888 vnode_pager_setsize(vp, np->n_size);
889
890 save = bp->b_flags & B_CACHE;
891 bcount += n;
892 allocbuf(bp, bcount);
893 bp->b_flags |= save;
894 }
895 } else {
896 /*
897 * Obtain the locked cache block first, and then
898 * adjust the file's size as appropriate.
899 */
900 bcount = on + n;
901 if ((off_t)lbn * biosize + bcount < np->n_size) {
902 if ((off_t)(lbn + 1) * biosize < np->n_size)
903 bcount = biosize;
904 else
905 bcount = np->n_size - (off_t)lbn * biosize;
906 }
907 bp = nfs_getcacheblk(vp, lbn, bcount, td);
908 if (uio->uio_offset + n > np->n_size) {
909 np->n_size = uio->uio_offset + n;
910 np->n_flag |= NMODIFIED;
911 vnode_pager_setsize(vp, np->n_size);
912 }
913 }
914
915 if (!bp) {
916 error = EINTR;
917 break;
918 }
919
920 /*
921 * Issue a READ if B_CACHE is not set. In special-append
922 * mode, B_CACHE is based on the buffer prior to the write
923 * op and is typically set, avoiding the read. If a read
924 * is required in special append mode, the server will
925 * probably send us a short-read since we extended the file
926 * on our end, resulting in b_resid == 0 and, thusly,
927 * B_CACHE getting set.
928 *
929 * We can also avoid issuing the read if the write covers
930 * the entire buffer. We have to make sure the buffer state
931 * is reasonable in this case since we will not be initiating
932 * I/O. See the comments in kern/vfs_bio.c's getblk() for
933 * more information.
934 *
935 * B_CACHE may also be set due to the buffer being cached
936 * normally.
937 */
938
939 if (on == 0 && n == bcount) {
940 bp->b_flags |= B_CACHE;
941 bp->b_flags &= ~(B_ERROR | B_INVAL);
942 }
943
944 if ((bp->b_flags & B_CACHE) == 0) {
945 bp->b_flags |= B_READ;
946 vfs_busy_pages(bp, 0);
947 error = nfs_doio(bp, cred, td);
948 if (error) {
949 brelse(bp);
950 break;
951 }
952 }
953 if (!bp) {
954 error = EINTR;
955 break;
956 }
957 if (bp->b_wcred == NOCRED) {
958 crhold(cred);
959 bp->b_wcred = cred;
960 }
961 np->n_flag |= NMODIFIED;
962
963 /*
964 * If dirtyend exceeds file size, chop it down. This should
965 * not normally occur but there is an append race where it
966 * might occur XXX, so we log it.
967 *
968 * If the chopping creates a reverse-indexed or degenerate
969 * situation with dirtyoff/end, we 0 both of them.
970 */
971
972 if (bp->b_dirtyend > bcount) {
973 printf("NFS append race @%lx:%d\n",
974 (long)bp->b_blkno * DEV_BSIZE,
975 bp->b_dirtyend - bcount);
976 bp->b_dirtyend = bcount;
977 }
978
979 if (bp->b_dirtyoff >= bp->b_dirtyend)
980 bp->b_dirtyoff = bp->b_dirtyend = 0;
981
982 /*
983 * If the new write will leave a contiguous dirty
984 * area, just update the b_dirtyoff and b_dirtyend,
985 * otherwise force a write rpc of the old dirty area.
986 *
987 * While it is possible to merge discontiguous writes due to
988 * our having a B_CACHE buffer ( and thus valid read data
989 * for the hole), we don't because it could lead to
990 * significant cache coherency problems with multiple clients,
991 * especially if locking is implemented later on.
992 *
993 * as an optimization we could theoretically maintain
994 * a linked list of discontinuous areas, but we would still
995 * have to commit them separately so there isn't much
996 * advantage to it except perhaps a bit of asynchronization.
997 */
998
999 if (bp->b_dirtyend > 0 &&
1000 (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
1001 if (VOP_BWRITE(bp->b_vp, bp) == EINTR) {
1002 error = EINTR;
1003 break;
1004 }
1005 goto again;
1006 }
1007
1008 /*
1009 * Check for valid write lease and get one as required.
1010 * In case getblk() and/or bwrite() delayed us.
1011 */
1012 if ((nmp->nm_flag & NFSMNT_NQNFS) &&
1013 NQNFS_CKINVALID(vp, np, ND_WRITE)) {
1014 do {
1015 error = nqnfs_getlease(vp, ND_WRITE, cred, td);
1016 } while (error == NQNFS_EXPIRED);
1017 if (error) {
1018 brelse(bp);
1019 break;
1020 }
1021 if (np->n_lrev != np->n_brev ||
1022 (np->n_flag & NQNFSNONCACHE)) {
1023 brelse(bp);
1024 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
1025 if (error)
1026 break;
1027 np->n_brev = np->n_lrev;
1028 goto again;
1029 }
1030 }
1031
1032 error = uiomove((char *)bp->b_data + on, n, uio);
1033
1034 /*
1035 * Since this block is being modified, it must be written
1036 * again and not just committed. Since write clustering does
1037 * not work for the stage 1 data write, only the stage 2
1038 * commit rpc, we have to clear B_CLUSTEROK as well.
1039 */
1040 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1041
1042 if (error) {
1043 bp->b_flags |= B_ERROR;
1044 brelse(bp);
1045 break;
1046 }
1047
1048 /*
1049 * Only update dirtyoff/dirtyend if not a degenerate
1050 * condition.
1051 */
1052 if (n) {
1053 if (bp->b_dirtyend > 0) {
1054 bp->b_dirtyoff = min(on, bp->b_dirtyoff);
1055 bp->b_dirtyend = max((on + n), bp->b_dirtyend);
1056 } else {
1057 bp->b_dirtyoff = on;
1058 bp->b_dirtyend = on + n;
1059 }
1060 vfs_bio_set_validclean(bp, on, n);
1061 }
1062 /*
1063 * If IO_NOWDRAIN then set B_NOWDRAIN (e.g. nfs-backed VN
1064 * filesystem). XXX also use for loopback NFS mounts.
1065 */
1066 if (ioflag & IO_NOWDRAIN)
1067 bp->b_flags |= B_NOWDRAIN;
1068
1069 /*
1070 * If the lease is non-cachable or IO_SYNC do bwrite().
1071 *
1072 * IO_INVAL appears to be unused. The idea appears to be
1073 * to turn off caching in this case. Very odd. XXX
1074 */
1075 if ((np->n_flag & NQNFSNONCACHE) || (ioflag & IO_SYNC)) {
1076 if (ioflag & IO_INVAL)
1077 bp->b_flags |= B_NOCACHE;
1078 error = VOP_BWRITE(bp->b_vp, bp);
1079 if (error)
1080 break;
1081 if (np->n_flag & NQNFSNONCACHE) {
1082 error = nfs_vinvalbuf(vp, V_SAVE, cred, td, 1);
1083 if (error)
1084 break;
1085 }
1086 } else if ((n + on) == biosize &&
1087 (nmp->nm_flag & NFSMNT_NQNFS) == 0) {
1088 bp->b_flags |= B_ASYNC;
1089 (void)nfs_writebp(bp, 0, 0);
1090 } else {
1091 bdwrite(bp);
1092 }
1093 } while (uio->uio_resid > 0 && n > 0);
1094
1095 if (haverslock)
1096 nfs_rsunlock(np, td);
1097
1098 return (error);
1099}
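
/*
 * Illustrative sketch (hypothetical helper) of the dirty-region rule
 * discussed in nfs_write() above: a new write at [on, on + n) is merged
 * into the buffer's existing dirty range only if the result stays
 * contiguous; otherwise the caller must push the old dirty area with a
 * write RPC (VOP_BWRITE) first.  Discontiguous merging is deliberately
 * avoided to keep cache coherency manageable with multiple clients.
 * Returns 1 if merged, 0 if the caller must flush and retry.
 */
static int
nfs_merge_dirty_sketch(struct buf *bp, int on, int n)
{
	/* discontiguous with the existing dirty range: cannot merge */
	if (bp->b_dirtyend > 0 &&
	    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff))
		return (0);

	/* contiguous (or no prior dirty range): just extend the bounds */
	if (bp->b_dirtyend > 0) {
		bp->b_dirtyoff = min(on, bp->b_dirtyoff);
		bp->b_dirtyend = max(on + n, bp->b_dirtyend);
	} else {
		bp->b_dirtyoff = on;
		bp->b_dirtyend = on + n;
	}
	return (1);
}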
1100
1101/*
1102 * Get an nfs cache block.
1103 *
1104 * Allocate a new one if the block isn't currently in the cache
1105 * and return the block marked busy. If the calling process is
1106 * interrupted by a signal for an interruptible mount point, return
1107 * NULL.
1108 *
1109 * The caller must carefully deal with the possible B_INVAL state of
1110 * the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
1111 * indirectly), so synchronous reads can be issued without worrying about
1112 * the B_INVAL state. We have to be a little more careful when dealing
1113 * with writes (see comments in nfs_write()) when extending a file past
1114 * its EOF.
1115 */
1116static struct buf *
1117nfs_getcacheblk(struct vnode *vp, daddr_t bn, int size, struct thread *td)
1118{
1119 register struct buf *bp;
1120 struct mount *mp;
1121 struct nfsmount *nmp;
1122
1123 mp = vp->v_mount;
1124 nmp = VFSTONFS(mp);
1125
1126 if (nmp->nm_flag & NFSMNT_INT) {
1127 bp = getblk(vp, bn, size, PCATCH, 0);
1128 while (bp == (struct buf *)0) {
1129 if (nfs_sigintr(nmp, (struct nfsreq *)0, td))
1130 return ((struct buf *)0);
1131 bp = getblk(vp, bn, size, 0, 2 * hz);
1132 }
1133 } else {
1134 bp = getblk(vp, bn, size, 0, 0);
1135 }
1136
1137 if (vp->v_type == VREG) {
1138 int biosize;
1139
1140 biosize = mp->mnt_stat.f_iosize;
1141 bp->b_blkno = bn * (biosize / DEV_BSIZE);
1142 }
1143 return (bp);
1144}
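
/*
 * Illustrative sketch of the nfs_getcacheblk() calling convention noted in
 * its comment above: a NULL return means the sleep was interrupted by a
 * signal on an interruptible (intr) mount, and the caller is expected to
 * bail out with EINTR rather than retry.  Hypothetical caller, shown only
 * to make the contract explicit.
 */
static int
nfs_getcacheblk_caller_sketch(struct vnode *vp, daddr_t lbn, int size,
	struct thread *td)
{
	struct buf *bp;

	bp = nfs_getcacheblk(vp, lbn, size, td);
	if (bp == NULL)
		return (EINTR);	/* interrupted while waiting for the block */
	/* ... fill or read the buffer as the callers above do ... */
	brelse(bp);
	return (0);
}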
1145
1146/*
1147 * Flush and invalidate all dirty buffers. If another process is already
1148 * doing the flush, just wait for completion.
1149 */
1150int
1151nfs_vinvalbuf(struct vnode *vp, int flags, struct ucred *cred,
1152 struct thread *td, int intrflg)
1153{
1154 register struct nfsnode *np = VTONFS(vp);
1155 struct nfsmount *nmp = VFSTONFS(vp->v_mount);
1156 int error = 0, slpflag, slptimeo;
1157
1158 if (vp->v_flag & VXLOCK) {
1159 return (0);
1160 }
1161
1162 if ((nmp->nm_flag & NFSMNT_INT) == 0)
1163 intrflg = 0;
1164 if (intrflg) {
1165 slpflag = PCATCH;
1166 slptimeo = 2 * hz;
1167 } else {
1168 slpflag = 0;
1169 slptimeo = 0;
1170 }
1171 /*
1172 * First wait for any other process doing a flush to complete.
1173 */
1174 while (np->n_flag & NFLUSHINPROG) {
1175 np->n_flag |= NFLUSHWANT;
1176 error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
1177 slptimeo);
1178 if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, td))
1179 return (EINTR);
1180 }
1181
1182 /*
1183 * Now, flush as required.
1184 */
1185 np->n_flag |= NFLUSHINPROG;
1186 error = vinvalbuf(vp, flags, cred, td, slpflag, 0);
1187 while (error) {
1188 if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, td)) {
1189 np->n_flag &= ~NFLUSHINPROG;
1190 if (np->n_flag & NFLUSHWANT) {
1191 np->n_flag &= ~NFLUSHWANT;
1192 wakeup((caddr_t)&np->n_flag);
1193 }
1194 return (EINTR);
1195 }
1196 error = vinvalbuf(vp, flags, cred, td, 0, slptimeo);
1197 }
1198 np->n_flag &= ~(NMODIFIED | NFLUSHINPROG);
1199 if (np->n_flag & NFLUSHWANT) {
1200 np->n_flag &= ~NFLUSHWANT;
1201 wakeup((caddr_t)&np->n_flag);
1202 }
1203 return (0);
1204}
1205
1206/*
1207 * Initiate asynchronous I/O. Return an error if no nfsiods are available.
1208 * This is mainly to avoid queueing async I/O requests when the nfsiods
1209 * are all hung on a dead server.
1210 *
1211 * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
1212 * is eventually dequeued by the async daemon, nfs_doio() *will*.
1213 */
1214int
1215nfs_asyncio(struct buf *bp, struct ucred *cred, struct thread *td)
1216{
1217 struct nfsmount *nmp;
1218 int i;
1219 int gotiod;
1220 int slpflag = 0;
1221 int slptimeo = 0;
1222 int error;
1223
1224 /*
1225 * If no async daemons then return EIO to force caller to run the rpc
1226 * synchronously.
1227 */
1228 if (nfs_numasync == 0)
1229 return (EIO);
1230
1231 nmp = VFSTONFS(bp->b_vp->v_mount);
1232
1233 /*
1234 * Commits are usually short and sweet so let's save some cpu and
1235 * leave the async daemons for more important rpc's (such as reads
1236 * and writes).
1237 */
1238 if ((bp->b_flags & (B_READ|B_NEEDCOMMIT)) == B_NEEDCOMMIT &&
1239 (nmp->nm_bufqiods > nfs_numasync / 2)) {
1240 return(EIO);
1241 }
1242
1243again:
1244 if (nmp->nm_flag & NFSMNT_INT)
1245 slpflag = PCATCH;
1246 gotiod = FALSE;
1247
1248 /*
1249 * Find a free iod to process this request.
1250 */
1251 for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
1252 if (nfs_iodwant[i]) {
1253 /*
1254 * Found one, so wake it up and tell it which
1255 * mount to process.
1256 */
1257 NFS_DPF(ASYNCIO,
1258 ("nfs_asyncio: waking iod %d for mount %p\n",
1259 i, nmp));
1260 nfs_iodwant[i] = NULL;
1261 nfs_iodmount[i] = nmp;
1262 nmp->nm_bufqiods++;
1263 wakeup((caddr_t)&nfs_iodwant[i]);
1264 gotiod = TRUE;
1265 break;
1266 }
1267
1268 /*
1269 * If none are free, we may already have an iod working on this mount
1270 * point. If so, it will process our request.
1271 */
1272 if (!gotiod) {
1273 if (nmp->nm_bufqiods > 0) {
1274 NFS_DPF(ASYNCIO,
1275 ("nfs_asyncio: %d iods are already processing mount %p\n",
1276 nmp->nm_bufqiods, nmp));
1277 gotiod = TRUE;
1278 }
1279 }
1280
1281 /*
1282 * If we have an iod which can process the request, then queue
1283 * the buffer.
1284 */
1285 if (gotiod) {
1286 /*
1287 * Ensure that the queue never grows too large. We still want
1288 * to asynchronize so we block rather than return EIO.
1289 */
1290 while (nmp->nm_bufqlen >= 2*nfs_numasync) {
1291 NFS_DPF(ASYNCIO,
1292 ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
1293 nmp->nm_bufqwant = TRUE;
1294 error = tsleep(&nmp->nm_bufq, slpflag | PRIBIO,
1295 "nfsaio", slptimeo);
1296 if (error) {
1297 if (nfs_sigintr(nmp, NULL, td))
1298 return (EINTR);
1299 if (slpflag == PCATCH) {
1300 slpflag = 0;
1301 slptimeo = 2 * hz;
1302 }
1303 }
1304 /*
1305 * We might have lost our iod while sleeping,
1306 * so check and loop if necessary.
1307 */
1308 if (nmp->nm_bufqiods == 0) {
1309 NFS_DPF(ASYNCIO,
1310 ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
1311 goto again;
1312 }
1313 }
1314
1315 if (bp->b_flags & B_READ) {
1316 if (bp->b_rcred == NOCRED && cred != NOCRED) {
1317 crhold(cred);
1318 bp->b_rcred = cred;
1319 }
1320 } else {
1321 bp->b_flags |= B_WRITEINPROG;
1322 if (bp->b_wcred == NOCRED && cred != NOCRED) {
1323 crhold(cred);
1324 bp->b_wcred = cred;
1325 }
1326 }
1327
1328 BUF_KERNPROC(bp);
1329 TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
1330 nmp->nm_bufqlen++;
1331 return (0);
1332 }
1333
1334 /*
1335 * All the iods are busy on other mounts, so return EIO to
1336 * force the caller to process the i/o synchronously.
1337 */
1338 NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
1339 return (EIO);
1340}
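
/*
 * Illustrative sketch of the caller contract spelled out in the comment
 * above nfs_asyncio(): when no nfsiod can take the buffer it returns EIO,
 * and the caller is expected to fall back to performing the RPC
 * synchronously with nfs_doio().  Hypothetical helper for clarity only.
 */
static int
nfs_start_io_sketch(struct buf *bp, struct ucred *cred, struct thread *td)
{
	if (nfs_asyncio(bp, cred, td) == 0)
		return (0);	/* an nfsiod will pick it up */
	/* no iods available (or all hung on a dead server): do it ourselves */
	return (nfs_doio(bp, cred, td));
}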
1341
1342/*
1343 * Do an I/O operation to/from a cache block. This may be called
1344 * synchronously or from an nfsiod.
1345 *
1346 * NOTE! TD MIGHT BE NULL
1347 */
1348int
1349nfs_doio(struct buf *bp, struct ucred *cr, struct thread *td)
1350{
1351 struct uio *uiop;
1352 struct vnode *vp;
1353 struct nfsnode *np;
1354 struct nfsmount *nmp;
1355 int error = 0, iomode, must_commit = 0;
1356 struct uio uio;
1357 struct iovec io;
1358
1359 vp = bp->b_vp;
1360 np = VTONFS(vp);
1361 nmp = VFSTONFS(vp->v_mount);
1362 uiop = &uio;
1363 uiop->uio_iov = &io;
1364 uiop->uio_iovcnt = 1;
1365 uiop->uio_segflg = UIO_SYSSPACE;
1366 uiop->uio_td = td;
1367
1368 /*
1369 * clear B_ERROR and B_INVAL state prior to initiating the I/O. We
1370 * do this here so we do not have to do it in all the code that
1371 * calls us.
1372 */
1373 bp->b_flags &= ~(B_ERROR | B_INVAL);
1374
1375 KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
1376
1377 /*
1378 * Historically, paging was done with physio, but no more.
1379 */
1380 if (bp->b_flags & B_PHYS) {
1381 /*
1382 * ...though reading /dev/drum still gets us here.
1383 */
1384 io.iov_len = uiop->uio_resid = bp->b_bcount;
1385 /* mapping was done by vmapbuf() */
1386 io.iov_base = bp->b_data;
1387 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1388 if (bp->b_flags & B_READ) {
1389 uiop->uio_rw = UIO_READ;
1390 nfsstats.read_physios++;
1391 error = nfs_readrpc(vp, uiop, cr);
1392 } else {
1393 int com;
1394
1395 iomode = NFSV3WRITE_DATASYNC;
1396 uiop->uio_rw = UIO_WRITE;
1397 nfsstats.write_physios++;
1398 error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
1399 }
1400 if (error) {
1401 bp->b_flags |= B_ERROR;
1402 bp->b_error = error;
1403 }
1404 } else if (bp->b_flags & B_READ) {
1405 io.iov_len = uiop->uio_resid = bp->b_bcount;
1406 io.iov_base = bp->b_data;
1407 uiop->uio_rw = UIO_READ;
1408
1409 switch (vp->v_type) {
1410 case VREG:
1411 uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
1412 nfsstats.read_bios++;
1413 error = nfs_readrpc(vp, uiop, cr);
1414
1415 if (!error) {
1416 if (uiop->uio_resid) {
1417 /*
1418 * If we had a short read with no error, we must have
1419 * hit a file hole. We should zero-fill the remainder.
1420 * This can also occur if the server hits the file EOF.
1421 *
1422 * Holes used to be able to occur due to pending
1423 * writes, but that is not possible any longer.
1424 */
1425 int nread = bp->b_bcount - uiop->uio_resid;
1426 int left = uiop->uio_resid;
1427
1428 if (left > 0)
1429 bzero((char *)bp->b_data + nread, left);
1430 uiop->uio_resid = 0;
1431 }
1432 }
1433 if (td && td->td_proc && (vp->v_flag & VTEXT) &&
1434 (((nmp->nm_flag & NFSMNT_NQNFS) &&
1435 NQNFS_CKINVALID(vp, np, ND_READ) &&
1436 np->n_lrev != np->n_brev) ||
1437 (!(nmp->nm_flag & NFSMNT_NQNFS) &&
1438 np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
1439 uprintf("Process killed due to text file modification\n");
1440 psignal(td->td_proc, SIGKILL);
1441 PHOLD(td->td_proc);
1442 }
1443 break;
1444 case VLNK:
1445 uiop->uio_offset = (off_t)0;
1446 nfsstats.readlink_bios++;
1447 error = nfs_readlinkrpc(vp, uiop, cr);
1448 break;
1449 case VDIR:
1450 nfsstats.readdir_bios++;
1451 uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
1452 if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
1453 error = nfs_readdirplusrpc(vp, uiop, cr);
1454 if (error == NFSERR_NOTSUPP)
1455 nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
1456 }
1457 if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
1458 error = nfs_readdirrpc(vp, uiop, cr);
1459 /*
1460 * end-of-directory sets B_INVAL but does not generate an
1461 * error.
1462 */
1463 if (error == 0 && uiop->uio_resid == bp->b_bcount)
1464 bp->b_flags |= B_INVAL;
1465 break;
1466 default:
1467 printf("nfs_doio: type %x unexpected\n",vp->v_type);
1468 break;
1469 };
1470 if (error) {
1471 bp->b_flags |= B_ERROR;
1472 bp->b_error = error;
1473 }
1474 } else {
1475 /*
1476 * If we only need to commit, try to commit
1477 */
1478 if (bp->b_flags & B_NEEDCOMMIT) {
1479 int retv;
1480 off_t off;
1481
1482 off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
1483 bp->b_flags |= B_WRITEINPROG;
1484 retv = nfs_commit(
1485 bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
1486 bp->b_wcred, td);
1487 bp->b_flags &= ~B_WRITEINPROG;
1488 if (retv == 0) {
1489 bp->b_dirtyoff = bp->b_dirtyend = 0;
1490 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1491 bp->b_resid = 0;
1492 biodone(bp);
1493 return (0);
1494 }
1495 if (retv == NFSERR_STALEWRITEVERF) {
1496 nfs_clearcommit(bp->b_vp->v_mount);
1497 }
1498 }
1499
1500 /*
1501 * Setup for actual write
1502 */
1503
1504 if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
1505 bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
1506
1507 if (bp->b_dirtyend > bp->b_dirtyoff) {
1508 io.iov_len = uiop->uio_resid = bp->b_dirtyend
1509 - bp->b_dirtyoff;
1510 uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
1511 + bp->b_dirtyoff;
1512 io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
1513 uiop->uio_rw = UIO_WRITE;
1514 nfsstats.write_bios++;
1515
1516 if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
1517 iomode = NFSV3WRITE_UNSTABLE;
1518 else
1519 iomode = NFSV3WRITE_FILESYNC;
1520
1521 bp->b_flags |= B_WRITEINPROG;
1522 error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
1523
1524 /*
1525 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
1526 * to cluster the buffers needing commit. This will allow
1527 * the system to submit a single commit rpc for the whole
1528 * cluster. We can do this even if the buffer is not 100%
1529 * dirty (relative to the NFS blocksize), so we optimize the
1530 * append-to-file-case.
1531 *
1532 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
1533 * cleared because write clustering only works for commit
1534 * rpc's, not for the data portion of the write).
1535 */
1536
1537 if (!error && iomode == NFSV3WRITE_UNSTABLE) {
1538 bp->b_flags |= B_NEEDCOMMIT;
1539 if (bp->b_dirtyoff == 0
1540 && bp->b_dirtyend == bp->b_bcount)
1541 bp->b_flags |= B_CLUSTEROK;
1542 } else {
1543 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
1544 }
1545 bp->b_flags &= ~B_WRITEINPROG;
1546
1547 /*
1548 * For an interrupted write, the buffer is still valid
1549 * and the write hasn't been pushed to the server yet,
1550 * so we can't set B_ERROR and report the interruption
1551 * by setting B_EINTR. For the B_ASYNC case, B_EINTR
1552 * is not relevant, so the rpc attempt is essentially
1553 * a noop. For the case of a V3 write rpc not being
1554 * committed to stable storage, the block is still
1555 * dirty and requires either a commit rpc or another
1556 * write rpc with iomode == NFSV3WRITE_FILESYNC before
1557 * the block is reused. This is indicated by setting
1558 * the B_DELWRI and B_NEEDCOMMIT flags.
1559 *
1560 * If the buffer is marked B_PAGING, it does not reside on
1561 * the vp's paging queues so we cannot call bdirty(). The
1562 * bp in this case is not an NFS cache block so we should
1563 * be safe. XXX
1564 */
1565 if (error == EINTR
1566 || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
1567 int s;
1568
1569 s = splbio();
1570 bp->b_flags &= ~(B_INVAL|B_NOCACHE);
1571 if ((bp->b_flags & B_PAGING) == 0) {
1572 bdirty(bp);
1573 bp->b_flags &= ~B_DONE;
1574 }
1575 if (error && (bp->b_flags & B_ASYNC) == 0)
1576 bp->b_flags |= B_EINTR;
1577 splx(s);
1578 } else {
1579 if (error) {
1580 bp->b_flags |= B_ERROR;
1581 bp->b_error = np->n_error = error;
1582 np->n_flag |= NWRITEERR;
1583 }
1584 bp->b_dirtyoff = bp->b_dirtyend = 0;
1585 }
1586 } else {
1587 bp->b_resid = 0;
1588 biodone(bp);
1589 return (0);
1590 }
1591 }
1592 bp->b_resid = uiop->uio_resid;
1593 if (must_commit)
1594 nfs_clearcommit(vp->v_mount);
1595 biodone(bp);
1596 return (error);
1597}
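
/*
 * Illustrative sketch of the commit fast path handled in nfs_doio() above:
 * a buffer whose data already reached the server via an UNSTABLE write
 * (B_NEEDCOMMIT set) only needs a COMMIT RPC.  If the server returns
 * NFSERR_STALEWRITEVERF (e.g. it rebooted and lost the write verifier),
 * nfs_clearcommit() must be called so all such buffers get rewritten.
 * Hypothetical helper; the real logic is inline above.
 */
static int
nfs_commit_sketch(struct buf *bp, struct thread *td)
{
	off_t off;
	int retv;

	off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
	retv = nfs_commit(bp->b_vp, off, bp->b_dirtyend - bp->b_dirtyoff,
	    bp->b_wcred, td);
	if (retv == 0) {
		/* committed: nothing left to write for this buffer */
		bp->b_dirtyoff = bp->b_dirtyend = 0;
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
	} else if (retv == NFSERR_STALEWRITEVERF) {
		nfs_clearcommit(bp->b_vp->v_mount);
	}
	return (retv);
}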
1598
1599/*
1600 * Used to aid in handling ftruncate() operations on the NFS client side.
1601 * Truncation creates a number of special problems for NFS. We have to
1602 * throw away VM pages and buffer cache buffers that are beyond EOF, and
1603 * we have to properly handle VM pages or (potentially dirty) buffers
1604 * that straddle the truncation point.
1605 */
1606
1607int
1608nfs_meta_setsize(struct vnode *vp, struct ucred *cred, struct thread *td, u_quad_t nsize)
1609{
1610 struct nfsnode *np = VTONFS(vp);
1611 u_quad_t tsize = np->n_size;
1612 int biosize = vp->v_mount->mnt_stat.f_iosize;
1613 int error = 0;
1614
1615 np->n_size = nsize;
1616
1617 if (np->n_size < tsize) {
1618 struct buf *bp;
1619 daddr_t lbn;
1620 int bufsize;
1621
1622 /*
1623 * vtruncbuf() doesn't get the buffer overlapping the
1624 * truncation point. We may have a B_DELWRI and/or B_CACHE
1625 * buffer that now needs to be truncated.
1626 */
1627 error = vtruncbuf(vp, cred, td, nsize, biosize);
1628 lbn = nsize / biosize;
1629 bufsize = nsize & (biosize - 1);
1630 bp = nfs_getcacheblk(vp, lbn, bufsize, td);
1631 if (bp->b_dirtyoff > bp->b_bcount)
1632 bp->b_dirtyoff = bp->b_bcount;
1633 if (bp->b_dirtyend > bp->b_bcount)
1634 bp->b_dirtyend = bp->b_bcount;
1635 bp->b_flags |= B_RELBUF; /* don't leave garbage around */
1636 brelse(bp);
1637 } else {
1638 vnode_pager_setsize(vp, nsize);
1639 }
1640 return(error);
1641}
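
/*
 * Illustrative sketch of how a truncating setattr path would be expected
 * to use nfs_meta_setsize() given the comment above: adjust the client's
 * idea of the file size (trimming straddling buffers and VM pages) before
 * asking the server to change the size.  The helper name and the RPC step
 * are hypothetical and shown only to put the function in context.
 */
static int
nfs_truncate_sketch(struct vnode *vp, struct ucred *cred, struct thread *td,
	u_quad_t nsize)
{
	int error;

	/* trim buffers/pages past the new EOF and fix up np->n_size */
	error = nfs_meta_setsize(vp, cred, td, nsize);
	if (error)
		return (error);
	/* ... a real caller would now issue the SETATTR RPC to the server ... */
	return (0);
}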
1642