/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Rick Macklem at The University of Guelph.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)nfs_bio.c	8.9 (Berkeley) 3/30/95
 * $FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_bio.c,v 1.130 2004/04/14 23:23:55 peadar Exp $
 * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.42 2008/01/10 07:34:04 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/buf2.h>
#include <sys/msfbuf.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <sys/thread2.h>

#include "rpcv2.h"
#include "nfsproto.h"
#include "nfs.h"
#include "nfsmount.h"
#include "nfsnode.h"

static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset,
				   int size, struct thread *td);
static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen);

extern int nfs_numasync;
extern int nfs_pbuf_freecnt;
extern struct nfsstats nfsstats;

/*
 * Vnode op for VM getpages.
 *
 * nfs_getpages(struct vnode *a_vp, vm_page_t *a_m, int a_count,
 *		int a_reqpage, vm_ooffset_t a_offset)
 */
int
nfs_getpages(struct vop_getpages_args *ap)
{
	struct thread *td = curthread;		/* XXX */
	int i, error, nextoff, size, toff, count, npages;
	struct uio uio;
	struct iovec iov;
	char *kva;
	struct vnode *vp;
	struct nfsmount *nmp;
	vm_page_t *pages;
	vm_page_t m;
	struct msf_buf *msf;

	vp = ap->a_vp;
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;

	if (vp->v_object == NULL) {
		kprintf("nfs_getpages: called with non-merged cache vnode??\n");
		return VM_PAGER_ERROR;
	}

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, td);

	npages = btoc(count);
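	/*
	 * (Illustrative note: btoc() rounds the byte count up to whole
	 * pages, so with a hypothetical 4K PAGE_SIZE a count of 9000
	 * bytes yields npages == 3.)
	 */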

	/*
	 * NOTE that partially valid pages may occur in cases other
	 * than file EOF, such as when a file is partially written and
	 * ftruncate()-extended to a larger size.  It is also possible
	 * for the valid bits to be set on garbage beyond the file EOF and
	 * clear in the area before EOF (e.g. m->valid == 0xfc), which can
	 * occur due to vtruncbuf() and the buffer cache's handling of
	 * pages which 'straddle' buffers or when b_bufsize is not a
	 * multiple of PAGE_SIZE.  The buffer cache cannot normally
	 * clear the extra bits.  This kind of situation occurs when you
	 * make a small write() (m->valid == 0x03) and then mmap() and
	 * fault in the buffer (m->valid == 0xFF).  When NFS flushes the
	 * buffer (vinvalbuf() m->valid == 0xFC) we are left with a mess.
	 *
	 * This is combined with the possibility that the pages are partially
	 * dirty or that there is a buffer backing the pages that is dirty
	 * (even if m->dirty is 0).
	 *
	 * To solve this problem several hacks have been made:  (1) NFS
	 * guarantees that the I/O block size is a multiple of PAGE_SIZE and
	 * (2) the buffer cache, when invalidating an NFS buffer, will
	 * disregard the buffer's fragmentary b_bufsize and invalidate
	 * the whole page rather than just the piece the buffer owns.
	 *
	 * This allows us to assume that a partially valid page found here
	 * is fully valid (vm_fault will zero out areas of the page not
	 * marked as valid).
	 */
	m = pages[ap->a_reqpage];
	if (m->valid != 0) {
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vnode_pager_freepage(pages[i]);
		}
		return(0);
	}

	/*
	 * Use an MSF_BUF as a medium to retrieve data from the pages.
	 */
	msf_map_pagelist(&msf, pages, npages, 0);
	KKASSERT(msf);
	kva = msf_buf_kva(msf);

	iov.iov_base = kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_READ;
	uio.uio_td = td;

	error = nfs_readrpc(vp, &uio);
	msf_buf_free(msf);

	if (error && (uio.uio_resid == count)) {
		kprintf("nfs_getpages: error %d\n", error);
		for (i = 0; i < npages; ++i) {
			if (i != ap->a_reqpage)
				vnode_pager_freepage(pages[i]);
		}
		return VM_PAGER_ERROR;
	}

	/*
	 * Calculate the number of bytes read and validate only that number
	 * of bytes.  Note that due to pending writes, size may be 0.  This
	 * does not mean that the remaining data is invalid!
	 */

	size = count - uio.uio_resid;
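	/*
	 * (Worked example with hypothetical numbers: for count == 16384,
	 * i.e. four 4K pages, an RPC that returns 10240 bytes leaves
	 * uio.uio_resid == 6144 and size == 10240.  The loop below then
	 * marks pages 0 and 1 fully valid, marks only the first 2048
	 * bytes of page 2 valid, and leaves page 3 with valid == 0.)
	 */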

	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
		nextoff = toff + PAGE_SIZE;
		m = pages[i];

		m->flags &= ~PG_ZERO;

		if (nextoff <= size) {
			/*
			 * Read operation filled an entire page
			 */
			m->valid = VM_PAGE_BITS_ALL;
			vm_page_undirty(m);
		} else if (size > toff) {
			/*
			 * Read operation filled a partial page.
			 */
			m->valid = 0;
			vm_page_set_validclean(m, 0, size - toff);
			/* handled by vm_fault now */
			/* vm_page_zero_invalid(m, TRUE); */
		} else {
			/*
			 * Read operation was short.  If no error occurred
			 * we may have hit a zero-fill section.  We simply
			 * leave valid set to 0.
			 */
			;
		}
		if (i != ap->a_reqpage) {
			/*
			 * Whether or not to leave the page activated is up in
			 * the air, but we should put the page on a page queue
			 * somewhere (it already is in the object).  Empirical
			 * results indicate that deactivating pages is best.
			 */

			/*
			 * Just in case someone was asking for this page we
			 * now tell them that it is ok to use.
			 */
			if (!error) {
				if (m->flags & PG_WANTED)
					vm_page_activate(m);
				else
					vm_page_deactivate(m);
				vm_page_wakeup(m);
			} else {
				vnode_pager_freepage(m);
			}
		}
	}
	return 0;
}

/*
 * Vnode op for VM putpages.
 *
 * nfs_putpages(struct vnode *a_vp, vm_page_t *a_m, int a_count, int a_sync,
 *		int *a_rtvals, vm_ooffset_t a_offset)
 */
int
nfs_putpages(struct vop_putpages_args *ap)
{
	struct thread *td = curthread;
	struct uio uio;
	struct iovec iov;
	char *kva;
	int iomode, must_commit, i, error, npages, count;
	off_t offset;
	int *rtvals;
	struct vnode *vp;
	struct nfsmount *nmp;
	struct nfsnode *np;
	vm_page_t *pages;
	struct msf_buf *msf;

	vp = ap->a_vp;
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	pages = ap->a_m;
	count = ap->a_count;
	rtvals = ap->a_rtvals;
	npages = btoc(count);
	offset = IDX_TO_OFF(pages[0]->pindex);

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, td);

	for (i = 0; i < npages; i++) {
		rtvals[i] = VM_PAGER_AGAIN;
	}

	/*
	 * When putting pages, do not extend the file past EOF.
	 */

	if (offset + count > np->n_size) {
		count = np->n_size - offset;
		if (count < 0)
			count = 0;
	}
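	/*
	 * (Worked example with hypothetical numbers: with np->n_size ==
	 * 10000, offset == 8192 and count == 8192, count is clamped to
	 * 1808 so the write RPC below stops exactly at EOF.)
	 */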

	/*
	 * Use an MSF_BUF as a medium to write data from the pages.
	 */
	msf_map_pagelist(&msf, pages, npages, 0);
	KKASSERT(msf);
	kva = msf_buf_kva(msf);

	iov.iov_base = kva;
	iov.iov_len = count;
	uio.uio_iov = &iov;
	uio.uio_iovcnt = 1;
	uio.uio_offset = offset;
	uio.uio_resid = count;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_rw = UIO_WRITE;
	uio.uio_td = td;

	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
		iomode = NFSV3WRITE_UNSTABLE;
	else
		iomode = NFSV3WRITE_FILESYNC;

	error = nfs_writerpc(vp, &uio, &iomode, &must_commit);

	msf_buf_free(msf);

	if (!error) {
		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
		for (i = 0; i < nwritten; i++) {
			rtvals[i] = VM_PAGER_OK;
			vm_page_undirty(pages[i]);
		}
		if (must_commit)
			nfs_clearcommit(vp->v_mount);
	}
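	/*
	 * (Note: nwritten above rounds the bytes actually written up to
	 * whole pages, e.g. a 4100 byte transfer with a 4K PAGE_SIZE
	 * marks two pages VM_PAGER_OK.)
	 */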
	return rtvals[0];
}

/*
 * Vnode op for read using bio
 */
int
nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct nfsnode *np = VTONFS(vp);
	int biosize, i;
	struct buf *bp = NULL, *rabp;
	struct vattr vattr;
	struct thread *td;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn, rabn;
	off_t raoffset;
	off_t loffset;
	int bcount;
	int seqcount;
	int nra, error = 0, n = 0, on = 0;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("nfs_read mode");
#endif
	if (uio->uio_resid == 0)
		return (0);
	if (uio->uio_offset < 0)	/* XXX VDIR cookies can be negative */
		return (EINVAL);
	td = uio->uio_td;

	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, td);
	if (vp->v_type != VDIR &&
	    (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	biosize = vp->v_mount->mnt_stat.f_iosize;
	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);

	/*
	 * For NFS, cache consistency can only be maintained approximately.
	 * Although RFC1094 does not specify the criteria, the following is
	 * believed to be compatible with the reference port.
	 *
	 * NFS:		If local changes have been made and this is a
	 *		directory, the directory must be invalidated and
	 *		the attribute cache must be cleared.
	 *
	 *		GETATTR is called to synchronize the file size.
	 *
	 *		If remote changes are detected local data is flushed
	 *		and the cache is invalidated.
	 *
	 *		NOTE: In the normal case the attribute cache is not
	 *		cleared which means GETATTR may use cached data and
	 *		not immediately detect changes made on the server.
	 */
	if ((np->n_flag & NLMODIFIED) && vp->v_type == VDIR) {
		nfs_invaldir(vp);
		error = nfs_vinvalbuf(vp, V_SAVE, 1);
		if (error)
			return (error);
		np->n_attrstamp = 0;
	}
	error = VOP_GETATTR(vp, &vattr);
	if (error)
		return (error);
	if (np->n_flag & NRMODIFIED) {
		if (vp->v_type == VDIR)
			nfs_invaldir(vp);
		error = nfs_vinvalbuf(vp, V_SAVE, 1);
		if (error)
			return (error);
		np->n_flag &= ~NRMODIFIED;
	}
	do {
		if (np->n_flag & NDONTCACHE) {
			switch (vp->v_type) {
			case VREG:
				return (nfs_readrpc(vp, uio));
			case VLNK:
				return (nfs_readlinkrpc(vp, uio));
			case VDIR:
				break;
			default:
				kprintf(" NDONTCACHE: type %x unexpected\n", vp->v_type);
				break;
			}
		}
		switch (vp->v_type) {
		case VREG:
			nfsstats.biocache_reads++;
			lbn = uio->uio_offset / biosize;
			on = uio->uio_offset & (biosize - 1);
			loffset = (off_t)lbn * biosize;
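			/*
			 * (Worked example with a hypothetical biosize of
			 * 8192: uio_offset == 20480 gives lbn == 2,
			 * on == 4096 and loffset == 16384.)
			 */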

			/*
			 * Start the read ahead(s), as required.
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
				for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
				    (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
					rabn = lbn + 1 + nra;
					raoffset = (off_t)rabn * biosize;
					if (!findblk(vp, raoffset)) {
						rabp = nfs_getcacheblk(vp, raoffset, biosize, td);
						if (!rabp)
							return (EINTR);
						if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
							rabp->b_flags |= B_ASYNC;
							rabp->b_cmd = BUF_CMD_READ;
							vfs_busy_pages(vp, rabp);
							if (nfs_asyncio(vp, &rabp->b_bio2, td)) {
								rabp->b_flags |= B_INVAL|B_ERROR;
								vfs_unbusy_pages(rabp);
								brelse(rabp);
								break;
							}
						} else {
							brelse(rabp);
						}
					}
				}
			}

			/*
			 * Obtain the buffer cache block.  Figure out the buffer size
			 * when we are at EOF.  If we are modifying the size of the
			 * buffer based on an EOF condition we need to hold
			 * nfs_rslock() through obtaining the buffer to prevent
			 * a potential writer-appender from messing with n_size.
			 * Otherwise we may accidentally truncate the buffer and
			 * lose dirty data.
			 *
			 * Note that bcount is *not* DEV_BSIZE aligned.
			 */

again:
			bcount = biosize;
			if (loffset >= np->n_size) {
				bcount = 0;
			} else if (loffset + biosize > np->n_size) {
				bcount = np->n_size - loffset;
			}
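			/*
			 * (Worked example with hypothetical numbers: with
			 * np->n_size == 20000 and biosize == 8192, a block
			 * at loffset == 16384 gets bcount == 3616; a full
			 * sized block would extend past EOF.)
			 */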
			if (bcount != biosize) {
				switch (nfs_rslock(np)) {
				case ENOLCK:
					goto again;
					/* not reached */
				case EINTR:
				case ERESTART:
					return(EINTR);
					/* not reached */
				default:
					break;
				}
			}

			bp = nfs_getcacheblk(vp, loffset, bcount, td);

			if (bcount != biosize)
				nfs_rsunlock(np);
			if (!bp)
				return (EINTR);

			/*
			 * If B_CACHE is not set, we must issue the read.  If this
			 * fails, we return an error.
			 */

			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_cmd = BUF_CMD_READ;
				vfs_busy_pages(vp, bp);
				error = nfs_doio(vp, &bp->b_bio2, td);
				if (error) {
					brelse(bp);
					return (error);
				}
			}

			/*
			 * on is the offset into the current bp.  Figure out how many
			 * bytes we can copy out of the bp.  Note that bcount is
			 * NOT DEV_BSIZE aligned.
			 *
			 * Then figure out how many bytes we can copy into the uio.
			 */

			n = 0;
			if (on < bcount)
				n = min((unsigned)(bcount - on), uio->uio_resid);
			break;
		case VLNK:
			biosize = min(NFS_MAXPATHLEN, np->n_size);
			nfsstats.biocache_readlinks++;
			bp = nfs_getcacheblk(vp, (off_t)0, biosize, td);
			if (bp == NULL)
				return (EINTR);
			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_cmd = BUF_CMD_READ;
				vfs_busy_pages(vp, bp);
				error = nfs_doio(vp, &bp->b_bio2, td);
				if (error) {
					bp->b_flags |= B_ERROR;
					brelse(bp);
					return (error);
				}
			}
			n = min(uio->uio_resid, bp->b_bcount - bp->b_resid);
			on = 0;
			break;
		case VDIR:
			nfsstats.biocache_readdirs++;
			if (np->n_direofoffset &&
			    uio->uio_offset >= np->n_direofoffset) {
				return (0);
			}
			lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
			on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
			loffset = uio->uio_offset - on;
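			/*
			 * (Worked example, pretending NFS_DIRBLKSIZ were
			 * 4096: uio_offset == 9000 gives on == 808 and
			 * loffset == 8192, the start of the directory
			 * block containing the caller's cookie.)
			 */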
			bp = nfs_getcacheblk(vp, loffset, NFS_DIRBLKSIZ, td);
			if (bp == NULL)
				return (EINTR);

			if ((bp->b_flags & B_CACHE) == 0) {
				bp->b_cmd = BUF_CMD_READ;
				vfs_busy_pages(vp, bp);
				error = nfs_doio(vp, &bp->b_bio2, td);
				if (error) {
					brelse(bp);
				}
				while (error == NFSERR_BAD_COOKIE) {
					kprintf("got bad cookie vp %p bp %p\n", vp, bp);
					nfs_invaldir(vp);
					error = nfs_vinvalbuf(vp, 0, 1);
					/*
					 * Yuck! The directory has been modified on the
					 * server.  The only way to get the block is by
					 * reading from the beginning to get all the
					 * offset cookies.
					 *
					 * Leave the last bp intact unless there is an error.
					 * Loop back up to the while if the error is another
					 * NFSERR_BAD_COOKIE (double yuck!).
					 */
					for (i = 0; i <= lbn && !error; i++) {
						if (np->n_direofoffset &&
						    (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
							return (0);
						bp = nfs_getcacheblk(vp, (off_t)i * NFS_DIRBLKSIZ,
								     NFS_DIRBLKSIZ, td);
						if (!bp)
							return (EINTR);
						if ((bp->b_flags & B_CACHE) == 0) {
							bp->b_cmd = BUF_CMD_READ;
							vfs_busy_pages(vp, bp);
							error = nfs_doio(vp, &bp->b_bio2, td);
							/*
							 * no error + B_INVAL == directory EOF,
							 * use the block.
							 */
							if (error == 0 && (bp->b_flags & B_INVAL))
								break;
						}
						/*
						 * An error will throw away the block and the
						 * for loop will break out.  If no error and this
						 * is not the block we want, we throw away the
						 * block and go for the next one via the for loop.
						 */
						if (error || i < lbn)
							brelse(bp);
					}
				}
				/*
				 * The above while is repeated if we hit another cookie
				 * error.  If we hit an error and it wasn't a cookie error,
				 * we give up.
				 */
				if (error)
					return (error);
			}

			/*
			 * If not eof and read aheads are enabled, start one.
			 * (You need the current block first, so that you have the
			 * directory offset cookie of the next block.)
			 */
			if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
			    (bp->b_flags & B_INVAL) == 0 &&
			    (np->n_direofoffset == 0 ||
			     loffset + NFS_DIRBLKSIZ < np->n_direofoffset) &&
			    (np->n_flag & NDONTCACHE) == 0 &&
			    !findblk(vp, loffset + NFS_DIRBLKSIZ)) {
				rabp = nfs_getcacheblk(vp, loffset + NFS_DIRBLKSIZ,
						       NFS_DIRBLKSIZ, td);
				if (rabp) {
					if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) {
						rabp->b_flags |= B_ASYNC;
						rabp->b_cmd = BUF_CMD_READ;
						vfs_busy_pages(vp, rabp);
						if (nfs_asyncio(vp, &rabp->b_bio2, td)) {
							rabp->b_flags |= B_INVAL|B_ERROR;
							vfs_unbusy_pages(rabp);
							brelse(rabp);
						}
					} else {
						brelse(rabp);
					}
				}
			}
			/*
			 * Unlike VREG files, whose buffer size (bp->b_bcount) is
			 * chopped for the EOF condition, we cannot tell how large
			 * NFS directories are going to be until we hit EOF.  So
			 * an NFS directory buffer is *not* chopped to its EOF.  Now,
			 * it just so happens that b_resid will effectively chop it
			 * to EOF.  *BUT* this information is lost if the buffer goes
			 * away and is reconstituted into a B_CACHE state (due to
			 * being VMIO) later.  So we keep track of the directory eof
			 * in np->n_direofoffset and chop it off as an extra step
			 * right here.
			 */
			n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
			if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
				n = np->n_direofoffset - uio->uio_offset;
			break;
		default:
			kprintf(" nfs_bioread: type %x unexpected\n", vp->v_type);
			break;
		}

		switch (vp->v_type) {
		case VREG:
			if (n > 0)
				error = uiomove(bp->b_data + on, (int)n, uio);
			break;
		case VLNK:
			if (n > 0)
				error = uiomove(bp->b_data + on, (int)n, uio);
			n = 0;
			break;
		case VDIR:
			if (n > 0) {
				off_t old_off = uio->uio_offset;
				caddr_t cpos, epos;
				struct nfs_dirent *dp;

				/*
				 * We are casting cpos to nfs_dirent, so it must
				 * be int-aligned.
				 */
				if (on & 3) {
					error = EINVAL;
					break;
				}

				cpos = bp->b_data + on;
				epos = bp->b_data + on + n;
				while (cpos < epos && error == 0 && uio->uio_resid > 0) {
					dp = (struct nfs_dirent *)cpos;
					error = nfs_check_dirent(dp, (int)(epos - cpos));
					if (error)
						break;
					if (vop_write_dirent(&error, uio, dp->nfs_ino,
					    dp->nfs_type, dp->nfs_namlen, dp->nfs_name)) {
						break;
					}
					cpos += dp->nfs_reclen;
				}
				n = 0;
				if (error == 0)
					uio->uio_offset = old_off + cpos - bp->b_data - on;
			}
			/*
			 * Invalidate buffer if caching is disabled, forcing a
			 * re-read from the remote later.
			 */
			if (np->n_flag & NDONTCACHE)
				bp->b_flags |= B_INVAL;
			break;
		default:
			kprintf(" nfs_bioread: type %x unexpected\n", vp->v_type);
		}
		brelse(bp);
	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	return (error);
}

/*
 * Userland can supply any 'seek' offset when reading an NFS directory.
 * Validate the structure so we don't panic the kernel.  Note that
 * the element name is nul terminated and the nul is not included
 * in nfs_namlen.
 */
static
int
nfs_check_dirent(struct nfs_dirent *dp, int maxlen)
{
	int nfs_name_off = offsetof(struct nfs_dirent, nfs_name[0]);

	if (nfs_name_off >= maxlen)
		return (EINVAL);
	if (dp->nfs_reclen < nfs_name_off || dp->nfs_reclen > maxlen)
		return (EINVAL);
	if (nfs_name_off + dp->nfs_namlen >= dp->nfs_reclen)
		return (EINVAL);
	if (dp->nfs_reclen & 3)
		return (EINVAL);
	return (0);
}
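
/*
 * (Worked example with hypothetical numbers: if nfs_name, and thus
 * nfs_name_off, begins at byte 16, a record with nfs_reclen == 24 can
 * hold a name of at most 7 bytes plus the terminating nul; nfs_namlen
 * == 8 would fail the third check above.)
 */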

/*
 * Vnode op for write using bio
 *
 * nfs_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
 *	     struct ucred *a_cred)
 */
int
nfs_write(struct vop_write_args *ap)
{
	struct uio *uio = ap->a_uio;
	struct thread *td = uio->uio_td;
	struct vnode *vp = ap->a_vp;
	struct nfsnode *np = VTONFS(vp);
	int ioflag = ap->a_ioflag;
	struct buf *bp;
	struct vattr vattr;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	daddr_t lbn;
	off_t loffset;
	int n, on, error = 0, iomode, must_commit;
	int haverslock = 0;
	int bcount;
	int biosize;

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("nfs_write mode");
	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread)
		panic("nfs_write proc");
#endif
	if (vp->v_type != VREG)
		return (EIO);
	if (np->n_flag & NWRITEERR) {
		np->n_flag &= ~NWRITEERR;
		return (np->n_error);
	}
	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	    (nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
		(void)nfs_fsinfo(nmp, vp, td);

	/*
	 * Synchronously flush pending buffers if we are in synchronous
	 * mode or if we are appending.
	 */
	if (ioflag & (IO_APPEND | IO_SYNC)) {
		if (np->n_flag & NLMODIFIED) {
			np->n_attrstamp = 0;
			error = nfs_flush(vp, MNT_WAIT, td, 0);
			/* error = nfs_vinvalbuf(vp, V_SAVE, 1); */
			if (error)
				return (error);
		}
	}

	/*
	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
	 * get the append lock.
	 */
restart:
	if (ioflag & IO_APPEND) {
		np->n_attrstamp = 0;
		error = VOP_GETATTR(vp, &vattr);
		if (error)
			return (error);
		uio->uio_offset = np->n_size;
	}

	if (uio->uio_offset < 0)
		return (EINVAL);
	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
		return (EFBIG);
	if (uio->uio_resid == 0)
		return (0);

	/*
	 * We need to obtain the rslock if we intend to modify np->n_size
	 * in order to guarantee the append point with multiple contending
	 * writers, to guarantee that no other appenders modify n_size
	 * while we are trying to obtain a truncated buffer (i.e. to avoid
	 * accidentally truncating data written by another appender due to
	 * the race), and to ensure that the buffer is populated prior to
	 * our extending of the file.  We hold rslock through the entire
	 * operation.
	 *
	 * Note that we do not synchronize the case where someone truncates
	 * the file while we are appending to it because attempting to lock
	 * this case may deadlock other parts of the system unexpectedly.
	 */
	if ((ioflag & IO_APPEND) ||
	    uio->uio_offset + uio->uio_resid > np->n_size) {
		switch (nfs_rslock(np)) {
		case ENOLCK:
			goto restart;
			/* not reached */
		case EINTR:
		case ERESTART:
			return(EINTR);
			/* not reached */
		default:
			break;
		}
		haverslock = 1;
	}

	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (td->td_proc && uio->uio_offset + uio->uio_resid >
	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
		if (haverslock)
			nfs_rsunlock(np);
		return (EFBIG);
	}

	biosize = vp->v_mount->mnt_stat.f_iosize;

	do {
		if ((np->n_flag & NDONTCACHE) && uio->uio_iovcnt == 1) {
			iomode = NFSV3WRITE_FILESYNC;
			error = nfs_writerpc(vp, uio, &iomode, &must_commit);
			if (must_commit)
				nfs_clearcommit(vp->v_mount);
			break;
		}
		nfsstats.biocache_writes++;
		lbn = uio->uio_offset / biosize;
		on = uio->uio_offset & (biosize - 1);
		loffset = uio->uio_offset - on;
		n = min((unsigned)(biosize - on), uio->uio_resid);
again:
		/*
		 * Handle direct append and file extension cases, calculate
		 * unaligned buffer size.
		 */

		if (uio->uio_offset == np->n_size && n) {
			/*
			 * Get the buffer (in its pre-append state to maintain
			 * B_CACHE if it was previously set).  Resize the
			 * nfsnode after we have locked the buffer to prevent
			 * readers from reading garbage.
			 */
			bcount = on;
			bp = nfs_getcacheblk(vp, loffset, bcount, td);

			if (bp != NULL) {
				long save;

				np->n_size = uio->uio_offset + n;
				np->n_flag |= NLMODIFIED;
				vnode_pager_setsize(vp, np->n_size);

				save = bp->b_flags & B_CACHE;
				bcount += n;
				allocbuf(bp, bcount);
				bp->b_flags |= save;
			}
		} else {
			/*
			 * Obtain the locked cache block first, and then
			 * adjust the file's size as appropriate.
			 */
			bcount = on + n;
			if (loffset + bcount < np->n_size) {
				if (loffset + biosize < np->n_size)
					bcount = biosize;
				else
					bcount = np->n_size - loffset;
			}
			bp = nfs_getcacheblk(vp, loffset, bcount, td);
			if (uio->uio_offset + n > np->n_size) {
				np->n_size = uio->uio_offset + n;
				np->n_flag |= NLMODIFIED;
				vnode_pager_setsize(vp, np->n_size);
			}
		}
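		/*
		 * (Worked example with a hypothetical biosize of 8192: an
		 * append of n == 1000 bytes at uio_offset == n_size == 6000
		 * takes the first branch above: bcount starts at on == 6000
		 * and allocbuf() grows the buffer to 7000 once n_size has
		 * been updated.  The same write into the middle of a 20000
		 * byte file takes the second branch and uses a full biosize
		 * buffer.)
		 */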

		if (bp == NULL) {
			error = EINTR;
			break;
		}

		/*
		 * Issue a READ if B_CACHE is not set.  In special-append
		 * mode, B_CACHE is based on the buffer prior to the write
		 * op and is typically set, avoiding the read.  If a read
		 * is required in special append mode, the server will
		 * probably send us a short-read since we extended the file
		 * on our end, resulting in b_resid == 0 and, thus,
		 * B_CACHE getting set.
		 *
		 * We can also avoid issuing the read if the write covers
		 * the entire buffer.  We have to make sure the buffer state
		 * is reasonable in this case since we will not be initiating
		 * I/O.  See the comments in kern/vfs_bio.c's getblk() for
		 * more information.
		 *
		 * B_CACHE may also be set due to the buffer being cached
		 * normally.
		 *
		 * When doing a UIO_NOCOPY write the buffer is not
		 * overwritten and we cannot just set B_CACHE unconditionally
		 * for full-block writes.
		 */

		if (on == 0 && n == bcount && uio->uio_segflg != UIO_NOCOPY) {
			bp->b_flags |= B_CACHE;
			bp->b_flags &= ~(B_ERROR | B_INVAL);
		}

		if ((bp->b_flags & B_CACHE) == 0) {
			bp->b_cmd = BUF_CMD_READ;
			vfs_busy_pages(vp, bp);
			error = nfs_doio(vp, &bp->b_bio2, td);
			if (error) {
				brelse(bp);
				break;
			}
		}
		if (!bp) {
			error = EINTR;
			break;
		}
		np->n_flag |= NLMODIFIED;

		/*
		 * If dirtyend exceeds file size, chop it down.  This should
		 * not normally occur but there is an append race where it
		 * might occur XXX, so we log it.
		 *
		 * If the chopping creates a reverse-indexed or degenerate
		 * situation with dirtyoff/end, we 0 both of them.
		 */

		if (bp->b_dirtyend > bcount) {
			kprintf("NFS append race @%08llx:%d\n",
			    (long long)bp->b_bio2.bio_offset,
			    bp->b_dirtyend - bcount);
			bp->b_dirtyend = bcount;
		}

		if (bp->b_dirtyoff >= bp->b_dirtyend)
			bp->b_dirtyoff = bp->b_dirtyend = 0;

		/*
		 * If the new write will leave a contiguous dirty
		 * area, just update the b_dirtyoff and b_dirtyend,
		 * otherwise force a write rpc of the old dirty area.
		 *
		 * While it is possible to merge discontiguous writes due to
		 * our having a B_CACHE buffer (and thus valid read data
		 * for the hole), we don't because it could lead to
		 * significant cache coherency problems with multiple clients,
		 * especially if locking is implemented later on.
		 *
		 * As an optimization we could theoretically maintain
		 * a linked list of discontinuous areas, but we would still
		 * have to commit them separately so there isn't much
		 * advantage to it except perhaps a bit of asynchronization.
		 */

		if (bp->b_dirtyend > 0 &&
		    (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) {
			if (bwrite(bp) == EINTR) {
				error = EINTR;
				break;
			}
			goto again;
		}
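		/*
		 * (Example: if the buffer already has dirty range
		 * [2048, 4096) and the new write starts at on == 8192,
		 * then on > b_dirtyend, so the old dirty area is pushed
		 * out with bwrite() above before the write is merged.)
		 */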

		error = uiomove((char *)bp->b_data + on, n, uio);

		/*
		 * Since this block is being modified, it must be written
		 * again and not just committed.  Since write clustering does
		 * not work for the stage 1 data write, only the stage 2
		 * commit rpc, we have to clear B_CLUSTEROK as well.
		 */
		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);

		if (error) {
			bp->b_flags |= B_ERROR;
			brelse(bp);
			break;
		}

		/*
		 * Only update dirtyoff/dirtyend if not a degenerate
		 * condition.
		 */
		if (n) {
			if (bp->b_dirtyend > 0) {
				bp->b_dirtyoff = min(on, bp->b_dirtyoff);
				bp->b_dirtyend = max((on + n), bp->b_dirtyend);
			} else {
				bp->b_dirtyoff = on;
				bp->b_dirtyend = on + n;
			}
			vfs_bio_set_validclean(bp, on, n);
		}
		/*
		 * If IO_NOWDRAIN then set B_NOWDRAIN (e.g. nfs-backed VN
		 * filesystem).  XXX also use for loopback NFS mounts.
		 */
		if (ioflag & IO_NOWDRAIN)
			bp->b_flags |= B_NOWDRAIN;

		/*
		 * If the lease is non-cacheable or IO_SYNC do bwrite().
		 *
		 * IO_INVAL appears to be unused.  The idea appears to be
		 * to turn off caching in this case.  Very odd.  XXX
		 */
		if ((np->n_flag & NDONTCACHE) || (ioflag & IO_SYNC)) {
			if (ioflag & IO_INVAL)
				bp->b_flags |= B_NOCACHE;
			error = bwrite(bp);
			if (error)
				break;
			if (np->n_flag & NDONTCACHE) {
				error = nfs_vinvalbuf(vp, V_SAVE, 1);
				if (error)
					break;
			}
		} else if ((n + on) == biosize) {
			bp->b_flags |= B_ASYNC;
			bwrite(bp);
		} else {
			bdwrite(bp);
		}
	} while (uio->uio_resid > 0 && n > 0);

	if (haverslock)
		nfs_rsunlock(np);

	return (error);
}

/*
 * Get an nfs cache block.
 *
 * Allocate a new one if the block isn't currently in the cache
 * and return the block marked busy.  If the calling process is
 * interrupted by a signal for an interruptible mount point, return
 * NULL.
 *
 * The caller must carefully deal with the possible B_INVAL state of
 * the buffer.  nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
 * indirectly), so synchronous reads can be issued without worrying about
 * the B_INVAL state.  We have to be a little more careful when dealing
 * with writes (see comments in nfs_write()) when extending a file past
 * its EOF.
 */
static struct buf *
nfs_getcacheblk(struct vnode *vp, off_t loffset, int size, struct thread *td)
{
	struct buf *bp;
	struct mount *mp;
	struct nfsmount *nmp;

	mp = vp->v_mount;
	nmp = VFSTONFS(mp);

	if (nmp->nm_flag & NFSMNT_INT) {
		bp = getblk(vp, loffset, size, GETBLK_PCATCH, 0);
		while (bp == NULL) {
			if (nfs_sigintr(nmp, (struct nfsreq *)0, td))
				return (NULL);
			bp = getblk(vp, loffset, size, 0, 2 * hz);
		}
	} else {
		bp = getblk(vp, loffset, size, 0, 0);
	}

	/*
	 * bio2, the 'device' layer.  Since BIOs use 64 bit byte offsets
	 * now, no translation is necessary.
	 */
	bp->b_bio2.bio_offset = loffset;
	return (bp);
}

/*
 * Flush and invalidate all dirty buffers.  If another process is already
 * doing the flush, just wait for completion.
 */
int
nfs_vinvalbuf(struct vnode *vp, int flags, int intrflg)
{
	struct nfsnode *np = VTONFS(vp);
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int error = 0, slpflag, slptimeo;
	thread_t td = curthread;

	if (vp->v_flag & VRECLAIMED)
		return (0);

	if ((nmp->nm_flag & NFSMNT_INT) == 0)
		intrflg = 0;
	if (intrflg) {
		slpflag = PCATCH;
		slptimeo = 2 * hz;
	} else {
		slpflag = 0;
		slptimeo = 0;
	}
	/*
	 * First wait for any other process doing a flush to complete.
	 */
	while (np->n_flag & NFLUSHINPROG) {
		np->n_flag |= NFLUSHWANT;
		error = tsleep((caddr_t)&np->n_flag, 0, "nfsvinval", slptimeo);
		if (error && intrflg && nfs_sigintr(nmp, NULL, td))
			return (EINTR);
	}

	/*
	 * Now, flush as required.
	 */
	np->n_flag |= NFLUSHINPROG;
	error = vinvalbuf(vp, flags, slpflag, 0);
	while (error) {
		if (intrflg && nfs_sigintr(nmp, NULL, td)) {
			np->n_flag &= ~NFLUSHINPROG;
			if (np->n_flag & NFLUSHWANT) {
				np->n_flag &= ~NFLUSHWANT;
				wakeup((caddr_t)&np->n_flag);
			}
			return (EINTR);
		}
		error = vinvalbuf(vp, flags, 0, slptimeo);
	}
	np->n_flag &= ~(NLMODIFIED | NFLUSHINPROG);
	if (np->n_flag & NFLUSHWANT) {
		np->n_flag &= ~NFLUSHWANT;
		wakeup((caddr_t)&np->n_flag);
	}
	return (0);
}

/*
 * Initiate asynchronous I/O.  Return an error if no nfsiods are available.
 * This is mainly to avoid queueing async I/O requests when the nfsiods
 * are all hung on a dead server.
 *
 * Note: nfs_asyncio() does not clear (B_ERROR|B_INVAL) but when the bp
 * is eventually dequeued by the async daemon, nfs_doio() *will*.
 */
int
nfs_asyncio(struct vnode *vp, struct bio *bio, struct thread *td)
{
	struct buf *bp = bio->bio_buf;
	struct nfsmount *nmp;
	int i;
	int gotiod;
	int slpflag = 0;
	int slptimeo = 0;
	int error;

	/*
	 * If no async daemons then return EIO to force caller to run the rpc
	 * synchronously.
	 */
	if (nfs_numasync == 0)
		return (EIO);

	KKASSERT(vp->v_tag == VT_NFS);
	nmp = VFSTONFS(vp->v_mount);

	/*
	 * Commits are usually short and sweet so let's save some CPU and
	 * leave the async daemons for more important rpc's (such as reads
	 * and writes).
	 */
	if (bp->b_cmd == BUF_CMD_WRITE && (bp->b_flags & B_NEEDCOMMIT) &&
	    (nmp->nm_bioqiods > nfs_numasync / 2)) {
		return(EIO);
	}

again:
	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	gotiod = FALSE;

	/*
	 * Find a free iod to process this request.
	 */
	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
		if (nfs_iodwant[i]) {
			/*
			 * Found one, so wake it up and tell it which
			 * mount to process.
			 */
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waking iod %d for mount %p\n",
				 i, nmp));
			nfs_iodwant[i] = NULL;
			nfs_iodmount[i] = nmp;
			nmp->nm_bioqiods++;
			wakeup((caddr_t)&nfs_iodwant[i]);
			gotiod = TRUE;
			break;
		}

	/*
	 * If none are free, we may already have an iod working on this mount
	 * point.  If so, it will process our request.
	 */
	if (!gotiod) {
		if (nmp->nm_bioqiods > 0) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: %d iods are already processing mount %p\n",
				 nmp->nm_bioqiods, nmp));
			gotiod = TRUE;
		}
	}

	/*
	 * If we have an iod which can process the request, then queue
	 * the buffer.
	 */
	if (gotiod) {
		/*
		 * Ensure that the queue never grows too large.  We still want
		 * to asynchronize so we block rather than return EIO.
		 */
		while (nmp->nm_bioqlen >= 2 * nfs_numasync) {
			NFS_DPF(ASYNCIO,
				("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
			nmp->nm_bioqwant = TRUE;
			error = tsleep(&nmp->nm_bioq, slpflag,
				       "nfsaio", slptimeo);
			if (error) {
				if (nfs_sigintr(nmp, NULL, td))
					return (EINTR);
				if (slpflag == PCATCH) {
					slpflag = 0;
					slptimeo = 2 * hz;
				}
			}
			/*
			 * We might have lost our iod while sleeping,
			 * so check and loop if necessary.
			 */
			if (nmp->nm_bioqiods == 0) {
				NFS_DPF(ASYNCIO,
					("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
				goto again;
			}
		}
		BUF_KERNPROC(bp);

		/*
		 * The passed bio's buffer is not necessarily associated with
		 * the NFS vnode it is being written to.  Store the NFS vnode
		 * in the BIO driver info.
		 */
		bio->bio_driver_info = vp;
		TAILQ_INSERT_TAIL(&nmp->nm_bioq, bio, bio_act);
		nmp->nm_bioqlen++;
		return (0);
	}

	/*
	 * All the iods are busy on other mounts, so return EIO to
	 * force the caller to process the i/o synchronously.
	 */
	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	return (EIO);
}

/*
 * Do an I/O operation to/from a cache block.  This may be called
 * synchronously or from an nfsiod.  The BIO is normalized for DEV_BSIZE.
 *
 * NOTE! TD MIGHT BE NULL
 */
int
nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
{
	struct buf *bp = bio->bio_buf;
	struct uio *uiop;
	struct nfsnode *np;
	struct nfsmount *nmp;
	int error = 0, iomode, must_commit = 0;
	struct uio uio;
	struct iovec io;

	KKASSERT(vp->v_tag == VT_NFS);
	np = VTONFS(vp);
	nmp = VFSTONFS(vp->v_mount);
	uiop = &uio;
	uiop->uio_iov = &io;
	uiop->uio_iovcnt = 1;
	uiop->uio_segflg = UIO_SYSSPACE;
	uiop->uio_td = td;

	/*
	 * Clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
	 * do this here so we do not have to do it in all the code that
	 * calls us.
	 */
	bp->b_flags &= ~(B_ERROR | B_INVAL);

	KASSERT(bp->b_cmd != BUF_CMD_DONE,
		("nfs_doio: bp %p already marked done!", bp));

	if (bp->b_cmd == BUF_CMD_READ) {
		io.iov_len = uiop->uio_resid = bp->b_bcount;
		io.iov_base = bp->b_data;
		uiop->uio_rw = UIO_READ;

		switch (vp->v_type) {
		case VREG:
			uiop->uio_offset = bio->bio_offset;
			nfsstats.read_bios++;
			error = nfs_readrpc(vp, uiop);

			if (!error) {
				if (uiop->uio_resid) {
					/*
					 * If we had a short read with no error, we must have
					 * hit a file hole.  We should zero-fill the remainder.
					 * This can also occur if the server hits the file EOF.
					 *
					 * Holes used to be able to occur due to pending
					 * writes, but that is not possible any longer.
					 */
					int nread = bp->b_bcount - uiop->uio_resid;
					int left = uiop->uio_resid;

					if (left > 0)
						bzero((char *)bp->b_data + nread, left);
					uiop->uio_resid = 0;
				}
			}
			if (td && td->td_proc && (vp->v_flag & VTEXT) &&
			    np->n_mtime != np->n_vattr.va_mtime.tv_sec) {
				uprintf("Process killed due to text file modification\n");
				ksignal(td->td_proc, SIGKILL);
			}
			break;
		case VLNK:
			uiop->uio_offset = 0;
			nfsstats.readlink_bios++;
			error = nfs_readlinkrpc(vp, uiop);
			break;
		case VDIR:
			nfsstats.readdir_bios++;
			uiop->uio_offset = bio->bio_offset;
			if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
				error = nfs_readdirplusrpc(vp, uiop);
				if (error == NFSERR_NOTSUPP)
					nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
			}
			if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
				error = nfs_readdirrpc(vp, uiop);
			/*
			 * end-of-directory sets B_INVAL but does not generate an
			 * error.
			 */
			if (error == 0 && uiop->uio_resid == bp->b_bcount)
				bp->b_flags |= B_INVAL;
			break;
		default:
			kprintf("nfs_doio: type %x unexpected\n", vp->v_type);
			break;
		}
		if (error) {
			bp->b_flags |= B_ERROR;
			bp->b_error = error;
		}
	} else {
		/*
		 * If we only need to commit, try to commit.
		 */
		KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
		if (bp->b_flags & B_NEEDCOMMIT) {
			int retv;
			off_t off;

			off = bio->bio_offset + bp->b_dirtyoff;
			retv = nfs_commit(vp, off,
					  bp->b_dirtyend - bp->b_dirtyoff, td);
			if (retv == 0) {
				bp->b_dirtyoff = bp->b_dirtyend = 0;
				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
				bp->b_resid = 0;
				biodone(bio);
				return (0);
			}
			if (retv == NFSERR_STALEWRITEVERF) {
				nfs_clearcommit(vp->v_mount);
			}
		}

		/*
		 * Setup for actual write
		 */
		if (bio->bio_offset + bp->b_dirtyend > np->n_size)
			bp->b_dirtyend = np->n_size - bio->bio_offset;

		if (bp->b_dirtyend > bp->b_dirtyoff) {
			io.iov_len = uiop->uio_resid = bp->b_dirtyend
			    - bp->b_dirtyoff;
			uiop->uio_offset = bio->bio_offset + bp->b_dirtyoff;
			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
			uiop->uio_rw = UIO_WRITE;
			nfsstats.write_bios++;

			if ((bp->b_flags & (B_ASYNC | B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == B_ASYNC)
				iomode = NFSV3WRITE_UNSTABLE;
			else
				iomode = NFSV3WRITE_FILESYNC;

			error = nfs_writerpc(vp, uiop, &iomode, &must_commit);

			/*
			 * When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
			 * to cluster the buffers needing commit.  This will allow
			 * the system to submit a single commit rpc for the whole
			 * cluster.  We can do this even if the buffer is not 100%
			 * dirty (relative to the NFS blocksize), so we optimize the
			 * append-to-file case.
			 *
			 * (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
			 * cleared because write clustering only works for commit
			 * rpc's, not for the data portion of the write).
			 */
			if (!error && iomode == NFSV3WRITE_UNSTABLE) {
				bp->b_flags |= B_NEEDCOMMIT;
				if (bp->b_dirtyoff == 0 &&
				    bp->b_dirtyend == bp->b_bcount)
					bp->b_flags |= B_CLUSTEROK;
			} else {
				bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
			}

			/*
			 * For an interrupted write, the buffer is still valid
			 * and the write hasn't been pushed to the server yet,
			 * so we can't set B_ERROR and report the interruption
			 * by setting B_EINTR.  For the B_ASYNC case, B_EINTR
			 * is not relevant, so the rpc attempt is essentially
			 * a noop.  For the case of a V3 write rpc not being
			 * committed to stable storage, the block is still
			 * dirty and requires either a commit rpc or another
			 * write rpc with iomode == NFSV3WRITE_FILESYNC before
			 * the block is reused.  This is indicated by setting
			 * the B_DELWRI and B_NEEDCOMMIT flags.
			 *
			 * If the buffer is marked B_PAGING, it does not reside on
			 * the vp's paging queues so we cannot call bdirty().  The
			 * bp in this case is not an NFS cache block so we should
			 * be safe.  XXX
			 */
			if (error == EINTR ||
			    (!error && (bp->b_flags & B_NEEDCOMMIT))) {
				crit_enter();
				bp->b_flags &= ~(B_INVAL|B_NOCACHE);
				if ((bp->b_flags & B_PAGING) == 0)
					bdirty(bp);
				if (error && (bp->b_flags & B_ASYNC) == 0)
					bp->b_flags |= B_EINTR;
				crit_exit();
			} else {
				if (error) {
					bp->b_flags |= B_ERROR;
					bp->b_error = np->n_error = error;
					np->n_flag |= NWRITEERR;
				}
				bp->b_dirtyoff = bp->b_dirtyend = 0;
			}
		} else {
			bp->b_resid = 0;
			biodone(bio);
			return (0);
		}
	}
	bp->b_resid = uiop->uio_resid;
	if (must_commit)
		nfs_clearcommit(vp->v_mount);
	biodone(bio);
	return (error);
}

/*
 * Used to aid in handling ftruncate() operations on the NFS client side.
 * Truncation creates a number of special problems for NFS.  We have to
 * throw away VM pages and buffer cache buffers that are beyond EOF, and
 * we have to properly handle VM pages or (potentially dirty) buffers
 * that straddle the truncation point.
 */
int
nfs_meta_setsize(struct vnode *vp, struct thread *td, u_quad_t nsize)
{
	struct nfsnode *np = VTONFS(vp);
	u_quad_t tsize = np->n_size;
	int biosize = vp->v_mount->mnt_stat.f_iosize;
	int error = 0;

	np->n_size = nsize;

	if (np->n_size < tsize) {
		struct buf *bp;
		daddr_t lbn;
		off_t loffset;
		int bufsize;

		/*
		 * vtruncbuf() doesn't get the buffer overlapping the
		 * truncation point.  We may have a B_DELWRI and/or B_CACHE
		 * buffer that now needs to be truncated.
		 */
		error = vtruncbuf(vp, nsize, biosize);
		lbn = nsize / biosize;
		bufsize = nsize & (biosize - 1);
		loffset = nsize - bufsize;
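		/*
		 * (Worked example with hypothetical numbers: truncating to
		 * nsize == 10000 with biosize == 8192 leaves bufsize ==
		 * 1808 and loffset == 8192, i.e. the buffer straddling the
		 * new EOF.)
		 */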
		bp = nfs_getcacheblk(vp, loffset, bufsize, td);
		/* nfs_getcacheblk() may return NULL on an interruptible mount */
		if (bp == NULL)
			return (EINTR);
		if (bp->b_dirtyoff > bp->b_bcount)
			bp->b_dirtyoff = bp->b_bcount;
		if (bp->b_dirtyend > bp->b_bcount)
			bp->b_dirtyend = bp->b_bcount;
		bp->b_flags |= B_RELBUF;	/* don't leave garbage around */
		brelse(bp);
	} else {
		vnode_pager_setsize(vp, nsize);
	}
	return(error);
}