gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 1989, 1993
	3	* The Regents of the University of California. All rights reserved.
	4	*
	5	* This code is derived from software contributed to Berkeley by
	6	* Rick Macklem at The University of Guelph.
	7	*
	8	* Redistribution and use in source and binary forms, with or without
	9	* modification, are permitted provided that the following conditions
	10	* are met:
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in the
	15	* documentation and/or other materials provided with the distribution.
	16	* 3. All advertising materials mentioning features or use of this software
	17	* must display the following acknowledgement:
	18	* This product includes software developed by the University of
	19	* California, Berkeley and its contributors.
	20	* 4. Neither the name of the University nor the names of its contributors
	21	* may be used to endorse or promote products derived from this software
	22	* without specific prior written permission.
	23	*
	24	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
	25	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	26	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	27	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
	28	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	29	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	30	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	31	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	32	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	33	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	34	* SUCH DAMAGE.
	35	*
	36	* @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95
	37	* $FreeBSD: src/sys/nfs/nfs_bio.c,v 1.83.2.4 2002/12/29 18:19:53 dillon Exp $
	38	* $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.2 2003/06/17 04:28:54 dillon Exp $
	39	*/
	40
	41
	42	#include <sys/param.h>
	43	#include <sys/systm.h>
	44	#include <sys/resourcevar.h>
	45	#include <sys/signalvar.h>
	46	#include <sys/proc.h>
	47	#include <sys/buf.h>
	48	#include <sys/vnode.h>
	49	#include <sys/mount.h>
	50	#include <sys/kernel.h>
	51
	52	#include <vm/vm.h>
	53	#include <vm/vm_extern.h>
	54	#include <vm/vm_page.h>
	55	#include <vm/vm_object.h>
	56	#include <vm/vm_pager.h>
	57	#include <vm/vnode_pager.h>
	58
	59	#include <nfs/rpcv2.h>
	60	#include <nfs/nfsproto.h>
	61	#include <nfs/nfs.h>
	62	#include <nfs/nfsmount.h>
	63	#include <nfs/nqnfs.h>
	64	#include <nfs/nfsnode.h>
	65
	66	static struct buf nfs_getcacheblk __P((struct vnode vp, daddr_t bn, int size,
	67	struct proc *p));
	68
	69	extern int nfs_numasync;
	70	extern int nfs_pbuf_freecnt;
	71	extern struct nfsstats nfsstats;
	72
	73	/*
	74	* Vnode op for VM getpages.
	75	*/
	76	int
	77	nfs_getpages(ap)
	78	struct vop_getpages_args /* {
	79	struct vnode *a_vp;
	80	vm_page_t *a_m;
	81	int a_count;
	82	int a_reqpage;
	83	vm_ooffset_t a_offset;
	84	} / ap;
	85	{
	86	int i, error, nextoff, size, toff, count, npages;
	87	struct uio uio;
	88	struct iovec iov;
	89	vm_offset_t kva;
	90	struct buf *bp;
	91	struct vnode *vp;
	92	struct proc *p;
	93	struct ucred *cred;
	94	struct nfsmount *nmp;
	95	vm_page_t *pages;
	96
	97	vp = ap->a_vp;
	98	p = curproc; /* XXX */
	99	cred = curproc->p_ucred; /* XXX */
	100	nmp = VFSTONFS(vp->v_mount);
	101	pages = ap->a_m;
	102	count = ap->a_count;
	103
	104	if (vp->v_object == NULL) {
	105	printf("nfs_getpages: called with non-merged cache vnode??\n");
	106	return VM_PAGER_ERROR;
	107	}
	108
	109	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	110	(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
	111	(void)nfs_fsinfo(nmp, vp, cred, p);
	112
	113	npages = btoc(count);
	114
	115	/*
	116	* If the requested page is partially valid, just return it and
	117	* allow the pager to zero-out the blanks. Partially valid pages
	118	* can only occur at the file EOF.
	119	*/
	120
	121	{
	122	vm_page_t m = pages[ap->a_reqpage];
	123
	124	if (m->valid != 0) {
	125	/* handled by vm_fault now */
	126	/* vm_page_zero_invalid(m, TRUE); */
	127	for (i = 0; i < npages; ++i) {
	128	if (i != ap->a_reqpage)
	129	vnode_pager_freepage(pages[i]);
	130	}
	131	return(0);
	132	}
	133	}
	134
	135	/*
	136	* We use only the kva address for the buffer, but this is extremely
	137	* convienient and fast.
	138	*/
	139	bp = getpbuf(&nfs_pbuf_freecnt);
	140
	141	kva = (vm_offset_t) bp->b_data;
	142	pmap_qenter(kva, pages, npages);
	143
	144	iov.iov_base = (caddr_t) kva;
	145	iov.iov_len = count;
	146	uio.uio_iov = &iov;
	147	uio.uio_iovcnt = 1;
	148	uio.uio_offset = IDX_TO_OFF(pages[0]->pindex);
	149	uio.uio_resid = count;
	150	uio.uio_segflg = UIO_SYSSPACE;
	151	uio.uio_rw = UIO_READ;
	152	uio.uio_procp = p;
	153
	154	error = nfs_readrpc(vp, &uio, cred);
	155	pmap_qremove(kva, npages);
	156
	157	relpbuf(bp, &nfs_pbuf_freecnt);
	158
	159	if (error && (uio.uio_resid == count)) {
	160	printf("nfs_getpages: error %d\n", error);
	161	for (i = 0; i < npages; ++i) {
	162	if (i != ap->a_reqpage)
	163	vnode_pager_freepage(pages[i]);
	164	}
	165	return VM_PAGER_ERROR;
	166	}
	167
	168	/*
	169	* Calculate the number of bytes read and validate only that number
	170	* of bytes. Note that due to pending writes, size may be 0. This
	171	* does not mean that the remaining data is invalid!
	172	*/
	173
	174	size = count - uio.uio_resid;
	175
	176	for (i = 0, toff = 0; i < npages; i++, toff = nextoff) {
	177	vm_page_t m;
	178	nextoff = toff + PAGE_SIZE;
	179	m = pages[i];
	180
	181	m->flags &= ~PG_ZERO;
	182
	183	if (nextoff <= size) {
	184	/*
	185	* Read operation filled an entire page
	186	*/
	187	m->valid = VM_PAGE_BITS_ALL;
	188	vm_page_undirty(m);
	189	} else if (size > toff) {
	190	/*
	191	* Read operation filled a partial page.
	192	*/
	193	m->valid = 0;
	194	vm_page_set_validclean(m, 0, size - toff);
	195	/* handled by vm_fault now */
	196	/* vm_page_zero_invalid(m, TRUE); */
	197	} else {
	198	/*
	199	* Read operation was short. If no error occured
	200	* we may have hit a zero-fill section. We simply
	201	* leave valid set to 0.
	202	*/
	203	;
	204	}
	205	if (i != ap->a_reqpage) {
	206	/*
	207	* Whether or not to leave the page activated is up in
	208	* the air, but we should put the page on a page queue
	209	* somewhere (it already is in the object). Result:
	210	* It appears that emperical results show that
	211	* deactivating pages is best.
	212	*/
	213
	214	/*
	215	* Just in case someone was asking for this page we
	216	* now tell them that it is ok to use.
	217	*/
	218	if (!error) {
	219	if (m->flags & PG_WANTED)
	220	vm_page_activate(m);
	221	else
	222	vm_page_deactivate(m);
	223	vm_page_wakeup(m);
	224	} else {
	225	vnode_pager_freepage(m);
	226	}
	227	}
	228	}
	229	return 0;
	230	}
	231
	232	/*
	233	* Vnode op for VM putpages.
	234	*/
	235	int
	236	nfs_putpages(ap)
	237	struct vop_putpages_args /* {
	238	struct vnode *a_vp;
	239	vm_page_t *a_m;
	240	int a_count;
	241	int a_sync;
	242	int *a_rtvals;
	243	vm_ooffset_t a_offset;
	244	} / ap;
	245	{
	246	struct uio uio;
	247	struct iovec iov;
	248	vm_offset_t kva;
	249	struct buf *bp;
	250	int iomode, must_commit, i, error, npages, count;
	251	off_t offset;
	252	int *rtvals;
	253	struct vnode *vp;
	254	struct proc *p;
	255	struct ucred *cred;
	256	struct nfsmount *nmp;
	257	struct nfsnode *np;
	258	vm_page_t *pages;
	259
	260	vp = ap->a_vp;
	261	np = VTONFS(vp);
	262	p = curproc; /* XXX */
	263	cred = curproc->p_ucred; /* XXX */
	264	nmp = VFSTONFS(vp->v_mount);
	265	pages = ap->a_m;
	266	count = ap->a_count;
	267	rtvals = ap->a_rtvals;
	268	npages = btoc(count);
	269	offset = IDX_TO_OFF(pages[0]->pindex);
	270
	271	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	272	(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
	273	(void)nfs_fsinfo(nmp, vp, cred, p);
	274
	275	for (i = 0; i < npages; i++) {
	276	rtvals[i] = VM_PAGER_AGAIN;
	277	}
	278
	279	/*
	280	* When putting pages, do not extend file past EOF.
	281	*/
	282
	283	if (offset + count > np->n_size) {
	284	count = np->n_size - offset;
	285	if (count < 0)
	286	count = 0;
	287	}
	288
	289	/*
	290	* We use only the kva address for the buffer, but this is extremely
	291	* convienient and fast.
	292	*/
	293	bp = getpbuf(&nfs_pbuf_freecnt);
	294
	295	kva = (vm_offset_t) bp->b_data;
	296	pmap_qenter(kva, pages, npages);
	297
	298	iov.iov_base = (caddr_t) kva;
	299	iov.iov_len = count;
	300	uio.uio_iov = &iov;
	301	uio.uio_iovcnt = 1;
	302	uio.uio_offset = offset;
	303	uio.uio_resid = count;
	304	uio.uio_segflg = UIO_SYSSPACE;
	305	uio.uio_rw = UIO_WRITE;
	306	uio.uio_procp = p;
	307
	308	if ((ap->a_sync & VM_PAGER_PUT_SYNC) == 0)
	309	iomode = NFSV3WRITE_UNSTABLE;
	310	else
	311	iomode = NFSV3WRITE_FILESYNC;
	312
	313	error = nfs_writerpc(vp, &uio, cred, &iomode, &must_commit);
	314
	315	pmap_qremove(kva, npages);
	316	relpbuf(bp, &nfs_pbuf_freecnt);
	317
	318	if (!error) {
	319	int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
	320	for (i = 0; i < nwritten; i++) {
	321	rtvals[i] = VM_PAGER_OK;
	322	vm_page_undirty(pages[i]);
	323	}
	324	if (must_commit)
	325	nfs_clearcommit(vp->v_mount);
	326	}
	327	return rtvals[0];
	328	}
	329
	330	/*
	331	* Vnode op for read using bio
	332	*/
	333	int
	334	nfs_bioread(vp, uio, ioflag, cred)
	335	register struct vnode *vp;
	336	register struct uio *uio;
	337	int ioflag;
	338	struct ucred *cred;
	339	{
	340	register struct nfsnode *np = VTONFS(vp);
	341	register int biosize, i;
	342	struct buf bp = 0, rabp;
	343	struct vattr vattr;
	344	struct proc *p;
	345	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	346	daddr_t lbn, rabn;
	347	int bcount;
	348	int seqcount;
	349	int nra, error = 0, n = 0, on = 0;
	350
	351	#ifdef DIAGNOSTIC
	352	if (uio->uio_rw != UIO_READ)
	353	panic("nfs_read mode");
	354	#endif
	355	if (uio->uio_resid == 0)
	356	return (0);
	357	if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */
	358	return (EINVAL);
	359	p = uio->uio_procp;
	360
	361	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	362	(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
	363	(void)nfs_fsinfo(nmp, vp, cred, p);
	364	if (vp->v_type != VDIR &&
	365	(uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
	366	return (EFBIG);
	367	biosize = vp->v_mount->mnt_stat.f_iosize;
	368	seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE);
	369	/*
	370	* For nfs, cache consistency can only be maintained approximately.
	371	* Although RFC1094 does not specify the criteria, the following is
	372	* believed to be compatible with the reference port.
	373	* For nqnfs, full cache consistency is maintained within the loop.
	374	* For nfs:
	375	* If the file's modify time on the server has changed since the
	376	* last read rpc or you have written to the file,
	377	* you may have lost data cache consistency with the
	378	* server, so flush all of the file's data out of the cache.
	379	* Then force a getattr rpc to ensure that you have up to date
	380	* attributes.
	381	* NB: This implies that cache data can be read when up to
	382	* NFS_ATTRTIMEO seconds out of date. If you find that you need current
	383	* attributes this could be forced by setting n_attrstamp to 0 before
	384	* the VOP_GETATTR() call.
	385	*/
	386	if ((nmp->nm_flag & NFSMNT_NQNFS) == 0) {
	387	if (np->n_flag & NMODIFIED) {
	388	if (vp->v_type != VREG) {
	389	if (vp->v_type != VDIR)
	390	panic("nfs: bioread, not dir");
	391	nfs_invaldir(vp);
	392	error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
	393	if (error)
	394	return (error);
	395	}
	396	np->n_attrstamp = 0;
	397	error = VOP_GETATTR(vp, &vattr, cred, p);
	398	if (error)
	399	return (error);
	400	np->n_mtime = vattr.va_mtime.tv_sec;
	401	} else {
	402	error = VOP_GETATTR(vp, &vattr, cred, p);
	403	if (error)
	404	return (error);
	405	if (np->n_mtime != vattr.va_mtime.tv_sec) {
	406	if (vp->v_type == VDIR)
	407	nfs_invaldir(vp);
	408	error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
	409	if (error)
	410	return (error);
	411	np->n_mtime = vattr.va_mtime.tv_sec;
	412	}
	413	}
	414	}
	415	do {
	416
	417	/*
	418	* Get a valid lease. If cached data is stale, flush it.
	419	*/
	420	if (nmp->nm_flag & NFSMNT_NQNFS) {
	421	if (NQNFS_CKINVALID(vp, np, ND_READ)) {
	422	do {
	423	error = nqnfs_getlease(vp, ND_READ, cred, p);
	424	} while (error == NQNFS_EXPIRED);
	425	if (error)
	426	return (error);
	427	if (np->n_lrev != np->n_brev \|\|
	428	(np->n_flag & NQNFSNONCACHE) \|\|
	429	((np->n_flag & NMODIFIED) && vp->v_type == VDIR)) {
	430	if (vp->v_type == VDIR)
	431	nfs_invaldir(vp);
	432	error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
	433	if (error)
	434	return (error);
	435	np->n_brev = np->n_lrev;
	436	}
	437	} else if (vp->v_type == VDIR && (np->n_flag & NMODIFIED)) {
	438	nfs_invaldir(vp);
	439	error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
	440	if (error)
	441	return (error);
	442	}
	443	}
	444	if (np->n_flag & NQNFSNONCACHE) {
	445	switch (vp->v_type) {
	446	case VREG:
	447	return (nfs_readrpc(vp, uio, cred));
	448	case VLNK:
	449	return (nfs_readlinkrpc(vp, uio, cred));
	450	case VDIR:
	451	break;
	452	default:
	453	printf(" NQNFSNONCACHE: type %x unexpected\n",
	454	vp->v_type);
	455	};
	456	}
	457	switch (vp->v_type) {
	458	case VREG:
	459	nfsstats.biocache_reads++;
	460	lbn = uio->uio_offset / biosize;
	461	on = uio->uio_offset & (biosize - 1);
	462
	463	/*
	464	* Start the read ahead(s), as required.
	465	*/
	466	if (nfs_numasync > 0 && nmp->nm_readahead > 0) {
	467	for (nra = 0; nra < nmp->nm_readahead && nra < seqcount &&
	468	(off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) {
	469	rabn = lbn + 1 + nra;
	470	if (!incore(vp, rabn)) {
	471	rabp = nfs_getcacheblk(vp, rabn, biosize, p);
	472	if (!rabp)
	473	return (EINTR);
	474	if ((rabp->b_flags & (B_CACHE\|B_DELWRI)) == 0) {
	475	rabp->b_flags \|= (B_READ \| B_ASYNC);
	476	vfs_busy_pages(rabp, 0);
	477	if (nfs_asyncio(rabp, cred, p)) {
	478	rabp->b_flags \|= B_INVAL\|B_ERROR;
	479	vfs_unbusy_pages(rabp);
	480	brelse(rabp);
	481	break;
	482	}
	483	} else {
	484	brelse(rabp);
	485	}
	486	}
	487	}
	488	}
	489
	490	/*
	491	* Obtain the buffer cache block. Figure out the buffer size
	492	* when we are at EOF. If we are modifying the size of the
	493	* buffer based on an EOF condition we need to hold
	494	* nfs_rslock() through obtaining the buffer to prevent
	495	* a potential writer-appender from messing with n_size.
	496	* Otherwise we may accidently truncate the buffer and
	497	* lose dirty data.
	498	*
	499	* Note that bcount is not DEV_BSIZE aligned.
	500	*/
	501
	502	again:
	503	bcount = biosize;
	504	if ((off_t)lbn * biosize >= np->n_size) {
	505	bcount = 0;
	506	} else if ((off_t)(lbn + 1) * biosize > np->n_size) {
	507	bcount = np->n_size - (off_t)lbn * biosize;
	508	}
	509	if (bcount != biosize) {
	510	switch(nfs_rslock(np, p)) {
	511	case ENOLCK:
	512	goto again;
	513	/* not reached */
	514	case EINTR:
	515	case ERESTART:
	516	return(EINTR);
	517	/* not reached */
	518	default:
	519	break;
	520	}
	521	}
	522
	523	bp = nfs_getcacheblk(vp, lbn, bcount, p);
	524
	525	if (bcount != biosize)
	526	nfs_rsunlock(np, p);
	527	if (!bp)
	528	return (EINTR);
	529
	530	/*
	531	* If B_CACHE is not set, we must issue the read. If this
	532	* fails, we return an error.
	533	*/
	534
	535	if ((bp->b_flags & B_CACHE) == 0) {
	536	bp->b_flags \|= B_READ;
	537	vfs_busy_pages(bp, 0);
	538	error = nfs_doio(bp, cred, p);
	539	if (error) {
	540	brelse(bp);
	541	return (error);
	542	}
	543	}
	544
	545	/*
	546	* on is the offset into the current bp. Figure out how many
	547	* bytes we can copy out of the bp. Note that bcount is
	548	* NOT DEV_BSIZE aligned.
	549	*
	550	* Then figure out how many bytes we can copy into the uio.
	551	*/
	552
	553	n = 0;
	554	if (on < bcount)
	555	n = min((unsigned)(bcount - on), uio->uio_resid);
	556	break;
	557	case VLNK:
	558	nfsstats.biocache_readlinks++;
	559	bp = nfs_getcacheblk(vp, (daddr_t)0, NFS_MAXPATHLEN, p);
	560	if (!bp)
	561	return (EINTR);
	562	if ((bp->b_flags & B_CACHE) == 0) {
	563	bp->b_flags \|= B_READ;
	564	vfs_busy_pages(bp, 0);
	565	error = nfs_doio(bp, cred, p);
	566	if (error) {
	567	bp->b_flags \|= B_ERROR;
	568	brelse(bp);
	569	return (error);
	570	}
	571	}
	572	n = min(uio->uio_resid, NFS_MAXPATHLEN - bp->b_resid);
	573	on = 0;
	574	break;
	575	case VDIR:
	576	nfsstats.biocache_readdirs++;
	577	if (np->n_direofoffset
	578	&& uio->uio_offset >= np->n_direofoffset) {
	579	return (0);
	580	}
	581	lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ;
	582	on = uio->uio_offset & (NFS_DIRBLKSIZ - 1);
	583	bp = nfs_getcacheblk(vp, lbn, NFS_DIRBLKSIZ, p);
	584	if (!bp)
	585	return (EINTR);
	586	if ((bp->b_flags & B_CACHE) == 0) {
	587	bp->b_flags \|= B_READ;
	588	vfs_busy_pages(bp, 0);
	589	error = nfs_doio(bp, cred, p);
	590	if (error) {
	591	brelse(bp);
	592	}
	593	while (error == NFSERR_BAD_COOKIE) {
	594	printf("got bad cookie vp %p bp %p\n", vp, bp);
	595	nfs_invaldir(vp);
	596	error = nfs_vinvalbuf(vp, 0, cred, p, 1);
	597	/*
	598	* Yuck! The directory has been modified on the
	599	* server. The only way to get the block is by
	600	* reading from the beginning to get all the
	601	* offset cookies.
	602	*
	603	* Leave the last bp intact unless there is an error.
	604	* Loop back up to the while if the error is another
	605	* NFSERR_BAD_COOKIE (double yuch!).
	606	*/
	607	for (i = 0; i <= lbn && !error; i++) {
	608	if (np->n_direofoffset
	609	&& (i * NFS_DIRBLKSIZ) >= np->n_direofoffset)
	610	return (0);
	611	bp = nfs_getcacheblk(vp, i, NFS_DIRBLKSIZ, p);
	612	if (!bp)
	613	return (EINTR);
	614	if ((bp->b_flags & B_CACHE) == 0) {
	615	bp->b_flags \|= B_READ;
	616	vfs_busy_pages(bp, 0);
	617	error = nfs_doio(bp, cred, p);
	618	/*
	619	* no error + B_INVAL == directory EOF,
	620	* use the block.
	621	*/
	622	if (error == 0 && (bp->b_flags & B_INVAL))
	623	break;
	624	}
	625	/*
	626	* An error will throw away the block and the
	627	* for loop will break out. If no error and this
	628	* is not the block we want, we throw away the
	629	* block and go for the next one via the for loop.
	630	*/
	631	if (error \|\| i < lbn)
	632	brelse(bp);
	633	}
	634	}
	635	/*
	636	* The above while is repeated if we hit another cookie
	637	* error. If we hit an error and it wasn't a cookie error,
	638	* we give up.
	639	*/
	640	if (error)
	641	return (error);
	642	}
	643
	644	/*
	645	* If not eof and read aheads are enabled, start one.
	646	* (You need the current block first, so that you have the
	647	* directory offset cookie of the next block.)
	648	*/
	649	if (nfs_numasync > 0 && nmp->nm_readahead > 0 &&
	650	(bp->b_flags & B_INVAL) == 0 &&
	651	(np->n_direofoffset == 0 \|\|
	652	(lbn + 1) * NFS_DIRBLKSIZ < np->n_direofoffset) &&
	653	!(np->n_flag & NQNFSNONCACHE) &&
	654	!incore(vp, lbn + 1)) {
	655	rabp = nfs_getcacheblk(vp, lbn + 1, NFS_DIRBLKSIZ, p);
	656	if (rabp) {
	657	if ((rabp->b_flags & (B_CACHE\|B_DELWRI)) == 0) {
	658	rabp->b_flags \|= (B_READ \| B_ASYNC);
	659	vfs_busy_pages(rabp, 0);
	660	if (nfs_asyncio(rabp, cred, p)) {
	661	rabp->b_flags \|= B_INVAL\|B_ERROR;
	662	vfs_unbusy_pages(rabp);
	663	brelse(rabp);
	664	}
	665	} else {
	666	brelse(rabp);
	667	}
	668	}
	669	}
	670	/*
	671	* Unlike VREG files, whos buffer size ( bp->b_bcount ) is
	672	* chopped for the EOF condition, we cannot tell how large
	673	* NFS directories are going to be until we hit EOF. So
	674	* an NFS directory buffer is not chopped to its EOF. Now,
	675	* it just so happens that b_resid will effectively chop it
	676	* to EOF. BUT this information is lost if the buffer goes
	677	* away and is reconstituted into a B_CACHE state ( due to
	678	* being VMIO ) later. So we keep track of the directory eof
	679	* in np->n_direofoffset and chop it off as an extra step
	680	* right here.
	681	*/
	682	n = lmin(uio->uio_resid, NFS_DIRBLKSIZ - bp->b_resid - on);
	683	if (np->n_direofoffset && n > np->n_direofoffset - uio->uio_offset)
	684	n = np->n_direofoffset - uio->uio_offset;
	685	break;
	686	default:
	687	printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
	688	break;
	689	};
	690
	691	if (n > 0) {
	692	error = uiomove(bp->b_data + on, (int)n, uio);
	693	}
	694	switch (vp->v_type) {
	695	case VREG:
	696	break;
	697	case VLNK:
	698	n = 0;
	699	break;
	700	case VDIR:
	701	/*
	702	* Invalidate buffer if caching is disabled, forcing a
	703	* re-read from the remote later.
	704	*/
	705	if (np->n_flag & NQNFSNONCACHE)
	706	bp->b_flags \|= B_INVAL;
	707	break;
	708	default:
	709	printf(" nfs_bioread: type %x unexpected\n",vp->v_type);
	710	}
	711	brelse(bp);
	712	} while (error == 0 && uio->uio_resid > 0 && n > 0);
	713	return (error);
	714	}
	715
	716	/*
	717	* Vnode op for write using bio
	718	*/
	719	int
	720	nfs_write(ap)
	721	struct vop_write_args /* {
	722	struct vnode *a_vp;
	723	struct uio *a_uio;
	724	int a_ioflag;
	725	struct ucred *a_cred;
	726	} / ap;
	727	{
	728	int biosize;
	729	struct uio *uio = ap->a_uio;
	730	struct proc *p = uio->uio_procp;
	731	struct vnode *vp = ap->a_vp;
	732	struct nfsnode *np = VTONFS(vp);
	733	struct ucred *cred = ap->a_cred;
	734	int ioflag = ap->a_ioflag;
	735	struct buf *bp;
	736	struct vattr vattr;
	737	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	738	daddr_t lbn;
	739	int bcount;
	740	int n, on, error = 0, iomode, must_commit;
	741	int haverslock = 0;
	742
	743	#ifdef DIAGNOSTIC
	744	if (uio->uio_rw != UIO_WRITE)
	745	panic("nfs_write mode");
	746	if (uio->uio_segflg == UIO_USERSPACE && uio->uio_procp != curproc)
	747	panic("nfs_write proc");
	748	#endif
	749	if (vp->v_type != VREG)
	750	return (EIO);
	751	if (np->n_flag & NWRITEERR) {
	752	np->n_flag &= ~NWRITEERR;
	753	return (np->n_error);
	754	}
	755	if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 &&
	756	(nmp->nm_state & NFSSTA_GOTFSINFO) == 0)
	757	(void)nfs_fsinfo(nmp, vp, cred, p);
	758
	759	/*
	760	* Synchronously flush pending buffers if we are in synchronous
	761	* mode or if we are appending.
	762	*/
	763	if (ioflag & (IO_APPEND \| IO_SYNC)) {
	764	if (np->n_flag & NMODIFIED) {
	765	np->n_attrstamp = 0;
	766	error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
	767	if (error)
	768	return (error);
	769	}
	770	}
	771
	772	/*
	773	* If IO_APPEND then load uio_offset. We restart here if we cannot
	774	* get the append lock.
	775	*/
	776	restart:
	777	if (ioflag & IO_APPEND) {
	778	np->n_attrstamp = 0;
	779	error = VOP_GETATTR(vp, &vattr, cred, p);
	780	if (error)
	781	return (error);
	782	uio->uio_offset = np->n_size;
	783	}
	784
	785	if (uio->uio_offset < 0)
	786	return (EINVAL);
	787	if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize)
	788	return (EFBIG);
	789	if (uio->uio_resid == 0)
	790	return (0);
	791
	792	/*
	793	* We need to obtain the rslock if we intend to modify np->n_size
	794	* in order to guarentee the append point with multiple contending
	795	* writers, to guarentee that no other appenders modify n_size
	796	* while we are trying to obtain a truncated buffer (i.e. to avoid
	797	* accidently truncating data written by another appender due to
	798	* the race), and to ensure that the buffer is populated prior to
	799	* our extending of the file. We hold rslock through the entire
	800	* operation.
	801	*
	802	* Note that we do not synchronize the case where someone truncates
	803	* the file while we are appending to it because attempting to lock
	804	* this case may deadlock other parts of the system unexpectedly.
	805	*/
	806	if ((ioflag & IO_APPEND) \|\|
	807	uio->uio_offset + uio->uio_resid > np->n_size) {
	808	switch(nfs_rslock(np, p)) {
	809	case ENOLCK:
	810	goto restart;
	811	/* not reached */
	812	case EINTR:
	813	case ERESTART:
	814	return(EINTR);
	815	/* not reached */
	816	default:
	817	break;
	818	}
	819	haverslock = 1;
	820	}
	821
	822	/*
	823	* Maybe this should be above the vnode op call, but so long as
	824	* file servers have no limits, i don't think it matters
	825	*/
	826	if (p && uio->uio_offset + uio->uio_resid >
	827	p->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
	828	psignal(p, SIGXFSZ);
	829	if (haverslock)
	830	nfs_rsunlock(np, p);
	831	return (EFBIG);
	832	}
	833
	834	biosize = vp->v_mount->mnt_stat.f_iosize;
	835
	836	do {
	837	/*
	838	* Check for a valid write lease.
	839	*/
	840	if ((nmp->nm_flag & NFSMNT_NQNFS) &&
	841	NQNFS_CKINVALID(vp, np, ND_WRITE)) {
	842	do {
	843	error = nqnfs_getlease(vp, ND_WRITE, cred, p);
	844	} while (error == NQNFS_EXPIRED);
	845	if (error)
	846	break;
	847	if (np->n_lrev != np->n_brev \|\|
	848	(np->n_flag & NQNFSNONCACHE)) {
	849	error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
	850	if (error)
	851	break;
	852	np->n_brev = np->n_lrev;
	853	}
	854	}
	855	if ((np->n_flag & NQNFSNONCACHE) && uio->uio_iovcnt == 1) {
	856	iomode = NFSV3WRITE_FILESYNC;
	857	error = nfs_writerpc(vp, uio, cred, &iomode, &must_commit);
	858	if (must_commit)
	859	nfs_clearcommit(vp->v_mount);
	860	break;
	861	}
	862	nfsstats.biocache_writes++;
	863	lbn = uio->uio_offset / biosize;
	864	on = uio->uio_offset & (biosize-1);
	865	n = min((unsigned)(biosize - on), uio->uio_resid);
	866	again:
	867	/*
	868	* Handle direct append and file extension cases, calculate
	869	* unaligned buffer size.
	870	*/
	871
	872	if (uio->uio_offset == np->n_size && n) {
	873	/*
	874	* Get the buffer (in its pre-append state to maintain
	875	* B_CACHE if it was previously set). Resize the
	876	* nfsnode after we have locked the buffer to prevent
	877	* readers from reading garbage.
	878	*/
	879	bcount = on;
	880	bp = nfs_getcacheblk(vp, lbn, bcount, p);
	881
	882	if (bp != NULL) {
	883	long save;
	884
	885	np->n_size = uio->uio_offset + n;
	886	np->n_flag \|= NMODIFIED;
	887	vnode_pager_setsize(vp, np->n_size);
	888
	889	save = bp->b_flags & B_CACHE;
	890	bcount += n;
	891	allocbuf(bp, bcount);
	892	bp->b_flags \|= save;
	893	}
	894	} else {
	895	/*
	896	* Obtain the locked cache block first, and then
	897	* adjust the file's size as appropriate.
	898	*/
	899	bcount = on + n;
	900	if ((off_t)lbn * biosize + bcount < np->n_size) {
	901	if ((off_t)(lbn + 1) * biosize < np->n_size)
	902	bcount = biosize;
	903	else
	904	bcount = np->n_size - (off_t)lbn * biosize;
	905	}
	906	bp = nfs_getcacheblk(vp, lbn, bcount, p);
	907	if (uio->uio_offset + n > np->n_size) {
	908	np->n_size = uio->uio_offset + n;
	909	np->n_flag \|= NMODIFIED;
	910	vnode_pager_setsize(vp, np->n_size);
	911	}
	912	}
	913
	914	if (!bp) {
	915	error = EINTR;
	916	break;
	917	}
	918
	919	/*
	920	* Issue a READ if B_CACHE is not set. In special-append
	921	* mode, B_CACHE is based on the buffer prior to the write
	922	* op and is typically set, avoiding the read. If a read
	923	* is required in special append mode, the server will
	924	* probably send us a short-read since we extended the file
	925	* on our end, resulting in b_resid == 0 and, thusly,
	926	* B_CACHE getting set.
	927	*
	928	* We can also avoid issuing the read if the write covers
	929	* the entire buffer. We have to make sure the buffer state
	930	* is reasonable in this case since we will not be initiating
	931	* I/O. See the comments in kern/vfs_bio.c's getblk() for
	932	* more information.
	933	*
	934	* B_CACHE may also be set due to the buffer being cached
	935	* normally.
	936	*/
	937
	938	if (on == 0 && n == bcount) {
	939	bp->b_flags \|= B_CACHE;
	940	bp->b_flags &= ~(B_ERROR \| B_INVAL);
	941	}
	942
	943	if ((bp->b_flags & B_CACHE) == 0) {
	944	bp->b_flags \|= B_READ;
	945	vfs_busy_pages(bp, 0);
	946	error = nfs_doio(bp, cred, p);
	947	if (error) {
	948	brelse(bp);
	949	break;
	950	}
	951	}
	952	if (!bp) {
	953	error = EINTR;
	954	break;
	955	}
	956	if (bp->b_wcred == NOCRED) {
	957	crhold(cred);
	958	bp->b_wcred = cred;
	959	}
	960	np->n_flag \|= NMODIFIED;
	961
	962	/*
	963	* If dirtyend exceeds file size, chop it down. This should
	964	* not normally occur but there is an append race where it
	965	* might occur XXX, so we log it.
	966	*
	967	* If the chopping creates a reverse-indexed or degenerate
	968	* situation with dirtyoff/end, we 0 both of them.
	969	*/
	970
	971	if (bp->b_dirtyend > bcount) {
	972	printf("NFS append race @%lx:%d\n",
	973	(long)bp->b_blkno * DEV_BSIZE,
	974	bp->b_dirtyend - bcount);
	975	bp->b_dirtyend = bcount;
	976	}
	977
	978	if (bp->b_dirtyoff >= bp->b_dirtyend)
	979	bp->b_dirtyoff = bp->b_dirtyend = 0;
	980
	981	/*
	982	* If the new write will leave a contiguous dirty
	983	* area, just update the b_dirtyoff and b_dirtyend,
	984	* otherwise force a write rpc of the old dirty area.
	985	*
	986	* While it is possible to merge discontiguous writes due to
	987	* our having a B_CACHE buffer ( and thus valid read data
	988	* for the hole), we don't because it could lead to
	989	* significant cache coherency problems with multiple clients,
	990	* especially if locking is implemented later on.
	991	*
	992	* as an optimization we could theoretically maintain
	993	* a linked list of discontinuous areas, but we would still
	994	* have to commit them separately so there isn't much
	995	* advantage to it except perhaps a bit of asynchronization.
	996	*/
	997
	998	if (bp->b_dirtyend > 0 &&
	999	(on > bp->b_dirtyend \|\| (on + n) < bp->b_dirtyoff)) {
	1000	if (VOP_BWRITE(bp->b_vp, bp) == EINTR) {
	1001	error = EINTR;
	1002	break;
	1003	}
	1004	goto again;
	1005	}
	1006
	1007	/*
	1008	* Check for valid write lease and get one as required.
	1009	* In case getblk() and/or bwrite() delayed us.
	1010	*/
	1011	if ((nmp->nm_flag & NFSMNT_NQNFS) &&
	1012	NQNFS_CKINVALID(vp, np, ND_WRITE)) {
	1013	do {
	1014	error = nqnfs_getlease(vp, ND_WRITE, cred, p);
	1015	} while (error == NQNFS_EXPIRED);
	1016	if (error) {
	1017	brelse(bp);
	1018	break;
	1019	}
	1020	if (np->n_lrev != np->n_brev \|\|
	1021	(np->n_flag & NQNFSNONCACHE)) {
	1022	brelse(bp);
	1023	error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
	1024	if (error)
	1025	break;
	1026	np->n_brev = np->n_lrev;
	1027	goto again;
	1028	}
	1029	}
	1030
	1031	error = uiomove((char *)bp->b_data + on, n, uio);
	1032
	1033	/*
	1034	* Since this block is being modified, it must be written
	1035	* again and not just committed. Since write clustering does
	1036	* not work for the stage 1 data write, only the stage 2
	1037	* commit rpc, we have to clear B_CLUSTEROK as well.
	1038	*/
	1039	bp->b_flags &= ~(B_NEEDCOMMIT \| B_CLUSTEROK);
	1040
	1041	if (error) {
	1042	bp->b_flags \|= B_ERROR;
	1043	brelse(bp);
	1044	break;
	1045	}
	1046
	1047	/*
	1048	* Only update dirtyoff/dirtyend if not a degenerate
	1049	* condition.
	1050	*/
	1051	if (n) {
	1052	if (bp->b_dirtyend > 0) {
	1053	bp->b_dirtyoff = min(on, bp->b_dirtyoff);
	1054	bp->b_dirtyend = max((on + n), bp->b_dirtyend);
	1055	} else {
	1056	bp->b_dirtyoff = on;
	1057	bp->b_dirtyend = on + n;
	1058	}
	1059	vfs_bio_set_validclean(bp, on, n);
	1060	}
	1061	/*
	1062	* If IO_NOWDRAIN then set B_NOWDRAIN (e.g. nfs-backed VN
	1063	* filesystem). XXX also use for loopback NFS mounts.
	1064	*/
	1065	if (ioflag & IO_NOWDRAIN)
	1066	bp->b_flags \|= B_NOWDRAIN;
	1067
	1068	/*
	1069	* If the lease is non-cachable or IO_SYNC do bwrite().
	1070	*
	1071	* IO_INVAL appears to be unused. The idea appears to be
	1072	* to turn off caching in this case. Very odd. XXX
	1073	*/
	1074	if ((np->n_flag & NQNFSNONCACHE) \|\| (ioflag & IO_SYNC)) {
	1075	if (ioflag & IO_INVAL)
	1076	bp->b_flags \|= B_NOCACHE;
	1077	error = VOP_BWRITE(bp->b_vp, bp);
	1078	if (error)
	1079	break;
	1080	if (np->n_flag & NQNFSNONCACHE) {
	1081	error = nfs_vinvalbuf(vp, V_SAVE, cred, p, 1);
	1082	if (error)
	1083	break;
	1084	}
	1085	} else if ((n + on) == biosize &&
	1086	(nmp->nm_flag & NFSMNT_NQNFS) == 0) {
	1087	bp->b_flags \|= B_ASYNC;
	1088	(void)nfs_writebp(bp, 0, 0);
	1089	} else {
	1090	bdwrite(bp);
	1091	}
	1092	} while (uio->uio_resid > 0 && n > 0);
	1093
	1094	if (haverslock)
	1095	nfs_rsunlock(np, p);
	1096
	1097	return (error);
	1098	}
	1099
	1100	/*
	1101	* Get an nfs cache block.
	1102	*
	1103	* Allocate a new one if the block isn't currently in the cache
	1104	* and return the block marked busy. If the calling process is
	1105	* interrupted by a signal for an interruptible mount point, return
	1106	* NULL.
	1107	*
	1108	* The caller must carefully deal with the possible B_INVAL state of
	1109	* the buffer. nfs_doio() clears B_INVAL (and nfs_asyncio() clears it
	1110	* indirectly), so synchronous reads can be issued without worrying about
	1111	* the B_INVAL state. We have to be a little more careful when dealing
	1112	* with writes (see comments in nfs_write()) when extending a file past
	1113	* its EOF.
	1114	*/
	1115	static struct buf *
	1116	nfs_getcacheblk(vp, bn, size, p)
	1117	struct vnode *vp;
	1118	daddr_t bn;
	1119	int size;
	1120	struct proc *p;
	1121	{
	1122	register struct buf *bp;
	1123	struct mount *mp;
	1124	struct nfsmount *nmp;
	1125
	1126	mp = vp->v_mount;
	1127	nmp = VFSTONFS(mp);
	1128
	1129	if (nmp->nm_flag & NFSMNT_INT) {
	1130	bp = getblk(vp, bn, size, PCATCH, 0);
	1131	while (bp == (struct buf *)0) {
	1132	if (nfs_sigintr(nmp, (struct nfsreq *)0, p))
	1133	return ((struct buf *)0);
	1134	bp = getblk(vp, bn, size, 0, 2 * hz);
	1135	}
	1136	} else {
	1137	bp = getblk(vp, bn, size, 0, 0);
	1138	}
	1139
	1140	if (vp->v_type == VREG) {
	1141	int biosize;
	1142
	1143	biosize = mp->mnt_stat.f_iosize;
	1144	bp->b_blkno = bn * (biosize / DEV_BSIZE);
	1145	}
	1146	return (bp);
	1147	}
	1148
	1149	/*
	1150	* Flush and invalidate all dirty buffers. If another process is already
	1151	* doing the flush, just wait for completion.
	1152	*/
	1153	int
	1154	nfs_vinvalbuf(vp, flags, cred, p, intrflg)
	1155	struct vnode *vp;
	1156	int flags;
	1157	struct ucred *cred;
	1158	struct proc *p;
	1159	int intrflg;
	1160	{
	1161	register struct nfsnode *np = VTONFS(vp);
	1162	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	1163	int error = 0, slpflag, slptimeo;
	1164
	1165	if (vp->v_flag & VXLOCK) {
	1166	return (0);
	1167	}
	1168
	1169	if ((nmp->nm_flag & NFSMNT_INT) == 0)
	1170	intrflg = 0;
	1171	if (intrflg) {
	1172	slpflag = PCATCH;
	1173	slptimeo = 2 * hz;
	1174	} else {
	1175	slpflag = 0;
	1176	slptimeo = 0;
	1177	}
	1178	/*
	1179	* First wait for any other process doing a flush to complete.
	1180	*/
	1181	while (np->n_flag & NFLUSHINPROG) {
	1182	np->n_flag \|= NFLUSHWANT;
	1183	error = tsleep((caddr_t)&np->n_flag, PRIBIO + 2, "nfsvinval",
	1184	slptimeo);
	1185	if (error && intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p))
	1186	return (EINTR);
	1187	}
	1188
	1189	/*
	1190	* Now, flush as required.
	1191	*/
	1192	np->n_flag \|= NFLUSHINPROG;
	1193	error = vinvalbuf(vp, flags, cred, p, slpflag, 0);
	1194	while (error) {
	1195	if (intrflg && nfs_sigintr(nmp, (struct nfsreq *)0, p)) {
	1196	np->n_flag &= ~NFLUSHINPROG;
	1197	if (np->n_flag & NFLUSHWANT) {
	1198	np->n_flag &= ~NFLUSHWANT;
	1199	wakeup((caddr_t)&np->n_flag);
	1200	}
	1201	return (EINTR);
	1202	}
	1203	error = vinvalbuf(vp, flags, cred, p, 0, slptimeo);
	1204	}
	1205	np->n_flag &= ~(NMODIFIED \| NFLUSHINPROG);
	1206	if (np->n_flag & NFLUSHWANT) {
	1207	np->n_flag &= ~NFLUSHWANT;
	1208	wakeup((caddr_t)&np->n_flag);
	1209	}
	1210	return (0);
	1211	}
	1212
	1213	/*
	1214	* Initiate asynchronous I/O. Return an error if no nfsiods are available.
	1215	* This is mainly to avoid queueing async I/O requests when the nfsiods
	1216	* are all hung on a dead server.
	1217	*
	1218	* Note: nfs_asyncio() does not clear (B_ERROR\|B_INVAL) but when the bp
	1219	* is eventually dequeued by the async daemon, nfs_doio() will.
	1220	*/
	1221	int
	1222	nfs_asyncio(bp, cred, procp)
	1223	register struct buf *bp;
	1224	struct ucred *cred;
	1225	struct proc *procp;
	1226	{
	1227	struct nfsmount *nmp;
	1228	int i;
	1229	int gotiod;
	1230	int slpflag = 0;
	1231	int slptimeo = 0;
	1232	int error;
	1233
	1234	/*
	1235	* If no async daemons then return EIO to force caller to run the rpc
	1236	* synchronously.
	1237	*/
	1238	if (nfs_numasync == 0)
	1239	return (EIO);
	1240
	1241	nmp = VFSTONFS(bp->b_vp->v_mount);
	1242
	1243	/*
	1244	* Commits are usually short and sweet so lets save some cpu and
	1245	* leave the async daemons for more important rpc's (such as reads
	1246	* and writes).
	1247	*/
	1248	if ((bp->b_flags & (B_READ\|B_NEEDCOMMIT)) == B_NEEDCOMMIT &&
	1249	(nmp->nm_bufqiods > nfs_numasync / 2)) {
	1250	return(EIO);
	1251	}
	1252
	1253	again:
	1254	if (nmp->nm_flag & NFSMNT_INT)
	1255	slpflag = PCATCH;
	1256	gotiod = FALSE;
	1257
	1258	/*
	1259	* Find a free iod to process this request.
	1260	*/
	1261	for (i = 0; i < NFS_MAXASYNCDAEMON; i++)
	1262	if (nfs_iodwant[i]) {
	1263	/*
	1264	* Found one, so wake it up and tell it which
	1265	* mount to process.
	1266	*/
	1267	NFS_DPF(ASYNCIO,
	1268	("nfs_asyncio: waking iod %d for mount %p\n",
	1269	i, nmp));
	1270	nfs_iodwant[i] = (struct proc *)0;
	1271	nfs_iodmount[i] = nmp;
	1272	nmp->nm_bufqiods++;
	1273	wakeup((caddr_t)&nfs_iodwant[i]);
	1274	gotiod = TRUE;
	1275	break;
	1276	}
	1277
	1278	/*
	1279	* If none are free, we may already have an iod working on this mount
	1280	* point. If so, it will process our request.
	1281	*/
	1282	if (!gotiod) {
	1283	if (nmp->nm_bufqiods > 0) {
	1284	NFS_DPF(ASYNCIO,
	1285	("nfs_asyncio: %d iods are already processing mount %p\n",
	1286	nmp->nm_bufqiods, nmp));
	1287	gotiod = TRUE;
	1288	}
	1289	}
	1290
	1291	/*
	1292	* If we have an iod which can process the request, then queue
	1293	* the buffer.
	1294	*/
	1295	if (gotiod) {
	1296	/*
	1297	* Ensure that the queue never grows too large. We still want
	1298	* to asynchronize so we block rather then return EIO.
	1299	*/
	1300	while (nmp->nm_bufqlen >= 2*nfs_numasync) {
	1301	NFS_DPF(ASYNCIO,
	1302	("nfs_asyncio: waiting for mount %p queue to drain\n", nmp));
	1303	nmp->nm_bufqwant = TRUE;
	1304	error = tsleep(&nmp->nm_bufq, slpflag \| PRIBIO,
	1305	"nfsaio", slptimeo);
	1306	if (error) {
	1307	if (nfs_sigintr(nmp, NULL, procp))
	1308	return (EINTR);
	1309	if (slpflag == PCATCH) {
	1310	slpflag = 0;
	1311	slptimeo = 2 * hz;
	1312	}
	1313	}
	1314	/*
	1315	* We might have lost our iod while sleeping,
	1316	* so check and loop if nescessary.
	1317	*/
	1318	if (nmp->nm_bufqiods == 0) {
	1319	NFS_DPF(ASYNCIO,
	1320	("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp));
	1321	goto again;
	1322	}
	1323	}
	1324
	1325	if (bp->b_flags & B_READ) {
	1326	if (bp->b_rcred == NOCRED && cred != NOCRED) {
	1327	crhold(cred);
	1328	bp->b_rcred = cred;
	1329	}
	1330	} else {
	1331	bp->b_flags \|= B_WRITEINPROG;
	1332	if (bp->b_wcred == NOCRED && cred != NOCRED) {
	1333	crhold(cred);
	1334	bp->b_wcred = cred;
	1335	}
	1336	}
	1337
	1338	BUF_KERNPROC(bp);
	1339	TAILQ_INSERT_TAIL(&nmp->nm_bufq, bp, b_freelist);
	1340	nmp->nm_bufqlen++;
	1341	return (0);
	1342	}
	1343
	1344	/*
	1345	* All the iods are busy on other mounts, so return EIO to
	1346	* force the caller to process the i/o synchronously.
	1347	*/
	1348	NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n"));
	1349	return (EIO);
	1350	}
	1351
	1352	/*
	1353	* Do an I/O operation to/from a cache block. This may be called
	1354	* synchronously or from an nfsiod.
	1355	*/
		/*
		 * bp - buffer describing the transfer.  Its flags select the path:
		 *      B_PHYS (raw transfer of b_bcount bytes), B_READ (read by
		 *      vnode type), otherwise write, optionally preceded by a
		 *      commit attempt when B_NEEDCOMMIT is set.
		 * cr - credential handed to the read/write/readlink/readdir rpcs
		 *      (the commit rpc uses bp->b_wcred instead).
		 * p  - process stored in the uio and passed to nfs_commit(); the
		 *      text-file-modification check is skipped when p is NULL.
		 *
		 * biodone(bp) is called on every return path.  Returns 0 when a
		 * commit-only request succeeds or when there is no dirty range to
		 * write; otherwise returns the error value left by the rpc (0 on
		 * success).
		 */
	1356	int
	1357	nfs_doio(bp, cr, p)
	1358	struct buf *bp;
	1359	struct ucred *cr;
	1360	struct proc *p;
	1361	{
	1362	struct uio *uiop;
	1363	struct vnode *vp;
	1364	struct nfsnode *np;
	1365	struct nfsmount *nmp;
	1366	int error = 0, iomode, must_commit = 0;
	1367	struct uio uio;
	1368	struct iovec io;
	1369
		/* Build a single-segment, kernel-space uio on the stack. */
	1370	vp = bp->b_vp;
	1371	np = VTONFS(vp);
	1372	nmp = VFSTONFS(vp->v_mount);
	1373	uiop = &uio;
	1374	uiop->uio_iov = &io;
	1375	uiop->uio_iovcnt = 1;
	1376	uiop->uio_segflg = UIO_SYSSPACE;
	1377	uiop->uio_procp = p;
	1378
	1379	/*
	1380	* clear B_ERROR and B_INVAL state prior to initiating the I/O. We
	1381	* do this here so we do not have to do it in all the code that
	1382	* calls us.
	1383	*/
	1384	bp->b_flags &= ~(B_ERROR \| B_INVAL);
	1385
	1386	KASSERT(!(bp->b_flags & B_DONE), ("nfs_doio: bp %p already marked done", bp));
	1387
	1388	/*
	1389	* Historically, paging was done with physio, but no more.
	1390	*/
	1391	if (bp->b_flags & B_PHYS) {
	1392	/*
	1393	* ...though reading /dev/drum still gets us here.
	1394	*/
	1395	io.iov_len = uiop->uio_resid = bp->b_bcount;
	1396	/* mapping was done by vmapbuf() */
	1397	io.iov_base = bp->b_data;
	1398	uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	1399	if (bp->b_flags & B_READ) {
	1400	uiop->uio_rw = UIO_READ;
	1401	nfsstats.read_physios++;
	1402	error = nfs_readrpc(vp, uiop, cr);
	1403	} else {
		/* The commit indication stored in 'com' is not examined here. */
	1404	int com;
	1405
	1406	iomode = NFSV3WRITE_DATASYNC;
	1407	uiop->uio_rw = UIO_WRITE;
	1408	nfsstats.write_physios++;
	1409	error = nfs_writerpc(vp, uiop, cr, &iomode, &com);
	1410	}
	1411	if (error) {
	1412	bp->b_flags \|= B_ERROR;
	1413	bp->b_error = error;
	1414	}
	1415	} else if (bp->b_flags & B_READ) {
		/* Buffered read: the whole buffer (b_bcount bytes) is requested. */
	1416	io.iov_len = uiop->uio_resid = bp->b_bcount;
	1417	io.iov_base = bp->b_data;
	1418	uiop->uio_rw = UIO_READ;
	1419
	1420	switch (vp->v_type) {
	1421	case VREG:
	1422	uiop->uio_offset = ((off_t)bp->b_blkno) * DEV_BSIZE;
	1423	nfsstats.read_bios++;
	1424	error = nfs_readrpc(vp, uiop, cr);
	1425
	1426	if (!error) {
	1427	if (uiop->uio_resid) {
	1428	/*
	1429	* If we had a short read with no error, we must have
	1430	* hit a file hole. We should zero-fill the remainder.
	1431	* This can also occur if the server hits the file EOF.
	1432	*
	1433	* Holes used to be able to occur due to pending
	1434	* writes, but that is not possible any longer.
	1435	*/
	1436	int nread = bp->b_bcount - uiop->uio_resid;
	1437	int left = uiop->uio_resid;
	1438
	1439	if (left > 0)
	1440	bzero((char *)bp->b_data + nread, left);
	1441	uiop->uio_resid = 0;
	1442	}
	1443	}
		/*
		 * If the vnode is marked VTEXT and looks modified (under NQNFS:
		 * NQNFS_CKINVALID() is true and n_lrev differs from n_brev;
		 * otherwise: cached n_mtime differs from the attribute mtime),
		 * the requesting process is sent SIGKILL.
		 */
	1444	if (p && (vp->v_flag & VTEXT) &&
	1445	(((nmp->nm_flag & NFSMNT_NQNFS) &&
	1446	NQNFS_CKINVALID(vp, np, ND_READ) &&
	1447	np->n_lrev != np->n_brev) \|\|
	1448	(!(nmp->nm_flag & NFSMNT_NQNFS) &&
	1449	np->n_mtime != np->n_vattr.va_mtime.tv_sec))) {
	1450	uprintf("Process killed due to text file modification\n");
	1451	psignal(p, SIGKILL);
	1452	PHOLD(p);
	1453	}
	1454	break;
	1455	case VLNK:
	1456	uiop->uio_offset = (off_t)0;
	1457	nfsstats.readlink_bios++;
	1458	error = nfs_readlinkrpc(vp, uiop, cr);
	1459	break;
	1460	case VDIR:
	1461	nfsstats.readdir_bios++;
	1462	uiop->uio_offset = ((u_quad_t)bp->b_lblkno) * NFS_DIRBLKSIZ;
		/*
		 * Try readdirplus first when enabled.  If the server answers
		 * NFSERR_NOTSUPP the option is switched off for this mount, and
		 * the plain readdir rpc below is then used instead.
		 */
	1463	if (nmp->nm_flag & NFSMNT_RDIRPLUS) {
	1464	error = nfs_readdirplusrpc(vp, uiop, cr);
	1465	if (error == NFSERR_NOTSUPP)
	1466	nmp->nm_flag &= ~NFSMNT_RDIRPLUS;
	1467	}
	1468	if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0)
	1469	error = nfs_readdirrpc(vp, uiop, cr);
	1470	/*
	1471	* end-of-directory sets B_INVAL but does not generate an
	1472	* error.
	1473	*/
	1474	if (error == 0 && uiop->uio_resid == bp->b_bcount)
	1475	bp->b_flags \|= B_INVAL;
	1476	break;
	1477	default:
		/* Unexpected vnode type: only logged, error is left unchanged. */
	1478	printf("nfs_doio: type %x unexpected\n",vp->v_type);
	1479	break;
	1480	};
	1481	if (error) {
	1482	bp->b_flags \|= B_ERROR;
	1483	bp->b_error = error;
	1484	}
	1485	} else {
	1486	/*
	1487	* If we only need to commit, try to commit
	1488	*/
	1489	if (bp->b_flags & B_NEEDCOMMIT) {
	1490	int retv;
	1491	off_t off;
	1492
	1493	off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE + bp->b_dirtyoff;
	1494	bp->b_flags \|= B_WRITEINPROG;
	1495	retv = nfs_commit(
	1496	bp->b_vp, off, bp->b_dirtyend-bp->b_dirtyoff,
	1497	bp->b_wcred, p);
	1498	bp->b_flags &= ~B_WRITEINPROG;
		/* Commit succeeded: the buffer is clean, finish it right here. */
	1499	if (retv == 0) {
	1500	bp->b_dirtyoff = bp->b_dirtyend = 0;
	1501	bp->b_flags &= ~(B_NEEDCOMMIT \| B_CLUSTEROK);
	1502	bp->b_resid = 0;
	1503	biodone(bp);
	1504	return (0);
	1505	}
		/*
		 * Any failed commit falls through to the write below; a stale
		 * write verifier additionally triggers nfs_clearcommit() for the
		 * whole mount.
		 */
	1506	if (retv == NFSERR_STALEWRITEVERF) {
	1507	nfs_clearcommit(bp->b_vp->v_mount);
	1508	}
	1509	}
	1510
	1511	/*
	1512	* Setup for actual write
	1513	*/
	1514
		/* Clamp the dirty range so it does not extend past np->n_size. */
	1515	if ((off_t)bp->b_blkno * DEV_BSIZE + bp->b_dirtyend > np->n_size)
	1516	bp->b_dirtyend = np->n_size - (off_t)bp->b_blkno * DEV_BSIZE;
	1517
	1518	if (bp->b_dirtyend > bp->b_dirtyoff) {
	1519	io.iov_len = uiop->uio_resid = bp->b_dirtyend
	1520	- bp->b_dirtyoff;
	1521	uiop->uio_offset = (off_t)bp->b_blkno * DEV_BSIZE
	1522	+ bp->b_dirtyoff;
	1523	io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
	1524	uiop->uio_rw = UIO_WRITE;
	1525	nfsstats.write_bios++;
	1526
		/*
		 * Only a B_ASYNC buffer with none of B_NEEDCOMMIT, B_NOCACHE or
		 * B_CLUSTER set is written UNSTABLE; everything else is written
		 * FILESYNC.
		 */
	1527	if ((bp->b_flags & (B_ASYNC \| B_NEEDCOMMIT \| B_NOCACHE \| B_CLUSTER)) == B_ASYNC)
	1528	iomode = NFSV3WRITE_UNSTABLE;
	1529	else
	1530	iomode = NFSV3WRITE_FILESYNC;
	1531
	1532	bp->b_flags \|= B_WRITEINPROG;
	1533	error = nfs_writerpc(vp, uiop, cr, &iomode, &must_commit);
	1534
	1535	/*
	1536	* When setting B_NEEDCOMMIT also set B_CLUSTEROK to try
	1537	* to cluster the buffers needing commit. This will allow
	1538	* the system to submit a single commit rpc for the whole
	1539	* cluster. We can do this even if the buffer is not 100%
	1540	* dirty (relative to the NFS blocksize), so we optimize the
	1541	* append-to-file-case.
	1542	*
	1543	* (when clearing B_NEEDCOMMIT, B_CLUSTEROK must also be
	1544	* cleared because write clustering only works for commit
	1545	* rpc's, not for the data portion of the write).
	1546	*/
	1547
	1548	if (!error && iomode == NFSV3WRITE_UNSTABLE) {
	1549	bp->b_flags \|= B_NEEDCOMMIT;
	1550	if (bp->b_dirtyoff == 0
	1551	&& bp->b_dirtyend == bp->b_bcount)
	1552	bp->b_flags \|= B_CLUSTEROK;
	1553	} else {
	1554	bp->b_flags &= ~(B_NEEDCOMMIT \| B_CLUSTEROK);
	1555	}
	1556	bp->b_flags &= ~B_WRITEINPROG;
	1557
	1558	/*
	1559	* For an interrupted write, the buffer is still valid
	1560	* and the write hasn't been pushed to the server yet,
	1561	* so we can't set B_ERROR and report the interruption
	1562	* by setting B_EINTR. For the B_ASYNC case, B_EINTR
	1563	* is not relevant, so the rpc attempt is essentially
	1564	* a noop. For the case of a V3 write rpc not being
	1565	* committed to stable storage, the block is still
	1566	* dirty and requires either a commit rpc or another
	1567	* write rpc with iomode == NFSV3WRITE_FILESYNC before
	1568	* the block is reused. This is indicated by setting
	1569	* the B_DELWRI and B_NEEDCOMMIT flags.
	1570	*
	1571	* If the buffer is marked B_PAGING, it does not reside on
	1572	* the vp's paging queues so we cannot call bdirty(). The
	1573	* bp in this case is not an NFS cache block so we should
	1574	* be safe. XXX
	1575	*/
	1576	if (error == EINTR
	1577	\|\| (!error && (bp->b_flags & B_NEEDCOMMIT))) {
	1578	int s;
	1579
	1580	s = splbio();
	1581	bp->b_flags &= ~(B_INVAL\|B_NOCACHE);
	1582	if ((bp->b_flags & B_PAGING) == 0) {
	1583	bdirty(bp);
	1584	bp->b_flags &= ~B_DONE;
	1585	}
	1586	if (error && (bp->b_flags & B_ASYNC) == 0)
	1587	bp->b_flags \|= B_EINTR;
	1588	splx(s);
	1589	} else {
		/*
		 * Write finished (stable) or failed with a hard error: record
		 * any error on both the buffer and the nfsnode, and drop the
		 * dirty range.
		 */
	1590	if (error) {
	1591	bp->b_flags \|= B_ERROR;
	1592	bp->b_error = np->n_error = error;
	1593	np->n_flag \|= NWRITEERR;
	1594	}
	1595	bp->b_dirtyoff = bp->b_dirtyend = 0;
	1596	}
	1597	} else {
		/* Empty dirty range: nothing to send, complete the buffer. */
	1598	bp->b_resid = 0;
	1599	biodone(bp);
	1600	return (0);
	1601	}
	1602	}
		/*
		 * Common completion for the raw, read and write-rpc paths.
		 * must_commit is only ever set by the buffered nfs_writerpc()
		 * call above.
		 */
	1603	bp->b_resid = uiop->uio_resid;
	1604	if (must_commit)
	1605	nfs_clearcommit(vp->v_mount);
	1606	biodone(bp);
	1607	return (error);
	1608	}
	1609
	1610	/*
	1611	* Used to aid in handling ftruncate() operations on the NFS client side.
	1612	* Truncation creates a number of special problems for NFS. We have to
	1613	* throw away VM pages and buffer cache buffers that are beyond EOF, and
	1614	* we have to properly handle VM pages or (potentially dirty) buffers
	1615	* that straddle the truncation point.
	1616	*/
	1617
	1618	int
	1619	nfs_meta_setsize(struct vnode vp, struct ucred cred, struct proc *p, u_quad_t nsize)
	1620	{
	1621	struct nfsnode *np = VTONFS(vp);
	1622	u_quad_t tsize = np->n_size;
	1623	int biosize = vp->v_mount->mnt_stat.f_iosize;
	1624	int error = 0;
	1625
	1626	np->n_size = nsize;
	1627
	1628	if (np->n_size < tsize) {
	1629	struct buf *bp;
	1630	daddr_t lbn;
	1631	int bufsize;
	1632
	1633	/*
	1634	* vtruncbuf() doesn't get the buffer overlapping the
	1635	* truncation point. We may have a B_DELWRI and/or B_CACHE
	1636	* buffer that now needs to be truncated.
	1637	*/
	1638	error = vtruncbuf(vp, cred, p, nsize, biosize);
	1639	lbn = nsize / biosize;
	1640	bufsize = nsize & (biosize - 1);
	1641	bp = nfs_getcacheblk(vp, lbn, bufsize, p);
	1642	if (bp->b_dirtyoff > bp->b_bcount)
	1643	bp->b_dirtyoff = bp->b_bcount;
	1644	if (bp->b_dirtyend > bp->b_bcount)
	1645	bp->b_dirtyend = bp->b_bcount;
	1646	bp->b_flags \|= B_RELBUF; /* don't leave garbage around */
	1647	brelse(bp);
	1648	} else {
	1649	vnode_pager_setsize(vp, nsize);
	1650	}
	1651	return(error);
	1652	}
	1653