| Commit | Line | Data |
|---|---|---|
| 984263bc MD |
1 | /* |
| 2 | * Copyright (c) 1989, 1993 | |
| 3 | * The Regents of the University of California. All rights reserved. | |
| 4 | * | |
| 5 | * This code is derived from software contributed to Berkeley by | |
| 6 | * Rick Macklem at The University of Guelph. | |
| 7 | * | |
| 8 | * Redistribution and use in source and binary forms, with or without | |
| 9 | * modification, are permitted provided that the following conditions | |
| 10 | * are met: | |
| 11 | * 1. Redistributions of source code must retain the above copyright | |
| 12 | * notice, this list of conditions and the following disclaimer. | |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 14 | * notice, this list of conditions and the following disclaimer in the | |
| 15 | * documentation and/or other materials provided with the distribution. | |
| 16 | * 3. All advertising materials mentioning features or use of this software | |
| 17 | * must display the following acknowledgement: | |
| 18 | * This product includes software developed by the University of | |
| 19 | * California, Berkeley and its contributors. | |
| 20 | * 4. Neither the name of the University nor the names of its contributors | |
| 21 | * may be used to endorse or promote products derived from this software | |
| 22 | * without specific prior written permission. | |
| 23 | * | |
| 24 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
| 25 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 26 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
| 28 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 30 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 31 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 32 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 33 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 34 | * SUCH DAMAGE. | |
| 35 | * | |
| 36 | * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 | |
| 79e5012e | 37 | * $FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_bio.c,v 1.130 2004/04/14 23:23:55 peadar Exp $ |
| 984263bc MD |
38 | */ |
| 39 | ||
| 40 | ||
| 41 | #include <sys/param.h> | |
| 42 | #include <sys/systm.h> | |
| 43 | #include <sys/resourcevar.h> | |
| 44 | #include <sys/signalvar.h> | |
| 45 | #include <sys/proc.h> | |
| 46 | #include <sys/buf.h> | |
| 47 | #include <sys/vnode.h> | |
| 48 | #include <sys/mount.h> | |
| 49 | #include <sys/kernel.h> | |
| edb90c22 | 50 | #include <sys/mbuf.h> |
| 984263bc MD |
51 | |
| 52 | #include <vm/vm.h> | |
| 53 | #include <vm/vm_extern.h> | |
| 54 | #include <vm/vm_page.h> | |
| 55 | #include <vm/vm_object.h> | |
| 56 | #include <vm/vm_pager.h> | |
| 57 | #include <vm/vnode_pager.h> | |
| 58 | ||
| edb90c22 | 59 | #include <sys/buf2.h> |
| 165dba55 | 60 | #include <sys/thread2.h> |
| 1a54183b | 61 | #include <vm/vm_page2.h> |
| 165dba55 | 62 | |
| 1f2de5d4 MD |
63 | #include "rpcv2.h" |
| 64 | #include "nfsproto.h" | |
| 65 | #include "nfs.h" | |
| 66 | #include "nfsmount.h" | |
| 1f2de5d4 | 67 | #include "nfsnode.h" |
| edb90c22 MD |
68 | #include "xdr_subs.h" |
| 69 | #include "nfsm_subs.h" | |
| 70 | ||
| 984263bc | 71 | |
| 54078292 MD |
72 | static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset, |
| 73 | int size, struct thread *td); | |
| b66959e2 | 74 | static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen); |
| ae8e83e6 | 75 | static void nfsiodone_sync(struct bio *bio); |
| cc7d050e MD |
76 | static void nfs_readrpc_bio_done(nfsm_info_t info); |
| 77 | static void nfs_writerpc_bio_done(nfsm_info_t info); | |
| 78 | static void nfs_commitrpc_bio_done(nfsm_info_t info); | |
| 984263bc | 79 | |
| 984263bc | 80 | /* |
| 984263bc MD |
81 | * Vnode op for read using bio |
 *
 * Copies data from the buffer cache of vp into uio, one cache block per
 * loop pass, until the uio is exhausted, an error occurs, or a pass
 * yields no bytes (n == 0).  VREG, VLNK and VDIR vnodes each have their
 * own case; any other vnode type only logs a message via kprintf().
 *
 * ioflag: shifted right by IO_SEQSHIFT to derive seqcount, which bounds
 *         the number of read-ahead blocks started for VREG.
 *
 * Returns 0 on success, including an empty request (uio_resid == 0) and
 * a directory read at or beyond n_direofoffset.  Returns EINVAL for a
 * negative uio_offset or a directory offset that is not 4-byte aligned,
 * EFBIG when a non-directory request extends past nm_maxfilesize, EINTR
 * when nfs_getcacheblk() returns NULL, and otherwise whatever error
 * VOP_GETATTR(), nfs_vinvalbuf(), nfs_doio(), nfs_check_dirent(),
 * vop_write_dirent() or uiomovebp() reported.
| 82 | */ | |
| 83 | int | |
| 3b568787 | 84 | nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag) |
| 984263bc | 85 | { |
| dadab5e9 MD |
86 | struct nfsnode *np = VTONFS(vp); |
| 87 | int biosize, i; | |
| a63246d1 | 88 | struct buf *bp, *rabp; |
| 984263bc | 89 | struct vattr vattr; |
| dadab5e9 | 90 | struct thread *td; |
| 984263bc | 91 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); |
| a63246d1 | 92 | off_t lbn, rabn; |
| 54078292 MD |
93 | off_t raoffset; |
| 94 | off_t loffset; | |
| 984263bc | 95 | int seqcount; |
| a63246d1 MD |
96 | int nra, error = 0; |
| 97 | int boff = 0; | |
| 98 | size_t n; | |
| 984263bc MD |
99 |
| 100 | #ifdef DIAGNOSTIC | |
| 101 | if (uio->uio_rw != UIO_READ) | |
| 102 | panic("nfs_read mode"); | |
| 103 | #endif | |
| 104 | if (uio->uio_resid == 0) | |
| 105 | return (0); | |
| 106 | if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ | |
| 107 | return (EINVAL); | |
| dadab5e9 | 108 | td = uio->uio_td; |
| 984263bc MD |
109 |
| 110 | if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && | |
| 111 | (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) | |
| 3b568787 | 112 | (void)nfs_fsinfo(nmp, vp, td); |
| 984263bc MD |
113 | if (vp->v_type != VDIR && |
| 114 | (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) | |
| 115 | return (EFBIG); | |
| 116 | biosize = vp->v_mount->mnt_stat.f_iosize; | |
| 117 | seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE); | |
| 5a9187cb | 118 |
| 984263bc MD |
119 | /* |
| 120 | * For nfs, cache consistency can only be maintained approximately. | |
| 121 | * Although RFC1094 does not specify the criteria, the following is | |
| 122 | * believed to be compatible with the reference port. | |
| 5a9187cb | 123 | * |
| 5a9187cb MD |
124 | * NFS: If local changes have been made and this is a |
| 125 | * directory, the directory must be invalidated and | |
| 126 | * the attribute cache must be cleared. | |
| 127 | * | |
| 128 | * GETATTR is called to synchronize the file size. | |
| 129 | * | |
| 130 | * If remote changes are detected local data is flushed | |
| 131 | * and the cache is invalidated. | |
| 132 | * | |
| 5a9187cb MD |
133 | * NOTE: In the normal case the attribute cache is not |
| 134 | * cleared which means GETATTR may use cached data and | |
| 135 | * not immediately detect changes made on the server. | |
| 984263bc | 136 | */ |
| e07fef60 MD |
137 | if ((np->n_flag & NLMODIFIED) && vp->v_type == VDIR) { |
| 138 | nfs_invaldir(vp); | |
| 87de5057 | 139 | error = nfs_vinvalbuf(vp, V_SAVE, 1); |
| e07fef60 MD |
140 | if (error) |
| 141 | return (error); | |
| 142 | np->n_attrstamp = 0; | |
| 143 | } | |
| 87de5057 | 144 | error = VOP_GETATTR(vp, &vattr); |
| e07fef60 MD |
145 | if (error) |
| 146 | return (error); | |
| 8452310f MD |
147 |
| 148 | /* | |
| 149 | * This can deadlock getpages/putpages for regular | |
| 150 | * files. Only do it for directories. | |
| 151 | */ | |
| e07fef60 | 152 | if (np->n_flag & NRMODIFIED) { |
| 8452310f | 153 | if (vp->v_type == VDIR) { |
| 5a9187cb | 154 | nfs_invaldir(vp); |
| 8452310f MD |
155 | error = nfs_vinvalbuf(vp, V_SAVE, 1); |
| 156 | if (error) | |
| 157 | return (error); | |
| 158 | np->n_flag &= ~NRMODIFIED; | |
| 159 | } | |
| 984263bc | 160 | } |
| a63246d1 MD |
161 |
| 162 | /* | |
| 163 | * Loop until uio exhausted or we hit EOF | |
| 164 | */ | |
| 984263bc | 165 | do { |
| a63246d1 MD |
166 | bp = NULL; |
| 167 | ||
| 984263bc MD |
168 | switch (vp->v_type) { |
| 169 | case VREG: | |
| 170 | nfsstats.biocache_reads++; | |
| 171 | lbn = uio->uio_offset / biosize; | |
| a63246d1 | 172 | boff = uio->uio_offset & (biosize - 1); |
| 54078292 | 173 | loffset = (off_t)lbn * biosize; |
| 984263bc MD |
174 |
| 175 | /* | |
| 176 | * Start the read ahead(s), as required. | |
| 177 | */ | |
| edb90c22 | 178 | if (nmp->nm_readahead > 0 && nfs_asyncok(nmp)) { |
| 984263bc MD |
179 | for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && |
| 180 | (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { | |
| 181 | rabn = lbn + 1 + nra; | |
| 54078292 | 182 | raoffset = (off_t)rabn * biosize; |
| b1c20cfa | 183 | if (findblk(vp, raoffset, FINDBLK_TEST) == NULL) { |
| 54078292 | 184 | rabp = nfs_getcacheblk(vp, raoffset, biosize, td); |
| 984263bc MD |
185 | if (!rabp) |
| 186 | return (EINTR); | |
| 187 | if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { | |
| 10f3fee5 MD |
188 | rabp->b_cmd = BUF_CMD_READ; |
| 189 | vfs_busy_pages(vp, rabp); | |
| edb90c22 | 190 | nfs_asyncio(vp, &rabp->b_bio2); |
| 984263bc MD |
191 | } else { |
| 192 | brelse(rabp); | |
| 193 | } | |
| 194 | } | |
| 195 | } | |
| 196 | } | |
| 197 | ||
| 198 | /* | |
| 199 | * Obtain the buffer cache block. Figure out the buffer size | |
| 200 | * when we are at EOF. If we are modifying the size of the | |
| 201 | * buffer based on an EOF condition we need to hold | |
| 202 | * nfs_rslock() through obtaining the buffer to prevent | |
| 203 | * a potential writer-appender from messing with n_size. | |
| 204 | * Otherwise we may accidentally truncate the buffer and | |
| 205 | * lose dirty data. | |
| 206 | * | |
| 207 | * Note that bcount is *not* DEV_BSIZE aligned. | |
 *
 * NOTE(review): no nfs_rslock() call is visible in this
 * function and the block is always requested at the full
 * biosize; the locking remark above may be stale - confirm.
| 208 | */ | |
| a63246d1 MD |
209 | if (loffset + boff >= np->n_size) { |
| 210 | n = 0; | |
| 211 | break; | |
| 984263bc | 212 | } |
| a63246d1 | 213 | bp = nfs_getcacheblk(vp, loffset, biosize, td); |
| 984263bc | 214 |
| a63246d1 | 215 | if (bp == NULL) |
| 984263bc MD |
216 | return (EINTR); |
| 217 | ||
| 218 | /* | |
| 219 | * If B_CACHE is not set, we must issue the read. If this | |
| 220 | * fails, we return an error. | |
| 221 | */ | |
| 984263bc | 222 | if ((bp->b_flags & B_CACHE) == 0) { |
| 28953d39 MD |
223 | bp->b_cmd = BUF_CMD_READ; |
| 224 | bp->b_bio2.bio_done = nfsiodone_sync; | |
| 225 | bp->b_bio2.bio_flags |= BIO_SYNC; | |
| 226 | vfs_busy_pages(vp, bp); | |
| 227 | error = nfs_doio(vp, &bp->b_bio2, td); | |
| 228 | if (error) { | |
| 229 | brelse(bp); | |
| 230 | return (error); | |
| 231 | } | |
| 984263bc MD |
232 | } |
| 233 | ||
| 234 | /* | |
| 235 | * boff is the offset into the current bp. Figure out how many | |
| 236 | * bytes we can copy out of the bp. Note that bcount is | |
| 237 | * NOT DEV_BSIZE aligned. | |
| 238 | * | |
| 239 | * Then figure out how many bytes we can copy into the uio. | |
| 240 | */ | |
| a63246d1 MD |
241 | n = biosize - boff; |
| 242 | if (n > uio->uio_resid) | |
| 243 | n = uio->uio_resid; | |
| 244 | if (loffset + boff + n > np->n_size) | |
| 245 | n = np->n_size - loffset - boff; | |
| 984263bc MD |
246 | break; |
| 247 | case VLNK: | |
| ded0173f | 248 | biosize = min(NFS_MAXPATHLEN, np->n_size); |
| 984263bc | 249 | nfsstats.biocache_readlinks++; |
| ded0173f | 250 | bp = nfs_getcacheblk(vp, (off_t)0, biosize, td); |
| 81b5c339 | 251 | if (bp == NULL) |
| 984263bc MD |
252 | return (EINTR); |
| 253 | if ((bp->b_flags & B_CACHE) == 0) { | |
| 28953d39 MD |
254 | bp->b_cmd = BUF_CMD_READ; |
| 255 | bp->b_bio2.bio_done = nfsiodone_sync; | |
| 256 | bp->b_bio2.bio_flags |= BIO_SYNC; | |
| 257 | vfs_busy_pages(vp, bp); | |
| 258 | error = nfs_doio(vp, &bp->b_bio2, td); | |
| 259 | if (error) { | |
| 260 | bp->b_flags |= B_ERROR | B_INVAL; | |
| 261 | brelse(bp); | |
| 262 | return (error); | |
| 263 | } | |
| 984263bc | 264 | } |
| a63246d1 MD |
265 | n = szmin(uio->uio_resid, (size_t)bp->b_bcount - bp->b_resid); |
| 266 | boff = 0; | |
| 984263bc MD |
267 | break; |
| 268 | case VDIR: | |
| 269 | nfsstats.biocache_readdirs++; | |
| a63246d1 MD |
270 | if (np->n_direofoffset && |
| 271 | uio->uio_offset >= np->n_direofoffset | |
| 272 | ) { | |
| 273 | return (0); | |
| 984263bc MD |
274 | } |
| 275 | lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; | |
| a63246d1 MD |
276 | boff = uio->uio_offset & (NFS_DIRBLKSIZ - 1); |
| 277 | loffset = uio->uio_offset - boff; | |
| 54078292 | 278 | bp = nfs_getcacheblk(vp, loffset, NFS_DIRBLKSIZ, td); |
| 81b5c339 | 279 | if (bp == NULL) |
| a63246d1 | 280 | return (EINTR); |
| b66959e2 | 281 |
| 984263bc | 282 | if ((bp->b_flags & B_CACHE) == 0) { |
| 10f3fee5 | 283 | bp->b_cmd = BUF_CMD_READ; |
| ae8e83e6 MD |
284 | bp->b_bio2.bio_done = nfsiodone_sync; |
| 285 | bp->b_bio2.bio_flags |= BIO_SYNC; | |
| 10f3fee5 | 286 | vfs_busy_pages(vp, bp); |
| cc7d050e MD |
287 | error = nfs_doio(vp, &bp->b_bio2, td); |
| 288 | if (error) | |
| 984263bc | 289 | brelse(bp); |
| 984263bc | 290 | while (error == NFSERR_BAD_COOKIE) { |
| 086c1d7e | 291 | kprintf("got bad cookie vp %p bp %p\n", vp, bp); |
| 984263bc | 292 | nfs_invaldir(vp); |
| 87de5057 | 293 | error = nfs_vinvalbuf(vp, 0, 1); |
| 984263bc MD |
294 | /* |
| 295 | * Yuck! The directory has been modified on the | |
| 296 | * server. The only way to get the block is by | |
| 297 | * reading from the beginning to get all the | |
| 298 | * offset cookies. | |
| 299 | * | |
| 300 | * Leave the last bp intact unless there is an error. | |
| 301 | * Loop back up to the while if the error is another | |
| 302 | * NFSERR_BAD_COOKIE (double yuch!). | |
| 303 | */ | |
| 304 | for (i = 0; i <= lbn && !error; i++) { | |
| 305 | if (np->n_direofoffset | |
| 306 | && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) | |
| 307 | return (0); | |
| 54078292 MD |
308 | bp = nfs_getcacheblk(vp, (off_t)i * NFS_DIRBLKSIZ, |
| 309 | NFS_DIRBLKSIZ, td); | |
| 984263bc MD |
310 | if (!bp) |
| 311 | return (EINTR); | |
| 312 | if ((bp->b_flags & B_CACHE) == 0) { | |
| 10f3fee5 | 313 | bp->b_cmd = BUF_CMD_READ; |
| ae8e83e6 MD |
314 | bp->b_bio2.bio_done = nfsiodone_sync; |
| 315 | bp->b_bio2.bio_flags |= BIO_SYNC; | |
| 10f3fee5 | 316 | vfs_busy_pages(vp, bp); |
| cc7d050e | 317 | error = nfs_doio(vp, &bp->b_bio2, td); |
| 984263bc MD |
318 | /* |
| 319 | * no error + B_INVAL == directory EOF, | |
| 320 | * use the block. | |
| 321 | */ | |
| 322 | if (error == 0 && (bp->b_flags & B_INVAL)) | |
| 323 | break; | |
| 324 | } | |
| 325 | /* | |
| 326 | * An error will throw away the block and the | |
| 327 | * for loop will break out. If no error and this | |
| 328 | * is not the block we want, we throw away the | |
| 329 | * block and go for the next one via the for loop. | |
| 330 | */ | |
| 331 | if (error || i < lbn) | |
| 332 | brelse(bp); | |
| 333 | } | |
| 334 | } | |
| 335 | /* | |
| 336 | * The above while is repeated if we hit another cookie | |
| 337 | * error. If we hit an error and it wasn't a cookie error, | |
| 338 | * we give up. | |
| 339 | */ | |
| 340 | if (error) | |
| 341 | return (error); | |
| 342 | } | |
| 343 | ||
| 344 | /* | |
| 345 | * If not eof and read aheads are enabled, start one. | |
| 346 | * (You need the current block first, so that you have the | |
| 347 | * directory offset cookie of the next block.) | |
| 348 | */ | |
| edb90c22 | 349 | if (nmp->nm_readahead > 0 && nfs_asyncok(nmp) && |
| 984263bc MD |
350 | (bp->b_flags & B_INVAL) == 0 && |
| 351 | (np->n_direofoffset == 0 || | |
| 54078292 | 352 | loffset + NFS_DIRBLKSIZ < np->n_direofoffset) && |
| b1c20cfa MD |
353 | findblk(vp, loffset + NFS_DIRBLKSIZ, FINDBLK_TEST) == NULL |
| 354 | ) { | |
| 54078292 MD |
355 | rabp = nfs_getcacheblk(vp, loffset + NFS_DIRBLKSIZ, |
| 356 | NFS_DIRBLKSIZ, td); | |
| 984263bc MD |
357 | if (rabp) { |
| 358 | if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { | |
| 10f3fee5 MD |
359 | rabp->b_cmd = BUF_CMD_READ; |
| 360 | vfs_busy_pages(vp, rabp); | |
| edb90c22 | 361 | nfs_asyncio(vp, &rabp->b_bio2); |
| 984263bc MD |
362 | } else { |
| 363 | brelse(rabp); | |
| 364 | } | |
| 365 | } | |
| 366 | } | |
| 367 | /* | |
| 368 | * Unlike VREG files, whose buffer size ( bp->b_bcount ) is | |
| 369 | * chopped for the EOF condition, we cannot tell how large | |
| 370 | * NFS directories are going to be until we hit EOF. So | |
| 371 | * an NFS directory buffer is *not* chopped to its EOF. Now, | |
| 372 | * it just so happens that b_resid will effectively chop it | |
| 373 | * to EOF. *BUT* this information is lost if the buffer goes | |
| 374 | * away and is reconstituted into a B_CACHE state ( due to | |
| 375 | * being VMIO ) later. So we keep track of the directory eof | |
| 376 | * in np->n_direofoffset and chop it off as an extra step | |
| 377 | * right here. | |
| c0b6e0f5 MD |
378 | * |
| 379 | * NOTE: boff could already be beyond EOF. | |
| 984263bc | 380 | */ |
| c0b6e0f5 MD |
381 | if ((size_t)boff > NFS_DIRBLKSIZ - bp->b_resid) { |
| 382 | n = 0; | |
| 383 | } else { | |
| 384 | n = szmin(uio->uio_resid, | |
| 385 | NFS_DIRBLKSIZ - bp->b_resid - (size_t)boff); | |
| 386 | } | |
| a63246d1 MD |
387 | if (np->n_direofoffset && |
| 388 | n > (size_t)(np->n_direofoffset - uio->uio_offset)) { | |
| 389 | n = (size_t)(np->n_direofoffset - uio->uio_offset); | |
| 390 | } | |
| 984263bc MD |
391 | break; |
| 392 | default: | |
| 086c1d7e | 393 | kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type); |
| a63246d1 | 394 | n = 0; |
| 984263bc MD |
395 | break; |
| 396 | }; | |
| 397 | ||
| 984263bc MD |
398 | switch (vp->v_type) { |
| 399 | case VREG: | |
| 01f31ab3 | 400 | if (n > 0) |
| 68cdd773 | 401 | error = uiomovebp(bp, bp->b_data + boff, n, uio); |
| 984263bc MD |
402 | break; |
| 403 | case VLNK: | |
| 01f31ab3 | 404 | if (n > 0) |
| 68cdd773 | 405 | error = uiomovebp(bp, bp->b_data + boff, n, uio); |
| 984263bc MD |
406 | n = 0; |
| 407 | break; | |
| 408 | case VDIR: | |
| 01f31ab3 JS |
409 | if (n > 0) { |
| 410 | off_t old_off = uio->uio_offset; | |
| 411 | caddr_t cpos, epos; | |
| 412 | struct nfs_dirent *dp; | |
| 413 | ||
| b66959e2 MD |
414 | /* |
| 415 | * We are casting cpos to nfs_dirent, it must be | |
| 416 | * int-aligned. | |
| 417 | */ | |
| a63246d1 | 418 | if (boff & 3) { |
| b66959e2 MD |
419 | error = EINVAL; |
| 420 | break; | |
| 421 | } | |
| 422 | ||
| a63246d1 MD |
423 | cpos = bp->b_data + boff; |
| 424 | epos = bp->b_data + boff + n; | |
| 01f31ab3 JS |
425 | while (cpos < epos && error == 0 && uio->uio_resid > 0) { |
| 426 | dp = (struct nfs_dirent *)cpos; | |
| b66959e2 MD |
427 | error = nfs_check_dirent(dp, (int)(epos - cpos)); |
| 428 | if (error) | |
| 429 | break; | |
| 01f31ab3 | 430 | if (vop_write_dirent(&error, uio, dp->nfs_ino, |
| b66959e2 | 431 | dp->nfs_type, dp->nfs_namlen, dp->nfs_name)) { |
| 01f31ab3 | 432 | break; |
| b66959e2 | 433 | } |
| 01f31ab3 JS |
434 | cpos += dp->nfs_reclen; |
| 435 | } | |
| 436 | n = 0; | |
| a63246d1 MD |
437 | if (error == 0) { |
| 438 | uio->uio_offset = old_off + cpos - | |
| 439 | bp->b_data - boff; | |
| 440 | } | |
| 01f31ab3 | 441 | } |
| 984263bc MD |
442 | break; |
| 443 | default: | |
| 086c1d7e | 444 | kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type); |
| 984263bc | 445 | } |
| a63246d1 MD |
446 | if (bp) |
| 447 | brelse(bp); | |
| 984263bc MD |
448 | } while (error == 0 && uio->uio_resid > 0 && n > 0); |
| 449 | return (error); | |
| 450 | } | |
| 451 | ||
| 452 | /* | |
| b66959e2 MD |
453 | * Userland can supply any 'seek' offset when reading a NFS directory. |
| 454 | * Validate the structure so we don't panic the kernel. Note that | |
| 455 | * the element name is nul terminated and the nul is not included | |
| 456 | * in nfs_namlen. | |
| 457 | */ | |
| 458 | static | |
| 459 | int | |
| 460 | nfs_check_dirent(struct nfs_dirent *dp, int maxlen) | |
| 461 | { | |
| 462 | int nfs_name_off = offsetof(struct nfs_dirent, nfs_name[0]); | |
| 463 | ||
| 464 | if (nfs_name_off >= maxlen) | |
| 465 | return (EINVAL); | |
| 466 | if (dp->nfs_reclen < nfs_name_off || dp->nfs_reclen > maxlen) | |
| 467 | return (EINVAL); | |
| 468 | if (nfs_name_off + dp->nfs_namlen >= dp->nfs_reclen) | |
| 469 | return (EINVAL); | |
| 470 | if (dp->nfs_reclen & 3) | |
| 471 | return (EINVAL); | |
| 472 | return (0); | |
| 473 | } | |
| 474 | ||
| 475 | /* | |
| 984263bc | 476 | * Vnode op for write using bio |
| e851b29e CP |
477 | * |
| 478 | * nfs_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag, | |
| 479 | * struct ucred *a_cred) | |
 *
 * Only VREG vnodes are accepted (EIO otherwise).  nmp->nm_token is
 * acquired right after that check and released on every return path.
 * A pending NWRITEERR is cleared and reported as np->n_error before
 * any data is written.  Data is copied into cache blocks of f_iosize
 * bytes, one per loop pass, and each block is then pushed with bwrite()
 * (IO_SYNC), bawrite() (the write ends on a block boundary and
 * nfs_async is set) or bdwrite().
 *
 * Returns 0 or an errno: EINVAL for a negative offset, EFBIG when the
 * write would pass nm_maxfilesize or the process RLIMIT_FSIZE (SIGXFSZ
 * is posted in the latter case), EINTR when the rslock or a cache block
 * could not be obtained or bwrite() was interrupted, or an error from
 * nfs_flush(), VOP_GETATTR(), nfs_doio(), uiomovebp() or bwrite().
| 984263bc MD |
480 | */ |
| 481 | int | |
| e851b29e | 482 | nfs_write(struct vop_write_args *ap) |
| 984263bc | 483 | { |
| 984263bc | 484 | struct uio *uio = ap->a_uio; |
| dadab5e9 | 485 | struct thread *td = uio->uio_td; |
| 984263bc MD |
486 | struct vnode *vp = ap->a_vp; |
| 487 | struct nfsnode *np = VTONFS(vp); | |
| 984263bc MD |
488 | int ioflag = ap->a_ioflag; |
| 489 | struct buf *bp; | |
| 490 | struct vattr vattr; | |
| 491 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); | |
| 54078292 | 492 | off_t loffset; |
| a63246d1 MD |
493 | int boff, bytes; |
| 494 | int error = 0; | |
| 984263bc | 495 | int haverslock = 0; |
| 81b5c339 MD |
496 | int bcount; |
| 497 | int biosize; | |
| 8452310f | 498 | int trivial; |
| 984263bc MD |
499 |
| 500 | #ifdef DIAGNOSTIC | |
| 501 | if (uio->uio_rw != UIO_WRITE) | |
| 502 | panic("nfs_write mode"); | |
| 7b95be2a | 503 | if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread) |
| 984263bc MD |
504 | panic("nfs_write proc"); |
| 505 | #endif | |
| 506 | if (vp->v_type != VREG) | |
| 507 | return (EIO); | |
| 104db2fb MD |
508 |
| 509 | lwkt_gettoken(&nmp->nm_token); | |
| 510 | ||
| 984263bc MD |
511 | if (np->n_flag & NWRITEERR) { |
| 512 | np->n_flag &= ~NWRITEERR; | |
| 104db2fb | 513 | lwkt_reltoken(&nmp->nm_token); |
| 984263bc MD |
514 | return (np->n_error); |
| 515 | } | |
| 516 | if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && | |
| 104db2fb | 517 | (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) { |
| 3b568787 | 518 | (void)nfs_fsinfo(nmp, vp, td); |
| 104db2fb | 519 | } |
| 984263bc MD |
520 |
| 521 | /* | |
| 522 | * Synchronously flush pending buffers if we are in synchronous | |
| 523 | * mode or if we are appending. | |
| 524 | */ | |
| 525 | if (ioflag & (IO_APPEND | IO_SYNC)) { | |
| 5a9187cb | 526 | if (np->n_flag & NLMODIFIED) { |
| 984263bc | 527 | np->n_attrstamp = 0; |
| 5a9187cb | 528 | error = nfs_flush(vp, MNT_WAIT, td, 0); |
| 87de5057 | 529 | /* error = nfs_vinvalbuf(vp, V_SAVE, 1); */ |
| 984263bc | 530 | if (error) |
| 104db2fb | 531 | goto done; |
| 984263bc MD |
532 | } |
| 533 | } | |
| 534 | ||
| 535 | /* | |
| 536 | * If IO_APPEND then load uio_offset. We restart here if we cannot | |
| 537 | * get the append lock. | |
| 538 | */ | |
| 539 | restart: | |
| 540 | if (ioflag & IO_APPEND) { | |
| 541 | np->n_attrstamp = 0; | |
| 87de5057 | 542 | error = VOP_GETATTR(vp, &vattr); |
| 984263bc | 543 | if (error) |
| 104db2fb | 544 | goto done; |
| 984263bc MD |
545 | uio->uio_offset = np->n_size; |
| 546 | } | |
| 547 | ||
| 104db2fb MD |
548 | if (uio->uio_offset < 0) { |
| 549 | error = EINVAL; | |
| 550 | goto done; | |
| 551 | } | |
| 552 | if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) { | |
| 553 | error = EFBIG; | |
| 554 | goto done; | |
| 555 | } | |
| 556 | if (uio->uio_resid == 0) { | |
| 557 | error = 0; | |
| 558 | goto done; | |
| 559 | } | |
| 984263bc MD |
560 |
| 561 | /* | |
| 562 | * We need to obtain the rslock if we intend to modify np->n_size | |
| 563 | * in order to guarantee the append point with multiple contending | |
| 564 | * writers, to guarantee that no other appenders modify n_size | |
| 565 | * while we are trying to obtain a truncated buffer (i.e. to avoid | |
| 566 | * accidentally truncating data written by another appender due to | |
| 567 | * the race), and to ensure that the buffer is populated prior to | |
| 568 | * our extending of the file. We hold rslock through the entire | |
| 569 | * operation. | |
| 570 | * | |
| 571 | * Note that we do not synchronize the case where someone truncates | |
| 572 | * the file while we are appending to it because attempting to lock | |
| 573 | * this case may deadlock other parts of the system unexpectedly. | |
| 574 | */ | |
| 575 | if ((ioflag & IO_APPEND) || | |
| 576 | uio->uio_offset + uio->uio_resid > np->n_size) { | |
| 2313ec23 | 577 | switch(nfs_rslock(np)) { |
| 984263bc MD |
578 | case ENOLCK: |
| 579 | goto restart; | |
| 580 | /* not reached */ | |
| 581 | case EINTR: | |
| 582 | case ERESTART: | |
| 104db2fb MD |
583 | error = EINTR; |
| 584 | goto done; | |
| 984263bc MD |
585 | /* not reached */ |
| 586 | default: | |
| 587 | break; | |
| 588 | } | |
| 589 | haverslock = 1; | |
| 590 | } | |
| 591 | ||
| 592 | /* | |
| 593 | * Maybe this should be above the vnode op call, but so long as | |
| 594 | * file servers have no limits, i don't think it matters | |
| 595 | */ | |
| 8452310f | 596 | if (td && td->td_proc && uio->uio_offset + uio->uio_resid > |
| dadab5e9 | 597 | td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { |
| 7278a846 | 598 | lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); |
| 984263bc | 599 | if (haverslock) |
| 2313ec23 | 600 | nfs_rsunlock(np); |
| 104db2fb MD |
601 | error = EFBIG; |
| 602 | goto done; | |
| 984263bc MD |
603 | } |
| 604 | ||
| 605 | biosize = vp->v_mount->mnt_stat.f_iosize; | |
| 606 | ||
| 607 | do { | |
| 984263bc | 608 | nfsstats.biocache_writes++; |
| a63246d1 MD |
609 | boff = uio->uio_offset & (biosize-1); |
| 610 | loffset = uio->uio_offset - boff; | |
| 611 | bytes = (int)szmin((unsigned)(biosize - boff), uio->uio_resid); | |
| 984263bc MD |
612 | again: |
| 613 | /* | |
| 614 | * Handle direct append and file extension cases, calculate | |
| a63246d1 MD |
615 | * unaligned buffer size. When extending B_CACHE will be |
| 616 | * set if possible. See UIO_NOCOPY note below. | |
| 984263bc | 617 | */ |
| a63246d1 MD |
618 | if (uio->uio_offset + bytes > np->n_size) { |
| 619 | np->n_flag |= NLMODIFIED; | |
| 8452310f MD |
620 | trivial = (uio->uio_segflg != UIO_NOCOPY && |
| 621 | uio->uio_offset <= np->n_size); | |
| 622 | nfs_meta_setsize(vp, td, uio->uio_offset + bytes, | |
| 623 | trivial); | |
| 984263bc | 624 | } |
| 8452310f | 625 | bp = nfs_getcacheblk(vp, loffset, biosize, td); |
| 81b5c339 | 626 | if (bp == NULL) { |
| 984263bc MD |
627 | error = EINTR; |
| 628 | break; | |
| 629 | } | |
| 630 | ||
| 631 | /* | |
| a63246d1 MD |
632 | * Actual bytes in buffer which we care about |
| 633 | */ | |
| 634 | if (loffset + biosize < np->n_size) | |
| 635 | bcount = biosize; | |
| 636 | else | |
| 637 | bcount = (int)(np->n_size - loffset); | |
| 638 | ||
| 639 | /* | |
| 28953d39 | 640 | * Avoid a read by setting B_CACHE where the data we |
| a63246d1 MD |
641 | * intend to write covers the entire buffer. Note |
| 642 | * that the buffer may have been set to B_CACHE by | |
| 643 | * nfs_meta_setsize() above or otherwise inherited the | |
| 644 | * flag, but if B_CACHE isn't set the buffer may be | |
| 645 | * uninitialized and must be zero'd to accommodate | |
| 646 | * future seek+write's. | |
| 984263bc | 647 | * |
| 28953d39 | 648 | * See the comments in kern/vfs_bio.c's getblk() for |
| 984263bc MD |
649 | * more information. |
| 650 | * | |
| 8aa7625b MD |
651 | * When doing a UIO_NOCOPY write the buffer is not |
| 652 | * overwritten and we cannot just set B_CACHE unconditionally | |
| 653 | * for full-block writes. | |
| 984263bc | 654 | */ |
| a63246d1 MD |
655 | if (boff == 0 && bytes == biosize && |
| 656 | uio->uio_segflg != UIO_NOCOPY) { | |
| 984263bc MD |
657 | bp->b_flags |= B_CACHE; |
| 658 | bp->b_flags &= ~(B_ERROR | B_INVAL); | |
| 659 | } | |
| 660 | ||
| 28953d39 MD |
661 | /* |
| 662 | * b_resid may be set due to file EOF if we extended out. | |
| 663 | * The NFS bio code will zero the difference anyway so | |
| 664 | * just acknowledge the fact and set b_resid to 0. | |
| 665 | */ | |
| 984263bc | 666 | if ((bp->b_flags & B_CACHE) == 0) { |
| 10f3fee5 | 667 | bp->b_cmd = BUF_CMD_READ; |
| ae8e83e6 MD |
668 | bp->b_bio2.bio_done = nfsiodone_sync; |
| 669 | bp->b_bio2.bio_flags |= BIO_SYNC; | |
| 10f3fee5 | 670 | vfs_busy_pages(vp, bp); |
| cc7d050e | 671 | error = nfs_doio(vp, &bp->b_bio2, td); |
| 984263bc MD |
672 | if (error) { |
| 673 | brelse(bp); | |
| 674 | break; | |
| 675 | } | |
| 28953d39 | 676 | bp->b_resid = 0; |
| 984263bc | 677 | } |
| 5a9187cb | 678 | np->n_flag |= NLMODIFIED; |
| 984263bc MD |
679 |
| 680 | /* | |
| 681 | * If dirtyend exceeds file size, chop it down. This should | |
| 682 | * not normally occur but there is an append race where it | |
| 683 | * might occur XXX, so we log it. | |
| 684 | * | |
| 685 | * If the chopping creates a reverse-indexed or degenerate | |
| 686 | * situation with dirtyoff/end, we 0 both of them. | |
| 687 | */ | |
| 984263bc | 688 | if (bp->b_dirtyend > bcount) { |
| 086c1d7e | 689 | kprintf("NFS append race @%08llx:%d\n", |
| 973c11b9 | 690 | (long long)bp->b_bio2.bio_offset, |
| 984263bc MD |
691 | bp->b_dirtyend - bcount); |
| 692 | bp->b_dirtyend = bcount; | |
| 693 | } | |
| 694 | ||
| 695 | if (bp->b_dirtyoff >= bp->b_dirtyend) | |
| 696 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 697 | ||
| 698 | /* | |
| 699 | * If the new write will leave a contiguous dirty | |
| 700 | * area, just update the b_dirtyoff and b_dirtyend, | |
| 701 | * otherwise force a write rpc of the old dirty area. | |
| 702 | * | |
| 703 | * While it is possible to merge discontiguous writes due to | |
| 704 | * our having a B_CACHE buffer ( and thus valid read data | |
| 705 | * for the hole), we don't because it could lead to | |
| 706 | * significant cache coherency problems with multiple clients, | |
| 707 | * especially if locking is implemented later on. | |
| 708 | * | |
| 709 | * as an optimization we could theoretically maintain | |
| 710 | * a linked list of discontinuous areas, but we would still | |
| 711 | * have to commit them separately so there isn't much | |
| 712 | * advantage to it except perhaps a bit of asynchronization. | |
| 713 | */ | |
| 984263bc | 714 | if (bp->b_dirtyend > 0 && |
| a63246d1 MD |
715 | (boff > bp->b_dirtyend || |
| 716 | (boff + bytes) < bp->b_dirtyoff) | |
| 717 | ) { | |
| 62cfda27 | 718 | if (bwrite(bp) == EINTR) { |
| 984263bc MD |
719 | error = EINTR; |
| 720 | break; | |
| 721 | } | |
| 722 | goto again; | |
| 723 | } | |
| 724 | ||
| 68cdd773 | 725 | error = uiomovebp(bp, bp->b_data + boff, bytes, uio); |
| 984263bc MD |
726 |
| 727 | /* | |
| 728 | * Since this block is being modified, it must be written | |
| 729 | * again and not just committed. Since write clustering does | |
| 730 | * not work for the stage 1 data write, only the stage 2 | |
| 731 | * commit rpc, we have to clear B_CLUSTEROK as well. | |
| 732 | */ | |
| 733 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 734 | ||
| 735 | if (error) { | |
| 984263bc MD |
736 | brelse(bp); |
| 737 | break; | |
| 738 | } | |
| 739 | ||
| 740 | /* | |
| 741 | * Only update dirtyoff/dirtyend if not a degenerate | |
| 742 | * condition. | |
| 1a54183b MD |
743 | * |
| 744 | * The underlying VM pages have been marked valid by | |
| 745 | * virtue of acquiring the bp. Because the entire buffer | |
| 746 | * is marked dirty we do not have to worry about cleaning | |
| 747 | * out the related dirty bits (and wouldn't really know | |
| 748 | * how to deal with byte ranges anyway) | |
| 984263bc | 749 | */ |
| a63246d1 | 750 | if (bytes) { |
| 984263bc | 751 | if (bp->b_dirtyend > 0) { |
| a63246d1 MD |
752 | bp->b_dirtyoff = imin(boff, bp->b_dirtyoff); |
| 753 | bp->b_dirtyend = imax(boff + bytes, | |
| 754 | bp->b_dirtyend); | |
| 984263bc | 755 | } else { |
| a63246d1 MD |
756 | bp->b_dirtyoff = boff; |
| 757 | bp->b_dirtyend = boff + bytes; | |
| 984263bc | 758 | } |
| 984263bc | 759 | } |
| 984263bc MD |
760 |
| 761 | /* | |
| 762 | * If the lease is non-cachable or IO_SYNC do bwrite(). | |
| 763 | * | |
| 764 | * IO_INVAL appears to be unused. The idea appears to be | |
| 765 | * to turn off caching in this case. Very odd. XXX | |
| a482a28a MD |
766 | * |
| 767 | * If nfs_async is set bawrite() will use an unstable write | |
| 768 | * (build dirty bufs on the server), so we might as well | |
| 769 | * push it out with bawrite(). If nfs_async is not set we | |
| 770 | * use bdwrite() to cache dirty bufs on the client. | |
| 984263bc | 771 | */ |
| a63246d1 | 772 | if (ioflag & IO_SYNC) { |
| 984263bc MD |
773 | if (ioflag & IO_INVAL) |
| 774 | bp->b_flags |= B_NOCACHE; | |
| 62cfda27 | 775 | error = bwrite(bp); |
| 984263bc MD |
776 | if (error) |
| 777 | break; | |
| a63246d1 | 778 | } else if (boff + bytes == biosize && nfs_async) { |
| a482a28a | 779 | bawrite(bp); |
| 984263bc MD |
780 | } else { |
| 781 | bdwrite(bp); | |
| 782 | } | |
| a63246d1 | 783 | } while (uio->uio_resid > 0 && bytes > 0); |
| 984263bc MD |
784 |
| 785 | if (haverslock) | |
| 2313ec23 | 786 | nfs_rsunlock(np); |
| 984263bc | 787 |
| 104db2fb MD |
788 | done: |
| 789 | lwkt_reltoken(&nmp->nm_token); | |
| 984263bc MD |
790 | return (error); |
| 791 | } | |
| 792 | ||
| 793 | /* | |
| 794 | * Get an nfs cache block. | |
| 795 | * | |
| 796 | * Allocate a new one if the block isn't currently in the cache | |
| 797 | * and return the block marked busy. If the calling process is | |
| 798 | * interrupted by a signal for an interruptible mount point, return | |
| 799 | * NULL. | |
| 800 | * | |
| 801 | * The caller must carefully deal with the possible B_INVAL state of | |
| edb90c22 | 802 | * the buffer. nfs_startio() clears B_INVAL (and nfs_asyncio() clears it |
| 984263bc MD |
803 | * indirectly), so synchronous reads can be issued without worrying about |
| 804 | * the B_INVAL state. We have to be a little more careful when dealing | |
| 805 | * with writes (see comments in nfs_write()) when extending a file past | |
| 806 | * its EOF. | |
| 807 | */ | |
| 808 | static struct buf * | |
| 54078292 | 809 | nfs_getcacheblk(struct vnode *vp, off_t loffset, int size, struct thread *td) |
| 984263bc | 810 | { |
| 40393ded | 811 | struct buf *bp; |
| 984263bc MD |
812 | struct mount *mp; |
| 813 | struct nfsmount *nmp; | |
| 814 | ||
| 815 | mp = vp->v_mount; | |
| 816 | nmp = VFSTONFS(mp); | |
| 817 | ||
| 818 | if (nmp->nm_flag & NFSMNT_INT) { | |
| 4b958e7b | 819 | bp = getblk(vp, loffset, size, GETBLK_PCATCH, 0); |
| 81b5c339 | 820 | while (bp == NULL) { |
| 60233e58 | 821 | if (nfs_sigintr(nmp, NULL, td)) |
| 81b5c339 | 822 | return (NULL); |
| 54078292 | 823 | bp = getblk(vp, loffset, size, 0, 2 * hz); |
| 984263bc MD |
824 | } |
| 825 | } else { | |
| 54078292 | 826 | bp = getblk(vp, loffset, size, 0, 0); |
| 984263bc MD |
827 | } |
| 828 | ||
| 81b5c339 | 829 | /* |
| 54078292 MD |
830 | * bio2, the 'device' layer. Since BIOs use 64 bit byte offsets |
| 831 | * now, no translation is necessary. | |
| 81b5c339 | 832 | */ |
| 54078292 | 833 | bp->b_bio2.bio_offset = loffset; |
| 984263bc MD |
834 | return (bp); |
| 835 | } | |
| 836 | ||
| 837 | /* | |
| 838 | * Flush and invalidate all dirty buffers. If another process is already | |
| 839 | * doing the flush, just wait for completion. | |
| 840 | */ | |
| 841 | int | |
| 87de5057 | 842 | nfs_vinvalbuf(struct vnode *vp, int flags, int intrflg) |
| 984263bc | 843 | { |
| 40393ded | 844 | struct nfsnode *np = VTONFS(vp); |
| 984263bc MD |
845 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); |
| 846 | int error = 0, slpflag, slptimeo; | |
| 87de5057 | 847 | thread_t td = curthread; |
| 984263bc | 848 | |
| 5fd012e0 | 849 | if (vp->v_flag & VRECLAIMED) |
| 984263bc | 850 | return (0); |
| 984263bc MD |
851 | |
| 852 | if ((nmp->nm_flag & NFSMNT_INT) == 0) | |
| 853 | intrflg = 0; | |
| 854 | if (intrflg) { | |
| 855 | slpflag = PCATCH; | |
| 856 | slptimeo = 2 * hz; | |
| 857 | } else { | |
| 858 | slpflag = 0; | |
| 859 | slptimeo = 0; | |
| 860 | } | |
| 861 | /* | |
| 862 | * First wait for any other process doing a flush to complete. | |
| 863 | */ | |
| 864 | while (np->n_flag & NFLUSHINPROG) { | |
| 865 | np->n_flag |= NFLUSHWANT; | |
| 377d4740 | 866 | error = tsleep((caddr_t)&np->n_flag, 0, "nfsvinval", slptimeo); |
| 87de5057 | 867 | if (error && intrflg && nfs_sigintr(nmp, NULL, td)) |
| 984263bc MD |
868 | return (EINTR); |
| 869 | } | |
| 870 | ||
| 871 | /* | |
| 872 | * Now, flush as required. | |
| 873 | */ | |
| 874 | np->n_flag |= NFLUSHINPROG; | |
| 87de5057 | 875 | error = vinvalbuf(vp, flags, slpflag, 0); |
| 984263bc | 876 | while (error) { |
| 87de5057 | 877 | if (intrflg && nfs_sigintr(nmp, NULL, td)) { |
| 984263bc MD |
878 | np->n_flag &= ~NFLUSHINPROG; |
| 879 | if (np->n_flag & NFLUSHWANT) { | |
| 880 | np->n_flag &= ~NFLUSHWANT; | |
| 881 | wakeup((caddr_t)&np->n_flag); | |
| 882 | } | |
| 883 | return (EINTR); | |
| 884 | } | |
| 87de5057 | 885 | error = vinvalbuf(vp, flags, 0, slptimeo); |
| 984263bc | 886 | } |
| 5a9187cb | 887 | np->n_flag &= ~(NLMODIFIED | NFLUSHINPROG); |
| 984263bc MD |
888 | if (np->n_flag & NFLUSHWANT) { |
| 889 | np->n_flag &= ~NFLUSHWANT; | |
| 890 | wakeup((caddr_t)&np->n_flag); | |
| 891 | } | |
| 892 | return (0); | |
| 893 | } | |
| 894 | ||
| 895 | /* | |
| edb90c22 MD |
896 | * Return true (non-zero) if the txthread and rxthread are operational |
| 897 | * and we do not already have too many not-yet-started BIO's built up. | |
| 984263bc MD |
898 | */ |
| 899 | int | |
| edb90c22 MD |
900 | nfs_asyncok(struct nfsmount *nmp) |
| 901 | { | |
| cc7d050e | 902 | return (nmp->nm_bioqlen < nfs_maxasyncbio && |
| f8565b0f | 903 | nmp->nm_bioqlen < nmp->nm_maxasync_scaled / NFS_ASYSCALE && |
| edb90c22 MD |
904 | nmp->nm_rxstate <= NFSSVC_PENDING && |
| 905 | nmp->nm_txstate <= NFSSVC_PENDING); | |
| 906 | } | |
| 907 | ||
| 908 | /* | |
| 909 | * The read-ahead code calls this to queue a bio to the txthread. | |
| 910 | * | |
| 911 | * We don't touch the bio otherwise... that is, we do not even | |
| 912 | * construct or send the initial rpc. The txthread will do it | |
| 913 | * for us. | |
| f8565b0f MD |
914 | * |
| 915 | * NOTE! nm_bioqlen is not decremented until the request completes, | |
| 916 | * so it does not reflect the number of bio's on bioq. | |
| edb90c22 MD |
917 | */ |
| 918 | void | |
| 919 | nfs_asyncio(struct vnode *vp, struct bio *bio) | |
| 984263bc | 920 | { |
| 81b5c339 | 921 | struct buf *bp = bio->bio_buf; |
| edb90c22 | 922 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); |
| 984263bc | 923 | |
| 81b5c339 | 924 | KKASSERT(vp->v_tag == VT_NFS); |
| 52e1cf57 | 925 | BUF_KERNPROC(bp); |
| c504e38e MD |
926 | |
| 927 | /* | |
| 928 | * Shortcut swap cache (not done automatically because we are not | |
| 929 | * using bread()). | |
| 930 | */ | |
| 931 | if (vn_cache_strategy(vp, bio)) | |
| 932 | return; | |
| 933 | ||
| 52e1cf57 | 934 | bio->bio_driver_info = vp; |
| f8565b0f | 935 | crit_enter(); |
| 52e1cf57 | 936 | TAILQ_INSERT_TAIL(&nmp->nm_bioq, bio, bio_act); |
| f8565b0f MD |
937 | atomic_add_int(&nmp->nm_bioqlen, 1); |
| 938 | crit_exit(); | |
| 52e1cf57 | 939 | nfssvc_iod_writer_wakeup(nmp); |
| 984263bc MD |
940 | } |
| 941 | ||
| 942 | /* | |
| 5e6f1ca5 | 943 | * nfs_doio() - Execute a BIO operation synchronously. The BIO will be |
| cc7d050e MD |
944 | * completed and its error returned. The caller is responsible |
| 945 | * for brelse()ing it. ONLY USE FOR BIO_SYNC IOs! Otherwise | |
| 946 | * our error probe will be against an invalid pointer. | |
| edb90c22 | 947 | * |
| cc7d050e | 948 | * nfs_startio()- Execute a BIO operation assynchronously. |
| dadab5e9 | 949 | * |
| cc7d050e MD |
950 | * NOTE: nfs_asyncio() is used to initiate an asynchronous BIO operation, |
| 951 | * which basically just queues it to the txthread. nfs_startio() | |
| 952 | * actually initiates the I/O AFTER it has gotten to the txthread. | |
| ae8e83e6 | 953 | * |
| cc7d050e | 954 | * NOTE: td might be NULL. |
| cb1cf930 MD |
955 | * |
| 956 | * NOTE: Caller has already busied the I/O. | |
| 984263bc | 957 | */ |
| edb90c22 MD |
958 | void |
| 959 | nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td) | |
| 984263bc | 960 | { |
| 81b5c339 | 961 | struct buf *bp = bio->bio_buf; |
| cc7d050e MD |
962 | struct nfsnode *np; |
| 963 | struct nfsmount *nmp; | |
| 964 | ||
| 965 | KKASSERT(vp->v_tag == VT_NFS); | |
| 966 | np = VTONFS(vp); | |
| 967 | nmp = VFSTONFS(vp->v_mount); | |
| 968 | ||
| 969 | /* | |
| 970 | * clear B_ERROR and B_INVAL state prior to initiating the I/O. We | |
| 971 | * do this here so we do not have to do it in all the code that | |
| 972 | * calls us. | |
| 973 | */ | |
| 974 | bp->b_flags &= ~(B_ERROR | B_INVAL); | |
| 975 | ||
| 976 | KASSERT(bp->b_cmd != BUF_CMD_DONE, | |
| 977 | ("nfs_doio: bp %p already marked done!", bp)); | |
| 978 | ||
| 979 | if (bp->b_cmd == BUF_CMD_READ) { | |
| 980 | switch (vp->v_type) { | |
| 981 | case VREG: | |
| 982 | nfsstats.read_bios++; | |
| 983 | nfs_readrpc_bio(vp, bio); | |
| 984 | break; | |
| 985 | case VLNK: | |
| 986 | #if 0 | |
| 987 | bio->bio_offset = 0; | |
| 988 | nfsstats.readlink_bios++; | |
| 989 | nfs_readlinkrpc_bio(vp, bio); | |
| 990 | #else | |
| 991 | nfs_doio(vp, bio, td); | |
| 992 | #endif | |
| 993 | break; | |
| 994 | case VDIR: | |
| 995 | /* | |
| 996 | * NOTE: If nfs_readdirplusrpc_bio() is requested but | |
| 997 | * not supported, it will chain to | |
| 998 | * nfs_readdirrpc_bio(). | |
| 999 | */ | |
| 1000 | #if 0 | |
| 1001 | nfsstats.readdir_bios++; | |
| 1002 | uiop->uio_offset = bio->bio_offset; | |
| 1003 | if (nmp->nm_flag & NFSMNT_RDIRPLUS) | |
| 1004 | nfs_readdirplusrpc_bio(vp, bio); | |
| 1005 | else | |
| 1006 | nfs_readdirrpc_bio(vp, bio); | |
| 1007 | #else | |
| 1008 | nfs_doio(vp, bio, td); | |
| 1009 | #endif | |
| 1010 | break; | |
| 1011 | default: | |
| 1012 | kprintf("nfs_doio: type %x unexpected\n",vp->v_type); | |
| 1013 | bp->b_flags |= B_ERROR; | |
| 1014 | bp->b_error = EINVAL; | |
| 1015 | biodone(bio); | |
| 1016 | break; | |
| 1017 | } | |
| 1018 | } else { | |
| 1019 | /* | |
| 1020 | * If we only need to commit, try to commit. If this fails | |
| 1021 | * it will chain through to the write. Basically all the logic | |
| 1022 | * in nfs_doio() is replicated. | |
| 1023 | */ | |
| 1024 | KKASSERT(bp->b_cmd == BUF_CMD_WRITE); | |
| 1025 | if (bp->b_flags & B_NEEDCOMMIT) | |
| 1026 | nfs_commitrpc_bio(vp, bio); | |
| 1027 | else | |
| 1028 | nfs_writerpc_bio(vp, bio); | |
| 1029 | } | |
| 1030 | } | |
| 1031 | ||
| 1032 | int | |
| 1033 | nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td) | |
| 1034 | { | |
| 1035 | struct buf *bp = bio->bio_buf; | |
| 984263bc | 1036 | struct uio *uiop; |
| 984263bc MD |
1037 | struct nfsnode *np; |
| 1038 | struct nfsmount *nmp; | |
| cc7d050e MD |
1039 | int error = 0; |
| 1040 | int iomode, must_commit; | |
| 28953d39 | 1041 | size_t n; |
| 984263bc MD |
1042 | struct uio uio; |
| 1043 | struct iovec io; | |
| 1044 | ||
| c504e38e MD |
1045 | #if 0 |
| 1046 | /* | |
| 1047 | * Shortcut swap cache (not done automatically because we are not | |
| 1048 | * using bread()). | |
| 1049 | * | |
| 1050 | * XXX The biowait is a hack until we can figure out how to stop a | |
| 1051 | * biodone chain when a middle element is BIO_SYNC. BIO_SYNC is | |
| 1052 | * set so the bp shouldn't get ripped out from under us. The only | |
| 1053 | * use-cases are fully synchronous I/O cases. | |
| 1054 | * | |
| 1055 | * XXX This is having problems, give up for now. | |
| 1056 | */ | |
| 1057 | if (vn_cache_strategy(vp, bio)) { | |
| c504e38e MD |
1058 | error = biowait(&bio->bio_buf->b_bio1, "nfsrsw"); |
| 1059 | return (error); | |
| 1060 | } | |
| 1061 | #endif | |
| 1062 | ||
| 81b5c339 | 1063 | KKASSERT(vp->v_tag == VT_NFS); |
| 984263bc MD |
1064 | np = VTONFS(vp); |
| 1065 | nmp = VFSTONFS(vp->v_mount); | |
| 1066 | uiop = &uio; | |
| 1067 | uiop->uio_iov = &io; | |
| 1068 | uiop->uio_iovcnt = 1; | |
| 1069 | uiop->uio_segflg = UIO_SYSSPACE; | |
| dadab5e9 | 1070 | uiop->uio_td = td; |
| 984263bc MD |
1071 | |
| 1072 | /* | |
| 1073 | * clear B_ERROR and B_INVAL state prior to initiating the I/O. We | |
| 1074 | * do this here so we do not have to do it in all the code that | |
| 1075 | * calls us. | |
| 1076 | */ | |
| 1077 | bp->b_flags &= ~(B_ERROR | B_INVAL); | |
| 1078 | ||
| 10f3fee5 MD |
1079 | KASSERT(bp->b_cmd != BUF_CMD_DONE, |
| 1080 | ("nfs_doio: bp %p already marked done!", bp)); | |
| 1081 | ||
| 1082 | if (bp->b_cmd == BUF_CMD_READ) { | |
| e54488bb | 1083 | io.iov_len = uiop->uio_resid = (size_t)bp->b_bcount; |
| 984263bc MD |
1084 | io.iov_base = bp->b_data; |
| 1085 | uiop->uio_rw = UIO_READ; | |
| 1086 | ||
| 1087 | switch (vp->v_type) { | |
| 1088 | case VREG: | |
| 28953d39 MD |
1089 | /* |
| 1090 | * When reading from a regular file zero-fill any residual. | |
| 1091 | * Note that this residual has nothing to do with NFS short | |
| 1092 | * reads, which nfs_readrpc_uio() will handle for us. | |
| 1093 | * | |
| 1094 | * We have to do this because when we are write extending | |
| 1095 | * a file the server may not have the same notion of | |
| 1096 | * filesize as we do. Our BIOs should already be sized | |
| 1097 | * (b_bcount) to account for the file EOF. | |
| 1098 | */ | |
| 984263bc | 1099 | nfsstats.read_bios++; |
| edb90c22 MD |
1100 | uiop->uio_offset = bio->bio_offset; |
| 1101 | error = nfs_readrpc_uio(vp, uiop); | |
| 28953d39 MD |
1102 | if (error == 0 && uiop->uio_resid) { |
| 1103 | n = (size_t)bp->b_bcount - uiop->uio_resid; | |
| 1104 | bzero(bp->b_data + n, bp->b_bcount - n); | |
| 1105 | uiop->uio_resid = 0; | |
| 984263bc | 1106 | } |
| dadab5e9 | 1107 | if (td && td->td_proc && (vp->v_flag & VTEXT) && |
| e07fef60 | 1108 | np->n_mtime != np->n_vattr.va_mtime.tv_sec) { |
| 984263bc | 1109 | uprintf("Process killed due to text file modification\n"); |
| 84204577 | 1110 | ksignal(td->td_proc, SIGKILL); |
| 984263bc MD |
1111 | } |
| 1112 | break; | |
| 1113 | case VLNK: | |
| 81b5c339 | 1114 | uiop->uio_offset = 0; |
| 984263bc | 1115 | nfsstats.readlink_bios++; |
| cc7d050e | 1116 | error = nfs_readlinkrpc_uio(vp, uiop); |
| 984263bc MD |
1117 | break; |
| 1118 | case VDIR: | |
| 1119 | nfsstats.readdir_bios++; | |
| 54078292 | 1120 | uiop->uio_offset = bio->bio_offset; |
| 984263bc | 1121 | if (nmp->nm_flag & NFSMNT_RDIRPLUS) { |
| cc7d050e | 1122 | error = nfs_readdirplusrpc_uio(vp, uiop); |
| 984263bc MD |
1123 | if (error == NFSERR_NOTSUPP) |
| 1124 | nmp->nm_flag &= ~NFSMNT_RDIRPLUS; | |
| 1125 | } | |
| 1126 | if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) | |
| cc7d050e | 1127 | error = nfs_readdirrpc_uio(vp, uiop); |
| 984263bc MD |
1128 | /* |
| 1129 | * end-of-directory sets B_INVAL but does not generate an | |
| 1130 | * error. | |
| 1131 | */ | |
| 1132 | if (error == 0 && uiop->uio_resid == bp->b_bcount) | |
| 1133 | bp->b_flags |= B_INVAL; | |
| 1134 | break; | |
| 1135 | default: | |
| 086c1d7e | 1136 | kprintf("nfs_doio: type %x unexpected\n",vp->v_type); |
| 984263bc MD |
1137 | break; |
| 1138 | }; | |
| 1139 | if (error) { | |
| 1140 | bp->b_flags |= B_ERROR; | |
| 1141 | bp->b_error = error; | |
| 1142 | } | |
| cc7d050e | 1143 | bp->b_resid = uiop->uio_resid; |
| 984263bc MD |
1144 | } else { |
| 1145 | /* | |
| cb1cf930 MD |
1146 | * If we only need to commit, try to commit. |
| 1147 | * | |
| 1148 | * NOTE: The I/O has already been staged for the write and | |
| 1149 | * its pages busied, so b_dirtyoff/end is valid. | |
| 984263bc | 1150 | */ |
| 10f3fee5 | 1151 | KKASSERT(bp->b_cmd == BUF_CMD_WRITE); |
| 984263bc MD |
1152 | if (bp->b_flags & B_NEEDCOMMIT) { |
| 1153 | int retv; | |
| 1154 | off_t off; | |
| 1155 | ||
| 54078292 | 1156 | off = bio->bio_offset + bp->b_dirtyoff; |
| cc7d050e MD |
1157 | retv = nfs_commitrpc_uio(vp, off, |
| 1158 | bp->b_dirtyend - bp->b_dirtyoff, | |
| 1159 | td); | |
| 984263bc MD |
1160 | if (retv == 0) { |
| 1161 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 1162 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 1163 | bp->b_resid = 0; | |
| 81b5c339 | 1164 | biodone(bio); |
| cc7d050e | 1165 | return(0); |
| 984263bc MD |
1166 | } |
| 1167 | if (retv == NFSERR_STALEWRITEVERF) { | |
| 81b5c339 | 1168 | nfs_clearcommit(vp->v_mount); |
| 984263bc MD |
1169 | } |
| 1170 | } | |
| 1171 | ||
| 1172 | /* | |
| 1173 | * Setup for actual write | |
| 1174 | */ | |
| 54078292 MD |
1175 | if (bio->bio_offset + bp->b_dirtyend > np->n_size) |
| 1176 | bp->b_dirtyend = np->n_size - bio->bio_offset; | |
| 984263bc MD |
1177 | |
| 1178 | if (bp->b_dirtyend > bp->b_dirtyoff) { | |
| 1179 | io.iov_len = uiop->uio_resid = bp->b_dirtyend | |
| 1180 | - bp->b_dirtyoff; | |
| 54078292 | 1181 | uiop->uio_offset = bio->bio_offset + bp->b_dirtyoff; |
| 984263bc MD |
1182 | io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; |
| 1183 | uiop->uio_rw = UIO_WRITE; | |
| 1184 | nfsstats.write_bios++; | |
| 1185 | ||
| ae8e83e6 | 1186 | if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0) |
| 984263bc MD |
1187 | iomode = NFSV3WRITE_UNSTABLE; |
| 1188 | else | |
| 1189 | iomode = NFSV3WRITE_FILESYNC; | |
| 1190 | ||
| cc7d050e MD |
1191 | must_commit = 0; |
| 1192 | error = nfs_writerpc_uio(vp, uiop, &iomode, &must_commit); | |
| 984263bc MD |
1193 | |
| 1194 | /* | |
| 8ae5c7e0 MD |
1195 | * We no longer try to use kern/vfs_bio's cluster code to |
| 1196 | * cluster commits, so B_CLUSTEROK is no longer set with | |
| 1197 | * B_NEEDCOMMIT. The problem is that a vfs_busy_pages() | |
| 1198 | * may have to clear B_NEEDCOMMIT if it finds underlying | |
| 1199 | * pages have been redirtied through a memory mapping | |
| 1200 | * and doing this on a clustered bp will probably cause | |
| 1201 | * a panic, plus the flag in the underlying NFS bufs | |
| 1202 | * making up the cluster bp will not be properly cleared. | |
| 984263bc | 1203 | */ |
| 984263bc MD |
1204 | if (!error && iomode == NFSV3WRITE_UNSTABLE) { |
| 1205 | bp->b_flags |= B_NEEDCOMMIT; | |
| 8ae5c7e0 MD |
1206 | #if 0 |
| 1207 | /* XXX do not enable commit clustering */ | |
| 984263bc MD |
1208 | if (bp->b_dirtyoff == 0 |
| 1209 | && bp->b_dirtyend == bp->b_bcount) | |
| 1210 | bp->b_flags |= B_CLUSTEROK; | |
| 8ae5c7e0 | 1211 | #endif |
| 984263bc MD |
1212 | } else { |
| 1213 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 1214 | } | |
| 984263bc MD |
1215 | |
| 1216 | /* | |
| 1217 | * For an interrupted write, the buffer is still valid | |
| 1218 | * and the write hasn't been pushed to the server yet, | |
| 1219 | * so we can't set B_ERROR and report the interruption | |
| ae8e83e6 | 1220 | * by setting B_EINTR. For the async case, B_EINTR |
| 984263bc MD |
1221 | * is not relevant, so the rpc attempt is essentially |
| 1222 | * a noop. For the case of a V3 write rpc not being | |
| 1223 | * committed to stable storage, the block is still | |
| 1224 | * dirty and requires either a commit rpc or another | |
| 1225 | * write rpc with iomode == NFSV3WRITE_FILESYNC before | |
| 1226 | * the block is reused. This is indicated by setting | |
| 1227 | * the B_DELWRI and B_NEEDCOMMIT flags. | |
| 1228 | * | |
| 1229 | * If the buffer is marked B_PAGING, it does not reside on | |
| 1230 | * the vp's paging queues so we cannot call bdirty(). The | |
| 1231 | * bp in this case is not an NFS cache block so we should | |
| 1232 | * be safe. XXX | |
| 1233 | */ | |
| 1234 | if (error == EINTR | |
| 1235 | || (!error && (bp->b_flags & B_NEEDCOMMIT))) { | |
| 165dba55 | 1236 | crit_enter(); |
| 984263bc | 1237 | bp->b_flags &= ~(B_INVAL|B_NOCACHE); |
| 10f3fee5 | 1238 | if ((bp->b_flags & B_PAGING) == 0) |
| 984263bc | 1239 | bdirty(bp); |
| ae8e83e6 | 1240 | if (error) |
| 984263bc | 1241 | bp->b_flags |= B_EINTR; |
| 165dba55 | 1242 | crit_exit(); |
| 984263bc MD |
1243 | } else { |
| 1244 | if (error) { | |
| 1245 | bp->b_flags |= B_ERROR; | |
| 1246 | bp->b_error = np->n_error = error; | |
| 1247 | np->n_flag |= NWRITEERR; | |
| 1248 | } | |
| 1249 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 1250 | } | |
| cc7d050e MD |
1251 | if (must_commit) |
| 1252 | nfs_clearcommit(vp->v_mount); | |
| 1253 | bp->b_resid = uiop->uio_resid; | |
| 984263bc MD |
1254 | } else { |
| 1255 | bp->b_resid = 0; | |
| 984263bc MD |
1256 | } |
| 1257 | } | |
| cc7d050e MD |
1258 | |
| 1259 | /* | |
| 1260 | * I/O was run synchronously, biodone() it and calculate the | |
| 1261 | * error to return. | |
| 1262 | */ | |
| 81b5c339 | 1263 | biodone(bio); |
| cc7d050e MD |
1264 | KKASSERT(bp->b_cmd == BUF_CMD_DONE); |
| 1265 | if (bp->b_flags & B_EINTR) | |
| 1266 | return (EINTR); | |
| 1267 | if (bp->b_flags & B_ERROR) | |
| 1268 | return (bp->b_error ? bp->b_error : EIO); | |
| 1269 | return (0); | |
| 984263bc MD |
1270 | } |
| 1271 | ||
| 1272 | /* | |
| 8452310f MD |
1273 | * Handle all truncation, write-extend, and ftruncate()-extend operations |
| 1274 | * on the NFS lcient side. | |
| cb1cf930 | 1275 | * |
| 8452310f MD |
1276 | * We use the new API in kern/vfs_vm.c to perform these operations in a |
| 1277 | * VM-friendly way. With this API VM pages are properly zerod and pages | |
| 1278 | * still mapped into the buffer straddling EOF are not invalidated. | |
| 984263bc | 1279 | */ |
| 8452310f MD |
1280 | int |
| 1281 | nfs_meta_setsize(struct vnode *vp, struct thread *td, off_t nsize, int trivial) | |
| 984263bc MD |
1282 | { |
| 1283 | struct nfsnode *np = VTONFS(vp); | |
| 8452310f | 1284 | off_t osize; |
| 984263bc | 1285 | int biosize = vp->v_mount->mnt_stat.f_iosize; |
| 8452310f | 1286 | int error; |
| 984263bc | 1287 | |
| 8452310f | 1288 | osize = np->n_size; |
| 984263bc MD |
1289 | np->n_size = nsize; |
| 1290 | ||
| a63246d1 | 1291 | if (nsize < osize) { |
| 3bb7eedb | 1292 | error = nvtruncbuf(vp, nsize, biosize, -1); |
| a63246d1 | 1293 | } else { |
| 8452310f | 1294 | error = nvextendbuf(vp, osize, nsize, |
| 3bb7eedb MD |
1295 | biosize, biosize, -1, -1, |
| 1296 | trivial); | |
| 984263bc | 1297 | } |
| 8452310f | 1298 | return(error); |
| 984263bc MD |
1299 | } |
| 1300 | ||
| ae8e83e6 MD |
1301 | /* |
| 1302 | * Synchronous completion for nfs_doio. Call bpdone() with elseit=FALSE. | |
| 1303 | * Caller is responsible for brelse()'ing the bp. | |
| 1304 | */ | |
| 1305 | static void | |
| 1306 | nfsiodone_sync(struct bio *bio) | |
| 1307 | { | |
| 1308 | bio->bio_flags = 0; | |
| 1309 | bpdone(bio->bio_buf, 0); | |
| 1310 | } | |
| edb90c22 MD |
1311 | |
| 1312 | /* | |
| edb90c22 MD |
1313 | * nfs read rpc - BIO version |
| 1314 | */ | |
| edb90c22 MD |
1315 | void |
| 1316 | nfs_readrpc_bio(struct vnode *vp, struct bio *bio) | |
| 1317 | { | |
| 1318 | struct buf *bp = bio->bio_buf; | |
| 1319 | u_int32_t *tl; | |
| 1320 | struct nfsmount *nmp; | |
| 1321 | int error = 0, len, tsiz; | |
| 1322 | struct nfsm_info *info; | |
| 1323 | ||
| 1324 | info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK); | |
| 1325 | info->mrep = NULL; | |
| 1326 | info->v3 = NFS_ISV3(vp); | |
| 1327 | ||
| 1328 | nmp = VFSTONFS(vp->v_mount); | |
| 1329 | tsiz = bp->b_bcount; | |
| cc7d050e | 1330 | KKASSERT(tsiz <= nmp->nm_rsize); |
| edb90c22 MD |
1331 | if (bio->bio_offset + tsiz > nmp->nm_maxfilesize) { |
| 1332 | error = EFBIG; | |
| 1333 | goto nfsmout; | |
| 1334 | } | |
| 1335 | nfsstats.rpccnt[NFSPROC_READ]++; | |
| cc7d050e | 1336 | len = tsiz; |
| edb90c22 MD |
1337 | nfsm_reqhead(info, vp, NFSPROC_READ, |
| 1338 | NFSX_FH(info->v3) + NFSX_UNSIGNED * 3); | |
| 1339 | ERROROUT(nfsm_fhtom(info, vp)); | |
| 1340 | tl = nfsm_build(info, NFSX_UNSIGNED * 3); | |
| 1341 | if (info->v3) { | |
| 1342 | txdr_hyper(bio->bio_offset, tl); | |
| 1343 | *(tl + 2) = txdr_unsigned(len); | |
| 1344 | } else { | |
| 1345 | *tl++ = txdr_unsigned(bio->bio_offset); | |
| 1346 | *tl++ = txdr_unsigned(len); | |
| 1347 | *tl = 0; | |
| 1348 | } | |
| 1349 | info->bio = bio; | |
| 1350 | info->done = nfs_readrpc_bio_done; | |
| 1351 | nfsm_request_bio(info, vp, NFSPROC_READ, NULL, | |
| 1352 | nfs_vpcred(vp, ND_READ)); | |
| 1353 | return; | |
| 1354 | nfsmout: | |
| 1355 | kfree(info, M_NFSREQ); | |
| 1356 | bp->b_error = error; | |
| 1357 | bp->b_flags |= B_ERROR; | |
| 1358 | biodone(bio); | |
| 1359 | } | |
| 1360 | ||
| 1361 | static void | |
| 1362 | nfs_readrpc_bio_done(nfsm_info_t info) | |
| 1363 | { | |
| 1364 | struct nfsmount *nmp = VFSTONFS(info->vp->v_mount); | |
| 1365 | struct bio *bio = info->bio; | |
| 1366 | struct buf *bp = bio->bio_buf; | |
| 1367 | u_int32_t *tl; | |
| 1368 | int attrflag; | |
| 1369 | int retlen; | |
| 1370 | int eof; | |
| 1371 | int error = 0; | |
| 1372 | ||
| 1373 | KKASSERT(info->state == NFSM_STATE_DONE); | |
| 1374 | ||
| c6b43e93 | 1375 | lwkt_gettoken(&nmp->nm_token); |
| 77912481 | 1376 | |
| edb90c22 MD |
1377 | if (info->v3) { |
| 1378 | ERROROUT(nfsm_postop_attr(info, info->vp, &attrflag, | |
| 1379 | NFS_LATTR_NOSHRINK)); | |
| 1380 | NULLOUT(tl = nfsm_dissect(info, 2 * NFSX_UNSIGNED)); | |
| 1381 | eof = fxdr_unsigned(int, *(tl + 1)); | |
| 1382 | } else { | |
| 1383 | ERROROUT(nfsm_loadattr(info, info->vp, NULL)); | |
| 1384 | eof = 0; | |
| 1385 | } | |
| 1386 | NEGATIVEOUT(retlen = nfsm_strsiz(info, nmp->nm_rsize)); | |
| 1387 | ERROROUT(nfsm_mtobio(info, bio, retlen)); | |
| 1388 | m_freem(info->mrep); | |
| 1389 | info->mrep = NULL; | |
| 1390 | ||
| 1391 | /* | |
| 28953d39 MD |
1392 | * No error occured, if retlen is less then bcount and no EOF |
| 1393 | * and NFSv3 a zero-fill short read occured. | |
| 1394 | * | |
| 1395 | * For NFSv2 a short-read indicates EOF. | |
| edb90c22 | 1396 | */ |
| 28953d39 | 1397 | if (retlen < bp->b_bcount && info->v3 && eof == 0) { |
| edb90c22 | 1398 | bzero(bp->b_data + retlen, bp->b_bcount - retlen); |
| 28953d39 | 1399 | retlen = bp->b_bcount; |
| edb90c22 | 1400 | } |
| 28953d39 MD |
1401 | |
| 1402 | /* | |
| 1403 | * If we hit an EOF we still zero-fill, but return the expected | |
| 1404 | * b_resid anyway. This should normally not occur since async | |
| 1405 | * BIOs are not used for read-before-write case. Races against | |
| 1406 | * the server can cause it though and we don't want to leave | |
| 1407 | * garbage in the buffer. | |
| 1408 | */ | |
| 1409 | if (retlen < bp->b_bcount) { | |
| 1410 | bzero(bp->b_data + retlen, bp->b_bcount - retlen); | |
| edb90c22 | 1411 | } |
| 28953d39 MD |
1412 | bp->b_resid = 0; |
| 1413 | /* bp->b_resid = bp->b_bcount - retlen; */ | |
| edb90c22 | 1414 | nfsmout: |
| c6b43e93 | 1415 | lwkt_reltoken(&nmp->nm_token); |
| f8565b0f | 1416 | kfree(info, M_NFSREQ); |
| edb90c22 MD |
1417 | if (error) { |
| 1418 | bp->b_error = error; | |
| 1419 | bp->b_flags |= B_ERROR; | |
| 1420 | } | |
| 1421 | biodone(bio); | |
| 1422 | } | |
| 1423 | ||
| edb90c22 MD |
1424 | /* |
| 1425 | * nfs write call - BIO version | |
| cb1cf930 MD |
1426 | * |
| 1427 | * NOTE: Caller has already busied the I/O. | |
| edb90c22 | 1428 | */ |
| cc7d050e MD |
1429 | void |
| 1430 | nfs_writerpc_bio(struct vnode *vp, struct bio *bio) | |
| edb90c22 | 1431 | { |
| edb90c22 | 1432 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); |
| cc7d050e MD |
1433 | struct nfsnode *np = VTONFS(vp); |
| 1434 | struct buf *bp = bio->bio_buf; | |
| 1435 | u_int32_t *tl; | |
| 1436 | int len; | |
| 1437 | int iomode; | |
| 1438 | int error = 0; | |
| 1439 | struct nfsm_info *info; | |
| 1440 | off_t offset; | |
| edb90c22 | 1441 | |
| cc7d050e MD |
1442 | /* |
| 1443 | * Setup for actual write. Just clean up the bio if there | |
| cb1cf930 MD |
1444 | * is nothing to do. b_dirtyoff/end have already been staged |
| 1445 | * by the bp's pages getting busied. | |
| cc7d050e MD |
1446 | */ |
| 1447 | if (bio->bio_offset + bp->b_dirtyend > np->n_size) | |
| 1448 | bp->b_dirtyend = np->n_size - bio->bio_offset; | |
| edb90c22 | 1449 | |
| cc7d050e MD |
1450 | if (bp->b_dirtyend <= bp->b_dirtyoff) { |
| 1451 | bp->b_resid = 0; | |
| 1452 | biodone(bio); | |
| 1453 | return; | |
| 1454 | } | |
| 1455 | len = bp->b_dirtyend - bp->b_dirtyoff; | |
| 1456 | offset = bio->bio_offset + bp->b_dirtyoff; | |
| 1457 | if (offset + len > nmp->nm_maxfilesize) { | |
| 1458 | bp->b_flags |= B_ERROR; | |
| 1459 | bp->b_error = EFBIG; | |
| 1460 | biodone(bio); | |
| 1461 | return; | |
| 1462 | } | |
| 1463 | bp->b_resid = len; | |
| 1464 | nfsstats.write_bios++; | |
| 1465 | ||
| 1466 | info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK); | |
| 1467 | info->mrep = NULL; | |
| 1468 | info->v3 = NFS_ISV3(vp); | |
| 1469 | info->info_writerpc.must_commit = 0; | |
| 1470 | if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0) | |
| 1471 | iomode = NFSV3WRITE_UNSTABLE; | |
| 1472 | else | |
| 1473 | iomode = NFSV3WRITE_FILESYNC; | |
| edb90c22 | 1474 | |
| cc7d050e MD |
1475 | KKASSERT(len <= nmp->nm_wsize); |
| 1476 | ||
| 1477 | nfsstats.rpccnt[NFSPROC_WRITE]++; | |
| 1478 | nfsm_reqhead(info, vp, NFSPROC_WRITE, | |
| 1479 | NFSX_FH(info->v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len)); | |
| 1480 | ERROROUT(nfsm_fhtom(info, vp)); | |
| 1481 | if (info->v3) { | |
| 1482 | tl = nfsm_build(info, 5 * NFSX_UNSIGNED); | |
| 1483 | txdr_hyper(offset, tl); | |
| 1484 | tl += 2; | |
| 1485 | *tl++ = txdr_unsigned(len); | |
| 1486 | *tl++ = txdr_unsigned(iomode); | |
| 1487 | *tl = txdr_unsigned(len); | |
| 1488 | } else { | |
| 1489 | u_int32_t x; | |
| 1490 | ||
| 1491 | tl = nfsm_build(info, 4 * NFSX_UNSIGNED); | |
| 1492 | /* Set both "begin" and "current" to non-garbage. */ | |
| 1493 | x = txdr_unsigned((u_int32_t)offset); | |
| 1494 | *tl++ = x; /* "begin offset" */ | |
| 1495 | *tl++ = x; /* "current offset" */ | |
| 1496 | x = txdr_unsigned(len); | |
| 1497 | *tl++ = x; /* total to this offset */ | |
| 1498 | *tl = x; /* size of this write */ | |
| 1499 | } | |
| 1500 | ERROROUT(nfsm_biotom(info, bio, bp->b_dirtyoff, len)); | |
| 1501 | info->bio = bio; | |
| 1502 | info->done = nfs_writerpc_bio_done; | |
| 1503 | nfsm_request_bio(info, vp, NFSPROC_WRITE, NULL, | |
| 1504 | nfs_vpcred(vp, ND_WRITE)); | |
| 1505 | return; | |
| 1506 | nfsmout: | |
| 1507 | kfree(info, M_NFSREQ); | |
| 1508 | bp->b_error = error; | |
| 1509 | bp->b_flags |= B_ERROR; | |
| 1510 | biodone(bio); | |
| 1511 | } | |
| 1512 | ||
| 1513 | static void | |
| 1514 | nfs_writerpc_bio_done(nfsm_info_t info) | |
| 1515 | { | |
| 1516 | struct nfsmount *nmp = VFSTONFS(info->vp->v_mount); | |
| 1517 | struct nfsnode *np = VTONFS(info->vp); | |
| 1518 | struct bio *bio = info->bio; | |
| 1519 | struct buf *bp = bio->bio_buf; | |
| 1520 | int wccflag = NFSV3_WCCRATTR; | |
| 1521 | int iomode = NFSV3WRITE_FILESYNC; | |
| 1522 | int commit; | |
| 1523 | int rlen; | |
| 1524 | int error; | |
| 1525 | int len = bp->b_resid; /* b_resid was set to shortened length */ | |
| 1526 | u_int32_t *tl; | |
| 1527 | ||
| c6b43e93 | 1528 | lwkt_gettoken(&nmp->nm_token); |
| 77912481 | 1529 | |
| cc7d050e MD |
1530 | if (info->v3) { |
| 1531 | /* | |
| 1532 | * The write RPC returns a before and after mtime. The | |
| 1533 | * nfsm_wcc_data() macro checks the before n_mtime | |
| 1534 | * against the before time and stores the after time | |
| 1535 | * in the nfsnode's cached vattr and n_mtime field. | |
| 1536 | * The NRMODIFIED bit will be set if the before | |
| 1537 | * time did not match the original mtime. | |
| 1538 | */ | |
| 1539 | wccflag = NFSV3_WCCCHK; | |
| 1540 | ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag)); | |
| 1541 | if (error == 0) { | |
| 1542 | NULLOUT(tl = nfsm_dissect(info, 2 * NFSX_UNSIGNED + NFSX_V3WRITEVERF)); | |
| 1543 | rlen = fxdr_unsigned(int, *tl++); | |
| 1544 | if (rlen == 0) { | |
| 1545 | error = NFSERR_IO; | |
| 1546 | m_freem(info->mrep); | |
| 1547 | info->mrep = NULL; | |
| 1548 | goto nfsmout; | |
| 1549 | } else if (rlen < len) { | |
| 1550 | #if 0 | |
| edb90c22 | 1551 | /* |
| cc7d050e | 1552 | * XXX what do we do here? |
| edb90c22 | 1553 | */ |
| cc7d050e MD |
1554 | backup = len - rlen; |
| 1555 | uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup; | |
| 1556 | uiop->uio_iov->iov_len += backup; | |
| 1557 | uiop->uio_offset -= backup; | |
| 1558 | uiop->uio_resid += backup; | |
| 1559 | len = rlen; | |
| 1560 | #endif | |
| 1561 | } | |
| 1562 | commit = fxdr_unsigned(int, *tl++); | |
| 1563 | ||
| 1564 | /* | |
| 1565 | * Return the lowest committment level | |
| 1566 | * obtained by any of the RPCs. | |
| 1567 | */ | |
| 1568 | if (iomode == NFSV3WRITE_FILESYNC) | |
| 1569 | iomode = commit; | |
| 1570 | else if (iomode == NFSV3WRITE_DATASYNC && | |
| 1571 | commit == NFSV3WRITE_UNSTABLE) | |
| 1572 | iomode = commit; | |
| 1573 | if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){ | |
| 1574 | bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); | |
| 1575 | nmp->nm_state |= NFSSTA_HASWRITEVERF; | |
| 1576 | } else if (bcmp(tl, nmp->nm_verf, NFSX_V3WRITEVERF)) { | |
| 1577 | info->info_writerpc.must_commit = 1; | |
| 1578 | bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); | |
| edb90c22 | 1579 | } |
| edb90c22 | 1580 | } |
| cc7d050e MD |
1581 | } else { |
| 1582 | ERROROUT(nfsm_loadattr(info, info->vp, NULL)); | |
| 1583 | } | |
| 1584 | m_freem(info->mrep); | |
| 1585 | info->mrep = NULL; | |
| 1586 | len = 0; | |
| 1587 | nfsmout: | |
| 1588 | if (info->vp->v_mount->mnt_flag & MNT_ASYNC) | |
| 1589 | iomode = NFSV3WRITE_FILESYNC; | |
| 1590 | bp->b_resid = len; | |
| 1591 | ||
| 1592 | /* | |
| 1593 | * End of RPC. Now clean up the bp. | |
| 1594 | * | |
| 8ae5c7e0 MD |
1595 | * We no longer enable write clustering for commit operations, |
| 1596 | * See around line 1157 for a more detailed comment. | |
| cc7d050e MD |
1597 | */ |
| 1598 | if (!error && iomode == NFSV3WRITE_UNSTABLE) { | |
| 1599 | bp->b_flags |= B_NEEDCOMMIT; | |
| 8ae5c7e0 MD |
1600 | #if 0 |
| 1601 | /* XXX do not enable commit clustering */ | |
| cc7d050e MD |
1602 | if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount) |
| 1603 | bp->b_flags |= B_CLUSTEROK; | |
| 8ae5c7e0 | 1604 | #endif |
| cc7d050e MD |
1605 | } else { |
| 1606 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 1607 | } | |
| 1608 | ||
| 1609 | /* | |
| 1610 | * For an interrupted write, the buffer is still valid | |
| 1611 | * and the write hasn't been pushed to the server yet, | |
| 1612 | * so we can't set B_ERROR and report the interruption | |
| 1613 | * by setting B_EINTR. For the async case, B_EINTR | |
| 1614 | * is not relevant, so the rpc attempt is essentially | |
| 1615 | * a noop. For the case of a V3 write rpc not being | |
| 1616 | * committed to stable storage, the block is still | |
| 1617 | * dirty and requires either a commit rpc or another | |
| 1618 | * write rpc with iomode == NFSV3WRITE_FILESYNC before | |
| 1619 | * the block is reused. This is indicated by setting | |
| 1620 | * the B_DELWRI and B_NEEDCOMMIT flags. | |
| 1621 | * | |
| 1622 | * If the buffer is marked B_PAGING, it does not reside on | |
| 1623 | * the vp's paging queues so we cannot call bdirty(). The | |
| 1624 | * bp in this case is not an NFS cache block so we should | |
| 1625 | * be safe. XXX | |
| 1626 | */ | |
| 1627 | if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { | |
| 1628 | crit_enter(); | |
| 1629 | bp->b_flags &= ~(B_INVAL|B_NOCACHE); | |
| 1630 | if ((bp->b_flags & B_PAGING) == 0) | |
| 1631 | bdirty(bp); | |
| edb90c22 | 1632 | if (error) |
| cc7d050e MD |
1633 | bp->b_flags |= B_EINTR; |
| 1634 | crit_exit(); | |
| 1635 | } else { | |
| 1636 | if (error) { | |
| 1637 | bp->b_flags |= B_ERROR; | |
| 1638 | bp->b_error = np->n_error = error; | |
| 1639 | np->n_flag |= NWRITEERR; | |
| 1640 | } | |
| 1641 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 1642 | } | |
| 1643 | if (info->info_writerpc.must_commit) | |
| 1644 | nfs_clearcommit(info->vp->v_mount); | |
| c6b43e93 MD |
1645 | lwkt_reltoken(&nmp->nm_token); |
| 1646 | ||
| cc7d050e MD |
1647 | kfree(info, M_NFSREQ); |
| 1648 | if (error) { | |
| 1649 | bp->b_flags |= B_ERROR; | |
| 1650 | bp->b_error = error; | |
| 1651 | } | |
| 1652 | biodone(bio); | |
| 1653 | } | |
| 1654 | ||
| 1655 | /* | |
| 1656 | * Nfs Version 3 commit rpc - BIO version | |
| 1657 | * | |
| 1658 | * This function issues the commit rpc and will chain to a write | |
| 1659 | * rpc if necessary. | |
| 1660 | */ | |
| 1661 | void | |
| 1662 | nfs_commitrpc_bio(struct vnode *vp, struct bio *bio) | |
| 1663 | { | |
| 1664 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); | |
| 1665 | struct buf *bp = bio->bio_buf; | |
| 1666 | struct nfsm_info *info; | |
| 1667 | int error = 0; | |
| 1668 | u_int32_t *tl; | |
| 1669 | ||
| 1670 | if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) { | |
| 1671 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 1672 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 1673 | bp->b_resid = 0; | |
| 1674 | biodone(bio); | |
| 1675 | return; | |
| 1676 | } | |
| 1677 | ||
| 1678 | info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK); | |
| 1679 | info->mrep = NULL; | |
| 1680 | info->v3 = 1; | |
| 1681 | ||
| 1682 | nfsstats.rpccnt[NFSPROC_COMMIT]++; | |
| 1683 | nfsm_reqhead(info, vp, NFSPROC_COMMIT, NFSX_FH(1)); | |
| 1684 | ERROROUT(nfsm_fhtom(info, vp)); | |
| 1685 | tl = nfsm_build(info, 3 * NFSX_UNSIGNED); | |
| 1686 | txdr_hyper(bio->bio_offset + bp->b_dirtyoff, tl); | |
| 1687 | tl += 2; | |
| 1688 | *tl = txdr_unsigned(bp->b_dirtyend - bp->b_dirtyoff); | |
| 1689 | info->bio = bio; | |
| 1690 | info->done = nfs_commitrpc_bio_done; | |
| 1691 | nfsm_request_bio(info, vp, NFSPROC_COMMIT, NULL, | |
| 1692 | nfs_vpcred(vp, ND_WRITE)); | |
| 1693 | return; | |
| 1694 | nfsmout: | |
| 1695 | /* | |
| 1696 | * Chain to write RPC on (early) error | |
| 1697 | */ | |
| 1698 | kfree(info, M_NFSREQ); | |
| 1699 | nfs_writerpc_bio(vp, bio); | |
| 1700 | } | |
| 1701 | ||
| 1702 | static void | |
| 1703 | nfs_commitrpc_bio_done(nfsm_info_t info) | |
| 1704 | { | |
| 1705 | struct nfsmount *nmp = VFSTONFS(info->vp->v_mount); | |
| 1706 | struct bio *bio = info->bio; | |
| 1707 | struct buf *bp = bio->bio_buf; | |
| 1708 | u_int32_t *tl; | |
| 1709 | int wccflag = NFSV3_WCCRATTR; | |
| 1710 | int error = 0; | |
| 1711 | ||
| c6b43e93 | 1712 | lwkt_gettoken(&nmp->nm_token); |
| 77912481 | 1713 | |
| cc7d050e MD |
1714 | ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag)); |
| 1715 | if (error == 0) { | |
| 1716 | NULLOUT(tl = nfsm_dissect(info, NFSX_V3WRITEVERF)); | |
| 1717 | if (bcmp(nmp->nm_verf, tl, NFSX_V3WRITEVERF)) { | |
| 1718 | bcopy(tl, nmp->nm_verf, NFSX_V3WRITEVERF); | |
| 1719 | error = NFSERR_STALEWRITEVERF; | |
| 1720 | } | |
| edb90c22 | 1721 | } |
| cc7d050e MD |
1722 | m_freem(info->mrep); |
| 1723 | info->mrep = NULL; | |
| 1724 | ||
| 1725 | /* | |
| 1726 | * On completion we must chain to a write bio if an | |
| 1727 | * error occurred. | |
| 1728 | */ | |
| edb90c22 | 1729 | nfsmout: |
| cc7d050e MD |
1730 | if (error == 0) { |
| 1731 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 1732 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 1733 | bp->b_resid = 0; | |
| 1734 | biodone(bio); | |
| 1735 | } else { | |
| cc7d050e MD |
1736 | nfs_writerpc_bio(info->vp, bio); |
| 1737 | } | |
| 8af6746a | 1738 | kfree(info, M_NFSREQ); |
| c6b43e93 | 1739 | lwkt_reltoken(&nmp->nm_token); |
| edb90c22 MD |
1740 | } |
| 1741 |