| Commit | Line | Data |
|---|---|---|
| 984263bc MD |
1 | /* |
| 2 | * Copyright (c) 1989, 1993 | |
| 3 | * The Regents of the University of California. All rights reserved. | |
| 4 | * | |
| 5 | * This code is derived from software contributed to Berkeley by | |
| 6 | * Rick Macklem at The University of Guelph. | |
| 7 | * | |
| 8 | * Redistribution and use in source and binary forms, with or without | |
| 9 | * modification, are permitted provided that the following conditions | |
| 10 | * are met: | |
| 11 | * 1. Redistributions of source code must retain the above copyright | |
| 12 | * notice, this list of conditions and the following disclaimer. | |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 14 | * notice, this list of conditions and the following disclaimer in the | |
| 15 | * documentation and/or other materials provided with the distribution. | |
| 16 | * 3. All advertising materials mentioning features or use of this software | |
| 17 | * must display the following acknowledgement: | |
| 18 | * This product includes software developed by the University of | |
| 19 | * California, Berkeley and its contributors. | |
| 20 | * 4. Neither the name of the University nor the names of its contributors | |
| 21 | * may be used to endorse or promote products derived from this software | |
| 22 | * without specific prior written permission. | |
| 23 | * | |
| 24 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
| 25 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 26 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 27 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
| 28 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 29 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 30 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 31 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 32 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 33 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 34 | * SUCH DAMAGE. | |
| 35 | * | |
| 36 | * @(#)nfs_bio.c 8.9 (Berkeley) 3/30/95 | |
| 79e5012e | 37 | * $FreeBSD: /repoman/r/ncvs/src/sys/nfsclient/nfs_bio.c,v 1.130 2004/04/14 23:23:55 peadar Exp $ |
| a1a9228a | 38 | * $DragonFly: src/sys/vfs/nfs/nfs_bio.c,v 1.45 2008/07/18 00:09:39 dillon Exp $ |
| 984263bc MD |
39 | */ |
| 40 | ||
| 41 | ||
| 42 | #include <sys/param.h> | |
| 43 | #include <sys/systm.h> | |
| 44 | #include <sys/resourcevar.h> | |
| 45 | #include <sys/signalvar.h> | |
| 46 | #include <sys/proc.h> | |
| 47 | #include <sys/buf.h> | |
| 48 | #include <sys/vnode.h> | |
| 49 | #include <sys/mount.h> | |
| 50 | #include <sys/kernel.h> | |
| edb90c22 | 51 | #include <sys/mbuf.h> |
| 984263bc MD |
52 | |
| 53 | #include <vm/vm.h> | |
| 54 | #include <vm/vm_extern.h> | |
| 55 | #include <vm/vm_page.h> | |
| 56 | #include <vm/vm_object.h> | |
| 57 | #include <vm/vm_pager.h> | |
| 58 | #include <vm/vnode_pager.h> | |
| 59 | ||
| edb90c22 | 60 | #include <sys/buf2.h> |
| 165dba55 | 61 | #include <sys/thread2.h> |
| 1a54183b | 62 | #include <vm/vm_page2.h> |
| 165dba55 | 63 | |
| 1f2de5d4 MD |
64 | #include "rpcv2.h" |
| 65 | #include "nfsproto.h" | |
| 66 | #include "nfs.h" | |
| 67 | #include "nfsmount.h" | |
| 1f2de5d4 | 68 | #include "nfsnode.h" |
| edb90c22 MD |
69 | #include "xdr_subs.h" |
| 70 | #include "nfsm_subs.h" | |
| 71 | ||
| 984263bc | 72 | |
| 54078292 MD |
73 | static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset, |
| 74 | int size, struct thread *td); | |
| b66959e2 | 75 | static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen); |
| ae8e83e6 | 76 | static void nfsiodone_sync(struct bio *bio); |
| cc7d050e MD |
77 | static void nfs_readrpc_bio_done(nfsm_info_t info); |
| 78 | static void nfs_writerpc_bio_done(nfsm_info_t info); | |
| 79 | static void nfs_commitrpc_bio_done(nfsm_info_t info); | |
| 984263bc | 80 | |
| 984263bc | 81 | /* |
| 984263bc MD |
82 | * Vnode op for read using bio |
| 83 | */ | |
| 84 | int | |
| 3b568787 | 85 | nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag) |
| 984263bc | 86 | { |
| dadab5e9 MD |
87 | struct nfsnode *np = VTONFS(vp); |
| 88 | int biosize, i; | |
| a63246d1 | 89 | struct buf *bp, *rabp; |
| 984263bc | 90 | struct vattr vattr; |
| dadab5e9 | 91 | struct thread *td; |
| 984263bc | 92 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); |
| a63246d1 | 93 | off_t lbn, rabn; |
| 54078292 MD |
94 | off_t raoffset; |
| 95 | off_t loffset; | |
| 984263bc | 96 | int seqcount; |
| a63246d1 MD |
97 | int nra, error = 0; |
| 98 | int boff = 0; | |
| 99 | size_t n; | |
| 984263bc MD |
100 | |
| 101 | #ifdef DIAGNOSTIC | |
| 102 | if (uio->uio_rw != UIO_READ) | |
| 103 | panic("nfs_read mode"); | |
| 104 | #endif | |
| 105 | if (uio->uio_resid == 0) | |
| 106 | return (0); | |
| 107 | if (uio->uio_offset < 0) /* XXX VDIR cookies can be negative */ | |
| 108 | return (EINVAL); | |
| dadab5e9 | 109 | td = uio->uio_td; |
| 984263bc MD |
110 | |
| 111 | if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && | |
| 112 | (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) | |
| 3b568787 | 113 | (void)nfs_fsinfo(nmp, vp, td); |
| 984263bc MD |
114 | if (vp->v_type != VDIR && |
| 115 | (uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) | |
| 116 | return (EFBIG); | |
| 117 | biosize = vp->v_mount->mnt_stat.f_iosize; | |
| 118 | seqcount = (int)((off_t)(ioflag >> IO_SEQSHIFT) * biosize / BKVASIZE); | |
| 5a9187cb | 119 | |
| 984263bc MD |
120 | /* |
| 121 | * For nfs, cache consistency can only be maintained approximately. | |
| 122 | * Although RFC1094 does not specify the criteria, the following is | |
| 123 | * believed to be compatible with the reference port. | |
| 5a9187cb | 124 | * |
| 5a9187cb MD |
125 | * NFS: If local changes have been made and this is a |
| 126 | * directory, the directory must be invalidated and | |
| 127 | * the attribute cache must be cleared. | |
| 128 | * | |
| 129 | * GETATTR is called to synchronize the file size. | |
| 130 | * | |
| 131 | * If remote changes are detected local data is flushed | |
| 132 | * and the cache is invalidated. | |
| 133 | * | |
| 5a9187cb MD |
134 | * NOTE: In the normal case the attribute cache is not |
| 135 | * cleared which means GETATTR may use cached data and | |
| 136 | * not immediately detect changes made on the server. | |
| 984263bc | 137 | */ |
| e07fef60 MD |
138 | if ((np->n_flag & NLMODIFIED) && vp->v_type == VDIR) { |
| 139 | nfs_invaldir(vp); | |
| 87de5057 | 140 | error = nfs_vinvalbuf(vp, V_SAVE, 1); |
| e07fef60 MD |
141 | if (error) |
| 142 | return (error); | |
| 143 | np->n_attrstamp = 0; | |
| 144 | } | |
| 87de5057 | 145 | error = VOP_GETATTR(vp, &vattr); |
| e07fef60 MD |
146 | if (error) |
| 147 | return (error); | |
| 8452310f MD |
148 | |
| 149 | /* | |
| 150 | * This can deadlock getpages/putpages for regular | |
| 151 | * files. Only do it for directories. | |
| 152 | */ | |
| e07fef60 | 153 | if (np->n_flag & NRMODIFIED) { |
| 8452310f | 154 | if (vp->v_type == VDIR) { |
| 5a9187cb | 155 | nfs_invaldir(vp); |
| 8452310f MD |
156 | error = nfs_vinvalbuf(vp, V_SAVE, 1); |
| 157 | if (error) | |
| 158 | return (error); | |
| 159 | np->n_flag &= ~NRMODIFIED; | |
| 160 | } | |
| 984263bc | 161 | } |
| a63246d1 MD |
162 | |
| 163 | /* | |
| 164 | * Loop until uio exhausted or we hit EOF | |
| 165 | */ | |
| 984263bc | 166 | do { |
| a63246d1 MD |
167 | bp = NULL; |
| 168 | ||
| 984263bc MD |
169 | switch (vp->v_type) { |
| 170 | case VREG: | |
| 171 | nfsstats.biocache_reads++; | |
| 172 | lbn = uio->uio_offset / biosize; | |
| a63246d1 | 173 | boff = uio->uio_offset & (biosize - 1); |
| 54078292 | 174 | loffset = (off_t)lbn * biosize; |
| 984263bc MD |
175 | |
| 176 | /* | |
| 177 | * Start the read ahead(s), as required. | |
| 178 | */ | |
| edb90c22 | 179 | if (nmp->nm_readahead > 0 && nfs_asyncok(nmp)) { |
| 984263bc MD |
180 | for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && |
| 181 | (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { | |
| 182 | rabn = lbn + 1 + nra; | |
| 54078292 | 183 | raoffset = (off_t)rabn * biosize; |
| b1c20cfa | 184 | if (findblk(vp, raoffset, FINDBLK_TEST) == NULL) { |
| 54078292 | 185 | rabp = nfs_getcacheblk(vp, raoffset, biosize, td); |
| 984263bc MD |
186 | if (!rabp) |
| 187 | return (EINTR); | |
| 188 | if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { | |
| 10f3fee5 MD |
189 | rabp->b_cmd = BUF_CMD_READ; |
| 190 | vfs_busy_pages(vp, rabp); | |
| edb90c22 | 191 | nfs_asyncio(vp, &rabp->b_bio2); |
| 984263bc MD |
192 | } else { |
| 193 | brelse(rabp); | |
| 194 | } | |
| 195 | } | |
| 196 | } | |
| 197 | } | |
| 198 | ||
| 199 | /* | |
| 200 | * Obtain the buffer cache block. Figure out the buffer size | |
| 201 | * when we are at EOF. If we are modifying the size of the | |
| 202 | * buffer based on an EOF condition we need to hold | |
| 203 | * nfs_rslock() through obtaining the buffer to prevent | |
| 204 | * a potential writer-appender from messing with n_size. | |
| 205 | * Otherwise we may accidently truncate the buffer and | |
| 206 | * lose dirty data. | |
| 207 | * | |
| 208 | * Note that bcount is *not* DEV_BSIZE aligned. | |
| 209 | */ | |
| a63246d1 MD |
210 | if (loffset + boff >= np->n_size) { |
| 211 | n = 0; | |
| 212 | break; | |
| 984263bc | 213 | } |
| a63246d1 | 214 | bp = nfs_getcacheblk(vp, loffset, biosize, td); |
| 984263bc | 215 | |
| a63246d1 | 216 | if (bp == NULL) |
| 984263bc MD |
217 | return (EINTR); |
| 218 | ||
| 219 | /* | |
| 220 | * If B_CACHE is not set, we must issue the read. If this | |
| 221 | * fails, we return an error. | |
| 222 | */ | |
| 984263bc | 223 | if ((bp->b_flags & B_CACHE) == 0) { |
| 28953d39 MD |
224 | bp->b_cmd = BUF_CMD_READ; |
| 225 | bp->b_bio2.bio_done = nfsiodone_sync; | |
| 226 | bp->b_bio2.bio_flags |= BIO_SYNC; | |
| 227 | vfs_busy_pages(vp, bp); | |
| 228 | error = nfs_doio(vp, &bp->b_bio2, td); | |
| 229 | if (error) { | |
| 230 | brelse(bp); | |
| 231 | return (error); | |
| 232 | } | |
| 984263bc MD |
233 | } |
| 234 | ||
| 235 | /* | |
| 236 | * on is the offset into the current bp. Figure out how many | |
| 237 | * bytes we can copy out of the bp. Note that bcount is | |
| 238 | * NOT DEV_BSIZE aligned. | |
| 239 | * | |
| 240 | * Then figure out how many bytes we can copy into the uio. | |
| 241 | */ | |
| a63246d1 MD |
242 | n = biosize - boff; |
| 243 | if (n > uio->uio_resid) | |
| 244 | n = uio->uio_resid; | |
| 245 | if (loffset + boff + n > np->n_size) | |
| 246 | n = np->n_size - loffset - boff; | |
| 984263bc MD |
247 | break; |
| 248 | case VLNK: | |
| ded0173f | 249 | biosize = min(NFS_MAXPATHLEN, np->n_size); |
| 984263bc | 250 | nfsstats.biocache_readlinks++; |
| ded0173f | 251 | bp = nfs_getcacheblk(vp, (off_t)0, biosize, td); |
| 81b5c339 | 252 | if (bp == NULL) |
| 984263bc MD |
253 | return (EINTR); |
| 254 | if ((bp->b_flags & B_CACHE) == 0) { | |
| 28953d39 MD |
255 | bp->b_cmd = BUF_CMD_READ; |
| 256 | bp->b_bio2.bio_done = nfsiodone_sync; | |
| 257 | bp->b_bio2.bio_flags |= BIO_SYNC; | |
| 258 | vfs_busy_pages(vp, bp); | |
| 259 | error = nfs_doio(vp, &bp->b_bio2, td); | |
| 260 | if (error) { | |
| 261 | bp->b_flags |= B_ERROR | B_INVAL; | |
| 262 | brelse(bp); | |
| 263 | return (error); | |
| 264 | } | |
| 984263bc | 265 | } |
| a63246d1 MD |
266 | n = szmin(uio->uio_resid, (size_t)bp->b_bcount - bp->b_resid); |
| 267 | boff = 0; | |
| 984263bc MD |
268 | break; |
| 269 | case VDIR: | |
| 270 | nfsstats.biocache_readdirs++; | |
| a63246d1 MD |
271 | if (np->n_direofoffset && |
| 272 | uio->uio_offset >= np->n_direofoffset | |
| 273 | ) { | |
| 274 | return (0); | |
| 984263bc MD |
275 | } |
| 276 | lbn = (uoff_t)uio->uio_offset / NFS_DIRBLKSIZ; | |
| a63246d1 MD |
277 | boff = uio->uio_offset & (NFS_DIRBLKSIZ - 1); |
| 278 | loffset = uio->uio_offset - boff; | |
| 54078292 | 279 | bp = nfs_getcacheblk(vp, loffset, NFS_DIRBLKSIZ, td); |
| 81b5c339 | 280 | if (bp == NULL) |
| a63246d1 | 281 | return (EINTR); |
| b66959e2 | 282 | |
| 984263bc | 283 | if ((bp->b_flags & B_CACHE) == 0) { |
| 10f3fee5 | 284 | bp->b_cmd = BUF_CMD_READ; |
| ae8e83e6 MD |
285 | bp->b_bio2.bio_done = nfsiodone_sync; |
| 286 | bp->b_bio2.bio_flags |= BIO_SYNC; | |
| 10f3fee5 | 287 | vfs_busy_pages(vp, bp); |
| cc7d050e MD |
288 | error = nfs_doio(vp, &bp->b_bio2, td); |
| 289 | if (error) | |
| 984263bc | 290 | brelse(bp); |
| 984263bc | 291 | while (error == NFSERR_BAD_COOKIE) { |
| 086c1d7e | 292 | kprintf("got bad cookie vp %p bp %p\n", vp, bp); |
| 984263bc | 293 | nfs_invaldir(vp); |
| 87de5057 | 294 | error = nfs_vinvalbuf(vp, 0, 1); |
| 984263bc MD |
295 | /* |
| 296 | * Yuck! The directory has been modified on the | |
| 297 | * server. The only way to get the block is by | |
| 298 | * reading from the beginning to get all the | |
| 299 | * offset cookies. | |
| 300 | * | |
| 301 | * Leave the last bp intact unless there is an error. | |
| 302 | * Loop back up to the while if the error is another | |
| 303 | * NFSERR_BAD_COOKIE (double yuch!). | |
| 304 | */ | |
| 305 | for (i = 0; i <= lbn && !error; i++) { | |
| 306 | if (np->n_direofoffset | |
| 307 | && (i * NFS_DIRBLKSIZ) >= np->n_direofoffset) | |
| 308 | return (0); | |
| 54078292 MD |
309 | bp = nfs_getcacheblk(vp, (off_t)i * NFS_DIRBLKSIZ, |
| 310 | NFS_DIRBLKSIZ, td); | |
| 984263bc MD |
311 | if (!bp) |
| 312 | return (EINTR); | |
| 313 | if ((bp->b_flags & B_CACHE) == 0) { | |
| 10f3fee5 | 314 | bp->b_cmd = BUF_CMD_READ; |
| ae8e83e6 MD |
315 | bp->b_bio2.bio_done = nfsiodone_sync; |
| 316 | bp->b_bio2.bio_flags |= BIO_SYNC; | |
| 10f3fee5 | 317 | vfs_busy_pages(vp, bp); |
| cc7d050e | 318 | error = nfs_doio(vp, &bp->b_bio2, td); |
| 984263bc MD |
319 | /* |
| 320 | * no error + B_INVAL == directory EOF, | |
| 321 | * use the block. | |
| 322 | */ | |
| 323 | if (error == 0 && (bp->b_flags & B_INVAL)) | |
| 324 | break; | |
| 325 | } | |
| 326 | /* | |
| 327 | * An error will throw away the block and the | |
| 328 | * for loop will break out. If no error and this | |
| 329 | * is not the block we want, we throw away the | |
| 330 | * block and go for the next one via the for loop. | |
| 331 | */ | |
| 332 | if (error || i < lbn) | |
| 333 | brelse(bp); | |
| 334 | } | |
| 335 | } | |
| 336 | /* | |
| 337 | * The above while is repeated if we hit another cookie | |
| 338 | * error. If we hit an error and it wasn't a cookie error, | |
| 339 | * we give up. | |
| 340 | */ | |
| 341 | if (error) | |
| 342 | return (error); | |
| 343 | } | |
| 344 | ||
| 345 | /* | |
| 346 | * If not eof and read aheads are enabled, start one. | |
| 347 | * (You need the current block first, so that you have the | |
| 348 | * directory offset cookie of the next block.) | |
| 349 | */ | |
| edb90c22 | 350 | if (nmp->nm_readahead > 0 && nfs_asyncok(nmp) && |
| 984263bc MD |
351 | (bp->b_flags & B_INVAL) == 0 && |
| 352 | (np->n_direofoffset == 0 || | |
| 54078292 | 353 | loffset + NFS_DIRBLKSIZ < np->n_direofoffset) && |
| b1c20cfa MD |
354 | findblk(vp, loffset + NFS_DIRBLKSIZ, FINDBLK_TEST) == NULL |
| 355 | ) { | |
| 54078292 MD |
356 | rabp = nfs_getcacheblk(vp, loffset + NFS_DIRBLKSIZ, |
| 357 | NFS_DIRBLKSIZ, td); | |
| 984263bc MD |
358 | if (rabp) { |
| 359 | if ((rabp->b_flags & (B_CACHE|B_DELWRI)) == 0) { | |
| 10f3fee5 MD |
360 | rabp->b_cmd = BUF_CMD_READ; |
| 361 | vfs_busy_pages(vp, rabp); | |
| edb90c22 | 362 | nfs_asyncio(vp, &rabp->b_bio2); |
| 984263bc MD |
363 | } else { |
| 364 | brelse(rabp); | |
| 365 | } | |
| 366 | } | |
| 367 | } | |
| 368 | /* | |
| 369 | * Unlike VREG files, whos buffer size ( bp->b_bcount ) is | |
| 370 | * chopped for the EOF condition, we cannot tell how large | |
| 371 | * NFS directories are going to be until we hit EOF. So | |
| 372 | * an NFS directory buffer is *not* chopped to its EOF. Now, | |
| 373 | * it just so happens that b_resid will effectively chop it | |
| 374 | * to EOF. *BUT* this information is lost if the buffer goes | |
| 375 | * away and is reconstituted into a B_CACHE state ( due to | |
| 376 | * being VMIO ) later. So we keep track of the directory eof | |
| 377 | * in np->n_direofoffset and chop it off as an extra step | |
| 378 | * right here. | |
| c0b6e0f5 MD |
379 | * |
| 380 | * NOTE: boff could already be beyond EOF. | |
| 984263bc | 381 | */ |
| c0b6e0f5 MD |
382 | if ((size_t)boff > NFS_DIRBLKSIZ - bp->b_resid) { |
| 383 | n = 0; | |
| 384 | } else { | |
| 385 | n = szmin(uio->uio_resid, | |
| 386 | NFS_DIRBLKSIZ - bp->b_resid - (size_t)boff); | |
| 387 | } | |
| a63246d1 MD |
388 | if (np->n_direofoffset && |
| 389 | n > (size_t)(np->n_direofoffset - uio->uio_offset)) { | |
| 390 | n = (size_t)(np->n_direofoffset - uio->uio_offset); | |
| 391 | } | |
| 984263bc MD |
392 | break; |
| 393 | default: | |
| 086c1d7e | 394 | kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type); |
| a63246d1 | 395 | n = 0; |
| 984263bc MD |
396 | break; |
| 397 | }; | |
| 398 | ||
| 984263bc MD |
399 | switch (vp->v_type) { |
| 400 | case VREG: | |
| 01f31ab3 | 401 | if (n > 0) |
| a63246d1 | 402 | error = uiomove(bp->b_data + boff, n, uio); |
| 984263bc MD |
403 | break; |
| 404 | case VLNK: | |
| 01f31ab3 | 405 | if (n > 0) |
| a63246d1 | 406 | error = uiomove(bp->b_data + boff, n, uio); |
| 984263bc MD |
407 | n = 0; |
| 408 | break; | |
| 409 | case VDIR: | |
| 01f31ab3 JS |
410 | if (n > 0) { |
| 411 | off_t old_off = uio->uio_offset; | |
| 412 | caddr_t cpos, epos; | |
| 413 | struct nfs_dirent *dp; | |
| 414 | ||
| b66959e2 MD |
415 | /* |
| 416 | * We are casting cpos to nfs_dirent, it must be | |
| 417 | * int-aligned. | |
| 418 | */ | |
| a63246d1 | 419 | if (boff & 3) { |
| b66959e2 MD |
420 | error = EINVAL; |
| 421 | break; | |
| 422 | } | |
| 423 | ||
| a63246d1 MD |
424 | cpos = bp->b_data + boff; |
| 425 | epos = bp->b_data + boff + n; | |
| 01f31ab3 JS |
426 | while (cpos < epos && error == 0 && uio->uio_resid > 0) { |
| 427 | dp = (struct nfs_dirent *)cpos; | |
| b66959e2 MD |
428 | error = nfs_check_dirent(dp, (int)(epos - cpos)); |
| 429 | if (error) | |
| 430 | break; | |
| 01f31ab3 | 431 | if (vop_write_dirent(&error, uio, dp->nfs_ino, |
| b66959e2 | 432 | dp->nfs_type, dp->nfs_namlen, dp->nfs_name)) { |
| 01f31ab3 | 433 | break; |
| b66959e2 | 434 | } |
| 01f31ab3 JS |
435 | cpos += dp->nfs_reclen; |
| 436 | } | |
| 437 | n = 0; | |
| a63246d1 MD |
438 | if (error == 0) { |
| 439 | uio->uio_offset = old_off + cpos - | |
| 440 | bp->b_data - boff; | |
| 441 | } | |
| 01f31ab3 | 442 | } |
| 984263bc MD |
443 | break; |
| 444 | default: | |
| 086c1d7e | 445 | kprintf(" nfs_bioread: type %x unexpected\n",vp->v_type); |
| 984263bc | 446 | } |
| a63246d1 MD |
447 | if (bp) |
| 448 | brelse(bp); | |
| 984263bc MD |
449 | } while (error == 0 && uio->uio_resid > 0 && n > 0); |
| 450 | return (error); | |
| 451 | } | |
| 452 | ||
| 453 | /* | |
| b66959e2 MD |
454 | * Userland can supply any 'seek' offset when reading a NFS directory. |
| 455 | * Validate the structure so we don't panic the kernel. Note that | |
| 456 | * the element name is nul terminated and the nul is not included | |
| 457 | * in nfs_namlen. | |
| 458 | */ | |
| 459 | static | |
| 460 | int | |
| 461 | nfs_check_dirent(struct nfs_dirent *dp, int maxlen) | |
| 462 | { | |
| 463 | int nfs_name_off = offsetof(struct nfs_dirent, nfs_name[0]); | |
| 464 | ||
| 465 | if (nfs_name_off >= maxlen) | |
| 466 | return (EINVAL); | |
| 467 | if (dp->nfs_reclen < nfs_name_off || dp->nfs_reclen > maxlen) | |
| 468 | return (EINVAL); | |
| 469 | if (nfs_name_off + dp->nfs_namlen >= dp->nfs_reclen) | |
| 470 | return (EINVAL); | |
| 471 | if (dp->nfs_reclen & 3) | |
| 472 | return (EINVAL); | |
| 473 | return (0); | |
| 474 | } | |
| 475 | ||
| 476 | /* | |
| 984263bc | 477 | * Vnode op for write using bio |
| e851b29e CP |
478 | * |
| 479 | * nfs_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag, | |
| 480 | * struct ucred *a_cred) | |
| 984263bc MD |
481 | */ |
| 482 | int | |
| e851b29e | 483 | nfs_write(struct vop_write_args *ap) |
| 984263bc | 484 | { |
| 984263bc | 485 | struct uio *uio = ap->a_uio; |
| dadab5e9 | 486 | struct thread *td = uio->uio_td; |
| 984263bc MD |
487 | struct vnode *vp = ap->a_vp; |
| 488 | struct nfsnode *np = VTONFS(vp); | |
| 984263bc MD |
489 | int ioflag = ap->a_ioflag; |
| 490 | struct buf *bp; | |
| 491 | struct vattr vattr; | |
| 492 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); | |
| 54078292 | 493 | off_t loffset; |
| a63246d1 MD |
494 | int boff, bytes; |
| 495 | int error = 0; | |
| 984263bc | 496 | int haverslock = 0; |
| 81b5c339 MD |
497 | int bcount; |
| 498 | int biosize; | |
| 8452310f | 499 | int trivial; |
| 984263bc MD |
500 | |
| 501 | #ifdef DIAGNOSTIC | |
| 502 | if (uio->uio_rw != UIO_WRITE) | |
| 503 | panic("nfs_write mode"); | |
| 7b95be2a | 504 | if (uio->uio_segflg == UIO_USERSPACE && uio->uio_td != curthread) |
| 984263bc MD |
505 | panic("nfs_write proc"); |
| 506 | #endif | |
| 507 | if (vp->v_type != VREG) | |
| 508 | return (EIO); | |
| 509 | if (np->n_flag & NWRITEERR) { | |
| 510 | np->n_flag &= ~NWRITEERR; | |
| 511 | return (np->n_error); | |
| 512 | } | |
| 513 | if ((nmp->nm_flag & NFSMNT_NFSV3) != 0 && | |
| 514 | (nmp->nm_state & NFSSTA_GOTFSINFO) == 0) | |
| 3b568787 | 515 | (void)nfs_fsinfo(nmp, vp, td); |
| 984263bc MD |
516 | |
| 517 | /* | |
| 518 | * Synchronously flush pending buffers if we are in synchronous | |
| 519 | * mode or if we are appending. | |
| 520 | */ | |
| 521 | if (ioflag & (IO_APPEND | IO_SYNC)) { | |
| 5a9187cb | 522 | if (np->n_flag & NLMODIFIED) { |
| 984263bc | 523 | np->n_attrstamp = 0; |
| 5a9187cb | 524 | error = nfs_flush(vp, MNT_WAIT, td, 0); |
| 87de5057 | 525 | /* error = nfs_vinvalbuf(vp, V_SAVE, 1); */ |
| 984263bc MD |
526 | if (error) |
| 527 | return (error); | |
| 528 | } | |
| 529 | } | |
| 530 | ||
| 531 | /* | |
| 532 | * If IO_APPEND then load uio_offset. We restart here if we cannot | |
| 533 | * get the append lock. | |
| 534 | */ | |
| 535 | restart: | |
| 536 | if (ioflag & IO_APPEND) { | |
| 537 | np->n_attrstamp = 0; | |
| 87de5057 | 538 | error = VOP_GETATTR(vp, &vattr); |
| 984263bc MD |
539 | if (error) |
| 540 | return (error); | |
| 541 | uio->uio_offset = np->n_size; | |
| 542 | } | |
| 543 | ||
| 544 | if (uio->uio_offset < 0) | |
| 545 | return (EINVAL); | |
| 546 | if ((uio->uio_offset + uio->uio_resid) > nmp->nm_maxfilesize) | |
| 547 | return (EFBIG); | |
| 548 | if (uio->uio_resid == 0) | |
| 549 | return (0); | |
| 550 | ||
| 551 | /* | |
| 552 | * We need to obtain the rslock if we intend to modify np->n_size | |
| 553 | * in order to guarentee the append point with multiple contending | |
| 554 | * writers, to guarentee that no other appenders modify n_size | |
| 555 | * while we are trying to obtain a truncated buffer (i.e. to avoid | |
| 556 | * accidently truncating data written by another appender due to | |
| 557 | * the race), and to ensure that the buffer is populated prior to | |
| 558 | * our extending of the file. We hold rslock through the entire | |
| 559 | * operation. | |
| 560 | * | |
| 561 | * Note that we do not synchronize the case where someone truncates | |
| 562 | * the file while we are appending to it because attempting to lock | |
| 563 | * this case may deadlock other parts of the system unexpectedly. | |
| 564 | */ | |
| 565 | if ((ioflag & IO_APPEND) || | |
| 566 | uio->uio_offset + uio->uio_resid > np->n_size) { | |
| 2313ec23 | 567 | switch(nfs_rslock(np)) { |
| 984263bc MD |
568 | case ENOLCK: |
| 569 | goto restart; | |
| 570 | /* not reached */ | |
| 571 | case EINTR: | |
| 572 | case ERESTART: | |
| 573 | return(EINTR); | |
| 574 | /* not reached */ | |
| 575 | default: | |
| 576 | break; | |
| 577 | } | |
| 578 | haverslock = 1; | |
| 579 | } | |
| 580 | ||
| 581 | /* | |
| 582 | * Maybe this should be above the vnode op call, but so long as | |
| 583 | * file servers have no limits, i don't think it matters | |
| 584 | */ | |
| 8452310f | 585 | if (td && td->td_proc && uio->uio_offset + uio->uio_resid > |
| dadab5e9 | 586 | td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) { |
| 7278a846 | 587 | lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ); |
| 984263bc | 588 | if (haverslock) |
| 2313ec23 | 589 | nfs_rsunlock(np); |
| 984263bc MD |
590 | return (EFBIG); |
| 591 | } | |
| 592 | ||
| 593 | biosize = vp->v_mount->mnt_stat.f_iosize; | |
| 594 | ||
| 595 | do { | |
| 984263bc | 596 | nfsstats.biocache_writes++; |
| a63246d1 MD |
597 | boff = uio->uio_offset & (biosize-1); |
| 598 | loffset = uio->uio_offset - boff; | |
| 599 | bytes = (int)szmin((unsigned)(biosize - boff), uio->uio_resid); | |
| 984263bc MD |
600 | again: |
| 601 | /* | |
| 602 | * Handle direct append and file extension cases, calculate | |
| a63246d1 MD |
603 | * unaligned buffer size. When extending B_CACHE will be |
| 604 | * set if possible. See UIO_NOCOPY note below. | |
| 984263bc | 605 | */ |
| a63246d1 MD |
606 | if (uio->uio_offset + bytes > np->n_size) { |
| 607 | np->n_flag |= NLMODIFIED; | |
| 8452310f MD |
608 | trivial = (uio->uio_segflg != UIO_NOCOPY && |
| 609 | uio->uio_offset <= np->n_size); | |
| 610 | nfs_meta_setsize(vp, td, uio->uio_offset + bytes, | |
| 611 | trivial); | |
| 984263bc | 612 | } |
| 8452310f | 613 | bp = nfs_getcacheblk(vp, loffset, biosize, td); |
| 81b5c339 | 614 | if (bp == NULL) { |
| 984263bc MD |
615 | error = EINTR; |
| 616 | break; | |
| 617 | } | |
| 618 | ||
| 619 | /* | |
| a63246d1 MD |
620 | * Actual bytes in buffer which we care about |
| 621 | */ | |
| 622 | if (loffset + biosize < np->n_size) | |
| 623 | bcount = biosize; | |
| 624 | else | |
| 625 | bcount = (int)(np->n_size - loffset); | |
| 626 | ||
| 627 | /* | |
| 28953d39 | 628 | * Avoid a read by setting B_CACHE where the data we |
| a63246d1 MD |
629 | * intend to write covers the entire buffer. Note |
| 630 | * that the buffer may have been set to B_CACHE by | |
| 631 | * nfs_meta_setsize() above or otherwise inherited the | |
| 632 | * flag, but if B_CACHE isn't set the buffer may be | |
| 633 | * uninitialized and must be zero'd to accomodate | |
| 634 | * future seek+write's. | |
| 984263bc | 635 | * |
| 28953d39 | 636 | * See the comments in kern/vfs_bio.c's getblk() for |
| 984263bc MD |
637 | * more information. |
| 638 | * | |
| 8aa7625b MD |
639 | * When doing a UIO_NOCOPY write the buffer is not |
| 640 | * overwritten and we cannot just set B_CACHE unconditionally | |
| 641 | * for full-block writes. | |
| 984263bc | 642 | */ |
| a63246d1 MD |
643 | if (boff == 0 && bytes == biosize && |
| 644 | uio->uio_segflg != UIO_NOCOPY) { | |
| 984263bc MD |
645 | bp->b_flags |= B_CACHE; |
| 646 | bp->b_flags &= ~(B_ERROR | B_INVAL); | |
| 647 | } | |
| 648 | ||
| 28953d39 MD |
649 | /* |
| 650 | * b_resid may be set due to file EOF if we extended out. | |
| 651 | * The NFS bio code will zero the difference anyway so | |
| 652 | * just acknowledged the fact and set b_resid to 0. | |
| 653 | */ | |
| 984263bc | 654 | if ((bp->b_flags & B_CACHE) == 0) { |
| 10f3fee5 | 655 | bp->b_cmd = BUF_CMD_READ; |
| ae8e83e6 MD |
656 | bp->b_bio2.bio_done = nfsiodone_sync; |
| 657 | bp->b_bio2.bio_flags |= BIO_SYNC; | |
| 10f3fee5 | 658 | vfs_busy_pages(vp, bp); |
| cc7d050e | 659 | error = nfs_doio(vp, &bp->b_bio2, td); |
| 984263bc MD |
660 | if (error) { |
| 661 | brelse(bp); | |
| 662 | break; | |
| 663 | } | |
| 28953d39 | 664 | bp->b_resid = 0; |
| 984263bc | 665 | } |
| 5a9187cb | 666 | np->n_flag |= NLMODIFIED; |
| 984263bc MD |
667 | |
| 668 | /* | |
| 669 | * If dirtyend exceeds file size, chop it down. This should | |
| 670 | * not normally occur but there is an append race where it | |
| 671 | * might occur XXX, so we log it. | |
| 672 | * | |
| 673 | * If the chopping creates a reverse-indexed or degenerate | |
| 674 | * situation with dirtyoff/end, we 0 both of them. | |
| 675 | */ | |
| 984263bc | 676 | if (bp->b_dirtyend > bcount) { |
| 086c1d7e | 677 | kprintf("NFS append race @%08llx:%d\n", |
| 973c11b9 | 678 | (long long)bp->b_bio2.bio_offset, |
| 984263bc MD |
679 | bp->b_dirtyend - bcount); |
| 680 | bp->b_dirtyend = bcount; | |
| 681 | } | |
| 682 | ||
| 683 | if (bp->b_dirtyoff >= bp->b_dirtyend) | |
| 684 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 685 | ||
| 686 | /* | |
| 687 | * If the new write will leave a contiguous dirty | |
| 688 | * area, just update the b_dirtyoff and b_dirtyend, | |
| 689 | * otherwise force a write rpc of the old dirty area. | |
| 690 | * | |
| 691 | * While it is possible to merge discontiguous writes due to | |
| 692 | * our having a B_CACHE buffer ( and thus valid read data | |
| 693 | * for the hole), we don't because it could lead to | |
| 694 | * significant cache coherency problems with multiple clients, | |
| 695 | * especially if locking is implemented later on. | |
| 696 | * | |
| 697 | * as an optimization we could theoretically maintain | |
| 698 | * a linked list of discontinuous areas, but we would still | |
| 699 | * have to commit them separately so there isn't much | |
| 700 | * advantage to it except perhaps a bit of asynchronization. | |
| 701 | */ | |
| 984263bc | 702 | if (bp->b_dirtyend > 0 && |
| a63246d1 MD |
703 | (boff > bp->b_dirtyend || |
| 704 | (boff + bytes) < bp->b_dirtyoff) | |
| 705 | ) { | |
| 62cfda27 | 706 | if (bwrite(bp) == EINTR) { |
| 984263bc MD |
707 | error = EINTR; |
| 708 | break; | |
| 709 | } | |
| 710 | goto again; | |
| 711 | } | |
| 712 | ||
| a63246d1 | 713 | error = uiomove(bp->b_data + boff, bytes, uio); |
| 984263bc MD |
714 | |
| 715 | /* | |
| 716 | * Since this block is being modified, it must be written | |
| 717 | * again and not just committed. Since write clustering does | |
| 718 | * not work for the stage 1 data write, only the stage 2 | |
| 719 | * commit rpc, we have to clear B_CLUSTEROK as well. | |
| 720 | */ | |
| 721 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 722 | ||
| 723 | if (error) { | |
| 984263bc MD |
724 | brelse(bp); |
| 725 | break; | |
| 726 | } | |
| 727 | ||
| 728 | /* | |
| 729 | * Only update dirtyoff/dirtyend if not a degenerate | |
| 730 | * condition. | |
| 1a54183b MD |
731 | * |
| 732 | * The underlying VM pages have been marked valid by | |
| 733 | * virtue of acquiring the bp. Because the entire buffer | |
| 734 | * is marked dirty we do not have to worry about cleaning | |
| 735 | * out the related dirty bits (and wouldn't really know | |
| 736 | * how to deal with byte ranges anyway) | |
| 984263bc | 737 | */ |
| a63246d1 | 738 | if (bytes) { |
| 984263bc | 739 | if (bp->b_dirtyend > 0) { |
| a63246d1 MD |
740 | bp->b_dirtyoff = imin(boff, bp->b_dirtyoff); |
| 741 | bp->b_dirtyend = imax(boff + bytes, | |
| 742 | bp->b_dirtyend); | |
| 984263bc | 743 | } else { |
| a63246d1 MD |
744 | bp->b_dirtyoff = boff; |
| 745 | bp->b_dirtyend = boff + bytes; | |
| 984263bc | 746 | } |
| 984263bc | 747 | } |
| 984263bc MD |
748 | |
| 749 | /* | |
| 750 | * If the lease is non-cachable or IO_SYNC do bwrite(). | |
| 751 | * | |
| 752 | * IO_INVAL appears to be unused. The idea appears to be | |
| 753 | * to turn off caching in this case. Very odd. XXX | |
| a482a28a MD |
754 | * |
| 755 | * If nfs_async is set bawrite() will use an unstable write | |
| 756 | * (build dirty bufs on the server), so we might as well | |
| 757 | * push it out with bawrite(). If nfs_async is not set we | |
| 758 | * use bdwrite() to cache dirty bufs on the client. | |
| 984263bc | 759 | */ |
| a63246d1 | 760 | if (ioflag & IO_SYNC) { |
| 984263bc MD |
761 | if (ioflag & IO_INVAL) |
| 762 | bp->b_flags |= B_NOCACHE; | |
| 62cfda27 | 763 | error = bwrite(bp); |
| 984263bc MD |
764 | if (error) |
| 765 | break; | |
| a63246d1 | 766 | } else if (boff + bytes == biosize && nfs_async) { |
| a482a28a | 767 | bawrite(bp); |
| 984263bc MD |
768 | } else { |
| 769 | bdwrite(bp); | |
| 770 | } | |
| a63246d1 | 771 | } while (uio->uio_resid > 0 && bytes > 0); |
| 984263bc MD |
772 | |
| 773 | if (haverslock) | |
| 2313ec23 | 774 | nfs_rsunlock(np); |
| 984263bc MD |
775 | |
| 776 | return (error); | |
| 777 | } | |
| 778 | ||
| 779 | /* | |
| 780 | * Get an nfs cache block. | |
| 781 | * | |
| 782 | * Allocate a new one if the block isn't currently in the cache | |
| 783 | * and return the block marked busy. If the calling process is | |
| 784 | * interrupted by a signal for an interruptible mount point, return | |
| 785 | * NULL. | |
| 786 | * | |
| 787 | * The caller must carefully deal with the possible B_INVAL state of | |
| edb90c22 | 788 | * the buffer. nfs_startio() clears B_INVAL (and nfs_asyncio() clears it |
| 984263bc MD |
789 | * indirectly), so synchronous reads can be issued without worrying about |
| 790 | * the B_INVAL state. We have to be a little more careful when dealing | |
| 791 | * with writes (see comments in nfs_write()) when extending a file past | |
| 792 | * its EOF. | |
| 793 | */ | |
| 794 | static struct buf * | |
| 54078292 | 795 | nfs_getcacheblk(struct vnode *vp, off_t loffset, int size, struct thread *td) |
| 984263bc | 796 | { |
| 40393ded | 797 | struct buf *bp; |
| 984263bc MD |
798 | struct mount *mp; |
| 799 | struct nfsmount *nmp; | |
| 800 | ||
| 801 | mp = vp->v_mount; | |
| 802 | nmp = VFSTONFS(mp); | |
| 803 | ||
| 804 | if (nmp->nm_flag & NFSMNT_INT) { | |
| 4b958e7b | 805 | bp = getblk(vp, loffset, size, GETBLK_PCATCH, 0); |
| 81b5c339 | 806 | while (bp == NULL) { |
| 60233e58 | 807 | if (nfs_sigintr(nmp, NULL, td)) |
| 81b5c339 | 808 | return (NULL); |
| 54078292 | 809 | bp = getblk(vp, loffset, size, 0, 2 * hz); |
| 984263bc MD |
810 | } |
| 811 | } else { | |
| 54078292 | 812 | bp = getblk(vp, loffset, size, 0, 0); |
| 984263bc MD |
813 | } |
| 814 | ||
| 81b5c339 | 815 | /* |
| 54078292 MD |
816 | * bio2, the 'device' layer. Since BIOs use 64 bit byte offsets |
| 817 | * now, no translation is necessary. | |
| 81b5c339 | 818 | */ |
| 54078292 | 819 | bp->b_bio2.bio_offset = loffset; |
| 984263bc MD |
820 | return (bp); |
| 821 | } | |
| 822 | ||
| 823 | /* | |
| 824 | * Flush and invalidate all dirty buffers. If another process is already | |
| 825 | * doing the flush, just wait for completion. | |
| 826 | */ | |
| 827 | int | |
| 87de5057 | 828 | nfs_vinvalbuf(struct vnode *vp, int flags, int intrflg) |
| 984263bc | 829 | { |
| 40393ded | 830 | struct nfsnode *np = VTONFS(vp); |
| 984263bc MD |
831 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); |
| 832 | int error = 0, slpflag, slptimeo; | |
| 87de5057 | 833 | thread_t td = curthread; |
| 984263bc | 834 | |
| 5fd012e0 | 835 | if (vp->v_flag & VRECLAIMED) |
| 984263bc | 836 | return (0); |
| 984263bc MD |
837 | |
| 838 | if ((nmp->nm_flag & NFSMNT_INT) == 0) | |
| 839 | intrflg = 0; | |
| 840 | if (intrflg) { | |
| 841 | slpflag = PCATCH; | |
| 842 | slptimeo = 2 * hz; | |
| 843 | } else { | |
| 844 | slpflag = 0; | |
| 845 | slptimeo = 0; | |
| 846 | } | |
| 847 | /* | |
| 848 | * First wait for any other process doing a flush to complete. | |
| 849 | */ | |
| 850 | while (np->n_flag & NFLUSHINPROG) { | |
| 851 | np->n_flag |= NFLUSHWANT; | |
| 377d4740 | 852 | error = tsleep((caddr_t)&np->n_flag, 0, "nfsvinval", slptimeo); |
| 87de5057 | 853 | if (error && intrflg && nfs_sigintr(nmp, NULL, td)) |
| 984263bc MD |
854 | return (EINTR); |
| 855 | } | |
| 856 | ||
| 857 | /* | |
| 858 | * Now, flush as required. | |
| 859 | */ | |
| 860 | np->n_flag |= NFLUSHINPROG; | |
| 87de5057 | 861 | error = vinvalbuf(vp, flags, slpflag, 0); |
| 984263bc | 862 | while (error) { |
| 87de5057 | 863 | if (intrflg && nfs_sigintr(nmp, NULL, td)) { |
| 984263bc MD |
864 | np->n_flag &= ~NFLUSHINPROG; |
| 865 | if (np->n_flag & NFLUSHWANT) { | |
| 866 | np->n_flag &= ~NFLUSHWANT; | |
| 867 | wakeup((caddr_t)&np->n_flag); | |
| 868 | } | |
| 869 | return (EINTR); | |
| 870 | } | |
| 87de5057 | 871 | error = vinvalbuf(vp, flags, 0, slptimeo); |
| 984263bc | 872 | } |
| 5a9187cb | 873 | np->n_flag &= ~(NLMODIFIED | NFLUSHINPROG); |
| 984263bc MD |
874 | if (np->n_flag & NFLUSHWANT) { |
| 875 | np->n_flag &= ~NFLUSHWANT; | |
| 876 | wakeup((caddr_t)&np->n_flag); | |
| 877 | } | |
| 878 | return (0); | |
| 879 | } | |
| 880 | ||
| 881 | /* | |
| edb90c22 MD |
882 | * Return true (non-zero) if the txthread and rxthread are operational |
| 883 | * and we do not already have too many not-yet-started BIO's built up. | |
| 984263bc MD |
884 | */ |
| 885 | int | |
| edb90c22 MD |
886 | nfs_asyncok(struct nfsmount *nmp) |
| 887 | { | |
| cc7d050e | 888 | return (nmp->nm_bioqlen < nfs_maxasyncbio && |
| f8565b0f | 889 | nmp->nm_bioqlen < nmp->nm_maxasync_scaled / NFS_ASYSCALE && |
| edb90c22 MD |
890 | nmp->nm_rxstate <= NFSSVC_PENDING && |
| 891 | nmp->nm_txstate <= NFSSVC_PENDING); | |
| 892 | } | |
| 893 | ||
| 894 | /* | |
| 895 | * The read-ahead code calls this to queue a bio to the txthread. | |
| 896 | * | |
| 897 | * We don't touch the bio otherwise... that is, we do not even | |
| 898 | * construct or send the initial rpc. The txthread will do it | |
| 899 | * for us. | |
| f8565b0f MD |
900 | * |
| 901 | * NOTE! nm_bioqlen is not decremented until the request completes, | |
| 902 | * so it does not reflect the number of bio's on bioq. | |
| edb90c22 MD |
903 | */ |
| 904 | void | |
| 905 | nfs_asyncio(struct vnode *vp, struct bio *bio) | |
| 984263bc | 906 | { |
| 81b5c339 | 907 | struct buf *bp = bio->bio_buf; |
| edb90c22 | 908 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); |
| 984263bc | 909 | |
| 81b5c339 | 910 | KKASSERT(vp->v_tag == VT_NFS); |
| 52e1cf57 | 911 | BUF_KERNPROC(bp); |
| c504e38e MD |
912 | |
| 913 | /* | |
| 914 | * Shortcut swap cache (not done automatically because we are not | |
| 915 | * using bread()). | |
| 916 | */ | |
| 917 | if (vn_cache_strategy(vp, bio)) | |
| 918 | return; | |
| 919 | ||
| 52e1cf57 | 920 | bio->bio_driver_info = vp; |
| f8565b0f | 921 | crit_enter(); |
| 52e1cf57 | 922 | TAILQ_INSERT_TAIL(&nmp->nm_bioq, bio, bio_act); |
| f8565b0f MD |
923 | atomic_add_int(&nmp->nm_bioqlen, 1); |
| 924 | crit_exit(); | |
| 52e1cf57 | 925 | nfssvc_iod_writer_wakeup(nmp); |
| 984263bc MD |
926 | } |
| 927 | ||
| 928 | /* | |
| cc7d050e MD |
929 | * nfs_dio() - Execute a BIO operation synchronously. The BIO will be |
| 930 | * completed and its error returned. The caller is responsible | |
| 931 | * for brelse()ing it. ONLY USE FOR BIO_SYNC IOs! Otherwise | |
| 932 | * our error probe will be against an invalid pointer. | |
| edb90c22 | 933 | * |
| cc7d050e | 934 | * nfs_startio()- Execute a BIO operation assynchronously. |
| dadab5e9 | 935 | * |
| cc7d050e MD |
936 | * NOTE: nfs_asyncio() is used to initiate an asynchronous BIO operation, |
| 937 | * which basically just queues it to the txthread. nfs_startio() | |
| 938 | * actually initiates the I/O AFTER it has gotten to the txthread. | |
| ae8e83e6 | 939 | * |
| cc7d050e | 940 | * NOTE: td might be NULL. |
| cb1cf930 MD |
941 | * |
| 942 | * NOTE: Caller has already busied the I/O. | |
| 984263bc | 943 | */ |
| edb90c22 MD |
944 | void |
| 945 | nfs_startio(struct vnode *vp, struct bio *bio, struct thread *td) | |
| 984263bc | 946 | { |
| 81b5c339 | 947 | struct buf *bp = bio->bio_buf; |
| cc7d050e MD |
948 | struct nfsnode *np; |
| 949 | struct nfsmount *nmp; | |
| 950 | ||
| 951 | KKASSERT(vp->v_tag == VT_NFS); | |
| 952 | np = VTONFS(vp); | |
| 953 | nmp = VFSTONFS(vp->v_mount); | |
| 954 | ||
| 955 | /* | |
| 956 | * clear B_ERROR and B_INVAL state prior to initiating the I/O. We | |
| 957 | * do this here so we do not have to do it in all the code that | |
| 958 | * calls us. | |
| 959 | */ | |
| 960 | bp->b_flags &= ~(B_ERROR | B_INVAL); | |
| 961 | ||
| 962 | KASSERT(bp->b_cmd != BUF_CMD_DONE, | |
| 963 | ("nfs_doio: bp %p already marked done!", bp)); | |
| 964 | ||
| 965 | if (bp->b_cmd == BUF_CMD_READ) { | |
| 966 | switch (vp->v_type) { | |
| 967 | case VREG: | |
| 968 | nfsstats.read_bios++; | |
| 969 | nfs_readrpc_bio(vp, bio); | |
| 970 | break; | |
| 971 | case VLNK: | |
| 972 | #if 0 | |
| 973 | bio->bio_offset = 0; | |
| 974 | nfsstats.readlink_bios++; | |
| 975 | nfs_readlinkrpc_bio(vp, bio); | |
| 976 | #else | |
| 977 | nfs_doio(vp, bio, td); | |
| 978 | #endif | |
| 979 | break; | |
| 980 | case VDIR: | |
| 981 | /* | |
| 982 | * NOTE: If nfs_readdirplusrpc_bio() is requested but | |
| 983 | * not supported, it will chain to | |
| 984 | * nfs_readdirrpc_bio(). | |
| 985 | */ | |
| 986 | #if 0 | |
| 987 | nfsstats.readdir_bios++; | |
| 988 | uiop->uio_offset = bio->bio_offset; | |
| 989 | if (nmp->nm_flag & NFSMNT_RDIRPLUS) | |
| 990 | nfs_readdirplusrpc_bio(vp, bio); | |
| 991 | else | |
| 992 | nfs_readdirrpc_bio(vp, bio); | |
| 993 | #else | |
| 994 | nfs_doio(vp, bio, td); | |
| 995 | #endif | |
| 996 | break; | |
| 997 | default: | |
| 998 | kprintf("nfs_doio: type %x unexpected\n",vp->v_type); | |
| 999 | bp->b_flags |= B_ERROR; | |
| 1000 | bp->b_error = EINVAL; | |
| 1001 | biodone(bio); | |
| 1002 | break; | |
| 1003 | } | |
| 1004 | } else { | |
| 1005 | /* | |
| 1006 | * If we only need to commit, try to commit. If this fails | |
| 1007 | * it will chain through to the write. Basically all the logic | |
| 1008 | * in nfs_doio() is replicated. | |
| 1009 | */ | |
| 1010 | KKASSERT(bp->b_cmd == BUF_CMD_WRITE); | |
| 1011 | if (bp->b_flags & B_NEEDCOMMIT) | |
| 1012 | nfs_commitrpc_bio(vp, bio); | |
| 1013 | else | |
| 1014 | nfs_writerpc_bio(vp, bio); | |
| 1015 | } | |
| 1016 | } | |
| 1017 | ||
| 1018 | int | |
| 1019 | nfs_doio(struct vnode *vp, struct bio *bio, struct thread *td) | |
| 1020 | { | |
| 1021 | struct buf *bp = bio->bio_buf; | |
| 984263bc | 1022 | struct uio *uiop; |
| 984263bc MD |
1023 | struct nfsnode *np; |
| 1024 | struct nfsmount *nmp; | |
| cc7d050e MD |
1025 | int error = 0; |
| 1026 | int iomode, must_commit; | |
| 28953d39 | 1027 | size_t n; |
| 984263bc MD |
1028 | struct uio uio; |
| 1029 | struct iovec io; | |
| 1030 | ||
| c504e38e MD |
1031 | #if 0 |
| 1032 | /* | |
| 1033 | * Shortcut swap cache (not done automatically because we are not | |
| 1034 | * using bread()). | |
| 1035 | * | |
| 1036 | * XXX The biowait is a hack until we can figure out how to stop a | |
| 1037 | * biodone chain when a middle element is BIO_SYNC. BIO_SYNC is | |
| 1038 | * set so the bp shouldn't get ripped out from under us. The only | |
| 1039 | * use-cases are fully synchronous I/O cases. | |
| 1040 | * | |
| 1041 | * XXX This is having problems, give up for now. | |
| 1042 | */ | |
| 1043 | if (vn_cache_strategy(vp, bio)) { | |
| 1044 | kprintf("X"); | |
| 1045 | error = biowait(&bio->bio_buf->b_bio1, "nfsrsw"); | |
| 1046 | return (error); | |
| 1047 | } | |
| 1048 | #endif | |
| 1049 | ||
| 81b5c339 | 1050 | KKASSERT(vp->v_tag == VT_NFS); |
| 984263bc MD |
1051 | np = VTONFS(vp); |
| 1052 | nmp = VFSTONFS(vp->v_mount); | |
| 1053 | uiop = &uio; | |
| 1054 | uiop->uio_iov = &io; | |
| 1055 | uiop->uio_iovcnt = 1; | |
| 1056 | uiop->uio_segflg = UIO_SYSSPACE; | |
| dadab5e9 | 1057 | uiop->uio_td = td; |
| 984263bc MD |
1058 | |
| 1059 | /* | |
| 1060 | * clear B_ERROR and B_INVAL state prior to initiating the I/O. We | |
| 1061 | * do this here so we do not have to do it in all the code that | |
| 1062 | * calls us. | |
| 1063 | */ | |
| 1064 | bp->b_flags &= ~(B_ERROR | B_INVAL); | |
| 1065 | ||
| 10f3fee5 MD |
1066 | KASSERT(bp->b_cmd != BUF_CMD_DONE, |
| 1067 | ("nfs_doio: bp %p already marked done!", bp)); | |
| 1068 | ||
| 1069 | if (bp->b_cmd == BUF_CMD_READ) { | |
| e54488bb | 1070 | io.iov_len = uiop->uio_resid = (size_t)bp->b_bcount; |
| 984263bc MD |
1071 | io.iov_base = bp->b_data; |
| 1072 | uiop->uio_rw = UIO_READ; | |
| 1073 | ||
| 1074 | switch (vp->v_type) { | |
| 1075 | case VREG: | |
| 28953d39 MD |
1076 | /* |
| 1077 | * When reading from a regular file zero-fill any residual. | |
| 1078 | * Note that this residual has nothing to do with NFS short | |
| 1079 | * reads, which nfs_readrpc_uio() will handle for us. | |
| 1080 | * | |
| 1081 | * We have to do this because when we are write extending | |
| 1082 | * a file the server may not have the same notion of | |
| 1083 | * filesize as we do. Our BIOs should already be sized | |
| 1084 | * (b_bcount) to account for the file EOF. | |
| 1085 | */ | |
| 984263bc | 1086 | nfsstats.read_bios++; |
| edb90c22 MD |
1087 | uiop->uio_offset = bio->bio_offset; |
| 1088 | error = nfs_readrpc_uio(vp, uiop); | |
| 28953d39 MD |
1089 | if (error == 0 && uiop->uio_resid) { |
| 1090 | n = (size_t)bp->b_bcount - uiop->uio_resid; | |
| 1091 | bzero(bp->b_data + n, bp->b_bcount - n); | |
| 1092 | uiop->uio_resid = 0; | |
| 984263bc | 1093 | } |
| dadab5e9 | 1094 | if (td && td->td_proc && (vp->v_flag & VTEXT) && |
| e07fef60 | 1095 | np->n_mtime != np->n_vattr.va_mtime.tv_sec) { |
| 984263bc | 1096 | uprintf("Process killed due to text file modification\n"); |
| 84204577 | 1097 | ksignal(td->td_proc, SIGKILL); |
| 984263bc MD |
1098 | } |
| 1099 | break; | |
| 1100 | case VLNK: | |
| 81b5c339 | 1101 | uiop->uio_offset = 0; |
| 984263bc | 1102 | nfsstats.readlink_bios++; |
| cc7d050e | 1103 | error = nfs_readlinkrpc_uio(vp, uiop); |
| 984263bc MD |
1104 | break; |
| 1105 | case VDIR: | |
| 1106 | nfsstats.readdir_bios++; | |
| 54078292 | 1107 | uiop->uio_offset = bio->bio_offset; |
| 984263bc | 1108 | if (nmp->nm_flag & NFSMNT_RDIRPLUS) { |
| cc7d050e | 1109 | error = nfs_readdirplusrpc_uio(vp, uiop); |
| 984263bc MD |
1110 | if (error == NFSERR_NOTSUPP) |
| 1111 | nmp->nm_flag &= ~NFSMNT_RDIRPLUS; | |
| 1112 | } | |
| 1113 | if ((nmp->nm_flag & NFSMNT_RDIRPLUS) == 0) | |
| cc7d050e | 1114 | error = nfs_readdirrpc_uio(vp, uiop); |
| 984263bc MD |
1115 | /* |
| 1116 | * end-of-directory sets B_INVAL but does not generate an | |
| 1117 | * error. | |
| 1118 | */ | |
| 1119 | if (error == 0 && uiop->uio_resid == bp->b_bcount) | |
| 1120 | bp->b_flags |= B_INVAL; | |
| 1121 | break; | |
| 1122 | default: | |
| 086c1d7e | 1123 | kprintf("nfs_doio: type %x unexpected\n",vp->v_type); |
| 984263bc MD |
1124 | break; |
| 1125 | }; | |
| 1126 | if (error) { | |
| 1127 | bp->b_flags |= B_ERROR; | |
| 1128 | bp->b_error = error; | |
| 1129 | } | |
| cc7d050e | 1130 | bp->b_resid = uiop->uio_resid; |
| 984263bc MD |
1131 | } else { |
| 1132 | /* | |
| cb1cf930 MD |
1133 | * If we only need to commit, try to commit. |
| 1134 | * | |
| 1135 | * NOTE: The I/O has already been staged for the write and | |
| 1136 | * its pages busied, so b_dirtyoff/end is valid. | |
| 984263bc | 1137 | */ |
| 10f3fee5 | 1138 | KKASSERT(bp->b_cmd == BUF_CMD_WRITE); |
| 984263bc MD |
1139 | if (bp->b_flags & B_NEEDCOMMIT) { |
| 1140 | int retv; | |
| 1141 | off_t off; | |
| 1142 | ||
| 54078292 | 1143 | off = bio->bio_offset + bp->b_dirtyoff; |
| cc7d050e MD |
1144 | retv = nfs_commitrpc_uio(vp, off, |
| 1145 | bp->b_dirtyend - bp->b_dirtyoff, | |
| 1146 | td); | |
| 984263bc MD |
1147 | if (retv == 0) { |
| 1148 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 1149 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 1150 | bp->b_resid = 0; | |
| 81b5c339 | 1151 | biodone(bio); |
| cc7d050e | 1152 | return(0); |
| 984263bc MD |
1153 | } |
| 1154 | if (retv == NFSERR_STALEWRITEVERF) { | |
| 81b5c339 | 1155 | nfs_clearcommit(vp->v_mount); |
| 984263bc MD |
1156 | } |
| 1157 | } | |
| 1158 | ||
| 1159 | /* | |
| 1160 | * Setup for actual write | |
| 1161 | */ | |
| 54078292 MD |
1162 | if (bio->bio_offset + bp->b_dirtyend > np->n_size) |
| 1163 | bp->b_dirtyend = np->n_size - bio->bio_offset; | |
| 984263bc MD |
1164 | |
| 1165 | if (bp->b_dirtyend > bp->b_dirtyoff) { | |
| 1166 | io.iov_len = uiop->uio_resid = bp->b_dirtyend | |
| 1167 | - bp->b_dirtyoff; | |
| 54078292 | 1168 | uiop->uio_offset = bio->bio_offset + bp->b_dirtyoff; |
| 984263bc MD |
1169 | io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; |
| 1170 | uiop->uio_rw = UIO_WRITE; | |
| 1171 | nfsstats.write_bios++; | |
| 1172 | ||
| ae8e83e6 | 1173 | if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0) |
| 984263bc MD |
1174 | iomode = NFSV3WRITE_UNSTABLE; |
| 1175 | else | |
| 1176 | iomode = NFSV3WRITE_FILESYNC; | |
| 1177 | ||
| cc7d050e MD |
1178 | must_commit = 0; |
| 1179 | error = nfs_writerpc_uio(vp, uiop, &iomode, &must_commit); | |
| 984263bc MD |
1180 | |
| 1181 | /* | |
| 8ae5c7e0 MD |
1182 | * We no longer try to use kern/vfs_bio's cluster code to |
| 1183 | * cluster commits, so B_CLUSTEROK is no longer set with | |
| 1184 | * B_NEEDCOMMIT. The problem is that a vfs_busy_pages() | |
| 1185 | * may have to clear B_NEEDCOMMIT if it finds underlying | |
| 1186 | * pages have been redirtied through a memory mapping | |
| 1187 | * and doing this on a clustered bp will probably cause | |
| 1188 | * a panic, plus the flag in the underlying NFS bufs | |
| 1189 | * making up the cluster bp will not be properly cleared. | |
| 984263bc | 1190 | */ |
| 984263bc MD |
1191 | if (!error && iomode == NFSV3WRITE_UNSTABLE) { |
| 1192 | bp->b_flags |= B_NEEDCOMMIT; | |
| 8ae5c7e0 MD |
1193 | #if 0 |
| 1194 | /* XXX do not enable commit clustering */ | |
| 984263bc MD |
1195 | if (bp->b_dirtyoff == 0 |
| 1196 | && bp->b_dirtyend == bp->b_bcount) | |
| 1197 | bp->b_flags |= B_CLUSTEROK; | |
| 8ae5c7e0 | 1198 | #endif |
| 984263bc MD |
1199 | } else { |
| 1200 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 1201 | } | |
| 984263bc MD |
1202 | |
| 1203 | /* | |
| 1204 | * For an interrupted write, the buffer is still valid | |
| 1205 | * and the write hasn't been pushed to the server yet, | |
| 1206 | * so we can't set B_ERROR and report the interruption | |
| ae8e83e6 | 1207 | * by setting B_EINTR. For the async case, B_EINTR |
| 984263bc MD |
1208 | * is not relevant, so the rpc attempt is essentially |
| 1209 | * a noop. For the case of a V3 write rpc not being | |
| 1210 | * committed to stable storage, the block is still | |
| 1211 | * dirty and requires either a commit rpc or another | |
| 1212 | * write rpc with iomode == NFSV3WRITE_FILESYNC before | |
| 1213 | * the block is reused. This is indicated by setting | |
| 1214 | * the B_DELWRI and B_NEEDCOMMIT flags. | |
| 1215 | * | |
| 1216 | * If the buffer is marked B_PAGING, it does not reside on | |
| 1217 | * the vp's paging queues so we cannot call bdirty(). The | |
| 1218 | * bp in this case is not an NFS cache block so we should | |
| 1219 | * be safe. XXX | |
| 1220 | */ | |
| 1221 | if (error == EINTR | |
| 1222 | || (!error && (bp->b_flags & B_NEEDCOMMIT))) { | |
| 165dba55 | 1223 | crit_enter(); |
| 984263bc | 1224 | bp->b_flags &= ~(B_INVAL|B_NOCACHE); |
| 10f3fee5 | 1225 | if ((bp->b_flags & B_PAGING) == 0) |
| 984263bc | 1226 | bdirty(bp); |
| ae8e83e6 | 1227 | if (error) |
| 984263bc | 1228 | bp->b_flags |= B_EINTR; |
| 165dba55 | 1229 | crit_exit(); |
| 984263bc MD |
1230 | } else { |
| 1231 | if (error) { | |
| 1232 | bp->b_flags |= B_ERROR; | |
| 1233 | bp->b_error = np->n_error = error; | |
| 1234 | np->n_flag |= NWRITEERR; | |
| 1235 | } | |
| 1236 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 1237 | } | |
| cc7d050e MD |
1238 | if (must_commit) |
| 1239 | nfs_clearcommit(vp->v_mount); | |
| 1240 | bp->b_resid = uiop->uio_resid; | |
| 984263bc MD |
1241 | } else { |
| 1242 | bp->b_resid = 0; | |
| 984263bc MD |
1243 | } |
| 1244 | } | |
| cc7d050e MD |
1245 | |
| 1246 | /* | |
| 1247 | * I/O was run synchronously, biodone() it and calculate the | |
| 1248 | * error to return. | |
| 1249 | */ | |
| 81b5c339 | 1250 | biodone(bio); |
| cc7d050e MD |
1251 | KKASSERT(bp->b_cmd == BUF_CMD_DONE); |
| 1252 | if (bp->b_flags & B_EINTR) | |
| 1253 | return (EINTR); | |
| 1254 | if (bp->b_flags & B_ERROR) | |
| 1255 | return (bp->b_error ? bp->b_error : EIO); | |
| 1256 | return (0); | |
| 984263bc MD |
1257 | } |
| 1258 | ||
| 1259 | /* | |
| 8452310f MD |
1260 | * Handle all truncation, write-extend, and ftruncate()-extend operations |
| 1261 | * on the NFS lcient side. | |
| cb1cf930 | 1262 | * |
| 8452310f MD |
1263 | * We use the new API in kern/vfs_vm.c to perform these operations in a |
| 1264 | * VM-friendly way. With this API VM pages are properly zerod and pages | |
| 1265 | * still mapped into the buffer straddling EOF are not invalidated. | |
| 984263bc | 1266 | */ |
| 8452310f MD |
1267 | int |
| 1268 | nfs_meta_setsize(struct vnode *vp, struct thread *td, off_t nsize, int trivial) | |
| 984263bc MD |
1269 | { |
| 1270 | struct nfsnode *np = VTONFS(vp); | |
| 8452310f | 1271 | off_t osize; |
| 984263bc | 1272 | int biosize = vp->v_mount->mnt_stat.f_iosize; |
| 8452310f | 1273 | int error; |
| 984263bc | 1274 | |
| 8452310f | 1275 | osize = np->n_size; |
| 984263bc MD |
1276 | np->n_size = nsize; |
| 1277 | ||
| a63246d1 | 1278 | if (nsize < osize) { |
| 3bb7eedb | 1279 | error = nvtruncbuf(vp, nsize, biosize, -1); |
| a63246d1 | 1280 | } else { |
| 8452310f | 1281 | error = nvextendbuf(vp, osize, nsize, |
| 3bb7eedb MD |
1282 | biosize, biosize, -1, -1, |
| 1283 | trivial); | |
| 984263bc | 1284 | } |
| 8452310f | 1285 | return(error); |
| 984263bc MD |
1286 | } |
| 1287 | ||
| ae8e83e6 MD |
1288 | /* |
| 1289 | * Synchronous completion for nfs_doio. Call bpdone() with elseit=FALSE. | |
| 1290 | * Caller is responsible for brelse()'ing the bp. | |
| 1291 | */ | |
| 1292 | static void | |
| 1293 | nfsiodone_sync(struct bio *bio) | |
| 1294 | { | |
| 1295 | bio->bio_flags = 0; | |
| 1296 | bpdone(bio->bio_buf, 0); | |
| 1297 | } | |
| edb90c22 MD |
1298 | |
| 1299 | /* | |
| edb90c22 MD |
1300 | * nfs read rpc - BIO version |
| 1301 | */ | |
| edb90c22 MD |
1302 | void |
| 1303 | nfs_readrpc_bio(struct vnode *vp, struct bio *bio) | |
| 1304 | { | |
| 1305 | struct buf *bp = bio->bio_buf; | |
| 1306 | u_int32_t *tl; | |
| 1307 | struct nfsmount *nmp; | |
| 1308 | int error = 0, len, tsiz; | |
| 1309 | struct nfsm_info *info; | |
| 1310 | ||
| 1311 | info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK); | |
| 1312 | info->mrep = NULL; | |
| 1313 | info->v3 = NFS_ISV3(vp); | |
| 1314 | ||
| 1315 | nmp = VFSTONFS(vp->v_mount); | |
| 1316 | tsiz = bp->b_bcount; | |
| cc7d050e | 1317 | KKASSERT(tsiz <= nmp->nm_rsize); |
| edb90c22 MD |
1318 | if (bio->bio_offset + tsiz > nmp->nm_maxfilesize) { |
| 1319 | error = EFBIG; | |
| 1320 | goto nfsmout; | |
| 1321 | } | |
| 1322 | nfsstats.rpccnt[NFSPROC_READ]++; | |
| cc7d050e | 1323 | len = tsiz; |
| edb90c22 MD |
1324 | nfsm_reqhead(info, vp, NFSPROC_READ, |
| 1325 | NFSX_FH(info->v3) + NFSX_UNSIGNED * 3); | |
| 1326 | ERROROUT(nfsm_fhtom(info, vp)); | |
| 1327 | tl = nfsm_build(info, NFSX_UNSIGNED * 3); | |
| 1328 | if (info->v3) { | |
| 1329 | txdr_hyper(bio->bio_offset, tl); | |
| 1330 | *(tl + 2) = txdr_unsigned(len); | |
| 1331 | } else { | |
| 1332 | *tl++ = txdr_unsigned(bio->bio_offset); | |
| 1333 | *tl++ = txdr_unsigned(len); | |
| 1334 | *tl = 0; | |
| 1335 | } | |
| 1336 | info->bio = bio; | |
| 1337 | info->done = nfs_readrpc_bio_done; | |
| 1338 | nfsm_request_bio(info, vp, NFSPROC_READ, NULL, | |
| 1339 | nfs_vpcred(vp, ND_READ)); | |
| 1340 | return; | |
| 1341 | nfsmout: | |
| 1342 | kfree(info, M_NFSREQ); | |
| 1343 | bp->b_error = error; | |
| 1344 | bp->b_flags |= B_ERROR; | |
| 1345 | biodone(bio); | |
| 1346 | } | |
| 1347 | ||
| 1348 | static void | |
| 1349 | nfs_readrpc_bio_done(nfsm_info_t info) | |
| 1350 | { | |
| 1351 | struct nfsmount *nmp = VFSTONFS(info->vp->v_mount); | |
| 1352 | struct bio *bio = info->bio; | |
| 1353 | struct buf *bp = bio->bio_buf; | |
| 1354 | u_int32_t *tl; | |
| 1355 | int attrflag; | |
| 1356 | int retlen; | |
| 1357 | int eof; | |
| 1358 | int error = 0; | |
| 1359 | ||
| 1360 | KKASSERT(info->state == NFSM_STATE_DONE); | |
| 1361 | ||
| c6b43e93 | 1362 | lwkt_gettoken(&nmp->nm_token); |
| 77912481 | 1363 | |
| edb90c22 MD |
1364 | if (info->v3) { |
| 1365 | ERROROUT(nfsm_postop_attr(info, info->vp, &attrflag, | |
| 1366 | NFS_LATTR_NOSHRINK)); | |
| 1367 | NULLOUT(tl = nfsm_dissect(info, 2 * NFSX_UNSIGNED)); | |
| 1368 | eof = fxdr_unsigned(int, *(tl + 1)); | |
| 1369 | } else { | |
| 1370 | ERROROUT(nfsm_loadattr(info, info->vp, NULL)); | |
| 1371 | eof = 0; | |
| 1372 | } | |
| 1373 | NEGATIVEOUT(retlen = nfsm_strsiz(info, nmp->nm_rsize)); | |
| 1374 | ERROROUT(nfsm_mtobio(info, bio, retlen)); | |
| 1375 | m_freem(info->mrep); | |
| 1376 | info->mrep = NULL; | |
| 1377 | ||
| 1378 | /* | |
| 28953d39 MD |
1379 | * No error occured, if retlen is less then bcount and no EOF |
| 1380 | * and NFSv3 a zero-fill short read occured. | |
| 1381 | * | |
| 1382 | * For NFSv2 a short-read indicates EOF. | |
| edb90c22 | 1383 | */ |
| 28953d39 | 1384 | if (retlen < bp->b_bcount && info->v3 && eof == 0) { |
| edb90c22 | 1385 | bzero(bp->b_data + retlen, bp->b_bcount - retlen); |
| 28953d39 | 1386 | retlen = bp->b_bcount; |
| edb90c22 | 1387 | } |
| 28953d39 MD |
1388 | |
| 1389 | /* | |
| 1390 | * If we hit an EOF we still zero-fill, but return the expected | |
| 1391 | * b_resid anyway. This should normally not occur since async | |
| 1392 | * BIOs are not used for read-before-write case. Races against | |
| 1393 | * the server can cause it though and we don't want to leave | |
| 1394 | * garbage in the buffer. | |
| 1395 | */ | |
| 1396 | if (retlen < bp->b_bcount) { | |
| 1397 | bzero(bp->b_data + retlen, bp->b_bcount - retlen); | |
| edb90c22 | 1398 | } |
| 28953d39 MD |
1399 | bp->b_resid = 0; |
| 1400 | /* bp->b_resid = bp->b_bcount - retlen; */ | |
| edb90c22 | 1401 | nfsmout: |
| c6b43e93 | 1402 | lwkt_reltoken(&nmp->nm_token); |
| f8565b0f | 1403 | kfree(info, M_NFSREQ); |
| edb90c22 MD |
1404 | if (error) { |
| 1405 | bp->b_error = error; | |
| 1406 | bp->b_flags |= B_ERROR; | |
| 1407 | } | |
| 1408 | biodone(bio); | |
| 1409 | } | |
| 1410 | ||
| edb90c22 MD |
1411 | /* |
| 1412 | * nfs write call - BIO version | |
| cb1cf930 MD |
1413 | * |
| 1414 | * NOTE: Caller has already busied the I/O. | |
| edb90c22 | 1415 | */ |
| cc7d050e MD |
1416 | void |
| 1417 | nfs_writerpc_bio(struct vnode *vp, struct bio *bio) | |
| edb90c22 | 1418 | { |
| edb90c22 | 1419 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); |
| cc7d050e MD |
1420 | struct nfsnode *np = VTONFS(vp); |
| 1421 | struct buf *bp = bio->bio_buf; | |
| 1422 | u_int32_t *tl; | |
| 1423 | int len; | |
| 1424 | int iomode; | |
| 1425 | int error = 0; | |
| 1426 | struct nfsm_info *info; | |
| 1427 | off_t offset; | |
| edb90c22 | 1428 | |
| cc7d050e MD |
1429 | /* |
| 1430 | * Setup for actual write. Just clean up the bio if there | |
| cb1cf930 MD |
1431 | * is nothing to do. b_dirtyoff/end have already been staged |
| 1432 | * by the bp's pages getting busied. | |
| cc7d050e MD |
1433 | */ |
| 1434 | if (bio->bio_offset + bp->b_dirtyend > np->n_size) | |
| 1435 | bp->b_dirtyend = np->n_size - bio->bio_offset; | |
| edb90c22 | 1436 | |
| cc7d050e MD |
1437 | if (bp->b_dirtyend <= bp->b_dirtyoff) { |
| 1438 | bp->b_resid = 0; | |
| 1439 | biodone(bio); | |
| 1440 | return; | |
| 1441 | } | |
| 1442 | len = bp->b_dirtyend - bp->b_dirtyoff; | |
| 1443 | offset = bio->bio_offset + bp->b_dirtyoff; | |
| 1444 | if (offset + len > nmp->nm_maxfilesize) { | |
| 1445 | bp->b_flags |= B_ERROR; | |
| 1446 | bp->b_error = EFBIG; | |
| 1447 | biodone(bio); | |
| 1448 | return; | |
| 1449 | } | |
| 1450 | bp->b_resid = len; | |
| 1451 | nfsstats.write_bios++; | |
| 1452 | ||
| 1453 | info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK); | |
| 1454 | info->mrep = NULL; | |
| 1455 | info->v3 = NFS_ISV3(vp); | |
| 1456 | info->info_writerpc.must_commit = 0; | |
| 1457 | if ((bp->b_flags & (B_NEEDCOMMIT | B_NOCACHE | B_CLUSTER)) == 0) | |
| 1458 | iomode = NFSV3WRITE_UNSTABLE; | |
| 1459 | else | |
| 1460 | iomode = NFSV3WRITE_FILESYNC; | |
| edb90c22 | 1461 | |
| cc7d050e MD |
1462 | KKASSERT(len <= nmp->nm_wsize); |
| 1463 | ||
| 1464 | nfsstats.rpccnt[NFSPROC_WRITE]++; | |
| 1465 | nfsm_reqhead(info, vp, NFSPROC_WRITE, | |
| 1466 | NFSX_FH(info->v3) + 5 * NFSX_UNSIGNED + nfsm_rndup(len)); | |
| 1467 | ERROROUT(nfsm_fhtom(info, vp)); | |
| 1468 | if (info->v3) { | |
| 1469 | tl = nfsm_build(info, 5 * NFSX_UNSIGNED); | |
| 1470 | txdr_hyper(offset, tl); | |
| 1471 | tl += 2; | |
| 1472 | *tl++ = txdr_unsigned(len); | |
| 1473 | *tl++ = txdr_unsigned(iomode); | |
| 1474 | *tl = txdr_unsigned(len); | |
| 1475 | } else { | |
| 1476 | u_int32_t x; | |
| 1477 | ||
| 1478 | tl = nfsm_build(info, 4 * NFSX_UNSIGNED); | |
| 1479 | /* Set both "begin" and "current" to non-garbage. */ | |
| 1480 | x = txdr_unsigned((u_int32_t)offset); | |
| 1481 | *tl++ = x; /* "begin offset" */ | |
| 1482 | *tl++ = x; /* "current offset" */ | |
| 1483 | x = txdr_unsigned(len); | |
| 1484 | *tl++ = x; /* total to this offset */ | |
| 1485 | *tl = x; /* size of this write */ | |
| 1486 | } | |
| 1487 | ERROROUT(nfsm_biotom(info, bio, bp->b_dirtyoff, len)); | |
| 1488 | info->bio = bio; | |
| 1489 | info->done = nfs_writerpc_bio_done; | |
| 1490 | nfsm_request_bio(info, vp, NFSPROC_WRITE, NULL, | |
| 1491 | nfs_vpcred(vp, ND_WRITE)); | |
| 1492 | return; | |
| 1493 | nfsmout: | |
| 1494 | kfree(info, M_NFSREQ); | |
| 1495 | bp->b_error = error; | |
| 1496 | bp->b_flags |= B_ERROR; | |
| 1497 | biodone(bio); | |
| 1498 | } | |
| 1499 | ||
| 1500 | static void | |
| 1501 | nfs_writerpc_bio_done(nfsm_info_t info) | |
| 1502 | { | |
| 1503 | struct nfsmount *nmp = VFSTONFS(info->vp->v_mount); | |
| 1504 | struct nfsnode *np = VTONFS(info->vp); | |
| 1505 | struct bio *bio = info->bio; | |
| 1506 | struct buf *bp = bio->bio_buf; | |
| 1507 | int wccflag = NFSV3_WCCRATTR; | |
| 1508 | int iomode = NFSV3WRITE_FILESYNC; | |
| 1509 | int commit; | |
| 1510 | int rlen; | |
| 1511 | int error; | |
| 1512 | int len = bp->b_resid; /* b_resid was set to shortened length */ | |
| 1513 | u_int32_t *tl; | |
| 1514 | ||
| c6b43e93 | 1515 | lwkt_gettoken(&nmp->nm_token); |
| 77912481 | 1516 | |
| cc7d050e MD |
1517 | if (info->v3) { |
| 1518 | /* | |
| 1519 | * The write RPC returns a before and after mtime. The | |
| 1520 | * nfsm_wcc_data() macro checks the before n_mtime | |
| 1521 | * against the before time and stores the after time | |
| 1522 | * in the nfsnode's cached vattr and n_mtime field. | |
| 1523 | * The NRMODIFIED bit will be set if the before | |
| 1524 | * time did not match the original mtime. | |
| 1525 | */ | |
| 1526 | wccflag = NFSV3_WCCCHK; | |
| 1527 | ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag)); | |
| 1528 | if (error == 0) { | |
| 1529 | NULLOUT(tl = nfsm_dissect(info, 2 * NFSX_UNSIGNED + NFSX_V3WRITEVERF)); | |
| 1530 | rlen = fxdr_unsigned(int, *tl++); | |
| 1531 | if (rlen == 0) { | |
| 1532 | error = NFSERR_IO; | |
| 1533 | m_freem(info->mrep); | |
| 1534 | info->mrep = NULL; | |
| 1535 | goto nfsmout; | |
| 1536 | } else if (rlen < len) { | |
| 1537 | #if 0 | |
| edb90c22 | 1538 | /* |
| cc7d050e | 1539 | * XXX what do we do here? |
| edb90c22 | 1540 | */ |
| cc7d050e MD |
1541 | backup = len - rlen; |
| 1542 | uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base - backup; | |
| 1543 | uiop->uio_iov->iov_len += backup; | |
| 1544 | uiop->uio_offset -= backup; | |
| 1545 | uiop->uio_resid += backup; | |
| 1546 | len = rlen; | |
| 1547 | #endif | |
| 1548 | } | |
| 1549 | commit = fxdr_unsigned(int, *tl++); | |
| 1550 | ||
| 1551 | /* | |
| 1552 | * Return the lowest committment level | |
| 1553 | * obtained by any of the RPCs. | |
| 1554 | */ | |
| 1555 | if (iomode == NFSV3WRITE_FILESYNC) | |
| 1556 | iomode = commit; | |
| 1557 | else if (iomode == NFSV3WRITE_DATASYNC && | |
| 1558 | commit == NFSV3WRITE_UNSTABLE) | |
| 1559 | iomode = commit; | |
| 1560 | if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0){ | |
| 1561 | bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); | |
| 1562 | nmp->nm_state |= NFSSTA_HASWRITEVERF; | |
| 1563 | } else if (bcmp(tl, nmp->nm_verf, NFSX_V3WRITEVERF)) { | |
| 1564 | info->info_writerpc.must_commit = 1; | |
| 1565 | bcopy(tl, (caddr_t)nmp->nm_verf, NFSX_V3WRITEVERF); | |
| edb90c22 | 1566 | } |
| edb90c22 | 1567 | } |
| cc7d050e MD |
1568 | } else { |
| 1569 | ERROROUT(nfsm_loadattr(info, info->vp, NULL)); | |
| 1570 | } | |
| 1571 | m_freem(info->mrep); | |
| 1572 | info->mrep = NULL; | |
| 1573 | len = 0; | |
| 1574 | nfsmout: | |
| 1575 | if (info->vp->v_mount->mnt_flag & MNT_ASYNC) | |
| 1576 | iomode = NFSV3WRITE_FILESYNC; | |
| 1577 | bp->b_resid = len; | |
| 1578 | ||
| 1579 | /* | |
| 1580 | * End of RPC. Now clean up the bp. | |
| 1581 | * | |
| 8ae5c7e0 MD |
1582 | * We no longer enable write clustering for commit operations, |
| 1583 | * See around line 1157 for a more detailed comment. | |
| cc7d050e MD |
1584 | */ |
| 1585 | if (!error && iomode == NFSV3WRITE_UNSTABLE) { | |
| 1586 | bp->b_flags |= B_NEEDCOMMIT; | |
| 8ae5c7e0 MD |
1587 | #if 0 |
| 1588 | /* XXX do not enable commit clustering */ | |
| cc7d050e MD |
1589 | if (bp->b_dirtyoff == 0 && bp->b_dirtyend == bp->b_bcount) |
| 1590 | bp->b_flags |= B_CLUSTEROK; | |
| 8ae5c7e0 | 1591 | #endif |
| cc7d050e MD |
1592 | } else { |
| 1593 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 1594 | } | |
| 1595 | ||
| 1596 | /* | |
| 1597 | * For an interrupted write, the buffer is still valid | |
| 1598 | * and the write hasn't been pushed to the server yet, | |
| 1599 | * so we can't set B_ERROR and report the interruption | |
| 1600 | * by setting B_EINTR. For the async case, B_EINTR | |
| 1601 | * is not relevant, so the rpc attempt is essentially | |
| 1602 | * a noop. For the case of a V3 write rpc not being | |
| 1603 | * committed to stable storage, the block is still | |
| 1604 | * dirty and requires either a commit rpc or another | |
| 1605 | * write rpc with iomode == NFSV3WRITE_FILESYNC before | |
| 1606 | * the block is reused. This is indicated by setting | |
| 1607 | * the B_DELWRI and B_NEEDCOMMIT flags. | |
| 1608 | * | |
| 1609 | * If the buffer is marked B_PAGING, it does not reside on | |
| 1610 | * the vp's paging queues so we cannot call bdirty(). The | |
| 1611 | * bp in this case is not an NFS cache block so we should | |
| 1612 | * be safe. XXX | |
| 1613 | */ | |
| 1614 | if (error == EINTR || (!error && (bp->b_flags & B_NEEDCOMMIT))) { | |
| 1615 | crit_enter(); | |
| 1616 | bp->b_flags &= ~(B_INVAL|B_NOCACHE); | |
| 1617 | if ((bp->b_flags & B_PAGING) == 0) | |
| 1618 | bdirty(bp); | |
| edb90c22 | 1619 | if (error) |
| cc7d050e MD |
1620 | bp->b_flags |= B_EINTR; |
| 1621 | crit_exit(); | |
| 1622 | } else { | |
| 1623 | if (error) { | |
| 1624 | bp->b_flags |= B_ERROR; | |
| 1625 | bp->b_error = np->n_error = error; | |
| 1626 | np->n_flag |= NWRITEERR; | |
| 1627 | } | |
| 1628 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 1629 | } | |
| 1630 | if (info->info_writerpc.must_commit) | |
| 1631 | nfs_clearcommit(info->vp->v_mount); | |
| c6b43e93 MD |
1632 | lwkt_reltoken(&nmp->nm_token); |
| 1633 | ||
| cc7d050e MD |
1634 | kfree(info, M_NFSREQ); |
| 1635 | if (error) { | |
| 1636 | bp->b_flags |= B_ERROR; | |
| 1637 | bp->b_error = error; | |
| 1638 | } | |
| 1639 | biodone(bio); | |
| 1640 | } | |
| 1641 | ||
| 1642 | /* | |
| 1643 | * Nfs Version 3 commit rpc - BIO version | |
| 1644 | * | |
| 1645 | * This function issues the commit rpc and will chain to a write | |
| 1646 | * rpc if necessary. | |
| 1647 | */ | |
| 1648 | void | |
| 1649 | nfs_commitrpc_bio(struct vnode *vp, struct bio *bio) | |
| 1650 | { | |
| 1651 | struct nfsmount *nmp = VFSTONFS(vp->v_mount); | |
| 1652 | struct buf *bp = bio->bio_buf; | |
| 1653 | struct nfsm_info *info; | |
| 1654 | int error = 0; | |
| 1655 | u_int32_t *tl; | |
| 1656 | ||
| 1657 | if ((nmp->nm_state & NFSSTA_HASWRITEVERF) == 0) { | |
| 1658 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 1659 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 1660 | bp->b_resid = 0; | |
| 1661 | biodone(bio); | |
| 1662 | return; | |
| 1663 | } | |
| 1664 | ||
| 1665 | info = kmalloc(sizeof(*info), M_NFSREQ, M_WAITOK); | |
| 1666 | info->mrep = NULL; | |
| 1667 | info->v3 = 1; | |
| 1668 | ||
| 1669 | nfsstats.rpccnt[NFSPROC_COMMIT]++; | |
| 1670 | nfsm_reqhead(info, vp, NFSPROC_COMMIT, NFSX_FH(1)); | |
| 1671 | ERROROUT(nfsm_fhtom(info, vp)); | |
| 1672 | tl = nfsm_build(info, 3 * NFSX_UNSIGNED); | |
| 1673 | txdr_hyper(bio->bio_offset + bp->b_dirtyoff, tl); | |
| 1674 | tl += 2; | |
| 1675 | *tl = txdr_unsigned(bp->b_dirtyend - bp->b_dirtyoff); | |
| 1676 | info->bio = bio; | |
| 1677 | info->done = nfs_commitrpc_bio_done; | |
| 1678 | nfsm_request_bio(info, vp, NFSPROC_COMMIT, NULL, | |
| 1679 | nfs_vpcred(vp, ND_WRITE)); | |
| 1680 | return; | |
| 1681 | nfsmout: | |
| 1682 | /* | |
| 1683 | * Chain to write RPC on (early) error | |
| 1684 | */ | |
| 1685 | kfree(info, M_NFSREQ); | |
| 1686 | nfs_writerpc_bio(vp, bio); | |
| 1687 | } | |
| 1688 | ||
| 1689 | static void | |
| 1690 | nfs_commitrpc_bio_done(nfsm_info_t info) | |
| 1691 | { | |
| 1692 | struct nfsmount *nmp = VFSTONFS(info->vp->v_mount); | |
| 1693 | struct bio *bio = info->bio; | |
| 1694 | struct buf *bp = bio->bio_buf; | |
| 1695 | u_int32_t *tl; | |
| 1696 | int wccflag = NFSV3_WCCRATTR; | |
| 1697 | int error = 0; | |
| 1698 | ||
| c6b43e93 | 1699 | lwkt_gettoken(&nmp->nm_token); |
| 77912481 | 1700 | |
| cc7d050e MD |
1701 | ERROROUT(nfsm_wcc_data(info, info->vp, &wccflag)); |
| 1702 | if (error == 0) { | |
| 1703 | NULLOUT(tl = nfsm_dissect(info, NFSX_V3WRITEVERF)); | |
| 1704 | if (bcmp(nmp->nm_verf, tl, NFSX_V3WRITEVERF)) { | |
| 1705 | bcopy(tl, nmp->nm_verf, NFSX_V3WRITEVERF); | |
| 1706 | error = NFSERR_STALEWRITEVERF; | |
| 1707 | } | |
| edb90c22 | 1708 | } |
| cc7d050e MD |
1709 | m_freem(info->mrep); |
| 1710 | info->mrep = NULL; | |
| 1711 | ||
| 1712 | /* | |
| 1713 | * On completion we must chain to a write bio if an | |
| 1714 | * error occurred. | |
| 1715 | */ | |
| edb90c22 | 1716 | nfsmout: |
| cc7d050e MD |
1717 | kfree(info, M_NFSREQ); |
| 1718 | if (error == 0) { | |
| 1719 | bp->b_dirtyoff = bp->b_dirtyend = 0; | |
| 1720 | bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); | |
| 1721 | bp->b_resid = 0; | |
| 1722 | biodone(bio); | |
| 1723 | } else { | |
| cc7d050e MD |
1724 | nfs_writerpc_bio(info->vp, bio); |
| 1725 | } | |
| c6b43e93 | 1726 | lwkt_reltoken(&nmp->nm_token); |
| edb90c22 MD |
1727 | } |
| 1728 |