1 /* $NetBSD: puffs_vnops.c,v 1.154 2011/07/04 08:07:30 manu Exp $ */
4 * Copyright (c) 2005, 2006, 2007 Antti Kantee. All Rights Reserved.
6 * Development of this software was supported by the
7 * Google Summer of Code program and the Ulla Tuominen Foundation.
8 * The Google SoC project was mentored by Bill Studenmund.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
20 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 #include <sys/param.h>
34 #include <sys/lockf.h>
35 #include <sys/malloc.h>
36 #include <sys/mount.h>
37 #include <sys/namei.h>
38 #include <sys/vnode.h>
40 #include <sys/thread2.h>
42 #include <vfs/puffs/puffs_msgif.h>
43 #include <vfs/puffs/puffs_sys.h>
/*
 * RWARGS(): initialize the fields common to read and write vnode
 * messages (ioflag, resid, offset) and convert the in-kernel
 * credentials into their puffs wire representation.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement; the original expanded to four separate statements, which
 * misbehaves inside an unbraced if/else (CERT PRE10-C).  All visible
 * call sites invoke it as `RWARGS(...);`, so this is call-compatible.
 */
#define RWARGS(cont, iofl, move, offset, creds)				\
do {									\
	(cont)->pvnr_ioflag = (iofl);					\
	(cont)->pvnr_resid = (move);					\
	(cont)->pvnr_offset = (offset);					\
	puffs_credcvt(&(cont)->pvnr_cred, (creds));			\
} while (0)
/*
 * puffs_directread: uncached read from a regular puffs vnode.
 * Sends explicit PUFFS_VN_READ messages to the userspace file server
 * and copies the returned data into the caller's uio, looping until
 * the uio is exhausted, the server reports a short read, or an error
 * occurs.
 *
 * NOTE(review): several lines of this function are elided in this view
 * (early-return bodies, enqueue arguments, loop-exit paths); comments
 * below describe only the visible code.
 */
52 puffs_directread(struct vnode *vp, struct uio *uio, int ioflag,
55 	PUFFS_MSG_VARS(vn, read);
56 	struct puffs_mount *pmp = MPTOPUFFSMP(vp->v_mount);
57 	size_t tomove, argsize;
60 	KKASSERT(vp->v_type == VREG);
/* Reject negative offsets and zero-length transfers up front. */
62 	if (uio->uio_offset < 0)
64 	if (uio->uio_resid == 0)
71 	if (uio->uio_resid == 0)
73 	if (uio->uio_offset < 0)
77 	 * in case it's not a regular file or we're operating
78 	 * uncached, do read in the old-fashioned style,
79 	 * i.e. explicit read operations
/*
 * Allocate a single message buffer sized for the largest chunk the
 * mount allows (PUFFS_TOMOVE) and reuse it for every iteration.
 */
82 	tomove = PUFFS_TOMOVE(uio->uio_resid, pmp);
83 	argsize = sizeof(struct puffs_vnmsg_read);
84 	puffs_msgmem_alloc(argsize + tomove, &park_read,
85 	    (void *)&read_msg, 1);
88 	while (uio->uio_resid > 0) {
89 		tomove = PUFFS_TOMOVE(uio->uio_resid, pmp);
90 		memset(read_msg, 0, argsize); /* XXX: touser KASSERT */
91 		RWARGS(read_msg, ioflag, tomove,
92 		    uio->uio_offset, cred);
93 		puffs_msg_setinfo(park_read, PUFFSOP_VN,
94 		    PUFFS_VN_READ, VPTOPNC(vp));
/* Tell transport how much payload may come back from userspace. */
95 		puffs_msg_setdelta(park_read, tomove);
97 		PUFFS_MSG_ENQUEUEWAIT2(pmp, park_read, vp->v_data,
99 		error = checkerr(pmp, error, __func__);
/*
 * A server must never claim more residual than was requested; treat
 * that as a protocol error and notify the file server.
 */
103 		if (read_msg->pvnr_resid > tomove) {
104 			puffs_senderr(pmp, PUFFS_ERR_READ,
105 			    E2BIG, "resid grew", VPTOPNC(vp));
/* Copy only the bytes the server actually produced. */
110 		error = uiomove(read_msg->pvnr_data,
111 		    tomove - read_msg->pvnr_resid, uio);
114 		 * in case the file is out of juice, resid from
115 		 * userspace is != 0.  and the error-case is
/* Non-zero server resid means EOF (or error): stop looping. */
118 		if (error || read_msg->pvnr_resid)
122 	puffs_msgmem_release(park_read);
/*
 * puffs_directwrite: uncached write to a regular puffs vnode.
 * Copies data from the caller's uio into a PUFFS_VN_WRITE message and
 * ships it to the userspace file server, one chunk per iteration.
 *
 * NOTE(review): lines are elided in this view (early-return bodies,
 * enqueue arguments, loop exits); comments describe visible code only.
 */
128 puffs_directwrite(struct vnode *vp, struct uio *uio, int ioflag,
131 	PUFFS_MSG_VARS(vn, write);
132 	struct puffs_mount *pmp = MPTOPUFFSMP(vp->v_mount);
133 	size_t tomove, argsize;
136 	KKASSERT(vp->v_type == VREG);
/* Reject negative offsets and zero-length transfers up front. */
138 	if (uio->uio_offset < 0)
140 	if (uio->uio_resid == 0)
/*
 * uio_resid only shrinks, so sizing the message buffer once for the
 * first (largest) chunk is sufficient for all iterations.
 */
146 	/* tomove is non-increasing */
147 	tomove = PUFFS_TOMOVE(uio->uio_resid, pmp);
148 	argsize = sizeof(struct puffs_vnmsg_write) + tomove;
149 	puffs_msgmem_alloc(argsize, &park_write, (void *)&write_msg,1);
151 	while (uio->uio_resid > 0) {
152 		/* move data to buffer */
153 		tomove = PUFFS_TOMOVE(uio->uio_resid, pmp);
154 		memset(write_msg, 0, argsize); /* XXX: touser KASSERT */
155 		RWARGS(write_msg, ioflag, tomove,
156 		    uio->uio_offset, cred);
157 		error = uiomove(write_msg->pvnr_data, tomove, uio);
161 		/* move buffer to userspace */
162 		puffs_msg_setinfo(park_write, PUFFSOP_VN,
163 		    PUFFS_VN_WRITE, VPTOPNC(vp));
164 		PUFFS_MSG_ENQUEUEWAIT2(pmp, park_write, vp->v_data,
166 		error = checkerr(pmp, error, __func__);
/* Server may not report more residual than was sent: protocol error. */
170 		if (write_msg->pvnr_resid > tomove) {
171 			puffs_senderr(pmp, PUFFS_ERR_WRITE,
172 			    E2BIG, "resid grew", VPTOPNC(vp));
/*
 * With the page cache in use the cached file size must already cover
 * everything written so far.
 */
177 		if (PUFFS_USE_PAGECACHE(pmp))
178 			KKASSERT(vp->v_filesize >= uio->uio_offset);
/* A partial write from the file server is treated as fatal. */
180 		/* didn't move everything?  bad userspace.  bail */
181 		if (write_msg->pvnr_resid != 0) {
186 	puffs_msgmem_release(park_write);
/*
 * puffs_iodone: BIO completion callback used by puffs_bioread/biowrite;
 * marks the underlying buffer done with no error.
 * NOTE(review): surrounding lines (signature remainder, braces) are
 * elided in this view.
 */
192 puffs_iodone(struct bio *bio)
195 	bpdone(bio->bio_buf, 0);
/*
 * puffs_bioread: read from a regular puffs vnode through the buffer
 * cache.  Maps the uio offset to logical buffer-cache blocks, issues a
 * synchronous read via puffs_doio() for blocks not already valid
 * (B_CACHE clear), and copies data out with uiomove().
 *
 * NOTE(review): lines are elided in this view (error-return bodies,
 * brelse paths, loop head); comments describe visible code only.
 */
199 puffs_bioread(struct vnode *vp, struct uio *uio, int ioflag,
202 	int biosize = vp->v_mount->mnt_stat.f_iosize;
205 	off_t lbn, loffset, fsize;
210 	KKASSERT(uio->uio_rw == UIO_READ);
211 	KKASSERT(vp->v_type == VREG);
213 	if (uio->uio_offset < 0)
215 	if (uio->uio_resid == 0)
219 	 * Cache consistency can only be maintained approximately.
221 	 * GETATTR is called to synchronize the file size.
223 	 * NOTE: In the normal case the attribute cache is not
224 	 * cleared which means GETATTR may use cached data and
225 	 * not immediately detect changes made on the server.
228 	error = VOP_GETATTR(vp, &vattr);
233 	 * Loop until uio exhausted or we hit EOF
/*
 * Per-iteration geometry: logical block number, offset within the
 * block, block start offset, and the current cached file size.
 */
238 		lbn = uio->uio_offset / biosize;
239 		boff = uio->uio_offset & (biosize - 1);
240 		loffset = lbn * biosize;
241 		fsize = puffs_meta_getsize(vp);
/* Reading at or past EOF: nothing more to return. */
243 		if (loffset + boff >= fsize) {
247 		bp = getblk(vp, loffset, biosize, 0, 0);
253 		 * If B_CACHE is not set, we must issue the read.  If this
254 		 * fails, we return an error.
256 		if ((bp->b_flags & B_CACHE) == 0) {
257 			bp->b_cmd = BUF_CMD_READ;
258 			bp->b_bio2.bio_done = puffs_iodone;
259 			bp->b_bio2.bio_flags |= BIO_SYNC;
260 			vfs_busy_pages(vp, bp);
261 			error = puffs_doio(vp, &bp->b_bio2, uio->uio_td);
269 		 * on is the offset into the current bp.  Figure out how many
270 		 * bytes we can copy out of the bp.  Note that bcount is
271 		 * NOT DEV_BSIZE aligned.
273 		 * Then figure out how many bytes we can copy into the uio.
/* Clamp the copy to the caller's residual and to EOF. */
276 		if (n > uio->uio_resid)
278 		if (loffset + boff + n > fsize)
279 			n = fsize - loffset - boff;
282 			error = uiomove(bp->b_data + boff, n, uio);
/* Continue while no error, data remains, and progress was made. */
285 	} while (error == 0 && uio->uio_resid > 0 && n > 0);
/*
 * puffs_biowrite: write to a regular puffs vnode through the buffer
 * cache, NFS-bio style.  Handles append mode, file extension, partial
 * block read-before-write, dirty-region (b_dirtyoff/b_dirtyend)
 * tracking, and IO_SYNC vs. delayed write-back.
 *
 * NOTE(review): lines are elided in this view (error paths, brelse
 * calls, loop head, bwrite/bdwrite dispatch); comments describe only
 * the visible code.
 */
291 puffs_biowrite(struct vnode *vp, struct uio *uio, int ioflag,
294 	int biosize = vp->v_mount->mnt_stat.f_iosize;
297 	off_t loffset, fsize;
303 	KKASSERT(uio->uio_rw == UIO_WRITE);
304 	KKASSERT(vp->v_type == VREG);
306 	if (uio->uio_offset < 0)
308 	if (uio->uio_resid == 0)
312 	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
313 	 * get the append lock.
315 	 * We need to obtain exclusize lock if we intend to modify file size
316 	 * in order to guarentee the append point with multiple contending
319 	if (ioflag & IO_APPEND) {
320 		/* XXXDF relock if necessary */
321 		KKASSERT(vn_islocked(vp) == LK_EXCLUSIVE);
/* Refresh attributes, then start the write at current EOF. */
322 		error = VOP_GETATTR(vp, &vattr);
325 		uio->uio_offset = puffs_meta_getsize(vp);
/* Offset within the block, block start, and bytes for this pass. */
329 		boff = uio->uio_offset & (biosize-1);
330 		loffset = uio->uio_offset - boff;
331 		bytes = (int)szmin((unsigned)(biosize - boff), uio->uio_resid);
334 		 * Handle direct append and file extension cases, calculate
335 		 * unaligned buffer size.  When extending B_CACHE will be
336 		 * set if possible.  See UIO_NOCOPY note below.
338 		fsize = puffs_meta_getsize(vp);
339 		if (uio->uio_offset + bytes > fsize) {
340 			trivial = (uio->uio_segflg != UIO_NOCOPY &&
341 			    uio->uio_offset <= fsize);
342 			puffs_meta_setsize(vp, uio->uio_offset + bytes,
345 		bp = getblk(vp, loffset, biosize, 0, 0);
352 		 * Actual bytes in buffer which we care about
/* bcount: valid bytes in this block, truncated at EOF. */
354 		if (loffset + biosize < fsize)
357 			bcount = (int)(fsize - loffset);
360 		 * Avoid a read by setting B_CACHE where the data we
361 		 * intend to write covers the entire buffer.  Note
362 		 * that the buffer may have been set to B_CACHE by
363 		 * puffs_meta_setsize() above or otherwise inherited the
364 		 * flag, but if B_CACHE isn't set the buffer may be
365 		 * uninitialized and must be zero'd to accomodate
366 		 * future seek+write's.
368 		 * See the comments in kern/vfs_bio.c's getblk() for
371 		 * When doing a UIO_NOCOPY write the buffer is not
372 		 * overwritten and we cannot just set B_CACHE unconditionally
373 		 * for full-block writes.
375 		if (boff == 0 && bytes == biosize &&
376 		    uio->uio_segflg != UIO_NOCOPY) {
377 			bp->b_flags |= B_CACHE;
378 			bp->b_flags &= ~(B_ERROR | B_INVAL);
382 		 * b_resid may be set due to file EOF if we extended out.
383 		 * The NFS bio code will zero the difference anyway so
384 		 * just acknowledged the fact and set b_resid to 0.
/* Partial-block write over invalid data: read the block in first. */
386 		if ((bp->b_flags & B_CACHE) == 0) {
387 			bp->b_cmd = BUF_CMD_READ;
388 			bp->b_bio2.bio_done = puffs_iodone;
389 			bp->b_bio2.bio_flags |= BIO_SYNC;
390 			vfs_busy_pages(vp, bp);
391 			error = puffs_doio(vp, &bp->b_bio2, uio->uio_td);
400 		 * If dirtyend exceeds file size, chop it down.  This should
401 		 * not normally occur but there is an append race where it
402 		 * might occur XXX, so we log it.
404 		 * If the chopping creates a reverse-indexed or degenerate
405 		 * situation with dirtyoff/end, we 0 both of them.
407 		if (bp->b_dirtyend > bcount) {
408 			kprintf("PUFFS append race @%08llx:%d\n",
409 			    (long long)bp->b_bio2.bio_offset,
410 			    bp->b_dirtyend - bcount);
411 			bp->b_dirtyend = bcount;
414 		if (bp->b_dirtyoff >= bp->b_dirtyend)
415 			bp->b_dirtyoff = bp->b_dirtyend = 0;
418 		 * If the new write will leave a contiguous dirty
419 		 * area, just update the b_dirtyoff and b_dirtyend,
420 		 * otherwise force a write rpc of the old dirty area.
422 		 * While it is possible to merge discontiguous writes due to
423 		 * our having a B_CACHE buffer ( and thus valid read data
424 		 * for the hole), we don't because it could lead to
425 		 * significant cache coherency problems with multiple clients,
426 		 * especially if locking is implemented later on.
428 		 * as an optimization we could theoretically maintain
429 		 * a linked list of discontinuous areas, but we would still
430 		 * have to commit them separately so there isn't much
431 		 * advantage to it except perhaps a bit of asynchronization.
433 		if (bp->b_dirtyend > 0 &&
434 		    (boff > bp->b_dirtyend ||
435 		     (boff + bytes) < bp->b_dirtyoff)
/* Discontiguous with existing dirty region: flush the old one now. */
437 			if (bwrite(bp) == EINTR) {
/* Copy the caller's data into the (now valid) buffer. */
444 		error = uiomove(bp->b_data + boff, bytes, uio);
447 		 * Since this block is being modified, it must be written
448 		 * again and not just committed.  Since write clustering does
449 		 * not work for the stage 1 data write, only the stage 2
450 		 * commit rpc, we have to clear B_CLUSTEROK as well.
452 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
460 		 * Only update dirtyoff/dirtyend if not a degenerate
463 		 * The underlying VM pages have been marked valid by
464 		 * virtue of acquiring the bp.  Because the entire buffer
465 		 * is marked dirty we do not have to worry about cleaning
466 		 * out the related dirty bits (and wouldn't really know
467 		 * how to deal with byte ranges anyway)
/* Extend the existing dirty region, or start a fresh one. */
470 		if (bp->b_dirtyend > 0) {
471 			bp->b_dirtyoff = imin(boff, bp->b_dirtyoff);
472 			bp->b_dirtyend = imax(boff + bytes,
475 			bp->b_dirtyoff = boff;
476 			bp->b_dirtyend = boff + bytes;
/* IO_SYNC: write through now; IO_INVAL additionally drops the cache. */
480 		if (ioflag & IO_SYNC) {
481 			if (ioflag & IO_INVAL)
482 				bp->b_flags |= B_NOCACHE;
489 	} while (uio->uio_resid > 0 && bytes > 0);
/*
 * puffs_doio: perform the actual I/O for a buffer-cache buf by
 * translating it into a uio and calling puffs_directread() or
 * puffs_directwrite().  Credentials come from the calling thread's
 * process, falling back to proc0's for kernel threads.
 *
 * NOTE(review): this function continues past the end of the visible
 * chunk, and several lines within are elided (uio setup, error
 * branches, write retry/EINTR handling); comments describe only what
 * is visible.
 */
495 puffs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
497 	struct buf *bp = bio->bio_buf;
/* Pick credentials: the thread's process if present, else proc0. */
505 	if (td != NULL && td->td_proc != NULL)
506 		cred = td->td_proc->p_ucred;
508 		cred = proc0.p_ucred;
/* Single-iovec kernel-space uio describing the buffer's data. */
512 	uiop->uio_iovcnt = 1;
513 	uiop->uio_segflg = UIO_SYSSPACE;
517 	 * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
518 	 * do this here so we do not have to do it in all the code that
521 	bp->b_flags &= ~(B_ERROR | B_INVAL);
523 	KASSERT(bp->b_cmd != BUF_CMD_DONE,
524 	    ("puffs_doio: bp %p already marked done!", bp));
526 	if (bp->b_cmd == BUF_CMD_READ) {
527 		io.iov_len = uiop->uio_resid = (size_t)bp->b_bcount;
528 		io.iov_base = bp->b_data;
529 		uiop->uio_rw = UIO_READ;
531 		uiop->uio_offset = bio->bio_offset;
532 		error = puffs_directread(vp, uiop, 0, cred);
/* Short read (EOF): zero the remainder of the buffer. */
533 		if (error == 0 && uiop->uio_resid) {
534 			n = (size_t)bp->b_bcount - uiop->uio_resid;
535 			bzero(bp->b_data + n, bp->b_bcount - n);
539 			bp->b_flags |= B_ERROR;
542 		bp->b_resid = uiop->uio_resid;
/* Write path: only the dirty sub-range of the buffer is sent. */
544 		KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
/* Clamp the dirty region so we never write past the file's size. */
545 		if (bio->bio_offset + bp->b_dirtyend > puffs_meta_getsize(vp))
546 			bp->b_dirtyend = puffs_meta_getsize(vp) -
549 		if (bp->b_dirtyend > bp->b_dirtyoff) {
550 			io.iov_len = uiop->uio_resid = bp->b_dirtyend
552 			uiop->uio_offset = bio->bio_offset + bp->b_dirtyoff;
553 			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
554 			uiop->uio_rw = UIO_WRITE;
556 			error = puffs_directwrite(vp, uiop, 0, cred);
559 			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
561 				bp->b_flags &= ~(B_INVAL|B_NOCACHE);
562 				if ((bp->b_flags & B_PAGING) == 0)
/* Interrupted write: mark for retry rather than discarding. */
565 				bp->b_flags |= B_EINTR;
569 					bp->b_flags |= B_ERROR;
/* Nothing dirty: nothing to transfer. */
572 			bp->b_dirtyoff = bp->b_dirtyend = 0;
574 		bp->b_resid = uiop->uio_resid;
581 	KKASSERT(bp->b_cmd == BUF_CMD_DONE);
582 	if (bp->b_flags & B_EINTR)
584 	if (bp->b_flags & B_ERROR)
585 		return (bp->b_error ? bp->b_error : EIO);