1 /* $NetBSD: puffs_vnops.c,v 1.154 2011/07/04 08:07:30 manu Exp $ */
4 * Copyright (c) 2005, 2006, 2007 Antti Kantee. All Rights Reserved.
6 * Development of this software was supported by the
7 * Google Summer of Code program and the Ulla Tuominen Foundation.
8 * The Google SoC project was mentored by Bill Studenmund.
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
20 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 #include <sys/param.h>
34 #include <sys/lockf.h>
35 #include <sys/malloc.h>
36 #include <sys/mount.h>
37 #include <sys/namei.h>
38 #include <sys/vnode.h>
40 #include <sys/thread2.h>
42 #include <vfs/puffs/puffs_msgif.h>
43 #include <vfs/puffs/puffs_sys.h>
/*
 * RWARGS(): initialize the fields common to read and write vnode
 * messages (ioflag, resid, offset) and convert the in-kernel
 * credentials into their puffs wire representation.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one
 * statement; the original expanded to four separate statements, which
 * misbehaves inside an unbraced if/else (CERT PRE10-C).  All visible
 * call sites invoke it as `RWARGS(...);`, so this is call-compatible.
 */
#define RWARGS(cont, iofl, move, offset, creds)				\
do {									\
	(cont)->pvnr_ioflag = (iofl);					\
	(cont)->pvnr_resid = (move);					\
	(cont)->pvnr_offset = (offset);					\
	puffs_credcvt(&(cont)->pvnr_cred, (creds));			\
} while (0)
/*
 * puffs_directread: uncached read from a regular puffs vnode.
 * Sends explicit PUFFS_VN_READ messages to the userspace file server
 * and copies the returned data into the caller's uio, looping until
 * the uio is exhausted, the server reports a short read, or an error
 * occurs.
 *
 * NOTE(review): several lines of this function are elided in this view
 * (early-return bodies, enqueue arguments, loop-exit paths); comments
 * below describe only the visible code.
 */
52 puffs_directread(struct vnode *vp, struct uio *uio, int ioflag,
55 	PUFFS_MSG_VARS(vn, read);
56 	struct puffs_mount *pmp = MPTOPUFFSMP(vp->v_mount);
57 	size_t tomove, argsize;
60 	KKASSERT(vp->v_type == VREG);
/* Reject negative offsets and zero-length transfers up front. */
62 	if (uio->uio_offset < 0)
64 	if (uio->uio_resid == 0)
71 	if (uio->uio_resid == 0)
73 	if (uio->uio_offset < 0)
77 	 * in case it's not a regular file or we're operating
78 	 * uncached, do read in the old-fashioned style,
79 	 * i.e. explicit read operations
/*
 * Allocate a single message buffer sized for the largest chunk the
 * mount allows (PUFFS_TOMOVE) and reuse it for every iteration.
 */
82 	tomove = PUFFS_TOMOVE(uio->uio_resid, pmp);
83 	argsize = sizeof(struct puffs_vnmsg_read);
84 	puffs_msgmem_alloc(argsize + tomove, &park_read,
85 	    (void *)&read_msg, 1);
88 	while (uio->uio_resid > 0) {
89 		tomove = PUFFS_TOMOVE(uio->uio_resid, pmp);
90 		memset(read_msg, 0, argsize); /* XXX: touser KASSERT */
91 		RWARGS(read_msg, ioflag, tomove,
92 		    uio->uio_offset, cred);
93 		puffs_msg_setinfo(park_read, PUFFSOP_VN,
94 		    PUFFS_VN_READ, VPTOPNC(vp));
/* Tell transport how much payload may come back from userspace. */
95 		puffs_msg_setdelta(park_read, tomove);
97 		PUFFS_MSG_ENQUEUEWAIT2(pmp, park_read, vp->v_data,
99 		error = checkerr(pmp, error, __func__);
/*
 * A server must never claim more residual than was requested; treat
 * that as a protocol error and notify the file server.
 */
103 		if (read_msg->pvnr_resid > tomove) {
104 			puffs_senderr(pmp, PUFFS_ERR_READ,
105 			    E2BIG, "resid grew", VPTOPNC(vp));
/* Copy only the bytes the server actually produced. */
110 		error = uiomove(read_msg->pvnr_data,
111 		    tomove - read_msg->pvnr_resid, uio);
114 		 * in case the file is out of juice, resid from
115 		 * userspace is != 0.  and the error-case is
/* Non-zero server resid means EOF (or error): stop looping. */
118 		if (error || read_msg->pvnr_resid)
122 	puffs_msgmem_release(park_read);
/*
 * puffs_directwrite: uncached write to a regular puffs vnode.
 * Copies data from the caller's uio into a PUFFS_VN_WRITE message and
 * ships it to the userspace file server, one chunk per iteration.
 *
 * NOTE(review): lines are elided in this view (early-return bodies,
 * enqueue arguments, loop exits); comments describe visible code only.
 */
128 puffs_directwrite(struct vnode *vp, struct uio *uio, int ioflag,
131 	PUFFS_MSG_VARS(vn, write);
132 	struct puffs_mount *pmp = MPTOPUFFSMP(vp->v_mount);
133 	size_t tomove, argsize;
136 	KKASSERT(vp->v_type == VREG);
/* Reject negative offsets and zero-length transfers up front. */
138 	if (uio->uio_offset < 0)
140 	if (uio->uio_resid == 0)
/*
 * uio_resid only shrinks, so sizing the message buffer once for the
 * first (largest) chunk is sufficient for all iterations.
 */
146 	/* tomove is non-increasing */
147 	tomove = PUFFS_TOMOVE(uio->uio_resid, pmp);
148 	argsize = sizeof(struct puffs_vnmsg_write) + tomove;
149 	puffs_msgmem_alloc(argsize, &park_write, (void *)&write_msg,1);
151 	while (uio->uio_resid > 0) {
152 		/* move data to buffer */
153 		tomove = PUFFS_TOMOVE(uio->uio_resid, pmp);
154 		memset(write_msg, 0, argsize); /* XXX: touser KASSERT */
155 		RWARGS(write_msg, ioflag, tomove,
156 		    uio->uio_offset, cred);
157 		error = uiomove(write_msg->pvnr_data, tomove, uio);
161 		/* move buffer to userspace */
162 		puffs_msg_setinfo(park_write, PUFFSOP_VN,
163 		    PUFFS_VN_WRITE, VPTOPNC(vp));
164 		PUFFS_MSG_ENQUEUEWAIT2(pmp, park_write, vp->v_data,
166 		error = checkerr(pmp, error, __func__);
/* Server may not report more residual than was sent: protocol error. */
170 		if (write_msg->pvnr_resid > tomove) {
171 			puffs_senderr(pmp, PUFFS_ERR_WRITE,
172 			    E2BIG, "resid grew", VPTOPNC(vp));
/*
 * With the page cache in use the cached file size must already cover
 * everything written so far.
 */
177 		if (PUFFS_USE_PAGECACHE(pmp))
178 			KKASSERT(vp->v_filesize >= uio->uio_offset);
/* A partial write from the file server is treated as fatal. */
180 		/* didn't move everything?  bad userspace.  bail */
181 		if (write_msg->pvnr_resid != 0) {
186 	puffs_msgmem_release(park_write);
/*
 * puffs_iodone: BIO completion callback used by puffs_bioread/biowrite;
 * marks the underlying buffer done with no error.
 * NOTE(review): surrounding lines (signature remainder, braces) are
 * elided in this view.
 */
192 puffs_iodone(struct bio *bio)
195 	bpdone(bio->bio_buf, 0);
/*
 * puffs_bioread: read from a regular puffs vnode through the buffer
 * cache.  Maps the uio offset to logical buffer-cache blocks, issues a
 * synchronous read via puffs_doio() for blocks not already valid
 * (B_CACHE clear), and copies data out with uiomove().
 *
 * NOTE(review): lines are elided in this view (error-return bodies,
 * brelse paths, loop head); comments describe visible code only.
 */
199 puffs_bioread(struct vnode *vp, struct uio *uio, int ioflag,
202 	int biosize = vp->v_mount->mnt_stat.f_iosize;
205 	off_t lbn, loffset, fsize;
210 	KKASSERT(uio->uio_rw == UIO_READ);
211 	KKASSERT(vp->v_type == VREG);
213 	if (uio->uio_offset < 0)
215 	if (uio->uio_resid == 0)
219 	 * Cache consistency can only be maintained approximately.
221 	 * GETATTR is called to synchronize the file size.
223 	 * NOTE: In the normal case the attribute cache is not
224 	 * cleared which means GETATTR may use cached data and
225 	 * not immediately detect changes made on the server.
228 	error = VOP_GETATTR(vp, &vattr);
233 	 * Loop until uio exhausted or we hit EOF
/*
 * Per-iteration geometry: logical block number, offset within the
 * block, block start offset, and the current cached file size.
 */
238 		lbn = uio->uio_offset / biosize;
239 		boff = uio->uio_offset & (biosize - 1);
240 		loffset = lbn * biosize;
241 		fsize = puffs_meta_getsize(vp);
/* Reading at or past EOF: nothing more to return. */
243 		if (loffset + boff >= fsize) {
247 		bp = getblk(vp, loffset, biosize, 0, 0);
253 		 * If B_CACHE is not set, we must issue the read.  If this
254 		 * fails, we return an error.
256 		if ((bp->b_flags & B_CACHE) == 0) {
257 			bp->b_cmd = BUF_CMD_READ;
258 			bp->b_bio2.bio_done = puffs_iodone;
259 			bp->b_bio2.bio_flags |= BIO_SYNC;
260 			vfs_busy_pages(vp, bp);
261 			error = puffs_doio(vp, &bp->b_bio2, uio->uio_td);
269 		 * on is the offset into the current bp.  Figure out how many
270 		 * bytes we can copy out of the bp.  Note that bcount is
271 		 * NOT DEV_BSIZE aligned.
273 		 * Then figure out how many bytes we can copy into the uio.
/* Clamp the copy to the caller's residual and to EOF. */
276 		if (n > uio->uio_resid)
278 		if (loffset + boff + n > fsize)
279 			n = fsize - loffset - boff;
282 			error = uiomove(bp->b_data + boff, n, uio);
/* Continue while no error, data remains, and progress was made. */
285 	} while (error == 0 && uio->uio_resid > 0 && n > 0);
/*
 * puffs_biowrite: write to a regular puffs vnode through the buffer
 * cache, NFS-bio style.  Handles append mode, file extension, partial
 * block read-before-write, dirty-region (b_dirtyoff/b_dirtyend)
 * tracking, and IO_SYNC vs. delayed write-back.
 *
 * NOTE(review): lines are elided in this view (error paths, brelse
 * calls, loop head, bwrite/bdwrite dispatch); comments describe only
 * the visible code.
 */
291 puffs_biowrite(struct vnode *vp, struct uio *uio, int ioflag,
294 	int biosize = vp->v_mount->mnt_stat.f_iosize;
297 	off_t loffset, fsize;
303 	KKASSERT(uio->uio_rw == UIO_WRITE);
304 	KKASSERT(vp->v_type == VREG);
306 	if (uio->uio_offset < 0)
308 	if (uio->uio_resid == 0)
312 	 * If IO_APPEND then load uio_offset.  We restart here if we cannot
313 	 * get the append lock.
315 	 * We need to obtain exclusize lock if we intend to modify file size
316 	 * in order to guarentee the append point with multiple contending
319 	if (ioflag & IO_APPEND) {
320 		/* XXXDF relock if necessary */
321 		KKASSERT(vn_islocked(vp) == LK_EXCLUSIVE);
/* Refresh attributes, then start the write at current EOF. */
322 		error = VOP_GETATTR(vp, &vattr);
325 		uio->uio_offset = puffs_meta_getsize(vp);
/* Offset within the block, block start, and bytes for this pass. */
329 		boff = uio->uio_offset & (biosize-1);
330 		loffset = uio->uio_offset - boff;
331 		bytes = (int)szmin((unsigned)(biosize - boff), uio->uio_resid);
334 		 * Handle direct append and file extension cases, calculate
335 		 * unaligned buffer size.  When extending B_CACHE will be
336 		 * set if possible.  See UIO_NOCOPY note below.
338 		fsize = puffs_meta_getsize(vp);
339 		if (uio->uio_offset + bytes > fsize) {
340 			trivial = (uio->uio_segflg != UIO_NOCOPY &&
341 			    uio->uio_offset <= fsize);
342 			puffs_meta_setsize(vp, uio->uio_offset + bytes,
345 		bp = getblk(vp, loffset, biosize, 0, 0);
352 		 * Actual bytes in buffer which we care about
/* bcount: valid bytes in this block, truncated at EOF. */
354 		if (loffset + biosize < fsize)
357 			bcount = (int)(fsize - loffset);
360 		 * Avoid a read by setting B_CACHE where the data we
361 		 * intend to write covers the entire buffer.  Note
362 		 * that the buffer may have been set to B_CACHE by
363 		 * puffs_meta_setsize() above or otherwise inherited the
364 		 * flag, but if B_CACHE isn't set the buffer may be
365 		 * uninitialized and must be zero'd to accomodate
366 		 * future seek+write's.
368 		 * See the comments in kern/vfs_bio.c's getblk() for
371 		 * When doing a UIO_NOCOPY write the buffer is not
372 		 * overwritten and we cannot just set B_CACHE unconditionally
373 		 * for full-block writes.
375 		if (boff == 0 && bytes == biosize &&
376 		    uio->uio_segflg != UIO_NOCOPY) {
377 			bp->b_flags |= B_CACHE;
378 			bp->b_flags &= ~(B_ERROR | B_INVAL);
382 		 * b_resid may be set due to file EOF if we extended out.
383 		 * The NFS bio code will zero the difference anyway so
384 		 * just acknowledged the fact and set b_resid to 0.
/* Partial-block write over invalid data: read the block in first. */
386 		if ((bp->b_flags & B_CACHE) == 0) {
387 			bp->b_cmd = BUF_CMD_READ;
388 			bp->b_bio2.bio_done = puffs_iodone;
389 			bp->b_bio2.bio_flags |= BIO_SYNC;
390 			vfs_busy_pages(vp, bp);
391 			error = puffs_doio(vp, &bp->b_bio2, uio->uio_td);
400 		 * If dirtyend exceeds file size, chop it down.  This should
401 		 * not normally occur but there is an append race where it
402 		 * might occur XXX, so we log it.
404 		 * If the chopping creates a reverse-indexed or degenerate
405 		 * situation with dirtyoff/end, we 0 both of them.
407 		if (bp->b_dirtyend > bcount) {
408 			kprintf("PUFFS append race @%08llx:%d\n",
409 			    (long long)bp->b_bio2.bio_offset,
410 			    bp->b_dirtyend - bcount);
411 			bp->b_dirtyend = bcount;
414 		if (bp->b_dirtyoff >= bp->b_dirtyend)
415 			bp->b_dirtyoff = bp->b_dirtyend = 0;
418 		 * If the new write will leave a contiguous dirty
419 		 * area, just update the b_dirtyoff and b_dirtyend,
420 		 * otherwise force a write rpc of the old dirty area.
422 		 * While it is possible to merge discontiguous writes due to
423 		 * our having a B_CACHE buffer ( and thus valid read data
424 		 * for the hole), we don't because it could lead to
425 		 * significant cache coherency problems with multiple clients,
426 		 * especially if locking is implemented later on.
428 		 * as an optimization we could theoretically maintain
429 		 * a linked list of discontinuous areas, but we would still
430 		 * have to commit them separately so there isn't much
431 		 * advantage to it except perhaps a bit of asynchronization.
433 		if (bp->b_dirtyend > 0 &&
434 		    (boff > bp->b_dirtyend ||
435 		     (boff + bytes) < bp->b_dirtyoff)
/* Discontiguous with existing dirty region: flush the old one now. */
437 			if (bwrite(bp) == EINTR) {
/* Copy the caller's data into the (now valid) buffer. */
444 		error = uiomove(bp->b_data + boff, bytes, uio);
447 		 * Since this block is being modified, it must be written
448 		 * again and not just committed.  Since write clustering does
449 		 * not work for the stage 1 data write, only the stage 2
450 		 * commit rpc, we have to clear B_CLUSTEROK as well.
452 		bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK);
460 		 * Only update dirtyoff/dirtyend if not a degenerate
463 		 * The underlying VM pages have been marked valid by
464 		 * virtue of acquiring the bp.  Because the entire buffer
465 		 * is marked dirty we do not have to worry about cleaning
466 		 * out the related dirty bits (and wouldn't really know
467 		 * how to deal with byte ranges anyway)
/* Extend the existing dirty region, or start a fresh one. */
470 		if (bp->b_dirtyend > 0) {
471 			bp->b_dirtyoff = imin(boff, bp->b_dirtyoff);
472 			bp->b_dirtyend = imax(boff + bytes,
475 			bp->b_dirtyoff = boff;
476 			bp->b_dirtyend = boff + bytes;
/* IO_SYNC: write through now; IO_INVAL additionally drops the cache. */
480 		if (ioflag & IO_SYNC) {
481 			if (ioflag & IO_INVAL)
482 				bp->b_flags |= B_NOCACHE;
489 	} while (uio->uio_resid > 0 && bytes > 0);
/*
 * puffs_doio: perform the actual I/O for a buffer-cache buf by
 * translating it into a uio and calling puffs_directread() or
 * puffs_directwrite().  Credentials come from the calling thread's
 * process, falling back to proc0's for kernel threads.
 *
 * NOTE(review): this function continues past the end of the visible
 * chunk, and several lines within are elided (uio setup, error
 * branches, write retry/EINTR handling); comments describe only what
 * is visible.
 */
495 puffs_doio(struct vnode *vp, struct bio *bio, struct thread *td)
497 	struct buf *bp = bio->bio_buf;
/* Pick credentials: the thread's process if present, else proc0. */
505 	if (td != NULL && td->td_proc != NULL)
506 		cred = td->td_proc->p_ucred;
508 		cred = proc0.p_ucred;
/* Single-iovec kernel-space uio describing the buffer's data. */
512 	uiop->uio_iovcnt = 1;
513 	uiop->uio_segflg = UIO_SYSSPACE;
517 	 * clear B_ERROR and B_INVAL state prior to initiating the I/O.  We
518 	 * do this here so we do not have to do it in all the code that
521 	bp->b_flags &= ~(B_ERROR | B_INVAL);
523 	KASSERT(bp->b_cmd != BUF_CMD_DONE,
524 	    ("puffs_doio: bp %p already marked done!", bp));
526 	if (bp->b_cmd == BUF_CMD_READ) {
527 		io.iov_len = uiop->uio_resid = (size_t)bp->b_bcount;
528 		io.iov_base = bp->b_data;
529 		uiop->uio_rw = UIO_READ;
531 		uiop->uio_offset = bio->bio_offset;
532 		error = puffs_directread(vp, uiop, 0, cred);
/* Short read (EOF): zero the remainder of the buffer. */
533 		if (error == 0 && uiop->uio_resid) {
534 			n = (size_t)bp->b_bcount - uiop->uio_resid;
535 			bzero(bp->b_data + n, bp->b_bcount - n);
539 			bp->b_flags |= B_ERROR;
542 		bp->b_resid = uiop->uio_resid;
/* Write path: only the dirty sub-range of the buffer is sent. */
544 		KKASSERT(bp->b_cmd == BUF_CMD_WRITE);
/* Clamp the dirty region so we never write past the file's size. */
545 		if (bio->bio_offset + bp->b_dirtyend > puffs_meta_getsize(vp))
546 			bp->b_dirtyend = puffs_meta_getsize(vp) -
549 		if (bp->b_dirtyend > bp->b_dirtyoff) {
550 			io.iov_len = uiop->uio_resid = bp->b_dirtyend
552 			uiop->uio_offset = bio->bio_offset + bp->b_dirtyoff;
553 			io.iov_base = (char *)bp->b_data + bp->b_dirtyoff;
554 			uiop->uio_rw = UIO_WRITE;
556 			error = puffs_directwrite(vp, uiop, 0, cred);
559 			    || (!error && (bp->b_flags & B_NEEDCOMMIT))) {
561 				bp->b_flags &= ~(B_INVAL|B_NOCACHE);
562 				if ((bp->b_flags & B_PAGING) == 0)
/* Interrupted write: mark for retry rather than discarding. */
565 				bp->b_flags |= B_EINTR;
569 					bp->b_flags |= B_ERROR;
/* Nothing dirty: nothing to transfer. */
572 			bp->b_dirtyoff = bp->b_dirtyend = 0;
574 		bp->b_resid = uiop->uio_resid;
581 	KKASSERT(bp->b_cmd == BUF_CMD_DONE);
582 	if (bp->b_flags & B_EINTR)
584 	if (bp->b_flags & B_ERROR)
585 		return (bp->b_error ? bp->b_error : EIO);