sys/vfs/ufs/ufs_readwrite.c

   1 /*-
   2  * Copyright (c) 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. All advertising materials mentioning features or use of this software
  14  *    must display the following acknowledgement:
  15  *      This product includes software developed by the University of
  16  *      California, Berkeley and its contributors.
  17  * 4. Neither the name of the University nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  *
  33  *      @(#)ufs_readwrite.c     8.11 (Berkeley) 5/8/95
  34  * $FreeBSD: src/sys/ufs/ufs/ufs_readwrite.c,v 1.65.2.14 2003/04/04 22:21:29 tegge Exp $
  35  * $DragonFly: src/sys/vfs/ufs/ufs_readwrite.c,v 1.8 2003/07/06 21:23:55 dillon Exp $
  36  */
  37
  /*
   * Local aliases used by the code in this file:
   *   BLKSIZE(fs, ip, lbn) expands to blksize(fs, ip, lbn),
   *   FS                   names the filesystem structure type (struct fs),
   *   I_FS                 names the inode member that points at the FS.
   */
#define BLKSIZE(fs, ip, lbn)	blksize(fs, ip, lbn)
#define FS			struct fs
#define I_FS			i_fs
  41
  42 #include <vm/vm.h>
  43 #include <vm/vm_object.h>
  44 #include <vm/vm_pager.h>
  45 #include <vm/vm_map.h>
  46 #include <vm/vnode_pager.h>
  47 #include <sys/event.h>
  48 #include <sys/vmmeter.h>
  49 #include <vm/vm_page2.h>
  50
  51 #include "opt_directio.h"
  52
  /*
   * Post the kevent hint `b' on the knote list embedded in the poll info
   * of vnode `vp' (vp->v_pollinfo.vpi_selinfo.si_note).
   *
   * Both arguments are parenthesized in the expansion so that `vp' may be
   * any pointer-valued expression, not just a plain identifier.
   */
#define VN_KNOTE(vp, b) \
	KNOTE((struct klist *)&(vp)->v_pollinfo.vpi_selinfo.si_note, (b))
  55
  56 #ifdef DIRECTIO
  57 extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
  58 #endif
  59
  60 /*
  61  * Vnode op for reading.
  62  */
  63 /* ARGSUSED */
  64 int
  65 ffs_read(ap)
  66         struct vop_read_args /* {
  67                 struct vnode *a_vp;
  68                 struct uio *a_uio;
  69                 int a_ioflag;
  70                 struct ucred *a_cred;
  71         } */ *ap;
  72 {
  73         register struct vnode *vp;
  74         register struct inode *ip;
  75         register struct uio *uio;
  76         register FS *fs;
  77         struct buf *bp;
  78         ufs_daddr_t lbn, nextlbn;
  79         off_t bytesinfile;
  80         long size, xfersize, blkoffset;
  81         int error, orig_resid;
  82         u_short mode;
  83         int seqcount;
  84         int ioflag;
  85         vm_object_t object;
  86
  87         vp = ap->a_vp;
  88         seqcount = ap->a_ioflag >> 16;
  89         ip = VTOI(vp);
  90         mode = ip->i_mode;
  91         uio = ap->a_uio;
  92         ioflag = ap->a_ioflag;
  93 #ifdef DIRECTIO
  94         if ((ioflag & IO_DIRECT) != 0) {
  95                 int workdone;
  96
  97                 error = ffs_rawread(vp, uio, &workdone);
  98                 if (error || workdone)
  99                         return error;
 100         }
 101 #endif
 102
 103 #ifdef DIAGNOSTIC
 104         if (uio->uio_rw != UIO_READ)
 105                 panic("ffs_read: mode");
 106
 107         if (vp->v_type == VLNK) {
 108                 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
 109                         panic("ffs_read: short symlink");
 110         } else if (vp->v_type != VREG && vp->v_type != VDIR)
 111                 panic("ffs_read: type %d", vp->v_type);
 112 #endif
 113         fs = ip->I_FS;
 114         if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
 115                 return (EFBIG);
 116
 117         orig_resid = uio->uio_resid;
 118         if (orig_resid <= 0)
 119                 return (0);
 120
 121         object = vp->v_object;
 122
 123         bytesinfile = ip->i_size - uio->uio_offset;
 124         if (bytesinfile <= 0) {
 125                 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
 126                         ip->i_flag |= IN_ACCESS;
 127                 return 0;
 128         }
 129
 130         if (object)
 131                 vm_object_reference(object);
 132
 133 #ifdef ENABLE_VFS_IOOPT
 134         /*
 135          * If IO optimisation is turned on,
 136          * and we are NOT a VM based IO request,
 137          * (i.e. not headed for the buffer cache)
 138          * but there IS a vm object associated with it.
 139          */
 140         if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
 141                 int nread, toread;
 142
 143                 toread = uio->uio_resid;
 144                 if (toread > bytesinfile)
 145                         toread = bytesinfile;
 146                 if (toread >= PAGE_SIZE) {
 147                         /*
 148                          * Then if it's at least a page in size, try
 149                          * get the data from the object using vm tricks
 150                          */
 151                         error = uioread(toread, uio, object, &nread);
 152                         if ((uio->uio_resid == 0) || (error != 0)) {
 153                                 /*
 154                                  * If we finished or there was an error
 155                                  * then finish up (the reference previously
 156                                  * obtained on object must be released).
 157                                  */
 158                                 if ((error == 0 ||
 159                                     uio->uio_resid != orig_resid) &&
 160                                     (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
 161                                         ip->i_flag |= IN_ACCESS;
 162
 163                                 if (object)
 164                                         vm_object_vndeallocate(object);
 165                                 return error;
 166                         }
 167                 }
 168         }
 169 #endif
 170
 171         /*
 172          * Ok so we couldn't do it all in one vm trick...
 173          * so cycle around trying smaller bites..
 174          */
 175         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 176                 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
 177                         break;
 178 #ifdef ENABLE_VFS_IOOPT
 179                 if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
 180                         /*
 181                          * Obviously we didn't finish above, but we
 182                          * didn't get an error either. Try the same trick again.
 183                          * but this time we are looping.
 184                          */
 185                         int nread, toread;
 186                         toread = uio->uio_resid;
 187                         if (toread > bytesinfile)
 188                                 toread = bytesinfile;
 189
 190                         /*
 191                          * Once again, if there isn't enough for a
 192                          * whole page, don't try optimising.
 193                          */
 194                         if (toread >= PAGE_SIZE) {
 195                                 error = uioread(toread, uio, object, &nread);
 196                                 if ((uio->uio_resid == 0) || (error != 0)) {
 197                                         /*
 198                                          * If we finished or there was an
 199                                          * error then finish up (the reference
 200                                          * previously obtained on object must
 201                                          * be released).
 202                                          */
 203                                         if ((error == 0 ||
 204                                             uio->uio_resid != orig_resid) &&
 205                                             (vp->v_mount->mnt_flag &
 206                                             MNT_NOATIME) == 0)
 207                                                 ip->i_flag |= IN_ACCESS;
 208                                         if (object)
 209                                                 vm_object_vndeallocate(object);
 210                                         return error;
 211                                 }
 212                                 /*
 213                                  * To get here we didnt't finish or err.
 214                                  * If we did get some data,
 215                                  * loop to try another bite.
 216                                  */
 217                                 if (nread > 0) {
 218                                         continue;
 219                                 }
 220                         }
 221                 }
 222 #endif
 223
 224                 lbn = lblkno(fs, uio->uio_offset);
 225                 nextlbn = lbn + 1;
 226
 227                 /*
 228                  * size of buffer.  The buffer representing the
 229                  * end of the file is rounded up to the size of
 230                  * the block type ( fragment or full block,
 231                  * depending ).
 232                  */
 233                 size = BLKSIZE(fs, ip, lbn);
 234                 blkoffset = blkoff(fs, uio->uio_offset);
 235
 236                 /*
 237                  * The amount we want to transfer in this iteration is
 238                  * one FS block less the amount of the data before
 239                  * our startpoint (duh!)
 240                  */
 241                 xfersize = fs->fs_bsize - blkoffset;
 242
 243                 /*
 244                  * But if we actually want less than the block,
 245                  * or the file doesn't have a whole block more of data,
 246                  * then use the lesser number.
 247                  */
 248                 if (uio->uio_resid < xfersize)
 249                         xfersize = uio->uio_resid;
 250                 if (bytesinfile < xfersize)
 251                         xfersize = bytesinfile;
 252
 253                 if (lblktosize(fs, nextlbn) >= ip->i_size) {
 254                         /*
 255                          * Don't do readahead if this is the end of the file.
 256                          */
 257                         error = bread(vp, lbn, size, &bp);
 258                 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 259                         /*
 260                          * Otherwise if we are allowed to cluster,
 261                          * grab as much as we can.
 262                          *
 263                          * XXX  This may not be a win if we are not
 264                          * doing sequential access.
 265                          */
 266                         error = cluster_read(vp, ip->i_size, lbn,
 267                                 size, uio->uio_resid, seqcount, &bp);
 268                 } else if (seqcount > 1) {
 269                         /*
 270                          * If we are NOT allowed to cluster, then
 271                          * if we appear to be acting sequentially,
 272                          * fire off a request for a readahead
 273                          * as well as a read. Note that the 4th and 5th
 274                          * arguments point to arrays of the size specified in
 275                          * the 6th argument.
 276                          */
 277                         int nextsize = BLKSIZE(fs, ip, nextlbn);
 278                         error = breadn(vp, lbn,
 279                             size, &nextlbn, &nextsize, 1, &bp);
 280                 } else {
 281                         /*
 282                          * Failing all of the above, just read what the
 283                          * user asked for. Interestingly, the same as
 284                          * the first option above.
 285                          */
 286                         error = bread(vp, lbn, size, &bp);
 287                 }
 288                 if (error) {
 289                         brelse(bp);
 290                         bp = NULL;
 291                         break;
 292                 }
 293
 294                 /*
 295                  * If IO_DIRECT then set B_DIRECT for the buffer.  This
 296                  * will cause us to attempt to release the buffer later on
 297                  * and will cause the buffer cache to attempt to free the
 298                  * underlying pages.
 299                  */
 300                 if (ioflag & IO_DIRECT)
 301                         bp->b_flags |= B_DIRECT;
 302
 303                 /*
 304                  * We should only get non-zero b_resid when an I/O error
 305                  * has occurred, which should cause us to break above.
 306                  * However, if the short read did not cause an error,
 307                  * then we want to ensure that we do not uiomove bad
 308                  * or uninitialized data.
 309                  *
 310                  * XXX b_resid is only valid when an actual I/O has occured
 311                  * and may be incorrect if the buffer is B_CACHE or if the
 312                  * last op on the buffer was a failed write.  This KASSERT
 313                  * is a precursor to removing it from the UFS code.
 314                  */
 315                 KASSERT(bp->b_resid == 0, ("bp->b_resid != 0"));
 316                 size -= bp->b_resid;
 317                 if (size < xfersize) {
 318                         if (size == 0)
 319                                 break;
 320                         xfersize = size;
 321                 }
 322
 323 #ifdef ENABLE_VFS_IOOPT
 324                 if (vfs_ioopt && object &&
 325                     (bp->b_flags & B_VMIO) &&
 326                     ((blkoffset & PAGE_MASK) == 0) &&
 327                     ((xfersize & PAGE_MASK) == 0)) {
 328                         /*
 329                          * If VFS IO  optimisation is turned on,
 330                          * and it's an exact page multiple
 331                          * And a normal VM based op,
 332                          * then use uiomiveco()
 333                          */
 334                         error =
 335                                 uiomoveco((char *)bp->b_data + blkoffset,
 336                                         (int)xfersize, uio, object);
 337                 } else
 338 #endif
 339                 {
 340                         /*
 341                          * otherwise use the general form
 342                          */
 343                         error =
 344                                 uiomove((char *)bp->b_data + blkoffset,
 345                                         (int)xfersize, uio);
 346                 }
 347
 348                 if (error)
 349                         break;
 350
 351                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 352                     (LIST_FIRST(&bp->b_dep) == NULL)) {
 353                         /*
 354                          * If there are no dependencies, and it's VMIO,
 355                          * then we don't need the buf, mark it available
 356                          * for freeing. The VM has the data.
 357                          */
 358                         bp->b_flags |= B_RELBUF;
 359                         brelse(bp);
 360                 } else {
 361                         /*
 362                          * Otherwise let whoever
 363                          * made the request take care of
 364                          * freeing it. We just queue
 365                          * it onto another list.
 366                          */
 367                         bqrelse(bp);
 368                 }
 369         }
 370
 371         /*
 372          * This can only happen in the case of an error
 373          * because the loop above resets bp to NULL on each iteration
 374          * and on normal completion has not set a new value into it.
 375          * so it must have come from a 'break' statement
 376          */
 377         if (bp != NULL) {
 378                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 379                     (LIST_FIRST(&bp->b_dep) == NULL)) {
 380                         bp->b_flags |= B_RELBUF;
 381                         brelse(bp);
 382                 } else {
 383                         bqrelse(bp);
 384                 }
 385         }
 386
 387         if (object)
 388                 vm_object_vndeallocate(object);
 389         if ((error == 0 || uio->uio_resid != orig_resid) &&
 390             (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
 391                 ip->i_flag |= IN_ACCESS;
 392         return (error);
 393 }
 394
 395 /*
 396  * Vnode op for writing.
 397  */
 398 int
 399 ffs_write(ap)
 400         struct vop_write_args /* {
 401                 struct vnode *a_vp;
 402                 struct uio *a_uio;
 403                 int a_ioflag;
 404                 struct ucred *a_cred;
 405         } */ *ap;
 406 {
 407         register struct vnode *vp;
 408         register struct uio *uio;
 409         register struct inode *ip;
 410         register FS *fs;
 411         struct buf *bp;
 412         ufs_daddr_t lbn;
 413         off_t osize;
 414         int seqcount;
 415         int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
 416         vm_object_t object;
 417         struct thread *td;
 418
 419         extended = 0;
 420         seqcount = ap->a_ioflag >> 16;
 421         ioflag = ap->a_ioflag;
 422         uio = ap->a_uio;
 423         vp = ap->a_vp;
 424         ip = VTOI(vp);
 425
 426         object = vp->v_object;
 427         if (object)
 428                 vm_object_reference(object);
 429
 430 #ifdef DIAGNOSTIC
 431         if (uio->uio_rw != UIO_WRITE)
 432                 panic("ffs_write: mode");
 433 #endif
 434
 435         switch (vp->v_type) {
 436         case VREG:
 437                 if (ioflag & IO_APPEND)
 438                         uio->uio_offset = ip->i_size;
 439                 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
 440                         if (object)
 441                                 vm_object_vndeallocate(object);
 442                         return (EPERM);
 443                 }
 444                 /* FALLTHROUGH */
 445         case VLNK:
 446                 break;
 447         case VDIR:
 448                 panic("ffs_write: dir write");
 449                 break;
 450         default:
 451                 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
 452                         (int)uio->uio_offset,
 453                         (int)uio->uio_resid
 454                 );
 455         }
 456
 457         fs = ip->I_FS;
 458         if (uio->uio_offset < 0 ||
 459             (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
 460                 if (object)
 461                         vm_object_vndeallocate(object);
 462                 return (EFBIG);
 463         }
 464         /*
 465          * Maybe this should be above the vnode op call, but so long as
 466          * file servers have no limits, I don't think it matters.
 467          */
 468         td = uio->uio_td;
 469         if (vp->v_type == VREG && td && td->td_proc &&
 470             uio->uio_offset + uio->uio_resid >
 471             td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
 472                 psignal(td->td_proc, SIGXFSZ);
 473                 if (object)
 474                         vm_object_vndeallocate(object);
 475                 return (EFBIG);
 476         }
 477
 478         resid = uio->uio_resid;
 479         osize = ip->i_size;
 480
 481         /*
 482          * NOTE! These B_ flags are actually balloc-only flags, not buffer
 483          * flags.  They are similar to the BA_ flags in -current.
 484          */
 485         if (seqcount > B_SEQMAX)
 486                 flags = B_SEQMAX << B_SEQSHIFT;
 487         else
 488                 flags = seqcount << B_SEQSHIFT;
 489         if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
 490                 flags |= B_SYNC;
 491
 492         if (object && (object->flags & OBJ_OPT)) {
 493                 vm_freeze_copyopts(object,
 494                         OFF_TO_IDX(uio->uio_offset),
 495                         OFF_TO_IDX(uio->uio_offset + uio->uio_resid + PAGE_MASK));
 496         }
 497
 498         for (error = 0; uio->uio_resid > 0;) {
 499                 lbn = lblkno(fs, uio->uio_offset);
 500                 blkoffset = blkoff(fs, uio->uio_offset);
 501                 xfersize = fs->fs_bsize - blkoffset;
 502                 if (uio->uio_resid < xfersize)
 503                         xfersize = uio->uio_resid;
 504
 505                 if (uio->uio_offset + xfersize > ip->i_size)
 506                         vnode_pager_setsize(vp, uio->uio_offset + xfersize);
 507
 508                 /*
 509                  * We must perform a read-before-write if the transfer
 510                  * size does not cover the entire buffer.
 511                  */
 512                 if (fs->fs_bsize > xfersize)
 513                         flags |= B_CLRBUF;
 514                 else
 515                         flags &= ~B_CLRBUF;
 516 /* XXX is uio->uio_offset the right thing here? */
 517                 error = VOP_BALLOC(vp, uio->uio_offset, xfersize,
 518                     ap->a_cred, flags, &bp);
 519                 if (error != 0)
 520                         break;
 521                 /*
 522                  * If the buffer is not valid and we did not clear garbage
 523                  * out above, we have to do so here even though the write
 524                  * covers the entire buffer in order to avoid a mmap()/write
 525                  * race where another process may see the garbage prior to
 526                  * the uiomove() for a write replacing it.
 527                  */
 528                 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
 529                         vfs_bio_clrbuf(bp);
 530                 if (ioflag & IO_DIRECT)
 531                         bp->b_flags |= B_DIRECT;
 532                 if (ioflag & IO_NOWDRAIN)
 533                         bp->b_flags |= B_NOWDRAIN;
 534
 535                 if (uio->uio_offset + xfersize > ip->i_size) {
 536                         ip->i_size = uio->uio_offset + xfersize;
 537                         extended = 1;
 538                 }
 539
 540                 size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
 541                 if (size < xfersize)
 542                         xfersize = size;
 543
 544                 error =
 545                     uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 546                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 547                     (LIST_FIRST(&bp->b_dep) == NULL)) {
 548                         bp->b_flags |= B_RELBUF;
 549                 }
 550
 551                 /*
 552                  * If IO_SYNC each buffer is written synchronously.  Otherwise
 553                  * if we have a severe page deficiency write the buffer
 554                  * asynchronously.  Otherwise try to cluster, and if that
 555                  * doesn't do it then either do an async write (if O_DIRECT),
 556                  * or a delayed write (if not).
 557                  */
 558
 559                 if (ioflag & IO_SYNC) {
 560                         (void)bwrite(bp);
 561                 } else if (vm_page_count_severe() ||
 562                             buf_dirty_count_severe() ||
 563                             (ioflag & IO_ASYNC)) {
 564                         bp->b_flags |= B_CLUSTEROK;
 565                         bawrite(bp);
 566                 } else if (xfersize + blkoffset == fs->fs_bsize) {
 567                         if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 568                                 bp->b_flags |= B_CLUSTEROK;
 569                                 cluster_write(bp, ip->i_size, seqcount);
 570                         } else {
 571                                 bawrite(bp);
 572                         }
 573                 } else if (ioflag & IO_DIRECT) {
 574                         bp->b_flags |= B_CLUSTEROK;
 575                         bawrite(bp);
 576                 } else {
 577                         bp->b_flags |= B_CLUSTEROK;
 578                         bdwrite(bp);
 579                 }
 580                 if (error || xfersize == 0)
 581                         break;
 582                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
 583         }
 584         /*
 585          * If we successfully wrote any data, and we are not the superuser
 586          * we clear the setuid and setgid bits as a precaution against
 587          * tampering.
 588          */
 589         if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
 590                 ip->i_mode &= ~(ISUID | ISGID);
 591         if (resid > uio->uio_resid)
 592                 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
 593         if (error) {
 594                 if (ioflag & IO_UNIT) {
 595                         (void)UFS_TRUNCATE(vp, osize,
 596                             ioflag & IO_SYNC, ap->a_cred, uio->uio_td);
 597                         uio->uio_offset -= resid - uio->uio_resid;
 598                         uio->uio_resid = resid;
 599                 }
 600         } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
 601                 error = UFS_UPDATE(vp, 1);
 602
 603         if (object)
 604                 vm_object_vndeallocate(object);
 605
 606         return (error);
 607 }
 608
 609
 610 /*
 611  * get page routine
 612  */
 613 int
 614 ffs_getpages(ap)
 615         struct vop_getpages_args *ap;
 616 {
 617         off_t foff, physoffset;
 618         int i, size, bsize;
 619         struct vnode *dp, *vp;
 620         vm_object_t obj;
 621         vm_pindex_t pindex, firstindex;
 622         vm_page_t mreq;
 623         int bbackwards, bforwards;
 624         int pbackwards, pforwards;
 625         int firstpage;
 626         int reqlblkno;
 627         daddr_t reqblkno;
 628         int poff;
 629         int pcount;
 630         int rtval;
 631         int pagesperblock;
 632
 633
 634         pcount = round_page(ap->a_count) / PAGE_SIZE;
 635         mreq = ap->a_m[ap->a_reqpage];
 636         firstindex = ap->a_m[0]->pindex;
 637
 638         /*
 639          * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
 640          * then the entire page is valid.  Since the page may be mapped,
 641          * user programs might reference data beyond the actual end of file
 642          * occuring within the page.  We have to zero that data.
 643          */
 644         if (mreq->valid) {
 645                 if (mreq->valid != VM_PAGE_BITS_ALL)
 646                         vm_page_zero_invalid(mreq, TRUE);
 647                 for (i = 0; i < pcount; i++) {
 648                         if (i != ap->a_reqpage) {
 649                                 vm_page_free(ap->a_m[i]);
 650                         }
 651                 }
 652                 return VM_PAGER_OK;
 653         }
 654
 655         vp = ap->a_vp;
 656         obj = vp->v_object;
 657         bsize = vp->v_mount->mnt_stat.f_iosize;
 658         pindex = mreq->pindex;
 659         foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
 660
 661         if (bsize < PAGE_SIZE)
 662                 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
 663                                                     ap->a_count,
 664                                                     ap->a_reqpage);
 665
 666         /*
 667          * foff is the file offset of the required page
 668          * reqlblkno is the logical block that contains the page
 669          * poff is the index of the page into the logical block
 670          */
 671         reqlblkno = foff / bsize;
 672         poff = (foff % bsize) / PAGE_SIZE;
 673
 674         if ( VOP_BMAP( vp, reqlblkno, &dp, &reqblkno,
 675                 &bforwards, &bbackwards) || (reqblkno == -1)) {
 676                 for(i = 0; i < pcount; i++) {
 677                         if (i != ap->a_reqpage)
 678                                 vm_page_free(ap->a_m[i]);
 679                 }
 680                 if (reqblkno == -1) {
 681                         if ((mreq->flags & PG_ZERO) == 0)
 682                                 vm_page_zero_fill(mreq);
 683                         vm_page_undirty(mreq);
 684                         mreq->valid = VM_PAGE_BITS_ALL;
 685                         return VM_PAGER_OK;
 686                 } else {
 687                         return VM_PAGER_ERROR;
 688                 }
 689         }
 690
 691         physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
 692         pagesperblock = bsize / PAGE_SIZE;
 693         /*
 694          * find the first page that is contiguous...
 695          * note that pbackwards is the number of pages that are contiguous
 696          * backwards.
 697          */
 698         firstpage = 0;
 699         if (ap->a_count) {
 700                 pbackwards = poff + bbackwards * pagesperblock;
 701                 if (ap->a_reqpage > pbackwards) {
 702                         firstpage = ap->a_reqpage - pbackwards;
 703                         for(i=0;i<firstpage;i++)
 704                                 vm_page_free(ap->a_m[i]);
 705                 }
 706
 707         /*
 708          * pforwards is the number of pages that are contiguous
 709          * after the current page.
 710          */
 711                 pforwards = (pagesperblock - (poff + 1)) +
 712                         bforwards * pagesperblock;
 713                 if (pforwards < (pcount - (ap->a_reqpage + 1))) {
 714                         for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
 715                                 vm_page_free(ap->a_m[i]);
 716                         pcount = ap->a_reqpage + pforwards + 1;
 717                 }
 718
 719         /*
 720          * number of pages for I/O corrected for the non-contig pages at
 721          * the beginning of the array.
 722          */
 723                 pcount -= firstpage;
 724         }
 725
 726         /*
 727          * calculate the size of the transfer
 728          */
 729
 730         size = pcount * PAGE_SIZE;
 731
 732         if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
 733                 obj->un_pager.vnp.vnp_size)
 734                 size = obj->un_pager.vnp.vnp_size -
 735                         IDX_TO_OFF(ap->a_m[firstpage]->pindex);
 736
 737         physoffset -= foff;
 738         rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
 739                 (ap->a_reqpage - firstpage), physoffset);
 740
 741         return (rtval);
 742 }
 743
 744 /*
 745  * put page routine
 746  *
 747  * XXX By default, wimp out... note that a_offset is ignored (and always
 748  * XXX has been).
 749  */
 750 int
 751 ffs_putpages(ap)
 752         struct vop_putpages_args *ap;
 753 {
 754         return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
 755                 ap->a_sync, ap->a_rtvals);
 756 }