sys/vfs/ufs/ufs_readwrite.c

   1 /*-
   2  * Copyright (c) 1993
   3  *      The Regents of the University of California.  All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  * 1. Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  * 2. Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in the
  12  *    documentation and/or other materials provided with the distribution.
  13  * 3. All advertising materials mentioning features or use of this software
  14  *    must display the following acknowledgement:
  15  *      This product includes software developed by the University of
  16  *      California, Berkeley and its contributors.
  17  * 4. Neither the name of the University nor the names of its contributors
  18  *    may be used to endorse or promote products derived from this software
  19  *    without specific prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  31  * SUCH DAMAGE.
  32  *
  33  *      @(#)ufs_readwrite.c     8.11 (Berkeley) 5/8/95
  34  * $FreeBSD: src/sys/ufs/ufs/ufs_readwrite.c,v 1.65.2.14 2003/04/04 22:21:29 tegge Exp $
  35  * $DragonFly: src/sys/vfs/ufs/ufs_readwrite.c,v 1.12 2004/07/18 19:43:48 drhodus Exp $
  36  */
  37
     /*
      * Local aliases used by the routines in this file: BLKSIZE() expands to
      * blksize(), FS names the superblock structure type (struct fs) and I_FS
      * names the inode member (i_fs) through which it is reached.
      */
  38 #define BLKSIZE(a, b, c)        blksize(a, b, c)
  39 #define FS                      struct fs
  40 #define I_FS                    i_fs
  41
  42 #include <vm/vm.h>
  43 #include <vm/vm_object.h>
  44 #include <vm/vm_pager.h>
  45 #include <vm/vm_map.h>
  46 #include <vm/vnode_pager.h>
  47 #include <sys/event.h>
  48 #include <sys/vmmeter.h>
  49 #include <vm/vm_page2.h>
  50
  51 #include "opt_directio.h"
  52
     /*
      * Invoke KNOTE() with hint 'b' on the note list embedded in the vnode's
      * v_pollinfo.vpi_selinfo.si_note.
      *
      * NOTE(review): 'vp' is expanded without parentheses, so the argument
      * should be a plain identifier (as it is at the call site in ffs_write()).
      */
  53 #define VN_KNOTE(vp, b) \
  54         KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))
  55
     /*
      * Raw read entry point; only declared when DIRECTIO is configured.
      * ffs_read() calls it for IO_DIRECT requests.
      */
  56 #ifdef DIRECTIO
  57 extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
  58 #endif
  59
  60 /*
  61  * Vnode op for reading.
  62  *
  63  * ffs_read(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
  64  *          struct ucred *a_cred)
      *
      * Copies file data into the caller's uio one filesystem block at a time:
      * each block is obtained with bread(), cluster_read() or breadn() and
      * handed to uiomove() (or uiomoveco() when ENABLE_VFS_IOOPT applies).
      *
      * Returns EFBIG when the starting offset exceeds fs_maxfilesize, 0 when
      * there is nothing to transfer (uio_resid <= 0, or the offset is at or
      * past i_size), and otherwise 0 or the first error reported by the
      * buffer read or the copy.  A short transfer is not an error; the
      * caller sees it through uio_resid.
      *
      * Unless the mount has MNT_NOATIME, IN_ACCESS is set on the inode when
      * the read hit EOF immediately, completed without error, or moved at
      * least some data before failing.
      *
      * With DIRECTIO configured and IO_DIRECT requested, ffs_rawread() is
      * tried first and its result is returned if it failed or did any work.
  65  */
  66 /* ARGSUSED */
  67 int
  68 ffs_read(struct vop_read_args *ap)
  69 {
  70         struct vnode *vp;
  71         struct inode *ip;
  72         struct uio *uio;
  73         FS *fs;
  74         struct buf *bp;
  75         ufs_daddr_t lbn, nextlbn;
  76         off_t bytesinfile;
  77         long size, xfersize, blkoffset;
  78         int error, orig_resid;
  79         u_short mode;
  80         int seqcount;
  81         int ioflag;
  82         vm_object_t object;
  83
  84         vp = ap->a_vp;
             /* The sequential-access hint travels above the low 16 flag bits. */
  85         seqcount = ap->a_ioflag >> 16;
  86         ip = VTOI(vp);
  87         mode = ip->i_mode;
  88         uio = ap->a_uio;
  89         ioflag = ap->a_ioflag;
  90 #ifdef DIRECTIO
  91         if ((ioflag & IO_DIRECT) != 0) {
  92                 int workdone;
  93
  94                 error = ffs_rawread(vp, uio, &workdone);
  95                 if (error || workdone)
  96                         return error;
  97         }
  98 #endif
  99
 100 #ifdef DIAGNOSTIC
 101         if (uio->uio_rw != UIO_READ)
 102                 panic("ffs_read: mode");
 103
 104         if (vp->v_type == VLNK) {
 105                 if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
 106                         panic("ffs_read: short symlink");
 107         } else if (vp->v_type != VREG && vp->v_type != VDIR)
 108                 panic("ffs_read: type %d", vp->v_type);
 109 #endif
 110         fs = ip->I_FS;
 111         if ((uint64_t)uio->uio_offset > fs->fs_maxfilesize)
 112                 return (EFBIG);
 113
 114         orig_resid = uio->uio_resid;
 115         if (orig_resid <= 0)
 116                 return (0);
 117
 118         object = vp->v_object;
 119
 120         bytesinfile = ip->i_size - uio->uio_offset;
 121         if (bytesinfile <= 0) {
 122                 if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
 123                         ip->i_flag |= IN_ACCESS;
 124                 return 0;
 125         }
 126
             /*
              * Take a reference on the VM object for the duration of the
              * transfer.  Every return below this point drops it again with
              * vm_object_vndeallocate().
              */
 127         if (object)
 128                 vm_object_reference(object);
 129
 130 #ifdef ENABLE_VFS_IOOPT
 131         /*
 132          * If IO optimisation is turned on,
 133          * and we are NOT a VM based IO request,
 134          * (i.e. not headed for the buffer cache)
 135          * but there IS a vm object associated with it.
 136          */
 137         if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
 138                 int nread, toread;
 139
 140                 toread = uio->uio_resid;
 141                 if (toread > bytesinfile)
 142                         toread = bytesinfile;
 143                 if (toread >= PAGE_SIZE) {
 144                         /*
 145                          * Then if it's at least a page in size, try
 146                          * get the data from the object using vm tricks
 147                          */
 148                         error = uioread(toread, uio, object, &nread);
 149                         if ((uio->uio_resid == 0) || (error != 0)) {
 150                                 /*
 151                                  * If we finished or there was an error
 152                                  * then finish up (the reference previously
 153                                  * obtained on object must be released).
 154                                  */
 155                                 if ((error == 0 ||
 156                                     uio->uio_resid != orig_resid) &&
 157                                     (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
 158                                         ip->i_flag |= IN_ACCESS;
 159
 160                                 if (object)
 161                                         vm_object_vndeallocate(object);
 162                                 return error;
 163                         }
 164                 }
 165         }
 166 #endif
 167
 168         /*
 169          * Ok so we couldn't do it all in one vm trick...
 170          * so cycle around trying smaller bites..
 171          */
 172         for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 173                 if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
 174                         break;
 175 #ifdef ENABLE_VFS_IOOPT
 176                 if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
 177                         /*
 178                          * Obviously we didn't finish above, but we
 179                          * didn't get an error either. Try the same trick again.
 180                          * but this time we are looping.
 181                          */
 182                         int nread, toread;
 183                         toread = uio->uio_resid;
 184                         if (toread > bytesinfile)
 185                                 toread = bytesinfile;
 186
 187                         /*
 188                          * Once again, if there isn't enough for a
 189                          * whole page, don't try optimising.
 190                          */
 191                         if (toread >= PAGE_SIZE) {
 192                                 error = uioread(toread, uio, object, &nread);
 193                                 if ((uio->uio_resid == 0) || (error != 0)) {
 194                                         /*
 195                                          * If we finished or there was an
 196                                          * error then finish up (the reference
 197                                          * previously obtained on object must
 198                                          * be released).
 199                                          */
 200                                         if ((error == 0 ||
 201                                             uio->uio_resid != orig_resid) &&
 202                                             (vp->v_mount->mnt_flag &
 203                                             MNT_NOATIME) == 0)
 204                                                 ip->i_flag |= IN_ACCESS;
 205                                         if (object)
 206                                                 vm_object_vndeallocate(object);
 207                                         return error;
 208                                 }
 209                                 /*
 210                                  * To get here we didn't finish or err.
 211                                  * If we did get some data,
 212                                  * loop to try another bite.
 213                                  */
 214                                 if (nread > 0) {
 215                                         continue;
 216                                 }
 217                         }
 218                 }
 219 #endif
 220
 221                 lbn = lblkno(fs, uio->uio_offset);
 222                 nextlbn = lbn + 1;
 223
 224                 /*
 225                  * size of buffer.  The buffer representing the
 226                  * end of the file is rounded up to the size of
 227                  * the block type ( fragment or full block,
 228                  * depending ).
 229                  */
 230                 size = BLKSIZE(fs, ip, lbn);
 231                 blkoffset = blkoff(fs, uio->uio_offset);
 232
 233                 /*
 234                  * The amount we want to transfer in this iteration is
 235                  * one FS block less the amount of the data before
 236                  * our startpoint (duh!)
 237                  */
 238                 xfersize = fs->fs_bsize - blkoffset;
 239
 240                 /*
 241                  * But if we actually want less than the block,
 242                  * or the file doesn't have a whole block more of data,
 243                  * then use the lesser number.
 244                  */
 245                 if (uio->uio_resid < xfersize)
 246                         xfersize = uio->uio_resid;
 247                 if (bytesinfile < xfersize)
 248                         xfersize = bytesinfile;
 249
 250                 if (lblktosize(fs, nextlbn) >= ip->i_size) {
 251                         /*
 252                          * Don't do readahead if this is the end of the file.
 253                          */
 254                         error = bread(vp, lbn, size, &bp);
 255                 } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 256                         /*
 257                          * Otherwise if we are allowed to cluster,
 258                          * grab as much as we can.
 259                          *
 260                          * XXX  This may not be a win if we are not
 261                          * doing sequential access.
 262                          */
 263                         error = cluster_read(vp, ip->i_size, lbn,
 264                                 size, uio->uio_resid, seqcount, &bp);
 265                 } else if (seqcount > 1) {
 266                         /*
 267                          * If we are NOT allowed to cluster, then
 268                          * if we appear to be acting sequentially,
 269                          * fire off a request for a readahead
 270                          * as well as a read. Note that the 4th and 5th
 271                          * arguments point to arrays of the size specified in
 272                          * the 6th argument.
 273                          */
 274                         int nextsize = BLKSIZE(fs, ip, nextlbn);
 275                         error = breadn(vp, lbn,
 276                             size, &nextlbn, &nextsize, 1, &bp);
 277                 } else {
 278                         /*
 279                          * Failing all of the above, just read what the
 280                          * user asked for. Interestingly, the same as
 281                          * the first option above.
 282                          */
 283                         error = bread(vp, lbn, size, &bp);
 284                 }
                     /*
                      * On a failed read the buffer is released right here and
                      * bp is cleared so the cleanup after the loop does not
                      * release it a second time.
                      */
 285                 if (error) {
 286                         brelse(bp);
 287                         bp = NULL;
 288                         break;
 289                 }
 290
 291                 /*
 292                  * If IO_DIRECT then set B_DIRECT for the buffer.  This
 293                  * will cause us to attempt to release the buffer later on
 294                  * and will cause the buffer cache to attempt to free the
 295                  * underlying pages.
 296                  */
 297                 if (ioflag & IO_DIRECT)
 298                         bp->b_flags |= B_DIRECT;
 299
 300                 /*
 301                  * We should only get non-zero b_resid when an I/O error
 302                  * has occurred, which should cause us to break above.
 303                  * However, if the short read did not cause an error,
 304                  * then we want to ensure that we do not uiomove bad
 305                  * or uninitialized data.
 306                  *
 307                  * XXX b_resid is only valid when an actual I/O has occurred
 308                  * and may be incorrect if the buffer is B_CACHE or if the
 309                  * last op on the buffer was a failed write.  This KASSERT
 310                  * is a precursor to removing it from the UFS code.
 311                  */
 312                 KASSERT(bp->b_resid == 0, ("bp->b_resid != 0"));
 313                 size -= bp->b_resid;
 314                 if (size < xfersize) {
                             /* Leaves the loop with bp still held; freed below. */
 315                         if (size == 0)
 316                                 break;
 317                         xfersize = size;
 318                 }
 319
 320 #ifdef ENABLE_VFS_IOOPT
 321                 if (vfs_ioopt && object &&
 322                     (bp->b_flags & B_VMIO) &&
 323                     ((blkoffset & PAGE_MASK) == 0) &&
 324                     ((xfersize & PAGE_MASK) == 0)) {
 325                         /*
 326                          * If VFS IO  optimisation is turned on,
 327                          * and it's an exact page multiple
 328                          * And a normal VM based op,
 329                          * then use uiomoveco()
 330                          */
 331                         error =
 332                                 uiomoveco((char *)bp->b_data + blkoffset,
 333                                         (int)xfersize, uio, object);
 334                 } else
 335 #endif
 336                 {
 337                         /*
 338                          * otherwise use the general form
 339                          */
 340                         error =
 341                                 uiomove((char *)bp->b_data + blkoffset,
 342                                         (int)xfersize, uio);
 343                 }
 344
                     /* Copy failed: leave with bp held; it is released below. */
 345                 if (error)
 346                         break;
 347
 348                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 349                     (LIST_FIRST(&bp->b_dep) == NULL)) {
 350                         /*
 351                          * If there are no dependencies, and it's VMIO,
 352                          * then we don't need the buf, mark it available
 353                          * for freeing. The VM has the data.
 354                          */
 355                         bp->b_flags |= B_RELBUF;
 356                         brelse(bp);
 357                 } else {
 358                         /*
 359                          * Otherwise let whoever
 360                          * made the request take care of
 361                          * freeing it. We just queue
 362                          * it onto another list.
 363                          */
 364                         bqrelse(bp);
 365                 }
 366         }
 367
 368         /*
 369          * This can only happen in the case of an error
 370          * because the loop above resets bp to NULL on each iteration
 371          * and on normal completion has not set a new value into it.
 372          * so it must have come from a 'break' statement
 373          */
 374         if (bp != NULL) {
 375                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 376                     (LIST_FIRST(&bp->b_dep) == NULL)) {
 377                         bp->b_flags |= B_RELBUF;
 378                         brelse(bp);
 379                 } else {
 380                         bqrelse(bp);
 381                 }
 382         }
 383
 384         if (object)
 385                 vm_object_vndeallocate(object);
             /*
              * Record the access unless the read failed before moving any
              * data, or the mount asked for no access-time updates.
              */
 386         if ((error == 0 || uio->uio_resid != orig_resid) &&
 387             (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
 388                 ip->i_flag |= IN_ACCESS;
 389         return (error);
 390 }
 391
 392 /*
 393  * Vnode op for writing.
 394  *
 395  * ffs_write(struct vnode *a_vp, struct uio *a_uio, int a_ioflag,
 396  *           struct ucred *a_cred)
      *
      * Writes the caller's uio into the file one filesystem block at a time:
      * VOP_BALLOC() supplies the buffer for each block, uiomove() fills it,
      * and the buffer is then written synchronously, asynchronously,
      * clustered or delayed depending on the I/O flags and memory pressure.
      *
      * Returns:
      *   EPERM  the inode has the APPEND flag and the offset is not i_size
      *   EFBIG  the offset is negative, the write would run past
      *          fs_maxfilesize, or (regular files with a process attached)
      *          past the RLIMIT_FSIZE soft limit, in which case SIGXFSZ is
      *          also posted to the process
      *   otherwise 0, the first error from VOP_BALLOC()/uiomove(), or the
      *   result of UFS_UPDATE() after a successful IO_SYNC write.
      *
      * When an error occurs and IO_UNIT is set, the file is truncated back to
      * its original size and the uio offset/residual are restored, so the
      * write appears not to have happened.
      *
      * Panics if called on a directory or on an unexpected vnode type.
 397  */
 398 int
 399 ffs_write(struct vop_write_args *ap)
 400 {
 401         struct vnode *vp;
 402         struct uio *uio;
 403         struct inode *ip;
 404         FS *fs;
 405         struct buf *bp;
 406         ufs_daddr_t lbn;
 407         off_t osize;
 408         int seqcount;
 409         int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
 410         vm_object_t object;
 411         struct thread *td;
 412
 413         extended = 0;
             /* The sequential-access hint travels above the low 16 flag bits. */
 414         seqcount = ap->a_ioflag >> 16;
 415         ioflag = ap->a_ioflag;
 416         uio = ap->a_uio;
 417         vp = ap->a_vp;
 418         ip = VTOI(vp);
 419
             /*
              * Take a reference on the VM object for the duration of the
              * write.  Every return below drops it again with
              * vm_object_vndeallocate().
              */
 420         object = vp->v_object;
 421         if (object)
 422                 vm_object_reference(object);
 423
 424 #ifdef DIAGNOSTIC
 425         if (uio->uio_rw != UIO_WRITE)
 426                 panic("ffs_write: mode");
 427 #endif
 428
 429         switch (vp->v_type) {
 430         case VREG:
 431                 if (ioflag & IO_APPEND)
 432                         uio->uio_offset = ip->i_size;
 433                 if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
 434                         if (object)
 435                                 vm_object_vndeallocate(object);
 436                         return (EPERM);
 437                 }
 438                 /* FALLTHROUGH */
 439         case VLNK:
 440                 break;
 441         case VDIR:
 442                 panic("ffs_write: dir write");
 443                 break;
 444         default:
 445                 panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
 446                         (int)uio->uio_offset,
 447                         (int)uio->uio_resid
 448                 );
 449         }
 450
 451         fs = ip->I_FS;
 452         if (uio->uio_offset < 0 ||
 453             (uint64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
 454                 if (object)
 455                         vm_object_vndeallocate(object);
 456                 return (EFBIG);
 457         }
 458         /*
 459          * Maybe this should be above the vnode op call, but so long as
 460          * file servers have no limits, I don't think it matters.
 461          */
 462         td = uio->uio_td;
 463         if (vp->v_type == VREG && td && td->td_proc &&
 464             uio->uio_offset + uio->uio_resid >
 465             td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
 466                 psignal(td->td_proc, SIGXFSZ);
 467                 if (object)
 468                         vm_object_vndeallocate(object);
 469                 return (EFBIG);
 470         }
 471
             /*
              * Remember the starting residual and file size: they are used to
              * tell whether anything was written and to undo a failed IO_UNIT
              * write.
              */
 472         resid = uio->uio_resid;
 473         osize = ip->i_size;
 474
 475         /*
 476          * NOTE! These B_ flags are actually balloc-only flags, not buffer
 477          * flags.  They are similar to the BA_ flags in fbsd.
 478          */
 479         if (seqcount > B_SEQMAX)
 480                 flags = B_SEQMAX << B_SEQSHIFT;
 481         else
 482                 flags = seqcount << B_SEQSHIFT;
 483         if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
 484                 flags |= B_SYNC;
 485
 486         if (object && (object->flags & OBJ_OPT)) {
 487                 vm_freeze_copyopts(object,
 488                         OFF_TO_IDX(uio->uio_offset),
 489                         OFF_TO_IDX(uio->uio_offset + uio->uio_resid + PAGE_MASK));
 490         }
 491
 492         for (error = 0; uio->uio_resid > 0;) {
 493                 lbn = lblkno(fs, uio->uio_offset);
 494                 blkoffset = blkoff(fs, uio->uio_offset);
                     /* At most the remainder of this block, capped by uio_resid. */
 495                 xfersize = fs->fs_bsize - blkoffset;
 496                 if (uio->uio_resid < xfersize)
 497                         xfersize = uio->uio_resid;
 498
 499                 if (uio->uio_offset + xfersize > ip->i_size)
 500                         vnode_pager_setsize(vp, uio->uio_offset + xfersize);
 501
 502                 /*
 503                  * We must perform a read-before-write if the transfer
 504                  * size does not cover the entire buffer.
 505                  */
 506                 if (fs->fs_bsize > xfersize)
 507                         flags |= B_CLRBUF;
 508                 else
 509                         flags &= ~B_CLRBUF;
 510 /* XXX is uio->uio_offset the right thing here? */
 511                 error = VOP_BALLOC(vp, uio->uio_offset, xfersize,
 512                     ap->a_cred, flags, &bp);
 513                 if (error != 0)
 514                         break;
 515                 /*
 516                  * If the buffer is not valid and we did not clear garbage
 517                  * out above, we have to do so here even though the write
 518                  * covers the entire buffer in order to avoid a mmap()/write
 519                  * race where another process may see the garbage prior to
 520                  * the uiomove() for a write replacing it.
 521                  */
 522                 if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
 523                         vfs_bio_clrbuf(bp);
 524                 if (ioflag & IO_DIRECT)
 525                         bp->b_flags |= B_DIRECT;
 526                 if (ioflag & IO_NOWDRAIN)
 527                         bp->b_flags |= B_NOWDRAIN;
 528                 if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
 529                         bp->b_flags |= B_NOCACHE;
 530
                     /*
                      * The inode size is advanced before the copy-in, so a
                      * failed uiomove() leaves i_size extended unless the
                      * IO_UNIT rollback at the bottom truncates it again.
                      */
 531                 if (uio->uio_offset + xfersize > ip->i_size) {
 532                         ip->i_size = uio->uio_offset + xfersize;
 533                         extended = 1;
 534                 }
 535
 536                 size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
 537                 if (size < xfersize)
 538                         xfersize = size;
 539
 540                 error =
 541                     uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 542                 if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
 543                     (LIST_FIRST(&bp->b_dep) == NULL)) {
 544                         bp->b_flags |= B_RELBUF;
 545                 }
 546
 547                 /*
 548                  * If IO_SYNC each buffer is written synchronously.  Otherwise
 549                  * if we have a severe page deficiency write the buffer
 550                  * asynchronously.  Otherwise try to cluster, and if that
 551                  * doesn't do it then either do an async write (if O_DIRECT),
 552                  * or a delayed write (if not).
 553                  */
 554
                     /*
                      * The buffer is handed off on every path below, including
                      * when uiomove() failed; the uiomove() error is only
                      * acted on after the buffer has been disposed of.
                      *
                      * NOTE(review): the result of the synchronous bwrite() is
                      * explicitly discarded, so a failure there is not
                      * reflected in 'error' -- confirm this is intended.
                      */
 555                 if (ioflag & IO_SYNC) {
 556                         (void)bwrite(bp);
 557                 } else if (vm_page_count_severe() ||
 558                             buf_dirty_count_severe() ||
 559                             (ioflag & IO_ASYNC)) {
 560                         bp->b_flags |= B_CLUSTEROK;
 561                         bawrite(bp);
 562                 } else if (xfersize + blkoffset == fs->fs_bsize) {
 563                         if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 564                                 bp->b_flags |= B_CLUSTEROK;
 565                                 cluster_write(bp, ip->i_size, seqcount);
 566                         } else {
 567                                 bawrite(bp);
 568                         }
 569                 } else if (ioflag & IO_DIRECT) {
 570                         bp->b_flags |= B_CLUSTEROK;
 571                         bawrite(bp);
 572                 } else {
 573                         bp->b_flags |= B_CLUSTEROK;
 574                         bdwrite(bp);
 575                 }
 576                 if (error || xfersize == 0)
 577                         break;
 578                 ip->i_flag |= IN_CHANGE | IN_UPDATE;
 579         }
 580         /*
 581          * If we successfully wrote any data, and we are not the superuser
 582          * we clear the setuid and setgid bits as a precaution against
 583          * tampering.
 584          */
 585         if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
 586                 ip->i_mode &= ~(ISUID | ISGID);
             /* Notify knote listeners if any data was written. */
 587         if (resid > uio->uio_resid)
 588                 VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
 589         if (error) {
                     /*
                      * IO_UNIT: undo the partial write by truncating back to
                      * the original size and restoring the uio to its state on
                      * entry.  The truncate result itself is ignored.
                      */
 590                 if (ioflag & IO_UNIT) {
 591                         (void)UFS_TRUNCATE(vp, osize,
 592                             ioflag & IO_SYNC, ap->a_cred, uio->uio_td);
 593                         uio->uio_offset -= resid - uio->uio_resid;
 594                         uio->uio_resid = resid;
 595                 }
 596         } else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
 597                 error = UFS_UPDATE(vp, 1);
 598
 599         if (object)
 600                 vm_object_vndeallocate(object);
 601
 602         return (error);
 603 }
 604
 605
 606 /*
 607  * get page routine
 608  */
 609 int
 610 ffs_getpages(struct vop_getpages_args *ap)
 611 {
 612         off_t foff, physoffset;
 613         int i, size, bsize;
 614         struct vnode *dp, *vp;
 615         vm_object_t obj;
 616         vm_pindex_t pindex, firstindex;
 617         vm_page_t mreq;
 618         int bbackwards, bforwards;
 619         int pbackwards, pforwards;
 620         int firstpage;
 621         int reqlblkno;
 622         daddr_t reqblkno;
 623         int poff;
 624         int pcount;
 625         int rtval;
 626         int pagesperblock;
 627
 628
 629         pcount = round_page(ap->a_count) / PAGE_SIZE;
 630         mreq = ap->a_m[ap->a_reqpage];
 631         firstindex = ap->a_m[0]->pindex;
 632
 633         /*
 634          * if ANY DEV_BSIZE blocks are valid on a large filesystem block,
 635          * then the entire page is valid.  Since the page may be mapped,
 636          * user programs might reference data beyond the actual end of file
 637          * occuring within the page.  We have to zero that data.
 638          */
 639         if (mreq->valid) {
 640                 if (mreq->valid != VM_PAGE_BITS_ALL)
 641                         vm_page_zero_invalid(mreq, TRUE);
 642                 for (i = 0; i < pcount; i++) {
 643                         if (i != ap->a_reqpage) {
 644                                 vm_page_free(ap->a_m[i]);
 645                         }
 646                 }
 647                 return VM_PAGER_OK;
 648         }
 649
 650         vp = ap->a_vp;
 651         obj = vp->v_object;
 652         bsize = vp->v_mount->mnt_stat.f_iosize;
 653         pindex = mreq->pindex;
 654         foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;
 655
 656         if (bsize < PAGE_SIZE)
 657                 return vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
 658                                                     ap->a_count,
 659                                                     ap->a_reqpage);
 660
 661         /*
 662          * foff is the file offset of the required page
 663          * reqlblkno is the logical block that contains the page
 664          * poff is the index of the page into the logical block
 665          */
 666         reqlblkno = foff / bsize;
 667         poff = (foff % bsize) / PAGE_SIZE;
 668
 669         if ( VOP_BMAP( vp, reqlblkno, &dp, &reqblkno,
 670                 &bforwards, &bbackwards) || (reqblkno == -1)) {
 671                 for(i = 0; i < pcount; i++) {
 672                         if (i != ap->a_reqpage)
 673                                 vm_page_free(ap->a_m[i]);
 674                 }
 675                 if (reqblkno == -1) {
 676                         if ((mreq->flags & PG_ZERO) == 0)
 677                                 vm_page_zero_fill(mreq);
 678                         vm_page_undirty(mreq);
 679                         mreq->valid = VM_PAGE_BITS_ALL;
 680                         return VM_PAGER_OK;
 681                 } else {
 682                         return VM_PAGER_ERROR;
 683                 }
 684         }
 685
 686         physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
 687         pagesperblock = bsize / PAGE_SIZE;
 688         /*
 689          * find the first page that is contiguous...
 690          * note that pbackwards is the number of pages that are contiguous
 691          * backwards.
 692          */
 693         firstpage = 0;
 694         if (ap->a_count) {
 695                 pbackwards = poff + bbackwards * pagesperblock;
 696                 if (ap->a_reqpage > pbackwards) {
 697                         firstpage = ap->a_reqpage - pbackwards;
 698                         for(i=0;i<firstpage;i++)
 699                                 vm_page_free(ap->a_m[i]);
 700                 }
 701
 702         /*
 703          * pforwards is the number of pages that are contiguous
 704          * after the current page.
 705          */
 706                 pforwards = (pagesperblock - (poff + 1)) +
 707                         bforwards * pagesperblock;
 708                 if (pforwards < (pcount - (ap->a_reqpage + 1))) {
 709                         for( i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
 710                                 vm_page_free(ap->a_m[i]);
 711                         pcount = ap->a_reqpage + pforwards + 1;
 712                 }
 713
 714         /*
 715          * number of pages for I/O corrected for the non-contig pages at
 716          * the beginning of the array.
 717          */
 718                 pcount -= firstpage;
 719         }
 720
 721         /*
 722          * calculate the size of the transfer
 723          */
 724
 725         size = pcount * PAGE_SIZE;
 726
 727         if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
 728                 obj->un_pager.vnp.vnp_size)
 729                 size = obj->un_pager.vnp.vnp_size -
 730                         IDX_TO_OFF(ap->a_m[firstpage]->pindex);
 731
 732         physoffset -= foff;
 733         rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
 734                 (ap->a_reqpage - firstpage), physoffset);
 735
 736         return (rtval);
 737 }
 738
 739 /*
 740  * put page routine
 741  *
 742  * XXX By default, wimp out... note that a_offset is ignored (and always
 743  * XXX has been).
 744  */
 745 int
 746 ffs_putpages(struct vop_putpages_args *ap)
 747 {
 748         return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
 749                 ap->a_sync, ap->a_rtvals);
 750 }