sys/kern/vfs_vm.c

   1 /*
   2  * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34
  35 /*
  36  * Implements new VFS/VM coherency functions.  For conforming VFSs
  37  * we treat the backing VM object slightly differently.  Instead of
  38  * maintaining a number of pages to exactly fit the size of the file
  39  * we instead maintain pages to fit the entire contents of the last
  40  * buffer cache buffer used by the file.
  41  *
  42  * For VFSs like NFS and HAMMER which use (generally speaking) fixed
  43  * sized buffers this greatly reduces the complexity of VFS/VM interactions.
  44  *
  45  * Truncations no longer invalidate pages covered by the buffer cache
  46  * beyond the file EOF which still fit within the file's last buffer.
  47  * We simply unmap them and do not allow userland to fault them in.
  48  *
  49  * The VFS is no longer responsible for zero-filling buffers during a
  50  * truncation, the last buffer will be automatically zero-filled by
  51  * nvtruncbuf().
  52  *
  53  * This code is intended to (eventually) replace vtruncbuf() and
  54  * vnode_pager_setsize().
  55  */
  56
  57 #include <sys/param.h>
  58 #include <sys/systm.h>
  59 #include <sys/buf.h>
  60 #include <sys/conf.h>
  61 #include <sys/fcntl.h>
  62 #include <sys/file.h>
  63 #include <sys/kernel.h>
  64 #include <sys/malloc.h>
  65 #include <sys/mount.h>
  66 #include <sys/proc.h>
  67 #include <sys/socket.h>
  68 #include <sys/stat.h>
  69 #include <sys/sysctl.h>
  70 #include <sys/unistd.h>
  71 #include <sys/vmmeter.h>
  72 #include <sys/vnode.h>
  73
  74 #include <machine/limits.h>
  75
  76 #include <vm/vm.h>
  77 #include <vm/vm_object.h>
  78 #include <vm/vm_extern.h>
  79 #include <vm/vm_kern.h>
  80 #include <vm/pmap.h>
  81 #include <vm/vm_map.h>
  82 #include <vm/vm_page.h>
  83 #include <vm/vm_pager.h>
  84 #include <vm/vnode_pager.h>
  85 #include <vm/vm_zone.h>
  86
  87 #include <sys/buf2.h>
  88 #include <vm/vm_page2.h>
  89
  90 static int nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
  91 static int nvtruncbuf_bp_trunc(struct buf *bp, void *data);
  92 static int nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
  93 static int nvtruncbuf_bp_metasync(struct buf *bp, void *data);
  94
  95 /*
  96  * Truncate a file's buffer and pages to a specified length. The
  97  * byte-granular length of the file is specified along with the block
  98  * size of the buffer containing that offset.
  99  *
 100  * If the last buffer straddles the length its contents will be zero-filled
 101  * as appropriate.  All buffers and pages after the last buffer will be
 102  * destroyed.  The last buffer itself will be destroyed only if the length
 103  * is exactly aligned with it.
 104  *
 105  * UFS typically passes the old block size prior to the actual truncation,
 106  * then later resizes the block based on the new file size.  NFS uses a
 107  * fixed block size and doesn't care.  HAMMER uses a block size based on
 108  * the offset which is fixed for any particular offset.
 109  *
 110  * When zero-filling we must bdwrite() to avoid a window of opportunity
 111  * where the kernel might throw away a clean buffer and the filesystem
 112  * then attempts to bread() it again before completing (or as part of)
 113  * the extension.  The filesystem is still responsible for zero-filling
 114  * any remainder when writing to the media in the strategy function when
 115  * it is able to do so without the page being mapped.  The page may still
 116  * be mapped by userland here.
 117  *
 118  * When modifying a buffer we must clear any cached raw disk offset.
 119  * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 120  * never overwrite existing data blocks.
 121  */
 122
 123 struct truncbuf_info {
 124         struct vnode *vp;
 125         off_t truncloffset;     /* truncation point */
 126         int clean;              /* clean tree, else dirty tree */
 127 };
 128
 129 int
 130 nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff, int flags)
 131 {
 132         struct truncbuf_info info;
 133         off_t truncboffset;
 134         const char *filename;
 135         struct buf *bp;
 136         int count;
 137         int error;
 138
 139         /*
 140          * Round up to the *next* block, then destroy the buffers in question.
 141          * Since we are only removing some of the buffers we must rely on the
 142          * scan count to determine whether a loop is necessary.
 143          *
 144          * Destroy any pages beyond the last buffer.
 145          */
 146         if (boff < 0)
 147                 boff = (int)(length % blksize);
 148         if (boff)
 149                 info.truncloffset = length + (blksize - boff);
 150         else
 151                 info.truncloffset = length;
 152         info.vp = vp;
 153         lwkt_gettoken(&vp->v_token);
 154         do {
 155                 info.clean = 1;
 156                 count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
 157                                 nvtruncbuf_bp_trunc_cmp,
 158                                 nvtruncbuf_bp_trunc, &info);
 159                 info.clean = 0;
 160                 count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
 161                                 nvtruncbuf_bp_trunc_cmp,
 162                                 nvtruncbuf_bp_trunc, &info);
 163         } while(count);
 164
 165         nvnode_pager_setsize(vp, length, blksize, boff);
 166
 167         /*
 168          * Zero-fill the area beyond the file EOF that still fits within
 169          * the last buffer.  We must mark the buffer as dirty even though
 170          * the modified area is beyond EOF to avoid races where the kernel
 171          * might flush the buffer before the filesystem is able to reallocate
 172          * the block.
 173          *
 174          * The VFS is responsible for dealing with the actual truncation.
 175          *
 176          * Only do this if NVEXTF_TRIVIAL is not set, otherwise it is up to
 177          * the VFS to handle the block straddling the EOF.
 178          */
 179         if (boff && (flags & NVEXTF_TRIVIAL) == 0) {
 180                 truncboffset = length - boff;
 181                 error = bread_kvabio(vp, truncboffset, blksize, &bp);
 182                 if (error == 0) {
 183                         bkvasync(bp);
 184                         bzero(bp->b_data + boff, blksize - boff);
 185                         if (bp->b_flags & B_DELWRI) {
 186                                 if (bp->b_dirtyoff > boff)
 187                                         bp->b_dirtyoff = boff;
 188                                 if (bp->b_dirtyend > boff)
 189                                         bp->b_dirtyend = boff;
 190                         }
 191                         bp->b_bio2.bio_offset = NOOFFSET;
 192                         if (flags & NVEXTF_BUWRITE)
 193                                 buwrite(bp);
 194                         else
 195                                 bdwrite(bp);
 196                 } else {
 197                         kprintf("nvtruncbuf: bread error %d @0x%016jx\n",
 198                                 error, truncboffset);
 199                         bp->b_flags |= B_INVAL | B_RELBUF;
 200                         brelse(bp);
 201                 }
 202         } else {
 203                 error = 0;
 204         }
 205
 206         /*
 207          * For safety, fsync any remaining metadata if the file is not being
 208          * truncated to 0.  Since the metadata does not represent the entire
 209          * dirty list we have to rely on the hit count to ensure that we get
 210          * all of it.
 211          *
 212          * This is typically applicable only to UFS.  NFS and HAMMER do
 213          * not store indirect blocks in the per-vnode buffer cache.
 214          */
 215         if (length > 0) {
 216                 do {
 217                         count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
 218                                         nvtruncbuf_bp_metasync_cmp,
 219                                         nvtruncbuf_bp_metasync, &info);
 220                 } while (count);
 221         }
 222
 223         /*
 224          * It is possible to have in-progress I/O from buffers that were
 225          * not part of the truncation.  This should not happen if we
 226          * are truncating to 0-length.
 227          */
 228         bio_track_wait(&vp->v_track_write, 0, 0);
 229
 230         /*
 231          * Debugging only
 232          */
 233         spin_lock(&vp->v_spin);
 234         filename = TAILQ_FIRST(&vp->v_namecache) ?
 235                    TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
 236         spin_unlock(&vp->v_spin);
 237
 238         /*
 239          * Make sure no buffers were instantiated while we were trying
 240          * to clean out the remaining VM pages.  This could occur due
 241          * to busy dirty VM pages being flushed out to disk.
 242          */
 243         do {
 244                 info.clean = 1;
 245                 count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
 246                                 nvtruncbuf_bp_trunc_cmp,
 247                                 nvtruncbuf_bp_trunc, &info);
 248                 info.clean = 0;
 249                 count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
 250                                 nvtruncbuf_bp_trunc_cmp,
 251                                 nvtruncbuf_bp_trunc, &info);
 252                 if (count) {
 253                         kprintf("Warning: vtruncbuf():  Had to re-clean %d "
 254                                "left over buffers in %s\n", count, filename);
 255                 }
 256         } while(count);
 257
 258         lwkt_reltoken(&vp->v_token);
 259
 260         return (error);
 261 }
 262
 263 /*
 264  * The callback buffer is beyond the new file EOF and must be destroyed.
 265  * Note that the compare function must conform to the RB_SCAN's requirements.
 266  */
 267 static
 268 int
 269 nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
 270 {
 271         struct truncbuf_info *info = data;
 272
 273         if (bp->b_loffset >= info->truncloffset)
 274                 return(0);
 275         return(-1);
 276 }
 277
 278 static
 279 int
 280 nvtruncbuf_bp_trunc(struct buf *bp, void *data)
 281 {
 282         struct truncbuf_info *info = data;
 283
 284         /*
 285          * Do not try to use a buffer we cannot immediately lock,
 286          * but sleep anyway to prevent a livelock.  The code will
 287          * loop until all buffers can be acted upon.
 288          */
 289         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 290                 atomic_add_int(&bp->b_refs, 1);
 291                 if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
 292                         BUF_UNLOCK(bp);
 293                 atomic_subtract_int(&bp->b_refs, 1);
 294         } else if ((info->clean && (bp->b_flags & B_DELWRI)) ||
 295                    (info->clean == 0 && (bp->b_flags & B_DELWRI) == 0) ||
 296                    bp->b_vp != info->vp ||
 297                    nvtruncbuf_bp_trunc_cmp(bp, data)) {
 298                 BUF_UNLOCK(bp);
 299         } else {
 300                 bremfree(bp);
 301                 bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
 302                 brelse(bp);
 303         }
 304         lwkt_yield();
 305         return(1);
 306 }
 307
 308 /*
 309  * Fsync all meta-data after truncating a file to be non-zero.  Only metadata
 310  * blocks (with a negative loffset) are scanned.
 311  * Note that the compare function must conform to the RB_SCAN's requirements.
 312  */
 313 static int
 314 nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data __unused)
 315 {
 316         if (bp->b_loffset < 0)
 317                 return(0);
 318         lwkt_yield();
 319         return(1);
 320 }
 321
 322 static int
 323 nvtruncbuf_bp_metasync(struct buf *bp, void *data)
 324 {
 325         struct truncbuf_info *info = data;
 326
 327         /*
 328          * Do not try to use a buffer we cannot immediately lock,
 329          * but sleep anyway to prevent a livelock.  The code will
 330          * loop until all buffers can be acted upon.
 331          */
 332         if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
 333                 atomic_add_int(&bp->b_refs, 1);
 334                 if (BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL) == 0)
 335                         BUF_UNLOCK(bp);
 336                 atomic_subtract_int(&bp->b_refs, 1);
 337         } else if ((bp->b_flags & B_DELWRI) == 0 ||
 338                    bp->b_vp != info->vp ||
 339                    nvtruncbuf_bp_metasync_cmp(bp, data)) {
 340                 BUF_UNLOCK(bp);
 341         } else {
 342                 bremfree(bp);
 343                 bawrite(bp);
 344         }
 345         lwkt_yield();
 346         return(1);
 347 }
 348
 349 /*
 350  * Extend a file's buffer and pages to a new, larger size.  The block size
 351  * at both the old and new length must be passed, but buffer cache operations
 352  * will only be performed on the old block.  The new nlength/nblksize will
 353  * be used to properly set the VM object size.
 354  *
 355  * To make this explicit we require the old length to passed even though
 356  * we can acquire it from vp->v_filesize, which also avoids potential
 357  * corruption if the filesystem and vp get desynchronized somehow.
 358  *
 359  * If the caller intends to immediately write into the newly extended
 360  * space pass NVEXTF_TRIVIAL.  If not set, the original buffer will be
 361  * zero-filled as necessary to clean out any junk in the extended space.
 362  * If non-zero the original buffer (straddling EOF) is not touched.
 363  *
 364  * When zero-filling we must bdwrite() to avoid a window of opportunity
 365  * where the kernel might throw away a clean buffer and the filesystem
 366  * then attempts to bread() it again before completing (or as part of)
 367  * the extension.  The filesystem is still responsible for zero-filling
 368  * any remainder when writing to the media in the strategy function when
 369  * it is able to do so without the page being mapped.  The page may still
 370  * be mapped by userland here.
 371  *
 372  * When modifying a buffer we must clear any cached raw disk offset.
 373  * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 374  * never overwrite existing data blocks.
 375  */
 376 int
 377 nvextendbuf(struct vnode *vp, off_t olength, off_t nlength,
 378             int oblksize, int nblksize, int oboff, int nboff, int flags)
 379 {
 380         off_t truncboffset;
 381         struct buf *bp;
 382         int error;
 383
 384         error = 0;
 385         nvnode_pager_setsize(vp, nlength, nblksize, nboff);
 386         if ((flags & NVEXTF_TRIVIAL) == 0) {
 387                 if (oboff < 0)
 388                         oboff = (int)(olength % oblksize);
 389                 truncboffset = olength - oboff;
 390
 391                 if (oboff) {
 392                         error = bread_kvabio(vp, truncboffset, oblksize, &bp);
 393                         if (error == 0) {
 394                                 bkvasync(bp);
 395                                 bzero(bp->b_data + oboff, oblksize - oboff);
 396                                 bp->b_bio2.bio_offset = NOOFFSET;
 397                                 if (flags & NVEXTF_BUWRITE)
 398                                         buwrite(bp);
 399                                 else
 400                                         bdwrite(bp);
 401                         } else {
 402                                 kprintf("nvextendbuf: bread EOF @ %016jx "
 403                                         "error %d\n",
 404                                         truncboffset, error);
 405                                 bp->b_flags |= B_INVAL | B_RELBUF;
 406                                 brelse(bp);
 407                         }
 408                 }
 409         }
 410         return (error);
 411 }
 412
 413 /*
 414  * Set vp->v_filesize and vp->v_object->size, destroy pages beyond
 415  * the last buffer when truncating.
 416  *
 417  * This function does not do any zeroing or invalidating of partially
 418  * overlapping pages.  Zeroing is the responsibility of nvtruncbuf().
 419  * However, it does unmap VM pages from the user address space on a
 420  * page-granular (verses buffer cache granular) basis.
 421  *
 422  * If boff is passed as -1 the base offset of the buffer cache buffer is
 423  * calculated from length and blksize.  Filesystems such as UFS which deal
 424  * with fragments have to specify a boff >= 0 since the base offset cannot
 425  * be calculated from length and blksize.
 426  *
 427  * For UFS blksize is the 'new' blocksize, used only to determine how large
 428  * the VM object must become.
 429  */
 430 void
 431 nvnode_pager_setsize(struct vnode *vp, off_t length, int blksize, int boff)
 432 {
 433         vm_pindex_t nobjsize;
 434         vm_pindex_t oobjsize;
 435         vm_pindex_t pi;
 436         vm_object_t object;
 437         vm_page_t m;
 438         off_t truncboffset;
 439
 440         /*
 441          * Degenerate conditions
 442          */
 443         if ((object = vp->v_object) == NULL)
 444                 return;
 445         vm_object_hold(object);
 446         if (length == vp->v_filesize) {
 447                 vm_object_drop(object);
 448                 return;
 449         }
 450
 451         /*
 452          * Calculate the size of the VM object, coverage includes
 453          * the buffer straddling EOF.  If EOF is buffer-aligned
 454          * we don't bother.
 455          *
 456          * Buffers do not have to be page-aligned.  Make sure
 457          * nobjsize is beyond the last page of the buffer.
 458          */
 459         if (boff < 0)
 460                 boff = (int)(length % blksize);
 461         truncboffset = length - boff;
 462         oobjsize = object->size;
 463         if (boff)
 464                 nobjsize = OFF_TO_IDX(truncboffset + blksize + PAGE_MASK);
 465         else
 466                 nobjsize = OFF_TO_IDX(truncboffset + PAGE_MASK);
 467         object->size = nobjsize;
 468
 469         if (length < vp->v_filesize) {
 470                 /*
 471                  * File has shrunk, toss any cached pages beyond
 472                  * the end of the buffer (blksize aligned) for the
 473                  * new EOF.
 474                  */
 475                 vp->v_filesize = length;
 476                 if (nobjsize < oobjsize) {
 477                         vm_object_page_remove(object, nobjsize, oobjsize,
 478                                               FALSE);
 479                 }
 480
 481                 /*
 482                  * Unmap any pages (page aligned) beyond the new EOF.
 483                  * The pages remain part of the (last) buffer and are not
 484                  * invalidated.
 485                  */
 486                 pi = OFF_TO_IDX(length + PAGE_MASK);
 487                 while (pi < nobjsize) {
 488                         m = vm_page_lookup_busy_wait(object, pi, FALSE, "vmpg");
 489                         if (m) {
 490                                 vm_page_protect(m, VM_PROT_NONE);
 491                                 vm_page_wakeup(m);
 492                         }
 493                         ++pi;
 494                         lwkt_yield();
 495                 }
 496         } else {
 497                 /*
 498                  * File has expanded.
 499                  */
 500                 vp->v_filesize = length;
 501         }
 502         vm_object_drop(object);
 503 }