/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Implements new VFS/VM coherency functions.  For conforming VFSs
 * we treat the backing VM object slightly differently.  Instead of
 * maintaining a number of pages to exactly fit the size of the file
 * we instead maintain pages to fit the entire contents of the last
 * buffer cache buffer used by the file.
 *
 * For VFSs like NFS and HAMMER which use (generally speaking) fixed-size
 * buffers this greatly reduces the complexity of VFS/VM interactions.
 *
 * Truncations no longer invalidate pages covered by the buffer cache
 * beyond the file EOF which still fit within the file's last buffer.
 * We simply unmap them and do not allow userland to fault them in.
 *
 * The VFS is no longer responsible for zero-filling buffers during a
 * truncation; the last buffer will be automatically zero-filled by
 * nvtruncbuf().
 *
 * This code is intended to (eventually) replace vtruncbuf() and
 * vnode_pager_setsize().
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <sys/buf2.h>
#include <sys/thread2.h>
#include <sys/spinlock2.h>

static int nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_trunc(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync(struct buf *bp, void *data);
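/*
 * Worked example of the coverage policy described at the top of this
 * file.  The numbers are purely illustrative and assume 4096-byte VM
 * pages and an 8192-byte filesystem block size:
 *
 *	truncate to length = 100000 with blksize = 8192:
 *
 *	boff         = 100000 % 8192 = 1696
 *	last buffer  = loffset 98304, covering [98304, 106496)
 *	truncloffset = 100000 + (8192 - 1696) = 106496
 *
 *	Buffers at loffset >= 106496 are destroyed.  The VM object is
 *	sized to 26 pages (106496 bytes), covering the entire last buffer
 *	rather than just the 100000-byte file.  Page index 25, which lies
 *	entirely beyond EOF, is unmapped from userland but not freed, and
 *	nvtruncbuf() zero-fills bytes 100000..106495 of the last buffer.
 */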
/*
 * Truncate a file's buffer and pages to a specified length.  The
 * byte-granular length of the file is specified along with the block
 * size of the buffer containing that offset.
 *
 * If the last buffer straddles the length its contents will be zero-filled
 * as appropriate.  All buffers and pages after the last buffer will be
 * destroyed.  The last buffer itself will be destroyed only if the length
 * is exactly aligned with it.
 *
 * UFS typically passes the old block size prior to the actual truncation,
 * then later resizes the block based on the new file size.  NFS uses a
 * fixed block size and doesn't care.  HAMMER uses a block size based on
 * the offset which is fixed for any particular offset.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension.  The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped.  The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
 */
int
nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff)
{
	off_t truncloffset;
	off_t truncboffset;
	const char *filename;
	lwkt_tokref vlock;
	struct buf *bp;
	int count;
	int error;

	/*
	 * Round up to the *next* block, then destroy the buffers in question.
	 * Since we are only removing some of the buffers we must rely on the
	 * scan count to determine whether a loop is necessary.
	 *
	 * Destroy any pages beyond the last buffer.
	 */
	if (boff < 0)
		boff = (int)(length % blksize);
	if (boff)
		truncloffset = length + (blksize - boff);
	else
		truncloffset = length;

	lwkt_gettoken(&vlock, &vp->v_token);
	do {
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &truncloffset);
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &truncloffset);
	} while (count);

	nvnode_pager_setsize(vp, length, blksize, boff);

	/*
	 * Zero-fill the area beyond the file EOF that still fits within
	 * the last buffer.  We must mark the buffer as dirty even though
	 * the modified area is beyond EOF, to avoid races where the kernel
	 * might flush the buffer before the filesystem is able to
	 * reallocate the block.
	 *
	 * The VFS is responsible for dealing with the actual truncation.
	 */
	if (boff) {
		truncboffset = length - boff;
		error = bread(vp, truncboffset, blksize, &bp);
		if (error == 0) {
			bzero(bp->b_data + boff, blksize - boff);
			if (bp->b_flags & B_DELWRI) {
				if (bp->b_dirtyoff > boff)
					bp->b_dirtyoff = boff;
				if (bp->b_dirtyend > boff)
					bp->b_dirtyend = boff;
			}
			bp->b_bio2.bio_offset = NOOFFSET;
			bdwrite(bp);
		}
	} else {
		error = 0;
	}

	/*
	 * For safety, fsync any remaining metadata if the file is not being
	 * truncated to 0.  Since the metadata does not represent the entire
	 * dirty list we have to rely on the hit count to ensure that we get
	 * all of it.
	 *
	 * This is typically applicable only to UFS.  NFS and HAMMER do
	 * not store indirect blocks in the per-vnode buffer cache.
	 */
	if (length > 0) {
		do {
			count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
					nvtruncbuf_bp_metasync_cmp,
					nvtruncbuf_bp_metasync, vp);
		} while (count);
	}

	/*
	 * It is possible to have in-progress I/O from buffers that were
	 * not part of the truncation.  This should not happen if we
	 * are truncating to 0-length.
	 */
	bio_track_wait(&vp->v_track_write, 0, 0);

	/*
	 * Debugging only
	 */
	spin_lock_wr(&vp->v_spinlock);
	filename = TAILQ_FIRST(&vp->v_namecache) ?
		   TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
	spin_unlock_wr(&vp->v_spinlock);

	/*
	 * Make sure no buffers were instantiated while we were trying
	 * to clean out the remaining VM pages.  This could occur due
	 * to busy dirty VM pages being flushed out to disk.
	 */
	do {
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &truncloffset);
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &truncloffset);
		if (count) {
			kprintf("Warning: vtruncbuf(): Had to re-clean %d "
				"left over buffers in %s\n", count, filename);
		}
	} while (count);

	lwkt_reltoken(&vlock);

	return (error);
}
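#if 0
/*
 * Illustrative sketch only, not part of this file: how a hypothetical
 * fixed-block-size filesystem's setattr/truncate path might use
 * nvtruncbuf().  The myfs_* names and the 16384-byte block size are
 * assumptions made for the example.
 */
static int
myfs_truncate(struct vnode *vp, off_t nsize)
{
	int error;

	/*
	 * Destroy buffers and pages beyond nsize and zero-fill the
	 * remainder of the last buffer.  Passing boff == -1 lets
	 * nvtruncbuf() derive the offset within the last buffer from
	 * nsize itself, which is correct when the block size is fixed.
	 */
	error = nvtruncbuf(vp, nsize, 16384, -1);
	if (error)
		return (error);

	/*
	 * The VFS remains responsible for the media-level truncation
	 * (freeing blocks, updating the inode).  Hypothetical helper.
	 */
	return (myfs_free_blocks_beyond(vp, nsize));
}
#endif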
/*
 * The callback buffer is beyond the new file EOF and must be destroyed.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
static int
nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset >= *(off_t *)data)
		return(0);
	return(-1);
}

static int
nvtruncbuf_bp_trunc(struct buf *bp, void *data)
{
	/*
	 * Do not try to use a buffer we cannot immediately lock, but sleep
	 * anyway to prevent a livelock.  The code will loop until all buffers
	 * can be acted upon.
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
			BUF_UNLOCK(bp);
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
		brelse(bp);
	}
	return(1);
}

/*
 * Fsync all meta-data after truncating a file to a non-zero length.  Only
 * metadata blocks (those with a negative loffset) are scanned.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
static int
nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset < 0)
		return(0);
	return(1);
}

static int
nvtruncbuf_bp_metasync(struct buf *bp, void *data)
{
	struct vnode *vp = data;

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not try to use a buffer we cannot immediately lock,
		 * but sleep anyway to prevent a livelock.  The code will
		 * loop until all buffers can be acted upon.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
				BUF_UNLOCK(bp);
		} else {
			bremfree(bp);
			if (bp->b_vp == vp)
				bawrite(bp);
			else
				bwrite(bp);
		}
		return(1);
	} else {
		return(0);
	}
}
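/*
 * Worked example of the loffset convention the metasync scan relies on
 * (the values are illustrative).  A UFS-style filesystem keeps file data
 * at b_loffset >= 0 and indirect (meta-data) blocks at negative
 * b_loffset values, e.g.:
 *
 *	b_loffset = 65536	file data	cmp returns 1, not visited
 *	b_loffset = -8192	indirect block	cmp returns 0, visited
 *
 * Each visited buffer that is dirty (B_DELWRI) is written out, and the
 * caller repeats the scan until a pass reports no hits (count == 0).
 */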
/*
 * Extend a file's buffer and pages to a new, larger size.  The block size
 * at both the old and new lengths must be passed, but buffer cache
 * operations will only be performed on the old block.  The new
 * nlength/nblksize will be used to properly set the VM object size.
 *
 * To make this explicit we require the old length to be passed even though
 * we can acquire it from vp->v_filesize, which also avoids potential
 * corruption if the filesystem and vp get desynchronized somehow.
 *
 * If the caller intends to immediately write into the newly extended
 * space pass trivial == 1.  If trivial is 0 the original buffer will be
 * zero-filled as necessary to clean out any junk in the extended space.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension.  The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped.  The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
 */
int
nvextendbuf(struct vnode *vp, off_t olength, off_t nlength,
	    int oblksize, int nblksize, int oboff, int nboff, int trivial)
{
	off_t truncboffset;
	struct buf *bp;
	int error;

	error = 0;
	nvnode_pager_setsize(vp, nlength, nblksize, nboff);
	if (trivial == 0) {
		if (oboff < 0)
			oboff = (int)(olength % oblksize);
		truncboffset = olength - oboff;

		if (oboff) {
			error = bread(vp, truncboffset, oblksize, &bp);
			if (error == 0) {
				bzero(bp->b_data + oboff, oblksize - oboff);
				bp->b_bio2.bio_offset = NOOFFSET;
				bdwrite(bp);
			}
		}
	}
	return (error);
}

/*
 * Set vp->v_filesize and vp->v_object->size, destroy pages beyond
 * the last buffer when truncating.
 *
 * This function does not do any zeroing or invalidating of partially
 * overlapping pages.  Zeroing is the responsibility of nvtruncbuf().
 * However, it does unmap VM pages from the user address space on a
 * page-granular (versus buffer-cache-granular) basis.
 *
 * If boff is passed as -1 the base offset of the buffer cache buffer is
 * calculated from length and blksize.  Filesystems such as UFS which deal
 * with fragments have to specify a boff >= 0 since the base offset cannot
 * be calculated from length and blksize.
 *
 * For UFS blksize is the 'new' blocksize, used only to determine how large
 * the VM object must become.
 */
void
nvnode_pager_setsize(struct vnode *vp, off_t length, int blksize, int boff)
{
	vm_pindex_t nobjsize;
	vm_pindex_t oobjsize;
	vm_pindex_t pi;
	vm_object_t object;
	vm_page_t m;
	off_t truncboffset;

	/*
	 * Degenerate conditions
	 */
	if ((object = vp->v_object) == NULL)
		return;
	if (length == vp->v_filesize)
		return;

	/*
	 * Calculate the size of the VM object, coverage includes
	 * the buffer straddling EOF.  If EOF is buffer-aligned
	 * we don't bother.
	 *
	 * Buffers do not have to be page-aligned.  Make sure
	 * nobjsize is beyond the last page of the buffer.
	 */
	if (boff < 0)
		boff = (int)(length % blksize);
	truncboffset = length - boff;
	oobjsize = object->size;
	if (boff)
		nobjsize = OFF_TO_IDX(truncboffset + blksize + PAGE_MASK);
	else
		nobjsize = OFF_TO_IDX(truncboffset + PAGE_MASK);
	object->size = nobjsize;

	if (length < vp->v_filesize) {
		/*
		 * File has shrunk, toss any cached pages beyond
		 * the end of the buffer (blksize aligned) for the
		 * new EOF.
		 */
		vp->v_filesize = length;
		if (nobjsize < oobjsize) {
			vm_object_page_remove(object, nobjsize, oobjsize,
					      FALSE);
		}

		/*
		 * Unmap any pages (page aligned) beyond the new EOF.
		 * The pages remain part of the (last) buffer and are not
		 * invalidated.
		 */
		pi = OFF_TO_IDX(length + PAGE_MASK);
		while (pi < nobjsize) {
			do {
				m = vm_page_lookup(object, pi);
			} while (m && vm_page_sleep_busy(m, TRUE, "vsetsz"));
			if (m) {
				vm_page_busy(m);
				vm_page_protect(m, VM_PROT_NONE);
				vm_page_wakeup(m);
			}
			++pi;
		}
	} else {
		/*
		 * File has expanded.
		 */
		vp->v_filesize = length;
	}
}
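#if 0
/*
 * Illustrative sketch only, not part of this file: how a hypothetical
 * fixed-block-size filesystem's write path might grow a file with
 * nvextendbuf().  The myfs_* name, the caller-tracked osize, and the
 * 16384-byte block size are assumptions made for the example.
 */
static int
myfs_extend(struct vnode *vp, off_t osize, off_t nsize)
{
	/*
	 * trivial == 0 because the caller does not promise to
	 * immediately overwrite the new space, so nvextendbuf()
	 * zero-fills the tail of the old last buffer.  Passing -1 for
	 * both boff arguments lets the buffer offsets be calculated
	 * from the lengths, which is valid for fixed block sizes.
	 */
	return (nvextendbuf(vp, osize, nsize, 16384, 16384, -1, -1, 0));
}
#endif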