From: Matthew Dillon Date: Sat, 13 Feb 2010 03:15:21 +0000 (-0800) Subject: kernel - TMPFS - Bug fixing pass X-Git-Tag: v2.7.1~172 X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/9fc94b5f0ece4794f4f859f149d8f61883bccf70 kernel - TMPFS - Bug fixing pass * TMPFS_ASSERT_ELOCKED() is called in numerous places where tn_vnode is not necessarily assigned, for example during tmpfs_nremove() after the directory entry has been cleaned out. Remove the assertion that tn_vnode != NULL. * Add tmpfs_mount->tm_flags and TMPFS_FLAG_UNMOUNTING, used during unmounting to tell tmpfs_fsync() to throw away the contents of the file (normally it ignores it). This fixes a panic on umount when the main kernel checks that all dirty buffers have been cleaned out. * Fix two places where the wrong length for a string is being kmalloc()'d. The softlink and the directory entry string allocations were wrong and resulted in a string terminator being stuffed beyond the end of the malloced buffer. * Do a safety-NULL-out of a few fields after kfree()ing them. * Refactor tmpfs_dir_lookup() a little. * Enhance tmpfs_reg_resize() to also resize the SWAP VM object. Failing to do so can leave extraneous swap assignments for deleted areas of the file which become visible again (instead of becoming zero-fill) if the file is then later ftruncate()d larger. Also fix the block size parameters to nvtruncbuf() and nvextendbuf(). It must match the block size used for the buffer cache. * Temporarily turn off all the MPSAFE flags. Run under the BGL. * The buffer offset (offset) in tmpfs_read() and tmpfs_write() can be a size_t. It does not have to be off_t. * tmpfs_write() was using getblk(). It actually has to use bread() in order to ensure that the buffer contents are valid when potentially doing a piecemeal write which does not cover the whole buffer. 
* Refactor tmpfs_write() to leave the underlying VM pages dirty, except in cases the system page daemon wants to flush pages to clear space in ram (IO_SYNC, IO_ASYNC). Use buwrite() to do this. * Fix an error path in tmpfs_strategy() which was not biodone()ing the bio. * tmpfs_remove() was making assumptions with regards to v->a_nch.ncp->nc_vp which were not correct. The vp is not referenced and can get ripped out from under the caller unless properly handled. * Fix sequencing in tmpfs_inactive(). If tn_links is 0 and the node is not in the middle of being allocated we can destroy it. * Remove unnecessary vnode locks from tmpfs_reclaim(). There are also other vnode locks scattered around that aren't needed (for another time). * Implement vop_bmap(), it is basically a dummy. --- diff --git a/sys/vfs/tmpfs/tmpfs.h b/sys/vfs/tmpfs/tmpfs.h index 73be1431e6..22b3e5730d 100644 --- a/sys/vfs/tmpfs/tmpfs.h +++ b/sys/vfs/tmpfs/tmpfs.h @@ -322,7 +322,6 @@ LIST_HEAD(tmpfs_node_list, tmpfs_node); } while (0) #define TMPFS_ASSERT_ELOCKED(node) do { \ KKASSERT((node) != NULL); \ - KKASSERT((node)->tn_vnode != NULL); \ KKASSERT(lockstatus(TMPFS_NODE_MTX(node), curthread) == LK_EXCLUSIVE); \ } while (0) #else @@ -391,10 +390,15 @@ struct tmpfs_mount { * tmpfs_pool.c. 
*/ struct objcache *tm_dirent_pool; struct objcache *tm_node_pool; + + int tm_flags; }; + #define TMPFS_LOCK(tm) lockmgr(&(tm)->allnode_lock, LK_EXCLUSIVE|LK_RETRY) #define TMPFS_UNLOCK(tm) lockmgr(&(tm)->allnode_lock, LK_RELEASE) +#define TMPFS_FLAG_UNMOUNTING 0x0001 + /* --------------------------------------------------------------------- */ /* diff --git a/sys/vfs/tmpfs/tmpfs_subr.c b/sys/vfs/tmpfs/tmpfs_subr.c index 371c103274..fa4968a7e0 100644 --- a/sys/vfs/tmpfs/tmpfs_subr.c +++ b/sys/vfs/tmpfs/tmpfs_subr.c @@ -151,8 +151,8 @@ tmpfs_alloc_node(struct tmpfs_mount *tmp, enum vtype type, case VLNK: KKASSERT((strlen(target) +1) < MAXPATHLEN); - nnode->tn_size = strlen(target) +1; - nnode->tn_link = kmalloc(nnode->tn_size, M_TMPFSNAME, + nnode->tn_size = strlen(target); + nnode->tn_link = kmalloc(nnode->tn_size + 1, M_TMPFSNAME, M_WAITOK); bcopy(target, nnode->tn_link, nnode->tn_size); nnode->tn_link[nnode->tn_size] = '\0'; @@ -231,6 +231,7 @@ tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) case VDIR: node->tn_links--; node->tn_size = 0; + KKASSERT(node->tn_dir.tn_parent != NULL); TMPFS_NODE_LOCK(node->tn_dir.tn_parent); node->tn_dir.tn_parent->tn_links--; TMPFS_NODE_UNLOCK(node->tn_dir.tn_parent); @@ -242,6 +243,7 @@ tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) case VLNK: kfree(node->tn_link, M_TMPFSNAME); + node->tn_link = NULL; break; case VREG: @@ -282,7 +284,7 @@ tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, nde = (struct tmpfs_dirent *)objcache_get(tmp->tm_dirent_pool, M_WAITOK); - nde->td_name = kmalloc(len, M_TMPFSNAME, M_WAITOK); + nde->td_name = kmalloc(len + 1, M_TMPFSNAME, M_WAITOK); nde->td_namelen = len; bcopy(name, nde->td_name, len); nde->td_name[len] = '\0'; @@ -326,6 +328,7 @@ tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de, } kfree(de->td_name, M_TMPFSNAME); + de->td_name = NULL; objcache_put(tmp->tm_dirent_pool, de); } @@ -620,21 +623,17 @@ struct tmpfs_dirent * 
tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, struct namecache *ncp) { - void *found; struct tmpfs_dirent *de; int len = ncp->nc_nlen; TMPFS_VALIDATE_DIR(node); - found = 0; TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) { if (f != NULL && de->td_node != f) continue; if (len == de->td_namelen) { - if (!memcmp(ncp->nc_name, de->td_name, len)) { - found = node; + if (!memcmp(ncp->nc_name, de->td_name, len)) break; - } } } @@ -642,7 +641,7 @@ tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, node->tn_status |= TMPFS_NODE_ACCESSED; TMPFS_NODE_UNLOCK(node); - return found ? de : NULL; + return de; } /* --------------------------------------------------------------------- */ @@ -895,7 +894,6 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize, int trivial) struct tmpfs_mount *tmp; struct tmpfs_node *node; off_t oldsize; - int biosize = vp->v_mount->mnt_stat.f_iosize; #ifdef INVARIANTS KKASSERT(vp->v_type == VREG); #endif @@ -929,11 +927,34 @@ tmpfs_reg_resize(struct vnode *vp, off_t newsize, int trivial) node->tn_size = newsize; TMPFS_NODE_UNLOCK(node); - if (newsize < oldsize) - error = nvtruncbuf(vp, newsize, biosize, -1); - else - error = nvextendbuf(vp, oldsize, newsize, biosize, biosize, + /* + * When adjusting the vnode filesize and its VM object we must + * also adjust our backing VM object (aobj). The blocksize + * used must match the block size we use for the buffer cache. 
+ */ + if (newsize < oldsize) { + vm_pindex_t osize; + vm_object_t aobj; + + error = nvtruncbuf(vp, newsize, BSIZE, -1); + aobj = node->tn_reg.tn_aobj; + if (aobj) { + osize = aobj->size; + if (osize > vp->v_object->size) { + aobj->size = osize; + vm_object_page_remove(aobj, vp->v_object->size, + osize, FALSE); + } + } + } else { + vm_object_t aobj; + + error = nvextendbuf(vp, oldsize, newsize, BSIZE, BSIZE, -1, -1, trivial); + aobj = node->tn_reg.tn_aobj; + if (aobj) + aobj->size = vp->v_object->size; + } out: return error; diff --git a/sys/vfs/tmpfs/tmpfs_vfsops.c b/sys/vfs/tmpfs/tmpfs_vfsops.c index 3dde5a908d..f3d3d031bf 100644 --- a/sys/vfs/tmpfs/tmpfs_vfsops.c +++ b/sys/vfs/tmpfs/tmpfs_vfsops.c @@ -273,8 +273,10 @@ tmpfs_mount(struct mount *mp, char *path, caddr_t data, struct ucred *cred) tmp->tm_root = root; mp->mnt_flag |= MNT_LOCAL; +#if 0 mp->mnt_kern_flag |= MNTK_RD_MPSAFE | MNTK_WR_MPSAFE | MNTK_GA_MPSAFE | MNTK_IN_MPSAFE | MNTK_SG_MPSAFE; +#endif mp->mnt_data = (qaddr_t)tmp; vfs_getnewfsid(mp); @@ -310,13 +312,15 @@ tmpfs_unmount(struct mount *mp, int mntflags) if (mntflags & MNT_FORCE) flags |= FORCECLOSE; + /* Tell vflush->vinvalbuf->fsync to throw away data */ + tmp = VFS_TO_TMPFS(mp); + tmp->tm_flags |= TMPFS_FLAG_UNMOUNTING; + /* Finalize all pending I/O. */ error = vflush(mp, 0, flags); if (error != 0) return error; - tmp = VFS_TO_TMPFS(mp); - /* Free all associated data. The loop iterates over the linked list * we have containing all used nodes. For each of them that is * a directory, we free all its directory entries. Note that after diff --git a/sys/vfs/tmpfs/tmpfs_vnops.c b/sys/vfs/tmpfs/tmpfs_vnops.c index 40754e4373..a78fee5bd6 100644 --- a/sys/vfs/tmpfs/tmpfs_vnops.c +++ b/sys/vfs/tmpfs/tmpfs_vnops.c @@ -450,14 +450,24 @@ tmpfs_setattr(struct vop_setattr_args *v) /* --------------------------------------------------------------------- */ +/* + * fsync is basically a NOP. 
However, when unmounting we have + * to clean out the buffer cache to prevent a dirty buffers kernel + * assertion. We can do this simply by truncating the file to 0. + */ static int tmpfs_fsync(struct vop_fsync_args *v) { + struct tmpfs_mount *tmp; struct vnode *vp = v->a_vp; - tmpfs_update(vp); - - + tmp = VFS_TO_TMPFS(vp->v_mount); + if (tmp->tm_flags & TMPFS_FLAG_UNMOUNTING) { + if (vp->v_type == VREG) + tmpfs_truncate(vp, 0); + } else { + tmpfs_update(vp); + } return 0; } @@ -470,11 +480,11 @@ tmpfs_read (struct vop_read_args *ap) struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct tmpfs_node *node; - int error; - off_t offset; off_t base_offset; + size_t offset; size_t len; int got_mplock; + int error; error = 0; if (uio->uio_resid == 0) { @@ -503,7 +513,7 @@ tmpfs_read (struct vop_read_args *ap) /* * Use buffer cache I/O (via tmpfs_strategy) */ - offset = (off_t)uio->uio_offset & BMASK; + offset = (size_t)uio->uio_offset & BMASK; base_offset = (off_t)uio->uio_offset - offset; bp = getcacheblk(vp, base_offset); if (bp == NULL) @@ -566,8 +576,8 @@ tmpfs_write (struct vop_write_args *ap) boolean_t extended; off_t oldsize; int error; - off_t offset; off_t base_offset; + size_t offset; size_t len; struct rlimit limit; int got_mplock; @@ -608,7 +618,7 @@ tmpfs_write (struct vop_write_args *ap) /* * Extend the file's size if necessary */ - extended = (uio->uio_offset + uio->uio_resid) > node->tn_size; + extended = ((uio->uio_offset + uio->uio_resid) > node->tn_size); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); #ifdef SMP @@ -626,22 +636,29 @@ tmpfs_write (struct vop_write_args *ap) /* * Use buffer cache I/O (via tmpfs_strategy) */ - offset = (off_t)uio->uio_offset & BMASK; + offset = (size_t)uio->uio_offset & BMASK; base_offset = (off_t)uio->uio_offset - offset; len = BSIZE - offset; if (len > uio->uio_resid) len = uio->uio_resid; if ((uio->uio_offset + len) > node->tn_size) { - trivial = uio->uio_offset <= node->tn_size; + trivial = (uio->uio_offset <= 
node->tn_size); error = tmpfs_reg_resize(vp, uio->uio_offset + len, trivial); if (error) break; } - bp = getblk(vp, base_offset, BSIZE, GETBLK_BHEAVY, 0); - vfs_bio_clrbuf(bp); - + /* + * Read to fill in any gaps. Theoretically we could + * optimize this if the write covers the entire buffer + * and is not a UIO_NOCOPY write, however this can lead + * to a security violation exposing random kernel memory + * (whatever junk was in the backing VM pages before). + * + * So just use bread() to do the right thing. + */ + error = bread(vp, base_offset, BSIZE, &bp); error = uiomove((char *)bp->b_data + offset, len, uio); if (error) { kprintf("tmpfs_write uiomove error %d\n", error); @@ -653,14 +670,25 @@ tmpfs_write (struct vop_write_args *ap) node->tn_size = uio->uio_offset; /* - * The data has been loaded into the buffer, write it out. (via tmpfs_strategy) + * The data has been loaded into the buffer, write it out. * - * call bdwrite() because we don't care about storage io flag (ap->a_ioflag) for a swap I/O - * maybe bawrite() for IO_DIRECT, bwrite() for IO_SYNC + * We want tmpfs to be able to use all available ram, not + * just the buffer cache, so if not explicitly paging we + * use buwrite() to leave the buffer clean but mark all the + * VM pages valid+dirty. * - * XXX: need to implement tmpfs_bmap() for a dirty bit handling of bdwrite() + * Be careful, the kernel uses IO_SYNC/IO_ASYNC when it + * really really wants to flush the buffer, typically as + * part of the paging system, and we'll loop forever if + * we fake those. 
*/ - bdwrite(bp); + if (ap->a_ioflag & IO_SYNC) + bwrite(bp); + else if (ap->a_ioflag & IO_ASYNC) + bawrite(bp); + else + buwrite(bp); + if (bp->b_error) { kprintf("tmpfs_write bwrite error %d\n", error); break; @@ -708,18 +736,26 @@ static int tmpfs_strategy(struct vop_strategy_args *ap) { struct bio *bio = ap->a_bio; + struct buf *bp = bio->bio_buf; struct vnode *vp = ap->a_vp; struct tmpfs_node *node; vm_object_t uobj; - if (vp->v_type != VREG) - return EINVAL; + if (vp->v_type != VREG) { + bp->b_resid = bp->b_bcount; + bp->b_flags |= B_ERROR | B_INVAL; + bp->b_error = EINVAL; + biodone(bio); + return(0); + } node = VP_TO_TMPFS_NODE(vp); uobj = node->tn_reg.tn_aobj; + /* - * call swap_pager_strategy to store vm object into swap device + * Call swap_pager_strategy to read or write between the VM + * object and the buffer cache. */ swap_pager_strategy(uobj, bio); @@ -738,6 +774,7 @@ tmpfs_bmap(struct vop_bmap_args *ap) return 0; } + /* --------------------------------------------------------------------- */ static int @@ -745,15 +782,22 @@ tmpfs_nremove(struct vop_nremove_args *v) { struct vnode *dvp = v->a_dvp; struct namecache *ncp = v->a_nch->ncp; - struct vnode *vp = ncp->nc_vp; + struct vnode *vp; int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; + /* + * We have to acquire the vp from v->a_nch because + * we will likely unresolve the namecache entry, and + * a vrele is needed to trigger the tmpfs_inactive/tmpfs_reclaim + * sequence to recover space from the file. 
+ */ + error = cache_vref(v->a_nch, v->a_cred, &vp); + KKASSERT(error == 0); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_type == VDIR) { error = EISDIR; @@ -794,13 +838,12 @@ tmpfs_nremove(struct vop_nremove_args *v) cache_setunresolved(v->a_nch); cache_setvp(v->a_nch, NULL); - cache_inval_vp(vp, CINV_DESTROY); + /*cache_inval_vp(vp, CINV_DESTROY);*/ error = 0; - out: - vn_unlock(vp); vn_unlock(dvp); + vrele(vp); return error; } @@ -984,7 +1027,7 @@ tmpfs_nrename(struct vop_nrename_args *v) * has to be changed. */ if (fncp->nc_nlen != tncp->nc_nlen || bcmp(fncp->nc_name, tncp->nc_name, fncp->nc_nlen) != 0) { - newname = kmalloc(tncp->nc_nlen, M_TMPFSNAME, M_WAITOK); + newname = kmalloc(tncp->nc_nlen + 1, M_TMPFSNAME, M_WAITOK); } else newname = NULL; @@ -1096,8 +1139,7 @@ tmpfs_nrename(struct vop_nrename_args *v) * node referred by it will not be removed until the vnode is * really reclaimed. */ tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), de, TRUE); - - cache_inval_vp(tvp, CINV_DESTROY); + /*cache_inval_vp(tvp, CINV_DESTROY);*/ } cache_rename(v->a_fnch, v->a_tnch); @@ -1154,7 +1196,7 @@ tmpfs_nrmdir(struct vop_nrmdir_args *v) { struct vnode *dvp = v->a_dvp; struct namecache *ncp = v->a_nch->ncp; - struct vnode *vp = ncp->nc_vp; + struct vnode *vp; int error; struct tmpfs_dirent *de; @@ -1162,8 +1204,15 @@ tmpfs_nrmdir(struct vop_nrmdir_args *v) struct tmpfs_node *dnode; struct tmpfs_node *node; + /* + * We have to acquire the vp from v->a_nch because + * we will likely unresolve the namecache entry, and + * a vrele is needed to trigger the tmpfs_inactive/tmpfs_reclaim + * sequence. 
+ */ + error = cache_vref(v->a_nch, v->a_cred, &vp); + KKASSERT(error == 0); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); tmp = VFS_TO_TMPFS(dvp->v_mount); dnode = VP_TO_TMPFS_DIR(dvp); @@ -1237,12 +1286,12 @@ tmpfs_nrmdir(struct vop_nrmdir_args *v) cache_setunresolved(v->a_nch); cache_setvp(v->a_nch, NULL); - cache_inval_vp(vp, CINV_DESTROY); + /*cache_inval_vp(vp, CINV_DESTROY);*/ error = 0; out: - vn_unlock(vp); vn_unlock(dvp); + vrele(vp); return error; } @@ -1400,14 +1449,19 @@ tmpfs_inactive(struct vop_inactive_args *v) node = VP_TO_TMPFS_NODE(vp); + /* + * Get rid of unreferenced deleted vnodes sooner rather than + * later so the data memory can be recovered immediately. + */ TMPFS_NODE_LOCK(node); if (node->tn_links == 0 && - (node->tn_vpstate & TMPFS_VNODE_DOOMED)) { + (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0) { + node->tn_vpstate = TMPFS_VNODE_DOOMED; TMPFS_NODE_UNLOCK(node); vrecycle(vp); - } - else + } else { TMPFS_NODE_UNLOCK(node); + } return 0; } @@ -1418,14 +1472,12 @@ int tmpfs_reclaim(struct vop_reclaim_args *v) { struct vnode *vp = v->a_vp; - struct tmpfs_mount *tmp; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); tmp = VFS_TO_TMPFS(vp->v_mount); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); tmpfs_free_vp(vp); /* If the node referenced by this vnode was deleted by the user, @@ -1436,12 +1488,11 @@ tmpfs_reclaim(struct vop_reclaim_args *v) (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0) { node->tn_vpstate = TMPFS_VNODE_DOOMED; TMPFS_NODE_UNLOCK(node); + kprintf("reclaim\n"); tmpfs_free_node(tmp, node); - } - else + } else { TMPFS_NODE_UNLOCK(node); - - vn_unlock(vp); + } KKASSERT(vp->v_data == NULL); return 0; @@ -1557,7 +1608,7 @@ struct vop_ops tmpfs_vnode_vops = { .vop_reclaim = tmpfs_reclaim, .vop_print = tmpfs_print, .vop_pathconf = tmpfs_pathconf, -// .vop_bmap = tmpfs_bmap, + .vop_bmap = tmpfs_bmap, .vop_bmap = (void *)vop_eopnotsupp, .vop_strategy = tmpfs_strategy, .vop_advlock = tmpfs_advlock,