From 2ec4b00d38cf6ed7cf61efde0392ff0d3b26c787 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sun, 18 May 2008 05:54:31 +0000 Subject: [PATCH] Fix a number of core kernel issues related to HAMMER operation. * The cluster code was incorrectly using the maximum IO size from the filesystem on which /dev is mounted instead of the maximum IO size of the block device. This became evident when HAMMER (with 16K blocks) tried to call cluster_read() via /dev/ad6s1h (on UFS with 8K blocks). * Change the way the VNLRU code works to avoid an infinite loop in vmntvnodescan(). The vnode LRU recycling code was cycling vnodes from the head of mp->mnt_nvnodelist to the tail. Under certain heavy load conditions this could cause a vmntvnodescan() to never finish running and eventually hit a count assertion (at 1,000,000 vnodes scanned). Instead of cycling the vnodes in the mnt_nvnodelist, use the syncer vnode (mount->mnt_syncer) as a placemarker and move *IT* within the list to represent the LRU scan. By not cycling vnodes to the end of the list, vmntvnodescan() can no longer get into an infinite loop. * Change the mount->mnt_syncer logic slightly to avoid races against a background sync while unmounting. The field is no longer cleared by the sync_reclaim() call but is instead cleared by the unmount code before vrele()ing the special vnode. 
--- sys/kern/vfs_cluster.c | 27 +++++++++++----- sys/kern/vfs_mount.c | 53 +++++++++++++++++++------------- sys/kern/vfs_subr.c | 18 ++++++++++- sys/kern/vfs_sync.c | 11 +++++-- sys/kern/vfs_syscalls.c | 14 +++++++-- sys/sys/vnode.h | 3 +- sys/vfs/msdosfs/msdosfs_vfsops.c | 7 ++--- 7 files changed, 91 insertions(+), 42 deletions(-) diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index ec9c1ad636..126b1edf85 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -34,7 +34,7 @@ * * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 * $FreeBSD: src/sys/kern/vfs_cluster.c,v 1.92.2.9 2001/11/18 07:10:59 dillon Exp $ - * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.35 2008/05/18 01:35:40 dillon Exp $ + * $DragonFly: src/sys/kern/vfs_cluster.c,v 1.36 2008/05/18 05:54:25 dillon Exp $ */ #include "opt_debug_cluster.h" @@ -106,7 +106,7 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset, * Try to limit the amount of read-ahead by a few * ad-hoc parameters. This needs work!!! */ - racluster = vp->v_mount->mnt_iosize_max / size; + racluster = vmaxiosize(vp) / size; maxra = 2 * racluster + (totread / size); if (maxra > MAXRA) maxra = MAXRA; @@ -316,10 +316,20 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, struct buf *bp, *tbp; off_t boffset; int i, j; + int maxiosize = vmaxiosize(vp); - KASSERT(size == vp->v_mount->mnt_stat.f_iosize, - ("cluster_rbuild: size %d != filesize %ld\n", - size, vp->v_mount->mnt_stat.f_iosize)); + /* + * This is a filesystem sanity check. For regular files + * cluster_write() currently uses f_iosize, make sure cluster_read() + * uses the same block size. 
+ * + * NOTE: The vp can be a block device + */ + if (vp->v_type == VREG) { + KASSERT(size == vp->v_mount->mnt_stat.f_iosize, + ("cluster_rbuild: size %d != filesize %ld\n", + size, vp->v_mount->mnt_stat.f_iosize)); + } /* * avoid a division @@ -364,7 +374,7 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, for (boffset = doffset, i = 0; i < run; ++i, boffset += size) { if (i) { if ((bp->b_xio.xio_npages * PAGE_SIZE) + - round_page(size) > vp->v_mount->mnt_iosize_max) { + round_page(size) > maxiosize) { break; } @@ -623,7 +633,7 @@ cluster_write(struct buf *bp, off_t filesize, int seqcount) if (vp->v_clen == 0 || loffset != vp->v_lastw + lblocksize || bp->b_bio2.bio_offset == NOOFFSET || (bp->b_bio2.bio_offset != vp->v_lasta + lblocksize)) { - maxclen = vp->v_mount->mnt_iosize_max; + maxclen = vmaxiosize(vp); if (vp->v_clen != 0) { /* * Next block is not sequential. @@ -756,6 +766,7 @@ cluster_wbuild(struct vnode *vp, int size, off_t start_loffset, int bytes) struct buf *bp, *tbp; int i, j; int totalwritten = 0; + int maxiosize = vmaxiosize(vp); while (bytes > 0) { crit_enter(); @@ -867,7 +878,7 @@ cluster_wbuild(struct vnode *vp, int size, off_t start_loffset, int bytes) ((bp->b_bio2.bio_offset + i) != tbp->b_bio2.bio_offset) || ((tbp->b_xio.xio_npages + bp->b_xio.xio_npages) > - (vp->v_mount->mnt_iosize_max / PAGE_SIZE))) { + (maxiosize / PAGE_SIZE))) { BUF_UNLOCK(tbp); crit_exit(); break; diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 2acd198cbf..42376874f4 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -67,7 +67,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $DragonFly: src/sys/kern/vfs_mount.c,v 1.32 2008/05/02 00:19:52 corecode Exp $ + * $DragonFly: src/sys/kern/vfs_mount.c,v 1.33 2008/05/18 05:54:25 dillon Exp $ */ /* @@ -153,18 +153,6 @@ vremovevnodemnt(struct vnode *vp) TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); } -/* - * Support function called with mntvnode_token held to move a vnode to - * the end of the list. - */ -static void -vmovevnodetoend(struct mount *mp, struct vnode *vp) -{ - vremovevnodemnt(vp); - TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); -} - - /* * Allocate a new vnode and associate it with a tag, mount point, and * operations vector. @@ -526,7 +514,25 @@ vlrureclaim(struct mount *mp, void *data) done = 0; lwkt_gettoken(&ilock, &mntvnode_token); count = mp->mnt_nvnodelistsize / 10 + 1; - while (count && (vp = TAILQ_FIRST(&mp->mnt_nvnodelist)) != NULL) { + while (count && mp->mnt_syncer) { + /* + * Next vnode. Use the special syncer vnode to placemark + * the LRU. This way the LRU code does not interfere with + * vmntvnodescan(). + */ + vp = TAILQ_NEXT(mp->mnt_syncer, v_nmntvnodes); + TAILQ_REMOVE(&mp->mnt_nvnodelist, mp->mnt_syncer, v_nmntvnodes); + if (vp) { + TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, + mp->mnt_syncer, v_nmntvnodes); + } else { + TAILQ_INSERT_HEAD(&mp->mnt_nvnodelist, mp->mnt_syncer, + v_nmntvnodes); + vp = TAILQ_NEXT(mp->mnt_syncer, v_nmntvnodes); + if (vp == NULL) + break; + } + /* * __VNODESCAN__ * @@ -537,7 +543,6 @@ vlrureclaim(struct mount *mp, void *data) if (vp->v_type == VNON || /* syncer or indeterminant */ !vmightfree(vp, trigger) /* critical path opt */ ) { - vmovevnodetoend(mp, vp); --count; continue; } @@ -549,8 +554,6 @@ vlrureclaim(struct mount *mp, void *data) * mountlist. 
*/ if (vx_get_nonblock(vp) != 0) { - if (vp->v_mount == mp) - vmovevnodetoend(mp, vp); --count; continue; } @@ -566,8 +569,6 @@ vlrureclaim(struct mount *mp, void *data) vp->v_mount != mp || !vtrytomakegoneable(vp, trigger) /* critical path opt */ ) { - if (vp->v_mount == mp) - vmovevnodetoend(mp, vp); --count; vx_put(vp); continue; @@ -581,7 +582,6 @@ vlrureclaim(struct mount *mp, void *data) * vnode to the free list if the vgone() was successful. */ KKASSERT(vp->v_mount == mp); - vmovevnodetoend(mp, vp); vgone_vxlocked(vp); vx_put(vp); ++done; @@ -864,12 +864,17 @@ insmntque(struct vnode *vp, struct mount *mp) } /* * Insert into list of vnodes for the new mount point, if available. + * The 'end' of the LRU list is the vnode prior to mp->mnt_syncer. */ if ((vp->v_mount = mp) == NULL) { lwkt_reltoken(&ilock); return; } - TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + if (mp->mnt_syncer) { + TAILQ_INSERT_BEFORE(mp->mnt_syncer, vp, v_nmntvnodes); + } else { + TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); + } mp->mnt_nvnodelistsize++; lwkt_reltoken(&ilock); } @@ -916,7 +921,11 @@ vmntvnodescan( if (--maxcount == 0) panic("maxcount reached during vmntvnodescan"); - if (vp->v_type == VNON) /* visible but not ready */ + /* + * Skip if visible but not ready, or special (e.g. + * mp->mnt_syncer) + */ + if (vp->v_type == VNON) goto next; KKASSERT(vp->v_mount == mp); diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index b4a3f1e656..30249bf52e 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -37,7 +37,7 @@ * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $ - * $DragonFly: src/sys/kern/vfs_subr.c,v 1.113 2008/05/08 01:41:05 dillon Exp $ + * $DragonFly: src/sys/kern/vfs_subr.c,v 1.114 2008/05/18 05:54:25 dillon Exp $ */ /* @@ -1246,6 +1246,22 @@ vrecycle(struct vnode *vp) return (0); } +/* + * Return the maximum I/O size allowed for strategy calls on VP. 
+ * + * If vp is VCHR or VBLK we dive into the device, otherwise we use + * the vp's mount info. + */ +int +vmaxiosize(struct vnode *vp) +{ + if (vp->v_type == VBLK || vp->v_type == VCHR) { + return(vp->v_rdev->si_iosize_max); + } else { + return(vp->v_mount->mnt_iosize_max); + } +} + /* * Eliminate all activity associated with a vnode in preparation for reuse. * diff --git a/sys/kern/vfs_sync.c b/sys/kern/vfs_sync.c index 585642ba84..cdcfac5351 100644 --- a/sys/kern/vfs_sync.c +++ b/sys/kern/vfs_sync.c @@ -37,7 +37,7 @@ * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 * $FreeBSD: src/sys/kern/vfs_subr.c,v 1.249.2.30 2003/04/04 20:35:57 tegge Exp $ - * $DragonFly: src/sys/kern/vfs_sync.c,v 1.17 2007/11/06 03:49:58 dillon Exp $ + * $DragonFly: src/sys/kern/vfs_sync.c,v 1.18 2008/05/18 05:54:25 dillon Exp $ */ /* @@ -353,6 +353,11 @@ vfs_allocate_syncvnode(struct mount *mp) next = start; } vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); + + /* + * The mnt_syncer field inherits the vnode reference, which is + * held until later decommissioning. + */ mp->mnt_syncer = vp; vx_unlock(vp); return (0); @@ -424,6 +429,8 @@ sync_inactive(struct vop_inactive_args *ap) /* * The syncer vnode is no longer needed and is being decommissioned. + * This can only occur when the last reference has been released on + * mp->mnt_syncer, so mp->mnt_syncer had better be NULL. * * Modifications to the worklist must be protected with a critical * section. 
@@ -436,7 +443,7 @@ sync_reclaim(struct vop_reclaim_args *ap) struct vnode *vp = ap->a_vp; crit_enter(); - vp->v_mount->mnt_syncer = NULL; + KKASSERT(vp->v_mount->mnt_syncer != vp); if (vp->v_flag & VONWORKLST) { LIST_REMOVE(vp, v_synclist); vp->v_flag &= ~VONWORKLST; diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 62e630116e..7d1bba0beb 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -37,7 +37,7 @@ * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 * $FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.151.2.18 2003/04/04 20:35:58 tegge Exp $ - * $DragonFly: src/sys/kern/vfs_syscalls.c,v 1.126 2008/05/09 17:52:17 dillon Exp $ + * $DragonFly: src/sys/kern/vfs_syscalls.c,v 1.127 2008/05/18 05:54:25 dillon Exp $ */ #include @@ -607,6 +607,7 @@ dounmount(struct mount *mp, int flags) { struct namecache *ncp; struct nchandle nch; + struct vnode *vp; int error; int async_flag; int lflags; @@ -688,9 +689,16 @@ dounmount(struct mount *mp, int flags) } } + /* + * Decommission our special mnt_syncer vnode. This also stops + * the vnlru code. If we are unable to unmount we recommission + * the vnode. 
+ */ if (error == 0) { - if (mp->mnt_syncer != NULL) - vrele(mp->mnt_syncer); + if ((vp = mp->mnt_syncer) != NULL) { + mp->mnt_syncer = NULL; + vrele(vp); + } if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT)) == 0) || (flags & MNT_FORCE)) { diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 3bd342c965..474044bf16 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -32,7 +32,7 @@ * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD: src/sys/sys/vnode.h,v 1.111.2.19 2002/12/29 18:19:53 dillon Exp $ - * $DragonFly: src/sys/sys/vnode.h,v 1.78 2008/04/22 18:46:52 dillon Exp $ + * $DragonFly: src/sys/sys/vnode.h,v 1.79 2008/05/18 05:54:30 dillon Exp $ */ #ifndef _SYS_VNODE_H_ @@ -489,6 +489,7 @@ int vfsync(struct vnode *vp, int waitfor, int passes, int vinitvmio(struct vnode *vp, off_t filesize); void vprint (char *label, struct vnode *vp); int vrecycle (struct vnode *vp); +int vmaxiosize (struct vnode *vp); void vn_strategy(struct vnode *vp, struct bio *bio); int vn_close (struct vnode *vp, int flags); int vn_isdisk (struct vnode *vp, int *errp); diff --git a/sys/vfs/msdosfs/msdosfs_vfsops.c b/sys/vfs/msdosfs/msdosfs_vfsops.c index d23fdc0558..ba87e152db 100644 --- a/sys/vfs/msdosfs/msdosfs_vfsops.c +++ b/sys/vfs/msdosfs/msdosfs_vfsops.c @@ -1,5 +1,5 @@ /* $FreeBSD: /usr/local/www/cvsroot/FreeBSD/src/sys/msdosfs/Attic/msdosfs_vfsops.c,v 1.60.2.8 2004/03/02 09:43:04 tjr Exp $ */ -/* $DragonFly: src/sys/vfs/msdosfs/msdosfs_vfsops.c,v 1.49 2008/01/05 14:02:41 swildner Exp $ */ +/* $DragonFly: src/sys/vfs/msdosfs/msdosfs_vfsops.c,v 1.50 2008/05/18 05:54:31 dillon Exp $ */ /* $NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $ */ /*- @@ -658,10 +658,7 @@ msdosfs_unmount(struct mount *mp, int mntflags) vp->v_flag, vp->v_sysref.refcnt, vp->v_writecount, vp->v_auxrefs); kprintf("mount %p, op %p\n", vp->v_mount, vp->v_ops); - kprintf("freef %p, freeb %p, mount %p\n", - TAILQ_NEXT(vp, v_freelist), - *vp->v_freelist.tqe_prev, - vp->v_mount); + 
kprintf("mount %p\n", vp->v_mount); kprintf("cleanblkhd %p, dirtyblkhd %p, numoutput %d, type %d\n", RB_ROOT(&vp->v_rbclean_tree), RB_ROOT(&vp->v_rbdirty_tree), -- 2.41.0