From a9d06cca8c6894668c0d26cfa6846fb58975a647 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Thu, 9 Feb 2017 16:51:44 -0800
Subject: [PATCH] kernel - Fix vmspace termination race (2)

* Fix a race and fix dangling cached mount points.  At the point where
  dounmount() checks to see if there are any dangling cache refs left,
  it has already dropped its own ref on the mp.  The test, however, was
  (refs > 1).  The test needs to be (refs > 0).

  This race was probably causing the corruption, and in fact it's the
  smoking gun, because the mp->mnt_refs field is at the same offset as
  the pmap->copyin field, and the pmap->copyin field was getting
  corrupted by being decremented by 1.

* Fix a race where the unmount code was issuing the dounmount() call
  without any ref or hold on the mp.  This race was unlikely (requires
  two unmounts of the same partition at the same time).
---
 sys/kern/vfs_mount.c    | 19 +++++++++++----
 sys/kern/vfs_syscalls.c | 52 ++++++++++++++++++++++++++++++++++-------
 2 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index 1004f5dcf7..424c8bbc77 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -253,11 +253,15 @@ vfs_busy(struct mount *mp, int flags)
 		}
 		/* XXX not MP safe */
 		mp->mnt_kern_flag |= MNTK_MWAIT;
+
 		/*
 		 * Since all busy locks are shared except the exclusive
 		 * lock granted when unmounting, the only place that a
 		 * wakeup needs to be done is at the release of the
 		 * exclusive lock at the end of dounmount.
+		 *
+		 * WARNING! mp can potentially go away once we release
+		 *	    our ref.
 		 */
 		tsleep((caddr_t)mp, 0, "vfs_busy", 0);
 		lwkt_reltoken(&mp->mnt_token);
@@ -274,14 +278,19 @@ vfs_busy(struct mount *mp, int flags)
 /*
  * Free a busy filesystem.
  *
- * Decrement refs before releasing the lock so e.g. a pending umount
- * doesn't give us an unexpected busy error.
+ * Once refs is decremented the mount point can potentially get ripped
+ * out from under us, but we want to clean up our refs before unlocking
+ * so do a hold/drop around the whole mess.
+ *
+ * This is not in the critical path (I hope).
  */
 void
 vfs_unbusy(struct mount *mp)
 {
+	mount_hold(mp);
 	atomic_add_int(&mp->mnt_refs, -1);
 	lockmgr(&mp->mnt_lock, LK_RELEASE);
+	mount_drop(mp);
 }
 
 /*
@@ -343,7 +352,7 @@ mount_init(struct mount *mp)
 	TAILQ_INIT(&mp->mnt_jlist);
 	mp->mnt_nvnodelistsize = 0;
 	mp->mnt_flag = 0;
-	mp->mnt_hold = 1;
+	mp->mnt_hold = 1;		/* hold for umount last drop */
 	mp->mnt_iosize_max = MAXPHYS;
 	vn_syncer_thr_create(mp);
 }
@@ -357,8 +366,10 @@ mount_hold(struct mount *mp)
 void
 mount_drop(struct mount *mp)
 {
-	if (atomic_fetchadd_int(&mp->mnt_hold, -1) == 1)
+	if (atomic_fetchadd_int(&mp->mnt_hold, -1) == 1) {
+		KKASSERT(mp->mnt_refs == 0);
 		kfree(mp, M_MOUNT);
+	}
 }
 
 /*
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 1052d3737b..c5c50bd607 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -409,7 +409,9 @@ update:
 	if (!error) {
 		if (mp->mnt_ncmountpt.ncp == NULL) {
 			/*
-			 * allocate, then unlock, but leave the ref intact
+			 * Allocate, then unlock, but leave the ref intact.
+			 * This is the mnt_refs (1) that we will retain
+			 * through to the unmount.
 			 */
 			cache_allocroot(&mp->mnt_ncmountpt, mp, NULL);
 			cache_unlock(&mp->mnt_ncmountpt);
@@ -632,16 +634,27 @@ sys_unmount(struct unmount_args *uap)
 		goto out;
 	}
 
+	/*
+	 * If no error try to issue the unmount.  We lose our cache
+	 * ref when we call nlookup_done so we must hold the mount point
+	 * to prevent use-after-free races.
+	 */
 out:
-	nlookup_done(&nd);
-	if (error == 0)
+	if (error == 0) {
+		mount_hold(mp);
+		nlookup_done(&nd);
 		error = dounmount(mp, uap->flags);
+		mount_drop(mp);
+	} else {
+		nlookup_done(&nd);
+	}
 done:
 	return (error);
 }
 
 /*
- * Do the actual file system unmount.
+ * Do the actual file system unmount (interlocked against the mountlist
+ * token and mp->mnt_token).
  */
 static int
 dounmount_interlock(struct mount *mp)
@@ -667,6 +680,11 @@ unmount_allproc_cb(struct proc *p, void *arg)
 	return 0;
 }
 
+/*
+ * The guts of the unmount code.  The mount owns one ref and one hold
+ * count.  If we successfully interlock the unmount, those refs are ours.
+ * (The ref is from mnt_ncmountpt).
+ */
 int
 dounmount(struct mount *mp, int flags)
 {
@@ -680,13 +698,16 @@ dounmount(struct mount *mp, int flags)
 	int retry;
 
 	lwkt_gettoken(&mp->mnt_token);
+
 	/*
-	 * Exclusive access for unmounting purposes
+	 * Exclusive access for unmounting purposes.
 	 */
 	if ((error = mountlist_interlock(dounmount_interlock, mp)) != 0)
 		goto out;
 
 	/*
+	 * We now 'own' the last mp->mnt_refs
+	 *
 	 * Allow filesystems to detect that a forced unmount is in progress.
 	 */
 	if (flags & MNT_FORCE)
@@ -713,6 +734,8 @@ dounmount(struct mount *mp, int flags)
 	 * If this filesystem isn't aliasing other filesystems,
 	 * try to invalidate any remaining namecache entries and
 	 * check the count afterwords.
+	 *
+	 * We own the last mnt_refs by owning mnt_ncmountpt.
 	 */
 	if ((mp->mnt_kern_flag & MNTK_NCALIASED) == 0) {
 		cache_lock(&mp->mnt_ncmountpt);
@@ -835,6 +858,8 @@ dounmount(struct mount *mp, int flags)
 	/*
 	 * Remove any installed vnode ops here so the individual VFSs don't
 	 * have to.
+	 *
+	 * mnt_refs should go to zero when we scrap mnt_ncmountpt.
 	 */
 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_coherency_ops);
 	vfs_rm_vnodeops(mp, NULL, &mp->mnt_vn_journal_ops);
@@ -859,6 +884,10 @@ dounmount(struct mount *mp, int flags)
 	mp->mnt_vfc->vfc_refcount--;
 	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
 		panic("unmount: dangling vnode");
+
+	/*
+	 * Release the lock
+	 */
 	lockmgr(&mp->mnt_lock, LK_RELEASE);
 	if (mp->mnt_kern_flag & MNTK_MWAIT) {
 		mp->mnt_kern_flag &= ~MNTK_MWAIT;
@@ -867,13 +896,18 @@ dounmount(struct mount *mp, int flags)
 
 	/*
 	 * If we reach here and freeok != 0 we must free the mount.
-	 * If refs > 1 cycle and wait, just in case someone tried
-	 * to busy the mount after we decided to do the unmount.
+	 * mnt_refs should already have dropped to 0, so if it is not
+	 * zero we must cycle the caches and wait.
+	 *
+	 * When we are satisfied that the mount has disconnected we can
+	 * drop the hold on the mp that represented the mount (though the
+	 * caller might actually have another, so the caller's drop may
+	 * do the actual free).
 	 */
 	if (freeok) {
-		if (mp->mnt_refs > 1)
+		if (mp->mnt_refs > 0)
 			cache_clearmntcache();
-		while (mp->mnt_refs > 1) {
+		while (mp->mnt_refs > 0) {
 			cache_unmounting(mp);
 			wakeup(mp);
 			tsleep(&mp->mnt_refs, 0, "umntrwait", hz / 10 + 1);
-- 
2.41.0
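
Below is a minimal, standalone userland sketch of the hold/drop bracketing
pattern the patch applies in vfs_unbusy() and sys_unmount().  It is not part
of the patch: the struct and function names only mimic the kernel interfaces,
and C11 stdatomic stands in for the kernel atomic_*() routines.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the two kernel reference counts involved in the race. */
struct mount {
	atomic_int mnt_refs;	/* cache/busy references */
	atomic_int mnt_hold;	/* existence references; free on last drop */
};

static void
mount_hold(struct mount *mp)
{
	atomic_fetch_add(&mp->mnt_hold, 1);
}

static void
mount_drop(struct mount *mp)
{
	/* The last drop frees the structure, as in mount_drop() above. */
	if (atomic_fetch_sub(&mp->mnt_hold, 1) == 1)
		free(mp);
}

static void
unbusy(struct mount *mp)
{
	/*
	 * Bracket the ref decrement with a hold/drop pair so mp cannot be
	 * freed out from under us by a concurrent unmount between the
	 * decrement and any later use of mp in this function.
	 */
	mount_hold(mp);
	atomic_fetch_sub(&mp->mnt_refs, 1);
	/* ... anything that still dereferences mp would go here ... */
	mount_drop(mp);
}

int
main(void)
{
	struct mount *mp = calloc(1, sizeof(*mp));

	atomic_init(&mp->mnt_refs, 1);
	atomic_init(&mp->mnt_hold, 1);	/* initial hold, dropped at "unmount" */

	unbusy(mp);
	printf("refs now %d\n", atomic_load(&mp->mnt_refs));
	mount_drop(mp);			/* final drop frees mp */
	return 0;
}

The same two-counter split (refs for active users, hold for existence) is
what lets dounmount() wait for mnt_refs to reach 0 before the final drop.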