From e654922c869be2609aefdb087a028c991681b1fa Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Thu, 11 Feb 2010 12:38:10 -0800
Subject: [PATCH] kernel - sysref - Fix vnode interlock for 1->0 transition

* When the vnode refcount drops to 0 vnode_terminate() is called. This
  function previously acquired the vx_lock() after the 1->0 transition.

  Instead integrate the acquisition of the vx_lock() into the 1->0
  transition to close a small MP race.
---
 sys/kern/kern_sysref.c     |   4 +-
 sys/kern/vfs_lock.c        | 120 ++++++++++++++++++-------------------
 sys/sys/sysref.h           |   4 ++
 sys/vfs/devfs/devfs_core.c |  19 +++++-
 sys/vm/vm_map.c            |  16 ++++-
 5 files changed, 99 insertions(+), 64 deletions(-)

diff --git a/sys/kern/kern_sysref.c b/sys/kern/kern_sysref.c
index 705dac38ac..f9250ec975 100644
--- a/sys/kern/kern_sysref.c
+++ b/sys/kern/kern_sysref.c
@@ -315,11 +315,13 @@ _sysref_put(struct sysref *sr)
 			 * in-progress state.  The termination function is
 			 * then called.
 			 */
+			data = (char *)sr - sr->srclass->offset;
+			sr->srclass->ops.lock(data);
 			if (atomic_cmpset_int(&sr->refcnt, count, -0x40000000)) {
-				data = (char *)sr - sr->srclass->offset;
 				sr->srclass->ops.terminate(data);
 				break;
 			}
+			sr->srclass->ops.unlock(data);
 		} else if (count > -0x40000000) {
 			/*
 			 * release 1 count, nominal case, resource undergoing
diff --git a/sys/kern/vfs_lock.c b/sys/kern/vfs_lock.c
index f3b280305d..d79a325c1e 100644
--- a/sys/kern/vfs_lock.c
+++ b/sys/kern/vfs_lock.c
@@ -74,7 +74,9 @@ static struct sysref_class vnode_sysref_class = {
 	.ctor = vnode_ctor,
 	.dtor = vnode_dtor,
 	.ops = {
-		.terminate = (sysref_terminate_func_t)vnode_terminate
+		.terminate = (sysref_terminate_func_t)vnode_terminate,
+		.lock = (sysref_terminate_func_t)vx_lock,
+		.unlock = (sysref_terminate_func_t)vx_unlock
 	}
 };
 
@@ -320,73 +322,68 @@ vdrop(struct vnode *vp)
 
 /*
  * This function is called when the last active reference on the vnode
- * is released, typically via vrele().  SYSREF will give the vnode a
- * negative ref count, indicating that it is undergoing termination or
- * is being set aside for the cache, and one final sysref_put() is
- * required to actually return it to the memory subsystem.
+ * is released, typically via vrele().  SYSREF will VX lock the vnode
+ * and then give the vnode a negative ref count, indicating that it is
+ * undergoing termination or is being set aside for the cache, and one
+ * final sysref_put() is required to actually return it to the memory
+ * subsystem.
  *
- * However, because vnodes may have auxiliary structural references via
- * v_auxrefs, we must interlock auxiliary references against termination
- * via the VX lock mechanism.  It is possible for a vnode to be reactivated
- * while we were blocked on the lock.
+ * Additional inactive sysrefs may race us but that's ok.  Reactivations
+ * cannot race us because the sysref code interlocked with the VX lock
+ * (which is held on call).
  *
  * MPSAFE
  */
 void
 vnode_terminate(struct vnode *vp)
 {
-	vx_lock(vp);
-	if (sysref_isinactive(&vp->v_sysref)) {
-		/*
-		 * Deactivate the vnode by marking it VFREE or VCACHED.
-		 * The vnode can be reactivated from either state until
-		 * reclaimed.  These states inherit the 'last' sysref on the
-		 * vnode.
-		 *
-		 * NOTE: There may be additional inactive references from
-		 * other entities blocking on the VX lock while we hold it,
-		 * but this does not prevent us from changing the vnode's
-		 * state.
-		 *
-		 * NOTE: The vnode could already be marked inactive.  XXX
-		 *	 how?
-		 *
-		 * NOTE: v_mount may be NULL due to assignment to
-		 *	 dead_vnode_vops
-		 *
-		 * NOTE: The vnode may be marked inactive with dirty buffers
-		 *	 or dirty pages in its cached VM object still present.
-		 *
-		 * NOTE: VCACHED should not be set on entry.  We lose control
-		 *	 of the sysref the instant the vnode is placed on the
-		 *	 free list or when VCACHED is set.
-		 *
-		 *	 The VX lock is sufficient when transitioning
-		 *	 to +VCACHED but not sufficient for the vshouldfree()
-		 *	 interlocked test.
-		 */
-		if ((vp->v_flag & VINACTIVE) == 0) {
-			_vsetflags(vp, VINACTIVE);
-			if (vp->v_mount)
-				VOP_INACTIVE(vp);
-		}
-		spin_lock_wr(&vp->v_spinlock);
-		KKASSERT((vp->v_flag & (VFREE|VCACHED)) == 0);
-		if (vshouldfree(vp))
-			__vfree(vp);
-		else
-			_vsetflags(vp, VCACHED); /* inactive but not yet free*/
-		spin_unlock_wr(&vp->v_spinlock);
-		vx_unlock(vp);
-	} else {
-		/*
-		 * Someone reactivated the vnode while were blocked on the
-		 * VX lock.  Release the VX lock and release the (now active)
-		 * last reference which is no longer last.
-		 */
-		vx_unlock(vp);
-		vrele(vp);
+	/*
+	 * We own the VX lock, it should not be possible for someone else
+	 * to have reactivated the vp.
+	 */
+	KKASSERT(sysref_isinactive(&vp->v_sysref));
+
+	/*
+	 * Deactivate the vnode by marking it VFREE or VCACHED.
+	 * The vnode can be reactivated from either state until
+	 * reclaimed.  These states inherit the 'last' sysref on the
+	 * vnode.
+	 *
+	 * NOTE: There may be additional inactive references from
+	 * other entities blocking on the VX lock while we hold it,
+	 * but this does not prevent us from changing the vnode's
+	 * state.
+	 *
+	 * NOTE: The vnode could already be marked inactive.  XXX
+	 *	 how?
+	 *
+	 * NOTE: v_mount may be NULL due to assignment to
+	 *	 dead_vnode_vops
+	 *
+	 * NOTE: The vnode may be marked inactive with dirty buffers
+	 *	 or dirty pages in its cached VM object still present.
+	 *
+	 * NOTE: VCACHED should not be set on entry.  We lose control
+	 *	 of the sysref the instant the vnode is placed on the
+	 *	 free list or when VCACHED is set.
+	 *
+	 *	 The VX lock is sufficient when transitioning
+	 *	 to +VCACHED but not sufficient for the vshouldfree()
+	 *	 interlocked test.
+	 */
+	if ((vp->v_flag & VINACTIVE) == 0) {
+		_vsetflags(vp, VINACTIVE);
+		if (vp->v_mount)
+			VOP_INACTIVE(vp);
 	}
+	spin_lock_wr(&vp->v_spinlock);
+	KKASSERT((vp->v_flag & (VFREE|VCACHED)) == 0);
+	if (vshouldfree(vp))
+		__vfree(vp);
+	else
+		_vsetflags(vp, VCACHED); /* inactive but not yet free*/
+	spin_unlock_wr(&vp->v_spinlock);
+	vx_unlock(vp);
 }
 
 /*
@@ -528,7 +525,8 @@ vget(struct vnode *vp, int flags)
 		 * sysref that was earmarking those cases and preventing
 		 * the vnode from being destroyed.  Our sysref is still held.
 		 *
-		 * The spinlock is our only real protection here.
+		 * We are allowed to reactivate the vnode while we hold
+		 * the VX lock, assuming it can be reactivated.
 		 */
 		spin_lock_wr(&vp->v_spinlock);
 		if (vp->v_flag & VFREE) {
diff --git a/sys/sys/sysref.h b/sys/sys/sysref.h
index afce40d8cb..a496d77a77 100644
--- a/sys/sys/sysref.h
+++ b/sys/sys/sysref.h
@@ -71,6 +71,8 @@
  * must be carefully coded if it does anything more complex then objcache_put
  */
 typedef void (*sysref_terminate_func_t)(void *);
+typedef void (*sysref_lock_func_t)(void *);
+typedef void (*sysref_unlock_func_t)(void *);
 
 struct sysref_class {
 	const char *name;		/* name of the system resource */
@@ -85,6 +87,8 @@ struct sysref_class {
 	objcache_dtor_fn *dtor;		/* objcache dtor chaining */
 	struct sysref_ops {
 		sysref_terminate_func_t terminate;
+		sysref_lock_func_t lock;
+		sysref_unlock_func_t unlock;
 	} ops;
 };
 
diff --git a/sys/vfs/devfs/devfs_core.c b/sys/vfs/devfs/devfs_core.c
index 91caad68e1..6aa1eeadbd 100644
--- a/sys/vfs/devfs/devfs_core.c
+++ b/sys/vfs/devfs/devfs_core.c
@@ -57,6 +57,8 @@ DEVFS_DECLARE_CLONE_BITMAP(ops_id);
  * sysid and syslink integration.
  */
 static void devfs_cdev_terminate(cdev_t dev);
+static void devfs_cdev_lock(cdev_t dev);
+static void devfs_cdev_unlock(cdev_t dev);
 static struct sysref_class cdev_sysref_class = {
 	.name = "cdev",
 	.mtype = M_DEVFS,
@@ -66,7 +68,9 @@ static struct sysref_class cdev_sysref_class = {
 	.mag_capacity = 32,
 	.flags = 0,
 	.ops = {
-		.terminate = (sysref_terminate_func_t)devfs_cdev_terminate
+		.terminate = (sysref_terminate_func_t)devfs_cdev_terminate,
+		.lock = (sysref_lock_func_t)devfs_cdev_lock,
+		.unlock = (sysref_unlock_func_t)devfs_cdev_unlock
 	}
 };
 
@@ -2160,6 +2164,19 @@ devfs_cdev_terminate(cdev_t dev)
 	sysref_put(&dev->si_sysref);
 }
 
+/*
+ * Dummies for now (individual locks for MPSAFE)
+ */
+static void
+devfs_cdev_lock(cdev_t dev)
+{
+}
+
+static void
+devfs_cdev_unlock(cdev_t dev)
+{
+}
+
 /*
  * Links a given cdev into the dev list.
  */
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index bf7b2b8d2b..8c6fe39d3e 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -122,6 +122,8 @@
  */
 
 static void vmspace_terminate(struct vmspace *vm);
+static void vmspace_lock(struct vmspace *vm);
+static void vmspace_unlock(struct vmspace *vm);
 static void vmspace_dtor(void *obj, void *private);
 
 MALLOC_DEFINE(M_VMSPACE, "vmspace", "vmspace objcache backingstore");
@@ -136,7 +138,9 @@ struct sysref_class vmspace_sysref_class = {
 	.flags = SRC_MANAGEDINIT,
 	.dtor = vmspace_dtor,
 	.ops = {
-		.terminate = (sysref_terminate_func_t)vmspace_terminate
+		.terminate = (sysref_terminate_func_t)vmspace_terminate,
+		.lock = (sysref_lock_func_t)vmspace_lock,
+		.unlock = (sysref_lock_func_t)vmspace_unlock
 	}
 };
 
@@ -314,6 +318,16 @@ vmspace_terminate(struct vmspace *vm)
 	sysref_put(&vm->vm_sysref);
 }
 
+static void
+vmspace_lock(struct vmspace *vm __unused)
+{
+}
+
+static void
+vmspace_unlock(struct vmspace *vm __unused)
+{
+}
+
 /*
  * This is called in the wait*() handling code.  The vmspace can be terminated
  * after the last wait is finished using it.
-- 
2.41.0
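
To make the ordering concrete, here is a minimal user-space sketch of the
lock-before-transition pattern the patch installs in _sysref_put(). It is
an illustration, not kernel code: struct obj, obj_put(), and REF_TERMINATING
are hypothetical names, a pthread mutex stands in for the VX lock, and the
intermediate refcount states the real function handles are omitted.

/*
 * Illustrative model of the interlocked 1->0 transition (hypothetical
 * names; simplified from _sysref_put()).  The class lock is taken BEFORE
 * the atomic transition is attempted, so terminate() always runs with
 * the lock already held.
 */
#include <stdatomic.h>
#include <pthread.h>

#define REF_TERMINATING	(-0x40000000)	/* same sentinel the patch uses */

struct obj {
	atomic_int	refcnt;
	pthread_mutex_t	lock;		/* stand-in for vx_lock() */
	void		(*terminate)(struct obj *);  /* unlocks when done */
};

static void
obj_put(struct obj *o)
{
	int count;

	for (;;) {
		count = atomic_load(&o->refcnt);
		if (count == 1) {
			/*
			 * Lock before the 1->0 transition.  Any would-be
			 * reactivator must hold this lock, so once the
			 * cmpset succeeds the object is inactive and
			 * stays that way while terminate() runs.
			 */
			pthread_mutex_lock(&o->lock);
			if (atomic_compare_exchange_strong(&o->refcnt,
							   &count,
							   REF_TERMINATING)) {
				o->terminate(o);  /* called with lock held */
				return;
			}
			/* lost a race against another ref change; retry */
			pthread_mutex_unlock(&o->lock);
		} else if (atomic_compare_exchange_strong(&o->refcnt,
							   &count,
							   count - 1)) {
			return;		/* nominal release, refcnt > 1 */
		}
	}
}

Under the old ordering (cmpset to -0x40000000 first, vx_lock() afterwards)
there was a window in which another CPU holding the VX lock could reactivate
the vnode before vnode_terminate() acquired it; that is the case the deleted
else branch in vfs_lock.c existed to clean up with vrele(). With the lock
folded into the transition, a successful cmpset guarantees the object is
still inactive when terminate() runs, which is why vnode_terminate() can now
simply KKASSERT(sysref_isinactive(...)), and why devfs and vmspace get by
with dummy lock/unlock callbacks until they grow locks of their own.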