From: Matthew Dillon
Date: Sat, 23 Feb 2013 05:57:45 +0000 (-0800)
Subject: kernel - Implement shared namecache locks
X-Git-Tag: v3.4.0rc~226
X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/79fd1696254e1413ba2bd0b0c029e2acd9ba416a

kernel - Implement shared namecache locks

* Currently highly experimental, so I've added a sysctl and defaulted
  it to disabled for now:

	sysctl debug.ncp_shared_lock_disable
		0	Shared namecache locks enabled
		1	Shared namecache locks disabled (default)

* Removes most conflicts when concurrent processes are doing long path
  lookups with substantially similar prefixes.

* Also removes directory conflicts when concurrent processes are
  accessing different file names under the same directory using short
  paths.

* Shared mode is only used when the ncp is resolved and in a normal
  working state (this includes entries which have resolved to ENOENT).
  Otherwise the code falls back to exclusive mode.

* Shared namecache locks have three major complexities:

  (1) First, some bits of the nlookup() routine had to be rearranged to
      avoid double locking.  This is because the last namecache
      component always has to be locked exclusively, but a path such as
      a/b/d/. references the same ncp entry for both of the last two
      components.

  (2) Second, any lock on a namecache structure vhold()'s the related
      vp (if not NULL).  Shared locks present a particular issue here:
      a second cpu may obtain a second shared lock before the first cpu
      has finished vhold()ing the vnode, and the vnode cannot be
      vhold()'d prior to taking the lock.  To deal with this, an
      interlock was implemented (see NC_SHLOCK_VHOLD; an illustrative
      sketch appears after the diff below).

  (3) Finally, because there might be many concurrent shared lock
      users, further shared lock attempts must stall while an exclusive
      request is pending, so that the exclusive lock user is not
      starved out.

* The implementation specifically does not attempt to implement lock
  upgrading.  That's another can of worms that I'd rather not open.
---

diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 269be46bdf..c415c3a8e9 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -183,6 +183,10 @@ static int ncposlimit; /* number of cache entries allocated */
 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
     "Number of cache entries allocated");
 
+static int ncp_shared_lock_disable = 1;
+SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
+	   &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
+
 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
     "sizeof(struct vnode)");
 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
@@ -267,10 +271,13 @@ static struct namecache *cache_zap(struct namecache *ncp, int nonblock);
  * vnode association state changes by other threads, and prevents the
  * namecache entry from being resolved or unresolved by other threads.
  *
- * The lock owner has full authority to associate/disassociate vnodes
- * and resolve/unresolve the locked ncp.
+ * An exclusive lock owner has full authority to associate/disassociate
+ * vnodes and resolve/unresolve the locked ncp.
+ *
+ * A shared lock owner only has authority to acquire the underlying vnode,
+ * if any.
  *
- * The primary lock field is nc_exlocks.  nc_locktd is set after the
+ * The primary lock field is nc_lockstatus.  nc_locktd is set after the
  * fact (when locking) or cleared prior to unlocking.
  *
  * WARNING!
Holding a locked ncp will prevent a vnode from being destroyed @@ -280,8 +287,6 @@ static struct namecache *cache_zap(struct namecache *ncp, int nonblock); * way the refs counter is handled). Or, alternatively, make an * unconditional call to cache_validate() or cache_resolve() * after cache_lock() returns. - * - * MPSAFE */ static void @@ -297,10 +302,12 @@ _cache_lock(struct namecache *ncp) td = curthread; for (;;) { - count = ncp->nc_exlocks; + count = ncp->nc_lockstatus; + cpu_ccfence(); - if (count == 0) { - if (atomic_cmpset_int(&ncp->nc_exlocks, 0, 1)) { + if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) { + if (atomic_cmpset_int(&ncp->nc_lockstatus, + count, count + 1)) { /* * The vp associated with a locked ncp must * be held to prevent it from being recycled. @@ -317,38 +324,41 @@ _cache_lock(struct namecache *ncp) */ ncp->nc_locktd = td; if (ncp->nc_vp) - vhold(ncp->nc_vp); /* MPSAFE */ + vhold(ncp->nc_vp); break; } /* cmpset failed */ continue; } if (ncp->nc_locktd == td) { - if (atomic_cmpset_int(&ncp->nc_exlocks, count, - count + 1)) { + KKASSERT((count & NC_SHLOCK_FLAG) == 0); + if (atomic_cmpset_int(&ncp->nc_lockstatus, + count, count + 1)) { break; } /* cmpset failed */ continue; } - tsleep_interlock(ncp, 0); - if (atomic_cmpset_int(&ncp->nc_exlocks, count, + tsleep_interlock(&ncp->nc_locktd, 0); + if (atomic_cmpset_int(&ncp->nc_lockstatus, count, count | NC_EXLOCK_REQ) == 0) { /* cmpset failed */ continue; } - error = tsleep(ncp, PINTERLOCKED, "clock", nclockwarn); + error = tsleep(&ncp->nc_locktd, PINTERLOCKED, + "clock", nclockwarn); if (error == EWOULDBLOCK) { if (didwarn == 0) { didwarn = ticks; - kprintf("[diagnostic] cache_lock: blocked " - "on %p", - ncp); + kprintf("[diagnostic] cache_lock: " + "blocked on %p %08x", + ncp, count); kprintf(" \"%*.*s\"\n", ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name); } } + /* loop */ } if (didwarn) { kprintf("[diagnostic] cache_lock: unblocked %*.*s after " @@ -358,11 +368,115 @@ _cache_lock(struct namecache *ncp) } } +/* + * The shared lock works similarly to the exclusive lock except + * nc_locktd is left NULL and we need an interlock (VHOLD) to + * prevent vhold() races, since the moment our cmpset_int succeeds + * another cpu can come in and get its own shared lock. + * + * A critical section is needed to prevent interruption during the + * VHOLD interlock. + */ +static +void +_cache_lock_shared(struct namecache *ncp) +{ + int didwarn; + int error; + u_int count; + + KKASSERT(ncp->nc_refs != 0); + didwarn = 0; + + for (;;) { + count = ncp->nc_lockstatus; + cpu_ccfence(); + + if ((count & ~NC_SHLOCK_REQ) == 0) { + crit_enter(); + if (atomic_cmpset_int(&ncp->nc_lockstatus, + count, + (count + 1) | NC_SHLOCK_FLAG | + NC_SHLOCK_VHOLD)) { + /* + * The vp associated with a locked ncp must + * be held to prevent it from being recycled. + * + * WARNING! If VRECLAIMED is set the vnode + * could already be in the middle of a recycle. + * Callers must use cache_vref() or + * cache_vget() on the locked ncp to + * validate the vp or set the cache entry + * to unresolved. + * + * NOTE! vhold() is allowed if we hold a + * lock on the ncp (which we do). + */ + if (ncp->nc_vp) + vhold(ncp->nc_vp); + atomic_clear_int(&ncp->nc_lockstatus, + NC_SHLOCK_VHOLD); + crit_exit(); + break; + } + /* cmpset failed */ + crit_exit(); + continue; + } + + /* + * If already held shared we can just bump the count, but + * only allow this if nobody is trying to get the lock + * exclusively. + * + * VHOLD is a bit of a hack. 
Even though we successfully + * added another shared ref, the cpu that got the first + * shared ref might not yet have held the vnode. + */ + if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) == + NC_SHLOCK_FLAG) { + KKASSERT((count & ~(NC_EXLOCK_REQ | + NC_SHLOCK_REQ | + NC_SHLOCK_FLAG)) > 0); + if (atomic_cmpset_int(&ncp->nc_lockstatus, + count, count + 1)) { + while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD) + cpu_pause(); + break; + } + continue; + } + tsleep_interlock(ncp, 0); + if (atomic_cmpset_int(&ncp->nc_lockstatus, count, + count | NC_SHLOCK_REQ) == 0) { + /* cmpset failed */ + continue; + } + error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn); + if (error == EWOULDBLOCK) { + if (didwarn == 0) { + didwarn = ticks; + kprintf("[diagnostic] cache_lock_shared: " + "blocked on %p %08x", + ncp, count); + kprintf(" \"%*.*s\"\n", + ncp->nc_nlen, ncp->nc_nlen, + ncp->nc_name); + } + } + /* loop */ + } + if (didwarn) { + kprintf("[diagnostic] cache_lock_shared: " + "unblocked %*.*s after %d secs\n", + ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name, + (int)(ticks - didwarn) / hz); + } +} + /* * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance, * such as the case where one of its children is locked. - * - * MPSAFE */ static int @@ -374,10 +488,11 @@ _cache_lock_nonblock(struct namecache *ncp) td = curthread; for (;;) { - count = ncp->nc_exlocks; + count = ncp->nc_lockstatus; - if (count == 0) { - if (atomic_cmpset_int(&ncp->nc_exlocks, 0, 1)) { + if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) { + if (atomic_cmpset_int(&ncp->nc_lockstatus, + count, count + 1)) { /* * The vp associated with a locked ncp must * be held to prevent it from being recycled. @@ -394,15 +509,15 @@ _cache_lock_nonblock(struct namecache *ncp) */ ncp->nc_locktd = td; if (ncp->nc_vp) - vhold(ncp->nc_vp); /* MPSAFE */ + vhold(ncp->nc_vp); break; } /* cmpset failed */ continue; } if (ncp->nc_locktd == td) { - if (atomic_cmpset_int(&ncp->nc_exlocks, count, - count + 1)) { + if (atomic_cmpset_int(&ncp->nc_lockstatus, + count, count + 1)) { break; } /* cmpset failed */ @@ -413,14 +528,89 @@ _cache_lock_nonblock(struct namecache *ncp) return(0); } +/* + * The shared lock works similarly to the exclusive lock except + * nc_locktd is left NULL and we need an interlock (VHOLD) to + * prevent vhold() races, since the moment our cmpset_int succeeds + * another cpu can come in and get its own shared lock. + * + * A critical section is needed to prevent interruption during the + * VHOLD interlock. + */ +static +int +_cache_lock_shared_nonblock(struct namecache *ncp) +{ + u_int count; + + for (;;) { + count = ncp->nc_lockstatus; + + if ((count & ~NC_SHLOCK_REQ) == 0) { + crit_enter(); + if (atomic_cmpset_int(&ncp->nc_lockstatus, + count, + (count + 1) | NC_SHLOCK_FLAG | + NC_SHLOCK_VHOLD)) { + /* + * The vp associated with a locked ncp must + * be held to prevent it from being recycled. + * + * WARNING! If VRECLAIMED is set the vnode + * could already be in the middle of a recycle. + * Callers must use cache_vref() or + * cache_vget() on the locked ncp to + * validate the vp or set the cache entry + * to unresolved. + * + * NOTE! vhold() is allowed if we hold a + * lock on the ncp (which we do). + */ + if (ncp->nc_vp) + vhold(ncp->nc_vp); + atomic_clear_int(&ncp->nc_lockstatus, + NC_SHLOCK_VHOLD); + crit_exit(); + break; + } + /* cmpset failed */ + crit_exit(); + continue; + } + + /* + * If already held shared we can just bump the count, but + * only allow this if nobody is trying to get the lock + * exclusively. 
+ * + * VHOLD is a bit of a hack. Even though we successfully + * added another shared ref, the cpu that got the first + * shared ref might not yet have held the vnode. + */ + if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) == + NC_SHLOCK_FLAG) { + KKASSERT((count & ~(NC_EXLOCK_REQ | + NC_SHLOCK_REQ | + NC_SHLOCK_FLAG)) > 0); + if (atomic_cmpset_int(&ncp->nc_lockstatus, + count, count + 1)) { + while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD) + cpu_pause(); + break; + } + continue; + } + return(EWOULDBLOCK); + } + return(0); +} + /* * Helper function * * NOTE: nc_refs can be 0 (degenerate case during _cache_drop). * - * nc_locktd must be NULLed out prior to nc_exlocks getting cleared. - * - * MPSAFE + * nc_locktd must be NULLed out prior to nc_lockstatus getting cleared. */ static void @@ -428,34 +618,73 @@ _cache_unlock(struct namecache *ncp) { thread_t td __debugvar = curthread; u_int count; + u_int ncount; + struct vnode *dropvp; KKASSERT(ncp->nc_refs >= 0); - KKASSERT(ncp->nc_exlocks > 0); - KKASSERT(ncp->nc_locktd == td); + KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0); + KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td); + + count = ncp->nc_lockstatus; + cpu_ccfence(); - count = ncp->nc_exlocks; - if ((count & ~NC_EXLOCK_REQ) == 1) { + /* + * Clear nc_locktd prior to the atomic op (excl lock only) + */ + if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) ncp->nc_locktd = NULL; - if (ncp->nc_vp) - vdrop(ncp->nc_vp); - } + dropvp = NULL; + for (;;) { - if ((count & ~NC_EXLOCK_REQ) == 1) { - if (atomic_cmpset_int(&ncp->nc_exlocks, count, 0)) { + if ((count & + ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) { + dropvp = ncp->nc_vp; + if (count & NC_EXLOCK_REQ) + ncount = count & NC_SHLOCK_REQ; /* cnt->0 */ + else + ncount = 0; + + if (atomic_cmpset_int(&ncp->nc_lockstatus, + count, ncount)) { if (count & NC_EXLOCK_REQ) + wakeup(&ncp->nc_locktd); + else if (count & NC_SHLOCK_REQ) wakeup(ncp); break; } + dropvp = NULL; } else { - if (atomic_cmpset_int(&ncp->nc_exlocks, count, - count - 1)) { + KKASSERT((count & NC_SHLOCK_VHOLD) == 0); + KKASSERT((count & ~(NC_EXLOCK_REQ | + NC_SHLOCK_REQ | + NC_SHLOCK_FLAG)) > 1); + if (atomic_cmpset_int(&ncp->nc_lockstatus, + count, count - 1)) { break; } } - count = ncp->nc_exlocks; + count = ncp->nc_lockstatus; + cpu_ccfence(); } + + /* + * Don't actually drop the vp until we successfully clean out + * the lock, otherwise we may race another shared lock. + */ + if (dropvp) + vdrop(dropvp); } +static +int +_cache_lockstatus(struct namecache *ncp) +{ + if (ncp->nc_locktd == curthread) + return(LK_EXCLUSIVE); + if (ncp->nc_lockstatus & NC_SHLOCK_FLAG) + return(LK_SHARED); + return(-1); +} /* * cache_hold() and cache_drop() prevent the premature deletion of a @@ -467,8 +696,6 @@ _cache_unlock(struct namecache *ncp) * * This is a rare case where callers are allowed to hold a spinlock, * so we can't ourselves. - * - * MPSAFE */ static __inline struct namecache * @@ -494,8 +721,6 @@ _cache_hold(struct namecache *ncp) * * NOTE: cache_zap() may return a non-NULL referenced parent which must * be dropped in a loop. - * - * MPSAFE */ static __inline void @@ -537,8 +762,6 @@ _cache_drop(struct namecache *ncp) * * NOTE: The hash table spinlock is likely held during this call, we * can't do anything fancy. - * - * MPSAFE */ static void _cache_link_parent(struct namecache *ncp, struct namecache *par, @@ -581,8 +804,6 @@ _cache_link_parent(struct namecache *ncp, struct namecache *par, * * ncp must be locked. 
This routine will acquire a temporary lock on * the parent as wlel as the appropriate hash chain. - * - * MPSAFE */ static void _cache_unlink_parent(struct namecache *ncp) @@ -617,8 +838,6 @@ _cache_unlink_parent(struct namecache *ncp) /* * Allocate a new namecache structure. Most of the code does not require * zero-termination of the string but it makes vop_compat_ncreate() easier. - * - * MPSAFE */ static struct namecache * cache_alloc(int nlen) @@ -641,20 +860,18 @@ cache_alloc(int nlen) /* * Can only be called for the case where the ncp has never been * associated with anything (so no spinlocks are needed). - * - * MPSAFE */ static void _cache_free(struct namecache *ncp) { - KKASSERT(ncp->nc_refs == 1 && ncp->nc_exlocks == 1); + KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1); if (ncp->nc_name) kfree(ncp->nc_name, M_VFSCACHE); kfree(ncp, M_VFSCACHE); } /* - * MPSAFE + * [re]initialize a nchandle. */ void cache_zero(struct nchandle *nch) @@ -709,9 +926,6 @@ cache_changemount(struct nchandle *nch, struct mount *mp) atomic_add_int(&nch->mount->mnt_refs, 1); } -/* - * MPSAFE - */ void cache_drop(struct nchandle *nch) { @@ -721,15 +935,40 @@ cache_drop(struct nchandle *nch) nch->mount = NULL; } -/* - * MPSAFE - */ +int +cache_lockstatus(struct nchandle *nch) +{ + return(_cache_lockstatus(nch->ncp)); +} + void cache_lock(struct nchandle *nch) { _cache_lock(nch->ncp); } +void +cache_lock_maybe_shared(struct nchandle *nch, int excl) +{ + struct namecache *ncp = nch->ncp; + + if (ncp_shared_lock_disable || excl || + (ncp->nc_flag & NCF_UNRESOLVED)) { + _cache_lock(ncp); + } else { + _cache_lock_shared(ncp); + if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { + if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { + _cache_unlock(ncp); + _cache_lock(ncp); + } + } else { + _cache_unlock(ncp); + _cache_lock(ncp); + } + } +} + /* * Relock nch1 given an unlocked nch1 and a locked nch2. The caller * is responsible for checking both for validity on return as they @@ -773,19 +1012,12 @@ cache_relock(struct nchandle *nch1, struct ucred *cred1, } } -/* - * MPSAFE - */ int cache_lock_nonblock(struct nchandle *nch) { return(_cache_lock_nonblock(nch->ncp)); } - -/* - * MPSAFE - */ void cache_unlock(struct nchandle *nch) { @@ -801,8 +1033,6 @@ cache_unlock(struct nchandle *nch) * * We want cache_get() to return a definitively usable vnode or a * definitively unresolved ncp. - * - * MPSAFE */ static struct namecache * @@ -815,6 +1045,35 @@ _cache_get(struct namecache *ncp) return(ncp); } +/* + * Attempt to obtain a shared lock on the ncp. A shared lock will only + * be obtained if the ncp is resolved and the vnode (if not ENOENT) is + * valid. Otherwise an exclusive lock will be acquired instead. + */ +static +struct namecache * +_cache_get_maybe_shared(struct namecache *ncp, int excl) +{ + if (ncp_shared_lock_disable || excl || + (ncp->nc_flag & NCF_UNRESOLVED)) { + return(_cache_get(ncp)); + } + _cache_hold(ncp); + _cache_lock_shared(ncp); + if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { + if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) { + _cache_unlock(ncp); + ncp = _cache_get(ncp); + _cache_drop(ncp); + } + } else { + _cache_unlock(ncp); + ncp = _cache_get(ncp); + _cache_drop(ncp); + } + return(ncp); +} + /* * This is a special form of _cache_lock() which only succeeds if * it can get a pristine, non-recursive lock. 
The caller must have @@ -825,14 +1084,13 @@ _cache_get(struct namecache *ncp) * * We want _cache_lock_special() (on success) to return a definitively * usable vnode or a definitively unresolved ncp. - * - * MPSAFE */ static int _cache_lock_special(struct namecache *ncp) { if (_cache_lock_nonblock(ncp) == 0) { - if ((ncp->nc_exlocks & ~NC_EXLOCK_REQ) == 1) { + if ((ncp->nc_lockstatus & + ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) { if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) _cache_setunresolved(ncp); return(0); @@ -842,11 +1100,25 @@ _cache_lock_special(struct namecache *ncp) return(EWOULDBLOCK); } +static int +_cache_lock_shared_special(struct namecache *ncp) +{ + if (_cache_lock_shared_nonblock(ncp) == 0) { + if ((ncp->nc_lockstatus & + ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == (NC_SHLOCK_FLAG | 1)) { + if (ncp->nc_vp == NULL || + (ncp->nc_vp->v_flag & VRECLAIMED) == 0) { + return(0); + } + } + _cache_unlock(ncp); + } + return(EWOULDBLOCK); +} + /* * NOTE: The same nchandle can be passed for both arguments. - * - * MPSAFE */ void cache_get(struct nchandle *nch, struct nchandle *target) @@ -857,8 +1129,17 @@ cache_get(struct nchandle *nch, struct nchandle *target) atomic_add_int(&target->mount->mnt_refs, 1); } +void +cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl) +{ + KKASSERT(nch->ncp->nc_refs > 0); + target->mount = nch->mount; + target->ncp = _cache_get_maybe_shared(nch->ncp, excl); + atomic_add_int(&target->mount->mnt_refs, 1); +} + /* - * MPSAFE + * */ static __inline void @@ -869,7 +1150,7 @@ _cache_put(struct namecache *ncp) } /* - * MPSAFE + * */ void cache_put(struct nchandle *nch) @@ -885,14 +1166,13 @@ cache_put(struct nchandle *nch) * vnode is NULL, a negative cache entry is created. * * The ncp should be locked on entry and will remain locked on return. - * - * MPSAFE */ static void _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp) { KKASSERT(ncp->nc_flag & NCF_UNRESOLVED); + KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); if (vp != NULL) { /* @@ -905,7 +1185,7 @@ _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp) ncp->nc_vp = vp; TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode); spin_unlock(&vp->v_spin); - if (ncp->nc_exlocks) + if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) vhold(vp); /* @@ -950,7 +1230,7 @@ _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp) } /* - * MPSAFE + * */ void cache_setvp(struct nchandle *nch, struct vnode *vp) @@ -959,7 +1239,7 @@ cache_setvp(struct nchandle *nch, struct vnode *vp) } /* - * MPSAFE + * */ void cache_settimeout(struct nchandle *nch, int nticks) @@ -984,7 +1264,6 @@ cache_settimeout(struct nchandle *nch, int nticks) * from its namecache and can cause the OLDAPI and NEWAPI to get out of * sync. * - * MPSAFE */ static void @@ -1011,7 +1290,7 @@ _cache_setunresolved(struct namecache *ncp) */ if (!TAILQ_EMPTY(&ncp->nc_list)) vdrop(vp); - if (ncp->nc_exlocks) + if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) vdrop(vp); } else { spin_lock(&ncspin); @@ -1028,18 +1307,10 @@ _cache_setunresolved(struct namecache *ncp) * set a resolved cache element to unresolved if it has timed out * or if it is a negative cache hit and the mount point namecache_gen * has changed. - * - * MPSAFE */ -static __inline void -_cache_auto_unresolve(struct mount *mp, struct namecache *ncp) +static __inline int +_cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp) { - /* - * Already in an unresolved state, nothing to do. 
- */ - if (ncp->nc_flag & NCF_UNRESOLVED) - return; - /* * Try to zap entries that have timed out. We have * to be careful here because locked leafs may depend @@ -1048,8 +1319,7 @@ _cache_auto_unresolve(struct mount *mp, struct namecache *ncp) */ if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 && TAILQ_EMPTY(&ncp->nc_list)) { - _cache_setunresolved(ncp); - return; + return 1; } /* @@ -1057,13 +1327,29 @@ _cache_auto_unresolve(struct mount *mp, struct namecache *ncp) * the mount's namecache generation being bumped, zap it. */ if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) { - _cache_setunresolved(ncp); - return; + return 1; + } + + /* + * Otherwise we are good + */ + return 0; +} + +static __inline void +_cache_auto_unresolve(struct mount *mp, struct namecache *ncp) +{ + /* + * Already in an unresolved state, nothing to do. + */ + if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) { + if (_cache_auto_unresolve_test(mp, ncp)) + _cache_setunresolved(ncp); } } /* - * MPSAFE + * */ void cache_setunresolved(struct nchandle *nch) @@ -1076,8 +1362,6 @@ cache_setunresolved(struct nchandle *nch) * looking for matches. This flag tells the lookup code when it must * check for a mount linkage and also prevents the directories in question * from being deleted or renamed. - * - * MPSAFE */ static int @@ -1093,7 +1377,7 @@ cache_clrmountpt_callback(struct mount *mp, void *data) } /* - * MPSAFE + * */ void cache_clrmountpt(struct nchandle *nch) @@ -1157,8 +1441,6 @@ cache_clrmountpt(struct nchandle *nch) * node using a depth-first algorithm in order to allow multiple deep * recursions to chain through each other, then we restart the invalidation * from scratch. - * - * MPSAFE */ struct cinvtrack { @@ -1216,7 +1498,7 @@ _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track) struct namecache *nextkid; int rcnt = 0; - KKASSERT(ncp->nc_exlocks); + KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); _cache_setunresolved(ncp); if (flags & CINV_DESTROY) @@ -1276,8 +1558,6 @@ _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track) * * In addition, the v_namecache list itself must be locked via * the vnode's spinlock. - * - * MPSAFE */ int cache_inval_vp(struct vnode *vp, int flags) @@ -1326,8 +1606,6 @@ restart: * * Return 0 on success, non-zero if not all namecache records could be * disassociated from the vnode (for various reasons). - * - * MPSAFE */ int cache_inval_vp_nonblock(struct vnode *vp) @@ -1383,8 +1661,6 @@ done: * Because there may be references to the source ncp we cannot copy its * contents to the target. Instead the source ncp is relinked as the target * and the target ncp is removed from the namecache topology. - * - * MPSAFE */ void cache_rename(struct nchandle *fnch, struct nchandle *tnch) @@ -1502,7 +1778,9 @@ _cache_unlink(struct namecache *ncp) * can safely acquire the vnode. In fact, we MUST NOT release the ncp * lock when acquiring the vp lock or we might cause a deadlock. * - * MPSAFE + * NOTE: The passed-in ncp must be locked exclusively if it is initially + * unresolved. If a reclaim race occurs the passed-in ncp will be + * relocked exclusively before being re-resolved. 
*/ int cache_vget(struct nchandle *nch, struct ucred *cred, @@ -1513,7 +1791,6 @@ cache_vget(struct nchandle *nch, struct ucred *cred, int error; ncp = nch->ncp; - KKASSERT(ncp->nc_locktd == curthread); again: vp = NULL; if (ncp->nc_flag & NCF_UNRESOLVED) @@ -1531,6 +1808,8 @@ again: kprintf("Warning: vnode reclaim race detected " "in cache_vget on %p (%s)\n", vp, ncp->nc_name); + _cache_unlock(ncp); + _cache_lock(ncp); _cache_setunresolved(ncp); goto again; } @@ -1551,6 +1830,13 @@ again: return(error); } +/* + * Similar to cache_vget() but only acquires a ref on the vnode. + * + * NOTE: The passed-in ncp must be locked exclusively if it is initially + * unresolved. If a reclaim race occurs the passed-in ncp will be + * relocked exclusively before being re-resolved. + */ int cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) { @@ -1559,7 +1845,6 @@ cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp) int error; ncp = nch->ncp; - KKASSERT(ncp->nc_locktd == curthread); again: vp = NULL; if (ncp->nc_flag & NCF_UNRESOLVED) @@ -1577,6 +1862,8 @@ again: kprintf("Warning: vnode reclaim race detected " "in cache_vget on %p (%s)\n", vp, ncp->nc_name); + _cache_unlock(ncp); + _cache_lock(ncp); _cache_setunresolved(ncp); goto again; } @@ -1612,8 +1899,8 @@ again: * so use vhold()/vdrop() while holding the lock to prevent dvp from * getting destroyed. * - * MPSAFE - Note vhold() is allowed when dvp has 0 refs if we hold a - * lock on the ncp in question.. + * NOTE: vhold() is allowed when dvp has 0 refs if we hold a + * lock on the ncp in question.. */ static struct vnode * cache_dvpref(struct namecache *ncp) @@ -2422,6 +2709,96 @@ found: return(nch); } +/* + * Attempt to lookup a namecache entry and return with a shared namecache + * lock. + */ +int +cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc, + int excl, struct nchandle *res_nch) +{ + struct namecache *ncp; + struct nchash_head *nchpp; + struct mount *mp; + u_int32_t hash; + globaldata_t gd; + + /* + * If exclusive requested or shared namecache locks are disabled, + * return failure. + */ + if (ncp_shared_lock_disable || excl) + return(EWOULDBLOCK); + + numcalls++; + gd = mycpu; + mp = par_nch->mount; + + /* + * This is a good time to call it, no ncp's are locked by + * the caller or us. + */ + cache_hysteresis(); + + /* + * Try to locate an existing entry + */ + hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT); + hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash); + nchpp = NCHHASH(hash); + + spin_lock(&nchpp->spin); + + LIST_FOREACH(ncp, &nchpp->list, nc_hash) { + numchecks++; + + /* + * Break out if we find a matching entry. Note that + * UNRESOLVED entries may match, but DESTROYED entries + * do not. + */ + if (ncp->nc_parent == par_nch->ncp && + ncp->nc_nlen == nlc->nlc_namelen && + bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 && + (ncp->nc_flag & NCF_DESTROYED) == 0 + ) { + _cache_hold(ncp); + spin_unlock(&nchpp->spin); + if (_cache_lock_shared_special(ncp) == 0) { + if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 && + (ncp->nc_flag & NCF_DESTROYED) == 0 && + _cache_auto_unresolve_test(mp, ncp) == 0) { + goto found; + } + _cache_unlock(ncp); + } + _cache_drop(ncp); + spin_lock(&nchpp->spin); + break; + } + } + + /* + * Failure + */ + spin_unlock(&nchpp->spin); + return(EWOULDBLOCK); + + /* + * Success + * + * Note that nc_error might be non-zero (e.g ENOENT). 
+ */ +found: + res_nch->mount = mp; + res_nch->ncp = ncp; + ++gd->gd_nchstats->ncs_goodhits; + atomic_add_int(&res_nch->mount->mnt_refs, 1); + + KKASSERT(ncp->nc_error != EWOULDBLOCK); + return(ncp->nc_error); +} + /* * This is a non-blocking verison of cache_nlookup() used by * nfs_readdirplusrpc_uio(). It can fail for any reason and @@ -2756,8 +3133,6 @@ cache_unmounting(struct mount *mp) * Note that successful resolution does not necessarily return an error * code of 0. If the ncp resolves to a negative cache hit then ENOENT * will be returned. - * - * MPSAFE */ int cache_resolve(struct nchandle *nch, struct ucred *cred) @@ -2772,6 +3147,7 @@ cache_resolve(struct nchandle *nch, struct ucred *cred) ncp = nch->ncp; mp = nch->mount; + KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE); restart: /* * If the ncp is already resolved we have nothing to do. However, @@ -2986,8 +3362,6 @@ cache_resolve_mp(struct mount *mp) /* * Clean out negative cache entries when too many have accumulated. - * - * MPSAFE */ static void _cache_cleanneg(int count) @@ -3033,8 +3407,6 @@ _cache_cleanneg(int count) /* * Clean out positive cache entries when too many have accumulated. - * - * MPSAFE */ static void _cache_cleanpos(int count) @@ -3079,8 +3451,6 @@ _cache_cleanpos(int count) * * Such entries can also be removed via cache_inval_vp(), such * as when unmounting. - * - * MPSAFE */ static void _cache_cleandefered(void) @@ -3366,7 +3736,10 @@ kern_getcwd(char *buf, size_t buflen, int *error) * have to check again. */ while ((nch.ncp = ncp->nc_parent) != NULL) { - _cache_lock(ncp); + if (ncp_shared_lock_disable) + _cache_lock(ncp); + else + _cache_lock_shared(ncp); if (nch.ncp != ncp->nc_parent) { _cache_unlock(ncp); continue; diff --git a/sys/kern/vfs_nlookup.c b/sys/kern/vfs_nlookup.c index 01b1dee496..988aa174fd 100644 --- a/sys/kern/vfs_nlookup.c +++ b/sys/kern/vfs_nlookup.c @@ -402,7 +402,20 @@ nlookup_simple(const char *str, enum uio_seg seg, * If NLC_REFDVP is set nd->nl_dvp will be set to the directory vnode * of the returned entry. The vnode will be referenced, but not locked, * and will be released by nlookup_done() along with everything else. + * + * NOTE: As an optimization we attempt to obtain a shared namecache lock + * on any intermediate elements. On success, the returned element + * is ALWAYS locked exclusively. */ +static +int +islastelement(const char *ptr) +{ + while (*ptr == '/') + ++ptr; + return (*ptr == 0); +} + int nlookup(struct nlookupdata *nd) { @@ -415,7 +428,6 @@ nlookup(struct nlookupdata *nd) struct vnode *hvp; /* hold to prevent recyclement */ int wasdotordotdot; char *ptr; - char *xptr; int error; int len; int dflags; @@ -450,7 +462,7 @@ nlookup(struct nlookupdata *nd) */ if ((nd->nl_flags & NLC_NCPISLOCKED) == 0) { nd->nl_flags |= NLC_NCPISLOCKED; - cache_lock(&nd->nl_nch); + cache_lock_maybe_shared(&nd->nl_nch, islastelement(ptr)); } /* @@ -463,8 +475,10 @@ nlookup(struct nlookupdata *nd) do { ++ptr; } while (*ptr == '/'); - cache_get(&nd->nl_rootnch, &nch); - cache_put(&nd->nl_nch); + cache_unlock(&nd->nl_nch); + cache_get_maybe_shared(&nd->nl_rootnch, &nch, + islastelement(ptr)); + cache_drop(&nd->nl_nch); nd->nl_nch = nch; /* remains locked */ /* @@ -485,7 +499,7 @@ nlookup(struct nlookupdata *nd) } /* - * Check directory search permissions. 
+ * Check directory search permissions (nd->nl_nch is locked & refd) */ dflags = 0; error = naccess(&nd->nl_nch, NLC_EXEC, nd->nl_cred, &dflags); @@ -517,13 +531,21 @@ nlookup(struct nlookupdata *nd) * since our dflags will be for some sub-directory instead of the * parent dir. * - * This subsection returns a locked, refd 'nch' unless it errors out. + * This subsection returns a locked, refd 'nch' unless it errors out, + * and an unlocked but still ref'd nd->nl_nch. + * * The namecache topology is not allowed to be disconnected, so * encountering a NULL parent will generate EINVAL. This typically * occurs when a directory is removed out from under a process. + * + * WARNING! The unlocking of nd->nl_nch is sensitive code. */ + KKASSERT(nd->nl_flags & NLC_NCPISLOCKED); + if (nlc.nlc_namelen == 1 && nlc.nlc_nameptr[0] == '.') { - cache_get(&nd->nl_nch, &nch); + cache_unlock(&nd->nl_nch); + nd->nl_flags &= ~NLC_NCPISLOCKED; + cache_get_maybe_shared(&nd->nl_nch, &nch, islastelement(ptr)); wasdotordotdot = 1; } else if (nlc.nlc_namelen == 2 && nlc.nlc_nameptr[0] == '.' && nlc.nlc_nameptr[1] == '.') { @@ -533,7 +555,9 @@ nlookup(struct nlookupdata *nd) /* * ".." at the root returns the root */ - cache_get(&nd->nl_nch, &nch); + cache_unlock(&nd->nl_nch); + nd->nl_flags &= ~NLC_NCPISLOCKED; + cache_get_maybe_shared(&nd->nl_nch, &nch, islastelement(ptr)); } else { /* * Locate the parent ncp. If we are at the root of a @@ -550,7 +574,9 @@ nlookup(struct nlookupdata *nd) nctmp.ncp = nctmp.ncp->nc_parent; KKASSERT(nctmp.ncp != NULL); cache_hold(&nctmp); - cache_get(&nctmp, &nch); + cache_unlock(&nd->nl_nch); + nd->nl_flags &= ~NLC_NCPISLOCKED; + cache_get_maybe_shared(&nctmp, &nch, islastelement(ptr)); cache_drop(&nctmp); /* NOTE: zero's nctmp */ } wasdotordotdot = 2; @@ -573,15 +599,24 @@ nlookup(struct nlookupdata *nd) vhold(hvp); cache_unlock(&nd->nl_nch); nd->nl_flags &= ~NLC_NCPISLOCKED; - nch = cache_nlookup(&nd->nl_nch, &nlc); - if (nch.ncp->nc_flag & NCF_UNRESOLVED) - hit = 0; - while ((error = cache_resolve(&nch, nd->nl_cred)) == EAGAIN || - (nch.ncp->nc_flag & NCF_DESTROYED)) { - kprintf("[diagnostic] nlookup: relookup %*.*s\n", - nch.ncp->nc_nlen, nch.ncp->nc_nlen, nch.ncp->nc_name); - cache_put(&nch); - nch = cache_nlookup(&nd->nl_nch, &nlc); + error = cache_nlookup_maybe_shared(&nd->nl_nch, &nlc, + islastelement(ptr), &nch); + if (error == EWOULDBLOCK) { + nch = cache_nlookup(&nd->nl_nch, &nlc); + if (nch.ncp->nc_flag & NCF_UNRESOLVED) + hit = 0; + for (;;) { + error = cache_resolve(&nch, nd->nl_cred); + if (error != EAGAIN && + (nch.ncp->nc_flag & NCF_DESTROYED) == 0) { + break; + } + kprintf("[diagnostic] nlookup: relookup %*.*s\n", + nch.ncp->nc_nlen, nch.ncp->nc_nlen, + nch.ncp->nc_name); + cache_put(&nch); + nch = cache_nlookup(&nd->nl_nch, &nlc); + } } if (hvp) vdrop(hvp); @@ -600,15 +635,11 @@ nlookup(struct nlookupdata *nd) if ((par.ncp = nch.ncp->nc_parent) != NULL) { par.mount = nch.mount; cache_hold(&par); - cache_lock(&par); + cache_lock_maybe_shared(&par, islastelement(ptr)); error = naccess(&par, 0, nd->nl_cred, &dflags); cache_put(&par); } } - if (nd->nl_flags & NLC_NCPISLOCKED) { - cache_unlock(&nd->nl_nch); - nd->nl_flags &= ~NLC_NCPISLOCKED; - } /* * [end of subsection] @@ -619,6 +650,7 @@ nlookup(struct nlookupdata *nd) * nl_nch must be unlocked or we could chain lock to the root * if a resolve gets stuck (e.g. in NFS). */ + KKASSERT((nd->nl_flags & NLC_NCPISLOCKED) == 0); /* * Resolve the namespace if necessary. 
The ncp returned by @@ -645,9 +677,7 @@ nlookup(struct nlookupdata *nd) * for a create/rename/delete. The standard requires this and pax * pretty stupidly depends on it. */ - for (xptr = ptr; *xptr == '/'; ++xptr) - ; - if (*xptr == 0) { + if (islastelement(ptr)) { if (error == ENOENT && (nd->nl_flags & (NLC_CREATE | NLC_RENAME_DST)) ) { @@ -755,7 +785,8 @@ again: } } } - cache_get(&mp->mnt_ncmountpt, &nch); + cache_get_maybe_shared(&mp->mnt_ncmountpt, &nch, + islastelement(ptr)); if (nch.ncp->nc_flag & NCF_UNRESOLVED) { if (vfs_do_busy == 0) { @@ -864,9 +895,12 @@ double_break: } if (hit) - ++gd->gd_nchstats->ncs_longhits; + ++gd->gd_nchstats->ncs_longhits; else - ++gd->gd_nchstats->ncs_longmiss; + ++gd->gd_nchstats->ncs_longmiss; + + if (nd->nl_flags & NLC_NCPISLOCKED) + KKASSERT(cache_lockstatus(&nd->nl_nch) == LK_EXCLUSIVE); /* * NOTE: If NLC_CREATE was set the ncp may represent a negative hit @@ -979,7 +1013,8 @@ fail: * The directory sticky bit is tested for NLC_DELETE and NLC_RENAME_DST, * the latter is only tested if the target exists. * - * The passed ncp must be referenced and locked. + * The passed ncp must be referenced and locked. If it is already resolved + * it may be locked shared but otherwise should be locked exclusively. */ static int naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp) @@ -990,7 +1025,8 @@ naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp) int error; int cflags; - ASSERT_NCH_LOCKED(nch); + KKASSERT(cache_lockstatus(nch) > 0); + ncp = nch->ncp; if (ncp->nc_flag & NCF_UNRESOLVED) { cache_resolve(nch, cred); @@ -1019,7 +1055,7 @@ naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp) } else if (error == 0 || error == ENOENT) { par.mount = nch->mount; cache_hold(&par); - cache_lock(&par); + cache_lock_maybe_shared(&par, 0); error = naccess(&par, NLC_WRITE, cred, NULL); cache_put(&par); } @@ -1133,9 +1169,15 @@ naccess(struct nchandle *nch, int nflags, struct ucred *cred, int *nflagsp) cflags |= NCF_UF_PCACHE; } } - ncp->nc_flag &= ~(NCF_SF_NOCACHE | NCF_UF_CACHE | - NCF_SF_PNOCACHE | NCF_UF_PCACHE); - ncp->nc_flag |= cflags; + + /* + * XXX we're not supposed to update nc_flag when holding + * a shared lock. + */ + atomic_clear_short(&ncp->nc_flag, + (NCF_SF_NOCACHE | NCF_UF_CACHE | + NCF_SF_PNOCACHE | NCF_UF_PCACHE) & ~cflags); + atomic_set_short(&ncp->nc_flag, cflags); /* * Process general access. diff --git a/sys/sys/namecache.h b/sys/sys/namecache.h index e7b22dace6..693a3cf8c5 100644 --- a/sys/sys/namecache.h +++ b/sys/sys/namecache.h @@ -109,7 +109,7 @@ TAILQ_HEAD(namecache_list, namecache); * confusion, but only the one representing the physical directory is passed * into lower layer VOP calls. * - * ncp locking is done using atomic ops on nc_exlocks, including a request + * ncp locking is done using atomic ops on nc_lockstatus, including a request * flag for waiters. nc_locktd is set after locking or cleared before * the last unlock. ncp locks are reentrant. 
* @@ -131,7 +131,7 @@ struct namecache { char *nc_name; /* Separately allocated seg name */ int nc_error; int nc_timeout; /* compared against ticks, or 0 */ - u_int nc_exlocks; /* namespace locking */ + u_int nc_lockstatus; /* namespace locking */ struct thread *nc_locktd; /* namespace locking */ long nc_namecache_gen; /* cmp against mnt_namecache_gen */ }; @@ -145,8 +145,6 @@ struct nchandle { struct mount *mount; /* mount pt (possible overlay) */ }; -#define ASSERT_NCH_LOCKED(nch) KKASSERT(nch->ncp->nc_locktd == curthread) - /* * Flags in namecache.nc_flag (u_char) */ @@ -163,7 +161,10 @@ struct nchandle { #define NCF_DESTROYED 0x0400 /* name association is considered destroyed */ #define NCF_DEFEREDZAP 0x0800 /* zap defered due to lock unavailability */ -#define NC_EXLOCK_REQ 0x80000000 /* ex_lock state */ +#define NC_EXLOCK_REQ 0x80000000 /* nc_lockstatus state flag */ +#define NC_SHLOCK_REQ 0x40000000 /* nc_lockstatus state flag */ +#define NC_SHLOCK_FLAG 0x20000000 /* nc_lockstatus state flag */ +#define NC_SHLOCK_VHOLD 0x10000000 /* nc_lockstatus state flag */ /* * cache_inval[_vp]() flags @@ -179,10 +180,12 @@ struct nlcomponent; struct mount; void cache_lock(struct nchandle *nch); +void cache_lock_maybe_shared(struct nchandle *nch, int excl); void cache_relock(struct nchandle *nch1, struct ucred *cred1, struct nchandle *nch2, struct ucred *cred2); int cache_lock_nonblock(struct nchandle *nch); void cache_unlock(struct nchandle *nch); +int cache_lockstatus(struct nchandle *nch); void cache_setvp(struct nchandle *nch, struct vnode *vp); void cache_settimeout(struct nchandle *nch, int nticks); void cache_setunresolved(struct nchandle *nch); @@ -190,6 +193,9 @@ void cache_clrmountpt(struct nchandle *nch); struct nchandle cache_nlookup(struct nchandle *nch, struct nlcomponent *nlc); struct nchandle cache_nlookup_nonblock(struct nchandle *nch, struct nlcomponent *nlc); +int cache_nlookup_maybe_shared(struct nchandle *nch, + struct nlcomponent *nlc, int excl, + struct nchandle *nchres); void cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp); struct mount *cache_findmount(struct nchandle *nch); void cache_dropmount(struct mount *mp); @@ -203,9 +209,11 @@ void vfs_cache_setroot(struct vnode *vp, struct nchandle *nch); int cache_resolve(struct nchandle *nch, struct ucred *cred); void cache_purge(struct vnode *vp); void cache_purgevfs (struct mount *mp); -int cache_get_nonblock(struct nchandle *nch, struct nchandle *target); void cache_hysteresis(void); void cache_get(struct nchandle *nch, struct nchandle *target); +int cache_get_nonblock(struct nchandle *nch, struct nchandle *target); +void cache_get_maybe_shared(struct nchandle *nch, + struct nchandle *target, int excl); struct nchandle *cache_hold(struct nchandle *nch); void cache_copy(struct nchandle *nch, struct nchandle *target); void cache_changemount(struct nchandle *nch, struct mount *mp);