From: Matthew Dillon
Date: Sat, 13 Feb 2010 17:23:33 +0000 (-0800)
Subject: kernel - More readdirplus deadlock avoidance work
X-Git-Tag: v2.7.1~153
X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/668b43c50042037f92a5b34aac99c0c8d6b8dee1

kernel - More readdirplus deadlock avoidance work

* Add cache_nlookup_nonblock() and nfs_nget_nonblock().

* Adjust the readdirplus code to use the new functions.

  The basic problem is that callers already hold too many locks, and
  readdirplus then tries to acquire (and lock) nodes for the directory
  elements on top of that, leading to deadlocks.  Attempts to order the
  locks have failed, so we now use a non-blocking approach instead
  (sketched below); the readdirplus entries are all advisory anyway.

---
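The new calling pattern, roughly (a condensed sketch of the nfs_vnops.c
hunk further down; the attribute loading and RPC reply parsing are
omitted, and the locals are the ones already declared in
nfs_readdirplusrpc_uio()):

	nch = cache_nlookup_nonblock(&dnch, &nlc);
	if (nch.ncp == NULL)
		goto rdfail;	/* namecache chain or parent contended */
	cache_setunresolved(&nch);
	error = nfs_nget_nonblock(vp->v_mount, fhp, fhsize, &np);
	if (error) {
		cache_put(&nch);
		goto rdfail;	/* vnode lock contended */
	}
	/* ... load the returned attributes into the new vnode ... */
	nfs_cache_setvp(&nch, NFSTOV(np), nfspos_cache_timeout);
	vput(NFSTOV(np));
	cache_put(&nch);

Either failure simply skips the advisory namecache entry for that
directory element; the readdir output itself is unaffected.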
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index e922af0303..7bdc517de9 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -2279,7 +2279,7 @@ restart:
 	/*
 	 * WARNING! We still hold the spinlock. We have to set the hash
-	 * table entry attomically.
+	 * table entry atomically.
 	 */
 	ncp = new_ncp;
 	_cache_link_parent(ncp, par_nch->ncp, nchpp);
@@ -2302,6 +2302,132 @@ found:
 	return(nch);
 }
 
+/*
+ * This is a non-blocking version of cache_nlookup() used by
+ * nfs_readdirplusrpc_uio().  It can fail for any reason and
+ * will return nch.ncp == NULL in that case.
+ */
+struct nchandle
+cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
+{
+	struct nchandle nch;
+	struct namecache *ncp;
+	struct namecache *new_ncp;
+	struct nchash_head *nchpp;
+	struct mount *mp;
+	u_int32_t hash;
+	globaldata_t gd;
+	int par_locked;
+
+	numcalls++;
+	gd = mycpu;
+	mp = par_nch->mount;
+	par_locked = 0;
+
+	/*
+	 * Try to locate an existing entry
+	 */
+	hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
+	hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
+	new_ncp = NULL;
+	nchpp = NCHHASH(hash);
+restart:
+	spin_lock_wr(&nchpp->spin);
+	LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
+		numchecks++;
+
+		/*
+		 * Break out if we find a matching entry.  Note that
+		 * UNRESOLVED entries may match, but DESTROYED entries
+		 * do not.
+		 */
+		if (ncp->nc_parent == par_nch->ncp &&
+		    ncp->nc_nlen == nlc->nlc_namelen &&
+		    bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
+		    (ncp->nc_flag & NCF_DESTROYED) == 0
+		) {
+			_cache_hold(ncp);
+			spin_unlock_wr(&nchpp->spin);
+			if (par_locked) {
+				_cache_unlock(par_nch->ncp);
+				par_locked = 0;
+			}
+			if (_cache_lock_special(ncp) == 0) {
+				_cache_auto_unresolve(mp, ncp);
+				if (new_ncp) {
+					_cache_free(new_ncp);
+					new_ncp = NULL;
+				}
+				goto found;
+			}
+			_cache_drop(ncp);
+			goto failed;
+		}
+	}
+
+	/*
+	 * We failed to locate an entry, create a new entry and add it to
+	 * the cache.  The parent ncp must also be locked so we
+	 * can link into it.
+	 *
+	 * We have to relookup after possibly blocking in kmalloc or
+	 * when locking par_nch.
+	 *
+	 * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
+	 *	 mount case, in which case nc_name will be NULL.
+	 */
+	if (new_ncp == NULL) {
+		spin_unlock_wr(&nchpp->spin);
+		new_ncp = cache_alloc(nlc->nlc_namelen);
+		if (nlc->nlc_namelen) {
+			bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
+			      nlc->nlc_namelen);
+			new_ncp->nc_name[nlc->nlc_namelen] = 0;
+		}
+		goto restart;
+	}
+	if (par_locked == 0) {
+		spin_unlock_wr(&nchpp->spin);
+		if (_cache_lock_nonblock(par_nch->ncp) == 0) {
+			par_locked = 1;
+			goto restart;
+		}
+		goto failed;
+	}
+
+	/*
+	 * WARNING! We still hold the spinlock. We have to set the hash
+	 * table entry atomically.
+	 */
+	ncp = new_ncp;
+	_cache_link_parent(ncp, par_nch->ncp, nchpp);
+	spin_unlock_wr(&nchpp->spin);
+	_cache_unlock(par_nch->ncp);
+	/* par_locked = 0 - not used */
+found:
+	/*
+	 * stats and namecache size management
+	 */
+	if (ncp->nc_flag & NCF_UNRESOLVED)
+		++gd->gd_nchstats->ncs_miss;
+	else if (ncp->nc_vp)
+		++gd->gd_nchstats->ncs_goodhits;
+	else
+		++gd->gd_nchstats->ncs_neghits;
+	nch.mount = mp;
+	nch.ncp = ncp;
+	atomic_add_int(&nch.mount->mnt_refs, 1);
+	return(nch);
+failed:
+	if (new_ncp) {
+		_cache_free(new_ncp);
+		new_ncp = NULL;
+	}
+	nch.mount = NULL;
+	nch.ncp = NULL;
+	return(nch);
+}
+
 /*
  * The namecache entry is marked as being used as a mount point.
  * Locate the mount if it is visible to the caller.
diff --git a/sys/sys/namecache.h b/sys/sys/namecache.h
index 4c5e893fdb..3efbddd14a 100644
--- a/sys/sys/namecache.h
+++ b/sys/sys/namecache.h
@@ -230,6 +230,8 @@ void cache_settimeout(struct nchandle *nch, int nticks);
 void	cache_setunresolved(struct nchandle *nch);
 void	cache_clrmountpt(struct nchandle *nch);
 struct nchandle cache_nlookup(struct nchandle *nch, struct nlcomponent *nlc);
+struct nchandle cache_nlookup_nonblock(struct nchandle *nch,
+				struct nlcomponent *nlc);
 void	cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp);
 struct mount *cache_findmount(struct nchandle *nch);
 int	cache_inval(struct nchandle *nch, int flags);
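The caller-side contract of the new lookup, as a minimal sketch
(par_nch is assumed held and referenced by the caller; the variable
names are illustrative):

	struct nchandle nch;

	nch = cache_nlookup_nonblock(&par_nch, &nlc);
	if (nch.ncp == NULL) {
		/* contended or raced; nothing is left held or locked */
	} else {
		/* locked + referenced entry; release with cache_put() */
		cache_put(&nch);
	}

Unlike cache_nlookup(), which loops until it wins the locks, this
variant gives up whenever _cache_lock_special() or
_cache_lock_nonblock() fails, leaving it to the caller to degrade
gracefully.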
diff --git a/sys/vfs/nfs/nfs_node.c b/sys/vfs/nfs/nfs_node.c
index 78952a31d6..281101e9f5 100644
--- a/sys/vfs/nfs/nfs_node.c
+++ b/sys/vfs/nfs/nfs_node.c
@@ -182,8 +182,9 @@ loop:
 	LIST_INSERT_HEAD(nhpp, np, n_hash);
 	if (fhsize > NFS_SMALLFH) {
 		MALLOC(np->n_fhp, nfsfh_t *, fhsize, M_NFSBIGFH, M_WAITOK);
-	} else
+	} else {
 		np->n_fhp = &np->n_fh;
+	}
 	bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
 	np->n_fhsize = fhsize;
 	lockinit(&np->n_rslock, "nfrslk", 0, lkflags);
@@ -192,7 +193,6 @@ loop:
 	 * nvp is locked & refd so effectively so is np.
 	 */
 	*npp = np;
-
 	if (nfs_node_hash_lock < 0)
 		wakeup(&nfs_node_hash_lock);
 	nfs_node_hash_lock = 0;
@@ -200,6 +200,139 @@ loop:
 	return (0);
 }
 
+/*
+ * Nonblocking version of nfs_nget()
+ */
+int
+nfs_nget_nonblock(struct mount *mntp, nfsfh_t *fhp, int fhsize,
+		  struct nfsnode **npp)
+{
+	struct nfsnode *np, *np2;
+	struct nfsnodehashhead *nhpp;
+	struct vnode *vp;
+	struct vnode *nvp;
+	int error;
+	int lkflags;
+	struct nfsmount *nmp;
+
+	/*
+	 * Calculate nfs mount point and figure out whether the rslock should
+	 * be interruptible or not.
+	 */
+	nmp = VFSTONFS(mntp);
+	if (nmp->nm_flag & NFSMNT_INT)
+		lkflags = LK_PCATCH;
+	else
+		lkflags = 0;
+	vp = NULL;
+	*npp = NULL;
+retry:
+	nhpp = NFSNOHASH(fnv_32_buf(fhp->fh_bytes, fhsize, FNV1_32_INIT));
+loop:
+	for (np = nhpp->lh_first; np; np = np->n_hash.le_next) {
+		if (mntp != NFSTOV(np)->v_mount || np->n_fhsize != fhsize ||
+		    bcmp((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize)) {
+			continue;
+		}
+		if (vp == NULL) {
+			vp = NFSTOV(np);
+			if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT)) {
+				error = EWOULDBLOCK;
+				goto fail;
+			}
+			goto loop;
+		}
+		if (NFSTOV(np) != vp) {
+			vput(vp);
+			vp = NULL;
+			goto loop;
+		}
+		*npp = np;
+		return(0);
+	}
+
+	/*
+	 * Not found.  If we raced and had acquired a vp we have to release
+	 * it here.
+	 */
+	if (vp) {
+		vput(vp);
+		vp = NULL;
+	}
+
+	/*
+	 * Obtain a lock to prevent a race condition if the getnewvnode()
+	 * or MALLOC() below happens to block.
+	 */
+	if (nfs_node_hash_lock) {
+		while (nfs_node_hash_lock) {
+			nfs_node_hash_lock = -1;
+			tsleep(&nfs_node_hash_lock, 0, "nfsngt", 0);
+		}
+		goto loop;
+	}
+	nfs_node_hash_lock = 1;
+
+	/*
+	 * Entry not found, allocate a new entry.
+	 *
+	 * Allocate before getnewvnode since doing so afterward
+	 * might cause a bogus v_data pointer to get dereferenced
+	 * elsewhere if zalloc should block.
+	 */
+	np = zalloc(nfsnode_zone);
+
+	error = getnewvnode(VT_NFS, mntp, &nvp, 0, 0);
+	if (error) {
+		if (nfs_node_hash_lock < 0)
+			wakeup(&nfs_node_hash_lock);
+		nfs_node_hash_lock = 0;
+		zfree(nfsnode_zone, np);
+		return (error);
+	}
+	vp = nvp;
+	bzero(np, sizeof (*np));
+	np->n_vnode = vp;
+	vp->v_data = np;
+
+	/*
+	 * Insert the nfsnode in the hash queue for its new file handle.
+	 * If someone raced us we free np and vp and try again.
+	 */
+	for (np2 = nhpp->lh_first; np2 != 0; np2 = np2->n_hash.le_next) {
+		if (mntp != NFSTOV(np2)->v_mount || np2->n_fhsize != fhsize ||
+		    bcmp((caddr_t)fhp, (caddr_t)np2->n_fhp, fhsize)) {
+			continue;
+		}
+		vx_put(vp);
+		if (nfs_node_hash_lock < 0)
+			wakeup(&nfs_node_hash_lock);
+		nfs_node_hash_lock = 0;
+		zfree(nfsnode_zone, np);
+		goto retry;
+	}
+	LIST_INSERT_HEAD(nhpp, np, n_hash);
+	if (fhsize > NFS_SMALLFH) {
+		MALLOC(np->n_fhp, nfsfh_t *, fhsize, M_NFSBIGFH, M_WAITOK);
+	} else {
+		np->n_fhp = &np->n_fh;
+	}
+	bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
+	np->n_fhsize = fhsize;
+	lockinit(&np->n_rslock, "nfrslk", 0, lkflags);
+
+	/*
+	 * nvp is locked & refd so effectively so is np.
+	 */
+	*npp = np;
+	error = 0;
+	if (nfs_node_hash_lock < 0)
+		wakeup(&nfs_node_hash_lock);
+	nfs_node_hash_lock = 0;
+fail:
+	return (error);
+}
+
 /*
  * nfs_inactive(struct vnode *a_vp)
  *
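The essential non-blocking step in nfs_nget_nonblock() is the vget()
call: where nfs_nget() would sleep on a contended vnode lock, the
LK_NOWAIT form fails fast.  Excerpted from the hunk above, with
comments added:

	vp = NFSTOV(np);
	if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT)) {
		/* vnode lock held elsewhere; report rather than sleep */
		error = EWOULDBLOCK;
		goto fail;
	}

Note that the nfs_node_hash_lock handshake is retained from nfs_nget()
and can still tsleep(), but only around the getnewvnode()/zalloc()
allocation path, not while waiting on a contended vnode lock.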
diff --git a/sys/vfs/nfs/nfs_vnops.c b/sys/vfs/nfs/nfs_vnops.c
index dba0935db7..dcd47e4c78 100644
--- a/sys/vfs/nfs/nfs_vnops.c
+++ b/sys/vfs/nfs/nfs_vnops.c
@@ -2505,7 +2505,6 @@ nfs_readdirplusrpc_uio(struct vnode *vp, struct uio *uiop)
 	u_quad_t fileno;
 	int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i;
 	int attrflag, fhsize;
-	int vpls;
 	struct nchandle nch;
 	struct nchandle dnch;
 	struct nlcomponent nlc;
@@ -2666,31 +2665,33 @@
 				 * hold the directory vp locked while
 				 * doing lookups and gets.
 				 */
-				vpls = vn_islocked_unlock(vp);
-				nch = cache_nlookup(&dnch, &nlc);
+				nch = cache_nlookup_nonblock(&dnch, &nlc);
+				if (nch.ncp == NULL)
+					goto rdfail;
 				cache_setunresolved(&nch);
-				error = nfs_nget(vp->v_mount, fhp,
-						 fhsize, &np);
-				vn_islocked_relock(vp, vpls);
-				if (error == 0) {
-					newvp = NFSTOV(np);
-					dpossav2 = info.dpos;
-					info.dpos = dpossav1;
-					mdsav2 = info.md;
-					info.md = mdsav1;
-					ERROROUT(nfsm_loadattr(&info, newvp,
-							       NULL));
-					info.dpos = dpossav2;
-					info.md = mdsav2;
-					dp->nfs_type =
-					    IFTODT(VTTOIF(np->n_vattr.va_type));
-					nfs_cache_setvp(&nch, newvp,
-							nfspos_cache_timeout);
-					vput(newvp);
-					newvp = NULLVP;
+				error = nfs_nget_nonblock(vp->v_mount, fhp,
+							  fhsize, &np);
+				if (error) {
+					cache_put(&nch);
+					goto rdfail;
 				}
+				newvp = NFSTOV(np);
+				dpossav2 = info.dpos;
+				info.dpos = dpossav1;
+				mdsav2 = info.md;
+				info.md = mdsav1;
+				ERROROUT(nfsm_loadattr(&info, newvp, NULL));
+				info.dpos = dpossav2;
+				info.md = mdsav2;
+				dp->nfs_type =
+				    IFTODT(VTTOIF(np->n_vattr.va_type));
+				nfs_cache_setvp(&nch, newvp,
+						nfspos_cache_timeout);
+				vput(newvp);
+				newvp = NULLVP;
 				cache_put(&nch);
 			} else {
+rdfail:
 				kprintf("Warning: NFS/rddirplus, "
 					"UNABLE TO ENTER %*.*s\n",
 					nlc.nlc_namelen, nlc.nlc_namelen,
diff --git a/sys/vfs/nfs/nfsnode.h b/sys/vfs/nfs/nfsnode.h
index 08324f3cf3..11157e2bbb 100644
--- a/sys/vfs/nfs/nfsnode.h
+++ b/sys/vfs/nfs/nfsnode.h
@@ -227,6 +227,7 @@ int nfs_flush (struct vnode *, int, struct thread *, int);
 /* other stuff */
 int	nfs_removeit (struct sillyrename *);
 int	nfs_nget (struct mount *,nfsfh_t *,int,struct nfsnode **);
+int	nfs_nget_nonblock (struct mount *,nfsfh_t *,int,struct nfsnode **);
 nfsuint64 *nfs_getcookie (struct nfsnode *, off_t, int);
 void nfs_invaldir (struct vnode *);
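Taken together, the new entry points mirror their blocking counterparts
one for one, so a caller can be converted mechanically (prototypes as
declared above; the failure indications follow the new code):

	/* blocking: may sleep on namecache or vnode locks */
	struct nchandle cache_nlookup(struct nchandle *nch, struct nlcomponent *nlc);
	int nfs_nget(struct mount *, nfsfh_t *, int, struct nfsnode **);

	/* fail-fast: nch.ncp == NULL, respectively EWOULDBLOCK, on contention */
	struct nchandle cache_nlookup_nonblock(struct nchandle *nch, struct nlcomponent *nlc);
	int nfs_nget_nonblock(struct mount *, nfsfh_t *, int, struct nfsnode **);

Callers that cannot tolerate failure should keep using the blocking
versions; the non-blocking pair is only appropriate where, as with
readdirplus, the result is advisory.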