kernel - More readdirplus deadlock avoidance work
authorMatthew Dillon <dillon@apollo.backplane.com>
Sat, 13 Feb 2010 17:23:33 +0000 (09:23 -0800)
committerMatthew Dillon <dillon@apollo.backplane.com>
Sat, 13 Feb 2010 17:26:46 +0000 (09:26 -0800)
* Add cache_nlookup_nonblock() and nfs_nget_nonblock()

* Adjust the readdirplus code to use the new functions.   Basically
  there are too many locks being held by callers and then readdirplus
  goes and tries to acquire (and lock) nodes for the directory elements
  on top of that, leading to deadlocks.

  Attempts to order the locks have failed so now we just use a
  non-blocking approach since the readdirplus entries are all
  advisory anyway.

sys/kern/vfs_cache.c
sys/sys/namecache.h
sys/vfs/nfs/nfs_node.c
sys/vfs/nfs/nfs_vnops.c
sys/vfs/nfs/nfsnode.h

index e922af0..7bdc517 100644 (file)
@@ -2279,7 +2279,7 @@ restart:
 
        /*
         * WARNING!  We still hold the spinlock.  We have to set the hash
-        *           table entry attomically.
+        *           table entry atomically.
         */
        ncp = new_ncp;
        _cache_link_parent(ncp, par_nch->ncp, nchpp);
@@ -2302,6 +2302,132 @@ found:
        return(nch);
 }
 
+/*
+ * This is a non-blocking verison of cache_nlookup() used by
+ * nfs_readdirplusrpc_uio().  It can fail for any reason and
+ * will return nch.ncp == NULL in that case.
+ */
+struct nchandle
+cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
+{
+       struct nchandle nch;
+       struct namecache *ncp;
+       struct namecache *new_ncp;
+       struct nchash_head *nchpp;
+       struct mount *mp;
+       u_int32_t hash;
+       globaldata_t gd;
+       int par_locked;
+
+       numcalls++;
+       gd = mycpu;
+       mp = par_nch->mount;
+       par_locked = 0;
+
+       /*
+        * Try to locate an existing entry
+        */
+       hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
+       hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
+       new_ncp = NULL;
+       nchpp = NCHHASH(hash);
+restart:
+       spin_lock_wr(&nchpp->spin);
+       LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
+               numchecks++;
+
+               /*
+                * Break out if we find a matching entry.  Note that
+                * UNRESOLVED entries may match, but DESTROYED entries
+                * do not.
+                */
+               if (ncp->nc_parent == par_nch->ncp &&
+                   ncp->nc_nlen == nlc->nlc_namelen &&
+                   bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
+                   (ncp->nc_flag & NCF_DESTROYED) == 0
+               ) {
+                       _cache_hold(ncp);
+                       spin_unlock_wr(&nchpp->spin);
+                       if (par_locked) {
+                               _cache_unlock(par_nch->ncp);
+                               par_locked = 0;
+                       }
+                       if (_cache_lock_special(ncp) == 0) {
+                               _cache_auto_unresolve(mp, ncp);
+                               if (new_ncp) {
+                                       _cache_free(new_ncp);
+                                       new_ncp = NULL;
+                               }
+                               goto found;
+                       }
+                       _cache_drop(ncp);
+                       goto failed;
+               }
+       }
+
+       /*
+        * We failed to locate an entry, create a new entry and add it to
+        * the cache.  The parent ncp must also be locked so we
+        * can link into it.
+        *
+        * We have to relookup after possibly blocking in kmalloc or
+        * when locking par_nch.
+        *
+        * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
+        *       mount case, in which case nc_name will be NULL.
+        */
+       if (new_ncp == NULL) {
+               spin_unlock_wr(&nchpp->spin);
+               new_ncp = cache_alloc(nlc->nlc_namelen);
+               if (nlc->nlc_namelen) {
+                       bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
+                             nlc->nlc_namelen);
+                       new_ncp->nc_name[nlc->nlc_namelen] = 0;
+               }
+               goto restart;
+       }
+       if (par_locked == 0) {
+               spin_unlock_wr(&nchpp->spin);
+               if (_cache_lock_nonblock(par_nch->ncp) == 0) {
+                       par_locked = 1;
+                       goto restart;
+               }
+               goto failed;
+       }
+
+       /*
+        * WARNING!  We still hold the spinlock.  We have to set the hash
+        *           table entry atomically.
+        */
+       ncp = new_ncp;
+       _cache_link_parent(ncp, par_nch->ncp, nchpp);
+       spin_unlock_wr(&nchpp->spin);
+       _cache_unlock(par_nch->ncp);
+       /* par_locked = 0 - not used */
+found:
+       /*
+        * stats and namecache size management
+        */
+       if (ncp->nc_flag & NCF_UNRESOLVED)
+               ++gd->gd_nchstats->ncs_miss;
+       else if (ncp->nc_vp)
+               ++gd->gd_nchstats->ncs_goodhits;
+       else
+               ++gd->gd_nchstats->ncs_neghits;
+       nch.mount = mp;
+       nch.ncp = ncp;
+       atomic_add_int(&nch.mount->mnt_refs, 1);
+       return(nch);
+failed:
+       if (new_ncp) {
+               _cache_free(new_ncp);
+               new_ncp = NULL;
+       }
+       nch.mount = NULL;
+       nch.ncp = NULL;
+       return(nch);
+}
+
 /*
  * The namecache entry is marked as being used as a mount point. 
  * Locate the mount if it is visible to the caller.
index 4c5e893..3efbddd 100644 (file)
@@ -230,6 +230,8 @@ void        cache_settimeout(struct nchandle *nch, int nticks);
 void   cache_setunresolved(struct nchandle *nch);
 void   cache_clrmountpt(struct nchandle *nch);
 struct nchandle cache_nlookup(struct nchandle *nch, struct nlcomponent *nlc);
+struct nchandle cache_nlookup_nonblock(struct nchandle *nch,
+                       struct nlcomponent *nlc);
 void   cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp);
 struct mount *cache_findmount(struct nchandle *nch);
 int    cache_inval(struct nchandle *nch, int flags);
index 78952a3..281101e 100644 (file)
@@ -182,8 +182,9 @@ loop:
        LIST_INSERT_HEAD(nhpp, np, n_hash);
        if (fhsize > NFS_SMALLFH) {
                MALLOC(np->n_fhp, nfsfh_t *, fhsize, M_NFSBIGFH, M_WAITOK);
-       } else
+       } else {
                np->n_fhp = &np->n_fh;
+       }
        bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
        np->n_fhsize = fhsize;
        lockinit(&np->n_rslock, "nfrslk", 0, lkflags);
@@ -192,7 +193,6 @@ loop:
         * nvp is locked & refd so effectively so is np.
         */
        *npp = np;
-
        if (nfs_node_hash_lock < 0)
                wakeup(&nfs_node_hash_lock);
        nfs_node_hash_lock = 0;
@@ -200,6 +200,139 @@ loop:
        return (0);
 }
 
+/*
+ * Nonblocking version of nfs_nget()
+ */
+int
+nfs_nget_nonblock(struct mount *mntp, nfsfh_t *fhp, int fhsize,
+                 struct nfsnode **npp)
+{
+       struct nfsnode *np, *np2;
+       struct nfsnodehashhead *nhpp;
+       struct vnode *vp;
+       struct vnode *nvp;
+       int error;
+       int lkflags;
+       struct nfsmount *nmp;
+
+       /*
+        * Calculate nfs mount point and figure out whether the rslock should
+        * be interruptable or not.
+        */
+       nmp = VFSTONFS(mntp);
+       if (nmp->nm_flag & NFSMNT_INT)
+               lkflags = LK_PCATCH;
+       else
+               lkflags = 0;
+       vp = NULL;
+       *npp = NULL;
+retry:
+       nhpp = NFSNOHASH(fnv_32_buf(fhp->fh_bytes, fhsize, FNV1_32_INIT));
+loop:
+       for (np = nhpp->lh_first; np; np = np->n_hash.le_next) {
+               if (mntp != NFSTOV(np)->v_mount || np->n_fhsize != fhsize ||
+                   bcmp((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize)) {
+                       continue;
+               }
+               if (vp == NULL) {
+                       vp = NFSTOV(np);
+                       if (vget(vp, LK_EXCLUSIVE | LK_NOWAIT)) {
+                               error = EWOULDBLOCK;
+                               goto fail;
+                       }
+                       goto loop;
+               }
+               if (NFSTOV(np) != vp) {
+                       vput(vp);
+                       vp = NULL;
+                       goto loop;
+               }
+               *npp = np;
+               return(0);
+       }
+
+       /*
+        * Not found.  If we raced and had acquired a vp we have to release
+        * it here.
+        */
+       if (vp) {
+               vput(vp);
+               vp = NULL;
+       }
+
+       /*
+        * Obtain a lock to prevent a race condition if the getnewvnode()
+        * or MALLOC() below happens to block.
+        */
+       if (nfs_node_hash_lock) {
+               while (nfs_node_hash_lock) {
+                       nfs_node_hash_lock = -1;
+                       tsleep(&nfs_node_hash_lock, 0, "nfsngt", 0);
+               }
+               goto loop;
+       }
+       nfs_node_hash_lock = 1;
+
+       /*
+        * Entry not found, allocate a new entry.
+        *
+        * Allocate before getnewvnode since doing so afterward
+        * might cause a bogus v_data pointer to get dereferenced
+        * elsewhere if zalloc should block.
+        */
+       np = zalloc(nfsnode_zone);
+
+       error = getnewvnode(VT_NFS, mntp, &nvp, 0, 0);
+       if (error) {
+               if (nfs_node_hash_lock < 0)
+                       wakeup(&nfs_node_hash_lock);
+               nfs_node_hash_lock = 0;
+               zfree(nfsnode_zone, np);
+               return (error);
+       }
+       vp = nvp;
+       bzero(np, sizeof (*np));
+       np->n_vnode = vp;
+       vp->v_data = np;
+
+       /*
+        * Insert the nfsnode in the hash queue for its new file handle.
+        * If someone raced us we free np and vp and try again.
+        */
+       for (np2 = nhpp->lh_first; np2 != 0; np2 = np2->n_hash.le_next) {
+               if (mntp != NFSTOV(np2)->v_mount || np2->n_fhsize != fhsize ||
+                   bcmp((caddr_t)fhp, (caddr_t)np2->n_fhp, fhsize)) {
+                       continue;
+               }
+               vx_put(vp);
+               if (nfs_node_hash_lock < 0)
+                       wakeup(&nfs_node_hash_lock);
+               nfs_node_hash_lock = 0;
+               zfree(nfsnode_zone, np);
+               goto retry;
+       }
+       LIST_INSERT_HEAD(nhpp, np, n_hash);
+       if (fhsize > NFS_SMALLFH) {
+               MALLOC(np->n_fhp, nfsfh_t *, fhsize, M_NFSBIGFH, M_WAITOK);
+       } else {
+               np->n_fhp = &np->n_fh;
+       }
+       bcopy((caddr_t)fhp, (caddr_t)np->n_fhp, fhsize);
+       np->n_fhsize = fhsize;
+       lockinit(&np->n_rslock, "nfrslk", 0, lkflags);
+
+       /*
+        * nvp is locked & refd so effectively so is np.
+        */
+       *npp = np;
+       error = 0;
+       if (nfs_node_hash_lock < 0)
+               wakeup(&nfs_node_hash_lock);
+       nfs_node_hash_lock = 0;
+fail:
+       return (error);
+}
+
 /*
  * nfs_inactive(struct vnode *a_vp)
  *
index dba0935..dcd47e4 100644 (file)
@@ -2505,7 +2505,6 @@ nfs_readdirplusrpc_uio(struct vnode *vp, struct uio *uiop)
        u_quad_t fileno;
        int error = 0, tlen, more_dirs = 1, blksiz = 0, doit, bigenough = 1, i;
        int attrflag, fhsize;
-       int vpls;
        struct nchandle nch;
        struct nchandle dnch;
        struct nlcomponent nlc;
@@ -2666,31 +2665,33 @@ nfs_readdirplusrpc_uio(struct vnode *vp, struct uio *uiop)
                                     * hold the directory vp locked while
                                     * doing lookups and gets.
                                     */
-                                   vpls = vn_islocked_unlock(vp);
-                                   nch = cache_nlookup(&dnch, &nlc);
+                                   nch = cache_nlookup_nonblock(&dnch, &nlc);
+                                   if (nch.ncp == NULL)
+                                       goto rdfail;
                                    cache_setunresolved(&nch);
-                                   error = nfs_nget(vp->v_mount, fhp,
-                                                    fhsize, &np);
-                                   vn_islocked_relock(vp, vpls);
-                                   if (error == 0) {
-                                       newvp = NFSTOV(np);
-                                       dpossav2 = info.dpos;
-                                       info.dpos = dpossav1;
-                                       mdsav2 = info.md;
-                                       info.md = mdsav1;
-                                       ERROROUT(nfsm_loadattr(&info, newvp,
-                                                              NULL));
-                                       info.dpos = dpossav2;
-                                       info.md = mdsav2;
-                                       dp->nfs_type =
-                                           IFTODT(VTTOIF(np->n_vattr.va_type));
-                                       nfs_cache_setvp(&nch, newvp,
-                                                       nfspos_cache_timeout);
-                                       vput(newvp);
-                                       newvp = NULLVP;
+                                   error = nfs_nget_nonblock(vp->v_mount, fhp,
+                                                             fhsize, &np);
+                                   if (error) {
+                                       cache_put(&nch);
+                                       goto rdfail;
                                    }
+                                   newvp = NFSTOV(np);
+                                   dpossav2 = info.dpos;
+                                   info.dpos = dpossav1;
+                                   mdsav2 = info.md;
+                                   info.md = mdsav1;
+                                   ERROROUT(nfsm_loadattr(&info, newvp, NULL));
+                                   info.dpos = dpossav2;
+                                   info.md = mdsav2;
+                                   dp->nfs_type =
+                                           IFTODT(VTTOIF(np->n_vattr.va_type));
+                                   nfs_cache_setvp(&nch, newvp,
+                                                   nfspos_cache_timeout);
+                                   vput(newvp);
+                                   newvp = NULLVP;
                                    cache_put(&nch);
                                } else {
+rdfail:
                                    kprintf("Warning: NFS/rddirplus, "
                                            "UNABLE TO ENTER %*.*s\n",
                                        nlc.nlc_namelen, nlc.nlc_namelen,
index 08324f3..11157e2 100644 (file)
@@ -227,6 +227,7 @@ int nfs_flush (struct vnode *, int, struct thread *, int);
 /* other stuff */
 int    nfs_removeit (struct sillyrename *);
 int    nfs_nget (struct mount *,nfsfh_t *,int,struct nfsnode **);
+int    nfs_nget_nonblock (struct mount *,nfsfh_t *,int,struct nfsnode **);
 nfsuint64 *nfs_getcookie (struct nfsnode *, off_t, int);
 void   nfs_invaldir (struct vnode *);