sys/kern/vfs_cache.c

   1 /*
   2  * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1989, 1993, 1995
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * This code is derived from software contributed to Berkeley by
  38  * Poul-Henning Kamp of the FreeBSD Project.
  39  *
  40  * Redistribution and use in source and binary forms, with or without
  41  * modification, are permitted provided that the following conditions
  42  * are met:
  43  * 1. Redistributions of source code must retain the above copyright
  44  *    notice, this list of conditions and the following disclaimer.
  45  * 2. Redistributions in binary form must reproduce the above copyright
  46  *    notice, this list of conditions and the following disclaimer in the
  47  *    documentation and/or other materials provided with the distribution.
  48  * 3. Neither the name of the University nor the names of its contributors
  49  *    may be used to endorse or promote products derived from this software
  50  *    without specific prior written permission.
  51  *
  52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  62  * SUCH DAMAGE.
  63  */
  64
  65 #include <sys/param.h>
  66 #include <sys/systm.h>
  67 #include <sys/uio.h>
  68 #include <sys/kernel.h>
  69 #include <sys/sysctl.h>
  70 #include <sys/mount.h>
  71 #include <sys/vnode.h>
  72 #include <sys/malloc.h>
  73 #include <sys/sysproto.h>
  74 #include <sys/spinlock.h>
  75 #include <sys/proc.h>
  76 #include <sys/nlookup.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/fnv_hash.h>
  79 #include <sys/globaldata.h>
  80 #include <sys/kern_syscall.h>
  81 #include <sys/dirent.h>
  82 #include <ddb/ddb.h>
  83
  84 #include <sys/spinlock2.h>
  85
  86 #define MAX_RECURSION_DEPTH     64
  87
  88 /*
  89  * Random lookups in the cache are accomplished with a hash table using
  90  * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
  91  * but we use the ncp->update counter trick to avoid acquiring any
  92  * contestable spin-locks during a lookup.
  93  *
  94  * Negative entries may exist and correspond to resolved namecache
  95  * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
  96  * will be set if the entry corresponds to a whited-out directory entry
   97  * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
  98  * is locked via pcpu_ncache[n].neg_spin;
  99  *
 100  * MPSAFE RULES:
 101  *
 102  * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 103  *     is applicable to direct lookups via the hash table nchpp or via
 104  *     nc_list (the two are added or removed together).  Removal of the ncp
 105  *     from the hash table drops this reference.  The second is applicable
 106  *     to vp->v_namecache linkages (or negative list linkages), and removal
 107  *     of the ncp from these lists drops this reference.
 108  *
 109  *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 110  *     and must be destroyed.  No other thread should have access to it at
 111  *     this point so it can be safely locked and freed without any deadlock
 112  *     fears.
 113  *
 114  *     The 1->0 transition can occur at almost any juncture and so cache_drop()
 115  *     deals with it directly.
 116  *
 117  * (2) Once the 1->0 transition occurs, the entity that caused the transition
 118  *     will be responsible for destroying the ncp.  The ncp cannot be on any
 119  *     list or hash at this time, or be held by anyone other than the caller
 120  *     responsible for the transition.
 121  *
 122  * (3) A ncp must be locked in order to modify it.
 123  *
 124  * (5) ncp locks are ordered, child-to-parent.  Child first, then parent.
 125  *     This may seem backwards but forward-scans use the hash table and thus
 126  *     can hold the parent unlocked while traversing downward.  Deletions,
 127  *     on the other-hand, tend to propagate bottom-up since the ref on the
  128  *     parent is dropped as the children go away.
 129  *
 130  * (6) Both parent and child must be locked in order to enter the child onto
 131  *     the parent's nc_list.
 132  */
 133
 134 /*
  135  * Structures associated with name caching.
 136  */
 137 #define NCHHASH(hash)           (&nchashtbl[(hash) & nchash])
 138 #define MINNEG                  1024
 139 #define MINPOS                  1024
 140 #define NCMOUNT_NUMCACHE        (16384) /* power of 2 */
 141 #define NCMOUNT_SET             (8)     /* power of 2 */
 142
 143 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 144
 145 TAILQ_HEAD(nchash_list, namecache);
 146
 147 /*
 148  * Don't cachealign, but at least pad to 32 bytes so entries
 149  * don't cross a cache line.
 150  */
 151 struct nchash_head {
 152        struct nchash_list list; /* 16 bytes */
 153        struct spinlock  spin;   /* 8 bytes */
 154        long     pad01;          /* 8 bytes */
 155 };
 156
 157 struct ncmount_cache {
 158         struct spinlock spin;
 159         struct namecache *ncp;
 160         struct mount *mp;
 161         struct mount *mp_target;
 162         int isneg;
 163         int ticks;
 164         int updating;
 165         int unused01;
 166 };
 167
 168 struct pcpu_ncache {
 169         struct spinlock         umount_spin;    /* cache_findmount/interlock */
 170         struct spinlock         neg_spin;       /* for neg_list and neg_count */
 171         struct namecache_list   neg_list;
 172         long                    neg_count;
 173         long                    vfscache_negs;
 174         long                    vfscache_count;
 175         long                    vfscache_leafs;
 176         long                    numdefered;
 177 } __cachealign;
 178
 179 __read_mostly static struct nchash_head *nchashtbl;
 180 __read_mostly static struct pcpu_ncache *pcpu_ncache;
 181 static struct ncmount_cache     ncmount_cache[NCMOUNT_NUMCACHE];
 182
 183 /*
 184  * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 185  * to create the namecache infrastructure leading to a dangling vnode.
 186  *
 187  * 0    Only errors are reported
 188  * 1    Successes are reported
 189  * 2    Successes + the whole directory scan is reported
 190  * 3    Force the directory scan code run as if the parent vnode did not
 191  *      have a namecache record, even if it does have one.
 192  */
 193 __read_mostly static int        ncvp_debug;
 194 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
 195     "Namecache debug level (0-3)");
 196
 197 __read_mostly static u_long nchash;             /* size of hash table */
 198 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
 199     "Size of namecache hash table");
 200
 201 __read_mostly static int ncnegflush = 10;       /* burst for negative flush */
 202 SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
 203     "Batch flush negative entries");
 204
 205 __read_mostly static int ncposflush = 10;       /* burst for positive flush */
 206 SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
 207     "Batch flush positive entries");
 208
 209 __read_mostly static int ncnegfactor = 16;      /* ratio of negative entries */
 210 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
 211     "Ratio of namecache negative entries");
 212
 213 __read_mostly static int nclockwarn;    /* warn on locked entries in ticks */
 214 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
 215     "Warn on locked namecache entries in ticks");
 216
 217 __read_mostly static int ncposlimit;    /* number of cache entries allocated */
 218 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
 219     "Number of cache entries allocated");
 220
 221 __read_mostly static int ncp_shared_lock_disable = 0;
 222 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
 223            &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
 224
 225 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
 226     "sizeof(struct vnode)");
 227 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
 228     "sizeof(struct namecache)");
 229
 230 __read_mostly static int ncmount_cache_enable = 1;
 231 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
 232            &ncmount_cache_enable, 0, "mount point cache");
 233
 234 static __inline void _cache_drop(struct namecache *ncp);
 235 static int cache_resolve_mp(struct mount *mp);
 236 static int cache_findmount_callback(struct mount *mp, void *data);
 237 static void _cache_setunresolved(struct namecache *ncp);
 238 static void _cache_cleanneg(long count);
 239 static void _cache_cleanpos(long count);
 240 static void _cache_cleandefered(void);
 241 static void _cache_unlink(struct namecache *ncp);
 242
 243 /*
 244  * The new name cache statistics (these are rolled up globals and not
 245  * modified in the critical path, see struct pcpu_ncache).
 246  */
 247 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
 248 static long vfscache_negs;
 249 SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
 250     "Number of negative namecache entries");
 251 static long vfscache_count;
 252 SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
 253     "Number of namecaches entries");
 254 static long vfscache_leafs;
 255 SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
 256     "Number of namecaches entries");
 257 static long     numdefered;
 258 SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
 259     "Number of cache entries allocated");
 260
 261
 262 struct nchstats nchstats[SMP_MAXCPU];
 263 /*
 264  * Export VFS cache effectiveness statistics to user-land.
 265  *
 266  * The statistics are left for aggregation to user-land so
 267  * neat things can be achieved, like observing per-CPU cache
 268  * distribution.
 269  */
 270 static int
 271 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 272 {
 273         struct globaldata *gd;
 274         int i, error;
 275
 276         error = 0;
 277         for (i = 0; i < ncpus; ++i) {
 278                 gd = globaldata_find(i);
 279                 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
 280                         sizeof(struct nchstats))))
 281                         break;
 282         }
 283
 284         return (error);
 285 }
 286 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
 287   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
 288
 289 static void cache_zap(struct namecache *ncp);
 290
 291 /*
 292  * Cache mount points and namecache records in order to avoid unnecessary
 293  * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 294  * performance and is particularly important on multi-socket systems to
 295  * reduce cache-line ping-ponging.
 296  *
 297  * Try to keep the pcpu structure within one cache line (~64 bytes).
 298  */
 299 #define MNTCACHE_COUNT  32      /* power of 2, multiple of SET */
 300 #define MNTCACHE_SET    8       /* set associativity */
 301
 302 struct mntcache_elm {
 303         struct namecache *ncp;
 304         struct mount     *mp;
 305         int     ticks;
 306         int     unused01;
 307 };
 308
 309 struct mntcache {
 310         struct mntcache_elm array[MNTCACHE_COUNT];
 311 } __cachealign;
 312
 313 static struct mntcache  pcpu_mntcache[MAXCPU];
 314
 315 static __inline
 316 struct mntcache_elm *
 317 _cache_mntcache_hash(void *ptr)
 318 {
 319         struct mntcache_elm *elm;
 320         int hv;
 321
 322         hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1);
 323         elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)];
 324
 325         return elm;
 326 }
 327
 328 static
 329 void
 330 _cache_mntref(struct mount *mp)
 331 {
 332         struct mntcache_elm *elm;
 333         struct mount *mpr;
 334         int i;
 335
 336         elm = _cache_mntcache_hash(mp);
 337         for (i = 0; i < MNTCACHE_SET; ++i) {
 338                 if (elm->mp == mp) {
 339                         mpr = atomic_swap_ptr((void *)&elm->mp, NULL);
 340                         if (__predict_true(mpr == mp))
 341                                 return;
 342                         if (mpr)
 343                                 atomic_add_int(&mpr->mnt_refs, -1);
 344                 }
 345                 ++elm;
 346         }
 347         atomic_add_int(&mp->mnt_refs, 1);
 348 }
 349
 350 static
 351 void
 352 _cache_mntrel(struct mount *mp)
 353 {
 354         struct mntcache_elm *elm;
 355         struct mntcache_elm *best;
 356         struct mount *mpr;
 357         int delta1;
 358         int delta2;
 359         int i;
 360
 361         elm = _cache_mntcache_hash(mp);
 362         best = elm;
 363         for (i = 0; i < MNTCACHE_SET; ++i) {
 364                 if (elm->mp == NULL) {
 365                         mpr = atomic_swap_ptr((void *)&elm->mp, mp);
 366                         if (__predict_false(mpr != NULL)) {
 367                                 atomic_add_int(&mpr->mnt_refs, -1);
 368                         }
 369                         elm->ticks = ticks;
 370                         return;
 371                 }
 372                 delta1 = ticks - best->ticks;
 373                 delta2 = ticks - elm->ticks;
 374                 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
 375                         best = elm;
 376                 ++elm;
 377         }
 378         mpr = atomic_swap_ptr((void *)&best->mp, mp);
 379         best->ticks = ticks;
 380         if (mpr)
 381                 atomic_add_int(&mpr->mnt_refs, -1);
 382 }
 383
 384 /*
 385  * Clears all cached mount points on all cpus.  This routine should only
 386  * be called when we are waiting for a mount to clear, e.g. so we can
 387  * unmount.
 388  */
 389 void
 390 cache_clearmntcache(struct mount *target __unused)
 391 {
 392         int n;
 393
 394         for (n = 0; n < ncpus; ++n) {
 395                 struct mntcache *cache = &pcpu_mntcache[n];
 396                 struct mntcache_elm *elm;
 397                 struct namecache *ncp;
 398                 struct mount *mp;
 399                 int i;
 400
 401                 for (i = 0; i < MNTCACHE_COUNT; ++i) {
 402                         elm = &cache->array[i];
 403                         if (elm->mp) {
 404                                 mp = atomic_swap_ptr((void *)&elm->mp, NULL);
 405                                 if (mp)
 406                                         atomic_add_int(&mp->mnt_refs, -1);
 407                         }
 408                         if (elm->ncp) {
 409                                 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL);
 410                                 if (ncp)
 411                                         _cache_drop(ncp);
 412                         }
 413                 }
 414         }
 415 }
 416
 417 /*
 418  * Namespace locking.  The caller must already hold a reference to the
 419  * namecache structure in order to lock/unlock it.  The controlling entity
 420  * in a 1->0 transition does not need to lock the ncp to dispose of it,
 421  * as nobody else will have visiblity to it at that point.
 422  *
 423  * Note that holding a locked namecache structure prevents other threads
 424  * from making namespace changes (e.g. deleting or creating), prevents
 425  * vnode association state changes by other threads, and prevents the
 426  * namecache entry from being resolved or unresolved by other threads.
 427  *
 428  * An exclusive lock owner has full authority to associate/disassociate
 429  * vnodes and resolve/unresolve the locked ncp.
 430  *
 431  * A shared lock owner only has authority to acquire the underlying vnode,
 432  * if any.
 433  *
 434  * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 435  * fact (when locking) or cleared prior to unlocking.
 436  *
 437  * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 438  *           or recycled, but it does NOT help you if the vnode had already
 439  *           initiated a recyclement.  If this is important, use cache_get()
 440  *           rather then cache_lock() (and deal with the differences in the
 441  *           way the refs counter is handled).  Or, alternatively, make an
 442  *           unconditional call to cache_validate() or cache_resolve()
 443  *           after cache_lock() returns.
 444  */
 445 static __inline
 446 void
 447 _cache_lock(struct namecache *ncp)
 448 {
 449         int didwarn = 0;
 450         int error;
 451
 452         error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
 453         while (__predict_false(error == EWOULDBLOCK)) {
 454                 if (didwarn == 0) {
 455                         didwarn = ticks - nclockwarn;
 456                         kprintf("[diagnostic] cache_lock: "
 457                                 "%s blocked on %p "
 458                                 "\"%*.*s\"\n",
 459                                 curthread->td_comm, ncp,
 460                                 ncp->nc_nlen, ncp->nc_nlen,
 461                                 ncp->nc_name);
 462                 }
 463                 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK);
 464         }
 465         if (__predict_false(didwarn)) {
 466                 kprintf("[diagnostic] cache_lock: "
 467                         "%s unblocked %*.*s after %d secs\n",
 468                         curthread->td_comm,
 469                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 470                         (int)(ticks - didwarn) / hz);
 471         }
 472 }
 473
 474 /*
 475  * Release a previously acquired lock.
 476  *
 477  * A concurrent shared-lock acquisition or acquisition/release can
 478  * race bit 31 so only drop the ncp if bit 31 was set.
 479  */
 480 static __inline
 481 void
 482 _cache_unlock(struct namecache *ncp)
 483 {
 484         lockmgr(&ncp->nc_lock, LK_RELEASE);
 485 }
 486
 487 /*
 488  * Lock ncp exclusively, non-blocking.  Return 0 on success.
 489  */
 490 static __inline
 491 int
 492 _cache_lock_nonblock(struct namecache *ncp)
 493 {
 494         int error;
 495
 496         error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT);
 497         if (__predict_false(error != 0)) {
 498                 return(EWOULDBLOCK);
 499         }
 500         return 0;
 501 }
 502
 503 /*
 504  * This is a special form of _cache_lock() which only succeeds if
 505  * it can get a pristine, non-recursive lock.  The caller must have
 506  * already ref'd the ncp.
 507  *
 508  * On success the ncp will be locked, on failure it will not.  The
 509  * ref count does not change either way.
 510  *
 511  * We want _cache_lock_special() (on success) to return a definitively
 512  * usable vnode or a definitively unresolved ncp.
 513  */
 514 static __inline
 515 int
 516 _cache_lock_special(struct namecache *ncp)
 517 {
 518         if (_cache_lock_nonblock(ncp) == 0) {
 519                 if (lockmgr_oneexcl(&ncp->nc_lock)) {
 520                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
 521                                 _cache_setunresolved(ncp);
 522                         return 0;
 523                 }
 524                 _cache_unlock(ncp);
 525         }
 526         return EWOULDBLOCK;
 527 }
 528
 529 /*
 530  * Shared lock, guarantees vp held
 531  *
 532  * The shared lock holds vp on the 0->1 transition.  It is possible to race
 533  * another shared lock release, preventing the other release from dropping
 534  * the vnode and clearing bit 31.
 535  *
 536  * If it is not set then we are responsible for setting it, and this
 537  * responsibility does not race with anyone else.
 538  */
 539 static __inline
 540 void
 541 _cache_lock_shared(struct namecache *ncp)
 542 {
 543         int didwarn = 0;
 544         int error;
 545
 546         error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
 547         while (__predict_false(error == EWOULDBLOCK)) {
 548                 if (didwarn == 0) {
 549                         didwarn = ticks - nclockwarn;
 550                         kprintf("[diagnostic] cache_lock_shared: "
 551                                 "%s blocked on %p "
 552                                 "\"%*.*s\"\n",
 553                                 curthread->td_comm, ncp,
 554                                 ncp->nc_nlen, ncp->nc_nlen,
 555                                 ncp->nc_name);
 556                 }
 557                 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
 558         }
 559         if (__predict_false(didwarn)) {
 560                 kprintf("[diagnostic] cache_lock_shared: "
 561                         "%s unblocked %*.*s after %d secs\n",
 562                         curthread->td_comm,
 563                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 564                         (int)(ticks - didwarn) / hz);
 565         }
 566 }
 567
 568 /*
 569  * Shared lock, guarantees vp held.  Non-blocking.  Returns 0 on success
 570  */
 571 static __inline
 572 int
 573 _cache_lock_shared_nonblock(struct namecache *ncp)
 574 {
 575         int error;
 576
 577         error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT);
 578         if (__predict_false(error != 0)) {
 579                 return(EWOULDBLOCK);
 580         }
 581         return 0;
 582 }
 583
 584 /*
 585  * This function tries to get a shared lock but will back-off to an
 586  * exclusive lock if:
 587  *
 588  * (1) Some other thread is trying to obtain an exclusive lock
 589  *     (to prevent the exclusive requester from getting livelocked out
 590  *     by many shared locks).
 591  *
 592  * (2) The current thread already owns an exclusive lock (to avoid
 593  *     deadlocking).
 594  *
 595  * WARNING! On machines with lots of cores we really want to try hard to
 596  *          get a shared lock or concurrent path lookups can chain-react
 597  *          into a very high-latency exclusive lock.
 598  *
 599  *          This is very evident in dsynth's initial scans.
 600  */
 601 static __inline
 602 int
 603 _cache_lock_shared_special(struct namecache *ncp)
 604 {
 605         /*
 606          * Only honor a successful shared lock (returning 0) if there is
 607          * no exclusive request pending and the vnode, if present, is not
 608          * in a reclaimed state.
 609          */
 610         if (_cache_lock_shared_nonblock(ncp) == 0) {
 611                 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) {
 612                         if (ncp->nc_vp == NULL ||
 613                             (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
 614                                 return(0);
 615                         }
 616                 }
 617                 _cache_unlock(ncp);
 618                 return(EWOULDBLOCK);
 619         }
 620
 621         /*
 622          * Non-blocking shared lock failed.  If we already own the exclusive
 623          * lock just acquire another exclusive lock (instead of deadlocking).
 624          * Otherwise acquire a shared lock.
 625          */
 626         if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) {
 627                 _cache_lock(ncp);
 628                 return(0);
 629         }
 630         _cache_lock_shared(ncp);
 631         return(0);
 632 }
 633
 634 static __inline
 635 int
 636 _cache_lockstatus(struct namecache *ncp)
 637 {
 638         int status;
 639
 640         status = lockstatus(&ncp->nc_lock, curthread);
 641         if (status == 0 || status == LK_EXCLOTHER)
 642                 status = -1;
 643         return status;
 644 }
 645
 646 /*
 647  * cache_hold() and cache_drop() prevent the premature deletion of a
 648  * namecache entry but do not prevent operations (such as zapping) on
 649  * that namecache entry.
 650  *
 651  * This routine may only be called from outside this source module if
 652  * nc_refs is already deterministically at least 1, such as being
 653  * associated with e.g. a process, file descriptor, or some other entity.
 654  *
 655  * Only the above situations, similar situations within this module where
 656  * the ref count is deterministically at least 1, or when the ncp is found
 657  * via the nchpp (hash table) lookup, can bump nc_refs.
 658  *
 659  * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 660  * can still be removed from the nc_list, however, as long as the caller
 661  * can acquire its lock (in the wrong order).
 662  *
 663  * This is a rare case where callers are allowed to hold a spinlock,
 664  * so we can't ourselves.
 665  */
 666 static __inline
 667 struct namecache *
 668 _cache_hold(struct namecache *ncp)
 669 {
 670         KKASSERT(ncp->nc_refs > 0);
 671         atomic_add_int(&ncp->nc_refs, 1);
 672
 673         return(ncp);
 674 }
 675
 676 /*
 677  * Drop a cache entry.
 678  *
 679  * The 1->0 transition is special and requires the caller to destroy the
 680  * entry.  It means that the ncp is no longer on a nchpp list (since that
 681  * would mean there was stilla ref).  The ncp could still be on a nc_list
 682  * but will not have any child of its own, again because nc_refs is now 0
 683  * and children would have a ref to their parent.
 684  *
 685  * Once the 1->0 transition is made, nc_refs cannot be incremented again.
 686  */
 687 static __inline
 688 void
 689 _cache_drop(struct namecache *ncp)
 690 {
 691         if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
 692                 /*
 693                  * Executed unlocked (no need to lock on last drop)
 694                  */
 695                 _cache_setunresolved(ncp);
 696
 697                 /*
 698                  * Scrap it.
 699                  */
 700                 ncp->nc_refs = -1;      /* safety */
 701                 if (ncp->nc_name)
 702                         kfree(ncp->nc_name, M_VFSCACHE);
 703                 kfree(ncp, M_VFSCACHE);
 704         }
 705 }
 706
 707 /*
 708  * Link a new namecache entry to its parent and to the hash table.  Be
 709  * careful to avoid races if vhold() blocks in the future.
 710  *
 711  * Both ncp and par must be referenced and locked.  The reference is
 712  * transfered to the nchpp (and, most notably, NOT to the parent list).
 713  *
 714  * NOTE: The hash table spinlock is held across this call, we can't do
 715  *       anything fancy.
 716  */
 717 static void
 718 _cache_link_parent(struct namecache *ncp, struct namecache *par,
 719                    struct nchash_head *nchpp)
 720 {
 721         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
 722
 723         KKASSERT(ncp->nc_parent == NULL);
 724         ncp->nc_parent = par;
 725         ncp->nc_head = nchpp;
 726
 727         /*
 728          * Set inheritance flags.  Note that the parent flags may be
 729          * stale due to getattr potentially not having been run yet
 730          * (it gets run during nlookup()'s).
 731          */
 732         ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
 733         if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
 734                 ncp->nc_flag |= NCF_SF_PNOCACHE;
 735         if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
 736                 ncp->nc_flag |= NCF_UF_PCACHE;
 737
 738         /*
 739          * Add to hash table and parent, adjust accounting
 740          */
 741         TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
 742         atomic_add_long(&pn->vfscache_count, 1);
 743         if (TAILQ_EMPTY(&ncp->nc_list))
 744                 atomic_add_long(&pn->vfscache_leafs, 1);
 745
 746         if (TAILQ_EMPTY(&par->nc_list)) {
 747                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 748                 atomic_add_long(&pn->vfscache_leafs, -1);
 749                 /*
 750                  * Any vp associated with an ncp which has children must
 751                  * be held to prevent it from being recycled.
 752                  */
 753                 if (par->nc_vp)
 754                         vhold(par->nc_vp);
 755         } else {
 756                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 757         }
 758         _cache_hold(par);       /* add nc_parent ref */
 759 }
 760
 761 /*
 762  * Remove the parent and hash associations from a namecache structure.
 763  * Drop the ref-count on the parent.  The caller receives the ref
 764  * from the ncp's nchpp linkage that was removed and may forward that
 765  * ref to a new linkage.
 766
 767  * The caller usually holds an additional ref * on the ncp so the unlink
 768  * cannot be the final drop.  XXX should not be necessary now since the
 769  * caller receives the ref from the nchpp linkage, assuming the ncp
 770  * was linked in the first place.
 771  *
 772  * ncp must be locked, which means that there won't be any nc_parent
 773  * removal races.  This routine will acquire a temporary lock on
 774  * the parent as well as the appropriate hash chain.
 775  */
 776 static void
 777 _cache_unlink_parent(struct namecache *ncp)
 778 {
 779         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
 780         struct namecache *par;
 781         struct vnode *dropvp;
 782         struct nchash_head *nchpp;
 783
 784         if ((par = ncp->nc_parent) != NULL) {
 785                 cpu_ccfence();
 786                 KKASSERT(ncp->nc_parent == par);
 787
 788                 /* don't add a ref, we drop the nchpp ref later */
 789                 _cache_lock(par);
 790                 nchpp = ncp->nc_head;
 791                 spin_lock(&nchpp->spin);
 792
 793                 /*
 794                  * Remove from hash table and parent, adjust accounting
 795                  */
 796                 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
 797                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
 798                 atomic_add_long(&pn->vfscache_count, -1);
 799                 if (TAILQ_EMPTY(&ncp->nc_list))
 800                         atomic_add_long(&pn->vfscache_leafs, -1);
 801
 802                 dropvp = NULL;
 803                 if (TAILQ_EMPTY(&par->nc_list)) {
 804                         atomic_add_long(&pn->vfscache_leafs, 1);
 805                         if (par->nc_vp)
 806                                 dropvp = par->nc_vp;
 807                 }
 808                 ncp->nc_parent = NULL;
 809                 ncp->nc_head = NULL;
 810                 spin_unlock(&nchpp->spin);
 811                 _cache_unlock(par);
 812                 _cache_drop(par);       /* drop nc_parent ref */
 813
 814                 /*
 815                  * We can only safely vdrop with no spinlocks held.
 816                  */
 817                 if (dropvp)
 818                         vdrop(dropvp);
 819         }
 820 }
 821
 822 /*
 823  * Allocate a new namecache structure.  Most of the code does not require
 824  * zero-termination of the string but it makes vop_compat_ncreate() easier.
 825  *
 826  * The returned ncp will be locked and referenced.  The ref is generally meant
 827  * to be transfered to the nchpp linkage.
 828  */
 829 static struct namecache *
 830 cache_alloc(int nlen)
 831 {
 832         struct namecache *ncp;
 833
 834         ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
 835         if (nlen)
 836                 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
 837         ncp->nc_nlen = nlen;
 838         ncp->nc_flag = NCF_UNRESOLVED;
 839         ncp->nc_error = ENOTCONN;       /* needs to be resolved */
 840         ncp->nc_refs = 1;
 841         TAILQ_INIT(&ncp->nc_list);
 842         lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE);
 843         lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
 844
 845         return(ncp);
 846 }
 847
 848 /*
 849  * Can only be called for the case where the ncp has never been
 850  * associated with anything (so no spinlocks are needed).
 851  */
 852 static void
 853 _cache_free(struct namecache *ncp)
 854 {
 855         KKASSERT(ncp->nc_refs == 1);
 856         if (ncp->nc_name)
 857                 kfree(ncp->nc_name, M_VFSCACHE);
 858         kfree(ncp, M_VFSCACHE);
 859 }
 860
 861 /*
 862  * [re]initialize a nchandle.
 863  */
 864 void
 865 cache_zero(struct nchandle *nch)
 866 {
 867         nch->ncp = NULL;
 868         nch->mount = NULL;
 869 }
 870
 871 /*
 872  * Ref and deref a nchandle structure (ncp + mp)
 873  *
 874  * The caller must specify a stable ncp pointer, typically meaning the
 875  * ncp is already referenced but this can also occur indirectly through
 876  * e.g. holding a lock on a direct child.
 877  *
 878  * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 879  *          use read spinlocks here.
 880  */
 881 struct nchandle *
 882 cache_hold(struct nchandle *nch)
 883 {
 884         _cache_hold(nch->ncp);
 885         _cache_mntref(nch->mount);
 886         return(nch);
 887 }
 888
 889 /*
 890  * Create a copy of a namecache handle for an already-referenced
 891  * entry.
 892  */
 893 void
 894 cache_copy(struct nchandle *nch, struct nchandle *target)
 895 {
 896         struct namecache *ncp;
 897         struct mount *mp;
 898         struct mntcache_elm *elm;
 899         struct namecache *ncpr;
 900         int i;
 901
 902         ncp = nch->ncp;
 903         mp = nch->mount;
 904         target->ncp = ncp;
 905         target->mount = mp;
 906
 907         elm = _cache_mntcache_hash(ncp);
 908         for (i = 0; i < MNTCACHE_SET; ++i) {
 909                 if (elm->ncp == ncp) {
 910                         ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL);
 911                         if (ncpr == ncp) {
 912                                 _cache_mntref(mp);
 913                                 return;
 914                         }
 915                         if (ncpr)
 916                                 _cache_drop(ncpr);
 917                 }
 918                 ++elm;
 919         }
 920         if (ncp)
 921                 _cache_hold(ncp);
 922         _cache_mntref(mp);
 923 }
 924
 925 /*
 926  * Drop the nchandle, but try to cache the ref to avoid global atomic
 927  * ops.  This is typically done on the system root and jail root nchandles.
 928  */
 929 void
 930 cache_drop_and_cache(struct nchandle *nch, int elmno)
 931 {
 932         struct mntcache_elm *elm;
 933         struct mntcache_elm *best;
 934         struct namecache *ncpr;
 935         int delta1;
 936         int delta2;
 937         int i;
 938
 939         if (elmno > 4) {
 940                 if (nch->ncp) {
 941                         _cache_drop(nch->ncp);
 942                         nch->ncp = NULL;
 943                 }
 944                 if (nch->mount) {
 945                         _cache_mntrel(nch->mount);
 946                         nch->mount = NULL;
 947                 }
 948                 return;
 949         }
 950
 951         elm = _cache_mntcache_hash(nch->ncp);
 952         best = elm;
 953         for (i = 0; i < MNTCACHE_SET; ++i) {
 954                 if (elm->ncp == NULL) {
 955                         ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp);
 956                         _cache_mntrel(nch->mount);
 957                         elm->ticks = ticks;
 958                         nch->mount = NULL;
 959                         nch->ncp = NULL;
 960                         if (ncpr)
 961                                 _cache_drop(ncpr);
 962                         return;
 963                 }
 964                 delta1 = ticks - best->ticks;
 965                 delta2 = ticks - elm->ticks;
 966                 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
 967                         best = elm;
 968                 ++elm;
 969         }
 970         ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp);
 971         _cache_mntrel(nch->mount);
 972         best->ticks = ticks;
 973         nch->mount = NULL;
 974         nch->ncp = NULL;
 975         if (ncpr)
 976                 _cache_drop(ncpr);
 977 }
 978
 979 void
 980 cache_changemount(struct nchandle *nch, struct mount *mp)
 981 {
 982         _cache_mntref(mp);
 983         _cache_mntrel(nch->mount);
 984         nch->mount = mp;
 985 }
 986
 987 void
 988 cache_drop(struct nchandle *nch)
 989 {
 990         _cache_mntrel(nch->mount);
 991         _cache_drop(nch->ncp);
 992         nch->ncp = NULL;
 993         nch->mount = NULL;
 994 }
 995
 996 int
 997 cache_lockstatus(struct nchandle *nch)
 998 {
 999         return(_cache_lockstatus(nch->ncp));
1000 }
1001
1002 void
1003 cache_lock(struct nchandle *nch)
1004 {
1005         _cache_lock(nch->ncp);
1006 }
1007
1008 void
1009 cache_lock_maybe_shared(struct nchandle *nch, int excl)
1010 {
1011         struct namecache *ncp = nch->ncp;
1012
1013         if (ncp_shared_lock_disable || excl ||
1014             (ncp->nc_flag & NCF_UNRESOLVED)) {
1015                 _cache_lock(ncp);
1016         } else {
1017                 _cache_lock_shared(ncp);
1018                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1019                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1020                                 _cache_unlock(ncp);
1021                                 _cache_lock(ncp);
1022                         }
1023                 } else {
1024                         _cache_unlock(ncp);
1025                         _cache_lock(ncp);
1026                 }
1027         }
1028 }
1029
/*
 * Relock nch1 given an unlocked nch1 and a locked nch2, so that both
 * are locked on return.  The caller is responsible for checking both
 * for validity afterwards as either may have become invalid.
 *
 * To avoid deadlocks the missing lock is first attempted without
 * blocking.  If that fails, the lock we do hold is released, we block
 * on the contended one, and the roles of the two handles are swapped
 * for the next round.  Every round that loops blocks somewhere, so
 * this is not cpu-intensive.
 *
 * cache_resolve() is called on a handle each time its lock is obtained.
 */
void
cache_relock(struct nchandle *nch1, struct ucred *cred1,
             struct nchandle *nch2, struct ucred *cred2)
{
        struct nchandle *want = nch1;   /* handle still to be locked */
        struct nchandle *have = nch2;   /* handle currently locked */
        struct ucred *want_cred = cred1;
        struct ucred *have_cred = cred2;
        struct nchandle *swap_nch;
        struct ucred *swap_cred;

        for (;;) {
                if (cache_lock_nonblock(want) == 0) {
                        cache_resolve(want, want_cred);
                        break;
                }

                /* contended: back off, then block on the one we lack */
                cache_unlock(have);
                cache_lock(want);
                cache_resolve(want, want_cred);

                /* the handle just locked is now the held one */
                swap_nch = want;
                want = have;
                have = swap_nch;
                swap_cred = want_cred;
                want_cred = have_cred;
                have_cred = swap_cred;
        }
}
1072
1073 int
1074 cache_lock_nonblock(struct nchandle *nch)
1075 {
1076         return(_cache_lock_nonblock(nch->ncp));
1077 }
1078
1079 void
1080 cache_unlock(struct nchandle *nch)
1081 {
1082         _cache_unlock(nch->ncp);
1083 }
1084
1085 /*
1086  * ref-and-lock, unlock-and-deref functions.
1087  *
1088  * This function is primarily used by nlookup.  Even though cache_lock
1089  * holds the vnode, it is possible that the vnode may have already
1090  * initiated a recyclement.
1091  *
1092  * We want cache_get() to return a definitively usable vnode or a
1093  * definitively unresolved ncp.
1094  */
1095 static
1096 struct namecache *
1097 _cache_get(struct namecache *ncp)
1098 {
1099         _cache_hold(ncp);
1100         _cache_lock(ncp);
1101         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1102                 _cache_setunresolved(ncp);
1103         return(ncp);
1104 }
1105
1106 /*
1107  * Attempt to obtain a shared lock on the ncp.  A shared lock will only
1108  * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
1109  * valid.  Otherwise an exclusive lock will be acquired instead.
1110  */
1111 static
1112 struct namecache *
1113 _cache_get_maybe_shared(struct namecache *ncp, int excl)
1114 {
1115         if (ncp_shared_lock_disable || excl ||
1116             (ncp->nc_flag & NCF_UNRESOLVED)) {
1117                 return(_cache_get(ncp));
1118         }
1119         _cache_hold(ncp);
1120         _cache_lock_shared(ncp);
1121         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1122                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1123                         _cache_unlock(ncp);
1124                         ncp = _cache_get(ncp);
1125                         _cache_drop(ncp);
1126                 }
1127         } else {
1128                 _cache_unlock(ncp);
1129                 ncp = _cache_get(ncp);
1130                 _cache_drop(ncp);
1131         }
1132         return(ncp);
1133 }
1134
1135 /*
1136  * NOTE: The same nchandle can be passed for both arguments.
1137  */
1138 void
1139 cache_get(struct nchandle *nch, struct nchandle *target)
1140 {
1141         KKASSERT(nch->ncp->nc_refs > 0);
1142         target->mount = nch->mount;
1143         target->ncp = _cache_get(nch->ncp);
1144         _cache_mntref(target->mount);
1145 }
1146
1147 void
1148 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1149 {
1150         KKASSERT(nch->ncp->nc_refs > 0);
1151         target->mount = nch->mount;
1152         target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
1153         _cache_mntref(target->mount);
1154 }
1155
/*
 * Counterpart of _cache_get(): unlock a held and locked ncp, then drop
 * the ref.
 */
static __inline
void
_cache_put(struct namecache *ncp)
{
        _cache_unlock(ncp);     /* lock first ... */
        _cache_drop(ncp);       /* ... then the ref */
}
1166
1167 void
1168 cache_put(struct nchandle *nch)
1169 {
1170         _cache_mntrel(nch->mount);
1171         _cache_put(nch->ncp);
1172         nch->ncp = NULL;
1173         nch->mount = NULL;
1174 }
1175
1176 /*
1177  * Resolve an unresolved ncp by associating a vnode with it.  If the
1178  * vnode is NULL, a negative cache entry is created.
1179  *
1180  * The ncp should be locked on entry and will remain locked on return.
1181  */
1182 static
1183 void
1184 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
1185 {
1186         KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) &&
1187                  (_cache_lockstatus(ncp) == LK_EXCLUSIVE) &&
1188                  ncp->nc_vp == NULL);
1189
1190         if (vp) {
1191                 /*
1192                  * Any vp associated with an ncp which has children must
1193                  * be held.  Any vp associated with a locked ncp must be held.
1194                  */
1195                 if (!TAILQ_EMPTY(&ncp->nc_list))
1196                         vhold(vp);
1197                 spin_lock(&vp->v_spin);
1198                 ncp->nc_vp = vp;
1199                 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
1200                 ++vp->v_namecache_count;
1201                 _cache_hold(ncp);               /* v_namecache assoc */
1202                 spin_unlock(&vp->v_spin);
1203                 vhold(vp);                      /* nc_vp */
1204
1205                 /*
1206                  * Set auxiliary flags
1207                  */
1208                 switch(vp->v_type) {
1209                 case VDIR:
1210                         ncp->nc_flag |= NCF_ISDIR;
1211                         break;
1212                 case VLNK:
1213                         ncp->nc_flag |= NCF_ISSYMLINK;
1214                         /* XXX cache the contents of the symlink */
1215                         break;
1216                 default:
1217                         break;
1218                 }
1219
1220                 ncp->nc_error = 0;
1221
1222                 /*
1223                  * XXX: this is a hack to work-around the lack of a real pfs vfs
1224                  * implementation
1225                  */
1226                 if (mp) {
1227                         if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
1228                                 vp->v_pfsmp = mp;
1229                 }
1230         } else {
1231                 /*
1232                  * When creating a negative cache hit we set the
1233                  * namecache_gen.  A later resolve will clean out the
1234                  * negative cache hit if the mount point's namecache_gen
1235                  * has changed.  Used by devfs, could also be used by
1236                  * other remote FSs.
1237                  */
1238                 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
1239
1240                 ncp->nc_vp = NULL;
1241                 ncp->nc_negcpu = mycpu->gd_cpuid;
1242                 spin_lock(&pn->neg_spin);
1243                 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
1244                 _cache_hold(ncp);       /* neg_list assoc */
1245                 ++pn->neg_count;
1246                 spin_unlock(&pn->neg_spin);
1247                 atomic_add_long(&pn->vfscache_negs, 1);
1248
1249                 ncp->nc_error = ENOENT;
1250                 if (mp)
1251                         VFS_NCPGEN_SET(mp, ncp);
1252         }
1253         ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
1254 }
1255
1256 void
1257 cache_setvp(struct nchandle *nch, struct vnode *vp)
1258 {
1259         _cache_setvp(nch->mount, nch->ncp, vp);
1260 }
1261
1262 /*
1263  * Used for NFS
1264  */
1265 void
1266 cache_settimeout(struct nchandle *nch, int nticks)
1267 {
1268         struct namecache *ncp = nch->ncp;
1269
1270         if ((ncp->nc_timeout = ticks + nticks) == 0)
1271                 ncp->nc_timeout = 1;
1272 }
1273
1274 /*
1275  * Disassociate the vnode or negative-cache association and mark a
1276  * namecache entry as unresolved again.  Note that the ncp is still
1277  * left in the hash table and still linked to its parent.
1278  *
1279  * The ncp should be locked and refd on entry and will remain locked and refd
1280  * on return.
1281  *
1282  * This routine is normally never called on a directory containing children.
1283  * However, NFS often does just that in its rename() code as a cop-out to
1284  * avoid complex namespace operations.  This disconnects a directory vnode
1285  * from its namecache and can cause the OLDAPI and NEWAPI to get out of
1286  * sync.
1287  *
1288  */
1289 static
1290 void
1291 _cache_setunresolved(struct namecache *ncp)
1292 {
1293         struct vnode *vp;
1294
1295         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1296                 ncp->nc_flag |= NCF_UNRESOLVED;
1297                 ncp->nc_timeout = 0;
1298                 ncp->nc_error = ENOTCONN;
1299                 if ((vp = ncp->nc_vp) != NULL) {
1300                         spin_lock(&vp->v_spin);
1301                         ncp->nc_vp = NULL;
1302                         TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
1303                         --vp->v_namecache_count;
1304                         spin_unlock(&vp->v_spin);
1305
1306                         /*
1307                          * Any vp associated with an ncp with children is
1308                          * held by that ncp.  Any vp associated with  ncp
1309                          * is held by that ncp.  These conditions must be
1310                          * undone when the vp is cleared out from the ncp.
1311                          */
1312                         if (!TAILQ_EMPTY(&ncp->nc_list))
1313                                 vdrop(vp);
1314                         vdrop(vp);
1315                 } else {
1316                         struct pcpu_ncache *pn;
1317
1318                         pn = &pcpu_ncache[ncp->nc_negcpu];
1319
1320                         atomic_add_long(&pn->vfscache_negs, -1);
1321                         spin_lock(&pn->neg_spin);
1322                         TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
1323                         --pn->neg_count;
1324                         spin_unlock(&pn->neg_spin);
1325                 }
1326                 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
1327                 _cache_drop(ncp);       /* from v_namecache or neg_list */
1328         }
1329 }
1330
1331 /*
1332  * The cache_nresolve() code calls this function to automatically
1333  * set a resolved cache element to unresolved if it has timed out
1334  * or if it is a negative cache hit and the mount point namecache_gen
1335  * has changed.
1336  */
1337 static __inline int
1338 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
1339 {
1340         /*
1341          * Try to zap entries that have timed out.  We have
1342          * to be careful here because locked leafs may depend
1343          * on the vnode remaining intact in a parent, so only
1344          * do this under very specific conditions.
1345          */
1346         if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
1347             TAILQ_EMPTY(&ncp->nc_list)) {
1348                 return 1;
1349         }
1350
1351         /*
1352          * If a resolved negative cache hit is invalid due to
1353          * the mount's namecache generation being bumped, zap it.
1354          */
1355         if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
1356                 return 1;
1357         }
1358
1359         /*
1360          * Otherwise we are good
1361          */
1362         return 0;
1363 }
1364
1365 static __inline void
1366 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1367 {
1368         /*
1369          * Already in an unresolved state, nothing to do.
1370          */
1371         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1372                 if (_cache_auto_unresolve_test(mp, ncp))
1373                         _cache_setunresolved(ncp);
1374         }
1375 }
1376
1377 void
1378 cache_setunresolved(struct nchandle *nch)
1379 {
1380         _cache_setunresolved(nch->ncp);
1381 }
1382
1383 /*
1384  * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1385  * looking for matches.  This flag tells the lookup code when it must
1386  * check for a mount linkage and also prevents the directories in question
1387  * from being deleted or renamed.
1388  */
1389 static
1390 int
1391 cache_clrmountpt_callback(struct mount *mp, void *data)
1392 {
1393         struct nchandle *nch = data;
1394
1395         if (mp->mnt_ncmounton.ncp == nch->ncp)
1396                 return(1);
1397         if (mp->mnt_ncmountpt.ncp == nch->ncp)
1398                 return(1);
1399         return(0);
1400 }
1401
1402 /*
1403  * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
1404  * with a mount point.
1405  */
1406 void
1407 cache_clrmountpt(struct nchandle *nch)
1408 {
1409         int count;
1410
1411         count = mountlist_scan(cache_clrmountpt_callback, nch,
1412                                MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
1413                                MNTSCAN_NOUNLOCK);
1414         if (count == 0)
1415                 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1416 }
1417
1418 /*
1419  * Invalidate portions of the namecache topology given a starting entry.
1420  * The passed ncp is set to an unresolved state and:
1421  *
1422  * The passed ncp must be referenced and locked.  The routine may unlock
1423  * and relock ncp several times, and will recheck the children and loop
1424  * to catch races.  When done the passed ncp will be returned with the
1425  * reference and lock intact.
1426  *
1427  * CINV_DESTROY         - Set a flag in the passed ncp entry indicating
1428  *                        that the physical underlying nodes have been
1429  *                        destroyed... as in deleted.  For example, when
1430  *                        a directory is removed.  This will cause record
1431  *                        lookups on the name to no longer be able to find
1432  *                        the record and tells the resolver to return failure
1433  *                        rather than trying to resolve through the parent.
1434  *
1435  *                        The topology itself, including ncp->nc_name,
1436  *                        remains intact.
1437  *
1438  *                        This only applies to the passed ncp; even when
1439  *                        CINV_CHILDREN is specified the children are not flagged.
1440  *
1441  * CINV_CHILDREN        - Set all children (recursively) to an unresolved
1442  *                        state as well.
1443  *
1444  *                        Note that this will also have the side effect of
1445  *                        cleaning out any unreferenced nodes in the topology
1446  *                        from the leaves up as the recursion backs out.
1447  *
1448  * Note that the topology for any referenced nodes remains intact, but
1449  * the nodes will be marked as having been destroyed and will be set
1450  * to an unresolved state.
1451  *
1452  * It is possible for cache_inval() to race a cache_resolve(), meaning that
1453  * the namecache entry may not actually be invalidated on return if it was
1454  * revalidated while recursing down into its children.  This code guarantees
1455  * that the node(s) will go through an invalidation cycle, but does not
1456  * guarantee that they will remain in an invalidated state.
1457  *
1458  * Returns non-zero if a revalidation was detected during the invalidation
1459  * recursion, zero otherwise.  Note that since only the original ncp is
1460  * locked the revalidation ultimately can only indicate that the original ncp
1461  * *MIGHT* now have been re-resolved.
1462  *
1463  * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1464  * have to avoid blowing out the kernel stack.  We do this by saving the
1465  * deep namecache node and aborting the recursion, then re-recursing at that
1466  * node using a depth-first algorithm in order to allow multiple deep
1467  * recursions to chain through each other, then we restart the invalidation
1468  * from scratch.
1469  */
1470
/*
 * State threaded through one _cache_inval() pass and its recursion.
 */
struct cinvtrack {
        /* node saved when the recursion got too deep; resumed from later */
        struct namecache *resume_ncp;
        /* number of child scans currently nested in the recursion */
        int depth;
};

static int _cache_inval_internal(struct namecache *ncp, int flags,
                                 struct cinvtrack *track);
1477
1478 static
1479 int
1480 _cache_inval(struct namecache *ncp, int flags)
1481 {
1482         struct cinvtrack track;
1483         struct namecache *ncp2;
1484         int r;
1485
1486         track.depth = 0;
1487         track.resume_ncp = NULL;
1488
1489         for (;;) {
1490                 r = _cache_inval_internal(ncp, flags, &track);
1491                 if (track.resume_ncp == NULL)
1492                         break;
1493                 _cache_unlock(ncp);
1494                 while ((ncp2 = track.resume_ncp) != NULL) {
1495                         track.resume_ncp = NULL;
1496                         _cache_lock(ncp2);
1497                         _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
1498                                              &track);
1499                         /*_cache_put(ncp2);*/
1500                         cache_zap(ncp2);
1501                 }
1502                 _cache_lock(ncp);
1503         }
1504         return(r);
1505 }
1506
1507 int
1508 cache_inval(struct nchandle *nch, int flags)
1509 {
1510         return(_cache_inval(nch->ncp, flags));
1511 }
1512
1513 /*
1514  * Helper for _cache_inval().  The passed ncp is refd and locked and
1515  * remains that way on return, but may be unlocked/relocked multiple
1516  * times by the routine.
1517  */
1518 static int
1519 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
1520 {
1521         struct namecache *nextkid;
1522         int rcnt = 0;
1523
1524         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1525
1526         _cache_setunresolved(ncp);
1527         if (flags & CINV_DESTROY) {
1528                 ncp->nc_flag |= NCF_DESTROYED;
1529                 ++ncp->nc_generation;
1530         }
1531
1532         while ((flags & CINV_CHILDREN) &&
1533                (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
1534         ) {
1535                 struct namecache *kid;
1536                 int restart;
1537
1538                 restart = 0;
1539                 _cache_hold(nextkid);
1540                 if (++track->depth > MAX_RECURSION_DEPTH) {
1541                         track->resume_ncp = ncp;
1542                         _cache_hold(ncp);
1543                         ++rcnt;
1544                 }
1545                 while ((kid = nextkid) != NULL) {
1546                         /*
1547                          * Parent (ncp) must be locked for the iteration.
1548                          */
1549                         nextkid = NULL;
1550                         if (kid->nc_parent != ncp) {
1551                                 _cache_drop(kid);
1552                                 kprintf("cache_inval_internal restartA %s\n",
1553                                         ncp->nc_name);
1554                                 restart = 1;
1555                                 break;
1556                         }
1557                         if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
1558                                 _cache_hold(nextkid);
1559
1560                         /*
1561                          * Parent unlocked for this section to avoid
1562                          * deadlocks.  Then lock the kid and check for
1563                          * races.
1564                          */
1565                         _cache_unlock(ncp);
1566                         if (track->resume_ncp) {
1567                                 _cache_drop(kid);
1568                                 _cache_lock(ncp);
1569                                 break;
1570                         }
1571                         _cache_lock(kid);
1572                         if (kid->nc_parent != ncp) {
1573                                 kprintf("cache_inval_internal "
1574                                         "restartB %s\n",
1575                                         ncp->nc_name);
1576                                 restart = 1;
1577                                 _cache_unlock(kid);
1578                                 _cache_drop(kid);
1579                                 _cache_lock(ncp);
1580                                 break;
1581                         }
1582                         if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
1583                             TAILQ_FIRST(&kid->nc_list)
1584                         ) {
1585
1586                                 rcnt += _cache_inval_internal(kid,
1587                                                 flags & ~CINV_DESTROY, track);
1588                                 /*_cache_unlock(kid);*/
1589                                 /*_cache_drop(kid);*/
1590                                 cache_zap(kid);
1591                         } else {
1592                                 cache_zap(kid);
1593                         }
1594
1595                         /*
1596                          * Relock parent to continue scan
1597                          */
1598                         _cache_lock(ncp);
1599                 }
1600                 if (nextkid)
1601                         _cache_drop(nextkid);
1602                 --track->depth;
1603                 if (restart == 0)
1604                         break;
1605         }
1606
1607         /*
1608          * Someone could have gotten in there while ncp was unlocked,
1609          * retry if so.
1610          */
1611         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
1612                 ++rcnt;
1613         return (rcnt);
1614 }
1615
1616 /*
1617  * Invalidate a vnode's namecache associations.  To avoid races against
1618  * the resolver we do not invalidate a node which we previously invalidated
1619  * but which was then re-resolved while we were in the invalidation loop.
1620  *
1621  * Returns non-zero if any namecache entries remain after the invalidation
1622  * loop completed.
1623  *
1624  * NOTE: Unlike the namecache topology which guarentees that ncp's will not
1625  *       be ripped out of the topology while held, the vnode's v_namecache
1626  *       list has no such restriction.  NCP's can be ripped out of the list
1627  *       at virtually any time if not locked, even if held.
1628  *
1629  *       In addition, the v_namecache list itself must be locked via
1630  *       the vnode's spinlock.
1631  */
1632 int
1633 cache_inval_vp(struct vnode *vp, int flags)
1634 {
1635         struct namecache *ncp;
1636         struct namecache *next;
1637
1638 restart:
1639         spin_lock(&vp->v_spin);
1640         ncp = TAILQ_FIRST(&vp->v_namecache);
1641         if (ncp)
1642                 _cache_hold(ncp);
1643         while (ncp) {
1644                 /* loop entered with ncp held and vp spin-locked */
1645                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1646                         _cache_hold(next);
1647                 spin_unlock(&vp->v_spin);
1648                 _cache_lock(ncp);
1649                 if (ncp->nc_vp != vp) {
1650                         kprintf("Warning: cache_inval_vp: race-A detected on "
1651                                 "%s\n", ncp->nc_name);
1652                         _cache_put(ncp);
1653                         if (next)
1654                                 _cache_drop(next);
1655                         goto restart;
1656                 }
1657                 _cache_inval(ncp, flags);
1658                 _cache_put(ncp);                /* also releases reference */
1659                 ncp = next;
1660                 spin_lock(&vp->v_spin);
1661                 if (ncp && ncp->nc_vp != vp) {
1662                         spin_unlock(&vp->v_spin);
1663                         kprintf("Warning: cache_inval_vp: race-B detected on "
1664                                 "%s\n", ncp->nc_name);
1665                         _cache_drop(ncp);
1666                         goto restart;
1667                 }
1668         }
1669         spin_unlock(&vp->v_spin);
1670         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1671 }
1672
1673 /*
1674  * This routine is used instead of the normal cache_inval_vp() when we
1675  * are trying to recycle otherwise good vnodes.
1676  *
1677  * Return 0 on success, non-zero if not all namecache records could be
1678  * disassociated from the vnode (for various reasons).
1679  */
1680 int
1681 cache_inval_vp_nonblock(struct vnode *vp)
1682 {
1683         struct namecache *ncp;
1684         struct namecache *next;
1685
1686         spin_lock(&vp->v_spin);
1687         ncp = TAILQ_FIRST(&vp->v_namecache);
1688         if (ncp)
1689                 _cache_hold(ncp);
1690         while (ncp) {
1691                 /* loop entered with ncp held */
1692                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1693                         _cache_hold(next);
1694                 spin_unlock(&vp->v_spin);
1695                 if (_cache_lock_nonblock(ncp)) {
1696                         _cache_drop(ncp);
1697                         if (next)
1698                                 _cache_drop(next);
1699                         goto done;
1700                 }
1701                 if (ncp->nc_vp != vp) {
1702                         kprintf("Warning: cache_inval_vp: race-A detected on "
1703                                 "%s\n", ncp->nc_name);
1704                         _cache_put(ncp);
1705                         if (next)
1706                                 _cache_drop(next);
1707                         goto done;
1708                 }
1709                 _cache_inval(ncp, 0);
1710                 _cache_put(ncp);                /* also releases reference */
1711                 ncp = next;
1712                 spin_lock(&vp->v_spin);
1713                 if (ncp && ncp->nc_vp != vp) {
1714                         spin_unlock(&vp->v_spin);
1715                         kprintf("Warning: cache_inval_vp: race-B detected on "
1716                                 "%s\n", ncp->nc_name);
1717                         _cache_drop(ncp);
1718                         goto done;
1719                 }
1720         }
1721         spin_unlock(&vp->v_spin);
1722 done:
1723         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1724 }
1725
1726 /*
1727  * Clears the universal directory search 'ok' flag.  This flag allows
1728  * nlookup() to bypass normal vnode checks.  This flag is a cached flag
1729  * so clearing it simply forces revalidation.
1730  */
1731 void
1732 cache_inval_wxok(struct vnode *vp)
1733 {
1734         struct namecache *ncp;
1735
1736         spin_lock(&vp->v_spin);
1737         TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
1738                 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX))
1739                         atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX);
1740         }
1741         spin_unlock(&vp->v_spin);
1742 }
1743
1744 /*
1745  * The source ncp has been renamed to the target ncp.  Both fncp and tncp
1746  * must be locked.  The target ncp is destroyed (as a normal rename-over
1747  * would destroy the target file or directory).
1748  *
1749  * Because there may be references to the source ncp we cannot copy its
1750  * contents to the target.  Instead the source ncp is relinked as the target
1751  * and the target ncp is removed from the namecache topology.
1752  */
1753 void
1754 cache_rename(struct nchandle *fnch, struct nchandle *tnch)
1755 {
1756         struct namecache *fncp = fnch->ncp;
1757         struct namecache *tncp = tnch->ncp;
1758         struct namecache *tncp_par;
1759         struct nchash_head *nchpp;
1760         u_int32_t hash;
1761         char *oname;
1762         char *nname;
1763
1764         ++fncp->nc_generation;
1765         ++tncp->nc_generation;
1766         if (tncp->nc_nlen) {
1767                 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
1768                 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
1769                 nname[tncp->nc_nlen] = 0;
1770         } else {
1771                 nname = NULL;
1772         }
1773
1774         /*
1775          * Rename fncp (unlink)
1776          */
1777         _cache_unlink_parent(fncp);
1778         oname = fncp->nc_name;
1779         fncp->nc_name = nname;
1780         fncp->nc_nlen = tncp->nc_nlen;
1781         if (oname)
1782                 kfree(oname, M_VFSCACHE);
1783
1784         tncp_par = tncp->nc_parent;
1785         _cache_hold(tncp_par);
1786         _cache_lock(tncp_par);
1787
1788         /*
1789          * Rename fncp (relink)
1790          */
1791         hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
1792         hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
1793         nchpp = NCHHASH(hash);
1794
1795         spin_lock(&nchpp->spin);
1796         _cache_link_parent(fncp, tncp_par, nchpp);
1797         spin_unlock(&nchpp->spin);
1798
1799         _cache_put(tncp_par);
1800
1801         /*
1802          * Get rid of the overwritten tncp (unlink)
1803          */
1804         _cache_unlink(tncp);
1805 }
1806
1807 /*
1808  * Perform actions consistent with unlinking a file.  The passed-in ncp
1809  * must be locked.
1810  *
1811  * The ncp is marked DESTROYED so it no longer shows up in searches,
1812  * and will be physically deleted when the vnode goes away.
1813  *
1814  * If the related vnode has no refs then we cycle it through vget()/vput()
1815  * to (possibly if we don't have a ref race) trigger a deactivation,
1816  * allowing the VFS to trivially detect and recycle the deleted vnode
1817  * via VOP_INACTIVE().
1818  *
1819  * NOTE: _cache_rename() will automatically call _cache_unlink() on the
1820  *       target ncp.
1821  */
1822 void
1823 cache_unlink(struct nchandle *nch)
1824 {
1825         _cache_unlink(nch->ncp);
1826 }
1827
1828 static void
1829 _cache_unlink(struct namecache *ncp)
1830 {
1831         struct vnode *vp;
1832
1833         /*
1834          * Causes lookups to fail and allows another ncp with the same
1835          * name to be created under ncp->nc_parent.
1836          */
1837         ncp->nc_flag |= NCF_DESTROYED;
1838         ++ncp->nc_generation;
1839
1840         /*
1841          * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
1842          * force action on the 1->0 transition.
1843          */
1844         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1845             (vp = ncp->nc_vp) != NULL) {
1846                 atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
1847                 if (VREFCNT(vp) <= 0) {
1848                         if (vget(vp, LK_SHARED) == 0)
1849                                 vput(vp);
1850                 }
1851         }
1852 }
1853
1854 /*
1855  * Return non-zero if the nch might be associated with an open and/or mmap()'d
1856  * file.  The easy solution is to just return non-zero if the vnode has refs.
1857  * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
1858  * force the reclaim).
1859  */
1860 int
1861 cache_isopen(struct nchandle *nch)
1862 {
1863         struct vnode *vp;
1864         struct namecache *ncp = nch->ncp;
1865
1866         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1867             (vp = ncp->nc_vp) != NULL &&
1868             VREFCNT(vp)) {
1869                 return 1;
1870         }
1871         return 0;
1872 }
1873
1874
1875 /*
1876  * vget the vnode associated with the namecache entry.  Resolve the namecache
1877  * entry if necessary.  The passed ncp must be referenced and locked.  If
1878  * the ncp is resolved it might be locked shared.
1879  *
1880  * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
1881  * (depending on the passed lk_type) will be returned in *vpp with an error
1882  * of 0, or NULL will be returned in *vpp with a non-0 error code.  The
1883  * most typical error is ENOENT, meaning that the ncp represents a negative
1884  * cache hit and there is no vnode to retrieve, but other errors can occur
1885  * too.
1886  *
1887  * The vget() can race a reclaim.  If this occurs we re-resolve the
1888  * namecache entry.
1889  *
1890  * There are numerous places in the kernel where vget() is called on a
1891  * vnode while one or more of its namecache entries is locked.  Releasing
1892  * a vnode never deadlocks against locked namecache entries (the vnode
1893  * will not get recycled while referenced ncp's exist).  This means we
1894  * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
1895  * lock when acquiring the vp lock or we might cause a deadlock.
1896  *
1897  * NOTE: The passed-in ncp must be locked exclusively if it is initially
1898  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
1899  *       relocked exclusively before being re-resolved.
1900  */
1901 int
1902 cache_vget(struct nchandle *nch, struct ucred *cred,
1903            int lk_type, struct vnode **vpp)
1904 {
1905         struct namecache *ncp;
1906         struct vnode *vp;
1907         int error;
1908
1909         ncp = nch->ncp;
1910 again:
1911         vp = NULL;
1912         if (ncp->nc_flag & NCF_UNRESOLVED)
1913                 error = cache_resolve(nch, cred);
1914         else
1915                 error = 0;
1916
1917         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1918                 error = vget(vp, lk_type);
1919                 if (error) {
1920                         /*
1921                          * VRECLAIM race
1922                          *
1923                          * The ncp may have been locked shared, we must relock
1924                          * it exclusively before we can set it to unresolved.
1925                          */
1926                         if (error == ENOENT) {
1927                                 kprintf("Warning: vnode reclaim race detected "
1928                                         "in cache_vget on %p (%s)\n",
1929                                         vp, ncp->nc_name);
1930                                 _cache_unlock(ncp);
1931                                 _cache_lock(ncp);
1932                                 _cache_setunresolved(ncp);
1933                                 goto again;
1934                         }
1935
1936                         /*
1937                          * Not a reclaim race, some other error.
1938                          */
1939                         KKASSERT(ncp->nc_vp == vp);
1940                         vp = NULL;
1941                 } else {
1942                         KKASSERT(ncp->nc_vp == vp);
1943                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1944                 }
1945         }
1946         if (error == 0 && vp == NULL)
1947                 error = ENOENT;
1948         *vpp = vp;
1949         return(error);
1950 }
1951
1952 /*
1953  * Similar to cache_vget() but only acquires a ref on the vnode.  The vnode
1954  * is already held by virtuue of the ncp being locked, but it might not be
1955  * referenced and while it is not referenced it can transition into the
1956  * VRECLAIMED state.
1957  *
1958  * NOTE: The passed-in ncp must be locked exclusively if it is initially
1959  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
1960  *       relocked exclusively before being re-resolved.
1961  *
1962  * NOTE: At the moment we have to issue a vget() on the vnode, even though
1963  *       we are going to immediately release the lock, in order to resolve
1964  *       potential reclamation races.  Once we have a solid vnode ref that
1965  *       was (at some point) interlocked via a vget(), the vnode will not
1966  *       be reclaimed.
1967  *
1968  * NOTE: vhold counts (v_auxrefs) do not prevent reclamation.
1969  */
1970 int
1971 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
1972 {
1973         struct namecache *ncp;
1974         struct vnode *vp;
1975         int error;
1976         int v;
1977
1978         ncp = nch->ncp;
1979 again:
1980         vp = NULL;
1981         if (ncp->nc_flag & NCF_UNRESOLVED)
1982                 error = cache_resolve(nch, cred);
1983         else
1984                 error = 0;
1985
1986         while (error == 0 && (vp = ncp->nc_vp) != NULL) {
1987                 /*
1988                  * Try a lockless ref of the vnode.  VRECLAIMED transitions
1989                  * use the vx_lock state and update-counter mechanism so we
1990                  * can detect if one is in-progress or occurred.
1991                  *
1992                  * If we can successfully ref the vnode and interlock against
1993                  * the update-counter mechanism, and VRECLAIMED is found to
1994                  * not be set after that, we should be good.
1995                  */
1996                 v = spin_access_start_only(&vp->v_spin);
1997                 if (__predict_true(spin_access_check_inprog(v) == 0)) {
1998                         vref_special(vp);
1999                         if (__predict_false(
2000                                     spin_access_end_only(&vp->v_spin, v))) {
2001                                 vrele(vp);
2002                                 continue;
2003                         }
2004                         if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) {
2005                                 break;
2006                         }
2007                         vrele(vp);
2008                         kprintf("CACHE_VREF: IN-RECLAIM\n");
2009                 }
2010
2011                 /*
2012                  * Do it the slow way
2013                  */
2014                 error = vget(vp, LK_SHARED);
2015                 if (error) {
2016                         /*
2017                          * VRECLAIM race
2018                          */
2019                         if (error == ENOENT) {
2020                                 kprintf("Warning: vnode reclaim race detected "
2021                                         "in cache_vget on %p (%s)\n",
2022                                         vp, ncp->nc_name);
2023                                 _cache_unlock(ncp);
2024                                 _cache_lock(ncp);
2025                                 _cache_setunresolved(ncp);
2026                                 goto again;
2027                         }
2028
2029                         /*
2030                          * Not a reclaim race, some other error.
2031                          */
2032                         KKASSERT(ncp->nc_vp == vp);
2033                         vp = NULL;
2034                 } else {
2035                         KKASSERT(ncp->nc_vp == vp);
2036                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2037                         /* caller does not want a lock */
2038                         vn_unlock(vp);
2039                 }
2040                 break;
2041         }
2042         if (error == 0 && vp == NULL)
2043                 error = ENOENT;
2044         *vpp = vp;
2045
2046         return(error);
2047 }
2048
2049 /*
2050  * Return a referenced vnode representing the parent directory of
2051  * ncp.
2052  *
2053  * Because the caller has locked the ncp it should not be possible for
2054  * the parent ncp to go away.  However, the parent can unresolve its
2055  * dvp at any time so we must be able to acquire a lock on the parent
2056  * to safely access nc_vp.
2057  *
2058  * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
2059  * so use vhold()/vdrop() while holding the lock to prevent dvp from
2060  * getting destroyed.
2061  *
2062  * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
2063  *       lock on the ncp in question..
2064  */
2065 struct vnode *
2066 cache_dvpref(struct namecache *ncp)
2067 {
2068         struct namecache *par;
2069         struct vnode *dvp;
2070
2071         dvp = NULL;
2072         if ((par = ncp->nc_parent) != NULL) {
2073                 _cache_hold(par);
2074                 _cache_lock(par);
2075                 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
2076                         if ((dvp = par->nc_vp) != NULL)
2077                                 vhold(dvp);
2078                 }
2079                 _cache_unlock(par);
2080                 if (dvp) {
2081                         if (vget(dvp, LK_SHARED) == 0) {
2082                                 vn_unlock(dvp);
2083                                 vdrop(dvp);
2084                                 /* return refd, unlocked dvp */
2085                         } else {
2086                                 vdrop(dvp);
2087                                 dvp = NULL;
2088                         }
2089                 }
2090                 _cache_drop(par);
2091         }
2092         return(dvp);
2093 }
2094
2095 /*
2096  * Convert a directory vnode to a namecache record without any other
2097  * knowledge of the topology.  This ONLY works with directory vnodes and
2098  * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
2099  * returned ncp (if not NULL) will be held and unlocked.
2100  *
2101  * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
2102  * If 'makeit' is 1 we attempt to track-down and create the namecache topology
2103  * for dvp.  This will fail only if the directory has been deleted out from
2104  * under the caller.
2105  *
2106  * Callers must always check for a NULL return no matter the value of 'makeit'.
2107  *
2108  * To avoid underflowing the kernel stack each recursive call increments
2109  * the makeit variable.
2110  */
2111
2112 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2113                                   struct vnode *dvp, char *fakename);
2114 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2115                                   struct vnode **saved_dvp);
2116
2117 int
2118 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
2119               struct nchandle *nch)
2120 {
2121         struct vnode *saved_dvp;
2122         struct vnode *pvp;
2123         char *fakename;
2124         int error;
2125
2126         nch->ncp = NULL;
2127         nch->mount = dvp->v_mount;
2128         saved_dvp = NULL;
2129         fakename = NULL;
2130
2131         /*
2132          * Handle the makeit == 0 degenerate case
2133          */
2134         if (makeit == 0) {
2135                 spin_lock_shared(&dvp->v_spin);
2136                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2137                 if (nch->ncp)
2138                         cache_hold(nch);
2139                 spin_unlock_shared(&dvp->v_spin);
2140         }
2141
2142         /*
2143          * Loop until resolution, inside code will break out on error.
2144          */
2145         while (makeit) {
2146                 /*
2147                  * Break out if we successfully acquire a working ncp.
2148                  */
2149                 spin_lock_shared(&dvp->v_spin);
2150                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2151                 if (nch->ncp) {
2152                         cache_hold(nch);
2153                         spin_unlock_shared(&dvp->v_spin);
2154                         break;
2155                 }
2156                 spin_unlock_shared(&dvp->v_spin);
2157
2158                 /*
2159                  * If dvp is the root of its filesystem it should already
2160                  * have a namecache pointer associated with it as a side
2161                  * effect of the mount, but it may have been disassociated.
2162                  */
2163                 if (dvp->v_flag & VROOT) {
2164                         nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
2165                         error = cache_resolve_mp(nch->mount);
2166                         _cache_put(nch->ncp);
2167                         if (ncvp_debug) {
2168                                 kprintf("cache_fromdvp: resolve root of mount %p error %d",
2169                                         dvp->v_mount, error);
2170                         }
2171                         if (error) {
2172                                 if (ncvp_debug)
2173                                         kprintf(" failed\n");
2174                                 nch->ncp = NULL;
2175                                 break;
2176                         }
2177                         if (ncvp_debug)
2178                                 kprintf(" succeeded\n");
2179                         continue;
2180                 }
2181
2182                 /*
2183                  * If we are recursed too deeply resort to an O(n^2)
2184                  * algorithm to resolve the namecache topology.  The
2185                  * resolved pvp is left referenced in saved_dvp to
2186                  * prevent the tree from being destroyed while we loop.
2187                  */
2188                 if (makeit > 20) {
2189                         error = cache_fromdvp_try(dvp, cred, &saved_dvp);
2190                         if (error) {
2191                                 kprintf("lookupdotdot(longpath) failed %d "
2192                                        "dvp %p\n", error, dvp);
2193                                 nch->ncp = NULL;
2194                                 break;
2195                         }
2196                         continue;
2197                 }
2198
2199                 /*
2200                  * Get the parent directory and resolve its ncp.
2201                  */
2202                 if (fakename) {
2203                         kfree(fakename, M_TEMP);
2204                         fakename = NULL;
2205                 }
2206                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2207                                           &fakename);
2208                 if (error) {
2209                         kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
2210                         break;
2211                 }
2212                 vn_unlock(pvp);
2213
2214                 /*
2215                  * Reuse makeit as a recursion depth counter.  On success
2216                  * nch will be fully referenced.
2217                  */
2218                 cache_fromdvp(pvp, cred, makeit + 1, nch);
2219                 vrele(pvp);
2220                 if (nch->ncp == NULL)
2221                         break;
2222
2223                 /*
2224                  * Do an inefficient scan of pvp (embodied by ncp) to look
2225                  * for dvp.  This will create a namecache record for dvp on
2226                  * success.  We loop up to recheck on success.
2227                  *
2228                  * ncp and dvp are both held but not locked.
2229                  */
2230                 error = cache_inefficient_scan(nch, cred, dvp, fakename);
2231                 if (error) {
2232                         kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
2233                                 pvp, nch->ncp->nc_name, dvp);
2234                         cache_drop(nch);
2235                         /* nch was NULLed out, reload mount */
2236                         nch->mount = dvp->v_mount;
2237                         break;
2238                 }
2239                 if (ncvp_debug) {
2240                         kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
2241                                 pvp, nch->ncp->nc_name);
2242                 }
2243                 cache_drop(nch);
2244                 /* nch was NULLed out, reload mount */
2245                 nch->mount = dvp->v_mount;
2246         }
2247
2248         /*
2249          * If nch->ncp is non-NULL it will have been held already.
2250          */
2251         if (fakename)
2252                 kfree(fakename, M_TEMP);
2253         if (saved_dvp)
2254                 vrele(saved_dvp);
2255         if (nch->ncp)
2256                 return (0);
2257         return (EINVAL);
2258 }
2259
2260 /*
2261  * Go up the chain of parent directories until we find something
2262  * we can resolve into the namecache.  This is very inefficient.
2263  */
2264 static
2265 int
2266 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2267                   struct vnode **saved_dvp)
2268 {
2269         struct nchandle nch;
2270         struct vnode *pvp;
2271         int error;
2272         static time_t last_fromdvp_report;
2273         char *fakename;
2274
2275         /*
2276          * Loop getting the parent directory vnode until we get something we
2277          * can resolve in the namecache.
2278          */
2279         vref(dvp);
2280         nch.mount = dvp->v_mount;
2281         nch.ncp = NULL;
2282         fakename = NULL;
2283
2284         for (;;) {
2285                 if (fakename) {
2286                         kfree(fakename, M_TEMP);
2287                         fakename = NULL;
2288                 }
2289                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2290                                           &fakename);
2291                 if (error) {
2292                         vrele(dvp);
2293                         break;
2294                 }
2295                 vn_unlock(pvp);
2296                 spin_lock_shared(&pvp->v_spin);
2297                 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
2298                         _cache_hold(nch.ncp);
2299                         spin_unlock_shared(&pvp->v_spin);
2300                         vrele(pvp);
2301                         break;
2302                 }
2303                 spin_unlock_shared(&pvp->v_spin);
2304                 if (pvp->v_flag & VROOT) {
2305                         nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
2306                         error = cache_resolve_mp(nch.mount);
2307                         _cache_unlock(nch.ncp);
2308                         vrele(pvp);
2309                         if (error) {
2310                                 _cache_drop(nch.ncp);
2311                                 nch.ncp = NULL;
2312                                 vrele(dvp);
2313                         }
2314                         break;
2315                 }
2316                 vrele(dvp);
2317                 dvp = pvp;
2318         }
2319         if (error == 0) {
2320                 if (last_fromdvp_report != time_uptime) {
2321                         last_fromdvp_report = time_uptime;
2322                         kprintf("Warning: extremely inefficient path "
2323                                 "resolution on %s\n",
2324                                 nch.ncp->nc_name);
2325                 }
2326                 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
2327
2328                 /*
2329                  * Hopefully dvp now has a namecache record associated with
2330                  * it.  Leave it referenced to prevent the kernel from
2331                  * recycling the vnode.  Otherwise extremely long directory
2332                  * paths could result in endless recycling.
2333                  */
2334                 if (*saved_dvp)
2335                     vrele(*saved_dvp);
2336                 *saved_dvp = dvp;
2337                 _cache_drop(nch.ncp);
2338         }
2339         if (fakename)
2340                 kfree(fakename, M_TEMP);
2341         return (error);
2342 }
2343
2344 /*
2345  * Do an inefficient scan of the directory represented by ncp looking for
2346  * the directory vnode dvp.  ncp must be held but not locked on entry and
2347  * will be held on return.  dvp must be refd but not locked on entry and
2348  * will remain refd on return.
2349  *
2350  * Why do this at all?  Well, due to its stateless nature the NFS server
2351  * converts file handles directly to vnodes without necessarily going through
2352  * the namecache ops that would otherwise create the namecache topology
2353  * leading to the vnode.  We could either (1) Change the namecache algorithms
2354  * to allow disconnected namecache records that are re-merged opportunistically,
2355  * or (2) Make the NFS server backtrack and scan to recover a connected
2356  * namecache topology in order to then be able to issue new API lookups.
2357  *
2358  * It turns out that (1) is a huge mess.  It takes a nice clean set of
2359  * namecache algorithms and introduces a lot of complication in every subsystem
2360  * that calls into the namecache to deal with the re-merge case, especially
2361  * since we are using the namecache to placehold negative lookups and the
2362  * vnode might not be immediately assigned. (2) is certainly far less
2363  * efficient than (1), but since we are only talking about directories here
2364  * (which are likely to remain cached), the case does not actually run all
2365  * that often and has the supreme advantage of not polluting the namecache
2366  * algorithms.
2367  *
2368  * If a fakename is supplied just construct a namecache entry using the
2369  * fake name.
2370  */
2371 static int
2372 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2373                        struct vnode *dvp, char *fakename)
2374 {
2375         struct nlcomponent nlc;
2376         struct nchandle rncp;
2377         struct dirent *den;
2378         struct vnode *pvp;
2379         struct vattr vat;
2380         struct iovec iov;
2381         struct uio uio;
2382         int blksize;
2383         int eofflag;
2384         int bytes;
2385         char *rbuf;
2386         int error;
2387
2388         vat.va_blocksize = 0;
2389         if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2390                 return (error);
2391         cache_lock(nch);
2392         error = cache_vref(nch, cred, &pvp);
2393         cache_unlock(nch);
2394         if (error)
2395                 return (error);
2396         if (ncvp_debug) {
2397                 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2398                         "vattr fileid = %lld\n",
2399                         nch->ncp, nch->ncp->nc_name,
2400                         vat.va_blocksize,
2401                         (long long)vat.va_fileid);
2402         }
2403
2404         /*
2405          * Use the supplied fakename if not NULL.  Fake names are typically
2406          * not in the actual filesystem hierarchy.  This is used by HAMMER
2407          * to glue @@timestamp recursions together.
2408          */
2409         if (fakename) {
2410                 nlc.nlc_nameptr = fakename;
2411                 nlc.nlc_namelen = strlen(fakename);
2412                 rncp = cache_nlookup(nch, &nlc);
2413                 goto done;
2414         }
2415
2416         if ((blksize = vat.va_blocksize) == 0)
2417                 blksize = DEV_BSIZE;
2418         rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
2419         rncp.ncp = NULL;
2420
2421         eofflag = 0;
2422         uio.uio_offset = 0;
2423 again:
2424         iov.iov_base = rbuf;
2425         iov.iov_len = blksize;
2426         uio.uio_iov = &iov;
2427         uio.uio_iovcnt = 1;
2428         uio.uio_resid = blksize;
2429         uio.uio_segflg = UIO_SYSSPACE;
2430         uio.uio_rw = UIO_READ;
2431         uio.uio_td = curthread;
2432
2433         if (ncvp_debug >= 2)
2434                 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
2435         error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
2436         if (error == 0) {
2437                 den = (struct dirent *)rbuf;
2438                 bytes = blksize - uio.uio_resid;
2439
2440                 while (bytes > 0) {
2441                         if (ncvp_debug >= 2) {
2442                                 kprintf("cache_inefficient_scan: %*.*s\n",
2443                                         den->d_namlen, den->d_namlen,
2444                                         den->d_name);
2445                         }
2446                         if (den->d_type != DT_WHT &&
2447                             den->d_ino == vat.va_fileid) {
2448                                 if (ncvp_debug) {
2449                                         kprintf("cache_inefficient_scan: "
2450                                                "MATCHED inode %lld path %s/%*.*s\n",
2451                                                (long long)vat.va_fileid,
2452                                                nch->ncp->nc_name,
2453                                                den->d_namlen, den->d_namlen,
2454                                                den->d_name);
2455                                 }
2456                                 nlc.nlc_nameptr = den->d_name;
2457                                 nlc.nlc_namelen = den->d_namlen;
2458                                 rncp = cache_nlookup(nch, &nlc);
2459                                 KKASSERT(rncp.ncp != NULL);
2460                                 break;
2461                         }
2462                         bytes -= _DIRENT_DIRSIZ(den);
2463                         den = _DIRENT_NEXT(den);
2464                 }
2465                 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
2466                         goto again;
2467         }
2468         kfree(rbuf, M_TEMP);
2469 done:
2470         vrele(pvp);
2471         if (rncp.ncp) {
2472                 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
2473                         _cache_setvp(rncp.mount, rncp.ncp, dvp);
2474                         if (ncvp_debug >= 2) {
2475                                 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
2476                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
2477                         }
2478                 } else {
2479                         if (ncvp_debug >= 2) {
2480                                 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
2481                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2482                                         rncp.ncp->nc_vp);
2483                         }
2484                 }
2485                 if (rncp.ncp->nc_vp == NULL)
2486                         error = rncp.ncp->nc_error;
2487                 /*
2488                  * Release rncp after a successful nlookup.  rncp was fully
2489                  * referenced.
2490                  */
2491                 cache_put(&rncp);
2492         } else {
2493                 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
2494                         dvp, nch->ncp->nc_name);
2495                 error = ENOENT;
2496         }
2497         return (error);
2498 }
2499
2500 /*
2501  * This function must be called with the ncp held and locked and will unlock
2502  * and drop it during zapping.
2503  *
2504  * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
2505  * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list
2506  * and removes the related reference.  If the ncp can be removed, and the
2507  * parent can be zapped non-blocking, this function loops up.
2508  *
2509  * There will be one ref from the caller (which we now own).  The only
2510  * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list,
2511  * so possibly 2 refs left.  Taking this into account, if there are no
2512  * additional refs and no children, the ncp will be removed from the topology
2513  * and destroyed.
2514  *
2515  * References and/or children may exist if the ncp is in the middle of the
2516  * topology, preventing the ncp from being destroyed.
2517  *
2518  * If nonblock is non-zero and the parent ncp cannot be locked we give up:
      * the ncp is flagged NCF_DEFEREDZAP, the per-cpu numdefered counter is
      * bumped, and the ncp is unlocked and dropped.  nonblock is currently a
      * local that is hard-wired to 1, so this is the only mode in use.
2519  *
2520  * This function returns nothing.  When the parent becomes a candidate for
2521  * removal we loop on it internally instead of recursing.  Looping is one
2522  * way to avoid unbounded recursion due to deep namecache trees.
2523  *
2524  * WARNING!  For MPSAFE operation this routine must acquire up to three
2525  *           spin locks to be able to safely test nc_refs.  Lock order is
2526  *           very important.
2527  *
2528  *           hash spinlock if on hash list
2529  *           parent spinlock if child of parent
2530  *           (the ncp is unresolved so there is no vnode association)
2531  */
2532 static void
2533 cache_zap(struct namecache *ncp)
2534 {
2535         struct namecache *par;
2536         struct vnode *dropvp;
2537         struct nchash_head *nchpp;
2538         int refcmp;
2539         int nonblock = 1;       /* XXX cleanup */
2540
2541 again:
2542         /*
2543          * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2544          * This gets rid of any vp->v_namecache list or negative list and
2545          * the related ref.
2546          */
2547         _cache_setunresolved(ncp);
2548
2549         /*
2550          * Try to scrap the entry and possibly tail-recurse on its parent.
2551          * We only scrap unref'd (other than our ref) unresolved entries,
2552          * we do not scrap 'live' entries.
2553          *
2554          * If nc_parent is non NULL we expect 2 references, else just 1.
2555          * If there are more, someone else also holds the ncp and we cannot
2556          * destroy it.
2557          */
2558         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2559         KKASSERT(ncp->nc_refs > 0);
2560
2561         /*
2562          * If the ncp is linked to its parent it will also be in the hash
2563          * table.  We have to be able to lock the parent and the hash table.
2564          *
2565          * Acquire locks.  Note that the parent can't go away while we hold
2566          * a child locked.  If nc_parent is present, expect 2 refs instead
2567          * of 1.
2568          */
2569         nchpp = NULL;
2570         if ((par = ncp->nc_parent) != NULL) {
2571                 if (nonblock) {
2572                         if (_cache_lock_nonblock(par)) {
2573                                 /*
                                      * lock failed.  Mark the ncp so the zap
                                      * can be retried later and account for
                                      * it in this cpu's numdefered counter.
                                      */
2574                                 ncp->nc_flag |= NCF_DEFEREDZAP;
2575                                 atomic_add_long(
2576                                     &pcpu_ncache[mycpu->gd_cpuid].numdefered,
2577                                     1);
2578                                 _cache_unlock(ncp);
2579                                 _cache_drop(ncp);       /* caller's ref */
2580                                 return;
2581                         }
                             /* lock obtained, now add our own ref on par */
2582                         _cache_hold(par);
2583                 } else {
                             /* blocking variant, unused while nonblock == 1 */
2584                         _cache_hold(par);
2585                         _cache_lock(par);
2586                 }
                     /*
                      * par is now held and locked by us.  Take the spinlock
                      * of the hash chain the ncp is on.
                      */
2587                 nchpp = ncp->nc_head;
2588                 spin_lock(&nchpp->spin);
2589         }
2590
2591         /*
2592          * With the parent and nchpp locked, and the vnode removed
2593          * (no vp->v_namecache), we expect 1 or 2 refs.  If there are
2594          * more someone else has a ref and we cannot zap the entry.
2595          *
2596          * one for our hold
2597          * one for our parent link (parent also has one from the linkage)
2598          */
2599         if (par)
2600                 refcmp = 2;
2601         else
2602                 refcmp = 1;
2603
2604         /*
2605          * On failure undo the work we've done so far and drop the
2606          * caller's ref and ncp.
              *
              * Failure means extra refs exist or the ncp still has children.
              * The ncp is left in the topology, unresolved.  _cache_put(par)
              * presumably releases both the lock and the hold we took on par
              * above - confirm against _cache_put().
2607          */
2608         if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) {
2609                 if (par) {
2610                         spin_unlock(&nchpp->spin);
2611                         _cache_put(par);
2612                 }
2613                 _cache_unlock(ncp);
2614                 _cache_drop(ncp);
2615                 return;
2616         }
2617
2618         /*
2619          * We own all the refs and with the spinlocks held no further
2620          * refs can be acquired by others.
2621          *
2622          * Remove us from the hash list and parent list.  We have to
2623          * drop a ref on the parent's vp if the parent's list becomes
2624          * empty.
2625          */
2626         dropvp = NULL;
2627         if (par) {
2628                 struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
2629
2630                 KKASSERT(nchpp == ncp->nc_head);
2631                 TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
2632                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
2633                 atomic_add_long(&pn->vfscache_count, -1);
                     /*
                      * The ncp has no children at this point (tested above),
                      * so it was being counted as a leaf.
                      */
2634                 if (TAILQ_EMPTY(&ncp->nc_list))
2635                         atomic_add_long(&pn->vfscache_leafs, -1);
2636
                     /*
                      * If we were the parent's last child the parent becomes
                      * a leaf.  Remember its vnode, if any, so it can be
                      * vdrop()'d once the spinlock has been released.
                      */
2637                 if (TAILQ_EMPTY(&par->nc_list)) {
2638                         atomic_add_long(&pn->vfscache_leafs, 1);
2639                         if (par->nc_vp)
2640                                 dropvp = par->nc_vp;
2641                 }
2642                 ncp->nc_parent = NULL;
2643                 ncp->nc_head = NULL;
2644                 spin_unlock(&nchpp->spin);
2645                 _cache_drop(par);       /* removal of ncp from par->nc_list */
2646                 /*_cache_unlock(par);*/ /* par stays locked, see loop-up below */
2647         } else {
2648                 KKASSERT(ncp->nc_head == NULL);
2649         }
2650
2651         /*
2652          * ncp should not have picked up any refs.  Physically
2653          * destroy the ncp.
2654          */
2655         if (ncp->nc_refs != refcmp) {
2656                 panic("cache_zap: %p bad refs %d (expected %d)\n",
2657                         ncp, ncp->nc_refs, refcmp);
2658         }
2659         /* _cache_unlock(ncp) not required */
2660         ncp->nc_refs = -1;      /* safety */
2661         if (ncp->nc_name)
2662                 kfree(ncp->nc_name, M_VFSCACHE);
2663         kfree(ncp, M_VFSCACHE);
2664
2665         /*
2666          * Delayed drop (we had to release our spinlocks)
2667          */
2668         if (dropvp)
2669                 vdrop(dropvp);
2670
2671         /*
2672          * Loop up if we can recursively clean out the parent.
              *
              * par is still locked and still carries the hold we added when
              * we locked it.  If we loop, that hold and lock become the
              * "held and locked" ncp the next pass starts with.  Otherwise
              * they are released here.
2673          */
2674         if (par) {
2675                 refcmp = 1;             /* ref on parent */
2676                 if (par->nc_parent)     /* par->par */
2677                         ++refcmp;
2678                 par->nc_flag &= ~NCF_DEFEREDZAP;
2679                 if ((par->nc_flag & NCF_UNRESOLVED) &&
2680                     par->nc_refs == refcmp &&
2681                     TAILQ_EMPTY(&par->nc_list)) {
2682                         ncp = par;
2683                         goto again;
2684                 }
2685                 _cache_unlock(par);
2686                 _cache_drop(par);
2687         }
2688 }
2689
2690 /*
2691  * Clean up dangling negative cache and deferred-drop entries in the
2692  * namecache.
2693  *
2694  * This routine is called in the critical path and also called from
2695  * vnlru().  When called from vnlru we use a lower limit to try to
2696  * deal with the negative cache before the critical path has to start
2697  * dealing with it.
2698  */
2699 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;    /* hysteresis phase */
2700
     /*
      * Hysteresis phase for the negative and positive cache limits checked
      * by cache_hysteresis().  Both arrays are indexed by that function's
      * critpath argument, so each index keeps its own phase.  All slots
      * start out in CHI_LOW.
      */
2701 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2702 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
2703
2704 void
2705 cache_hysteresis(int critpath)
2706 {
2707         long poslimit;
2708         long neglimit = maxvnodes / ncnegfactor;
2709         long xnumcache = vfscache_leafs;
2710
2711         if (critpath == 0)
2712                 neglimit = neglimit * 8 / 10;
2713
2714         /*
2715          * Don't cache too many negative hits.  We use hysteresis to reduce
2716          * the impact on the critical path.
2717          */
2718         switch(neg_cache_hysteresis_state[critpath]) {
2719         case CHI_LOW:
2720                 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) {
2721                         if (critpath)
2722                                 _cache_cleanneg(ncnegflush);
2723                         else
2724                                 _cache_cleanneg(ncnegflush +
2725                                                 vfscache_negs - neglimit);
2726                         neg_cache_hysteresis_state[critpath] = CHI_HIGH;
2727                 }
2728                 break;
2729         case CHI_HIGH:
2730                 if (vfscache_negs > MINNEG * 9 / 10 &&
2731                     vfscache_negs * 9 / 10 > neglimit
2732                 ) {
2733                         if (critpath)
2734                                 _cache_cleanneg(ncnegflush);
2735                         else
2736                                 _cache_cleanneg(ncnegflush +
2737                                                 vfscache_negs * 9 / 10 -
2738                                                 neglimit);
2739                 } else {
2740                         neg_cache_hysteresis_state[critpath] = CHI_LOW;
2741                 }
2742                 break;
2743         }
2744
2745         /*
2746          * Don't cache too many positive hits.  We use hysteresis to reduce
2747          * the impact on the critical path.
2748          *
2749          * Excessive positive hits can accumulate due to large numbers of
2750          * hardlinks (the vnode cache will not prevent hl ncps from growing
2751          * into infinity).
2752          */
2753         if ((poslimit = ncposlimit) == 0)
2754                 poslimit = maxvnodes * 2;
2755         if (critpath == 0)
2756                 poslimit = poslimit * 8 / 10;
2757
2758         switch(pos_cache_hysteresis_state[critpath]) {
2759         case CHI_LOW:
2760                 if (xnumcache > poslimit && xnumcache > MINPOS) {
2761                         if (critpath)
2762                                 _cache_cleanpos(ncposflush);
2763                         else
2764                                 _cache_cleanpos(ncposflush +
2765                                                 xnumcache - poslimit);
2766                         pos_cache_hysteresis_state[critpath] = CHI_HIGH;
2767                 }
2768                 break;
2769         case CHI_HIGH:
2770                 if (xnumcache > poslimit * 5 / 6 && xnumcache > MINPOS) {
2771                         if (critpath)
2772                                 _cache_cleanpos(ncposflush);
2773                         else
2774                                 _cache_cleanpos(ncposflush +
2775                                                 xnumcache - poslimit * 5 / 6);
2776                 } else {
2777                         pos_cache_hysteresis_state[critpath] = CHI_LOW;
2778                 }
2779                 break;
2780         }
2781
2782         /*
2783          * Clean out dangling defered-zap ncps which could not be cleanly
2784          * dropped if too many build up.  Note that numdefered is
2785          * heuristical.  Make sure we are real-time for the current cpu,
2786          * plus the global rollup.
2787          */
2788         if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) {
2789                 _cache_cleandefered();
2790         }
2791 }
2792
2793 /*
2794  * NEW NAMECACHE LOOKUP API
2795  *
2796  * Lookup an entry in the namecache.  The passed par_nch must be referenced
2797  * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
2798  * is ALWAYS returned, even if the supplied component is illegal.
2799  *
2800  * The resulting namecache entry should be returned to the system with
2801  * cache_put() or cache_unlock() + cache_drop().
2802  *
2803  * namecache locks are recursive but care must be taken to avoid lock order
2804  * reversals (hence why the passed par_nch must be unlocked).  Locking
2805  * rules are to order for parent traversals, not for child traversals.
2806  *
2807  * Nobody else will be able to manipulate the associated namespace (e.g.
2808  * create, delete, rename, rename-target) until the caller unlocks the
2809  * entry.
2810  *
2811  * The returned entry will be in one of three states:  positive hit (non-null
2812  * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2813  * Unresolved entries must be resolved through the filesystem to associate the
2814  * vnode and/or determine whether a positive or negative hit has occurred.
2815  *
2816  * It is not necessary to lock a directory in order to lock namespace under
2817  * that directory.  In fact, it is explicitly not allowed to do that.  A
2818  * directory is typically only locked when being created, renamed, or
2819  * destroyed.
2820  *
2821  * The directory (par) may be unresolved, in which case any returned child
2822  * will likely also be marked unresolved.  Likely but not guaranteed.  Since
2823  * the filesystem lookup requires a resolved directory vnode the caller is
2824  * responsible for resolving the namecache chain top-down.  This API
2825  * specifically allows whole chains to be created in an unresolved state.
      *
      * The returned nchandle uses par_nch's mount, and a mount reference is
      * taken on it with _cache_mntref() just before returning.
2826  */
2827 struct nchandle
2828 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
2829 {
2830         struct nchandle nch;
2831         struct namecache *ncp;
2832         struct namecache *new_ncp;
2833         struct namecache *rep_ncp;      /* reuse a destroyed ncp */
2834         struct nchash_head *nchpp;
2835         struct mount *mp;
2836         u_int32_t hash;
2837         globaldata_t gd;
2838         int par_locked;
2839
2840         gd = mycpu;
2841         mp = par_nch->mount;
2842         par_locked = 0;
2843
2844         /*
2845          * This is a good time to call it, no ncp's are locked by
2846          * the caller or us.
2847          */
2848         cache_hysteresis(1);
2849
2850         /*
2851          * Try to locate an existing entry
              *
              * The hash covers the name bytes and the parent ncp pointer.
2852          */
2853         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2854         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2855         new_ncp = NULL;
2856         nchpp = NCHHASH(hash);
2857 restart:
             /*
              * The first pass (new_ncp == NULL) only searches and takes the
              * chain spinlock shared.  Once new_ncp has been allocated a pass
              * may end by linking it into the chain, so the spinlock is taken
              * exclusively from then on.
              */
2858         rep_ncp = NULL;
2859         if (new_ncp)
2860                 spin_lock(&nchpp->spin);
2861         else
2862                 spin_lock_shared(&nchpp->spin);
2863
2864         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
2865                 /*
2866                  * Break out if we find a matching entry.  Note that
2867                  * UNRESOLVED entries may match, but DESTROYED entries
2868                  * do not.
2869                  *
2870                  * We may be able to reuse DESTROYED entries that we come
2871                  * across, even if the name does not match, as long as
2872                  * nc_nlen is correct and the only hold ref is from the nchpp
2873                  * list itself.
2874                  */
2875                 if (ncp->nc_parent == par_nch->ncp &&
2876                     ncp->nc_nlen == nlc->nlc_namelen) {
2877                         if (ncp->nc_flag & NCF_DESTROYED) {
                                     /* remember the first reusable candidate */
2878                                 if (ncp->nc_refs == 1 && rep_ncp == NULL)
2879                                         rep_ncp = ncp;
2880                                 continue;
2881                         }
2882                         if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen))
2883                                 continue;
                             /*
                              * Name match.  Hold the ncp so it stays around
                              * once the spinlock is released, then release the
                              * spinlock in the same mode it was acquired.
                              */
2884                         _cache_hold(ncp);
2885                         if (new_ncp)
2886                                 spin_unlock(&nchpp->spin);
2887                         else
2888                                 spin_unlock_shared(&nchpp->spin);
                             /*
                              * The parent is only locked for linking in a new
                              * entry.  An existing entry was found, so let
                              * the parent go before locking the child.
                              */
2889                         if (par_locked) {
2890                                 _cache_unlock(par_nch->ncp);
2891                                 par_locked = 0;
2892                         }
2893                         if (_cache_lock_special(ncp) == 0) {
2894                                 /*
2895                                  * Successfully locked but we must re-test
2896                                  * conditions that might have changed since
2897                                  * we did not have the lock before.
2898                                  */
2899                                 if (ncp->nc_parent != par_nch->ncp ||
2900                                     ncp->nc_nlen != nlc->nlc_namelen ||
2901                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
2902                                          ncp->nc_nlen) ||
2903                                     (ncp->nc_flag & NCF_DESTROYED)) {
2904                                         _cache_put(ncp);
2905                                         goto restart;
2906                                 }
2907                                 _cache_auto_unresolve(mp, ncp);
                                     /* the pre-allocated entry is not needed */
2908                                 if (new_ncp)
2909                                         _cache_free(new_ncp);
2910                                 goto found;
2911                         }
                             /*
                              * Could not get the lock via the special path.
                              * NOTE(review): _cache_get()/_cache_put() look
                              * like a blocking hold+lock followed by
                              * unlock+drop, with the final _cache_drop()
                              * releasing the hold taken above - confirm.
                              */
2912                         _cache_get(ncp);        /* cycle the lock to block */
2913                         _cache_put(ncp);
2914                         _cache_drop(ncp);
2915                         goto restart;
2916                 }
2917         }
2918
2919         /*
2920          * We failed to locate the entry, try to resurrect a destroyed
2921          * entry that we did find that is already correctly linked into
2922          * nchpp and the parent.  We must re-test conditions after
2923          * successfully locking rep_ncp.
2924          *
2925          * This case can occur under heavy loads due to not being able
2926          * to safely lock the parent in cache_zap().  Nominally a repeated
2927          * create/unlink load, but only the namelen needs to match.
              *
              * This is only attempted while new_ncp is NULL, so the spinlock
              * is held shared here.  The expected ref count of 2 is the one
              * seen during the scan plus the hold taken just below.
2928          */
2929         if (rep_ncp && new_ncp == NULL) {
2930                 if (_cache_lock_nonblock(rep_ncp) == 0) {
2931                         _cache_hold(rep_ncp);
2932                         if (rep_ncp->nc_parent == par_nch->ncp &&
2933                             rep_ncp->nc_nlen == nlc->nlc_namelen &&
2934                             (rep_ncp->nc_flag & NCF_DESTROYED) &&
2935                             rep_ncp->nc_refs == 2) {
2936                                 /*
2937                                  * Update nc_name and reuse as new.  Only
                                      * nlc_namelen bytes are copied; the length
                                      * was matched above, so the name buffer is
                                      * reused as is.
2938                                  */
2939                                 ncp = rep_ncp;
2940                                 bcopy(nlc->nlc_nameptr, ncp->nc_name,
2941                                       nlc->nlc_namelen);
2942                                 spin_unlock_shared(&nchpp->spin);
2943                                 _cache_setunresolved(ncp);
                                     /* replaces all flags, clearing DESTROYED */
2944                                 ncp->nc_flag = NCF_UNRESOLVED;
2945                                 ncp->nc_error = ENOTCONN;
2946                                 goto found;
2947                         }
                             /* lost a race, fall through to a fresh allocation */
2948                         _cache_put(rep_ncp);
2949                 }
2950         }
2951
2952         /*
2953          * Otherwise create a new entry and add it to the cache.  The parent
2954          * ncp must also be locked so we can link into it.
2955          *
2956          * We have to relookup after possibly blocking in kmalloc or
2957          * when locking par_nch.
2958          *
2959          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2960          *       mount case, in which case nc_name will be NULL.
2961          */
2962         if (new_ncp == NULL) {
2963                 spin_unlock_shared(&nchpp->spin);
2964                 new_ncp = cache_alloc(nlc->nlc_namelen);
2965                 if (nlc->nlc_namelen) {
2966                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
2967                               nlc->nlc_namelen);
2968                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
2969                 }
2970                 goto restart;
2971         }
2972
2973         /*
2974          * NOTE! The spinlock is held exclusively here because new_ncp
2975          *       is non-NULL.
              *
              * The parent is locked with the spinlock released, after which
              * the chain has to be searched again.
2976          */
2977         if (par_locked == 0) {
2978                 spin_unlock(&nchpp->spin);
2979                 _cache_lock(par_nch->ncp);
2980                 par_locked = 1;
2981                 goto restart;
2982         }
2983
2984         /*
2985          * Link to parent (requires another ref, the one already in new_ncp
2986          * is what we will return).
2987          *
2988          * WARNING!  We still hold the spinlock.  We have to set the hash
2989          *           table entry atomically.
2990          */
2991         ncp = new_ncp;
2992         ++ncp->nc_refs;
2993         _cache_link_parent(ncp, par_nch->ncp, nchpp);
2994         spin_unlock(&nchpp->spin);
2995         _cache_unlock(par_nch->ncp);
2996         /* par_locked = 0 - not used */
2997 found:
2998         /*
2999          * stats and namecache size management
              *
              * Unresolved entries count as misses, entries with a vnode as
              * good hits, and the rest as negative hits.
3000          */
3001         if (ncp->nc_flag & NCF_UNRESOLVED)
3002                 ++gd->gd_nchstats->ncs_miss;
3003         else if (ncp->nc_vp)
3004                 ++gd->gd_nchstats->ncs_goodhits;
3005         else
3006                 ++gd->gd_nchstats->ncs_neghits;
3007         nch.mount = mp;
3008         nch.ncp = ncp;
3009         _cache_mntref(nch.mount);
3010
3011         return(nch);
3012 }
3013
3014 /*
3015  * Attempt to lookup a namecache entry and return with a shared namecache
3016  * lock.  This operates non-blocking.  EWOULDBLOCK is returned if excl is
3017  * set or we are unable to lock.
      *
      * EWOULDBLOCK is also returned when ncp_shared_lock_disable is set, when
      * no matching entry is present on the hash chain, and when the entry
      * that was found fails the re-test done after locking (parent or name
      * changed, destroyed, unresolved, or _cache_auto_unresolve_test()
      * returned non-zero).  This function never creates an entry, and
      * *res_nch is not written on any EWOULDBLOCK return.
      *
      * On success *res_nch is filled in with par_nch's mount and the held and
      * locked ncp, a mount ref is taken with _cache_mntref(), and the entry's
      * nc_error is returned.  That value may be non-zero (e.g. ENOENT) but is
      * asserted not to be EWOULDBLOCK.
3018  */
3019 int
3020 cache_nlookup_maybe_shared(struct nchandle *par_nch,
3021                            struct nlcomponent *nlc,
3022                            int excl, struct nchandle *res_nch)
3023 {
3024         struct namecache *ncp;
3025         struct nchash_head *nchpp;
3026         struct mount *mp;
3027         u_int32_t hash;
3028         globaldata_t gd;
3029
3030         /*
3031          * If exclusive requested or shared namecache locks are disabled,
3032          * return failure.
3033          */
3034         if (ncp_shared_lock_disable || excl)
3035                 return(EWOULDBLOCK);
3036
3037         gd = mycpu;
3038         mp = par_nch->mount;
3039
3040         /*
3041          * This is a good time to call it, no ncp's are locked by
3042          * the caller or us.
3043          */
3044         cache_hysteresis(1);
3045
3046         /*
3047          * Try to locate an existing entry
              *
              * The hash covers the name bytes and the parent ncp pointer.
3048          */
3049         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3050         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3051         nchpp = NCHHASH(hash);
3052
             /* search only, nothing is inserted, so a shared spinlock suffices */
3053         spin_lock_shared(&nchpp->spin);
3054
3055         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3056                 /*
3057                  * Break out if we find a matching entry.  Note that
3058                  * UNRESOLVED entries may match, but DESTROYED entries
3059                  * do not.
3060                  */
3061                 if (ncp->nc_parent == par_nch->ncp &&
3062                     ncp->nc_nlen == nlc->nlc_namelen &&
3063                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3064                     (ncp->nc_flag & NCF_DESTROYED) == 0
3065                 ) {
                             /* hold the ncp before letting go of the spinlock */
3066                         _cache_hold(ncp);
3067                         spin_unlock_shared(&nchpp->spin);
3068
                             /*
                              * With the lock in hand re-test everything that
                              * was checked under the spinlock.  In addition
                              * the entry must be resolved and must pass the
                              * auto-unresolve test.
                              */
3069                         if (_cache_lock_shared_special(ncp) == 0) {
3070                                 if (ncp->nc_parent == par_nch->ncp &&
3071                                     ncp->nc_nlen == nlc->nlc_namelen &&
3072                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
3073                                          ncp->nc_nlen) == 0 &&
3074                                     (ncp->nc_flag & NCF_DESTROYED) == 0 &&
3075                                     (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
3076                                     _cache_auto_unresolve_test(mp, ncp) == 0) {
3077                                         goto found;
3078                                 }
3079                                 _cache_unlock(ncp);
3080                         }
                             /*
                              * Lock failed or the entry is unsuitable.  Only
                              * the first name match on the chain is tried.
                              */
3081                         _cache_drop(ncp);
3082                         return(EWOULDBLOCK);
3083                 }
3084         }
3085
3086         /*
3087          * Failure
              *
              * No entry matched.
3088          */
3089         spin_unlock_shared(&nchpp->spin);
3090         return(EWOULDBLOCK);
3091
3092         /*
3093          * Success
3094          *
3095          * Note that nc_error might be non-zero (e.g ENOENT).
              *
              * The hold and the lock taken above are handed to the caller
              * through res_nch.
3096          */
3097 found:
3098         res_nch->mount = mp;
3099         res_nch->ncp = ncp;
3100         ++gd->gd_nchstats->ncs_goodhits;
3101         _cache_mntref(res_nch->mount);
3102
3103         KKASSERT(ncp->nc_error != EWOULDBLOCK);
3104         return(ncp->nc_error);
3105 }
3106
3107 /*
3108  * This is a non-blocking version of cache_nlookup() used by
3109  * nfs_readdirplusrpc_uio().  It can fail for any reason and
3110  * will return nch.ncp == NULL in that case.
3111  */
3112 struct nchandle
3113 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
3114 {
3115         struct nchandle nch;
3116         struct namecache *ncp;
3117         struct namecache *new_ncp;
3118         struct nchash_head *nchpp;
3119         struct mount *mp;
3120         u_int32_t hash;
3121         globaldata_t gd;
3122         int par_locked;
3123
3124         gd = mycpu;
3125         mp = par_nch->mount;
3126         par_locked = 0;
3127
3128         /*
3129          * Try to locate an existing entry
3130          */
3131         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3132         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3133         new_ncp = NULL;
3134         nchpp = NCHHASH(hash);
3135 restart:
3136         spin_lock(&nchpp->spin);
3137         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3138                 /*
3139                  * Break out if we find a matching entry.  Note that
3140                  * UNRESOLVED entries may match, but DESTROYED entries
3141                  * do not.
3142                  */
3143                 if (ncp->nc_parent == par_nch->ncp &&
3144                     ncp->nc_nlen == nlc->nlc_namelen &&
3145                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3146                     (ncp->nc_flag & NCF_DESTROYED) == 0
3147                 ) {
3148                         _cache_hold(ncp);
3149                         spin_unlock(&nchpp->spin);
3150                         if (par_locked) {
3151                                 _cache_unlock(par_nch->ncp);
3152                                 par_locked = 0;
3153                         }
3154                         if (_cache_lock_special(ncp) == 0) {
3155                                 if (ncp->nc_parent != par_nch->ncp ||
3156                                     ncp->nc_nlen != nlc->nlc_namelen ||
3157                                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) ||
3158                                     (ncp->nc_flag & NCF_DESTROYED)) {
3159                                         kprintf("cache_lookup_nonblock: "
3160                                                 "ncp-race %p %*.*s\n",
3161                                                 ncp,
3162                                                 nlc->nlc_namelen,
3163                                                 nlc->nlc_namelen,
3164                                                 nlc->nlc_nameptr);
3165                                         _cache_unlock(ncp);
3166                                         _cache_drop(ncp);
3167                                         goto failed;
3168                                 }
3169                                 _cache_auto_unresolve(mp, ncp);
3170                                 if (new_ncp) {
3171                                         _cache_free(new_ncp);
3172                                         new_ncp = NULL;
3173                                 }
3174                                 goto found;
3175                         }
3176                         _cache_drop(ncp);
3177                         goto failed;
3178                 }
3179         }
3180
3181         /*
3182          * We failed to locate an entry, create a new entry and add it to
3183          * the cache.  The parent ncp must also be locked so we
3184          * can link into it.
3185          *
3186          * We have to relookup after possibly blocking in kmalloc or
3187          * when locking par_nch.
3188          *
3189          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3190          *       mount case, in which case nc_name will be NULL.
3191          */
3192         if (new_ncp == NULL) {
3193                 spin_unlock(&nchpp->spin);
3194                 new_ncp = cache_alloc(nlc->nlc_namelen);
3195                 if (nlc->nlc_namelen) {
3196                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3197                               nlc->nlc_namelen);
3198                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
3199                 }
3200                 goto restart;
3201         }
3202         if (par_locked == 0) {
3203                 spin_unlock(&nchpp->spin);
3204                 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
3205                         par_locked = 1;
3206                         goto restart;
3207                 }
3208                 goto failed;
3209         }
3210
3211         /*
3212          * Link to parent (requires another ref, the one already in new_ncp
3213          * is what we wil lreturn).
3214          *
3215          * WARNING!  We still hold the spinlock.  We have to set the hash
3216          *           table entry atomically.
3217          */
3218         ncp = new_ncp;
3219         ++ncp->nc_refs;
3220         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3221         spin_unlock(&nchpp->spin);
3222         _cache_unlock(par_nch->ncp);
3223         /* par_locked = 0 - not used */
3224 found:
3225         /*
3226          * stats and namecache size management
3227          */
3228         if (ncp->nc_flag & NCF_UNRESOLVED)
3229                 ++gd->gd_nchstats->ncs_miss;
3230         else if (ncp->nc_vp)
3231                 ++gd->gd_nchstats->ncs_goodhits;
3232         else
3233                 ++gd->gd_nchstats->ncs_neghits;
3234         nch.mount = mp;
3235         nch.ncp = ncp;
3236         _cache_mntref(nch.mount);
3237
3238         return(nch);
3239 failed:
3240         if (new_ncp) {
3241                 _cache_free(new_ncp);
3242                 new_ncp = NULL;
3243         }
3244         nch.mount = NULL;
3245         nch.ncp = NULL;
3246         return(nch);
3247 }
3248
3249 /*
3250  * This version is non-locking.  The caller must validate the result
3251  * for parent-to-child continuity.
3252  *
3253  * It can fail for any reason and will return nch.ncp == NULL in that case.
3254  */
3255 struct nchandle
3256 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc)
3257 {
3258         struct nchandle nch;
3259         struct namecache *ncp;
3260         struct nchash_head *nchpp;
3261         struct mount *mp;
3262         u_int32_t hash;
3263         globaldata_t gd;
3264
3265         gd = mycpu;
3266         mp = par_nch->mount;
3267
3268         /*
3269          * Try to locate an existing entry
3270          */
3271         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3272         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3273         nchpp = NCHHASH(hash);
3274
3275         spin_lock_shared(&nchpp->spin);
3276         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3277                 /*
3278                  * Break out if we find a matching entry.  Note that
3279                  * UNRESOLVED entries may match, but DESTROYED entries
3280                  * do not.
3281                  *
3282                  * Resolved NFS entries which have timed out fail so the
3283                  * caller can rerun with normal locking.
3284                  */
3285                 if (ncp->nc_parent == par_nch->ncp &&
3286                     ncp->nc_nlen == nlc->nlc_namelen &&
3287                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3288                     (ncp->nc_flag & NCF_DESTROYED) == 0
3289                 ) {
3290                         if (_cache_auto_unresolve_test(par_nch->mount, ncp))
3291                                 break;
3292                         _cache_hold(ncp);
3293                         spin_unlock_shared(&nchpp->spin);
3294                         goto found;
3295                 }
3296         }
3297         spin_unlock_shared(&nchpp->spin);
3298         nch.mount = NULL;
3299         nch.ncp = NULL;
3300         return nch;
3301 found:
3302         /*
3303          * stats and namecache size management
3304          */
3305         if (ncp->nc_flag & NCF_UNRESOLVED)
3306                 ++gd->gd_nchstats->ncs_miss;
3307         else if (ncp->nc_vp)
3308                 ++gd->gd_nchstats->ncs_goodhits;
3309         else
3310                 ++gd->gd_nchstats->ncs_neghits;
3311         nch.mount = mp;
3312         nch.ncp = ncp;
3313         _cache_mntref(nch.mount);
3314
3315         return(nch);
3316 }
3317
/*
 * Mount point lookup support.
 *
 * A namecache entry that is marked as being used as a mount point has to
 * be translated to the mount stacked on top of it, if that mount is
 * visible to the caller.  The DragonFly mount system allows arbitrary
 * loops in the topology and disentangles those loops by matching against
 * (mp, ncp) rather than just (ncp).  This means any given ncp can dive
 * any number of mounts, depending on the relative mount (e.g. nullfs)
 * the caller is at in the topology.
 *
 * A very simple frontend cache (ncmount_cache[]) is used to reduce SMP
 * conflicts, which we have to do because the mountlist scan needs an
 * exclusive lock around its ripout info list.  Not to mention that there
 * might be a lot of mounts.
 *
 * The cache is set-associative: ncmount_cache_lookup4() hashes (mp, ncp)
 * to the first slot of a set of NCMOUNT_SET consecutive entries and
 * ncmount_cache_lookup() searches that set.  All cpus share the table,
 * trading some contention for a cache that is not excessively huge.
 *
 * NOTE(review): earlier text here described the table as being split
 * into per-cpu areas.  The hash computed by ncmount_cache_lookup4() has
 * no cpu component, so that description looks stale - confirm.
 */

/*
 * Argument/result block handed to cache_findmount_callback() through
 * mountlist_scan().
 */
struct findmount_info {
	struct mount *result;		/* out: matching mount (ref'd) or NULL */
	struct mount *nch_mount;	/* in: mount half of the nchandle */
	struct namecache *nch_ncp;	/* in: ncp half of the nchandle */
};
3342
3343 static __inline
3344 struct ncmount_cache *
3345 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp)
3346 {
3347         uint32_t hash;
3348
3349         hash = iscsi_crc32(&mp, sizeof(mp));
3350         hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash);
3351         hash ^= hash >> 16;
3352         hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1));
3353
3354         return (&ncmount_cache[hash]);
3355 }
3356
3357 static
3358 struct ncmount_cache *
3359 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
3360 {
3361         struct ncmount_cache *ncc;
3362         struct ncmount_cache *best;
3363         int delta;
3364         int best_delta;
3365         int i;
3366
3367         ncc = ncmount_cache_lookup4(mp, ncp);
3368
3369         /*
3370          * NOTE: When checking for a ticks overflow implement a slop of
3371          *       2 ticks just to be safe, because ticks is accessed
3372          *       non-atomically one CPU can increment it while another
3373          *       is still using the old value.
3374          */
3375         if (ncc->ncp == ncp && ncc->mp == mp)   /* 0 */
3376                 return ncc;
3377         delta = (int)(ticks - ncc->ticks);      /* beware GCC opts */
3378         if (delta < -2)                         /* overflow reset */
3379                 ncc->ticks = ticks;
3380         best = ncc;
3381         best_delta = delta;
3382
3383         for (i = 1; i < NCMOUNT_SET; ++i) {     /* 1, 2, 3 */
3384                 ++ncc;
3385                 if (ncc->ncp == ncp && ncc->mp == mp)
3386                         return ncc;
3387                 delta = (int)(ticks - ncc->ticks);
3388                 if (delta < -2)
3389                         ncc->ticks = ticks;
3390                 if (delta > best_delta) {
3391                         best_delta = delta;
3392                         best = ncc;
3393                 }
3394         }
3395         return best;
3396 }
3397
3398 /*
3399  * pcpu-optimized mount search.  Locate the recursive mountpoint, avoid
3400  * doing an expensive mountlist_scan*() if possible.
3401  *
3402  * (mp, ncp) -> mountonpt.k
3403  *
3404  * Returns a referenced mount pointer or NULL
3405  *
3406  * General SMP operation uses a per-cpu umount_spin to interlock unmount
3407  * operations (that is, where the mp_target can be freed out from under us).
3408  *
3409  * Lookups use the ncc->updating counter to validate the contents in order
3410  * to avoid having to obtain the per cache-element spin-lock.  In addition,
3411  * the ticks field is only updated when it changes.  However, if our per-cpu
3412  * lock fails due to an unmount-in-progress, we fall-back to the
3413  * cache-element's spin-lock.
3414  */
3415 struct mount *
3416 cache_findmount(struct nchandle *nch)
3417 {
3418         struct findmount_info info;
3419         struct ncmount_cache *ncc;
3420         struct ncmount_cache ncc_copy;
3421         struct mount *target;
3422         struct pcpu_ncache *pcpu;
3423         struct spinlock *spinlk;
3424         int update;
3425
3426         pcpu = pcpu_ncache;
3427         if (ncmount_cache_enable == 0 || pcpu == NULL) {
3428                 ncc = NULL;
3429                 goto skip;
3430         }
3431         pcpu += mycpu->gd_cpuid;
3432
3433 again:
3434         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3435         if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3436 found:
3437                 /*
3438                  * This is a bit messy for now because we do not yet have
3439                  * safe disposal of mount structures.  We have to ref
3440                  * ncc->mp_target but the 'update' counter only tell us
3441                  * whether the cache has changed after the fact.
3442                  *
3443                  * For now get a per-cpu spinlock that will only contend
3444                  * against umount's.  This is the best path.  If it fails,
3445                  * instead of waiting on the umount we fall-back to a
3446                  * shared ncc->spin lock, which will generally only cost a
3447                  * cache ping-pong.
3448                  */
3449                 update = ncc->updating;
3450                 if (__predict_true(spin_trylock(&pcpu->umount_spin))) {
3451                         spinlk = &pcpu->umount_spin;
3452                 } else {
3453                         spinlk = &ncc->spin;
3454                         spin_lock_shared(spinlk);
3455                 }
3456                 if (update & 1) {               /* update in progress */
3457                         spin_unlock_any(spinlk);
3458                         goto skip;
3459                 }
3460                 ncc_copy = *ncc;
3461                 cpu_lfence();
3462                 if (ncc->updating != update) {  /* content changed */
3463                         spin_unlock_any(spinlk);
3464                         goto again;
3465                 }
3466                 if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) {
3467                         spin_unlock_any(spinlk);
3468                         goto again;
3469                 }
3470                 if (ncc_copy.isneg == 0) {
3471                         target = ncc_copy.mp_target;
3472                         if (target->mnt_ncmounton.mount == nch->mount &&
3473                             target->mnt_ncmounton.ncp == nch->ncp) {
3474                                 /*
3475                                  * Cache hit (positive) (avoid dirtying
3476                                  * the cache line if possible)
3477                                  */
3478                                 if (ncc->ticks != (int)ticks)
3479                                         ncc->ticks = (int)ticks;
3480                                 _cache_mntref(target);
3481                         }
3482                 } else {
3483                         /*
3484                          * Cache hit (negative) (avoid dirtying
3485                          * the cache line if possible)
3486                          */
3487                         if (ncc->ticks != (int)ticks)
3488                                 ncc->ticks = (int)ticks;
3489                         target = NULL;
3490                 }
3491                 spin_unlock_any(spinlk);
3492
3493                 return target;
3494         }
3495 skip:
3496
3497         /*
3498          * Slow
3499          */
3500         info.result = NULL;
3501         info.nch_mount = nch->mount;
3502         info.nch_ncp = nch->ncp;
3503         mountlist_scan(cache_findmount_callback, &info,
3504                        MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK);
3505
3506         /*
3507          * To reduce multi-re-entry on the cache, relookup in the cache.
3508          * This can still race, obviously, but that's ok.
3509          */
3510         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3511         if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3512                 if (info.result)
3513                         atomic_add_int(&info.result->mnt_refs, -1);
3514                 goto found;
3515         }
3516
3517         /*
3518          * Cache the result.
3519          */
3520         if ((info.result == NULL ||
3521             (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) {
3522                 spin_lock(&ncc->spin);
3523                 atomic_add_int_nonlocked(&ncc->updating, 1);
3524                 cpu_sfence();
3525                 KKASSERT(ncc->updating & 1);
3526                 if (ncc->mp != nch->mount) {
3527                         if (ncc->mp)
3528                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
3529                         atomic_add_int(&nch->mount->mnt_refs, 1);
3530                         ncc->mp = nch->mount;
3531                 }
3532                 ncc->ncp = nch->ncp;    /* ptr compares only, not refd*/
3533                 ncc->ticks = (int)ticks;
3534
3535                 if (info.result) {
3536                         ncc->isneg = 0;
3537                         if (ncc->mp_target != info.result) {
3538                                 if (ncc->mp_target)
3539                                         atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3540                                 ncc->mp_target = info.result;
3541                                 atomic_add_int(&info.result->mnt_refs, 1);
3542                         }
3543                 } else {
3544                         ncc->isneg = 1;
3545                         if (ncc->mp_target) {
3546                                 atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3547                                 ncc->mp_target = NULL;
3548                         }
3549                 }
3550                 cpu_sfence();
3551                 atomic_add_int_nonlocked(&ncc->updating, 1);
3552                 spin_unlock(&ncc->spin);
3553         }
3554         return(info.result);
3555 }
3556
3557 static
3558 int
3559 cache_findmount_callback(struct mount *mp, void *data)
3560 {
3561         struct findmount_info *info = data;
3562
3563         /*
3564          * Check the mount's mounted-on point against the passed nch.
3565          */
3566         if (mp->mnt_ncmounton.mount == info->nch_mount &&
3567             mp->mnt_ncmounton.ncp == info->nch_ncp
3568         ) {
3569             info->result = mp;
3570             _cache_mntref(mp);
3571             return(-1);
3572         }
3573         return(0);
3574 }
3575
/*
 * Release a mount reference via _cache_mntrel(), e.g. one returned by
 * cache_findmount().
 */
void
cache_dropmount(struct mount *mp)
{
	_cache_mntrel(mp);
}
3581
3582 /*
3583  * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive
3584  * or negative).
3585  *
3586  * A full scan is not required, but for now just do it anyway.
3587  */
3588 void
3589 cache_ismounting(struct mount *mp)
3590 {
3591         struct ncmount_cache *ncc;
3592         struct mount *ncc_mp;
3593         int i;
3594
3595         if (pcpu_ncache == NULL)
3596                 return;
3597
3598         for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
3599                 ncc = &ncmount_cache[i];
3600                 if (ncc->mp != mp->mnt_ncmounton.mount ||
3601                     ncc->ncp != mp->mnt_ncmounton.ncp) {
3602                         continue;
3603                 }
3604                 spin_lock(&ncc->spin);
3605                 atomic_add_int_nonlocked(&ncc->updating, 1);
3606                 cpu_sfence();
3607                 KKASSERT(ncc->updating & 1);
3608                 if (ncc->mp != mp->mnt_ncmounton.mount ||
3609                     ncc->ncp != mp->mnt_ncmounton.ncp) {
3610                         cpu_sfence();
3611                         ++ncc->updating;
3612                         spin_unlock(&ncc->spin);
3613                         continue;
3614                 }
3615                 ncc_mp = ncc->mp;
3616                 ncc->ncp = NULL;
3617                 ncc->mp = NULL;
3618                 if (ncc_mp)
3619                         atomic_add_int(&ncc_mp->mnt_refs, -1);
3620                 ncc_mp = ncc->mp_target;
3621                 ncc->mp_target = NULL;
3622                 if (ncc_mp)
3623                         atomic_add_int(&ncc_mp->mnt_refs, -1);
3624                 ncc->ticks = (int)ticks - hz * 120;
3625
3626                 cpu_sfence();
3627                 atomic_add_int_nonlocked(&ncc->updating, 1);
3628                 spin_unlock(&ncc->spin);
3629         }
3630
3631         /*
3632          * Pre-cache the mount point
3633          */
3634         ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount,
3635                                    mp->mnt_ncmounton.ncp);
3636
3637         spin_lock(&ncc->spin);
3638         atomic_add_int_nonlocked(&ncc->updating, 1);
3639         cpu_sfence();
3640         KKASSERT(ncc->updating & 1);
3641
3642         if (ncc->mp)
3643                 atomic_add_int(&ncc->mp->mnt_refs, -1);
3644         atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1);
3645         ncc->mp = mp->mnt_ncmounton.mount;
3646         ncc->ncp = mp->mnt_ncmounton.ncp;       /* ptr compares only */
3647         ncc->ticks = (int)ticks;
3648
3649         ncc->isneg = 0;
3650         if (ncc->mp_target != mp) {
3651                 if (ncc->mp_target)
3652                         atomic_add_int(&ncc->mp_target->mnt_refs, -1);
3653                 ncc->mp_target = mp;
3654                 atomic_add_int(&mp->mnt_refs, 1);
3655         }
3656         cpu_sfence();
3657         atomic_add_int_nonlocked(&ncc->updating, 1);
3658         spin_unlock(&ncc->spin);
3659 }
3660
3661 /*
3662  * Scrap any ncmount_cache entries related to mp.  Not only do we need to
3663  * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any
3664  * negative hits involving (mp, <any>).
3665  *
3666  * A full scan is required.
3667  */
3668 void
3669 cache_unmounting(struct mount *mp)
3670 {
3671         struct ncmount_cache *ncc;
3672         struct pcpu_ncache *pcpu;
3673         struct mount *ncc_mp;
3674         int i;
3675
3676         pcpu = pcpu_ncache;
3677         if (pcpu == NULL)
3678                 return;
3679
3680         for (i = 0; i < ncpus; ++i)
3681                 spin_lock(&pcpu[i].umount_spin);
3682
3683         for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
3684                 ncc = &ncmount_cache[i];
3685                 if (ncc->mp != mp && ncc->mp_target != mp)
3686                         continue;
3687                 spin_lock(&ncc->spin);
3688                 atomic_add_int_nonlocked(&ncc->updating, 1);
3689                 cpu_sfence();
3690
3691                 if (ncc->mp != mp && ncc->mp_target != mp) {
3692                         atomic_add_int_nonlocked(&ncc->updating, 1);
3693                         cpu_sfence();
3694                         spin_unlock(&ncc->spin);
3695                         continue;
3696                 }
3697                 ncc_mp = ncc->mp;
3698                 ncc->ncp = NULL;
3699                 ncc->mp = NULL;
3700                 if (ncc_mp)
3701                         atomic_add_int(&ncc_mp->mnt_refs, -1);
3702                 ncc_mp = ncc->mp_target;
3703                 ncc->mp_target = NULL;
3704                 if (ncc_mp)
3705                         atomic_add_int(&ncc_mp->mnt_refs, -1);
3706                 ncc->ticks = (int)ticks - hz * 120;
3707
3708                 cpu_sfence();
3709                 atomic_add_int_nonlocked(&ncc->updating, 1);
3710                 spin_unlock(&ncc->spin);
3711         }
3712
3713         for (i = 0; i < ncpus; ++i)
3714                 spin_unlock(&pcpu[i].umount_spin);
3715 }
3716
3717 /*
3718  * Resolve an unresolved namecache entry, generally by looking it up.
3719  * The passed ncp must be locked and refd.
3720  *
3721  * Theoretically since a vnode cannot be recycled while held, and since
3722  * the nc_parent chain holds its vnode as long as children exist, the
3723  * direct parent of the cache entry we are trying to resolve should
3724  * have a valid vnode.  If not then generate an error that we can
3725  * determine is related to a resolver bug.
3726  *
3727  * However, if a vnode was in the middle of a recyclement when the NCP
3728  * got locked, ncp->nc_vp might point to a vnode that is about to become
3729  * invalid.  cache_resolve() handles this case by unresolving the entry
3730  * and then re-resolving it.
3731  *
3732  * Note that successful resolution does not necessarily return an error
3733  * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
3734  * will be returned.
3735  */
3736 int
3737 cache_resolve(struct nchandle *nch, struct ucred *cred)
3738 {
3739         struct namecache *par_tmp;
3740         struct namecache *par;
3741         struct namecache *ncp;
3742         struct nchandle nctmp;
3743         struct mount *mp;
3744         struct vnode *dvp;
3745         int error;
3746
3747         ncp = nch->ncp;
3748         mp = nch->mount;
3749         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
3750 restart:
3751         /*
3752          * If the ncp is already resolved we have nothing to do.  However,
3753          * we do want to guarentee that a usable vnode is returned when
3754          * a vnode is present, so make sure it hasn't been reclaimed.
3755          */
3756         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3757                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3758                         _cache_setunresolved(ncp);
3759                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
3760                         return (ncp->nc_error);
3761         }
3762
3763         /*
3764          * If the ncp was destroyed it will never resolve again.  This
3765          * can basically only happen when someone is chdir'd into an
3766          * empty directory which is then rmdir'd.  We want to catch this
3767          * here and not dive the VFS because the VFS might actually
3768          * have a way to re-resolve the disconnected ncp, which will
3769          * result in inconsistencies in the cdir/nch for proc->p_fd.
3770          */
3771         if (ncp->nc_flag & NCF_DESTROYED)
3772                 return(EINVAL);
3773
3774         /*
3775          * Mount points need special handling because the parent does not
3776          * belong to the same filesystem as the ncp.
3777          */
3778         if (ncp == mp->mnt_ncmountpt.ncp)
3779                 return (cache_resolve_mp(mp));
3780
3781         /*
3782          * We expect an unbroken chain of ncps to at least the mount point,
3783          * and even all the way to root (but this code doesn't have to go
3784          * past the mount point).
3785          */
3786         if (ncp->nc_parent == NULL) {
3787                 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
3788                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3789                 ncp->nc_error = EXDEV;
3790                 return(ncp->nc_error);
3791         }
3792
3793         /*
3794          * The vp's of the parent directories in the chain are held via vhold()
3795          * due to the existance of the child, and should not disappear.
3796          * However, there are cases where they can disappear:
3797          *
3798          *      - due to filesystem I/O errors.
3799          *      - due to NFS being stupid about tracking the namespace and
3800          *        destroys the namespace for entire directories quite often.
3801          *      - due to forced unmounts.
3802          *      - due to an rmdir (parent will be marked DESTROYED)
3803          *
3804          * When this occurs we have to track the chain backwards and resolve
3805          * it, looping until the resolver catches up to the current node.  We
3806          * could recurse here but we might run ourselves out of kernel stack
3807          * so we do it in a more painful manner.  This situation really should
3808          * not occur all that often, or if it does not have to go back too
3809          * many nodes to resolve the ncp.
3810          */
3811         while ((dvp = cache_dvpref(ncp)) == NULL) {
3812                 /*
3813                  * This case can occur if a process is CD'd into a
3814                  * directory which is then rmdir'd.  If the parent is marked
3815                  * destroyed there is no point trying to resolve it.
3816                  */
3817                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
3818                         return(ENOENT);
3819                 par = ncp->nc_parent;
3820                 _cache_hold(par);
3821                 _cache_lock(par);
3822                 while ((par_tmp = par->nc_parent) != NULL &&
3823                        par_tmp->nc_vp == NULL) {
3824                         _cache_hold(par_tmp);
3825                         _cache_lock(par_tmp);
3826                         _cache_put(par);
3827                         par = par_tmp;
3828                 }
3829                 if (par->nc_parent == NULL) {
3830                         kprintf("EXDEV case 2 %*.*s\n",
3831                                 par->nc_nlen, par->nc_nlen, par->nc_name);
3832                         _cache_put(par);
3833                         return (EXDEV);
3834                 }
3835                 /*
3836                  * The parent is not set in stone, ref and lock it to prevent
3837                  * it from disappearing.  Also note that due to renames it
3838                  * is possible for our ncp to move and for par to no longer
3839                  * be one of its parents.  We resolve it anyway, the loop
3840                  * will handle any moves.
3841                  */
3842                 _cache_get(par);        /* additional hold/lock */
3843                 _cache_put(par);        /* from earlier hold/lock */
3844                 if (par == nch->mount->mnt_ncmountpt.ncp) {
3845                         cache_resolve_mp(nch->mount);
3846                 } else if ((dvp = cache_dvpref(par)) == NULL) {
3847                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
3848                                 par->nc_nlen, par->nc_nlen, par->nc_name);
3849                         _cache_put(par);
3850                         continue;
3851                 } else {
3852                         if (par->nc_flag & NCF_UNRESOLVED) {
3853                                 nctmp.mount = mp;
3854                                 nctmp.ncp = par;
3855                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3856                         }
3857                         vrele(dvp);
3858                 }
3859                 if ((error = par->nc_error) != 0) {
3860                         if (par->nc_error != EAGAIN) {
3861                                 kprintf("EXDEV case 3 %*.*s error %d\n",
3862                                     par->nc_nlen, par->nc_nlen, par->nc_name,
3863                                     par->nc_error);
3864                                 _cache_put(par);
3865                                 return(error);
3866                         }
3867                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
3868                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
3869                 }
3870                 _cache_put(par);
3871                 /* loop */
3872         }
3873
3874         /*
3875          * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
3876          * ncp's and reattach them.  If this occurs the original ncp is marked
3877          * EAGAIN to force a relookup.
3878          *
3879          * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
3880          * ncp must already be resolved.
3881          */
3882         if (dvp) {
3883                 nctmp.mount = mp;
3884                 nctmp.ncp = ncp;
3885                 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3886                 vrele(dvp);
3887         } else {
3888                 ncp->nc_error = EPERM;
3889         }
3890         if (ncp->nc_error == EAGAIN) {
3891                 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
3892                         ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3893                 goto restart;
3894         }
3895         return(ncp->nc_error);
3896 }
3897
3898 /*
3899  * Resolve the ncp associated with a mount point.  Such ncp's almost always
3900  * remain resolved and this routine is rarely called.  NFS MPs tends to force
3901  * re-resolution more often due to its mac-truck-smash-the-namecache
3902  * method of tracking namespace changes.
3903  *
3904  * The semantics for this call is that the passed ncp must be locked on
3905  * entry and will be locked on return.  However, if we actually have to
3906  * resolve the mount point we temporarily unlock the entry in order to
3907  * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
3908  * the unlock we have to recheck the flags after we relock.
3909  */
3910 static int
3911 cache_resolve_mp(struct mount *mp)
3912 {
3913         struct namecache *ncp = mp->mnt_ncmountpt.ncp;
3914         struct vnode *vp;
3915         int error;
3916
3917         KKASSERT(mp != NULL);
3918
3919         /*
3920          * If the ncp is already resolved we have nothing to do.  However,
3921          * we do want to guarentee that a usable vnode is returned when
3922          * a vnode is present, so make sure it hasn't been reclaimed.
3923          */
3924         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3925                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3926                         _cache_setunresolved(ncp);
3927         }
3928
3929         if (ncp->nc_flag & NCF_UNRESOLVED) {
3930                 _cache_unlock(ncp);
3931                 while (vfs_busy(mp, 0))
3932                         ;
3933                 error = VFS_ROOT(mp, &vp);
3934                 _cache_lock(ncp);
3935
3936                 /*
3937                  * recheck the ncp state after relocking.
3938                  */
3939                 if (ncp->nc_flag & NCF_UNRESOLVED) {
3940                         ncp->nc_error = error;
3941                         if (error == 0) {
3942                                 _cache_setvp(mp, ncp, vp);
3943                                 vput(vp);
3944                         } else {
3945                                 kprintf("[diagnostic] cache_resolve_mp: failed"
3946                                         " to resolve mount %p err=%d ncp=%p\n",
3947                                         mp, error, ncp);
3948                                 _cache_setvp(mp, ncp, NULL);
3949                         }
3950                 } else if (error == 0) {
3951                         vput(vp);
3952                 }
3953                 vfs_unbusy(mp);
3954         }
3955         return(ncp->nc_error);
3956 }
3957
3958 /*
3959  * Clean out negative cache entries when too many have accumulated.
3960  */
3961 static void
3962 _cache_cleanneg(long count)
3963 {
3964         struct pcpu_ncache *pn;
3965         struct namecache *ncp;
3966         static uint32_t neg_rover;
3967         uint32_t n;
3968         long vnegs;
3969
3970         n = neg_rover++;        /* SMP heuristical, race ok */
3971         cpu_ccfence();
3972         n = n % (uint32_t)ncpus;
3973
3974         /*
3975          * Normalize vfscache_negs and count.  count is sometimes based
3976          * on vfscache_negs.  vfscache_negs is heuristical and can sometimes
3977          * have crazy values.
3978          */
3979         vnegs = vfscache_negs;
3980         cpu_ccfence();
3981         if (vnegs <= MINNEG)
3982                 vnegs = MINNEG;
3983         if (count < 1)
3984                 count = 1;
3985
3986         pn = &pcpu_ncache[n];
3987         spin_lock(&pn->neg_spin);
3988         count = pn->neg_count * count / vnegs + 1;
3989         spin_unlock(&pn->neg_spin);
3990
3991         /*
3992          * Attempt to clean out the specified number of negative cache
3993          * entries.
3994          */
3995         while (count > 0) {
3996                 spin_lock(&pn->neg_spin);
3997                 ncp = TAILQ_FIRST(&pn->neg_list);
3998                 if (ncp == NULL) {
3999                         spin_unlock(&pn->neg_spin);
4000                         break;
4001                 }
4002                 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
4003                 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
4004                 _cache_hold(ncp);
4005                 spin_unlock(&pn->neg_spin);
4006
4007                 /*
4008                  * This can race, so we must re-check that the ncp
4009                  * is on the ncneg.list after successfully locking it.
4010                  */
4011                 if (_cache_lock_special(ncp) == 0) {
4012                         if (ncp->nc_vp == NULL &&
4013                             (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4014                                 cache_zap(ncp);
4015                         } else {
4016                                 _cache_unlock(ncp);
4017                                 _cache_drop(ncp);
4018                         }
4019                 } else {
4020                         _cache_drop(ncp);
4021                 }
4022                 --count;
4023         }
4024 }
4025
4026 /*
4027  * Clean out positive cache entries when too many have accumulated.
4028  */
4029 static void
4030 _cache_cleanpos(long count)
4031 {
4032         static volatile int rover;
4033         struct nchash_head *nchpp;
4034         struct namecache *ncp;
4035         int rover_copy;
4036
4037         /*
4038          * Attempt to clean out the specified number of negative cache
4039          * entries.
4040          */
4041         while (count > 0) {
4042                 rover_copy = ++rover;   /* MPSAFEENOUGH */
4043                 cpu_ccfence();
4044                 nchpp = NCHHASH(rover_copy);
4045
4046                 if (TAILQ_FIRST(&nchpp->list) == NULL) {
4047                         --count;
4048                         continue;
4049                 }
4050
4051                 /*
4052                  * Cycle ncp on list, ignore and do not move DUMMY
4053                  * ncps.  These are temporary list iterators.
4054                  *
4055                  * We must cycle the ncp to the end of the list to
4056                  * ensure that all ncp's have an equal chance of
4057                  * being removed.
4058                  */
4059                 spin_lock(&nchpp->spin);
4060                 ncp = TAILQ_FIRST(&nchpp->list);
4061                 while (ncp && (ncp->nc_flag & NCF_DUMMY))
4062                         ncp = TAILQ_NEXT(ncp, nc_hash);
4063                 if (ncp) {
4064                         TAILQ_REMOVE(&nchpp->list, ncp, nc_hash);
4065                         TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash);
4066                         _cache_hold(ncp);
4067                 }
4068                 spin_unlock(&nchpp->spin);
4069
4070                 if (ncp) {
4071                         if (_cache_lock_special(ncp) == 0) {
4072                                 cache_zap(ncp);
4073                         } else {
4074                                 _cache_drop(ncp);
4075                         }
4076                 }
4077                 --count;
4078         }
4079 }
4080
4081 /*
4082  * This is a kitchen sink function to clean out ncps which we
4083  * tried to zap from cache_drop() but failed because we were
4084  * unable to acquire the parent lock.
4085  *
4086  * Such entries can also be removed via cache_inval_vp(), such
4087  * as when unmounting.
4088  */
4089 static void
4090 _cache_cleandefered(void)
4091 {
4092         struct nchash_head *nchpp;
4093         struct namecache *ncp;
4094         struct namecache dummy;
4095         int i;
4096
4097         /*
4098          * Create a list iterator.  DUMMY indicates that this is a list
4099          * iterator, DESTROYED prevents matches by lookup functions.
4100          */
4101         numdefered = 0;
4102         pcpu_ncache[mycpu->gd_cpuid].numdefered = 0;
4103         bzero(&dummy, sizeof(dummy));
4104         dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY;
4105         dummy.nc_refs = 1;
4106
4107         for (i = 0; i <= nchash; ++i) {
4108                 nchpp = &nchashtbl[i];
4109
4110                 spin_lock(&nchpp->spin);
4111                 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
4112                 ncp = &dummy;
4113                 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) {
4114                         if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
4115                                 continue;
4116                         TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4117                         TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash);
4118                         _cache_hold(ncp);
4119                         spin_unlock(&nchpp->spin);
4120                         if (_cache_lock_nonblock(ncp) == 0) {
4121                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
4122                                 _cache_unlock(ncp);
4123                         }
4124                         _cache_drop(ncp);
4125                         spin_lock(&nchpp->spin);
4126                         ncp = &dummy;
4127                 }
4128                 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4129                 spin_unlock(&nchpp->spin);
4130         }
4131 }
4132
4133 /*
4134  * Name cache initialization, from vfsinit() when we are booting
4135  */
4136 void
4137 nchinit(void)
4138 {
4139         struct pcpu_ncache *pn;
4140         globaldata_t gd;
4141         int i;
4142
4143         /*
4144          * Per-cpu accounting and negative hit list
4145          */
4146         pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus,
4147                               M_VFSCACHE, M_WAITOK|M_ZERO);
4148         for (i = 0; i < ncpus; ++i) {
4149                 pn = &pcpu_ncache[i];
4150                 TAILQ_INIT(&pn->neg_list);
4151                 spin_init(&pn->neg_spin, "ncneg");
4152                 spin_init(&pn->umount_spin, "ncumm");
4153         }
4154
4155         /*
4156          * Initialise per-cpu namecache effectiveness statistics.
4157          */
4158         for (i = 0; i < ncpus; ++i) {
4159                 gd = globaldata_find(i);
4160                 gd->gd_nchstats = &nchstats[i];
4161         }
4162
4163         /*
4164          * Create a generous namecache hash table
4165          */
4166         nchashtbl = hashinit_ext(vfs_inodehashsize(),
4167                                  sizeof(struct nchash_head),
4168                                  M_VFSCACHE, &nchash);
4169         for (i = 0; i <= (int)nchash; ++i) {
4170                 TAILQ_INIT(&nchashtbl[i].list);
4171                 spin_init(&nchashtbl[i].spin, "nchinit_hash");
4172         }
4173         for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
4174                 spin_init(&ncmount_cache[i].spin, "nchinit_cache");
4175         nclockwarn = 5 * hz;
4176 }
4177
4178 /*
4179  * Called from start_init() to bootstrap the root filesystem.  Returns
4180  * a referenced, unlocked namecache record.
4181  */
4182 void
4183 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
4184 {
4185         nch->ncp = cache_alloc(0);
4186         nch->mount = mp;
4187         _cache_mntref(mp);
4188         if (vp)
4189                 _cache_setvp(nch->mount, nch->ncp, vp);
4190 }
4191
4192 /*
4193  * vfs_cache_setroot()
4194  *
4195  *      Create an association between the root of our namecache and
4196  *      the root vnode.  This routine may be called several times during
4197  *      booting.
4198  *
4199  *      If the caller intends to save the returned namecache pointer somewhere
4200  *      it must cache_hold() it.
4201  */
4202 void
4203 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
4204 {
4205         struct vnode *ovp;
4206         struct nchandle onch;
4207
4208         ovp = rootvnode;
4209         onch = rootnch;
4210         rootvnode = nvp;
4211         if (nch)
4212                 rootnch = *nch;
4213         else
4214                 cache_zero(&rootnch);
4215         if (ovp)
4216                 vrele(ovp);
4217         if (onch.ncp)
4218                 cache_drop(&onch);
4219 }
4220
4221 /*
4222  * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
4223  * topology and is being removed as quickly as possible.  The new VOP_N*()
4224  * API calls are required to make specific adjustments using the supplied
4225  * ncp pointers rather then just bogusly purging random vnodes.
4226  *
4227  * Invalidate all namecache entries to a particular vnode as well as
4228  * any direct children of that vnode in the namecache.  This is a
4229  * 'catch all' purge used by filesystems that do not know any better.
4230  *
4231  * Note that the linkage between the vnode and its namecache entries will
4232  * be removed, but the namecache entries themselves might stay put due to
4233  * active references from elsewhere in the system or due to the existance of
4234  * the children.   The namecache topology is left intact even if we do not
4235  * know what the vnode association is.  Such entries will be marked
4236  * NCF_UNRESOLVED.
4237  */
4238 void
4239 cache_purge(struct vnode *vp)
4240 {
4241         cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
4242 }
4243
/*
 * debug.disablecwd: when non-zero, sys___getcwd() returns ENODEV without
 * attempting a lookup.
 */
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable getcwd");

/*
 * Read-only statistics maintained by kern_getcwd().  They are bumped with
 * plain (non-atomic) increments.
 */
static u_long numcwdcalls;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
    "Number of current directory resolution calls");
/* Incremented when the upward walk runs out of parents (ENOENT). */
static u_long numcwdfailnf;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
    "Number of current directory failures due to lack of file");
/* Incremented when the path does not fit the supplied buffer (ERANGE). */
static u_long numcwdfailsz;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
    "Number of current directory failures due to large result");
/* Incremented on every successful resolution. */
static u_long numcwdfound;
SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
    "Number of current directory resolution successes");
4260
4261 /*
4262  * MPALMOSTSAFE
4263  */
4264 int
4265 sys___getcwd(struct __getcwd_args *uap)
4266 {
4267         u_int buflen;
4268         int error;
4269         char *buf;
4270         char *bp;
4271
4272         if (disablecwd)
4273                 return (ENODEV);
4274
4275         buflen = uap->buflen;
4276         if (buflen == 0)
4277                 return (EINVAL);
4278         if (buflen > MAXPATHLEN)
4279                 buflen = MAXPATHLEN;
4280
4281         buf = kmalloc(buflen, M_TEMP, M_WAITOK);
4282         bp = kern_getcwd(buf, buflen, &error);
4283         if (error == 0)
4284                 error = copyout(bp, uap->buf, strlen(bp) + 1);
4285         kfree(buf, M_TEMP);
4286         return (error);
4287 }
4288
4289 char *
4290 kern_getcwd(char *buf, size_t buflen, int *error)
4291 {
4292         struct proc *p = curproc;
4293         char *bp;
4294         int i, slash_prefixed;
4295         struct filedesc *fdp;
4296         struct nchandle nch;
4297         struct namecache *ncp;
4298
4299         numcwdcalls++;
4300         bp = buf;
4301         bp += buflen - 1;
4302         *bp = '\0';
4303         fdp = p->p_fd;
4304         slash_prefixed = 0;
4305
4306         nch = fdp->fd_ncdir;
4307         ncp = nch.ncp;
4308         if (ncp)
4309                 _cache_hold(ncp);
4310
4311         while (ncp && (ncp != fdp->fd_nrdir.ncp ||
4312                nch.mount != fdp->fd_nrdir.mount)
4313         ) {
4314                 /*
4315                  * While traversing upwards if we encounter the root
4316                  * of the current mount we have to skip to the mount point
4317                  * in the underlying filesystem.
4318                  */
4319                 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
4320                         nch = nch.mount->mnt_ncmounton;
4321                         _cache_drop(ncp);
4322                         ncp = nch.ncp;
4323                         if (ncp)
4324                                 _cache_hold(ncp);
4325                         continue;
4326                 }
4327
4328                 /*
4329                  * Prepend the path segment
4330                  */
4331                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4332                         if (bp == buf) {
4333                                 numcwdfailsz++;
4334                                 *error = ERANGE;
4335                                 bp = NULL;
4336                                 goto done;
4337                         }
4338                         *--bp = ncp->nc_name[i];
4339                 }
4340                 if (bp == buf) {
4341                         numcwdfailsz++;
4342                         *error = ERANGE;
4343                         bp = NULL;
4344                         goto done;
4345                 }
4346                 *--bp = '/';
4347                 slash_prefixed = 1;
4348
4349                 /*
4350                  * Go up a directory.  This isn't a mount point so we don't
4351                  * have to check again.
4352                  */
4353                 while ((nch.ncp = ncp->nc_parent) != NULL) {
4354                         if (ncp_shared_lock_disable)
4355                                 _cache_lock(ncp);
4356                         else
4357                                 _cache_lock_shared(ncp);
4358                         if (nch.ncp != ncp->nc_parent) {
4359                                 _cache_unlock(ncp);
4360                                 continue;
4361                         }
4362                         _cache_hold(nch.ncp);
4363                         _cache_unlock(ncp);
4364                         break;
4365                 }
4366                 _cache_drop(ncp);
4367                 ncp = nch.ncp;
4368         }
4369         if (ncp == NULL) {
4370                 numcwdfailnf++;
4371                 *error = ENOENT;
4372                 bp = NULL;
4373                 goto done;
4374         }
4375         if (!slash_prefixed) {
4376                 if (bp == buf) {
4377                         numcwdfailsz++;
4378                         *error = ERANGE;
4379                         bp = NULL;
4380                         goto done;
4381                 }
4382                 *--bp = '/';
4383         }
4384         numcwdfound++;
4385         *error = 0;
4386 done:
4387         if (ncp)
4388                 _cache_drop(ncp);
4389         return (bp);
4390 }
4391
4392 /*
4393  * Thus begins the fullpath magic.
4394  *
4395  * The passed nchp is referenced but not locked.
4396  */
4397 static int disablefullpath;
4398 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
4399     &disablefullpath, 0,
4400     "Disable fullpath lookups");
4401
4402 int
4403 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
4404                char **retbuf, char **freebuf, int guess)
4405 {
4406         struct nchandle fd_nrdir;
4407         struct nchandle nch;
4408         struct namecache *ncp;
4409         struct mount *mp, *new_mp;
4410         char *bp, *buf;
4411         int slash_prefixed;
4412         int error = 0;
4413         int i;
4414
4415         *retbuf = NULL;
4416         *freebuf = NULL;
4417
4418         buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
4419         bp = buf + MAXPATHLEN - 1;
4420         *bp = '\0';
4421         if (nchbase)
4422                 fd_nrdir = *nchbase;
4423         else if (p != NULL)
4424                 fd_nrdir = p->p_fd->fd_nrdir;
4425         else
4426                 fd_nrdir = rootnch;
4427         slash_prefixed = 0;
4428         nch = *nchp;
4429         ncp = nch.ncp;
4430         if (ncp)
4431                 _cache_hold(ncp);
4432         mp = nch.mount;
4433
4434         while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
4435                 new_mp = NULL;
4436
4437                 /*
4438                  * If we are asked to guess the upwards path, we do so whenever
4439                  * we encounter an ncp marked as a mountpoint. We try to find
4440                  * the actual mountpoint by finding the mountpoint with this
4441                  * ncp.
4442                  */
4443                 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
4444                         new_mp = mount_get_by_nc(ncp);
4445                 }
4446                 /*
4447                  * While traversing upwards if we encounter the root
4448                  * of the current mount we have to skip to the mount point.
4449                  */
4450                 if (ncp == mp->mnt_ncmountpt.ncp) {
4451                         new_mp = mp;
4452                 }
4453                 if (new_mp) {
4454                         nch = new_mp->mnt_ncmounton;
4455                         _cache_drop(ncp);
4456                         ncp = nch.ncp;
4457                         if (ncp)
4458                                 _cache_hold(ncp);
4459                         mp = nch.mount;
4460                         continue;
4461                 }
4462
4463                 /*
4464                  * Prepend the path segment
4465                  */
4466                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
4467                         if (bp == buf) {
4468                                 kfree(buf, M_TEMP);
4469                                 error = ENOMEM;
4470                                 goto done;
4471                         }
4472                         *--bp = ncp->nc_name[i];
4473                 }
4474                 if (bp == buf) {
4475                         kfree(buf, M_TEMP);
4476                         error = ENOMEM;
4477                         goto done;
4478                 }
4479                 *--bp = '/';
4480                 slash_prefixed = 1;
4481
4482                 /*
4483                  * Go up a directory.  This isn't a mount point so we don't
4484                  * have to check again.
4485                  *
4486                  * We can only safely access nc_parent with ncp held locked.
4487                  */
4488                 while ((nch.ncp = ncp->nc_parent) != NULL) {
4489                         _cache_lock_shared(ncp);
4490                         if (nch.ncp != ncp->nc_parent) {
4491                                 _cache_unlock(ncp);
4492                                 continue;
4493                         }
4494                         _cache_hold(nch.ncp);
4495                         _cache_unlock(ncp);
4496                         break;
4497                 }
4498                 _cache_drop(ncp);
4499                 ncp = nch.ncp;
4500         }
4501         if (ncp == NULL) {
4502                 kfree(buf, M_TEMP);
4503                 error = ENOENT;
4504                 goto done;
4505         }
4506
4507         if (!slash_prefixed) {
4508                 if (bp == buf) {
4509                         kfree(buf, M_TEMP);
4510                         error = ENOMEM;
4511                         goto done;
4512                 }
4513                 *--bp = '/';
4514         }
4515         *retbuf = bp;
4516         *freebuf = buf;
4517         error = 0;
4518 done:
4519         if (ncp)
4520                 _cache_drop(ncp);
4521         return(error);
4522 }
4523
4524 int
4525 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
4526             char **freebuf, int guess)
4527 {
4528         struct namecache *ncp;
4529         struct nchandle nch;
4530         int error;
4531
4532         *freebuf = NULL;
4533         if (disablefullpath)
4534                 return (ENODEV);
4535
4536         if (p == NULL)
4537                 return (EINVAL);
4538
4539         /* vn is NULL, client wants us to use p->p_textvp */
4540         if (vn == NULL) {
4541                 if ((vn = p->p_textvp) == NULL)
4542                         return (EINVAL);
4543         }
4544         spin_lock_shared(&vn->v_spin);
4545         TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
4546                 if (ncp->nc_nlen)
4547                         break;
4548         }
4549         if (ncp == NULL) {
4550                 spin_unlock_shared(&vn->v_spin);
4551                 return (EINVAL);
4552         }
4553         _cache_hold(ncp);
4554         spin_unlock_shared(&vn->v_spin);
4555
4556         nch.ncp = ncp;
4557         nch.mount = vn->v_mount;
4558         error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
4559         _cache_drop(ncp);
4560         return (error);
4561 }
4562
4563 void
4564 vfscache_rollup_cpu(struct globaldata *gd)
4565 {
4566         struct pcpu_ncache *pn;
4567         long count;
4568
4569         if (pcpu_ncache == NULL)
4570                 return;
4571         pn = &pcpu_ncache[gd->gd_cpuid];
4572
4573         if (pn->vfscache_count) {
4574                 count = atomic_swap_long(&pn->vfscache_count, 0);
4575                 atomic_add_long(&vfscache_count, count);
4576         }
4577         if (pn->vfscache_leafs) {
4578                 count = atomic_swap_long(&pn->vfscache_leafs, 0);
4579                 atomic_add_long(&vfscache_leafs, count);
4580         }
4581         if (pn->vfscache_negs) {
4582                 count = atomic_swap_long(&pn->vfscache_negs, 0);
4583                 atomic_add_long(&vfscache_negs, count);
4584         }
4585         if (pn->numdefered) {
4586                 count = atomic_swap_long(&pn->numdefered, 0);
4587                 atomic_add_long(&numdefered, count);
4588         }
4589 }