sys/kern/vfs_cache.c

   1 /*
   2  * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1989, 1993, 1995
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * This code is derived from software contributed to Berkeley by
  38  * Poul-Henning Kamp of the FreeBSD Project.
  39  *
  40  * Redistribution and use in source and binary forms, with or without
  41  * modification, are permitted provided that the following conditions
  42  * are met:
  43  * 1. Redistributions of source code must retain the above copyright
  44  *    notice, this list of conditions and the following disclaimer.
  45  * 2. Redistributions in binary form must reproduce the above copyright
  46  *    notice, this list of conditions and the following disclaimer in the
  47  *    documentation and/or other materials provided with the distribution.
  48  * 3. Neither the name of the University nor the names of its contributors
  49  *    may be used to endorse or promote products derived from this software
  50  *    without specific prior written permission.
  51  *
  52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  62  * SUCH DAMAGE.
  63  */
  64
  65 #include <sys/param.h>
  66 #include <sys/systm.h>
  67 #include <sys/uio.h>
  68 #include <sys/kernel.h>
  69 #include <sys/sysctl.h>
  70 #include <sys/mount.h>
  71 #include <sys/vnode.h>
  72 #include <sys/malloc.h>
  73 #include <sys/sysmsg.h>
  74 #include <sys/spinlock.h>
  75 #include <sys/proc.h>
  76 #include <sys/nlookup.h>
  77 #include <sys/filedesc.h>
  78 #include <sys/fnv_hash.h>
  79 #include <sys/globaldata.h>
  80 #include <sys/kern_syscall.h>
  81 #include <sys/dirent.h>
  82 #include <ddb/ddb.h>
  83
  84 #include <sys/spinlock2.h>
  85
/*
 * Upper bound on a recursion depth.
 * NOTE(review): the code that consumes this limit is not visible in this
 * part of the file -- confirm which path it bounds.
 */
#define MAX_RECURSION_DEPTH     64
  87
  88 /*
  89  * Random lookups in the cache are accomplished with a hash table using
  90  * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock,
  91  * but we use the ncp->update counter trick to avoid acquiring any
  92  * contestable spin-locks during a lookup.
  93  *
  94  * Negative entries may exist and correspond to resolved namecache
  95  * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
  96  * will be set if the entry corresponds to a whited-out directory entry
   97  * (versus simply not finding the entry at all).  pcpu_ncache[n].neg_list
  98  * is locked via pcpu_ncache[n].neg_spin;
  99  *
 100  * MPSAFE RULES:
 101  *
 102  * (1) ncp's typically have at least a nc_refs of 1, and usually 2.  One
 103  *     is applicable to direct lookups via the hash table nchpp or via
 104  *     nc_list (the two are added or removed together).  Removal of the ncp
 105  *     from the hash table drops this reference.  The second is applicable
 106  *     to vp->v_namecache linkages (or negative list linkages), and removal
 107  *     of the ncp from these lists drops this reference.
 108  *
 109  *     On the 1->0 transition of nc_refs the ncp can no longer be referenced
 110  *     and must be destroyed.  No other thread should have access to it at
 111  *     this point so it can be safely locked and freed without any deadlock
 112  *     fears.
 113  *
 114  *     The 1->0 transition can occur at almost any juncture and so cache_drop()
 115  *     deals with it directly.
 116  *
 117  * (2) Once the 1->0 transition occurs, the entity that caused the transition
 118  *     will be responsible for destroying the ncp.  The ncp cannot be on any
 119  *     list or hash at this time, or be held by anyone other than the caller
 120  *     responsible for the transition.
 121  *
 122  * (3) A ncp must be locked in order to modify it.
 123  *
 124  * (5) ncp locks are ordered, child-to-parent.  Child first, then parent.
 125  *     This may seem backwards but forward-scans use the hash table and thus
 126  *     can hold the parent unlocked while traversing downward.  Deletions,
  127  *     on the other-hand, tend to propagate bottom-up since the ref on the
  128  *     parent is dropped as the children go away.
 129  *
 130  * (6) Both parent and child must be locked in order to enter the child onto
 131  *     the parent's nc_list.
 132  */
 133
/*
 * Structures associated with name caching.
 *
 * NCHHASH() selects the hash chain head for a hash value by AND-ing the
 * value with nchash and indexing nchashtbl[].
 *
 * NOTE(review): the consumers of MINNEG, MINPOS, NCMOUNT_NUMCACHE and
 * NCMOUNT_SET (other than the ncmount_cache[] array size) are not visible
 * in this part of the file.
 */
#define NCHHASH(hash)           (&nchashtbl[(hash) & nchash])
#define MINNEG                  1024
#define MINPOS                  1024
#define NCMOUNT_NUMCACHE        (16384) /* power of 2 */
#define NCMOUNT_SET             (8)     /* power of 2 */

/*
 * Malloc types.  _cache_drop() frees the namecache structure itself with
 * M_VFSCACHE and its name string (nc_name) with M_VFSCACHEAUX.
 */
MALLOC_DEFINE_OBJ(M_VFSCACHE, sizeof(struct namecache),
                  "namecache", "namecache entries");
MALLOC_DEFINE(M_VFSCACHEAUX, "namecachestr", "namecache strings");

/* List head type embedded in struct nchash_head (one list per hash chain). */
TAILQ_HEAD(nchash_list, namecache);
 148
/*
 * Head of one hash chain: the list of namecache entries hashed to this
 * bucket (linked through nc_hash) and the chain's own spinlock.
 *
 * Don't cachealign, but at least pad to 32 bytes so entries
 * don't cross a cache line.
 */
struct nchash_head {
       struct nchash_list list; /* 16 bytes */
       struct spinlock  spin;   /* 8 bytes */
       long     pad01;          /* 8 bytes */
};
 158
/*
 * Element of the global ncmount_cache[] table (NCMOUNT_NUMCACHE entries).
 *
 * NOTE(review): the code that fills and searches this table is outside the
 * visible part of the file; the per-field notes below are inferred from the
 * field names only and must be confirmed against that code.
 */
struct ncmount_cache {
        struct spinlock spin;           /* per-entry spinlock */
        struct namecache *ncp;          /* presumably the cached lookup key */
        struct mount *mp;               /* presumably the mount owning ncp */
        struct mount *mp_target;        /* presumably the mount found at ncp */
        int isneg;                      /* presumably a negative-hit marker */
        int ticks;                      /* presumably a last-use timestamp */
        int updating;                   /* presumably an update-in-progress flag */
        int unused01;
};
 169
/*
 * Per-cpu namecache state (indexed by cpu id via pcpu_ncache[]).
 *
 * Holds the per-cpu negative-entry list and per-cpu statistics counters.
 * The counter names mirror the file-scope statistics exported under
 * vfs.cache (vfscache_negs, vfscache_count, ...), which are described
 * elsewhere in this file as rolled-up globals; e.g. _cache_link_parent()
 * adjusts vfscache_count/vfscache_leafs/vfscache_unres here with
 * atomic_add_long().
 */
struct pcpu_ncache {
        struct spinlock         umount_spin;    /* cache_findmount/interlock */
        struct spinlock         neg_spin;       /* for neg_list and neg_count */
        struct namecache_list   neg_list;
        long                    neg_count;
        long                    vfscache_negs;
        long                    vfscache_count;
        long                    vfscache_leafs;
        long                    vfscache_unres;
        long                    numdefered;
        long                    inv_kid_quick_count;
        long                    inv_ncp_quick_count;
        long                    clean_pos_count;
        long                    clean_neg_count;
} __cachealign;
 185
/*
 * File-scope tables.  nchashtbl is indexed through NCHHASH(); pcpu_ncache
 * is indexed by cpu id (see _cache_link_parent()).  Both are pointers, so
 * they are set up elsewhere (allocation is not visible in this part of the
 * file).  ncmount_cache[] is a statically sized table.
 */
__read_mostly static struct nchash_head *nchashtbl;
__read_mostly static struct pcpu_ncache *pcpu_ncache;
static struct ncmount_cache     ncmount_cache[NCMOUNT_NUMCACHE];
 189
/*
 * Tunables and read-only values exported under the debug.* sysctl tree.
 */

/*
 * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 * to create the namecache infrastructure leading to a dangling vnode.
 *
 * 0    Only errors are reported
 * 1    Successes are reported
 * 2    Successes + the whole directory scan is reported
 * 3    Force the directory scan code run as if the parent vnode did not
 *      have a namecache record, even if it does have one.
 */
__read_mostly int       ncvp_debug;
SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
    "Namecache debug level (0-3)");

/*
 * NOTE: although described as a size, NCHHASH() uses this value as an
 * AND-mask applied to the hash.
 */
__read_mostly static u_long nchash;             /* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
    "Size of namecache hash table");

__read_mostly static int ncnegflush = 10;       /* burst for negative flush */
SYSCTL_INT(_debug, OID_AUTO, ncnegflush, CTLFLAG_RW, &ncnegflush, 0,
    "Batch flush negative entries");

__read_mostly static int ncposflush = 10;       /* burst for positive flush */
SYSCTL_INT(_debug, OID_AUTO, ncposflush, CTLFLAG_RW, &ncposflush, 0,
    "Batch flush positive entries");

__read_mostly static int ncnegfactor = 16;      /* ratio of negative entries */
SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
    "Ratio of negative namecache entries");

__read_mostly static int ncposfactor = 16;    /* ratio of unres+leaf entries */
SYSCTL_INT(_debug, OID_AUTO, ncposfactor, CTLFLAG_RW, &ncposfactor, 0,
    "Ratio of unresolved leaf namecache entries");

/*
 * Used by _cache_lock()/_cache_lock_shared() when computing the elapsed
 * time reported in their "[diagnostic]" messages.
 */
__read_mostly static int nclockwarn;    /* warn on locked entries in ticks */
SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
    "Warn on locked namecache entries in ticks");

/*
 * NOTE(review): the name suggests a limit on positive entries, while the
 * comment and description speak of entries "allocated" -- confirm which
 * is intended; the consumer is not visible in this part of the file.
 */
__read_mostly static int ncposlimit;    /* number of cache entries allocated */
SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
    "Number of cache entries allocated");

__read_mostly static int ncp_shared_lock_disable = 0;
SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
           &ncp_shared_lock_disable, 0, "Disable shared namecache locks");

/* Read-only constants: structure sizes, passed as the sysctl's value. */
SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
    "sizeof(struct vnode)");
SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
    "sizeof(struct namecache)");

__read_mostly static int ncmount_cache_enable = 1;
SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
           &ncmount_cache_enable, 0, "mount point cache");
 244
/*
 * Forward declarations for file-local routines that are used before their
 * definitions (e.g. cache_clearmntcache() calls _cache_drop() and
 * _cache_lock_special() calls _cache_setunresolved()).
 */
static __inline void _cache_drop(struct namecache *ncp);
static int cache_resolve_mp(struct mount *mp, int adjgen);
static int cache_findmount_callback(struct mount *mp, void *data);
static void _cache_setunresolved(struct namecache *ncp, int adjgen);
static void _cache_cleanneg(long count);
static void _cache_cleanpos(long ucount, long xcount);
static void _cache_cleandefered(void);
static void _cache_unlink(struct namecache *ncp);
 253
/*
 * The new name cache statistics (these are rolled up globals and not
 * modified in the critical path, see struct pcpu_ncache).
 *
 * Each global below has a same-named per-cpu counter in struct
 * pcpu_ncache.  All are exported read-only.
 */
SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
static long vfscache_negs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &vfscache_negs, 0,
    "Number of negative namecache entries");
static long vfscache_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &vfscache_count, 0,
    "Number of namecaches entries");
static long vfscache_leafs;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numleafs, CTLFLAG_RD, &vfscache_leafs, 0,
    "Number of leaf namecaches entries");
static long vfscache_unres;
SYSCTL_LONG(_vfs_cache, OID_AUTO, numunres, CTLFLAG_RD, &vfscache_unres, 0,
    "Number of unresolved leaf namecaches entries");

static long     inv_kid_quick_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, inv_kid_quick_count, CTLFLAG_RD,
            &inv_kid_quick_count, 0,
            "quick kid invalidations");
static long     inv_ncp_quick_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, inv_ncp_quick_count, CTLFLAG_RD,
            &inv_ncp_quick_count, 0,
            "quick ncp invalidations");
static long     clean_pos_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, clean_pos_count, CTLFLAG_RD,
            &clean_pos_count, 0,
            "positive ncp cleanings");
static long     clean_neg_count;
SYSCTL_LONG(_vfs_cache, OID_AUTO, clean_neg_count, CTLFLAG_RD,
            &clean_neg_count, 0,
            "negative ncp cleanings");

/*
 * NOTE(review): this description string is identical to the one used for
 * debug.ncposlimit and looks copy-pasted; it probably should describe
 * deferred entries instead.  Left unchanged here because the exact meaning
 * of the counter is not visible in this part of the file.
 */
static long     numdefered;
SYSCTL_LONG(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
    "Number of cache entries allocated");
 292
 293 /*
 294  * Returns the number of basic references expected on the ncp, not
 295  * including any children.  1 for the natural ref, and an addition ref
 296  * if the ncp is resolved (representing a positive or negative hit).
 297  */
 298 static __inline int
 299 ncpbaserefs(struct namecache *ncp)
 300 {
 301         return (1 + ((ncp->nc_flag & NCF_UNRESOLVED) == 0));
 302 }
 303
 304 struct nchstats nchstats[SMP_MAXCPU];
 305 /*
 306  * Export VFS cache effectiveness statistics to user-land.
 307  *
 308  * The statistics are left for aggregation to user-land so
 309  * neat things can be achieved, like observing per-CPU cache
 310  * distribution.
 311  */
 312 static int
 313 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 314 {
 315         struct globaldata *gd;
 316         int i, error;
 317
 318         error = 0;
 319         for (i = 0; i < ncpus; ++i) {
 320                 gd = globaldata_find(i);
 321                 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
 322                         sizeof(struct nchstats))))
 323                         break;
 324         }
 325
 326         return (error);
 327 }
 328 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
 329   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
 330
/* Forward declaration of a file-local routine used before its definition. */
static int cache_zap(struct namecache *ncp);

/*
 * Cache mount points and namecache records in order to avoid unnecessary
 * atomic ops on mnt_refs and ncp->refs.  This improves concurrent SMP
 * performance and is particularly important on multi-socket systems to
 * reduce cache-line ping-ponging.
 *
 * Each cpu owns one struct mntcache.  A pointer is hashed to the first
 * slot of a MNTCACHE_SET-sized group of slots (see _cache_mntcache_hash())
 * and the group is then searched linearly.
 *
 * Try to keep the pcpu structure within one cache line (~64 bytes).
 * NOTE(review): with MNTCACHE_COUNT (32) elements of two pointers and two
 * ints each, struct mntcache is considerably larger than 64 bytes, so the
 * sentence above looks stale -- confirm.
 */
#define MNTCACHE_COUNT  32      /* power of 2, multiple of SET */
#define MNTCACHE_SET    8       /* set associativity */

/*
 * One cache slot.  A non-NULL mp holds a parked mnt_refs reference:
 * _cache_mntrel() stores the pointer without decrementing mnt_refs and
 * _cache_mntref() takes it back without incrementing.  A non-NULL ncp
 * likewise holds a reference that cache_clearmntcache() releases with
 * _cache_drop().  ticks records the global ticks value at the time the mp
 * slot was last filled and is used to pick a replacement victim.
 */
struct mntcache_elm {
        struct namecache *ncp;
        struct mount     *mp;
        int     ticks;
        int     unused01;
};

struct mntcache {
        struct mntcache_elm array[MNTCACHE_COUNT];
} __cachealign;

/* Indexed by cpu id (mycpu->gd_cpuid). */
static struct mntcache  pcpu_mntcache[MAXCPU];
 356
/*
 * Mark the start of a modification to ncp: advance nc_generation by 2 and
 * then issue cpu_sfence() so the generation update is ordered before the
 * stores that follow (e.g. _cache_link_parent() calls this immediately
 * before setting nc_parent and nc_head).
 *
 * NOTE(review): how readers sample nc_generation, and why the step is 2
 * rather than 1, is not visible in this part of the file.
 */
static __inline
void
_cache_ncp_gen_enter(struct namecache *ncp)
{
        ncp->nc_generation += 2;
        cpu_sfence();
}
 364
/*
 * Counterpart of _cache_ncp_gen_enter(): fence, advance nc_generation by
 * another 2, and fence again, so the generation update is ordered after
 * the preceding stores and before any that follow.
 *
 * NOTE(review): the reader-side protocol that interprets nc_generation is
 * not visible in this part of the file.
 */
static __inline
void
_cache_ncp_gen_exit(struct namecache *ncp)
{
        cpu_sfence();
        ncp->nc_generation += 2;
        cpu_sfence();
}
 373
 374 static __inline
 375 struct mntcache_elm *
 376 _cache_mntcache_hash(void *ptr)
 377 {
 378         struct mntcache_elm *elm;
 379         int hv;
 380
 381         hv = iscsi_crc32(&ptr, sizeof(ptr)) & (MNTCACHE_COUNT - 1);
 382         elm = &pcpu_mntcache[mycpu->gd_cpuid].array[hv & ~(MNTCACHE_SET - 1)];
 383
 384         return elm;
 385 }
 386
 387 static
 388 void
 389 _cache_mntref(struct mount *mp)
 390 {
 391         struct mntcache_elm *elm;
 392         struct mount *mpr;
 393         int i;
 394
 395         elm = _cache_mntcache_hash(mp);
 396         for (i = 0; i < MNTCACHE_SET; ++i) {
 397                 if (elm->mp == mp) {
 398                         mpr = atomic_swap_ptr((void *)&elm->mp, NULL);
 399                         if (__predict_true(mpr == mp))
 400                                 return;
 401                         if (mpr)
 402                                 atomic_add_int(&mpr->mnt_refs, -1);
 403                 }
 404                 ++elm;
 405         }
 406         atomic_add_int(&mp->mnt_refs, 1);
 407 }
 408
 409 static
 410 void
 411 _cache_mntrel(struct mount *mp)
 412 {
 413         struct mntcache_elm *elm;
 414         struct mntcache_elm *best;
 415         struct mount *mpr;
 416         int delta1;
 417         int delta2;
 418         int i;
 419
 420         elm = _cache_mntcache_hash(mp);
 421         best = elm;
 422         for (i = 0; i < MNTCACHE_SET; ++i) {
 423                 if (elm->mp == NULL) {
 424                         mpr = atomic_swap_ptr((void *)&elm->mp, mp);
 425                         if (__predict_false(mpr != NULL)) {
 426                                 atomic_add_int(&mpr->mnt_refs, -1);
 427                         }
 428                         elm->ticks = ticks;
 429                         return;
 430                 }
 431                 delta1 = ticks - best->ticks;
 432                 delta2 = ticks - elm->ticks;
 433                 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
 434                         best = elm;
 435                 ++elm;
 436         }
 437         mpr = atomic_swap_ptr((void *)&best->mp, mp);
 438         best->ticks = ticks;
 439         if (mpr)
 440                 atomic_add_int(&mpr->mnt_refs, -1);
 441 }
 442
 443 /*
 444  * Clears all cached mount points on all cpus.  This routine should only
 445  * be called when we are waiting for a mount to clear, e.g. so we can
 446  * unmount.
 447  */
 448 void
 449 cache_clearmntcache(struct mount *target __unused)
 450 {
 451         int n;
 452
 453         for (n = 0; n < ncpus; ++n) {
 454                 struct mntcache *cache = &pcpu_mntcache[n];
 455                 struct mntcache_elm *elm;
 456                 struct namecache *ncp;
 457                 struct mount *mp;
 458                 int i;
 459
 460                 for (i = 0; i < MNTCACHE_COUNT; ++i) {
 461                         elm = &cache->array[i];
 462                         if (elm->mp) {
 463                                 mp = atomic_swap_ptr((void *)&elm->mp, NULL);
 464                                 if (mp)
 465                                         atomic_add_int(&mp->mnt_refs, -1);
 466                         }
 467                         if (elm->ncp) {
 468                                 ncp = atomic_swap_ptr((void *)&elm->ncp, NULL);
 469                                 if (ncp)
 470                                         _cache_drop(ncp);
 471                         }
 472                 }
 473         }
 474 }
 475
 476 /*
 477  * Namespace locking.  The caller must already hold a reference to the
 478  * namecache structure in order to lock/unlock it.  The controlling entity
 479  * in a 1->0 transition does not need to lock the ncp to dispose of it,
 480  * as nobody else will have visibility to it at that point.
 481  *
 482  * Note that holding a locked namecache structure prevents other threads
 483  * from making namespace changes (e.g. deleting or creating), prevents
 484  * vnode association state changes by other threads, and prevents the
 485  * namecache entry from being resolved or unresolved by other threads.
 486  *
 487  * An exclusive lock owner has full authority to associate/disassociate
 488  * vnodes and resolve/unresolve the locked ncp.
 489  *
 490  * A shared lock owner only has authority to acquire the underlying vnode,
 491  * if any.
 492  *
 493  * The primary lock field is nc_lockstatus.  nc_locktd is set after the
 494  * fact (when locking) or cleared prior to unlocking.
 495  *
 496  * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
 497  *           or recycled, but it does NOT help you if the vnode had already
 498  *           initiated a recyclement.  If this is important, use cache_get()
 499  *           rather then cache_lock() (and deal with the differences in the
 500  *           way the refs counter is handled).  Or, alternatively, make an
 501  *           unconditional call to cache_validate() or cache_resolve()
 502  *           after cache_lock() returns.
 503  */
 504 static __inline
 505 void
 506 _cache_lock(struct namecache *ncp)
 507 {
 508         int didwarn = 0;
 509         int error;
 510
 511         error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
 512         while (__predict_false(error == EWOULDBLOCK)) {
 513                 if (didwarn == 0) {
 514                         didwarn = ticks - nclockwarn;
 515                         kprintf("[diagnostic] cache_lock: "
 516                                 "%s blocked on %p "
 517                                 "\"%*.*s\"\n",
 518                                 curthread->td_comm, ncp,
 519                                 ncp->nc_nlen, ncp->nc_nlen,
 520                                 ncp->nc_name);
 521                 }
 522                 error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_TIMELOCK);
 523         }
 524         if (__predict_false(didwarn)) {
 525                 kprintf("[diagnostic] cache_lock: "
 526                         "%s unblocked %*.*s after %d secs\n",
 527                         curthread->td_comm,
 528                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 529                         (int)(ticks - didwarn) / hz);
 530         }
 531 }
 532
/*
 * Release a previously acquired lock on ncp.
 *
 * This is a plain lockmgr(LK_RELEASE) on nc_lock and is used to undo any
 * of the lock acquisition helpers in this section, shared or exclusive.
 *
 * NOTE(review): the previous comment here spoke of racing "bit 31" and of
 * conditionally dropping the ncp; nothing of that kind is implemented in
 * this function, so that text appeared to be stale.
 */
static __inline
void
_cache_unlock(struct namecache *ncp)
{
        lockmgr(&ncp->nc_lock, LK_RELEASE);
}
 545
 546 /*
 547  * Lock ncp exclusively, non-blocking.  Return 0 on success.
 548  */
 549 static __inline
 550 int
 551 _cache_lock_nonblock(struct namecache *ncp)
 552 {
 553         int error;
 554
 555         error = lockmgr(&ncp->nc_lock, LK_EXCLUSIVE | LK_NOWAIT);
 556         if (__predict_false(error != 0)) {
 557                 return(EWOULDBLOCK);
 558         }
 559         return 0;
 560 }
 561
 562 /*
 563  * This is a special form of _cache_lock() which only succeeds if
 564  * it can get a pristine, non-recursive lock.  The caller must have
 565  * already ref'd the ncp.
 566  *
 567  * On success the ncp will be locked, on failure it will not.  The
 568  * ref count does not change either way.
 569  *
 570  * We want _cache_lock_special() (on success) to return a definitively
 571  * usable vnode or a definitively unresolved ncp.
 572  */
 573 static __inline
 574 int
 575 _cache_lock_special(struct namecache *ncp)
 576 {
 577         if (_cache_lock_nonblock(ncp) == 0) {
 578                 if (lockmgr_oneexcl(&ncp->nc_lock)) {
 579                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
 580                                 _cache_setunresolved(ncp, 1);
 581                         return 0;
 582                 }
 583                 _cache_unlock(ncp);
 584         }
 585         return EWOULDBLOCK;
 586 }
 587
 588 /*
 589  * Shared lock, guarantees vp held
 590  *
 591  * The shared lock holds vp on the 0->1 transition.  It is possible to race
 592  * another shared lock release, preventing the other release from dropping
 593  * the vnode and clearing bit 31.
 594  *
 595  * If it is not set then we are responsible for setting it, and this
 596  * responsibility does not race with anyone else.
 597  */
 598 static __inline
 599 void
 600 _cache_lock_shared(struct namecache *ncp)
 601 {
 602         int didwarn = 0;
 603         int error;
 604
 605         error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
 606         while (__predict_false(error == EWOULDBLOCK)) {
 607                 if (didwarn == 0) {
 608                         didwarn = ticks - nclockwarn;
 609                         kprintf("[diagnostic] cache_lock_shared: "
 610                                 "%s blocked on %p "
 611                                 "\"%*.*s\"\n",
 612                                 curthread->td_comm, ncp,
 613                                 ncp->nc_nlen, ncp->nc_nlen,
 614                                 ncp->nc_name);
 615                 }
 616                 error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_TIMELOCK);
 617         }
 618         if (__predict_false(didwarn)) {
 619                 kprintf("[diagnostic] cache_lock_shared: "
 620                         "%s unblocked %*.*s after %d secs\n",
 621                         curthread->td_comm,
 622                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
 623                         (int)(ticks - didwarn) / hz);
 624         }
 625 }
 626
 627 /*
 628  * Shared lock, guarantees vp held.  Non-blocking.  Returns 0 on success
 629  */
 630 static __inline
 631 int
 632 _cache_lock_shared_nonblock(struct namecache *ncp)
 633 {
 634         int error;
 635
 636         error = lockmgr(&ncp->nc_lock, LK_SHARED | LK_NOWAIT);
 637         if (__predict_false(error != 0)) {
 638                 return(EWOULDBLOCK);
 639         }
 640         return 0;
 641 }
 642
 643 /*
 644  * This function tries to get a shared lock but will back-off to an
 645  * exclusive lock if:
 646  *
 647  * (1) Some other thread is trying to obtain an exclusive lock
 648  *     (to prevent the exclusive requester from getting livelocked out
 649  *     by many shared locks).
 650  *
 651  * (2) The current thread already owns an exclusive lock (to avoid
 652  *     deadlocking).
 653  *
 654  * WARNING! On machines with lots of cores we really want to try hard to
 655  *          get a shared lock or concurrent path lookups can chain-react
 656  *          into a very high-latency exclusive lock.
 657  *
 658  *          This is very evident in dsynth's initial scans.
 659  */
 660 static __inline
 661 int
 662 _cache_lock_shared_special(struct namecache *ncp)
 663 {
 664         /*
 665          * Only honor a successful shared lock (returning 0) if there is
 666          * no exclusive request pending and the vnode, if present, is not
 667          * in a reclaimed state.
 668          */
 669         if (_cache_lock_shared_nonblock(ncp) == 0) {
 670                 if (__predict_true(!lockmgr_exclpending(&ncp->nc_lock))) {
 671                         if (ncp->nc_vp == NULL ||
 672                             (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
 673                                 return(0);
 674                         }
 675                 }
 676                 _cache_unlock(ncp);
 677                 return(EWOULDBLOCK);
 678         }
 679
 680         /*
 681          * Non-blocking shared lock failed.  If we already own the exclusive
 682          * lock just acquire another exclusive lock (instead of deadlocking).
 683          * Otherwise acquire a shared lock.
 684          */
 685         if (lockstatus(&ncp->nc_lock, curthread) == LK_EXCLUSIVE) {
 686                 _cache_lock(ncp);
 687                 return(0);
 688         }
 689         _cache_lock_shared(ncp);
 690         return(0);
 691 }
 692
 693 /*
 694  * Returns:
 695  *      -1      Locked by other
 696  *       0      Not locked
 697  *      (v)     LK_SHARED or LK_EXCLUSIVE
 698  */
 699 static __inline
 700 int
 701 _cache_lockstatus(struct namecache *ncp)
 702 {
 703         int status;
 704
 705         status = lockstatus(&ncp->nc_lock, curthread);
 706         if (status == LK_EXCLOTHER)
 707                 status = -1;
 708         return status;
 709 }
 710
 711 /*
 712  * cache_hold() and cache_drop() prevent the premature deletion of a
 713  * namecache entry but do not prevent operations (such as zapping) on
 714  * that namecache entry.
 715  *
 716  * This routine may only be called from outside this source module if
 717  * nc_refs is already deterministically at least 1, such as being
 718  * associated with e.g. a process, file descriptor, or some other entity.
 719  *
 720  * Only the above situations, similar situations within this module where
 721  * the ref count is deterministically at least 1, or when the ncp is found
 722  * via the nchpp (hash table) lookup, can bump nc_refs.
 723  *
 724  * Very specifically, a ncp found via nc_list CANNOT bump nc_refs.  It
 725  * can still be removed from the nc_list, however, as long as the caller
 726  * can acquire its lock (in the wrong order).
 727  *
 728  * This is a rare case where callers are allowed to hold a spinlock,
 729  * so we can't ourselves.
 730  */
 731 static __inline
 732 struct namecache *
 733 _cache_hold(struct namecache *ncp)
 734 {
 735         KKASSERT(ncp->nc_refs > 0);
 736         atomic_add_int(&ncp->nc_refs, 1);
 737
 738         return(ncp);
 739 }
 740
 741 /*
 742  * Drop a cache entry.
 743  *
 744  * The 1->0 transition can only occur after or because the natural ref
 745  * is being dropped.  If another thread had a temporary ref during the
 746  * ncp's destruction, then that other thread might wind up being the
 747  * one to drop the last ref.
 748  */
 749 static __inline
 750 void
 751 _cache_drop(struct namecache *ncp)
 752 {
 753         if (atomic_fetchadd_int(&ncp->nc_refs, -1) == 1) {
 754                 KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
 755
 756                 /*
 757                  * Scrap it.
 758                  */
 759                 ncp->nc_refs = -1;      /* safety */
 760                 if (ncp->nc_name)
 761                         kfree(ncp->nc_name, M_VFSCACHEAUX);
 762                 kfree_obj(ncp, M_VFSCACHE);
 763         }
 764 }
 765
 766 /*
 767  * Link a new namecache entry to its parent and to the hash table.  Be
 768  * careful to avoid races if vhold() blocks in the future.
 769  *
 770  * Both ncp and par must be referenced and locked.  The reference is
  771  * transferred to the nchpp (and, most notably, NOT to the parent list).
 772  *
 773  * NOTE: The hash table spinlock is held across this call, we can't do
 774  *       anything fancy.
 775  */
 776 static void
 777 _cache_link_parent(struct namecache *ncp, struct namecache *par,
 778                    struct nchash_head *nchpp)
 779 {
 780         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
 781
 782         KKASSERT(ncp->nc_parent == NULL);
 783         _cache_ncp_gen_enter(ncp);
 784         ncp->nc_parent = par;
 785         ncp->nc_head = nchpp;
 786
 787         /*
 788          * Set inheritance flags.  Note that the parent flags may be
 789          * stale due to getattr potentially not having been run yet
 790          * (it gets run during nlookup()'s).
 791          */
 792         ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
 793         if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
 794                 ncp->nc_flag |= NCF_SF_PNOCACHE;
 795         if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
 796                 ncp->nc_flag |= NCF_UF_PCACHE;
 797
 798         /*
 799          * Add to hash table and parent, adjust accounting
 800          */
 801         TAILQ_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
 802         atomic_add_long(&pn->vfscache_count, 1);
 803
 804         /*
 805          * ncp is a new leaf being added to the tree
 806          */
 807         if (TAILQ_EMPTY(&ncp->nc_list)) {
 808                 atomic_add_long(&pn->vfscache_leafs, 1);
 809                 if (ncp->nc_flag & NCF_UNRESOLVED)
 810                         atomic_add_long(&pn->vfscache_unres, 1);
 811         }
 812
 813         if (TAILQ_EMPTY(&par->nc_list)) {
 814                 /*
 815                  * Parent was, but now is no longer a leaf
 816                  */
 817                 /*
 818                  * XXX for now don't mess with par's gen, it causes
 819                  * unnecessary nlookup retries (though not many)
 820                  */
 821                 /*_cache_ncp_gen_enter(par);*/
 822                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 823                 if (par->nc_parent) {
 824                         if (par->nc_flag & NCF_UNRESOLVED)
 825                                 atomic_add_long(&pn->vfscache_unres, -1);
 826                         atomic_add_long(&pn->vfscache_leafs, -1);
 827                 }
 828
 829                 /*
 830                  * Any vp associated with an ncp which has children must
 831                  * be held to prevent it from being recycled.
 832                  */
 833                 if (par->nc_vp)
 834                         vhold(par->nc_vp);
 835                 /*_cache_ncp_gen_exit(par);*/
 836         } else {
 837                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 838         }
 839         _cache_hold(par);       /* add nc_parent ref */
 840         _cache_ncp_gen_exit(ncp);
 841 }
 842
/*
 * Remove the parent and hash associations from a namecache structure.
 * Drop the ref-count on the parent.  The caller receives the ref
 * from the ncp's nchpp linkage that was removed and may forward that
 * ref to a new linkage.
 *
 * The caller usually holds an additional ref on the ncp so the unlink
 * cannot be the final drop.  XXX should not be necessary now since the
 * caller receives the ref from the nchpp linkage, assuming the ncp
 * was linked in the first place.
 *
 * ncp must be locked, which means that there won't be any nc_parent
 * removal races.
 *
 * par must be locked and will remain locked on return.
 *
 * nchpp must be spin-locked.  This routine eats the spin-lock: it is
 * released below, before the parent's ref and any vnode hold are dropped.
 *
 * NOTE(review): earlier wording said this routine acquires a temporary
 * lock on the parent and on the hash chain.  The code below acquires
 * neither; it relies on the caller for both.
 */
static __inline void
_cache_unlink_parent(struct namecache *par, struct namecache *ncp,
                     struct nchash_head *nchpp)
{
        struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
        struct vnode *dropvp;

        KKASSERT(ncp->nc_parent == par);
        cpu_ccfence();
        _cache_ncp_gen_enter(ncp);

        /* don't add a ref, we drop the nchpp ref later */

        /*
         * Remove from hash table and parent, adjust accounting
         */
        TAILQ_REMOVE(&ncp->nc_head->list, ncp, nc_hash);
        TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
        atomic_add_long(&pn->vfscache_count, -1);

        /*
         * Removing leaf from tree
         */
        if (TAILQ_EMPTY(&ncp->nc_list)) {
                if (ncp->nc_flag & NCF_UNRESOLVED)
                        atomic_add_long(&pn->vfscache_unres, -1);
                atomic_add_long(&pn->vfscache_leafs, -1);
        }

        /*
         * Parent is now a leaf?  If so, restore its leaf accounting (only
         * when the parent is itself linked) and remember its vnode so the
         * hold associated with having children can be dropped once the
         * spinlock is released.
         */
        dropvp = NULL;
        if (TAILQ_EMPTY(&par->nc_list)) {
                /*
                 * XXX for now don't mess with par's gen, it causes
                 * unnecessary nlookup retries (though not many)
                 */
                /*_cache_ncp_gen_enter(par);*/
                if (par->nc_parent) {
                        if (par->nc_flag & NCF_UNRESOLVED)
                                atomic_add_long(&pn->vfscache_unres, 1);
                        atomic_add_long(&pn->vfscache_leafs, 1);
                }
                if (par->nc_vp)
                        dropvp = par->nc_vp;
                /*_cache_ncp_gen_exit(par);*/
        }
        ncp->nc_parent = NULL;
        ncp->nc_head = NULL;
        spin_unlock(&nchpp->spin);
        _cache_drop(par);       /* drop ncp's nc_parent ref from (par) */

        /*
         * We can only safely vdrop with no spinlocks held.
         */
        if (dropvp)
                vdrop(dropvp);
        _cache_ncp_gen_exit(ncp);
}
 922
 923 /*
 924  * Allocate a new namecache structure.  Most of the code does not require
 925  * zero-termination of the string but it makes vop_compat_ncreate() easier.
 926  *
 927  * The returned ncp will be locked and referenced.  The ref is generally meant
 928  * to be transfered to the nchpp linkage.
 929  */
 930 static struct namecache *
 931 cache_alloc(int nlen)
 932 {
 933         struct namecache *ncp;
 934
 935         ncp = kmalloc_obj(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
 936         if (nlen)
 937                 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHEAUX, M_WAITOK);
 938         ncp->nc_nlen = nlen;
 939         ncp->nc_flag = NCF_UNRESOLVED;
 940         ncp->nc_error = ENOTCONN;       /* needs to be resolved */
 941         ncp->nc_refs = 1;               /* natural ref */
 942         ncp->nc_generation = 0;         /* link/unlink/res/unres op */
 943         TAILQ_INIT(&ncp->nc_list);
 944         lockinit(&ncp->nc_lock, "ncplk", hz, LK_CANRECURSE);
 945         lockmgr(&ncp->nc_lock, LK_EXCLUSIVE);
 946
 947         return(ncp);
 948 }
 949
 950 /*
 951  * Can only be called for the case where the ncp has never been
 952  * associated with anything (so no spinlocks are needed).
 953  */
 954 static void
 955 _cache_free(struct namecache *ncp)
 956 {
 957         KKASSERT(ncp->nc_refs == 1);
 958         if (ncp->nc_name)
 959                 kfree(ncp->nc_name, M_VFSCACHEAUX);
 960         kfree_obj(ncp, M_VFSCACHE);
 961 }
 962
 963 /*
 964  * [re]initialize a nchandle.
 965  */
 966 void
 967 cache_zero(struct nchandle *nch)
 968 {
 969         nch->ncp = NULL;
 970         nch->mount = NULL;
 971 }
 972
 973 /*
 974  * Ref and deref a nchandle structure (ncp + mp)
 975  *
 976  * The caller must specify a stable ncp pointer, typically meaning the
 977  * ncp is already referenced but this can also occur indirectly through
 978  * e.g. holding a lock on a direct child.
 979  *
 980  * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 981  *          use read spinlocks here.
 982  */
 983 struct nchandle *
 984 cache_hold(struct nchandle *nch)
 985 {
 986         _cache_hold(nch->ncp);
 987         _cache_mntref(nch->mount);
 988         return(nch);
 989 }
 990
 991 /*
 992  * Create a copy of a namecache handle for an already-referenced
 993  * entry.
 994  */
 995 void
 996 cache_copy(struct nchandle *nch, struct nchandle *target)
 997 {
 998         struct namecache *ncp;
 999         struct mount *mp;
1000         struct mntcache_elm *elm;
1001         struct namecache *ncpr;
1002         int i;
1003
1004         ncp = nch->ncp;
1005         mp = nch->mount;
1006         target->ncp = ncp;
1007         target->mount = mp;
1008
1009         elm = _cache_mntcache_hash(ncp);
1010         for (i = 0; i < MNTCACHE_SET; ++i) {
1011                 if (elm->ncp == ncp) {
1012                         ncpr = atomic_swap_ptr((void *)&elm->ncp, NULL);
1013                         if (ncpr == ncp) {
1014                                 _cache_mntref(mp);
1015                                 return;
1016                         }
1017                         if (ncpr)
1018                                 _cache_drop(ncpr);
1019                 }
1020                 ++elm;
1021         }
1022         if (ncp)
1023                 _cache_hold(ncp);
1024         _cache_mntref(mp);
1025 }
1026
1027 /*
1028  * Drop the nchandle, but try to cache the ref to avoid global atomic
1029  * ops.  This is typically done on the system root and jail root nchandles.
1030  */
1031 void
1032 cache_drop_and_cache(struct nchandle *nch, int elmno)
1033 {
1034         struct mntcache_elm *elm;
1035         struct mntcache_elm *best;
1036         struct namecache *ncpr;
1037         int delta1;
1038         int delta2;
1039         int i;
1040
1041         if (elmno > 4) {
1042                 if (nch->ncp) {
1043                         _cache_drop(nch->ncp);
1044                         nch->ncp = NULL;
1045                 }
1046                 if (nch->mount) {
1047                         _cache_mntrel(nch->mount);
1048                         nch->mount = NULL;
1049                 }
1050                 return;
1051         }
1052
1053         elm = _cache_mntcache_hash(nch->ncp);
1054         best = elm;
1055         for (i = 0; i < MNTCACHE_SET; ++i) {
1056                 if (elm->ncp == NULL) {
1057                         ncpr = atomic_swap_ptr((void *)&elm->ncp, nch->ncp);
1058                         _cache_mntrel(nch->mount);
1059                         elm->ticks = ticks;
1060                         nch->mount = NULL;
1061                         nch->ncp = NULL;
1062                         if (ncpr)
1063                                 _cache_drop(ncpr);
1064                         return;
1065                 }
1066                 delta1 = ticks - best->ticks;
1067                 delta2 = ticks - elm->ticks;
1068                 if (delta2 > delta1 || delta1 < -1 || delta2 < -1)
1069                         best = elm;
1070                 ++elm;
1071         }
1072         ncpr = atomic_swap_ptr((void *)&best->ncp, nch->ncp);
1073         _cache_mntrel(nch->mount);
1074         best->ticks = ticks;
1075         nch->mount = NULL;
1076         nch->ncp = NULL;
1077         if (ncpr)
1078                 _cache_drop(ncpr);
1079 }
1080
1081 void
1082 cache_changemount(struct nchandle *nch, struct mount *mp)
1083 {
1084         _cache_mntref(mp);
1085         _cache_mntrel(nch->mount);
1086         nch->mount = mp;
1087 }
1088
1089 void
1090 cache_drop(struct nchandle *nch)
1091 {
1092         _cache_mntrel(nch->mount);
1093         _cache_drop(nch->ncp);
1094         nch->ncp = NULL;
1095         nch->mount = NULL;
1096 }
1097
1098 /*
1099  * Returns:
1100  *      -1      Locked by other
1101  *       0      Not locked
1102  *      (v)     LK_SHARED or LK_EXCLUSIVE
1103  */
1104 int
1105 cache_lockstatus(struct nchandle *nch)
1106 {
1107         return(_cache_lockstatus(nch->ncp));
1108 }
1109
1110 void
1111 cache_lock(struct nchandle *nch)
1112 {
1113         _cache_lock(nch->ncp);
1114 }
1115
1116 /*
1117  * Returns a shared or exclusive-locked ncp.  The ncp will only be
1118  * shared-locked if it is already resolved.
1119  */
1120 void
1121 cache_lock_maybe_shared(struct nchandle *nch, int excl)
1122 {
1123         struct namecache *ncp = nch->ncp;
1124
1125         if (ncp_shared_lock_disable || excl ||
1126             (ncp->nc_flag & NCF_UNRESOLVED)) {
1127                 _cache_lock(ncp);
1128         } else {
1129                 _cache_lock_shared(ncp);
1130                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1131                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1132                                 _cache_unlock(ncp);
1133                                 _cache_lock(ncp);
1134                         }
1135                 } else {
1136                         _cache_unlock(ncp);
1137                         _cache_lock(ncp);
1138                 }
1139         }
1140 }
1141
/*
 * Lock fncpd, fncp, tncpd, and tncp.  tncp is already locked but may
 * have to be cycled to avoid deadlocks.  Make sure all four are resolved.
 *
 * fcred is used when resolving fncpd and fncp, tcred when resolving
 * tncpd and tncp.
 *
 * The caller is responsible for checking the validity upon return as
 * the records may have been flagged DESTROYED in the interim.  Entries
 * already flagged NCF_DESTROYED are not passed to cache_resolve().
 *
 * Namecache lock ordering is leaf first, then parent.  However, complex
 * interactions may occur between the source and target because there is
 * no ordering guarantee between (fncpd, fncp) and (tncpd and tncp).
 *
 * Deadlocks are avoided by only ever blocking on one lock at a time:
 * every lock after the first is attempted non-blocking, and on failure
 * everything acquired so far is released, the contended lock is cycled
 * (blocking lock followed by unlock), and the sequence restarts.
 */
void
cache_lock4_tondlocked(struct nchandle *fncpd, struct nchandle *fncp,
                       struct nchandle *tncpd, struct nchandle *tncp,
                       struct ucred *fcred, struct ucred *tcred)
{
        int tlocked = 1;        /* tncp arrives locked on the first pass */
        u_int dummy_gen = 0;    /* generation output of cache_resolve(), unused */

        /*
         * Lock tncp and tncpd
         *
         * NOTE: Because these ncps are not locked to begin with, it is
         *       possible for other rename races to cause the normal lock
         *       order assumptions to fail.
         *
         * NOTE: Lock ordering assumptions are valid if a leaf's parent
         *       matches after the leaf has been locked.  However, ordering
         *       between the 'from' and the 'to' is not and an overlapping
         *       lock order reversal is still possible.
         */
again:
        /* tncp only needs to be re-locked after a retry released it */
        if (__predict_false(tlocked == 0)) {
                cache_lock(tncp);
        }
        if (__predict_false(cache_lock_nonblock(tncpd) != 0)) {
                cache_unlock(tncp);
                cache_lock(tncpd);      /* cycle tncpd lock */
                cache_unlock(tncpd);
                tlocked = 0;
                goto again;
        }

        /*
         * Lock fncp and fncpd
         *
         * NOTE: Because these ncps are not locked to begin with, it is
         *       possible for other rename races to cause the normal lock
         *       order assumptions to fail.
         *
         * NOTE: Lock ordering assumptions are valid if a leaf's parent
         *       matches after the leaf has been locked.  However, ordering
         *       between the 'from' and the 'to' is not and an overlapping
         *       lock order reversal is still possible.
         */
        if (__predict_false(cache_lock_nonblock(fncp) != 0)) {
                cache_unlock(tncpd);
                cache_unlock(tncp);
                cache_lock(fncp);       /* cycle fncp lock */
                cache_unlock(fncp);
                tlocked = 0;
                goto again;
        }

        if (__predict_false(cache_lock_nonblock(fncpd) != 0)) {
                cache_unlock(fncp);
                cache_unlock(tncpd);
                cache_unlock(tncp);
                cache_lock(fncpd);
                cache_unlock(fncpd);    /* cycle fncpd lock */
                tlocked = 0;
                goto again;
        }

        /*
         * All four are locked.  Resolve each one that has not been
         * flagged destroyed, directories first.
         */
        if (__predict_true((fncpd->ncp->nc_flag & NCF_DESTROYED) == 0))
                cache_resolve(fncpd, &dummy_gen, fcred);
        if (__predict_true((tncpd->ncp->nc_flag & NCF_DESTROYED) == 0))
                cache_resolve(tncpd, &dummy_gen, tcred);
        if (__predict_true((fncp->ncp->nc_flag & NCF_DESTROYED) == 0))
                cache_resolve(fncp, &dummy_gen, fcred);
        if (__predict_true((tncp->ncp->nc_flag & NCF_DESTROYED) == 0))
                cache_resolve(tncp, &dummy_gen, tcred);
}
1225
1226 int
1227 cache_lock_nonblock(struct nchandle *nch)
1228 {
1229         return(_cache_lock_nonblock(nch->ncp));
1230 }
1231
1232 void
1233 cache_unlock(struct nchandle *nch)
1234 {
1235         _cache_unlock(nch->ncp);
1236 }
1237
1238 /*
1239  * ref-and-lock, unlock-and-deref functions.
1240  *
1241  * This function is primarily used by nlookup.  Even though cache_lock
1242  * holds the vnode, it is possible that the vnode may have already
1243  * initiated a recyclement.
1244  *
1245  * We want cache_get() to return a definitively usable vnode or a
1246  * definitively unresolved ncp.
1247  */
1248 static
1249 struct namecache *
1250 _cache_get(struct namecache *ncp)
1251 {
1252         _cache_hold(ncp);
1253         _cache_lock(ncp);
1254         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1255                 _cache_setunresolved(ncp, 1);
1256         return(ncp);
1257 }
1258
1259 /*
1260  * Attempt to obtain a shared lock on the ncp.  A shared lock will only
1261  * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
1262  * valid.  Otherwise an exclusive lock will be acquired instead.
1263  */
1264 static
1265 struct namecache *
1266 _cache_get_maybe_shared(struct namecache *ncp, int excl)
1267 {
1268         if (ncp_shared_lock_disable || excl ||
1269             (ncp->nc_flag & NCF_UNRESOLVED))
1270         {
1271                 return(_cache_get(ncp));
1272         }
1273         _cache_hold(ncp);
1274         _cache_lock_shared(ncp);
1275         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1276                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1277                         _cache_unlock(ncp);
1278                         ncp = _cache_get(ncp);
1279                         _cache_drop(ncp);
1280                 }
1281         } else {
1282                 _cache_unlock(ncp);
1283                 ncp = _cache_get(ncp);
1284                 _cache_drop(ncp);
1285         }
1286         return(ncp);
1287 }
1288
1289 /*
1290  * NOTE: The same nchandle can be passed for both arguments.
1291  */
1292 void
1293 cache_get(struct nchandle *nch, struct nchandle *target)
1294 {
1295         KKASSERT(nch->ncp->nc_refs > 0);
1296         target->mount = nch->mount;
1297         target->ncp = _cache_get(nch->ncp);
1298         _cache_mntref(target->mount);
1299 }
1300
1301 void
1302 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1303 {
1304         KKASSERT(nch->ncp->nc_refs > 0);
1305         target->mount = nch->mount;
1306         target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
1307         _cache_mntref(target->mount);
1308 }
1309
/*
 * Undo a _cache_get(): unlock a held and locked ncp, then drop the
 * reference.
 */
static __inline
void
_cache_put(struct namecache *ncp)
{
        /* unlock before the drop */
        _cache_unlock(ncp);
        _cache_drop(ncp);
}
1320
1321 void
1322 cache_put(struct nchandle *nch)
1323 {
1324         _cache_mntrel(nch->mount);
1325         _cache_put(nch->ncp);
1326         nch->ncp = NULL;
1327         nch->mount = NULL;
1328 }
1329
/*
 * Resolve an unresolved ncp by associating a vnode with it.  If the
 * vnode is NULL, a negative cache entry is created.
 *
 * mp     - mount associated with the lookup, may be NULL.  Used for the
 *          "null" filesystem v_pfsmp hack (positive entries) and for
 *          VFS_NCPGEN_SET() (negative entries).
 * ncp    - entry to resolve.  Must be flagged NCF_UNRESOLVED, have no
 *          vnode attached, and be exclusively locked (all asserted).
 * vp     - vnode to attach, or NULL to create a negative entry.
 * adjgen - when non-zero the update is bracketed by
 *          _cache_ncp_gen_enter()/_cache_ncp_gen_exit().
 *
 * In both cases the ncp gains one ref on behalf of the list it is placed
 * on (vp->v_namecache or the per-cpu neg_list).
 *
 * The ncp should be locked on entry and will remain locked on return.
 */
static
void
_cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp,
             int adjgen)
{
        struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];

        KKASSERT((ncp->nc_flag & NCF_UNRESOLVED) &&
                 (_cache_lockstatus(ncp) == LK_EXCLUSIVE) &&
                 ncp->nc_vp == NULL);

        if (adjgen)
                _cache_ncp_gen_enter(ncp);

        if (vp) {
                /*
                 * Any vp associated with an ncp which has children must
                 * be held.  Any vp associated with a locked ncp must be held.
                 *
                 * This results in two holds when the ncp already has
                 * children: one here and the nc_vp hold further below.
                 */
                if (!TAILQ_EMPTY(&ncp->nc_list))
                        vhold(vp);
                spin_lock(&vp->v_spin);
                ncp->nc_vp = vp;
                TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
                ++vp->v_namecache_count;
                _cache_hold(ncp);               /* v_namecache assoc */
                spin_unlock(&vp->v_spin);
                vhold(vp);                      /* nc_vp */

                /*
                 * Set auxiliary flags
                 */
                switch(vp->v_type) {
                case VDIR:
                        ncp->nc_flag |= NCF_ISDIR;
                        break;
                case VLNK:
                        ncp->nc_flag |= NCF_ISSYMLINK;
                        /* XXX cache the contents of the symlink */
                        break;
                default:
                        break;
                }

                ncp->nc_error = 0;

                /*
                 * XXX: this is a hack to work-around the lack of a real pfs vfs
                 * implementation
                 *
                 * The length of 5 includes the terminating NUL, so only a
                 * filesystem type name of exactly "null" matches.
                 */
                if (mp) {
                        if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
                                vp->v_pfsmp = mp;
                }
        } else {
                /*
                 * When creating a negative cache hit we set the
                 * namecache_gen.  A later resolve will clean out the
                 * negative cache hit if the mount point's namecache_gen
                 * has changed.  Used by devfs, could also be used by
                 * other remote FSs.
                 *
                 * The cpu whose neg_list receives the entry is recorded
                 * in nc_negcpu.
                 */
                ncp->nc_vp = NULL;
                ncp->nc_negcpu = mycpu->gd_cpuid;
                spin_lock(&pn->neg_spin);
                TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
                _cache_hold(ncp);       /* neg_list assoc */
                ++pn->neg_count;
                spin_unlock(&pn->neg_spin);
                atomic_add_long(&pn->vfscache_negs, 1);

                ncp->nc_error = ENOENT;
                if (mp)
                        VFS_NCPGEN_SET(mp, ncp);
        }

        /*
         * Previously unresolved leaf is now resolved.
         *
         * Clear the NCF_UNRESOLVED flag last (see cache_nlookup_nonlocked()).
         * We only adjust vfscache_unres for ncp's that are in the tree.
         */
        if (TAILQ_EMPTY(&ncp->nc_list) && ncp->nc_parent)
                atomic_add_long(&pn->vfscache_unres, -1);
        ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
        if (adjgen)
                _cache_ncp_gen_exit(ncp);
}
1424
1425 void
1426 cache_setvp(struct nchandle *nch, struct vnode *vp)
1427 {
1428         _cache_setvp(nch->mount, nch->ncp, vp, 1);
1429 }
1430
1431 /*
1432  * Used for NFS
1433  */
1434 void
1435 cache_settimeout(struct nchandle *nch, int nticks)
1436 {
1437         struct namecache *ncp = nch->ncp;
1438
1439         if ((ncp->nc_timeout = ticks + nticks) == 0)
1440                 ncp->nc_timeout = 1;
1441 }
1442
/*
 * Disassociate the vnode or negative-cache association and mark a
 * namecache entry as unresolved again.  Note that the ncp is still
 * left in the hash table and still linked to its parent.
 *
 * Nothing is done if the ncp is already flagged NCF_UNRESOLVED.
 *
 * adjgen - when non-zero the update is bracketed by
 *          _cache_ncp_gen_enter()/_cache_ncp_gen_exit().
 *
 * The ncp should be locked and refd on entry and will remain locked and refd
 * on return.  The ref dropped at the end is the one that belonged to the
 * v_namecache or neg_list association, not the caller's.
 *
 * This routine is normally never called on a directory containing children.
 * However, NFS often does just that in its rename() code as a cop-out to
 * avoid complex namespace operations.  This disconnects a directory vnode
 * from its namecache and can cause the OLDAPI and NEWAPI to get out of
 * sync.
 */
static
void
_cache_setunresolved(struct namecache *ncp, int adjgen)
{
        struct vnode *vp;

        if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
                struct pcpu_ncache *pn;

                if (adjgen)
                        _cache_ncp_gen_enter(ncp);

                /*
                 * Is a resolved or destroyed leaf now becoming unresolved?
                 * Only adjust vfscache_unres for linked ncp's.
                 */
                if (TAILQ_EMPTY(&ncp->nc_list) && ncp->nc_parent) {
                        pn = &pcpu_ncache[mycpu->gd_cpuid];
                        atomic_add_long(&pn->vfscache_unres, 1);
                }

                ncp->nc_flag |= NCF_UNRESOLVED;
                ncp->nc_timeout = 0;
                ncp->nc_error = ENOTCONN;
                if ((vp = ncp->nc_vp) != NULL) {
                        /*
                         * Positive entry, unhook it from the vnode's
                         * namecache list.
                         */
                        spin_lock(&vp->v_spin);
                        ncp->nc_vp = NULL;
                        TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
                        --vp->v_namecache_count;
                        spin_unlock(&vp->v_spin);

                        /*
                         * Any vp associated with an ncp with children is
                         * held by that ncp.  Any vp associated with an ncp
                         * is held by that ncp.  These conditions must be
                         * undone when the vp is cleared out from the ncp.
                         */
                        if (!TAILQ_EMPTY(&ncp->nc_list))
                                vdrop(vp);
                        vdrop(vp);
                } else {
                        /*
                         * Negative entry, remove it from the neg_list of
                         * the cpu recorded in nc_negcpu (not necessarily
                         * the current cpu).
                         */
                        pn = &pcpu_ncache[ncp->nc_negcpu];

                        atomic_add_long(&pn->vfscache_negs, -1);
                        spin_lock(&pn->neg_spin);
                        TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
                        --pn->neg_count;
                        spin_unlock(&pn->neg_spin);
                }
                ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);

                if (adjgen)
                        _cache_ncp_gen_exit(ncp);
                _cache_drop(ncp);       /* from v_namecache or neg_list */
        }
}
1514
1515 /*
1516  * The cache_nresolve() code calls this function to automatically
1517  * set a resolved cache element to unresolved if it has timed out
1518  * or if it is a negative cache hit and the mount point namecache_gen
1519  * has changed.
1520  */
1521 static __inline int
1522 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
1523 {
1524         /*
1525          * Try to zap entries that have timed out.  We have
1526          * to be careful here because locked leafs may depend
1527          * on the vnode remaining intact in a parent, so only
1528          * do this under very specific conditions.
1529          */
1530         if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
1531             TAILQ_EMPTY(&ncp->nc_list)) {
1532                 return 1;
1533         }
1534
1535         /*
1536          * If a resolved negative cache hit is invalid due to
1537          * the mount's namecache generation being bumped, zap it.
1538          */
1539         if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
1540                 return 1;
1541         }
1542
1543         /*
1544          * Otherwise we are good
1545          */
1546         return 0;
1547 }
1548
1549 static __inline void
1550 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1551 {
1552         /*
1553          * Already in an unresolved state, nothing to do.
1554          */
1555         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1556                 if (_cache_auto_unresolve_test(mp, ncp))
1557                         _cache_setunresolved(ncp, 1);
1558         }
1559 }
1560
1561 void
1562 cache_setunresolved(struct nchandle *nch)
1563 {
1564         _cache_setunresolved(nch->ncp, 1);
1565 }
1566
1567 /*
1568  * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1569  * looking for matches.  This flag tells the lookup code when it must
1570  * check for a mount linkage and also prevents the directories in question
1571  * from being deleted or renamed.
1572  */
1573 static
1574 int
1575 cache_clrmountpt_callback(struct mount *mp, void *data)
1576 {
1577         struct nchandle *nch = data;
1578
1579         if (mp->mnt_ncmounton.ncp == nch->ncp)
1580                 return(1);
1581         if (mp->mnt_ncmountpt.ncp == nch->ncp)
1582                 return(1);
1583         return(0);
1584 }
1585
1586 /*
1587  * Clear NCF_ISMOUNTPT on nch->ncp if it is no longer associated
1588  * with a mount point.
1589  */
1590 void
1591 cache_clrmountpt(struct nchandle *nch)
1592 {
1593         int count;
1594
1595         count = mountlist_scan(cache_clrmountpt_callback, nch,
1596                                MNTSCAN_FORWARD | MNTSCAN_NOBUSY |
1597                                MNTSCAN_NOUNLOCK);
1598         if (count == 0)
1599                 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1600 }
1601
1602 /*
1603  * Invalidate portions of the namecache topology given a starting entry.
1604  * The passed ncp is set to an unresolved state and:
1605  *
1606  * The passed ncp must be referenced and locked.  The routine may unlock
1607  * and relock ncp several times, and will recheck the children and loop
1608  * to catch races.  When done the passed ncp will be returned with the
1609  * reference and lock intact.
1610  *
1611  * CINV_DESTROY         - Set a flag in the passed ncp entry indicating
1612  *                        that the physical underlying nodes have been
1613  *                        destroyed... as in deleted.  For example, when
1614  *                        a directory is removed.  This will cause record
1615  *                        lookups on the name to no longer be able to find
1616  *                        the record and tells the resolver to return failure
 *                        rather than trying to resolve through the parent.
1618  *
1619  *                        The topology itself, including ncp->nc_name,
1620  *                        remains intact.
1621  *
 *                        This only applies to the passed ncp; even if
 *                        CINV_CHILDREN is specified the children are not
 *                        flagged.
1624  *
1625  * CINV_CHILDREN        - Set all children (recursively) to an unresolved
1626  *                        state as well.
1627  *
1628  *                        Note that this will also have the side effect of
1629  *                        cleaning out any unreferenced nodes in the topology
1630  *                        from the leaves up as the recursion backs out.
1631  *
1632  * Note that the topology for any referenced nodes remains intact, but
1633  * the nodes will be marked as having been destroyed and will be set
1634  * to an unresolved state.
1635  *
1636  * It is possible for cache_inval() to race a cache_resolve(), meaning that
1637  * the namecache entry may not actually be invalidated on return if it was
 * revalidated while recursing down into its children.  This code guarantees
 * that the node(s) will go through an invalidation cycle, but does not
 * guarantee that they will remain in an invalidated state.
1641  *
1642  * Returns non-zero if a revalidation was detected during the invalidation
1643  * recursion, zero otherwise.  Note that since only the original ncp is
1644  * locked the revalidation ultimately can only indicate that the original ncp
1645  * *MIGHT* now have been reresolved.
1646  *
1647  * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1648  * have to avoid blowing out the kernel stack.  We do this by saving the
1649  * deep namecache node and aborting the recursion, then re-recursing at that
1650  * node using a depth-first algorithm in order to allow multiple deep
1651  * recursions to chain through each other, then we restart the invalidation
1652  * from scratch.
1653  */
1654 /* State threaded through one _cache_inval() call to bound recursion. */
1655 struct cinvtrack {
1656         struct namecache *resume_ncp;   /* held node to resume at, or NULL */
1657         int depth;                      /* live nesting of child scans */
1658 };
1659 /* Forward declaration of the recursive worker used by _cache_inval(). */
1660 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
1661
1662 static
1663 int
1664 _cache_inval(struct namecache *ncp, int flags)
1665 {
1666         struct cinvtrack track;
1667         struct namecache *ncp2;
1668         int r;
1669         /* start with no recursion in progress and nothing deferred */
1670         track.depth = 0;
1671         track.resume_ncp = NULL;
1672         /* rerun the whole pass until it finishes with nothing deferred */
1673         for (;;) {
1674                 r = _cache_inval_internal(ncp, flags, &track);
1675                 if (track.resume_ncp == NULL)   /* pass ran to completion */
1676                         break;
1677                 _cache_unlock(ncp);     /* unlocked while draining deferrals */
1678                 while ((ncp2 = track.resume_ncp) != NULL) {
1679                         track.resume_ncp = NULL; /* callee may set it again */
1680                         _cache_lock(ncp2);
1681                         _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
1682                                              &track);
1683                         /*_cache_put(ncp2);*/
1684                         cache_zap(ncp2);
1685                 }
1686                 _cache_lock(ncp);       /* relock before restarting the pass */
1687         }
1688         return(r);      /* result of the last (completed) top-level pass */
1689 }
1690 /* Non-static wrapper: run _cache_inval() on the ncp referenced by nch. */
1691 int
1692 cache_inval(struct nchandle *nch, int flags)
1693 {
1694         return(_cache_inval(nch->ncp, flags));
1695 }
1696
1697 /*
1698  * Recursive worker for _cache_inval().  The passed ncp is refd and locked
1699  * (exclusively) and remains that way on return, but may be unlocked and
1700  * relocked multiple times.  Returns the number of races/aborts counted.
1701  */
1702 static int
1703 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
1704 {
1705         struct namecache *nextkid;
1706         int rcnt = 0;
1707
1708         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1709         /* unresolve this node first; DESTROY is applied to this node only */
1710         _cache_ncp_gen_enter(ncp);
1711         _cache_setunresolved(ncp, 0);
1712         if (flags & CINV_DESTROY) {
1713                 ncp->nc_flag |= NCF_DESTROYED;
1714                 cpu_sfence();
1715         }
1716         /* scan the children; the outer loop repeats only after a restart */
1717         while ((flags & CINV_CHILDREN) &&
1718                (nextkid = TAILQ_FIRST(&ncp->nc_list)) != NULL
1719         ) {
1720                 struct namecache *kid;
1721                 int restart;
1722                 /* hold the first kid; flag an abort if nested too deeply */
1723                 restart = 0;
1724                 _cache_hold(nextkid);
1725                 if (++track->depth > MAX_RECURSION_DEPTH) {
1726                         track->resume_ncp = ncp;
1727                         _cache_hold(ncp);       /* ref kept for resume_ncp */
1728                         ++rcnt;
1729                 }
1730                 while ((kid = nextkid) != NULL) {
1731                         /*
1732                          * Parent (ncp) must be locked for the iteration.
1733                          */
1734                         nextkid = NULL;
1735                         if (kid->nc_parent != ncp) {
1736                                 _cache_drop(kid);
1737                                 kprintf("cache_inval_internal restartA %s\n",
1738                                         ncp->nc_name);
1739                                 restart = 1;
1740                                 break;
1741                         }
1742                         if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
1743                                 _cache_hold(nextkid);
1744
1745                         /*
1746                          * Parent unlocked for this section to avoid
1747                          * deadlocks.  Then lock the kid and check for
1748                          * races.
1749                          */
1750                         _cache_unlock(ncp);
1751                         if (track->resume_ncp) { /* abort pending: unwind */
1752                                 _cache_drop(kid);
1753                                 _cache_lock(ncp);
1754                                 break;
1755                         }
1756                         _cache_lock(kid);
1757                         if (kid->nc_parent != ncp) {
1758                                 kprintf("cache_inval_internal "
1759                                         "restartB %s\n",
1760                                         ncp->nc_name);
1761                                 restart = 1;
1762                                 _cache_unlock(kid);
1763                                 _cache_drop(kid);
1764                                 _cache_lock(ncp);
1765                                 break;
1766                         }
1767                         if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
1768                             TAILQ_FIRST(&kid->nc_list)
1769                         ) {
1770                                 /* resolved or has children: recurse, no DESTROY */
1771                                 rcnt += _cache_inval_internal(kid,
1772                                                 flags & ~CINV_DESTROY, track);
1773                                 /*_cache_unlock(kid);*/
1774                                 /*_cache_drop(kid);*/
1775                                 cache_zap(kid);
1776                         } else {
1777                                 cache_zap(kid); /* already unresolved leaf */
1778                         }
1779
1780                         /*
1781                          * Relock parent to continue scan
1782                          */
1783                         _cache_lock(ncp);
1784                 }
1785                 if (nextkid)            /* left held by an early break */
1786                         _cache_drop(nextkid);
1787                 --track->depth;
1788                 if (restart == 0)
1789                         break;
1790         }
1791
1792         /*
1793          * Someone could have gotten in there while ncp was unlocked,
1794          * count it if so (the caller then sees a non-zero return).
1795          */
1796         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
1797                 ++rcnt;
1798         _cache_ncp_gen_exit(ncp);
1799
1800         return (rcnt);
1801 }
1802
1803 /*
1804  * Invalidate a vnode's namecache associations.  To avoid races against
1805  * the resolver we do not invalidate a node which we previously invalidated
1806  * but which was then re-resolved while we were in the invalidation loop.
1807  *
1808  * Returns non-zero if any namecache entries remain after the invalidation
1809  * loop completed.
1810  *
1811  * NOTE: Unlike the namecache topology which guarantees that ncp's will not
1812  *       be ripped out of the topology while held, the vnode's v_namecache
1813  *       list has no such restriction.  NCP's can be ripped out of the list
1814  *       at virtually any time if not locked, even if held.
1815  *
1816  *       In addition, the v_namecache list itself must be locked via
1817  *       the vnode's spinlock.
1818  */
1819 int
1820 cache_inval_vp(struct vnode *vp, int flags)
1821 {
1822         struct namecache *ncp;
1823         struct namecache *next;
1824         /* every detected race rescans the list from its head */
1825 restart:
1826         spin_lock(&vp->v_spin);
1827         ncp = TAILQ_FIRST(&vp->v_namecache);
1828         if (ncp)
1829                 _cache_hold(ncp);
1830         while (ncp) {
1831                 /* loop entered with ncp held and vp spin-locked */
1832                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1833                         _cache_hold(next);      /* pin the successor too */
1834                 spin_unlock(&vp->v_spin);
1835                 _cache_lock(ncp);
1836                 if (ncp->nc_vp != vp) { /* ncp left vp before we locked it */
1837                         kprintf("Warning: cache_inval_vp: race-A detected on "
1838                                 "%s\n", ncp->nc_name);
1839                         _cache_put(ncp);
1840                         if (next)
1841                                 _cache_drop(next);
1842                         goto restart;
1843                 }
1844                 _cache_inval(ncp, flags);
1845                 _cache_put(ncp);                /* also releases reference */
1846                 ncp = next;
1847                 spin_lock(&vp->v_spin);
1848                 if (ncp && ncp->nc_vp != vp) {  /* successor left vp */
1849                         spin_unlock(&vp->v_spin);
1850                         kprintf("Warning: cache_inval_vp: race-B detected on "
1851                                 "%s\n", ncp->nc_name);
1852                         _cache_drop(ncp);
1853                         goto restart;
1854                 }
1855         }
1856         spin_unlock(&vp->v_spin);
1857         return(TAILQ_FIRST(&vp->v_namecache) != NULL); /* read w/o v_spin */
1858 }
1859
1860 /*
1861  * This routine is used instead of the normal cache_inval_vp() when we
1862  * are trying to recycle otherwise good vnodes.
1863  *
1864  * Return 0 on success, non-zero if not all namecache records could be
1865  * disassociated from the vnode (for various reasons).
1866  */
1867 int
1868 cache_inval_vp_nonblock(struct vnode *vp)
1869 {
1870         struct namecache *ncp;
1871         struct namecache *next;
1872         /* unlike cache_inval_vp(), any busy lock or race ends the scan */
1873         spin_lock(&vp->v_spin);
1874
1875         ncp = TAILQ_FIRST(&vp->v_namecache);
1876         if (ncp)
1877                 _cache_hold(ncp);
1878
1879         while (ncp) {
1880                 /* loop entered with ncp held */
1881                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1882                         _cache_hold(next);
1883                 spin_unlock(&vp->v_spin);
1884                 if (_cache_lock_nonblock(ncp)) { /* non-zero: lock not taken */
1885                         _cache_drop(ncp);
1886                         if (next)
1887                                 _cache_drop(next);
1888                         goto done;
1889                 }
1890                 if (ncp->nc_vp != vp) { /* ncp left vp before we locked it */
1891                         kprintf("Warning: cache_inval_vp_nonblock: race-A "
1892                                 "detected on %s\n", ncp->nc_name);
1893                         _cache_put(ncp);
1894                         if (next)
1895                                 _cache_drop(next);
1896                         goto done;
1897                 }
1898                 _cache_inval(ncp, 0);
1899                 _cache_put(ncp);                /* also releases reference */
1900                 ncp = next;
1901                 spin_lock(&vp->v_spin);
1902                 if (ncp && ncp->nc_vp != vp) {  /* successor left vp */
1903                         spin_unlock(&vp->v_spin);
1904                         kprintf("Warning: cache_inval_vp_nonblock: race-B "
1905                                 "detected on %s\n", ncp->nc_name);
1906                         _cache_drop(ncp);
1907                         goto done;
1908                 }
1909         }
1910         spin_unlock(&vp->v_spin);
1911 done:
1912         return(TAILQ_FIRST(&vp->v_namecache) != NULL); /* read w/o v_spin */
1913 }
1914
1915 /*
1916  * Attempt to quickly invalidate the vnode's namecache entry.  This function
1917  * will also dive the ncp and free its children but only if they are trivial.
1918  * All locks are non-blocking and the function will fail if required locks
1919  * cannot be obtained.
1920  *
1921  * We want this sort of function to be able to guarantee progress when vnlru
1922  * wants to recycle a vnode.  Directories could otherwise get stuck and not
1923  * be able to recycle due to destroyed or unresolved children in the
1924  * namecache.
1925  */
1926 void
1927 cache_inval_vp_quick(struct vnode *vp)
1928 {
1929         struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];
1930         struct namecache *ncp;
1931         struct namecache *kid;
1932         /* each pass works on the current head of vp->v_namecache */
1933         spin_lock(&vp->v_spin);
1934         while ((ncp = TAILQ_FIRST(&vp->v_namecache)) != NULL) {
1935                 _cache_hold(ncp);
1936                 spin_unlock(&vp->v_spin);
1937                 if (_cache_lock_nonblock(ncp)) {
1938                         _cache_drop(ncp);
1939                         return;         /* ncp lock not obtained: give up */
1940                 }
1941
1942                 /*
1943                  * Try to trivially destroy any children.
1944                  */
1945                 while ((kid = TAILQ_FIRST(&ncp->nc_list)) != NULL) {
1946                         struct nchash_head *nchpp;
1947
1948                         /*
1949                          * Early test without the lock.  Give-up if the
1950                          * child has children of its own, the child is
1951                          * positively-resolved, or the ref-count is
1952                          * unexpected.
1953                          */
1954                         if (TAILQ_FIRST(&kid->nc_list) ||
1955                             kid->nc_vp ||
1956                             kid->nc_refs != ncpbaserefs(kid))
1957                         {
1958                                 _cache_put(ncp);
1959                                 return;
1960                         }
1961
1962                         _cache_hold(kid);
1963                         if (_cache_lock_nonblock(kid)) {
1964                                 _cache_drop(kid);
1965                                 _cache_put(ncp);
1966                                 return; /* kid lock not obtained: give up */
1967                         }
1968
1969                         /*
1970                          * A destruction/free test requires the parent,
1971                          * the kid, and the hash table to be locked.  Note
1972                          * that the kid may still be on the negative cache
1973                          * list.
1974                          */
1975                         nchpp = kid->nc_head;
1976                         spin_lock(&nchpp->spin);
1977
1978                         /*
1979                          * Give up if the child isn't trivial.  It can be
1980                          * resolved or unresolved but must not have a vp.
1981                          */
1982                         if (kid->nc_parent != ncp ||
1983                             kid->nc_vp ||
1984                             TAILQ_FIRST(&kid->nc_list) ||
1985                             kid->nc_refs != 1 + ncpbaserefs(kid)) /* +1: ours */
1986                         {
1987                                 spin_unlock(&nchpp->spin);
1988                                 _cache_put(kid);
1989                                 _cache_put(ncp);
1990                                 return;
1991                         }
1992
1993                         ++pn->inv_kid_quick_count;
1994
1995                         /*
1996                          * We can safely destroy the kid.  It may still
1997                          * have extra refs due to ncneglist races, but since
1998                          * we checked above with the lock held those races
1999                          * will self-resolve.
2000                          *
2001                          * With these actions the kid should nominally
2002                          * have just its natural ref plus our ref.
2003                          *
2004                          * This is only safe because we hold locks on
2005                          * the parent, the kid, and the nchpp.  The only
2006                          * lock we don't have is on the ncneglist and that
2007                          * can race a ref, but as long as we unresolve the
2008                          * kid before executing our final drop the ncneglist
2009                          * code path(s) will just drop their own ref so all
2010                          * is good.
2011                          */
2012                         _cache_unlink_parent(ncp, kid, nchpp);
2013                         _cache_setunresolved(kid, 1);
2014                         if (kid->nc_refs != 2) {  /* diagnostic only */
2015                                 kprintf("Warning: kid %p unexpected refs=%d "
2016                                         "%08x %s\n",
2017                                         kid, kid->nc_refs,
2018                                         kid->nc_flag, kid->nc_name);
2019                         }
2020                         _cache_put(kid);    /* drop our ref and lock */
2021                         _cache_drop(kid);   /* drop natural ref to destroy */
2022                 }
2023
2024                 /*
2025                  * Now check ncp itself against our expectations.  With
2026                  * no children left we have our ref plus whether it is
2027                  * resolved or not (which it has to be, actually, since it
2028                  * is hanging off the vp->v_namecache).
2029                  */
2030                 if (ncp->nc_refs != 1 + ncpbaserefs(ncp)) {
2031                         _cache_put(ncp);
2032                         spin_lock(&vp->v_spin); /* paired with unlock below */
2033                         break;
2034                 }
2035
2036                 ++pn->inv_ncp_quick_count;
2037
2038                 /*
2039                  * Success, disassociate and release the ncp.  Do not
2040                  * try to zap it here.
2041                  *
2042                  * NOTE: Releasing the ncp here leaves it in the tree,
2043                  *       but since we have disassociated the vnode this
2044                  *       ncp entry becomes 'trivial' and successive calls
2045                  *       to cache_inval_vp_quick() will be able to continue
2046                  *       to make progress.
2047                  */
2048                 _cache_setunresolved(ncp, 1);
2049                 _cache_put(ncp);
2050                 spin_lock(&vp->v_spin); /* relock to fetch the next head */
2051         }
2052         spin_unlock(&vp->v_spin);
2053 }
2054
2055 /*
2056  * Clears the universal directory search 'ok' flag.  This flag allows
2057  * nlookup() to bypass normal vnode checks.  This flag is a cached flag
2058  * so clearing it simply forces revalidation.
2059  */
2060 void
2061 cache_inval_wxok(struct vnode *vp)
2062 {
2063         struct namecache *ncp;
2064         /* walk every ncp attached to vp with the vnode spinlock held */
2065         spin_lock(&vp->v_spin);
2066         TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
2067                 if (ncp->nc_flag & (NCF_WXOK | NCF_NOTX)) /* skip if clear */
2068                         atomic_clear_short(&ncp->nc_flag, NCF_WXOK | NCF_NOTX);
2069         }
2070         spin_unlock(&vp->v_spin);
2071 }
2072
2073 /*
2074  * The source ncp has been renamed to the target ncp.  All elements have been
2075  * locked, including the parent ncp's.
2076  *
2077  * The target ncp is destroyed (as a normal rename-over would destroy the
2078  * target file or directory).
2079  *
2080  * Because there may be references to the source ncp we cannot copy its
2081  * contents to the target.  Instead the source ncp is relinked as the target
2082  * and the target ncp is removed from the namecache topology.
2083  */
2084 void
2085 cache_rename(struct nchandle *fnch, struct nchandle *tnch)
2086 {
2087         struct namecache *fncp = fnch->ncp;
2088         struct namecache *tncp = tnch->ncp;
2089         struct namecache *par;
2090         struct nchash_head *nchpp;
2091         u_int32_t hash;
2092         char *oname;
2093         char *nname;
2094         /* copy the target's name up front; fncp takes it over below */
2095         if (tncp->nc_nlen) {
2096                 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHEAUX, M_WAITOK);
2097                 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
2098                 nname[tncp->nc_nlen] = 0;
2099         } else {
2100                 nname = NULL;
2101         }
2102
2103         /*
2104          * Rename fncp (unlink)
2105          */
2106         if (fncp->nc_parent) {
2107                 par = fncp->nc_parent;
2108                 _cache_hold(par);
2109                 _cache_lock(par);
2110                 nchpp = fncp->nc_head;
2111                 spin_lock(&nchpp->spin);
2112                 _cache_unlink_parent(par, fncp, nchpp); /* eats nchpp */
2113                 _cache_put(par);
2114         } else {
2115                 par = NULL;     /* both are reassigned before further use */
2116                 nchpp = NULL;
2117         }
2118         oname = fncp->nc_name;
2119         fncp->nc_name = nname;
2120         fncp->nc_nlen = tncp->nc_nlen;
2121         if (oname)
2122                 kfree(oname, M_VFSCACHEAUX);
2123         /* the new parent is the target's parent, which we must hold locked */
2124         par = tncp->nc_parent;
2125         KKASSERT(par->nc_lock.lk_lockholder == curthread);
2126
2127         /*
2128          * Rename fncp (relink)
2129          */
2130         hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
2131         hash = fnv_32_buf(&par, sizeof(par), hash); /* mix in parent ptr */
2132         nchpp = NCHHASH(hash);
2133
2134         spin_lock(&nchpp->spin);
2135         _cache_link_parent(fncp, par, nchpp);
2136         spin_unlock(&nchpp->spin);
2137
2138         /*
2139          * Get rid of the overwritten tncp (unlink)
2140          */
2141         _cache_unlink(tncp);
2142 }
2143
2144 /*
2145  * Perform actions consistent with unlinking a file.  The passed-in ncp
2146  * must be locked.
2147  *
2148  * The ncp is marked DESTROYED so it no longer shows up in searches,
2149  * and will be physically deleted when the vnode goes away.
2150  *
2151  * If the related vnode has no refs then we cycle it through vget()/vput()
2152  * to (possibly if we don't have a ref race) trigger a deactivation,
2153  * allowing the VFS to trivially detect and recycle the deleted vnode
2154  * via VOP_INACTIVE().
2155  *
2156  * NOTE: cache_rename() will automatically call _cache_unlink() on the
2157  *       target ncp.
2158  */
2159 void
2160 cache_unlink(struct nchandle *nch)
2161 {
2162         _cache_unlink(nch->ncp);        /* all work is done on the raw ncp */
2163 }
2164 /* Worker called by cache_unlink() and cache_rename() on a raw ncp. */
2165 static void
2166 _cache_unlink(struct namecache *ncp)
2167 {
2168         struct vnode *vp;
2169
2170         /*
2171          * Causes lookups to fail and allows another ncp with the same
2172          * name to be created under ncp->nc_parent.
2173          */
2174         _cache_ncp_gen_enter(ncp);
2175         ncp->nc_flag |= NCF_DESTROYED;
2176
2177         /*
2178          * Attempt to trigger a deactivation.  Set VREF_FINALIZE to
2179          * force action on the 1->0 transition.  Do not destroy the
2180          * vp association if a vp is present (leave the destroyed ncp
2181          * resolved through the vp finalization).
2182          *
2183          * Cleanup the refs in the resolved-not-found case by setting
2184          * the ncp to an unresolved state.  This improves our ability
2185          * to get rid of dead ncp elements in other cache_*() routines.
2186          */
2187         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
2188                 vp = ncp->nc_vp;
2189                 if (vp) {
2190                         atomic_set_int(&vp->v_refcnt, VREF_FINALIZE);
2191                         if (VREFCNT(vp) <= 0) { /* no refs: cycle vget/vput */
2192                                 if (vget(vp, LK_SHARED) == 0)
2193                                         vput(vp);
2194                         }
2195                 } else {
2196                         _cache_setunresolved(ncp, 0); /* resolved-not-found */
2197                 }
2198         }
2199         _cache_ncp_gen_exit(ncp);
2200 }
2201
2202 /*
2203  * Return non-zero if the nch might be associated with an open and/or mmap()'d
2204  * file.  The easy solution is to just return non-zero if the vnode has refs.
2205  * Used to interlock hammer2 reclaims (VREF_FINALIZE should already be set to
2206  * force the reclaim).
2207  */
2208 int
2209 cache_isopen(struct nchandle *nch)
2210 {
2211         struct vnode *vp;
2212         struct namecache *ncp = nch->ncp;
2213         /* 1 only if: resolved, has a vnode, and that vnode has refs */
2214         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
2215             (vp = ncp->nc_vp) != NULL &&
2216             VREFCNT(vp)) {
2217                 return 1;
2218         }
2219         return 0;
2220 }
2221
2222
2223 /*
2224  * vget the vnode associated with the namecache entry.  Resolve the namecache
2225  * entry if necessary.  The passed ncp must be referenced and locked.  If
2226  * the ncp is resolved it might be locked shared.
2227  *
2228  * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
2229  * (depending on the passed lk_type) will be returned in *vpp with an error
2230  * of 0, or NULL will be returned in *vpp with a non-0 error code.  The
2231  * most typical error is ENOENT, meaning that the ncp represents a negative
2232  * cache hit and there is no vnode to retrieve, but other errors can occur
2233  * too.
2234  *
2235  * The vget() can race a reclaim.  If this occurs we re-resolve the
2236  * namecache entry.
2237  *
2238  * There are numerous places in the kernel where vget() is called on a
2239  * vnode while one or more of its namecache entries is locked.  Releasing
2240  * a vnode never deadlocks against locked namecache entries (the vnode
2241  * will not get recycled while referenced ncp's exist).  This means we
2242  * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
2243  * lock when acquiring the vp lock or we might cause a deadlock.
2244  *
2245  * NOTE: The passed-in ncp must be locked exclusively if it is initially
2246  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
2247  *       relocked exclusively before being re-resolved.
2248  */
2249 int
2250 cache_vget(struct nchandle *nch, struct ucred *cred,
2251            int lk_type, struct vnode **vpp)
2252 {
2253         struct namecache *ncp;
2254         struct vnode *vp;
2255         int error;
2256         u_int dummy_gen = 0;    /* passed to cache_resolve(), never read */
2257
2258         ncp = nch->ncp;
2259 again:
2260         vp = NULL;
2261         if (ncp->nc_flag & NCF_UNRESOLVED)
2262                 error = cache_resolve(nch, &dummy_gen, cred);
2263         else
2264                 error = 0;
2265         /* resolved with a vnode attached: try to acquire it */
2266         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
2267                 error = vget(vp, lk_type);
2268                 if (error) {
2269                         /*
2270                          * VRECLAIM race
2271                          *
2272                          * The ncp may have been locked shared, we must relock
2273                          * it exclusively before we can set it to unresolved.
2274                          */
2275                         if (error == ENOENT) {
2276                                 kprintf("Warning: vnode reclaim race detected "
2277                                         "in cache_vget on %p (%s)\n",
2278                                         vp, ncp->nc_name);
2279                                 _cache_unlock(ncp);
2280                                 _cache_lock(ncp);
2281                                 _cache_setunresolved(ncp, 1);
2282                                 goto again;     /* re-resolve and retry */
2283                         }
2284
2285                         /*
2286                          * Not a reclaim race, some other error.
2287                          */
2288                         KKASSERT(ncp->nc_vp == vp);
2289                         vp = NULL;      /* *vpp is NULL on any error */
2290                 } else {
2291                         KKASSERT(ncp->nc_vp == vp);
2292                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2293                 }
2294         }
2295         if (error == 0 && vp == NULL)   /* resolved with no vnode attached */
2296                 error = ENOENT;
2297         *vpp = vp;
2298         return(error);
2299 }
2300
2301 /*
2302  * Similar to cache_vget() but only acquires a ref on the vnode.  The vnode
2303  * is already held by virtue of the ncp being locked, but it might not be
2304  * referenced and while it is not referenced it can transition into the
2305  * VRECLAIMED state.
2306  *
2307  * NOTE: The passed-in ncp must be locked exclusively if it is initially
2308  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
2309  *       relocked exclusively before being re-resolved.
2310  *
2311  * NOTE: At the moment we have to issue a vget() on the vnode, even though
2312  *       we are going to immediately release the lock, in order to resolve
2313  *       potential reclamation races.  Once we have a solid vnode ref that
2314  *       was (at some point) interlocked via a vget(), the vnode will not
2315  *       be reclaimed.
2316  *
2317  * NOTE: vhold counts (v_auxrefs) do not prevent reclamation.
2318  */
2319 int
2320 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
2321 {
2322         struct namecache *ncp;
2323         struct vnode *vp;
2324         int error;
2325         int v;
2326         u_int dummy_gen = 0;    /* passed to cache_resolve(), never read */
2327
2328         ncp = nch->ncp;
2329 again:
2330         vp = NULL;
2331         if (ncp->nc_flag & NCF_UNRESOLVED)
2332                 error = cache_resolve(nch, &dummy_gen, cred);
2333         else
2334                 error = 0;
2335         /* loops only to retry the lockless ref; all other paths break out */
2336         while (error == 0 && (vp = ncp->nc_vp) != NULL) {
2337                 /*
2338                  * Try a lockless ref of the vnode.  VRECLAIMED transitions
2339                  * use the vx_lock state and update-counter mechanism so we
2340                  * can detect if one is in-progress or occurred.
2341                  *
2342                  * If we can successfully ref the vnode and interlock against
2343                  * the update-counter mechanism, and VRECLAIMED is found to
2344                  * not be set after that, we should be good.
2345                  */
2346                 v = spin_access_start_only(&vp->v_spin);
2347                 if (__predict_true(spin_access_check_inprog(v) == 0)) {
2348                         vref_special(vp);
2349                         if (__predict_false(
2350                                     spin_access_end_only(&vp->v_spin, v))) {
2351                                 vrele(vp);      /* undo the ref and retry */
2352                                 continue;
2353                         }
2354                         if (__predict_true((vp->v_flag & VRECLAIMED) == 0)) {
2355                                 break;          /* fast path succeeded */
2356                         }
2357                         vrele(vp);
2358                         kprintf("CACHE_VREF: IN-RECLAIM\n");
2359                 }
2360
2361                 /*
2362                  * Do it the slow way
2363                  */
2364                 error = vget(vp, LK_SHARED);
2365                 if (error) {
2366                         /*
2367                          * VRECLAIM race
2368                          */
2369                         if (error == ENOENT) {
2370                                 kprintf("Warning: vnode reclaim race detected "
2371                                         "in cache_vref on %p (%s)\n",
2372                                         vp, ncp->nc_name);
2373                                 _cache_unlock(ncp);
2374                                 _cache_lock(ncp);
2375                                 _cache_setunresolved(ncp, 1);
2376                                 goto again;     /* re-resolve and retry */
2377                         }
2378
2379                         /*
2380                          * Not a reclaim race, some other error.
2381                          */
2382                         KKASSERT(ncp->nc_vp == vp);
2383                         vp = NULL;      /* *vpp is NULL on any error */
2384                 } else {
2385                         KKASSERT(ncp->nc_vp == vp);
2386                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
2387                         /* caller does not want a lock */
2388                         vn_unlock(vp);
2389                 }
2390                 break;
2391         }
2392         if (error == 0 && vp == NULL)   /* resolved with no vnode attached */
2393                 error = ENOENT;
2394         *vpp = vp;
2395
2396         return(error);
2397 }
2398
2399 /*
2400  * Return a referenced vnode representing the parent directory of
2401  * ncp.
2402  *
2403  * Because the caller has locked the ncp it should not be possible for
2404  * the parent ncp to go away.  However, the parent can unresolve its
2405  * dvp at any time so we must be able to acquire a lock on the parent
2406  * to safely access nc_vp.
2407  *
2408  * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
2409  * so use vhold()/vdrop() while holding the lock to prevent dvp from
2410  * getting destroyed.
2411  *
2412  * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
2413  *       lock on the ncp in question.
2414  */
2415 struct vnode *
2416 cache_dvpref(struct namecache *ncp)
2417 {
2418         struct namecache *par;
2419         struct vnode *dvp;
2420         /* NULL is returned when there is no parent or no usable dvp */
2421         dvp = NULL;
2422         if ((par = ncp->nc_parent) != NULL) {
2423                 _cache_hold(par);
2424                 _cache_lock(par);
2425                 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
2426                         if ((dvp = par->nc_vp) != NULL)
2427                                 vhold(dvp);     /* pin dvp across the unlock */
2428                 }
2429                 _cache_unlock(par);
2430                 if (dvp) {
2431                         if (vget(dvp, LK_SHARED) == 0) {
2432                                 vn_unlock(dvp);
2433                                 vdrop(dvp);
2434                                 /* return refd, unlocked dvp */
2435                         } else {
2436                                 vdrop(dvp);
2437                                 dvp = NULL;     /* vget failed: no dvp */
2438                         }
2439                 }
2440                 _cache_drop(par);
2441         }
2442         return(dvp);
2443 }
2444
2445 /*
2446  * Convert a directory vnode to a namecache record without any other
2447  * knowledge of the topology.  This ONLY works with directory vnodes and
2448  * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
2449  * returned ncp (if not NULL) will be held and unlocked.
2450  *
2451  * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
2452  * If 'makeit' is 1 we attempt to track-down and create the namecache topology
2453  * for dvp.  This will fail only if the directory has been deleted out from
2454  * under the caller.
2455  *
2456  * Callers must always check for a NULL return no matter the value of 'makeit'.
2457  *
2458  * To avoid exhausting the kernel stack each recursive call increments
2459  * the makeit variable; once it exceeds 20 the recursion is abandoned in
      * favor of the iterative cache_fromdvp_try().
      *
      * The result is passed back through *nch (nch->mount is loaded from
      * dvp->v_mount).  The function returns 0 when nch->ncp ends up non-NULL
      * and EINVAL when it ends up NULL.
2460  */
2461
2462 static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2463                                   struct vnode *dvp, char *fakename);
2464 static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2465                                   struct vnode **saved_dvp);
2466
2467 int
2468 cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
2469               struct nchandle *nch)
2470 {
2471         struct vnode *saved_dvp;
2472         struct vnode *pvp;
2473         char *fakename;
2474         int error;
2475
2476         nch->ncp = NULL;
2477         nch->mount = dvp->v_mount;
2478         saved_dvp = NULL;
2479         fakename = NULL;
2480
2481         /*
2482          * Handle the makeit == 0 degenerate case
      *
      * Only an already-existing namecache record is used: take the first
      * entry on dvp->v_namecache (if any) and hold it.
2483          */
2484         if (makeit == 0) {
2485                 spin_lock_shared(&dvp->v_spin);
2486                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2487                 if (nch->ncp)
2488                         cache_hold(nch);
2489                 spin_unlock_shared(&dvp->v_spin);
2490         }
2491
2492         /*
2493          * Loop until resolution, inside code will break out on error.
2494          */
2495         while (makeit) {
2496                 /*
2497                  * Break out if we successfully acquire a working ncp.
2498                  */
2499                 spin_lock_shared(&dvp->v_spin);
2500                 nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
2501                 if (nch->ncp) {
2502                         cache_hold(nch);
2503                         spin_unlock_shared(&dvp->v_spin);
2504                         break;
2505                 }
2506                 spin_unlock_shared(&dvp->v_spin);
2507
2508                 /*
2509                  * If dvp is the root of its filesystem it should already
2510                  * have a namecache pointer associated with it as a side
2511                  * effect of the mount, but it may have been disassociated.
      *
      * nch->ncp is only borrowed for the duration of the resolve
      * here; on success the loop restarts and reloads it from
      * dvp->v_namecache, on failure it is reset to NULL.
2512                  */
2513                 if (dvp->v_flag & VROOT) {
2514                         nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
2515                         error = cache_resolve_mp(nch->mount, 1);
2516                         _cache_put(nch->ncp);
2517                         if (ncvp_debug & 1) {
2518                                 kprintf("cache_fromdvp: resolve root of "
2519                                         "mount %p error %d",
2520                                         dvp->v_mount, error);
2521                         }
2522                         if (error) {
2523                                 if (ncvp_debug & 1)
2524                                         kprintf(" failed\n");
2525                                 nch->ncp = NULL;
2526                                 break;
2527                         }
2528                         if (ncvp_debug & 1)
2529                                 kprintf(" succeeded\n");
2530                         continue;
2531                 }
2532
2533                 /*
2534                  * If we are recursed too deeply resort to an O(n^2)
2535                  * algorithm to resolve the namecache topology.  The
2536                  * resolved pvp is left referenced in saved_dvp to
2537                  * prevent the tree from being destroyed while we loop.
2538                  */
2539                 if (makeit > 20) {
2540                         error = cache_fromdvp_try(dvp, cred, &saved_dvp);
2541                         if (error) {
2542                                 kprintf("lookupdotdot(longpath) failed %d "
2543                                        "dvp %p\n", error, dvp);
2544                                 nch->ncp = NULL;
2545                                 break;
2546                         }
2547                         continue;
2548                 }
2549
2550                 /*
2551                  * Get the parent directory and resolve its ncp.
      *
      * Any fakename left over from a previous pass of the loop is
      * freed before the lookup hands back a new one.
2552                  */
2553                 if (fakename) {
2554                         kfree(fakename, M_TEMP);
2555                         fakename = NULL;
2556                 }
2557                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2558                                           &fakename);
2559                 if (error) {
2560                         kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
2561                         break;
2562                 }
2563                 vn_unlock(pvp);
2564
2565                 /*
2566                  * Reuse makeit as a recursion depth counter.  On success
2567                  * nch will be fully referenced.
2568                  */
2569                 cache_fromdvp(pvp, cred, makeit + 1, nch);
2570                 vrele(pvp);
2571                 if (nch->ncp == NULL)
2572                         break;
2573
2574                 /*
2575                  * Do an inefficient scan of pvp (embodied by ncp) to look
2576                  * for dvp.  This will create a namecache record for dvp on
2577                  * success.  We loop up to recheck on success.
2578                  *
2579                  * ncp and dvp are both held but not locked.
2580                  */
2581                 error = cache_inefficient_scan(nch, cred, dvp, fakename);
2582                 if (error) {
                             /* pvp was vrele()'d above; it is only printed here */
2583                         kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
2584                                 pvp, nch->ncp->nc_name, dvp);
2585                         cache_drop(nch);
2586                         /* nch was NULLed out, reload mount */
2587                         nch->mount = dvp->v_mount;
2588                         break;
2589                 }
2590                 if (ncvp_debug & 1) {
2591                         kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
2592                                 pvp, nch->ncp->nc_name);
2593                 }
2594                 cache_drop(nch);
2595                 /* nch was NULLed out, reload mount */
2596                 nch->mount = dvp->v_mount;
2597         }
2598
2599         /*
2600          * If nch->ncp is non-NULL it will have been held already.
      *
      * Release the temporary fakename buffer and the vnode reference
      * that cache_fromdvp_try() may have parked in saved_dvp.
2601          */
2602         if (fakename)
2603                 kfree(fakename, M_TEMP);
2604         if (saved_dvp)
2605                 vrele(saved_dvp);
2606         if (nch->ncp)
2607                 return (0);
2608         return (EINVAL);
2609 }
2610
2611 /*
2612  * Go up the chain of parent directories until we find something
2613  * we can resolve into the namecache.  This is very inefficient.
      *
      * The walk stops at the first parent that already has a namecache
      * record, or at a VROOT parent whose mount point could be resolved.
      * cache_inefficient_scan() is then run on that parent to create a
      * record for the directory directly below it.
      *
      * When the walk itself succeeds, the directory that was scanned for is
      * left referenced in *saved_dvp (a previous *saved_dvp, if any, is
      * released first), regardless of the result of the scan.  When the walk
      * fails, *saved_dvp is left untouched.
      *
      * Returns 0 on success, otherwise the error from vop_nlookupdotdot(),
      * cache_resolve_mp() or cache_inefficient_scan().
2614  */
2615 static
2616 int
2617 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2618                   struct vnode **saved_dvp)
2619 {
2620         struct nchandle nch;
2621         struct vnode *pvp;
2622         int error;
             /* uptime of the last warning, used to rate-limit the kprintf below */
2623         static time_t last_fromdvp_report;
2624         char *fakename;
2625
2626         /*
2627          * Loop getting the parent directory vnode until we get something we
2628          * can resolve in the namecache.
      *
      * dvp carries its own reference throughout the loop; every exit
      * path either releases it or hands it over to *saved_dvp.
2629          */
2630         vref(dvp);
2631         nch.mount = dvp->v_mount;
2632         nch.ncp = NULL;
2633         fakename = NULL;
2634
2635         for (;;) {
2636                 if (fakename) {
2637                         kfree(fakename, M_TEMP);
2638                         fakename = NULL;
2639                 }
2640                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2641                                           &fakename);
2642                 if (error) {
2643                         vrele(dvp);
2644                         break;
2645                 }
2646                 vn_unlock(pvp);
                     /* Parent already in the namecache: hold its ncp and stop. */
2647                 spin_lock_shared(&pvp->v_spin);
2648                 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
2649                         _cache_hold(nch.ncp);
2650                         spin_unlock_shared(&pvp->v_spin);
2651                         vrele(pvp);
2652                         break;
2653                 }
2654                 spin_unlock_shared(&pvp->v_spin);
                     /*
                      * Parent is a filesystem root without a namecache
                      * record: try to resolve the mount point.  On success
                      * nch.ncp stays held (but unlocked) for the scan below.
                      */
2655                 if (pvp->v_flag & VROOT) {
2656                         nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
2657                         error = cache_resolve_mp(nch.mount, 1);
2658                         _cache_unlock(nch.ncp);
2659                         vrele(pvp);
2660                         if (error) {
2661                                 _cache_drop(nch.ncp);
2662                                 nch.ncp = NULL;
2663                                 vrele(dvp);
2664                         }
2665                         break;
2666                 }
                     /* Nothing usable yet, step up one directory level. */
2667                 vrele(dvp);
2668                 dvp = pvp;
2669         }
2670         if (error == 0) {
                     /* Warn at most once per distinct time_uptime value. */
2671                 if (last_fromdvp_report != time_uptime) {
2672                         last_fromdvp_report = time_uptime;
2673                         kprintf("Warning: extremely inefficient path "
2674                                 "resolution on %s\n",
2675                                 nch.ncp->nc_name);
2676                 }
2677                 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
2678
2679                 /*
2680                  * Hopefully dvp now has a namecache record associated with
2681                  * it.  Leave it referenced to prevent the kernel from
2682                  * recycling the vnode.  Otherwise extremely long directory
2683                  * paths could result in endless recycling.
2684                  */
2685                 if (*saved_dvp)
2686                     vrele(*saved_dvp);
2687                 *saved_dvp = dvp;
2688                 _cache_drop(nch.ncp);
2689         }
2690         if (fakename)
2691                 kfree(fakename, M_TEMP);
2692         return (error);
2693 }
2694
2695 /*
2696  * Do an inefficient scan of the directory represented by ncp looking for
2697  * the directory vnode dvp.  ncp must be held but not locked on entry and
2698  * will be held on return.  dvp must be refd but not locked on entry and
2699  * will remain refd on return.
2700  *
2701  * Why do this at all?  Well, due to its stateless nature the NFS server
2702  * converts file handles directly to vnodes without necessarily going through
2703  * the namecache ops that would otherwise create the namecache topology
2704  * leading to the vnode.  We could either (1) Change the namecache algorithms
2705  * to allow disconnected namecache records that are re-merged opportunistically,
2706  * or (2) Make the NFS server backtrack and scan to recover a connected
2707  * namecache topology in order to then be able to issue new API lookups.
2708  *
2709  * It turns out that (1) is a huge mess.  It takes a nice clean set of
2710  * namecache algorithms and introduces a lot of complication in every subsystem
2711  * that calls into the namecache to deal with the re-merge case, especially
2712  * since we are using the namecache to placehold negative lookups and the
2713  * vnode might not be immediately assigned. (2) is certainly far less
2714  * efficient than (1), but since we are only talking about directories here
2715  * (which are likely to remain cached), the case does not actually run all
2716  * that often and has the supreme advantage of not polluting the namecache
2717  * algorithms.
2718  *
2719  * If a fakename is supplied just construct a namecache entry using the
2720  * fake name.
      *
      * Returns 0 on success.  Fails with the error from VOP_GETATTR() or
      * cache_vref(), with ENOENT when no matching entry is found (this
      * includes a failed VOP_READDIR()), or with the looked-up entry's
      * nc_error when that entry ends up without a vnode.
2721  */
2722 static int
2723 cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
2724                        struct vnode *dvp, char *fakename)
2725 {
2726         struct nlcomponent nlc;
2727         struct nchandle rncp;
2728         struct dirent *den;
2729         struct vnode *pvp;
2730         struct vattr vat;
2731         struct iovec iov;
2732         struct uio uio;
2733         int blksize;
2734         int eofflag;
2735         int bytes;
2736         char *rbuf;
2737         int error;
2738
             /*
              * Fetch dvp's attributes (va_fileid is what directory entries
              * are matched against) and get a referenced vnode, pvp, for
              * the directory that nch represents.
              */
2739         vat.va_blocksize = 0;
2740         if ((error = VOP_GETATTR(dvp, &vat)) != 0)
2741                 return (error);
2742         cache_lock(nch);
2743         error = cache_vref(nch, cred, &pvp);
2744         cache_unlock(nch);
2745         if (error)
2746                 return (error);
2747         if (ncvp_debug & 1) {
2748                 kprintf("inefficient_scan of (%p,%s): directory iosize %ld "
2749                         "vattr fileid = %lld\n",
2750                         nch->ncp, nch->ncp->nc_name,
2751                         vat.va_blocksize,
2752                         (long long)vat.va_fileid);
2753         }
2754
2755         /*
2756          * Use the supplied fakename if not NULL.  Fake names are typically
2757          * not in the actual filesystem hierarchy.  This is used by HAMMER
2758          * to glue @@timestamp recursions together.
      *
      * This path skips the directory read entirely, so no read buffer
      * is allocated or freed for it.
2759          */
2760         if (fakename) {
2761                 nlc.nlc_nameptr = fakename;
2762                 nlc.nlc_namelen = strlen(fakename);
2763                 rncp = cache_nlookup(nch, &nlc);
2764                 goto done;
2765         }
2766
             /* Read buffer size: va_blocksize, or DEV_BSIZE if that is 0. */
2767         if ((blksize = vat.va_blocksize) == 0)
2768                 blksize = DEV_BSIZE;
2769         rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
2770         rncp.ncp = NULL;
2771
2772         eofflag = 0;
2773         uio.uio_offset = 0;
2774 again:
             /*
              * Set the uio up for another buffer-sized read.  uio_offset is
              * not reset here, so it keeps whatever VOP_READDIR() left in it.
              */
2775         iov.iov_base = rbuf;
2776         iov.iov_len = blksize;
2777         uio.uio_iov = &iov;
2778         uio.uio_iovcnt = 1;
2779         uio.uio_resid = blksize;
2780         uio.uio_segflg = UIO_SYSSPACE;
2781         uio.uio_rw = UIO_READ;
2782         uio.uio_td = curthread;
2783
2784         if (ncvp_debug & 2)
2785                 kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
2786         error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
2787         if (error == 0) {
2788                 den = (struct dirent *)rbuf;
2789                 bytes = blksize - uio.uio_resid;
2790
                     /*
                      * Walk the returned entries looking for a non-whiteout
                      * entry whose inode number equals dvp's fileid.
                      */
2791                 while (bytes > 0) {
2792                         if (ncvp_debug & 2) {
2793                                 kprintf("cache_inefficient_scan: %*.*s\n",
2794                                         den->d_namlen, den->d_namlen,
2795                                         den->d_name);
2796                         }
2797                         if (den->d_type != DT_WHT &&
2798                             den->d_ino == vat.va_fileid) {
2799                                 if (ncvp_debug & 1) {
2800                                         kprintf("cache_inefficient_scan: "
2801                                                "MATCHED inode %lld path %s/%*.*s\n",
2802                                                (long long)vat.va_fileid,
2803                                                nch->ncp->nc_name,
2804                                                den->d_namlen, den->d_namlen,
2805                                                den->d_name);
2806                                 }
2807                                 nlc.nlc_nameptr = den->d_name;
2808                                 nlc.nlc_namelen = den->d_namlen;
2809                                 rncp = cache_nlookup(nch, &nlc);
2810                                 KKASSERT(rncp.ncp != NULL);
2811                                 break;
2812                         }
2813                         bytes -= _DIRENT_DIRSIZ(den);
2814                         den = _DIRENT_NEXT(den);
2815                 }
                     /*
                      * No match yet: read again unless EOF was flagged or
                      * the last read returned no data at all.
                      */
2816                 if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
2817                         goto again;
2818         }
2819         kfree(rbuf, M_TEMP);
2820 done:
2821         vrele(pvp);
2822         if (rncp.ncp) {
                     /*
                      * Associate dvp with the entry only if it is still
                      * unresolved; an already-resolved entry is left alone.
                      */
2823                 if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
2824                         _cache_setvp(rncp.mount, rncp.ncp, dvp, 1);
2825                         if (ncvp_debug & 2) {
2826                                 kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
2827                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
2828                         }
2829                 } else {
2830                         if (ncvp_debug & 2) {
2831                                 kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
2832                                         nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
2833                                         rncp.ncp->nc_vp);
2834                         }
2835                 }
2836                 if (rncp.ncp->nc_vp == NULL)
2837                         error = rncp.ncp->nc_error;
2838                 /*
2839                  * Release rncp after a successful nlookup.  rncp was fully
2840                  * referenced.
2841                  */
2842                 cache_put(&rncp);
2843         } else {
2844                 kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
2845                         dvp, nch->ncp->nc_name);
2846                 error = ENOENT;
2847         }
2848         return (error);
2849 }
2850
2851 /*
2852  * This function must be called with the ncp held and locked and will unlock
2853  * and drop it during zapping.
2854  *
2855  * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
2856  * state, which disassociates it from its vnode or pcpu_ncache[n].neg_list
2857  * and removes the related reference.  If the ncp can be removed, and the
2858  * parent can be zapped non-blocking, this function loops up.
2859  *
2860  * There will be one ref from the caller (which we now own).  The only
2861  * remaining autonomous refs to the ncp will then be due to nc_parent->nc_list,
2862  * so possibly 2 refs left.  Taking this into account, if there are no
2863  * additional refs and no children, the ncp will be removed from the topology
2864  * and destroyed.
2865  *
2866  * References and/or children may exist if the ncp is in the middle of the
2867  * topology, preventing the ncp from being destroyed.
2868  *
2869  * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2870  *
2871  * This function may return a held (but NOT locked) parent node which the
2872  * caller must drop in a loop.  Looping is one way to avoid unbounded recursion
2873  * due to deep namecache trees.
2874  *
2875  * WARNING!  For MPSAFE operation this routine must acquire up to three
2876  *           spin locks to be able to safely test nc_refs.  Lock order is
2877  *           very important.
2878  *
2879  *           hash spinlock if on hash list
2880  *           parent spinlock if child of parent
2881  *           (the ncp is unresolved so there is no vnode association)
      *
      * Returns 1 if at least one ncp was destroyed, 0 if none was.
      *
      * NOTE(review): 'nonblock' is a local fixed at 1 in this version, and
      *               the function returns an int rather than a parent node;
      *               the two paragraphs above that mention them look like
      *               they predate that - confirm.
2882  */
2883 static int
2884 cache_zap(struct namecache *ncp)
2885 {
2886         struct namecache *par;
2887         struct nchash_head *nchpp;
2888         int refcmp;
2889         int nonblock = 1;       /* XXX cleanup */
2890         int res = 0;
2891
2892 again:
2893         /*
2894          * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2895          * This gets rid of any vp->v_namecache list or negative list and
2896          * the related ref.
2897          */
2898         _cache_setunresolved(ncp, 1);
2899
2900         /*
2901          * Try to scrap the entry and possibly tail-recurse on its parent.
2902          * We only scrap unref'd (other than our ref) unresolved entries,
2903          * we do not scrap 'live' entries.
2904          *
2905          * If nc_parent is non NULL we expect 2 references, else just 1.
2906          * If there are more, someone else also holds the ncp and we cannot
2907          * destroy it.
2908          */
2909         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2910         KKASSERT(ncp->nc_refs > 0);
2911
2912         /*
2913          * If the ncp is linked to its parent it will also be in the hash
2914          * table.  We have to be able to lock the parent and the hash table.
2915          *
2916          * Acquire locks.  Note that the parent can't go away while we hold
2917          * a child locked.  If nc_parent is present, expect 2 refs instead
2918          * of 1.
      *
      * If the non-blocking lock on the parent fails, the ncp is flagged
      * NCF_DEFEREDZAP, this cpu's numdefered counter is bumped, and the
      * caller's lock and ref on the ncp are released.
2919          */
2920         nchpp = NULL;
2921         if ((par = ncp->nc_parent) != NULL) {
2922                 if (nonblock) {
2923                         if (_cache_lock_nonblock(par)) {
2924                                 /* lock failed */
2925                                 ncp->nc_flag |= NCF_DEFEREDZAP;
2926                                 atomic_add_long(
2927                                     &pcpu_ncache[mycpu->gd_cpuid].numdefered,
2928                                     1);
2929                                 _cache_unlock(ncp);
2930                                 _cache_drop(ncp);       /* caller's ref */
2931                                 return res;
2932                         }
2933                         _cache_hold(par);
2934                 } else {
2935                         _cache_hold(par);
2936                         _cache_lock(par);
2937                 }
2938                 nchpp = ncp->nc_head;
2939                 spin_lock(&nchpp->spin);
2940         }
2941
2942         /*
2943          * With the parent and nchpp locked, and the vnode removed
2944          * (no vp->v_namecache), we expect 1 or 2 refs.  If there are
2945          * more someone else has a ref and we cannot zap the entry.
2946          *
2947          * one for our hold
2948          * one for our parent link (parent also has one from the linkage)
2949          */
2950         if (par)
2951                 refcmp = 2;
2952         else
2953                 refcmp = 1;
2954
2955         /*
2956          * On failure undo the work we've done so far and drop the
2957          * caller's ref and ncp.
2958          */
2959         if (ncp->nc_refs != refcmp || TAILQ_FIRST(&ncp->nc_list)) {
2960                 if (par) {
2961                         spin_unlock(&nchpp->spin);
2962                         _cache_put(par);
2963                 }
2964                 _cache_unlock(ncp);
2965                 _cache_drop(ncp);
2966                 return res;
2967         }
2968
2969         /*
2970          * We own all the refs and with the spinlocks held no further
2971          * refs can be acquired by others.
2972          *
2973          * Remove us from the hash list and parent list.  We have to
2974          * drop a ref on the parent's vp if the parent's list becomes
2975          * empty.
2976          */
2977         if (par) {
2978                 KKASSERT(nchpp == ncp->nc_head);
2979                 _cache_unlink_parent(par, ncp, nchpp); /* eats nchpp */
2980                 /*_cache_unlock(par);*/
2981                 /* &nchpp->spin is unlocked by call */
2982         } else {
2983                 KKASSERT(ncp->nc_head == NULL);
2984         }
2985
2986         /*
2987          * ncp should not have picked up any refs.  Physically
2988          * destroy the ncp.
2989          */
2990         if (ncp->nc_refs != refcmp) {
2991                 panic("cache_zap: %p bad refs %d (expected %d)\n",
2992                         ncp, ncp->nc_refs, refcmp);
2993         }
2994         /* _cache_unlock(ncp) not required */
2995         ncp->nc_refs = -1;      /* safety */
2996         if (ncp->nc_name)
2997                 kfree(ncp->nc_name, M_VFSCACHEAUX);
2998         kfree_obj(ncp, M_VFSCACHE);
2999         res = 1;
3000
3001         /*
3002          * Loop up if we can recursively clean out the parent.
      *
      * par is still locked and still carries the hold taken above, so
      * when it qualifies (unresolved, no other refs, no children) it
      * simply becomes the ncp for the next pass.  Otherwise it is
      * unlocked and dropped.
3003          */
3004         if (par) {
3005                 refcmp = 1;             /* ref on parent */
3006                 if (par->nc_parent)     /* par->par */
3007                         ++refcmp;
3008                 par->nc_flag &= ~NCF_DEFEREDZAP;
3009                 if ((par->nc_flag & NCF_UNRESOLVED) &&
3010                     par->nc_refs == refcmp &&
3011                     TAILQ_EMPTY(&par->nc_list))
3012                 {
3013                         ncp = par;
3014                         goto again;
3015                 }
3016                 _cache_unlock(par);
3017                 _cache_drop(par);
3018         }
3019         return 1;
3020 }
3021
3022 /*
3023  * Clean up dangling negative cache and defered-drop entries in the
3024  * namecache.
3025  *
3026  * This routine is called in the critical path and also called from
3027  * vnlru().  When called from vnlru we use a lower limit to try to
3028  * deal with the negative cache before the critical path has to start
3029  * dealing with it.
3030  */
3031 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
3032
3033 static cache_hs_t neg_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
3034 static cache_hs_t pos_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
3035 static cache_hs_t exc_cache_hysteresis_state[2] = { CHI_LOW, CHI_LOW };
3036
3037 void
3038 cache_hysteresis(int critpath)
3039 {
3040         long poslimit;
3041         long exclimit;
3042         long neglimit;
3043         long xnumunres;
3044         long xnumleafs;
3045         long clean_neg;
3046         long clean_unres;
3047         long clean_excess;
3048
3049         /*
3050          * Calculate negative ncp limit
3051          */
3052         neglimit = maxvnodes / ncnegfactor;
3053         if (critpath == 0)
3054                 neglimit = neglimit * 8 / 10;
3055
3056         /*
3057          * Don't cache too many negative hits.  We use hysteresis to reduce
3058          * the impact on the critical path.
3059          */
3060         clean_neg = 0;
3061
3062         switch(neg_cache_hysteresis_state[critpath]) {
3063         case CHI_LOW:
3064                 if (vfscache_negs > MINNEG && vfscache_negs > neglimit) {
3065                         if (critpath)
3066                                 clean_neg = ncnegflush;
3067                         else
3068                                 clean_neg = ncnegflush +
3069                                             vfscache_negs - neglimit;
3070                         neg_cache_hysteresis_state[critpath] = CHI_HIGH;
3071                 }
3072                 break;
3073         case CHI_HIGH:
3074                 if (vfscache_negs > MINNEG * 9 / 10 &&
3075                     vfscache_negs * 9 / 10 > neglimit
3076                 ) {
3077                         if (critpath)
3078                                 clean_neg = ncnegflush;
3079                         else
3080                                 clean_neg = ncnegflush +
3081                                             vfscache_negs * 9 / 10 -
3082                                             neglimit;
3083                 } else {
3084                         neg_cache_hysteresis_state[critpath] = CHI_LOW;
3085                 }
3086                 break;
3087         }
3088         if (clean_neg)
3089                 _cache_cleanneg(clean_neg);
3090
3091         /*
3092          * Don't cache too many unresolved elements.  We use hysteresis to
3093          * reduce the impact on the critical path.
3094          */
3095         if ((poslimit = ncposlimit) == 0)
3096                 poslimit = maxvnodes / ncposfactor;
3097         if (critpath == 0)
3098                 poslimit = poslimit * 8 / 10;
3099
3100         /*
3101          * Number of unresolved leaf elements in the namecache.  These
3102          * can build-up for various reasons and may have to be disposed
3103          * of to allow the inactive list to be cleaned out by vnlru_proc()
3104          *
3105          * Collect count
3106          */
3107         xnumunres = vfscache_unres;
3108         clean_unres = 0;
3109
3110         switch(pos_cache_hysteresis_state[critpath]) {
3111         case CHI_LOW:
3112                 if (xnumunres > poslimit && xnumunres > MINPOS) {
3113                         if (critpath)
3114                                 clean_unres = ncposflush;
3115                         else
3116                                 clean_unres = ncposflush + xnumunres -
3117                                               poslimit;
3118                         pos_cache_hysteresis_state[critpath] = CHI_HIGH;
3119                 }
3120                 break;
3121         case CHI_HIGH:
3122                 if (xnumunres > poslimit * 5 / 6 && xnumunres > MINPOS) {
3123                         if (critpath)
3124                                 clean_unres = ncposflush;
3125                         else
3126                                 clean_unres = ncposflush + xnumunres -
3127                                               poslimit * 5 / 6;
3128                 } else {
3129                         pos_cache_hysteresis_state[critpath] = CHI_LOW;
3130                 }
3131                 break;
3132         }
3133
3134         /*
3135          * Excessive positive hits can accumulate due to large numbers of
3136          * hardlinks (the vnode cache will not prevent ncps representing
3137          * hardlinks from growing into infinity).
3138          */
3139         exclimit = maxvnodes * 2;
3140         if (critpath == 0)
3141                 exclimit = exclimit * 8 / 10;
3142         xnumleafs = vfscache_leafs;
3143         clean_excess = 0;
3144
3145         switch(exc_cache_hysteresis_state[critpath]) {
3146         case CHI_LOW:
3147                 if (xnumleafs > exclimit && xnumleafs > MINPOS) {
3148                         if (critpath)
3149                                 clean_excess = ncposflush;
3150                         else
3151                                 clean_excess = ncposflush + xnumleafs -
3152                                                exclimit;
3153                         exc_cache_hysteresis_state[critpath] = CHI_HIGH;
3154                 }
3155                 break;
3156         case CHI_HIGH:
3157                 if (xnumleafs > exclimit * 5 / 6 && xnumleafs > MINPOS) {
3158                         if (critpath)
3159                                 clean_excess = ncposflush;
3160                         else
3161                                 clean_excess = ncposflush + xnumleafs -
3162                                                exclimit * 5 / 6;
3163                 } else {
3164                         exc_cache_hysteresis_state[critpath] = CHI_LOW;
3165                 }
3166                 break;
3167         }
3168
3169         if (clean_unres || clean_excess)
3170                 _cache_cleanpos(clean_unres, clean_excess);
3171
3172         /*
3173          * Clean out dangling deferred-zap ncps which could not be cleanly
3174          * dropped if too many build up.  Note that numdefered is
3175          * heuristic.  Make sure we are real-time for the current cpu,
3176          * plus the global rollup.
3177          */
3178         if (pcpu_ncache[mycpu->gd_cpuid].numdefered + numdefered > neglimit) {
3179                 _cache_cleandefered();
3180         }
3181 }
3182
3183 /*
3184  * NEW NAMECACHE LOOKUP API
3185  *
3186  * Lookup an entry in the namecache.  The passed par_nch must be referenced
3187  * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
3188  * is ALWAYS returned, even if the supplied component is illegal.
3189  *
3190  * The resulting namecache entry should be returned to the system with
3191  * cache_put() or cache_unlock() + cache_drop().
3192  *
3193  * namecache locks are recursive but care must be taken to avoid lock order
3194  * reversals (hence why the passed par_nch must be unlocked).  Locking
3195  * rules are to order for parent traversals, not for child traversals.
3196  *
3197  * Nobody else will be able to manipulate the associated namespace (e.g.
3198  * create, delete, rename, rename-target) until the caller unlocks the
3199  * entry.
3200  *
3201  * The returned entry will be in one of three states:  positive hit (non-null
3202  * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
3203  * Unresolved entries must be resolved through the filesystem to associate the
3204  * vnode and/or determine whether a positive or negative hit has occurred.
3205  *
3206  * It is not necessary to lock a directory in order to lock namespace under
3207  * that directory.  In fact, it is explicitly not allowed to do that.  A
3208  * directory is typically only locked when being created, renamed, or
3209  * destroyed.
3210  *
3211  * The directory (par) may be unresolved, in which case any returned child
3212  * will likely also be marked unresolved.  Likely but not guaranteed.  Since
3213  * the filesystem lookup requires a resolved directory vnode the caller is
3214  * responsible for resolving the namecache chain top-down.  This API
3215  * specifically allows whole chains to be created in an unresolved state.
      *
      * par_nch - handle for the parent directory entry (referenced, unlocked).
      * nlc     - name component to look up (nlc_nameptr / nlc_namelen).
      *
      * The returned handle uses par_nch->mount, on which an additional mount
      * ref is taken via _cache_mntref() before returning.
3216  */
3217 struct nchandle
3218 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
3219 {
3220         struct nchandle nch;
3221         struct namecache *ncp;
3222         struct namecache *new_ncp;
3223         struct namecache *rep_ncp;      /* reuse a destroyed ncp */
3224         struct nchash_head *nchpp;
3225         struct mount *mp;
3226         u_int32_t hash;
3227         globaldata_t gd;
3228         int par_locked;
3229         int use_excl;
3230
3231         gd = mycpu;
3232         mp = par_nch->mount;
3233         par_locked = 0;
3234
3235         /*
3236          * This is a good time to call it, no ncp's are locked by
3237          * the caller or us.
3238          */
3239         cache_hysteresis(1);
3240
3241         /*
3242          * Try to locate an existing entry
              *
              * The hash covers the name bytes and then the parent ncp pointer
              * value.  The first pass scans with the chain spinlock held
              * shared (use_excl == 0).
3243          */
3244         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3245         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3246         new_ncp = NULL;
3247         use_excl = 0;
3248         nchpp = NCHHASH(hash);
3249 restart:
3250         rep_ncp = NULL;
3251         if (use_excl)
3252                 spin_lock(&nchpp->spin);
3253         else
3254                 spin_lock_shared(&nchpp->spin);
3255
3256         /*
3257          * Do a reverse scan to collect any DESTROYED ncps prior to matching
3258          * an existing entry.
3259          */
3260         TAILQ_FOREACH_REVERSE(ncp, &nchpp->list, nchash_list, nc_hash) {
3261                 /*
3262                  * Break out if we find a matching entry.  Note that
3263                  * UNRESOLVED entries may match, but DESTROYED entries
3264                  * do not.
3265                  *
3266                  * We may be able to reuse DESTROYED entries that we come
3267                  * across, even if the name does not match, as long as
3268                  * nc_nlen is correct and the only hold ref is from the nchpp
3269                  * list itself.
3270                  */
3271                 if (ncp->nc_parent == par_nch->ncp &&
3272                     ncp->nc_nlen == nlc->nlc_namelen) {
3273                         if (ncp->nc_flag & NCF_DESTROYED) {
3274                                 if (ncp->nc_refs == 1 && rep_ncp == NULL)
3275                                         rep_ncp = ncp;
3276                                 continue;
3277                         }
3278                         if (bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen))
3279                                 continue;
3280
3281                         /*
3282                          * Matched ncp
                              *
                              * Take refs on the match (and on any DESTROYED
                              * candidate seen earlier in the scan) before
                              * releasing the chain spinlock.
3283                          */
3284                         _cache_hold(ncp);
3285                         if (rep_ncp)
3286                                 _cache_hold(rep_ncp);
3287
3288                         if (use_excl)
3289                                 spin_unlock(&nchpp->spin);
3290                         else
3291                                 spin_unlock_shared(&nchpp->spin);
3292
3293                         if (par_locked) {
3294                                 _cache_unlock(par_nch->ncp);
3295                                 par_locked = 0;
3296                         }
3297
3298                         /*
3299                          * Really try to destroy rep_ncp if encountered.
3300                          * Various edge cases can build up more than one,
3301                          * so loop if we succeed.  This isn't perfect, but
3302                          * we can't afford to have tons of entries build
3303                          * up on a single nchpp list due to rename-over
3304                          * operations.  If that were to happen, the system
3305                          * would bog down quickly.
3306                          */
3307                         if (rep_ncp) {
3308                                 if (_cache_lock_nonblock(rep_ncp) == 0) {
3309                                         if (rep_ncp->nc_flag & NCF_DESTROYED) {
3310                                                 if (cache_zap(rep_ncp)) {
3311                                                         _cache_drop(ncp);
3312                                                         goto restart;
3313                                                 }
3314                                         } else {
3315                                                 _cache_unlock(rep_ncp);
3316                                                 _cache_drop(rep_ncp);
3317                                         }
3318                                 } else {
3319                                         _cache_drop(rep_ncp);
3320                                 }
3321                         }
3322
3323                         /*
3324                          * Continue processing the matched entry
3325                          */
3326                         if (_cache_lock_special(ncp) == 0) {
3327                                 /*
3328                                  * Successfully locked but we must re-test
3329                                  * conditions that might have changed since
3330                                  * we did not have the lock before.
3331                                  */
3332                                 if (ncp->nc_parent != par_nch->ncp ||
3333                                     ncp->nc_nlen != nlc->nlc_namelen ||
3334                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
3335                                          ncp->nc_nlen) ||
3336                                     (ncp->nc_flag & NCF_DESTROYED)) {
3337                                         _cache_put(ncp);
3338                                         goto restart;
3339                                 }
3340                                 _cache_auto_unresolve(mp, ncp);
3341                                 if (new_ncp) {
3342                                         _cache_free(new_ncp);
3343                                         new_ncp = NULL; /* safety */
3344                                 }
3345                                 goto found;
3346                         }
3347                         _cache_get(ncp);        /* cycle the lock to block */
3348                         _cache_put(ncp);
3349                         _cache_drop(ncp);
3350                         goto restart;
3351                 }
3352         }
3353
3354         /*
3355          * We failed to locate the entry, try to resurrect a destroyed
3356          * entry that we did find that is already correctly linked into
3357          * nchpp and the parent.  We must re-test conditions after
3358          * successfully locking rep_ncp.
3359          *
3360          * This case can occur under heavy loads due to not being able
3361          * to safely lock the parent in cache_zap().  Nominally a repeated
3362          * create/unlink load, but only the namelen needs to match.
3363          *
3364          * An exclusive lock on the nchpp is required to process this case,
3365          * otherwise a race can cause duplicate entries to be created with
3366          * one cpu reusing a DESTROYED ncp while another creates a new_ncp.
3367          */
3368         if (rep_ncp && use_excl) {
3369                 if (_cache_lock_nonblock(rep_ncp) == 0) {
3370                         _cache_hold(rep_ncp);
3371                         if (rep_ncp->nc_parent == par_nch->ncp &&
3372                             rep_ncp->nc_nlen == nlc->nlc_namelen &&
3373                             (rep_ncp->nc_flag & NCF_DESTROYED) &&
3374                             rep_ncp->nc_refs == 2)
3375                         {
3376                                 /*
3377                                  * Update nc_name.
3378                                  */
3379                                 ncp = rep_ncp;
3380
3381                                 _cache_ncp_gen_enter(ncp);
3382
3383                                 bcopy(nlc->nlc_nameptr, ncp->nc_name,
3384                                       nlc->nlc_namelen);
3385
3386                                 /*
3387                                  * This takes some care.  We must clear the
3388                                  * NCF_DESTROYED flag before unlocking the
3389                                  * hash chain so other concurrent searches
3390                                  * do not skip this element.
3391                                  *
3392                                  * We must also unlock the hash chain before
3393                                  * unresolving the ncp to avoid deadlocks.
3394                                  * We hold the lock on the ncp so we can safely
3395                                  * reinitialize nc_flag after that.
3396                                  */
3397                                 ncp->nc_flag &= ~NCF_DESTROYED;
3398                                 spin_unlock(&nchpp->spin);      /* use_excl */
3399
3400                                 _cache_setunresolved(ncp, 0);
3401                                 ncp->nc_flag = NCF_UNRESOLVED;
3402                                 ncp->nc_error = ENOTCONN;
3403
3404                                 _cache_ncp_gen_exit(ncp);
3405
3406                                 if (par_locked) {
3407                                         _cache_unlock(par_nch->ncp);
3408                                         par_locked = 0;
3409                                 }
3410                                 if (new_ncp) {
3411                                         _cache_free(new_ncp);
3412                                         new_ncp = NULL; /* safety */
3413                                 }
3414                                 goto found;
3415                         }
3416                         _cache_put(rep_ncp);
3417                 }
3418         }
3419
3420         /*
3421          * Otherwise create a new entry and add it to the cache.  The parent
3422          * ncp must also be locked so we can link into it.
3423          *
3424          * We have to relookup after possibly blocking in kmalloc or
3425          * when locking par_nch.
3426          *
3427          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3428          *       mount case, in which case nc_name will be NULL.
3429          *
3430          * NOTE: In the rep_ncp != NULL case we are trying to reuse
3431          *       a DESTROYED entry, but didn't have an exclusive lock.
3432          *       In this situation we do not create a new_ncp.
3433          */
3434         if (new_ncp == NULL) {
3435                 if (use_excl)
3436                         spin_unlock(&nchpp->spin);
3437                 else
3438                         spin_unlock_shared(&nchpp->spin);
3439                 if (rep_ncp == NULL) {
3440                         new_ncp = cache_alloc(nlc->nlc_namelen);
3441                         if (nlc->nlc_namelen) {
3442                                 bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3443                                       nlc->nlc_namelen);
3444                                 new_ncp->nc_name[nlc->nlc_namelen] = 0;
3445                         }
3446                 }
                     /*
                      * Rescan with the chain spinlock held exclusively; the
                      * code at restart: selects the lock mode from use_excl.
                      */
3447                 use_excl = 1;
3448                 goto restart;
3449         }
3450
3451         /*
3452          * NOTE! The spinlock is held exclusively here because new_ncp
3453          *       is non-NULL.
              *
              * The parent is locked only after dropping the spinlock, and the
              * chain is then rescanned (see the relookup note above).
3454          */
3455         if (par_locked == 0) {
3456                 spin_unlock(&nchpp->spin);
3457                 _cache_lock(par_nch->ncp);
3458                 par_locked = 1;
3459                 goto restart;
3460         }
3461
3462         /*
3463          * Link to parent (requires another ref, the one already in new_ncp
3464          * is what we will return).
3465          *
3466          * WARNING!  We still hold the spinlock.  We have to set the hash
3467          *           table entry atomically.
3468          */
3469         ncp = new_ncp;
3470         ++ncp->nc_refs;
3471         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3472         spin_unlock(&nchpp->spin);
3473         _cache_unlock(par_nch->ncp);
3474         /* par_locked = 0 - not used */
3475 found:
3476         /*
3477          * stats and namecache size management
              *
              * Per-cpu counters: unresolved entries count as a miss, entries
              * with a vnode as a good hit, everything else as a negative hit.
3478          */
3479         if (ncp->nc_flag & NCF_UNRESOLVED)
3480                 ++gd->gd_nchstats->ncs_miss;
3481         else if (ncp->nc_vp)
3482                 ++gd->gd_nchstats->ncs_goodhits;
3483         else
3484                 ++gd->gd_nchstats->ncs_neghits;
3485         nch.mount = mp;
3486         nch.ncp = ncp;
3487         _cache_mntref(nch.mount);
3488
3489         return(nch);
3490 }
3491
3492 /*
3493  * Attempt to lookup a namecache entry and return with a shared namecache
3494  * lock.  This operates non-blocking.  EWOULDBLOCK is returned if excl is
3495  * set or we are unable to lock.
      *
      * EWOULDBLOCK is also returned when ncp_shared_lock_disable is set, when
      * no matching entry is on the hash chain, or when the matched entry
      * fails the post-lock re-test (identity changed, DESTROYED, UNRESOLVED,
      * or _cache_auto_unresolve_test() returned non-zero).  *res_nch is not
      * written in any of those cases.
      *
      * On success *res_nch is filled in with par_nch->mount (plus a mount ref
      * taken via _cache_mntref()) and the matched ncp, and the entry's
      * nc_error is returned.
3496  */
3497 int
3498 cache_nlookup_maybe_shared(struct nchandle *par_nch,
3499                            struct nlcomponent *nlc,
3500                            int excl, struct nchandle *res_nch)
3501 {
3502         struct namecache *ncp;
3503         struct nchash_head *nchpp;
3504         struct mount *mp;
3505         u_int32_t hash;
3506         globaldata_t gd;
3507
3508         /*
3509          * If exclusive requested or shared namecache locks are disabled,
3510          * return failure.
3511          */
3512         if (ncp_shared_lock_disable || excl)
3513                 return(EWOULDBLOCK);
3514
3515         gd = mycpu;
3516         mp = par_nch->mount;
3517
3518         /*
3519          * This is a good time to call it, no ncp's are locked by
3520          * the caller or us.
3521          */
3522         cache_hysteresis(1);
3523
3524         /*
3525          * Try to locate an existing entry
3526          */
3527         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3528         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3529         nchpp = NCHHASH(hash);
3530
3531         spin_lock_shared(&nchpp->spin);
3532
3533         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3534                 /*
3535                  * Break out if we find a matching entry.  Note that
3536                  * UNRESOLVED entries may match, but DESTROYED entries
3537                  * do not.
3538                  */
3539                 if (ncp->nc_parent == par_nch->ncp &&
3540                     ncp->nc_nlen == nlc->nlc_namelen &&
3541                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3542                     (ncp->nc_flag & NCF_DESTROYED) == 0
3543                 ) {
3544                         _cache_hold(ncp);
3545                         spin_unlock_shared(&nchpp->spin);
3546
                             /*
                              * Re-test everything once the shared lock is
                              * held; the entry was only ref'd, not locked,
                              * while it was being matched.  Only the first
                              * match on the chain is ever tried.
                              */
3547                         if (_cache_lock_shared_special(ncp) == 0) {
3548                                 if (ncp->nc_parent == par_nch->ncp &&
3549                                     ncp->nc_nlen == nlc->nlc_namelen &&
3550                                     bcmp(ncp->nc_name, nlc->nlc_nameptr,
3551                                          ncp->nc_nlen) == 0 &&
3552                                     (ncp->nc_flag & NCF_DESTROYED) == 0 &&
3553                                     (ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
3554                                     _cache_auto_unresolve_test(mp, ncp) == 0)
3555                                 {
3556                                         goto found;
3557                                 }
3558                                 _cache_unlock(ncp);
3559                         }
3560                         _cache_drop(ncp);
3561                         return(EWOULDBLOCK);
3562                 }
3563         }
3564
3565         /*
3566          * Failure
3567          */
3568         spin_unlock_shared(&nchpp->spin);
3569         return(EWOULDBLOCK);
3570
3571         /*
3572          * Success
3573          *
3574          * Note that nc_error might be non-zero (e.g ENOENT).
              *
              * NOTE(review): ncs_goodhits is bumped unconditionally here even
              * though nc_vp is not examined, whereas cache_nlookup() counts
              * resolved entries without a vnode as ncs_neghits - confirm
              * whether that difference is intended.
3575          */
3576 found:
3577         res_nch->mount = mp;
3578         res_nch->ncp = ncp;
3579         ++gd->gd_nchstats->ncs_goodhits;
3580         _cache_mntref(res_nch->mount);
3581
3582         KKASSERT(ncp->nc_error != EWOULDBLOCK);
3583         return(ncp->nc_error);
3584 }
3585
3586 /*
3587  * This is a non-blocking version of cache_nlookup() used by
3588  * nfs_readdirplusrpc_uio().  It can fail for any reason and
3589  * will return nch.ncp == NULL in that case.
      *
      * Failure paths visible here: _cache_lock_special() not succeeding on a
      * matched entry, the matched entry no longer matching once locked, or
      * _cache_lock_nonblock() failing on the parent.  On failure any
      * preallocated new_ncp is freed and nch.mount is NULL as well.
      *
      * On success the handle carries par_nch->mount with an extra mount ref
      * taken via _cache_mntref().
3590  */
3591 struct nchandle
3592 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
3593 {
3594         struct nchandle nch;
3595         struct namecache *ncp;
3596         struct namecache *new_ncp;
3597         struct nchash_head *nchpp;
3598         struct mount *mp;
3599         u_int32_t hash;
3600         globaldata_t gd;
3601         int par_locked;
3602
3603         gd = mycpu;
3604         mp = par_nch->mount;
3605         par_locked = 0;
3606
3607         /*
3608          * Try to locate an existing entry
              *
              * The chain spinlock is always taken exclusively in this variant.
3609          */
3610         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3611         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3612         new_ncp = NULL;
3613         nchpp = NCHHASH(hash);
3614 restart:
3615         spin_lock(&nchpp->spin);
3616         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3617                 /*
3618                  * Break out if we find a matching entry.  Note that
3619                  * UNRESOLVED entries may match, but DESTROYED entries
3620                  * do not.
3621                  */
3622                 if (ncp->nc_parent == par_nch->ncp &&
3623                     ncp->nc_nlen == nlc->nlc_namelen &&
3624                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3625                     (ncp->nc_flag & NCF_DESTROYED) == 0
3626                 ) {
3627                         _cache_hold(ncp);
3628                         spin_unlock(&nchpp->spin);
3629                         if (par_locked) {
3630                                 _cache_unlock(par_nch->ncp);
3631                                 par_locked = 0;
3632                         }
3633                         if (_cache_lock_special(ncp) == 0) {
                                     /*
                                      * Re-test under the lock; a mismatch is
                                      * logged and treated as a failure rather
                                      * than retried.
                                      */
3634                                 if (ncp->nc_parent != par_nch->ncp ||
3635                                     ncp->nc_nlen != nlc->nlc_namelen ||
3636                                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) ||
3637                                     (ncp->nc_flag & NCF_DESTROYED)) {
3638                                         kprintf("cache_lookup_nonblock: "
3639                                                 "ncp-race %p %*.*s\n",
3640                                                 ncp,
3641                                                 nlc->nlc_namelen,
3642                                                 nlc->nlc_namelen,
3643                                                 nlc->nlc_nameptr);
3644                                         _cache_unlock(ncp);
3645                                         _cache_drop(ncp);
3646                                         goto failed;
3647                                 }
3648                                 _cache_auto_unresolve(mp, ncp);
3649                                 if (new_ncp) {
3650                                         _cache_free(new_ncp);
3651                                         new_ncp = NULL;
3652                                 }
3653                                 goto found;
3654                         }
3655                         _cache_drop(ncp);
3656                         goto failed;
3657                 }
3658         }
3659
3660         /*
3661          * We failed to locate an entry, create a new entry and add it to
3662          * the cache.  The parent ncp must also be locked so we
3663          * can link into it.
3664          *
3665          * We have to relookup after possibly blocking in kmalloc or
3666          * when locking par_nch.
3667          *
3668          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
3669          *       mount case, in which case nc_name will be NULL.
3670          */
3671         if (new_ncp == NULL) {
3672                 spin_unlock(&nchpp->spin);
3673                 new_ncp = cache_alloc(nlc->nlc_namelen);
3674                 if (nlc->nlc_namelen) {
3675                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
3676                               nlc->nlc_namelen);
3677                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
3678                 }
3679                 goto restart;
3680         }
             /*
              * The parent is only try-locked; if that fails the whole lookup
              * fails instead of waiting.
              */
3681         if (par_locked == 0) {
3682                 spin_unlock(&nchpp->spin);
3683                 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
3684                         par_locked = 1;
3685                         goto restart;
3686                 }
3687                 goto failed;
3688         }
3689
3690         /*
3691          * Link to parent (requires another ref, the one already in new_ncp
3692          * is what we will return).
3693          *
3694          * WARNING!  We still hold the spinlock.  We have to set the hash
3695          *           table entry atomically.
3696          */
3697         ncp = new_ncp;
3698         ++ncp->nc_refs;
3699         _cache_link_parent(ncp, par_nch->ncp, nchpp);
3700         spin_unlock(&nchpp->spin);
3701         _cache_unlock(par_nch->ncp);
3702         /* par_locked = 0 - not used */
3703 found:
3704         /*
3705          * stats and namecache size management
3706          */
3707         if (ncp->nc_flag & NCF_UNRESOLVED)
3708                 ++gd->gd_nchstats->ncs_miss;
3709         else if (ncp->nc_vp)
3710                 ++gd->gd_nchstats->ncs_goodhits;
3711         else
3712                 ++gd->gd_nchstats->ncs_neghits;
3713         nch.mount = mp;
3714         nch.ncp = ncp;
3715         _cache_mntref(nch.mount);
3716
3717         return(nch);
3718 failed:
             /*
              * No stats are updated and no mount ref is taken on failure.
              */
3719         if (new_ncp) {
3720                 _cache_free(new_ncp);
3721                 new_ncp = NULL;
3722         }
3723         nch.mount = NULL;
3724         nch.ncp = NULL;
3725         return(nch);
3726 }
3727
3728 /*
3729  * This is a non-locking optimized lookup that depends on adding a ref
3730  * to prevent normal eviction.  nch.ncp can be returned as NULL for any
3731  * reason and the caller will retry with normal locking in that case.
3732  *
3733  * This function only returns resolved entries so callers do not accidentally
3734  * race doing out of order / unfenced field checks.
3735  *
3736  * The caller must validate the result for parent-to-child continuity.
      *
      * On success the returned handle holds a ref on the ncp (_cache_hold())
      * and a mount ref on par_nch->mount (_cache_mntref()); the ncp itself is
      * not locked by this function.  On failure both nch.mount and nch.ncp
      * are NULL.
3737  */
3738 struct nchandle
3739 cache_nlookup_nonlocked(struct nchandle *par_nch, struct nlcomponent *nlc)
3740 {
3741         struct nchandle nch;
3742         struct namecache *ncp;
3743         struct nchash_head *nchpp;
3744         struct mount *mp;
3745         u_int32_t hash;
3746         globaldata_t gd;
3747
3748         gd = mycpu;
3749         mp = par_nch->mount;
3750
3751         /*
3752          * Try to locate an existing entry
3753          */
3754         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
3755         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
3756         nchpp = NCHHASH(hash);
3757
3758         spin_lock_shared(&nchpp->spin);
3759         TAILQ_FOREACH(ncp, &nchpp->list, nc_hash) {
3760                 /*
3761                  * Break out if we find a matching entry.  Note that
3762                  * UNRESOLVED entries may match, but DESTROYED entries
3763                  * do not.  However, UNRESOLVED entries still return failure.
3764                  */
3765                 if (ncp->nc_parent == par_nch->ncp &&
3766                     ncp->nc_nlen == nlc->nlc_namelen &&
3767                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
3768                     (ncp->nc_flag & NCF_DESTROYED) == 0
3769                 ) {
3770                         /*
3771                          * Test NFS timeout for auto-unresolve.  Give up if
3772                          * the entry is not resolved.
3773                          *
3774                          * Getting the ref with the nchpp locked prevents
3775                          * any transition to NCF_DESTROYED.
3776                          */
3777                         if (_cache_auto_unresolve_test(par_nch->mount, ncp))
3778                                 break;
3779                         if (ncp->nc_flag & NCF_UNRESOLVED)
3780                                 break;
3781                         _cache_hold(ncp);
3782                         spin_unlock_shared(&nchpp->spin);
3783
3784                         /*
3785                          * We need an additional test to ensure that the ref
3786                          * we got above prevents transitions to NCF_UNRESOLVED.
3787                          * This can occur if another thread is currently
3788                          * holding the ncp exclusively locked or (if we raced
3789                          * that and it unlocked before our test) the flag
3790                          * has been set.
3791                          *
3792                          * XXX check if superseded by nc_generation XXX
                              *
                              * On this failure path the chain spinlock is
                              * re-acquired before the break so that the
                              * common unlock after the loop stays balanced.
3793                          */
3794                         if (_cache_lockstatus(ncp) < 0 ||
3795                             (ncp->nc_flag & (NCF_DESTROYED | NCF_UNRESOLVED)))
3796                         {
3797                                 if ((ncvp_debug & 4) &&
3798                                     (ncp->nc_flag &
3799                                      (NCF_DESTROYED | NCF_UNRESOLVED)))
3800                                 {
3801                                     kprintf("ncp state change: %p %08x %d %s\n",
3802                                             ncp, ncp->nc_flag, ncp->nc_error,
3803                                             ncp->nc_name);
3804                                 }
3805                                 _cache_drop(ncp);
3806                                 spin_lock_shared(&nchpp->spin);
3807                                 break;
3808                         }
3809
3810                         /*
3811                          * Return the ncp bundled into a nch on success.
3812                          * The ref should passively prevent the ncp from
3813                          * becoming unresolved without having to hold a lock.
3814                          * (XXX this may not be entirely true)
3815                          */
3816                         goto found;
3817                 }
3818         }
             /*
              * Failure: no usable entry.  Reached both when the scan finds
              * nothing and via the break statements above, always with the
              * chain spinlock held shared.
              */
3819         spin_unlock_shared(&nchpp->spin);
3820         nch.mount = NULL;
3821         nch.ncp = NULL;
3822
3823         return nch;
3824 found:
3825         /*
3826          * stats and namecache size management
3827          */
3828         if (ncp->nc_flag & NCF_UNRESOLVED)
3829                 ++gd->gd_nchstats->ncs_miss;
3830         else if (ncp->nc_vp)
3831                 ++gd->gd_nchstats->ncs_goodhits;
3832         else
3833                 ++gd->gd_nchstats->ncs_neghits;
3834         nch.mount = mp;
3835         nch.ncp = ncp;
3836         _cache_mntref(nch.mount);
3837
3838         return(nch);
3839 }
3840
3841 /*
3842  * The namecache entry is marked as being used as a mount point.
3843  * Locate the mount if it is visible to the caller.  The DragonFly
3844  * mount system allows arbitrary loops in the topology and disentangles
3845  * those loops by matching against (mp, ncp) rather than just (ncp).
3846  * This means any given ncp can dive any number of mounts, depending
3847  * on the relative mount (e.g. nullfs) the caller is at in the topology.
3848  *
3849  * We use a very simple frontend cache to reduce SMP conflicts,
3850  * which we have to do because the mountlist scan needs an exclusive
3851  * lock around its ripout info list.  Not to mention that there might
3852  * be a lot of mounts.
3853  *
3854  * Because all mounts can potentially be accessed by all cpus, break the cpus
3855  * down a bit to allow some contention rather than making the cache
3856  * excessively huge.
3857  *
3858  * The hash table is split into per-cpu areas and is 4-way set-associative.
3859  */
     /*
      * Bundle of a (mount, ncp) pair and a mount result.
      *
      * NOTE(review): no user of this structure is visible in this part of
      * the file; the field roles noted below are inferred from the names
      * only - confirm against the code that fills it in.
      */
3860 struct findmount_info {
3861         struct mount *result;          /* presumably the mount located */
3862         struct mount *nch_mount;       /* presumably the mp being matched */
3863         struct namecache *nch_ncp;     /* presumably the ncp being matched */
3864 };
3865
     /*
      * Hash the (mp, ncp) pointer values and return the ncmount_cache[] slot
      * at the resulting index.
      *
      * The index is masked with NCMOUNT_NUMCACHE - 1 and has its low
      * (NCMOUNT_SET - 1) bits cleared, which lands on the first slot of a
      * set provided both constants are powers of two (presumably so -
      * confirm against their definitions).
      */
3866 static __inline
3867 struct ncmount_cache *
3868 ncmount_cache_lookup4(struct mount *mp, struct namecache *ncp)
3869 {
3870         uint32_t hash;
3871
             /* crc over the bytes of both pointers, mp first, then ncp */
3872         hash = iscsi_crc32(&mp, sizeof(mp));
3873         hash = iscsi_crc32_ext(&ncp, sizeof(ncp), hash);
             /* fold the upper 16 bits into the lower ones before masking */
3874         hash ^= hash >> 16;
3875         hash = hash & ((NCMOUNT_NUMCACHE - 1) & ~(NCMOUNT_SET - 1));
3876
3877         return (&ncmount_cache[hash]);
3878 }
3879
3880 static
3881 struct ncmount_cache *
3882 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
3883 {
3884         struct ncmount_cache *ncc;
3885         struct ncmount_cache *best;
3886         int delta;
3887         int best_delta;
3888         int i;
3889
3890         ncc = ncmount_cache_lookup4(mp, ncp);
3891
3892         /*
3893          * NOTE: When checking for a ticks overflow implement a slop of
3894          *       2 ticks just to be safe, because ticks is accessed
3895          *       non-atomically one CPU can increment it while another
3896          *       is still using the old value.
3897          */
3898         if (ncc->ncp == ncp && ncc->mp == mp)   /* 0 */
3899                 return ncc;
3900         delta = (int)(ticks - ncc->ticks);      /* beware GCC opts */
3901         if (delta < -2)                         /* overflow reset */
3902                 ncc->ticks = ticks;
3903         best = ncc;
3904         best_delta = delta;
3905
3906         for (i = 1; i < NCMOUNT_SET; ++i) {     /* 1, 2, 3 */
3907                 ++ncc;
3908                 if (ncc->ncp == ncp && ncc->mp == mp)
3909                         return ncc;
3910                 delta = (int)(ticks - ncc->ticks);
3911                 if (delta < -2)
3912                         ncc->ticks = ticks;
3913                 if (delta > best_delta) {
3914                         best_delta = delta;
3915                         best = ncc;
3916                 }
3917         }
3918         return best;
3919 }
3920
/*
 * pcpu-optimized mount search.  Locate the recursive mountpoint, avoid
 * doing an expensive mountlist_scan*() if possible.
 *
 * The cache is keyed on (nch->mount, nch->ncp).  A positive entry records
 * the mount found for that key in ncc->mp_target, a negative entry
 * (ncc->isneg) records that the slow scan found nothing.
 *
 * Returns a referenced mount pointer or NULL
 *
 * General SMP operation uses a per-cpu umount_spin to interlock unmount
 * operations (that is, where the mp_target can be freed out from under us).
 *
 * Lookups use the ncc->updating counter to validate the contents in order
 * to avoid having to obtain the per cache-element spin-lock.  In addition,
 * the ticks field is only updated when it changes.  However, if our per-cpu
 * lock fails due to an unmount-in-progress, we fall-back to the
 * cache-element's spin-lock.
 */
struct mount *
cache_findmount(struct nchandle *nch)
{
        struct findmount_info info;
        struct ncmount_cache *ncc;
        struct ncmount_cache ncc_copy;
        struct mount *target;
        struct pcpu_ncache *pcpu;
        struct spinlock *spinlk;
        int update;

        /*
         * Cache disabled or per-cpu data not set up: go straight to the
         * mountlist scan.
         *
         * NOTE(review): on this path pcpu is left unadjusted (or NULL), yet
         * the post-scan relookup below can still jump to found:, which
         * uses pcpu->umount_spin.  Presumably the relookup cannot hit
         * while pcpu_ncache is NULL -- confirm.
         */
        pcpu = pcpu_ncache;
        if (ncmount_cache_enable == 0 || pcpu == NULL) {
                ncc = NULL;
                goto skip;
        }
        pcpu += mycpu->gd_cpuid;

again:
        ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
        if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
found:
                /*
                 * This is a bit messy for now because we do not yet have
                 * safe disposal of mount structures.  We have to ref
                 * ncc->mp_target but the 'update' counter only tell us
                 * whether the cache has changed after the fact.
                 *
                 * For now get a per-cpu spinlock that will only contend
                 * against umount's.  This is the best path.  If it fails,
                 * instead of waiting on the umount we fall-back to a
                 * shared ncc->spin lock, which will generally only cost a
                 * cache ping-pong.
                 */
                /* Sample the sequence counter before taking either lock */
                update = ncc->updating;
                if (__predict_true(spin_trylock(&pcpu->umount_spin))) {
                        spinlk = &pcpu->umount_spin;
                } else {
                        spinlk = &ncc->spin;
                        spin_lock_shared(spinlk);
                }
                /* Odd counter: a writer was mid-update when we sampled */
                if (update & 1) {               /* update in progress */
                        spin_unlock_any(spinlk);
                        goto skip;
                }
                /*
                 * Snapshot the element, then re-read the counter.  If it
                 * moved the snapshot may be torn, so retry the lookup.
                 */
                ncc_copy = *ncc;
                cpu_lfence();
                if (ncc->updating != update) {  /* content changed */
                        spin_unlock_any(spinlk);
                        goto again;
                }
                /* The snapshot must still describe our key */
                if (ncc_copy.ncp != nch->ncp || ncc_copy.mp != nch->mount) {
                        spin_unlock_any(spinlk);
                        goto again;
                }
                if (ncc_copy.isneg == 0) {
                        target = ncc_copy.mp_target;
                        /*
                         * NOTE(review): if this check fails, target is
                         * still returned non-NULL below but without the
                         * _cache_mntref().  Confirm that a mismatch cannot
                         * occur here, or that callers tolerate it.
                         */
                        if (target->mnt_ncmounton.mount == nch->mount &&
                            target->mnt_ncmounton.ncp == nch->ncp) {
                                /*
                                 * Cache hit (positive) (avoid dirtying
                                 * the cache line if possible)
                                 */
                                if (ncc->ticks != (int)ticks)
                                        ncc->ticks = (int)ticks;
                                _cache_mntref(target);
                        }
                } else {
                        /*
                         * Cache hit (negative) (avoid dirtying
                         * the cache line if possible)
                         */
                        if (ncc->ticks != (int)ticks)
                                ncc->ticks = (int)ticks;
                        target = NULL;
                }
                spin_unlock_any(spinlk);

                return target;
        }
skip:

        /*
         * Slow
         *
         * cache_findmount_callback() stores the matching mount in
         * info.result and takes a reference on it.
         */
        info.result = NULL;
        info.nch_mount = nch->mount;
        info.nch_ncp = nch->ncp;
        mountlist_scan(cache_findmount_callback, &info,
                       MNTSCAN_FORWARD | MNTSCAN_NOBUSY | MNTSCAN_NOUNLOCK);

        /*
         * To reduce multi-re-entry on the cache, relookup in the cache.
         * This can still race, obviously, but that's ok.
         *
         * If the key is now present, the reference taken by the scan
         * callback is dropped and the cached-hit path is used instead.
         */
        ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
        if (ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
                if (info.result)
                        atomic_add_int(&info.result->mnt_refs, -1);
                goto found;
        }

        /*
         * Cache the result.
         *
         * Nothing is cached when the mount found has MNTK_UNMOUNT set.
         * ncc->updating is bumped to odd before the element is modified
         * and back to even afterwards so lockless readers can detect the
         * update.
         */
        if ((info.result == NULL ||
            (info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0)) {
                spin_lock(&ncc->spin);
                atomic_add_int_nonlocked(&ncc->updating, 1);
                cpu_sfence();
                KKASSERT(ncc->updating & 1);
                /* Swap the key mount's reference if the element is reused */
                if (ncc->mp != nch->mount) {
                        if (ncc->mp)
                                atomic_add_int(&ncc->mp->mnt_refs, -1);
                        atomic_add_int(&nch->mount->mnt_refs, 1);
                        ncc->mp = nch->mount;
                }
                ncc->ncp = nch->ncp;    /* ptr compares only, not refd*/
                ncc->ticks = (int)ticks;

                if (info.result) {
                        /* Positive entry: the cache holds its own ref */
                        ncc->isneg = 0;
                        if (ncc->mp_target != info.result) {
                                if (ncc->mp_target)
                                        atomic_add_int(&ncc->mp_target->mnt_refs, -1);
                                ncc->mp_target = info.result;
                                atomic_add_int(&info.result->mnt_refs, 1);
                        }
                } else {
                        /* Negative entry: drop any previous target */
                        ncc->isneg = 1;
                        if (ncc->mp_target) {
                                atomic_add_int(&ncc->mp_target->mnt_refs, -1);
                                ncc->mp_target = NULL;
                        }
                }
                cpu_sfence();
                atomic_add_int_nonlocked(&ncc->updating, 1);
                spin_unlock(&ncc->spin);
        }
        /* The reference taken by the scan callback goes to the caller */
        return(info.result);
}
4079
4080 static
4081 int
4082 cache_findmount_callback(struct mount *mp, void *data)
4083 {
4084         struct findmount_info *info = data;
4085
4086         /*
4087          * Check the mount's mounted-on point against the passed nch.
4088          */
4089         if (mp->mnt_ncmounton.mount == info->nch_mount &&
4090             mp->mnt_ncmounton.ncp == info->nch_ncp
4091         ) {
4092             info->result = mp;
4093             _cache_mntref(mp);
4094             return(-1);
4095         }
4096         return(0);
4097 }
4098
/*
 * Release a mount reference by handing mp to _cache_mntrel().
 *
 * NOTE(review): presumably the counterpart of the reference returned by
 * cache_findmount() -- confirm against callers.
 */
void
cache_dropmount(struct mount *mp)
{
        _cache_mntrel(mp);
}
4104
4105 /*
4106  * mp is being mounted, scrap entries matching mp->mnt_ncmounton (positive
4107  * or negative).
4108  *
4109  * A full scan is not required, but for now just do it anyway.
4110  */
4111 void
4112 cache_ismounting(struct mount *mp)
4113 {
4114         struct ncmount_cache *ncc;
4115         struct mount *ncc_mp;
4116         int i;
4117
4118         if (pcpu_ncache == NULL)
4119                 return;
4120
4121         for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
4122                 ncc = &ncmount_cache[i];
4123                 if (ncc->mp != mp->mnt_ncmounton.mount ||
4124                     ncc->ncp != mp->mnt_ncmounton.ncp) {
4125                         continue;
4126                 }
4127                 spin_lock(&ncc->spin);
4128                 atomic_add_int_nonlocked(&ncc->updating, 1);
4129                 cpu_sfence();
4130                 KKASSERT(ncc->updating & 1);
4131                 if (ncc->mp != mp->mnt_ncmounton.mount ||
4132                     ncc->ncp != mp->mnt_ncmounton.ncp) {
4133                         cpu_sfence();
4134                         ++ncc->updating;
4135                         spin_unlock(&ncc->spin);
4136                         continue;
4137                 }
4138                 ncc_mp = ncc->mp;
4139                 ncc->ncp = NULL;
4140                 ncc->mp = NULL;
4141                 if (ncc_mp)
4142                         atomic_add_int(&ncc_mp->mnt_refs, -1);
4143                 ncc_mp = ncc->mp_target;
4144                 ncc->mp_target = NULL;
4145                 if (ncc_mp)
4146                         atomic_add_int(&ncc_mp->mnt_refs, -1);
4147                 ncc->ticks = (int)ticks - hz * 120;
4148
4149                 cpu_sfence();
4150                 atomic_add_int_nonlocked(&ncc->updating, 1);
4151                 spin_unlock(&ncc->spin);
4152         }
4153
4154         /*
4155          * Pre-cache the mount point
4156          */
4157         ncc = ncmount_cache_lookup(mp->mnt_ncmounton.mount,
4158                                    mp->mnt_ncmounton.ncp);
4159
4160         spin_lock(&ncc->spin);
4161         atomic_add_int_nonlocked(&ncc->updating, 1);
4162         cpu_sfence();
4163         KKASSERT(ncc->updating & 1);
4164
4165         if (ncc->mp)
4166                 atomic_add_int(&ncc->mp->mnt_refs, -1);
4167         atomic_add_int(&mp->mnt_ncmounton.mount->mnt_refs, 1);
4168         ncc->mp = mp->mnt_ncmounton.mount;
4169         ncc->ncp = mp->mnt_ncmounton.ncp;       /* ptr compares only */
4170         ncc->ticks = (int)ticks;
4171
4172         ncc->isneg = 0;
4173         if (ncc->mp_target != mp) {
4174                 if (ncc->mp_target)
4175                         atomic_add_int(&ncc->mp_target->mnt_refs, -1);
4176                 ncc->mp_target = mp;
4177                 atomic_add_int(&mp->mnt_refs, 1);
4178         }
4179         cpu_sfence();
4180         atomic_add_int_nonlocked(&ncc->updating, 1);
4181         spin_unlock(&ncc->spin);
4182 }
4183
/*
 * Scrap any ncmount_cache entries related to mp.  Not only do we need to
 * scrap entries matching mp->mnt_ncmounton, but we also need to scrap any
 * negative hits involving (mp, <any>).
 *
 * A full scan is required.
 *
 * Every cpu's umount_spin is held for the duration of the scan, which
 * interlocks against the spin_trylock() fast path in cache_findmount().
 * Does nothing if pcpu_ncache has not been set up yet.
 */
void
cache_unmounting(struct mount *mp)
{
        struct ncmount_cache *ncc;
        struct pcpu_ncache *pcpu;
        struct mount *ncc_mp;
        int i;

        pcpu = pcpu_ncache;
        if (pcpu == NULL)
                return;

        /* Take every per-cpu umount interlock */
        for (i = 0; i < ncpus; ++i)
                spin_lock(&pcpu[i].umount_spin);

        for (i = 0; i < NCMOUNT_NUMCACHE; ++i) {
                ncc = &ncmount_cache[i];

                /* Unlocked pre-check: mp used neither as key nor target */
                if (ncc->mp != mp && ncc->mp_target != mp)
                        continue;
                spin_lock(&ncc->spin);
                /* Counter goes odd: element is being modified */
                atomic_add_int_nonlocked(&ncc->updating, 1);
                cpu_sfence();

                /* Re-check under the lock, the element may have changed */
                if (ncc->mp != mp && ncc->mp_target != mp) {
                        atomic_add_int_nonlocked(&ncc->updating, 1);
                        cpu_sfence();
                        spin_unlock(&ncc->spin);
                        continue;
                }

                /* Clear the key and drop the refs the element held */
                ncc_mp = ncc->mp;
                ncc->ncp = NULL;
                ncc->mp = NULL;
                if (ncc_mp)
                        atomic_add_int(&ncc_mp->mnt_refs, -1);
                ncc_mp = ncc->mp_target;
                ncc->mp_target = NULL;
                if (ncc_mp)
                        atomic_add_int(&ncc_mp->mnt_refs, -1);

                /* Back-date the entry so it compares as old in the set */
                ncc->ticks = (int)ticks - hz * 120;

                /* Counter back to even: update complete */
                cpu_sfence();
                atomic_add_int_nonlocked(&ncc->updating, 1);
                spin_unlock(&ncc->spin);
        }

        for (i = 0; i < ncpus; ++i)
                spin_unlock(&pcpu[i].umount_spin);
}
4239
4240 /*
4241  * Resolve an unresolved namecache entry, generally by looking it up.
4242  * The passed ncp must be locked and refd.
4243  *
4244  * Theoretically since a vnode cannot be recycled while held, and since
4245  * the nc_parent chain holds its vnode as long as children exist, the
4246  * direct parent of the cache entry we are trying to resolve should
4247  * have a valid vnode.  If not then generate an error that we can
4248  * determine is related to a resolver bug.
4249  *
4250  * However, if a vnode was in the middle of a recyclement when the NCP
4251  * got locked, ncp->nc_vp might point to a vnode that is about to become
4252  * invalid.  cache_resolve() handles this case by unresolving the entry
4253  * and then re-resolving it.
4254  *
4255  * Note that successful resolution does not necessarily return an error
4256  * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
4257  * will be returned.
4258  *
4259  * (*genp) is adjusted based on our resolution operation.  If it is already
4260  * wrong, that's ok... it will still be wrong on return.
4261  */
4262 int
4263 cache_resolve(struct nchandle *nch, u_int *genp, struct ucred *cred)
4264 {
4265         struct namecache *par_tmp;
4266         struct namecache *par;
4267         struct namecache *ncp;
4268         struct nchandle nctmp;
4269         struct mount *mp;
4270         struct vnode *dvp;
4271         int error;
4272
4273         ncp = nch->ncp;
4274         mp = nch->mount;
4275         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
4276
4277 restart:
4278         /*
4279          * If the ncp is already resolved we have nothing to do.  However,
4280          * we do want to guarentee that a usable vnode is returned when
4281          * a vnode is present, so make sure it hasn't been reclaimed.
4282          */
4283         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4284                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
4285                         _cache_ncp_gen_enter(ncp);
4286                         _cache_setunresolved(ncp, 0);
4287                         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4288                                 _cache_ncp_gen_exit(ncp);
4289                                 *genp += 4;
4290                                 return (ncp->nc_error);
4291                         }
4292                 } else if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4293                         return (ncp->nc_error);
4294                 } else {
4295                         _cache_ncp_gen_enter(ncp);
4296                 }
4297         } else {
4298                 _cache_ncp_gen_enter(ncp);
4299         }
4300         /* in gen_enter state */
4301         *genp += 4;
4302
4303         /*
4304          * If the ncp was destroyed it will never resolve again.  This
4305          * can basically only happen when someone is chdir'd into an
4306          * empty directory which is then rmdir'd.  We want to catch this
4307          * here and not dive the VFS because the VFS might actually
4308          * have a way to re-resolve the disconnected ncp, which will
4309          * result in inconsistencies in the cdir/nch for proc->p_fd.
4310          */
4311         if (ncp->nc_flag & NCF_DESTROYED) {
4312                 _cache_ncp_gen_exit(ncp);
4313                 return(EINVAL);
4314         }
4315
4316         /*
4317          * Mount points need special handling because the parent does not
4318          * belong to the same filesystem as the ncp.
4319          */
4320         if (ncp == mp->mnt_ncmountpt.ncp) {
4321                 error = cache_resolve_mp(mp, 0);
4322                 _cache_ncp_gen_exit(ncp);
4323                 return error;
4324         }
4325
4326         /*
4327          * We expect an unbroken chain of ncps to at least the mount point,
4328          * and even all the way to root (but this code doesn't have to go
4329          * past the mount point).
4330          */
4331         if (ncp->nc_parent == NULL) {
4332                 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
4333                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
4334                 ncp->nc_error = EXDEV;
4335                 _cache_ncp_gen_exit(ncp);
4336                 return(ncp->nc_error);
4337         }
4338
4339         /*
4340          * The vp's of the parent directories in the chain are held via vhold()
4341          * due to the existance of the child, and should not disappear.
4342          * However, there are cases where they can disappear:
4343          *
4344          *      - due to filesystem I/O errors.
4345          *      - due to NFS being stupid about tracking the namespace and
4346          *        destroys the namespace for entire directories quite often.
4347          *      - due to forced unmounts.
4348          *      - due to an rmdir (parent will be marked DESTROYED)
4349          *
4350          * When this occurs we have to track the chain backwards and resolve
4351          * it, looping until the resolver catches up to the current node.  We
4352          * could recurse here but we might run ourselves out of kernel stack
4353          * so we do it in a more painful manner.  This situation really should
4354          * not occur all that often, or if it does not have to go back too
4355          * many nodes to resolve the ncp.
4356          */
4357         while ((dvp = cache_dvpref(ncp)) == NULL) {
4358                 /*
4359                  * This case can occur if a process is CD'd into a
4360                  * directory which is then rmdir'd.  If the parent is marked
4361                  * destroyed there is no point trying to resolve it.
4362                  */
4363                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED) {
4364                         if (ncvp_debug & 8) {
4365                                 kprintf("nc_parent destroyed: %s/%s\n",
4366                                         ncp->nc_parent->nc_name, ncp->nc_name);
4367                         }
4368                         _cache_ncp_gen_exit(ncp);
4369                         return(ENOENT);
4370                 }
4371                 par = ncp->nc_parent;
4372                 _cache_hold(par);
4373                 _cache_lock(par);
4374                 while ((par_tmp = par->nc_parent) != NULL &&
4375                        par_tmp->nc_vp == NULL) {
4376                         _cache_hold(par_tmp);
4377                         _cache_lock(par_tmp);
4378                         _cache_put(par);
4379                         par = par_tmp;
4380                 }
4381                 if (par->nc_parent == NULL) {
4382                         kprintf("EXDEV case 2 %*.*s\n",
4383                                 par->nc_nlen, par->nc_nlen, par->nc_name);
4384                         _cache_put(par);
4385                         _cache_ncp_gen_exit(ncp);
4386                         return (EXDEV);
4387                 }
4388                 /*
4389                  * The parent is not set in stone, ref and lock it to prevent
4390                  * it from disappearing.  Also note that due to renames it
4391                  * is possible for our ncp to move and for par to no longer
4392                  * be one of its parents.  We resolve it anyway, the loop
4393                  * will handle any moves.
4394                  */
4395                 _cache_get(par);        /* additional hold/lock */
4396                 _cache_put(par);        /* from earlier hold/lock */
4397                 if (par == nch->mount->mnt_ncmountpt.ncp) {
4398                         cache_resolve_mp(nch->mount, 0);
4399                 } else if ((dvp = cache_dvpref(par)) == NULL) {
4400                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
4401                                 par->nc_nlen, par->nc_nlen, par->nc_name);
4402                         _cache_put(par);
4403                         continue;
4404                 } else {
4405                         if (par->nc_flag & NCF_UNRESOLVED) {
4406                                 nctmp.mount = mp;
4407                                 nctmp.ncp = par;
4408                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4409                         }
4410                         vrele(dvp);
4411                 }
4412                 if ((error = par->nc_error) != 0) {
4413                         if (par->nc_error != EAGAIN) {
4414                                 kprintf("EXDEV case 3 %*.*s error %d\n",
4415                                     par->nc_nlen, par->nc_nlen, par->nc_name,
4416                                     par->nc_error);
4417                                 _cache_put(par);
4418                                 _cache_ncp_gen_exit(ncp);
4419                                 return(error);
4420                         }
4421                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
4422                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
4423                 }
4424                 _cache_put(par);
4425                 /* loop */
4426         }
4427
4428         /*
4429          * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
4430          * ncp's and reattach them.  If this occurs the original ncp is marked
4431          * EAGAIN to force a relookup.
4432          *
4433          * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
4434          * ncp must already be resolved.
4435          */
4436         if (dvp) {
4437                 nctmp.mount = mp;
4438                 nctmp.ncp = ncp;
4439                 *genp += 4;     /* setvp bumps the generation */
4440                 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4441                 vrele(dvp);
4442         } else {
4443                 ncp->nc_error = EPERM;
4444         }
4445
4446         if (ncp->nc_error == EAGAIN) {
4447                 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
4448                         ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
4449                 goto restart;
4450         }
4451         _cache_ncp_gen_exit(ncp);
4452
4453         return(ncp->nc_error);
4454 }
4455
4456 /*
4457  * Resolve the ncp associated with a mount point.  Such ncp's almost always
4458  * remain resolved and this routine is rarely called.  NFS MPs tends to force
4459  * re-resolution more often due to its mac-truck-smash-the-namecache
4460  * method of tracking namespace changes.
4461  *
4462  * The semantics for this call is that the passed ncp must be locked on
4463  * entry and will be locked on return.  However, if we actually have to
4464  * resolve the mount point we temporarily unlock the entry in order to
4465  * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
4466  * the unlock we have to recheck the flags after we relock.
4467  */
4468 static int
4469 cache_resolve_mp(struct mount *mp, int adjgen)
4470 {
4471         struct namecache *ncp = mp->mnt_ncmountpt.ncp;
4472         struct vnode *vp;
4473         int error;
4474
4475         KKASSERT(mp != NULL);
4476
4477         /*
4478          * If the ncp is already resolved we have nothing to do.  However,
4479          * we do want to guarentee that a usable vnode is returned when
4480          * a vnode is present, so make sure it hasn't been reclaimed.
4481          */
4482         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
4483                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
4484                         _cache_setunresolved(ncp, adjgen);
4485         }
4486
4487         if (ncp->nc_flag & NCF_UNRESOLVED) {
4488                 /*
4489                  * ncp must be unlocked across the vfs_busy(), but
4490                  * once busied lock ordering is ncp(s), then vnodes,
4491                  * so we must relock the ncp before issuing the VFS_ROOT().
4492                  */
4493                 _cache_unlock(ncp);
4494                 while (vfs_busy(mp, 0))
4495                         ;
4496                 _cache_lock(ncp);
4497                 error = VFS_ROOT(mp, &vp);
4498
4499                 /*
4500                  * recheck the ncp state after relocking.
4501                  */
4502                 if (ncp->nc_flag & NCF_UNRESOLVED) {
4503                         ncp->nc_error = error;
4504                         if (error == 0) {
4505                                 _cache_setvp(mp, ncp, vp, adjgen);
4506                                 vput(vp);
4507                         } else {
4508                                 kprintf("[diagnostic] cache_resolve_mp: failed"
4509                                         " to resolve mount %p err=%d ncp=%p\n",
4510                                         mp, error, ncp);
4511                                 _cache_setvp(mp, ncp, NULL, adjgen);
4512                         }
4513                 } else if (error == 0) {
4514                         vput(vp);
4515                 }
4516                 vfs_unbusy(mp);
4517         }
4518         return(ncp->nc_error);
4519 }
4520
4521 /*
4522  * Resolve the parent vnode
4523  */
4524 int
4525 cache_resolve_dvp(struct nchandle *nch, struct ucred *cred, struct vnode **dvpp)
4526 {
4527         struct namecache *par_tmp;
4528         struct namecache *par;
4529         struct namecache *ncp;
4530         struct nchandle nctmp;
4531         struct mount *mp;
4532         struct vnode *dvp;
4533         int error;
4534
4535         *dvpp = NULL;
4536         ncp = nch->ncp;
4537         mp = nch->mount;
4538         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
4539
4540         /*
4541          * Treat this as a mount point even if it has a parent (e.g.
4542          * null-mount).  Return a NULL dvp and no error.
4543          */
4544         if (ncp == mp->mnt_ncmountpt.ncp)
4545                 return 0;
4546
4547         /*
4548          * If the ncp was destroyed there is no parent directory, return
4549          * EINVAL.
4550          */
4551         if (ncp->nc_flag & NCF_DESTROYED)
4552                 return(EINVAL);
4553
4554         /*
4555          * No parent if at the root of a filesystem, no error.  Typically
4556          * not applicable to null-mounts.  This case should have been caught
4557          * in the above ncmountpt check.
4558          */
4559         if (ncp->nc_parent == NULL)
4560                 return 0;
4561
4562         /*
4563          * Resolve the parent dvp.
4564          *
4565          * The vp's of the parent directories in the chain are held via vhold()
4566          * due to the existance of the child, and should not disappear.
4567          * However, there are cases where they can disappear:
4568          *
4569          *      - due to filesystem I/O errors.
4570          *      - due to NFS being stupid about tracking the namespace and
4571          *        destroys the namespace for entire directories quite often.
4572          *      - due to forced unmounts.
4573          *      - due to an rmdir (parent will be marked DESTROYED)
4574          *
4575          * When this occurs we have to track the chain backwards and resolve
4576          * it, looping until the resolver catches up to the current node.  We
4577          * could recurse here but we might run ourselves out of kernel stack
4578          * so we do it in a more painful manner.  This situation really should
4579          * not occur all that often, or if it does not have to go back too
4580          * many nodes to resolve the ncp.
4581          */
4582         while ((dvp = cache_dvpref(ncp)) == NULL) {
4583                 /*
4584                  * This case can occur if a process is CD'd into a
4585                  * directory which is then rmdir'd.  If the parent is marked
4586                  * destroyed there is no point trying to resolve it.
4587                  */
4588                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
4589                         return(ENOENT);
4590                 par = ncp->nc_parent;
4591                 _cache_hold(par);
4592                 _cache_lock(par);
4593                 while ((par_tmp = par->nc_parent) != NULL &&
4594                        par_tmp->nc_vp == NULL) {
4595                         _cache_hold(par_tmp);
4596                         _cache_lock(par_tmp);
4597                         _cache_put(par);
4598                         par = par_tmp;
4599                 }
4600                 if (par->nc_parent == NULL) {
4601                         kprintf("EXDEV case 2 %*.*s\n",
4602                                 par->nc_nlen, par->nc_nlen, par->nc_name);
4603                         _cache_put(par);
4604                         return (EXDEV);
4605                 }
4606
4607                 /*
4608                  * The parent is not set in stone, ref and lock it to prevent
4609                  * it from disappearing.  Also note that due to renames it
4610                  * is possible for our ncp to move and for par to no longer
4611                  * be one of its parents.  We resolve it anyway, the loop
4612                  * will handle any moves.
4613                  */
4614                 _cache_get(par);        /* additional hold/lock */
4615                 _cache_put(par);        /* from earlier hold/lock */
4616                 if (par == nch->mount->mnt_ncmountpt.ncp) {
4617                         cache_resolve_mp(nch->mount, 1);
4618                 } else if ((dvp = cache_dvpref(par)) == NULL) {
4619                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n",
4620                                 par->nc_nlen, par->nc_nlen, par->nc_name);
4621                         _cache_put(par);
4622                         continue;
4623                 } else {
4624                         if (par->nc_flag & NCF_UNRESOLVED) {
4625                                 nctmp.mount = mp;
4626                                 nctmp.ncp = par;
4627                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
4628                         }
4629                         vrele(dvp);
4630                 }
4631                 if ((error = par->nc_error) != 0) {
4632                         if (par->nc_error != EAGAIN) {
4633                                 kprintf("EXDEV case 3 %*.*s error %d\n",
4634                                     par->nc_nlen, par->nc_nlen, par->nc_name,
4635                                     par->nc_error);
4636                                 _cache_put(par);
4637                                 return(error);
4638                         }
4639                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
4640                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
4641                 }
4642                 _cache_put(par);
4643                 /* loop */
4644         }
4645
4646         /*
4647          * We have a referenced dvp
4648          */
4649         *dvpp = dvp;
4650         return 0;
4651 }
4652
4653 /*
4654  * Clean out negative cache entries when too many have accumulated.
4655  */
4656 static void
4657 _cache_cleanneg(long count)
4658 {
4659         struct pcpu_ncache *pn;
4660         struct namecache *ncp;
4661         static uint32_t neg_rover;
4662         uint32_t n;
4663         long vnegs;
4664
4665         n = neg_rover++;        /* SMP heuristical, race ok */
4666         cpu_ccfence();
4667         n = n % (uint32_t)ncpus;
4668
4669         /*
4670          * Normalize vfscache_negs and count.  count is sometimes based
4671          * on vfscache_negs.  vfscache_negs is heuristical and can sometimes
4672          * have crazy values.
4673          */
4674         vnegs = vfscache_negs;
4675         cpu_ccfence();
4676         if (vnegs <= MINNEG)
4677                 vnegs = MINNEG;
4678         if (count < 1)
4679                 count = 1;
4680
4681         pn = &pcpu_ncache[n];
4682         spin_lock(&pn->neg_spin);
4683         count = pn->neg_count * count / vnegs + 1;
4684         spin_unlock(&pn->neg_spin);
4685
4686         /*
4687          * Attempt to clean out the specified number of negative cache
4688          * entries.
4689          */
4690         while (count > 0) {
4691                 spin_lock(&pn->neg_spin);
4692                 ncp = TAILQ_FIRST(&pn->neg_list);
4693                 if (ncp == NULL) {
4694                         spin_unlock(&pn->neg_spin);
4695                         break;
4696                 }
4697                 TAILQ_REMOVE(&pn->neg_list, ncp, nc_vnode);
4698                 TAILQ_INSERT_TAIL(&pn->neg_list, ncp, nc_vnode);
4699                 _cache_hold(ncp);
4700                 spin_unlock(&pn->neg_spin);
4701
4702                 /*
4703                  * This can race, so we must re-check that the ncp
4704                  * is on the ncneg.list after successfully locking it.
4705                  *
4706                  * Don't scrap actively referenced ncps.  There should be
4707                  * 3 refs.  The natural ref, one from being on the neg list,
4708                  * and one from us.
4709                  *
4710                  * Recheck fields after successfully locking to ensure
4711                  * that it is in-fact still on the negative list with no
4712                  * extra refs.
4713                  *
4714                  * WARNING! On the ncneglist scan any race against other
4715                  *          destructors (zaps or cache_inval_vp_quick() calls)
4716                  *          will have already unresolved the ncp and cause
4717                  *          us to drop instead of zap.  This fine, if
4718                  *          our drop winds up being the last one it will
4719                  *          kfree() the ncp.
4720                  */
4721                 if (_cache_lock_special(ncp) == 0) {
4722                         if (ncp->nc_vp == NULL &&
4723                             ncp->nc_refs == 3 &&
4724                             (ncp->nc_flag & NCF_UNRESOLVED) == 0)
4725                         {
4726                                 ++pcpu_ncache[mycpu->gd_cpuid].clean_neg_count;
4727                                 cache_zap(ncp);
4728                         } else {
4729                                 _cache_unlock(ncp);
4730                                 _cache_drop(ncp);
4731                         }
4732                 } else {
4733                         _cache_drop(ncp);
4734                 }
4735                 --count;
4736         }
4737 }
4738
4739 /*
4740  * Clean out unresolved cache entries when too many have accumulated.
4741  * Resolved cache entries are cleaned out via the vnode reclamation
4742  * mechanism and by _cache_cleanneg().
4743  */
4744 static void
4745 _cache_cleanpos(long ucount, long xcount)
4746 {
4747         static volatile int rover;
4748         struct nchash_head *nchpp;
4749         struct namecache *ncp;
4750         long count;
4751         int rover_copy;
4752
4753         /*
4754          * Don't burn too much cpu looking for stuff
4755          */
4756         count = (ucount > xcount) ? ucount : xcount;
4757         count = count * 4;
4758
4759         /*
4760          * Attempt to clean out the specified number of cache entries.
4761          */
4762         while (count > 0 && (ucount > 0 || xcount > 0)) {
4763                 rover_copy = ++rover;   /* MPSAFEENOUGH */
4764                 cpu_ccfence();
4765                 nchpp = NCHHASH(rover_copy);
4766
4767                 if (TAILQ_FIRST(&nchpp->list) == NULL) {
4768                         --count;
4769                         continue;
4770                 }
4771
4772                 /*
4773                  * Get the next ncp
4774                  */
4775                 spin_lock(&nchpp->spin);
4776                 ncp = TAILQ_FIRST(&nchpp->list);
4777
4778                 /*
4779                  * Skip placeholder ncp's.  Do not shift their
4780                  * position in the list.
4781                  */
4782                 while (ncp && (ncp->nc_flag & NCF_DUMMY))
4783                         ncp = TAILQ_NEXT(ncp, nc_hash);
4784
4785                 if (ncp) {
4786                         /*
4787                          * Move to end of list
4788                          */
4789                         TAILQ_REMOVE(&nchpp->list, ncp, nc_hash);
4790                         TAILQ_INSERT_TAIL(&nchpp->list, ncp, nc_hash);
4791
4792                         if (ncp->nc_refs != ncpbaserefs(ncp)) {
4793                                 /*
4794                                  * Do not destroy internal nodes that have
4795                                  * children or nodes which have thread
4796                                  * references.
4797                                  */
4798                                 ncp = NULL;
4799                         } else if (ucount > 0 &&
4800                                    (ncp->nc_flag & NCF_UNRESOLVED))
4801                         {
4802                                 /*
4803                                  * Destroy unresolved nodes if asked.
4804                                  */
4805                                 --ucount;
4806                                 --xcount;
4807                                 _cache_hold(ncp);
4808                         } else if (xcount > 0) {
4809                                 /*
4810                                  * Destroy any other node if asked.
4811                                  */
4812                                 --xcount;
4813                                 _cache_hold(ncp);
4814                         } else {
4815                                 /*
4816                                  * Otherwise don't
4817                                  */
4818                                 ncp = NULL;
4819                         }
4820                 }
4821                 spin_unlock(&nchpp->spin);
4822
4823                 /*
4824                  * Try to scap the ncp if we can do so non-blocking.
4825                  * We must re-check nc_refs after locking, and it will
4826                  * have one additional ref from above.
4827                  */
4828                 if (ncp) {
4829                         if (_cache_lock_special(ncp) == 0) {
4830                                 if (ncp->nc_refs == 1 + ncpbaserefs(ncp)) {
4831                                         ++pcpu_ncache[mycpu->gd_cpuid].
4832                                                 clean_pos_count;
4833                                         cache_zap(ncp);
4834                                 } else {
4835                                         _cache_unlock(ncp);
4836                                         _cache_drop(ncp);
4837                                 }
4838                         } else {
4839                                 _cache_drop(ncp);
4840                         }
4841                 }
4842                 --count;
4843         }
4844 }
4845
4846 /*
4847  * This is a kitchen sink function to clean out ncps which we
4848  * tried to zap from cache_drop() but failed because we were
4849  * unable to acquire the parent lock.
4850  *
4851  * Such entries can also be removed via cache_inval_vp(), such
4852  * as when unmounting.
4853  */
4854 static void
4855 _cache_cleandefered(void)
4856 {
4857         struct nchash_head *nchpp;
4858         struct namecache *ncp;
4859         struct namecache dummy;
4860         int i;
4861
4862         /*
4863          * Create a list iterator.  DUMMY indicates that this is a list
4864          * iterator, DESTROYED prevents matches by lookup functions.
4865          */
4866         numdefered = 0;
4867         pcpu_ncache[mycpu->gd_cpuid].numdefered = 0;
4868         bzero(&dummy, sizeof(dummy));
4869         dummy.nc_flag = NCF_DESTROYED | NCF_DUMMY;
4870         dummy.nc_refs = 1;
4871
4872         for (i = 0; i <= nchash; ++i) {
4873                 nchpp = &nchashtbl[i];
4874
4875                 spin_lock(&nchpp->spin);
4876                 TAILQ_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
4877                 ncp = &dummy;
4878                 while ((ncp = TAILQ_NEXT(ncp, nc_hash)) != NULL) {
4879                         if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
4880                                 continue;
4881                         TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4882                         TAILQ_INSERT_AFTER(&nchpp->list, ncp, &dummy, nc_hash);
4883                         _cache_hold(ncp);
4884                         spin_unlock(&nchpp->spin);
4885                         if (_cache_lock_nonblock(ncp) == 0) {
4886                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
4887                                 _cache_unlock(ncp);
4888                         }
4889                         _cache_drop(ncp);
4890                         spin_lock(&nchpp->spin);
4891                         ncp = &dummy;
4892                 }
4893                 TAILQ_REMOVE(&nchpp->list, &dummy, nc_hash);
4894                 spin_unlock(&nchpp->spin);
4895         }
4896 }
4897
4898 /*
4899  * Name cache initialization, from vfsinit() when we are booting
4900  */
4901 void
4902 nchinit(void)
4903 {
4904         struct pcpu_ncache *pn;
4905         globaldata_t gd;
4906         int i;
4907
4908         /*
4909          * Per-cpu accounting and negative hit list
4910          */
4911         pcpu_ncache = kmalloc(sizeof(*pcpu_ncache) * ncpus,
4912                               M_VFSCACHEAUX, M_WAITOK|M_ZERO);
4913         for (i = 0; i < ncpus; ++i) {
4914                 pn = &pcpu_ncache[i];
4915                 TAILQ_INIT(&pn->neg_list);
4916                 spin_init(&pn->neg_spin, "ncneg");
4917                 spin_init(&pn->umount_spin, "ncumm");
4918         }
4919
4920         /*
4921          * Initialise per-cpu namecache effectiveness statistics.
4922          */
4923         for (i = 0; i < ncpus; ++i) {
4924                 gd = globaldata_find(i);
4925                 gd->gd_nchstats = &nchstats[i];
4926         }
4927
4928         /*
4929          * Create a generous namecache hash table
4930          */
4931         nchashtbl = hashinit_ext(vfs_inodehashsize(),
4932                                  sizeof(struct nchash_head),
4933                                  M_VFSCACHEAUX, &nchash);
4934         for (i = 0; i <= (int)nchash; ++i) {
4935                 TAILQ_INIT(&nchashtbl[i].list);
4936                 spin_init(&nchashtbl[i].spin, "nchinit_hash");
4937         }
4938         for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
4939                 spin_init(&ncmount_cache[i].spin, "nchinit_cache");
4940         nclockwarn = 5 * hz;
4941 }
4942
4943 /*
4944  * Called from start_init() to bootstrap the root filesystem.  Returns
4945  * a referenced, unlocked namecache record to serve as a root or the
4946  * root of the system.
4947  *
4948  * Adjust our namecache counts
4949  */
4950 void
4951 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
4952 {
4953         /*struct pcpu_ncache *pn = &pcpu_ncache[mycpu->gd_cpuid];*/
4954
4955         /* nc_parent is NULL, doesn't count as a leaf or unresolved */
4956         /*atomic_add_long(&pn->vfscache_leafs, 1);*/
4957         /*atomic_add_long(&pn->vfscache_unres, 1);*/
4958
4959         nch->ncp = cache_alloc(0);
4960         nch->mount = mp;
4961         _cache_mntref(mp);
4962         if (vp)
4963                 _cache_setvp(nch->mount, nch->ncp, vp, 1);
4964 }
4965
4966 /*
4967  * vfs_cache_setroot()
4968  *
4969  *      Create an association between the root of our namecache and
4970  *      the root vnode.  This routine may be called several times during
4971  *      booting.
4972  *
4973  *      If the caller intends to save the returned namecache pointer somewhere
4974  *      it must cache_hold() it.
4975  */
4976 void
4977 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
4978 {
4979         struct vnode *ovp;
4980         struct nchandle onch;
4981
4982         ovp = rootvnode;
4983         onch = rootnch;
4984         rootvnode = nvp;
4985         if (nch)
4986                 rootnch = *nch;
4987         else
4988                 cache_zero(&rootnch);
4989         if (ovp)
4990                 vrele(ovp);
4991         if (onch.ncp)
4992                 cache_drop(&onch);
4993 }
4994
4995 /*
4996  * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
4997  * topology and is being removed as quickly as possible.  The new VOP_N*()
4998  * API calls are required to make specific adjustments using the supplied
4999  * ncp pointers rather then just bogusly purging random vnodes.
5000  *
5001  * Invalidate all namecache entries to a particular vnode as well as
5002  * any direct children of that vnode in the namecache.  This is a
5003  * 'catch all' purge used by filesystems that do not know any better.
5004  *
5005  * Note that the linkage between the vnode and its namecache entries will
5006  * be removed, but the namecache entries themselves might stay put due to
5007  * active references from elsewhere in the system or due to the existance of
5008  * the children.   The namecache topology is left intact even if we do not
5009  * know what the vnode association is.  Such entries will be marked
5010  * NCF_UNRESOLVED.
5011  */
5012 void
5013 cache_purge(struct vnode *vp)
5014 {
5015         cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
5016 }
5017
5018 __read_mostly static int disablecwd;
5019 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
5020     "Disable getcwd");
5021
5022 /*
5023  * MPALMOSTSAFE
5024  */
5025 int
5026 sys___getcwd(struct sysmsg *sysmsg, const struct __getcwd_args *uap)
5027 {
5028         u_int buflen;
5029         int error;
5030         char *buf;
5031         char *bp;
5032
5033         if (disablecwd)
5034                 return (ENODEV);
5035
5036         buflen = uap->buflen;
5037         if (buflen == 0)
5038                 return (EINVAL);
5039         if (buflen > MAXPATHLEN)
5040                 buflen = MAXPATHLEN;
5041
5042         buf = kmalloc(buflen, M_TEMP, M_WAITOK);
5043         bp = kern_getcwd(buf, buflen, &error);
5044         if (error == 0)
5045                 error = copyout(bp, uap->buf, strlen(bp) + 1);
5046         kfree(buf, M_TEMP);
5047         return (error);
5048 }
5049
5050 char *
5051 kern_getcwd(char *buf, size_t buflen, int *error)
5052 {
5053         struct proc *p = curproc;
5054         char *bp;
5055         int i, slash_prefixed;
5056         struct filedesc *fdp;
5057         struct nchandle nch;
5058         struct namecache *ncp;
5059
5060         bp = buf;
5061         bp += buflen - 1;
5062         *bp = '\0';
5063         fdp = p->p_fd;
5064         slash_prefixed = 0;
5065
5066         nch = fdp->fd_ncdir;
5067         ncp = nch.ncp;
5068         if (ncp)
5069                 _cache_hold(ncp);
5070
5071         while (ncp && (ncp != fdp->fd_nrdir.ncp ||
5072                nch.mount != fdp->fd_nrdir.mount)
5073         ) {
5074                 if (ncp->nc_flag & NCF_DESTROYED) {
5075                         _cache_drop(ncp);
5076                         ncp = NULL;
5077                         break;
5078                 }
5079                 /*
5080                  * While traversing upwards if we encounter the root
5081                  * of the current mount we have to skip to the mount point
5082                  * in the underlying filesystem.
5083                  */
5084                 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
5085                         nch = nch.mount->mnt_ncmounton;
5086                         _cache_drop(ncp);
5087                         ncp = nch.ncp;
5088                         if (ncp)
5089                                 _cache_hold(ncp);
5090                         continue;
5091                 }
5092
5093                 /*
5094                  * Prepend the path segment
5095                  */
5096                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
5097                         if (bp == buf) {
5098                                 *error = ERANGE;
5099                                 bp = NULL;
5100                                 goto done;
5101                         }
5102                         *--bp = ncp->nc_name[i];
5103                 }
5104                 if (bp == buf) {
5105                         *error = ERANGE;
5106                         bp = NULL;
5107                         goto done;
5108                 }
5109                 *--bp = '/';
5110                 slash_prefixed = 1;
5111
5112                 /*
5113                  * Go up a directory.  This isn't a mount point so we don't
5114                  * have to check again.
5115                  */
5116                 while ((nch.ncp = ncp->nc_parent) != NULL) {
5117                         if (ncp_shared_lock_disable)
5118                                 _cache_lock(ncp);
5119                         else
5120                                 _cache_lock_shared(ncp);
5121                         if (nch.ncp != ncp->nc_parent) {
5122                                 _cache_unlock(ncp);
5123                                 continue;
5124                         }
5125                         _cache_hold(nch.ncp);
5126                         _cache_unlock(ncp);
5127                         break;
5128                 }
5129                 _cache_drop(ncp);
5130                 ncp = nch.ncp;
5131         }
5132         if (ncp == NULL) {
5133                 *error = ENOENT;
5134                 bp = NULL;
5135                 goto done;
5136         }
5137         if (!slash_prefixed) {
5138                 if (bp == buf) {
5139                         *error = ERANGE;
5140                         bp = NULL;
5141                         goto done;
5142                 }
5143                 *--bp = '/';
5144         }
5145         *error = 0;
5146 done:
5147         if (ncp)
5148                 _cache_drop(ncp);
5149         return (bp);
5150 }
5151
5152 /*
5153  * Thus begins the fullpath magic.
5154  *
5155  * The passed nchp is referenced but not locked.
5156  */
5157 __read_mostly static int disablefullpath;
5158 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
5159     &disablefullpath, 0,
5160     "Disable fullpath lookups");
5161
5162 int
5163 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
5164                char **retbuf, char **freebuf, int guess)
5165 {
5166         struct nchandle fd_nrdir;
5167         struct nchandle nch;
5168         struct namecache *ncp;
5169         struct mount *mp, *new_mp;
5170         char *bp, *buf;
5171         int slash_prefixed;
5172         int error = 0;
5173         int i;
5174
5175         *retbuf = NULL;
5176         *freebuf = NULL;
5177
5178         buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
5179         bp = buf + MAXPATHLEN - 1;
5180         *bp = '\0';
5181         if (nchbase)
5182                 fd_nrdir = *nchbase;
5183         else if (p != NULL)
5184                 fd_nrdir = p->p_fd->fd_nrdir;
5185         else
5186                 fd_nrdir = rootnch;
5187         slash_prefixed = 0;
5188         nch = *nchp;
5189         ncp = nch.ncp;
5190         if (ncp)
5191                 _cache_hold(ncp);
5192         mp = nch.mount;
5193
5194         while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
5195                 new_mp = NULL;
5196
5197                 /*
5198                  * If we are asked to guess the upwards path, we do so whenever
5199                  * we encounter an ncp marked as a mountpoint. We try to find
5200                  * the actual mountpoint by finding the mountpoint with this
5201                  * ncp.
5202                  */
5203                 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
5204                         new_mp = mount_get_by_nc(ncp);
5205                 }
5206                 /*
5207                  * While traversing upwards if we encounter the root
5208                  * of the current mount we have to skip to the mount point.
5209                  */
5210                 if (ncp == mp->mnt_ncmountpt.ncp) {
5211                         new_mp = mp;
5212                 }
5213                 if (new_mp) {
5214                         nch = new_mp->mnt_ncmounton;
5215                         _cache_drop(ncp);
5216                         ncp = nch.ncp;
5217                         if (ncp)
5218                                 _cache_hold(ncp);
5219                         mp = nch.mount;
5220                         continue;
5221                 }
5222
5223                 /*
5224                  * Prepend the path segment
5225                  */
5226                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
5227                         if (bp == buf) {
5228                                 kfree(buf, M_TEMP);
5229                                 error = ENOMEM;
5230                                 goto done;
5231                         }
5232                         *--bp = ncp->nc_name[i];
5233                 }
5234                 if (bp == buf) {
5235                         kfree(buf, M_TEMP);
5236                         error = ENOMEM;
5237                         goto done;
5238                 }
5239                 *--bp = '/';
5240                 slash_prefixed = 1;
5241
5242                 /*
5243                  * Go up a directory.  This isn't a mount point so we don't
5244                  * have to check again.
5245                  *
5246                  * We can only safely access nc_parent with ncp held locked.
5247                  */
5248                 while ((nch.ncp = ncp->nc_parent) != NULL) {
5249                         _cache_lock_shared(ncp);
5250                         if (nch.ncp != ncp->nc_parent) {
5251                                 _cache_unlock(ncp);
5252                                 continue;
5253                         }
5254                         _cache_hold(nch.ncp);
5255                         _cache_unlock(ncp);
5256                         break;
5257                 }
5258                 _cache_drop(ncp);
5259                 ncp = nch.ncp;
5260         }
5261         if (ncp == NULL) {
5262                 kfree(buf, M_TEMP);
5263                 error = ENOENT;
5264                 goto done;
5265         }
5266
5267         if (!slash_prefixed) {
5268                 if (bp == buf) {
5269                         kfree(buf, M_TEMP);
5270                         error = ENOMEM;
5271                         goto done;
5272                 }
5273                 *--bp = '/';
5274         }
5275         *retbuf = bp;
5276         *freebuf = buf;
5277         error = 0;
5278 done:
5279         if (ncp)
5280                 _cache_drop(ncp);
5281         return(error);
5282 }
5283
5284 int
5285 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf,
5286             char **freebuf, int guess)
5287 {
5288         struct namecache *ncp;
5289         struct nchandle nch;
5290         int error;
5291
5292         *freebuf = NULL;
5293         if (disablefullpath)
5294                 return (ENODEV);
5295
5296         if (p == NULL)
5297                 return (EINVAL);
5298
5299         /* vn is NULL, client wants us to use p->p_textvp */
5300         if (vn == NULL) {
5301                 if ((vn = p->p_textvp) == NULL)
5302                         return (EINVAL);
5303         }
5304         spin_lock_shared(&vn->v_spin);
5305         TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
5306                 if (ncp->nc_nlen)
5307                         break;
5308         }
5309         if (ncp == NULL) {
5310                 spin_unlock_shared(&vn->v_spin);
5311                 return (EINVAL);
5312         }
5313         _cache_hold(ncp);
5314         spin_unlock_shared(&vn->v_spin);
5315
5316         nch.ncp = ncp;
5317         nch.mount = vn->v_mount;
5318         error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
5319         _cache_drop(ncp);
5320         return (error);
5321 }
5322
5323 void
5324 vfscache_rollup_cpu(struct globaldata *gd)
5325 {
5326         struct pcpu_ncache *pn;
5327         long count;
5328
5329         if (pcpu_ncache == NULL)
5330                 return;
5331         pn = &pcpu_ncache[gd->gd_cpuid];
5332
5333         /*
5334          * namecache statistics
5335          */
5336         if (pn->vfscache_count) {
5337                 count = atomic_swap_long(&pn->vfscache_count, 0);
5338                 atomic_add_long(&vfscache_count, count);
5339         }
5340         if (pn->vfscache_leafs) {
5341                 count = atomic_swap_long(&pn->vfscache_leafs, 0);
5342                 atomic_add_long(&vfscache_leafs, count);
5343         }
5344         if (pn->vfscache_unres) {
5345                 count = atomic_swap_long(&pn->vfscache_unres, 0);
5346                 atomic_add_long(&vfscache_unres, count);
5347         }
5348         if (pn->vfscache_negs) {
5349                 count = atomic_swap_long(&pn->vfscache_negs, 0);
5350                 atomic_add_long(&vfscache_negs, count);
5351         }
5352
5353         /*
5354          * hysteresis based cleanings
5355          */
5356         if (pn->inv_kid_quick_count) {
5357                 count = atomic_swap_long(&pn->inv_kid_quick_count, 0);
5358                 atomic_add_long(&inv_kid_quick_count, count);
5359         }
5360         if (pn->inv_ncp_quick_count) {
5361                 count = atomic_swap_long(&pn->inv_ncp_quick_count, 0);
5362                 atomic_add_long(&inv_ncp_quick_count, count);
5363         }
5364         if (pn->clean_pos_count) {
5365                 count = atomic_swap_long(&pn->clean_pos_count, 0);
5366                 atomic_add_long(&clean_pos_count, count);
5367         }
5368         if (pn->clean_neg_count) {
5369                 count = atomic_swap_long(&pn->clean_neg_count, 0);
5370                 atomic_add_long(&clean_neg_count, count);
5371         }
5372
5373         if (pn->numdefered) {
5374                 count = atomic_swap_long(&pn->numdefered, 0);
5375                 atomic_add_long(&numdefered, count);
5376         }
5377 }