sys/kern/vfs_cache.c

   1 /*
   2  * Copyright (c) 2003,2004,2009 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  *
  34  * Copyright (c) 1989, 1993, 1995
  35  *      The Regents of the University of California.  All rights reserved.
  36  *
  37  * This code is derived from software contributed to Berkeley by
  38  * Poul-Henning Kamp of the FreeBSD Project.
  39  *
  40  * Redistribution and use in source and binary forms, with or without
  41  * modification, are permitted provided that the following conditions
  42  * are met:
  43  * 1. Redistributions of source code must retain the above copyright
  44  *    notice, this list of conditions and the following disclaimer.
  45  * 2. Redistributions in binary form must reproduce the above copyright
  46  *    notice, this list of conditions and the following disclaimer in the
  47  *    documentation and/or other materials provided with the distribution.
  48  * 3. All advertising materials mentioning features or use of this software
  49  *    must display the following acknowledgement:
  50  *      This product includes software developed by the University of
  51  *      California, Berkeley and its contributors.
  52  * 4. Neither the name of the University nor the names of its contributors
  53  *    may be used to endorse or promote products derived from this software
  54  *    without specific prior written permission.
  55  *
  56  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  57  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  59  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  60  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  61  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  62  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  63  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  64  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  65  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  66  * SUCH DAMAGE.
  67  */
  68
  69 #include <sys/param.h>
  70 #include <sys/systm.h>
  71 #include <sys/kernel.h>
  72 #include <sys/sysctl.h>
  73 #include <sys/mount.h>
  74 #include <sys/vnode.h>
  75 #include <sys/malloc.h>
  76 #include <sys/sysproto.h>
  77 #include <sys/spinlock.h>
  78 #include <sys/proc.h>
  79 #include <sys/namei.h>
  80 #include <sys/nlookup.h>
  81 #include <sys/filedesc.h>
  82 #include <sys/fnv_hash.h>
  83 #include <sys/globaldata.h>
  84 #include <sys/kern_syscall.h>
  85 #include <sys/dirent.h>
  86 #include <ddb/ddb.h>
  87
  88 #include <sys/sysref2.h>
  89 #include <sys/spinlock2.h>
  90 #include <sys/mplock2.h>
  91
   92 #define MAX_RECURSION_DEPTH     64      /* recursion bound; its users lie outside this part of the file */
  93
  94 /*
  95  * Random lookups in the cache are accomplished with a hash table using
  96  * a hash key of (nc_src_vp, name).  Each hash chain has its own spin lock.
  97  *
  98  * Negative entries may exist and correspond to resolved namecache
  99  * structures where nc_vp is NULL.  In a negative entry, NCF_WHITEOUT
 100  * will be set if the entry corresponds to a whited-out directory entry
  101  * (versus simply not finding the entry at all).   ncneglist is locked
 102  * with a global spinlock (ncspin).
 103  *
 104  * MPSAFE RULES:
 105  *
 106  * (1) A ncp must be referenced before it can be locked.
 107  *
 108  * (2) A ncp must be locked in order to modify it.
 109  *
 110  * (3) ncp locks are always ordered child -> parent.  That may seem
 111  *     backwards but forward scans use the hash table and thus can hold
 112  *     the parent unlocked when traversing downward.
 113  *
 114  *     This allows insert/rename/delete/dot-dot and other operations
 115  *     to use ncp->nc_parent links.
 116  *
 117  *     This also prevents a locked up e.g. NFS node from creating a
 118  *     chain reaction all the way back to the root vnode / namecache.
 119  *
 120  * (4) parent linkages require both the parent and child to be locked.
 121  */
 122
 123 /*
  124  * Structures associated with name caching.
 125  */
  126 #define NCHHASH(hash)           (&nchashtbl[(hash) & nchash])
      /* NCHHASH: chain head for a hash value; nchash is applied as a bit mask */
  127 #define MINNEG                  1024
  128 #define MINPOS                  1024
      /*
       * NOTE(review): MINNEG/MINPOS are not used in this part of the file;
       * presumably minimum negative/positive entry counts for the cache
       * cleaning code -- TODO confirm against their users.
       */
  129 #define NCMOUNT_NUMCACHE        1009    /* prime number */
      /* NCMOUNT_NUMCACHE is the element count of the ncmount_cache[] array */
 130
  131 MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
  132
      /* List type used for a single hash chain of namecache entries. */
  133 LIST_HEAD(nchash_list, namecache);
  134
      /*
       * Head of one hash chain.  Each chain carries its own spinlock
       * (nchashtbl is an array of these, indexed through NCHHASH()).
       */
  135 struct nchash_head {
  136        struct nchash_list list;
  137        struct spinlock  spin;
  138 };
  139
      /*
       * One slot of the mount point cache (ncmount_cache[]).  A slot pairs
       * a namecache entry with a mount and has its own spinlock.
       * NOTE(review): lookup/replacement policy is implemented outside this
       * part of the file -- see the ncmount_cache_* sysctl counters.
       */
  140 struct ncmount_cache {
  141         struct spinlock spin;
  142         struct namecache *ncp;
  143         struct mount *mp;
  144         int isneg;              /* if != 0 mp is originator and not target */
  145 };
 146
  147 static struct nchash_head       *nchashtbl;     /* hash chain array, indexed via NCHHASH() */
  148 static struct namecache_list    ncneglist;      /* negative entries; locked with ncspin */
  149 static struct spinlock          ncspin;         /* global spinlock covering ncneglist */
  150 static struct ncmount_cache     ncmount_cache[NCMOUNT_NUMCACHE];        /* mount point cache slots */
 151
 152 /*
 153  * ncvp_debug - debug cache_fromvp().  This is used by the NFS server
 154  * to create the namecache infrastructure leading to a dangling vnode.
 155  *
 156  * 0    Only errors are reported
 157  * 1    Successes are reported
 158  * 2    Successes + the whole directory scan is reported
 159  * 3    Force the directory scan code run as if the parent vnode did not
 160  *      have a namecache record, even if it does have one.
 161  */
  162 static int      ncvp_debug;
  163 SYSCTL_INT(_debug, OID_AUTO, ncvp_debug, CTLFLAG_RW, &ncvp_debug, 0,
  164     "Namecache debug level (0-3)");
  165
  166 static u_long   nchash;                 /* size of hash table */
      /*
       * NOTE(review): NCHHASH() applies nchash as an index mask, so the value
       * may really be (size - 1) -- confirm where it is initialized.
       */
  167 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
  168     "Size of namecache hash table");
  169
  170 static int      ncnegfactor = 16;       /* ratio of negative entries */
  171 SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
  172     "Ratio of namecache negative entries");
  173
      /*
       * nclockwarn is passed as the tsleep() timeout by the blocking lock
       * functions; a timed-out sleep produces a "[diagnostic]" console message.
       */
  174 static int      nclockwarn;             /* warn on locked entries in ticks */
  175 SYSCTL_INT(_debug, OID_AUTO, nclockwarn, CTLFLAG_RW, &nclockwarn, 0,
  176     "Warn on locked namecache entries in ticks");
  177
  178 static int      numdefered;             /* NOTE(review): presumably the count of deferred-zap entries; description string below looks copy-pasted -- TODO confirm */
  179 SYSCTL_INT(_debug, OID_AUTO, numdefered, CTLFLAG_RD, &numdefered, 0,
  180     "Number of cache entries allocated");
  181
  182 static int      ncposlimit;             /* NOTE(review): presumably a limit on positive entries (writable knob); description string below looks copy-pasted -- TODO confirm */
  183 SYSCTL_INT(_debug, OID_AUTO, ncposlimit, CTLFLAG_RW, &ncposlimit, 0,
  184     "Number of cache entries allocated");
  185
      /* Defaults to 1, i.e. shared namecache locks start out disabled. */
  186 static int      ncp_shared_lock_disable = 1;
  187 SYSCTL_INT(_debug, OID_AUTO, ncp_shared_lock_disable, CTLFLAG_RW,
  188            &ncp_shared_lock_disable, 0, "Disable shared namecache locks");
  189
      /* Read-only constants exporting the structure sizes. */
  190 SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode),
  191     "sizeof(struct vnode)");
  192 SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache),
  193     "sizeof(struct namecache)");
  194
      /*
       * Mount point cache switch (enabled by default) and its hit / miss /
       * overwrite counters.  The counters are exported read-write.
       */
  195 static int      ncmount_cache_enable = 1;
  196 SYSCTL_INT(_debug, OID_AUTO, ncmount_cache_enable, CTLFLAG_RW,
  197            &ncmount_cache_enable, 0, "mount point cache");
  198 static long     ncmount_cache_hit;
  199 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_hit, CTLFLAG_RW,
  200             &ncmount_cache_hit, 0, "mpcache hits");
  201 static long     ncmount_cache_miss;
  202 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_miss, CTLFLAG_RW,
  203             &ncmount_cache_miss, 0, "mpcache misses");
  204 static long     ncmount_cache_overwrite;
  205 SYSCTL_LONG(_debug, OID_AUTO, ncmount_cache_overwrite, CTLFLAG_RW,
  206             &ncmount_cache_overwrite, 0, "mpcache entry overwrites");
 207
      /* Forward declarations of file-local helpers. */
  208 static int cache_resolve_mp(struct mount *mp);
  209 static struct vnode *cache_dvpref(struct namecache *ncp);
  210 static void _cache_lock(struct namecache *ncp);          /* blocking exclusive lock */
  211 static void _cache_setunresolved(struct namecache *ncp);
  212 static void _cache_cleanneg(int count);
  213 static void _cache_cleanpos(int count);
  214 static void _cache_cleandefered(void);
  215 static void _cache_unlink(struct namecache *ncp);
 216
 217 /*
 218  * The new name cache statistics
 219  */
      /* vfs.cache sysctl node; the counters below are exported read-only. */
  220 SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics");
  221 static int numneg;
  222 SYSCTL_INT(_vfs_cache, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
  223     "Number of negative namecache entries");
  224 static int numcache;
  225 SYSCTL_INT(_vfs_cache, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
  226     "Number of namecaches entries");
  227 static u_long numcalls;
  228 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcalls, CTLFLAG_RD, &numcalls, 0,
  229     "Number of namecache lookups");
  230 static u_long numchecks;
  231 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numchecks, CTLFLAG_RD, &numchecks, 0,
  232     "Number of checked entries in namecache lookups");
  233
      /*
       * Statistics storage sized for SMP_MAXCPU cpus.
       * NOTE(review): presumably each cpu's gd_nchstats points at its slot
       * here -- the hookup is not visible in this part of the file.
       */
  234 struct nchstats nchstats[SMP_MAXCPU];
 235 /*
 236  * Export VFS cache effectiveness statistics to user-land.
 237  *
 238  * The statistics are left for aggregation to user-land so
 239  * neat things can be achieved, like observing per-CPU cache
 240  * distribution.
 241  */
 242 static int
 243 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 244 {
 245         struct globaldata *gd;
 246         int i, error;
 247
 248         error = 0;
 249         for (i = 0; i < ncpus; ++i) {
 250                 gd = globaldata_find(i);
 251                 if ((error = SYSCTL_OUT(req, (void *)&(*gd->gd_nchstats),
 252                         sizeof(struct nchstats))))
 253                         break;
 254         }
 255
 256         return (error);
 257 }
  258 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE|CTLFLAG_RD,
  259   0, 0, sysctl_nchstats, "S,nchstats", "VFS cache effectiveness statistics");
      /* ^ registers sysctl_nchstats() as the read-only, opaque vfs.cache.nchstats node */
  260
      /*
       * Forward declaration.  Per its use in _cache_drop(), cache_zap() may
       * return a referenced parent ncp that the caller must drop in turn.
       */
  261 static struct namecache *cache_zap(struct namecache *ncp, int nonblock);
 262
  263 /*
  264  * Namespace locking.  The caller must already hold a reference to the
  265  * namecache structure in order to lock/unlock it.  This function prevents
  266  * the namespace from being created or destroyed by accessors other than
  267  * the lock holder.
  268  *
  269  * Note that holding a locked namecache structure prevents other threads
  270  * from making namespace changes (e.g. deleting or creating), prevents
  271  * vnode association state changes by other threads, and prevents the
  272  * namecache entry from being resolved or unresolved by other threads.
  273  *
  274  * An exclusive lock owner has full authority to associate/disassociate
  275  * vnodes and resolve/unresolve the locked ncp.
  276  *
  277  * A shared lock owner only has authority to acquire the underlying vnode,
  278  * if any.
  279  *
  280  * The primary lock field is nc_lockstatus.  nc_locktd is set after the
  281  * fact (when locking) or cleared prior to unlocking.
  282  *
  283  * WARNING!  Holding a locked ncp will prevent a vnode from being destroyed
  284  *           or recycled, but it does NOT help you if the vnode had already
  285  *           initiated a recyclement.  If this is important, use cache_get()
  286  *           rather than cache_lock() (and deal with the differences in the
  287  *           way the refs counter is handled).  Or, alternatively, make an
  288  *           unconditional call to cache_validate() or cache_resolve()
  289  *           after cache_lock() returns.
       *
       * _cache_lock() acquires the lock exclusively and does not return
       * until it has it.  The owning thread may acquire it recursively;
       * every acquisition adds one to the count kept in nc_lockstatus.
       * While blocked, the thread sets NC_EXLOCK_REQ and sleeps on
       * &ncp->nc_locktd using nclockwarn as the sleep timeout.  The first
       * timed-out sleep prints a "blocked on" diagnostic and, once the lock
       * is finally obtained, an "unblocked" diagnostic follows.
  290  */
  291 static
  292 void
  293 _cache_lock(struct namecache *ncp)
  294 {
  295         thread_t td;
  296         int didwarn;
  297         int error;
  298         u_int count;
  299
  300         KKASSERT(ncp->nc_refs != 0);
  301         didwarn = 0;
  302         td = curthread;
  303
  304         for (;;) {
                      /*
                       * Take one snapshot of the lock word; every test and
                       * cmpset in this iteration works from that snapshot
                       * (cpu_ccfence() is the compiler fence for this).
                       */
  305                 count = ncp->nc_lockstatus;
  306                 cpu_ccfence();
  307
                      /*
                       * Case 1: no holder (only request bits may be set).
                       * Try to move the count from 0 to 1.
                       */
  308                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
  309                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  310                                               count, count + 1)) {
  311                                 /*
  312                                  * The vp associated with a locked ncp must
  313                                  * be held to prevent it from being recycled.
  314                                  *
  315                                  * WARNING!  If VRECLAIMED is set the vnode
  316                                  * could already be in the middle of a recycle.
  317                                  * Callers must use cache_vref() or
  318                                  * cache_vget() on the locked ncp to
  319                                  * validate the vp or set the cache entry
  320                                  * to unresolved.
  321                                  *
  322                                  * NOTE! vhold() is allowed if we hold a
  323                                  *       lock on the ncp (which we do).
  324                                  */
  325                                 ncp->nc_locktd = td;
  326                                 if (ncp->nc_vp)
  327                                         vhold(ncp->nc_vp);
  328                                 break;
  329                         }
  330                         /* cmpset failed */
  331                         continue;
  332                 }
                      /*
                       * Case 2: we already own the lock exclusively.  Bump the
                       * recursion count; the shared flag must not be set.
                       */
  333                 if (ncp->nc_locktd == td) {
  334                         KKASSERT((count & NC_SHLOCK_FLAG) == 0);
  335                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  336                                               count, count + 1)) {
  337                                 break;
  338                         }
  339                         /* cmpset failed */
  340                         continue;
  341                 }
                      /*
                       * Case 3: held by someone else.  Arm the sleep first
                       * (tsleep_interlock + PINTERLOCKED), then publish
                       * NC_EXLOCK_REQ; if the lock word changed under us the
                       * cmpset fails and we simply re-evaluate.
                       * NOTE(review): the interlock presumably exists so a
                       * wakeup issued between the cmpset and the tsleep is
                       * not lost -- confirm against tsleep(9).
                       */
  342                 tsleep_interlock(&ncp->nc_locktd, 0);
  343                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
  344                                       count | NC_EXLOCK_REQ) == 0) {
  345                         /* cmpset failed */
  346                         continue;
  347                 }
  348                 error = tsleep(&ncp->nc_locktd, PINTERLOCKED,
  349                                "clock", nclockwarn);
                      /* Timed out: warn once and remember when we started waiting. */
  350                 if (error == EWOULDBLOCK) {
  351                         if (didwarn == 0) {
  352                                 didwarn = ticks;
  353                                 kprintf("[diagnostic] cache_lock: "
  354                                         "blocked on %p %08x",
  355                                         ncp, count);
  356                                 kprintf(" \"%*.*s\"\n",
  357                                         ncp->nc_nlen, ncp->nc_nlen,
  358                                         ncp->nc_name);
  359                         }
  360                 }
  361                 /* loop */
  362         }
              /* Report how long we were blocked if a warning was issued. */
  363         if (didwarn) {
  364                 kprintf("[diagnostic] cache_lock: unblocked %*.*s after "
  365                         "%d secs\n",
  366                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
  367                         (int)(ticks - didwarn) / hz);
  368         }
  369 }
 370
  371 /*
  372  * The shared lock works similarly to the exclusive lock except
  373  * nc_locktd is left NULL and we need an interlock (VHOLD) to
  374  * prevent vhold() races, since the moment our cmpset_int succeeds
  375  * another cpu can come in and get its own shared lock.
  376  *
  377  * A critical section is needed to prevent interruption during the
  378  * VHOLD interlock.
       *
       * _cache_lock_shared() blocks until a shared lock is obtained.  The
       * caller must hold a reference on ncp.  A new shared lock is not
       * granted while NC_EXLOCK_REQ is set; in that case (or when the lock
       * is held exclusively) the thread sets NC_SHLOCK_REQ and sleeps on
       * ncp, with nclockwarn as the sleep timeout and the same one-shot
       * diagnostics as the exclusive path.
  379  */
  380 static
  381 void
  382 _cache_lock_shared(struct namecache *ncp)
  383 {
  384         int didwarn;
  385         int error;
  386         u_int count;
  387
  388         KKASSERT(ncp->nc_refs != 0);
  389         didwarn = 0;
  390
  391         for (;;) {
                      /* Single snapshot of the lock word for this iteration. */
  392                 count = ncp->nc_lockstatus;
  393                 cpu_ccfence();
  394
                      /*
                       * Case 1: completely idle (at most NC_SHLOCK_REQ set, so
                       * no holder and no pending exclusive request).  Become
                       * the first shared holder: count 0 -> 1 with SHLOCK_FLAG,
                       * plus VHOLD while the vnode hold is being taken.
                       */
  395                 if ((count & ~NC_SHLOCK_REQ) == 0) {
  396                         crit_enter();
  397                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  398                                       count,
  399                                       (count + 1) | NC_SHLOCK_FLAG |
  400                                                     NC_SHLOCK_VHOLD)) {
  401                                 /*
  402                                  * The vp associated with a locked ncp must
  403                                  * be held to prevent it from being recycled.
  404                                  *
  405                                  * WARNING!  If VRECLAIMED is set the vnode
  406                                  * could already be in the middle of a recycle.
  407                                  * Callers must use cache_vref() or
  408                                  * cache_vget() on the locked ncp to
  409                                  * validate the vp or set the cache entry
  410                                  * to unresolved.
  411                                  *
  412                                  * NOTE! vhold() is allowed if we hold a
  413                                  *       lock on the ncp (which we do).
  414                                  */
  415                                 if (ncp->nc_vp)
  416                                         vhold(ncp->nc_vp);
                                      /* vnode is held; release later sharers spinning on VHOLD */
  417                                 atomic_clear_int(&ncp->nc_lockstatus,
  418                                                  NC_SHLOCK_VHOLD);
  419                                 crit_exit();
  420                                 break;
  421                         }
  422                         /* cmpset failed */
  423                         crit_exit();
  424                         continue;
  425                 }
  426
  427                 /*
  428                  * If already held shared we can just bump the count, but
  429                  * only allow this if nobody is trying to get the lock
  430                  * exclusively.
  431                  *
  432                  * VHOLD is a bit of a hack.  Even though we successfully
  433                  * added another shared ref, the cpu that got the first
  434                  * shared ref might not yet have held the vnode.
  435                  */
  436                 if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) ==
  437                     NC_SHLOCK_FLAG) {
  438                         KKASSERT((count & ~(NC_EXLOCK_REQ |
  439                                             NC_SHLOCK_REQ |
  440                                             NC_SHLOCK_FLAG)) > 0);
  441                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  442                                               count, count + 1)) {
                                      /* spin until the first sharer has finished its vhold() */
  443                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
  444                                         cpu_pause();
  445                                 break;
  446                         }
  447                         continue;
  448                 }
                      /*
                       * Case 3: must wait.  Arm the sleep on ncp (note: a
                       * different sleep address than the exclusive path, which
                       * uses &ncp->nc_locktd), publish NC_SHLOCK_REQ, and sleep.
                       * A failed cmpset means the lock word changed; retry.
                       */
  449                 tsleep_interlock(ncp, 0);
  450                 if (atomic_cmpset_int(&ncp->nc_lockstatus, count,
  451                                       count | NC_SHLOCK_REQ) == 0) {
  452                         /* cmpset failed */
  453                         continue;
  454                 }
  455                 error = tsleep(ncp, PINTERLOCKED, "clocksh", nclockwarn);
                      /* Timed out: warn once and remember when we started waiting. */
  456                 if (error == EWOULDBLOCK) {
  457                         if (didwarn == 0) {
  458                                 didwarn = ticks;
  459                                 kprintf("[diagnostic] cache_lock_shared: "
  460                                         "blocked on %p %08x",
  461                                         ncp, count);
  462                                 kprintf(" \"%*.*s\"\n",
  463                                         ncp->nc_nlen, ncp->nc_nlen,
  464                                         ncp->nc_name);
  465                         }
  466                 }
  467                 /* loop */
  468         }
              /* Report how long we were blocked if a warning was issued. */
  469         if (didwarn) {
  470                 kprintf("[diagnostic] cache_lock_shared: "
  471                         "unblocked %*.*s after %d secs\n",
  472                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name,
  473                         (int)(ticks - didwarn) / hz);
  474         }
  475 }
 476
 477 /*
 478  * NOTE: nc_refs may be zero if the ncp is interlocked by circumstance,
 479  *       such as the case where one of its children is locked.
 480  */
 481 static
 482 int
 483 _cache_lock_nonblock(struct namecache *ncp)
 484 {
 485         thread_t td;
 486         u_int count;
 487
 488         td = curthread;
 489
 490         for (;;) {
 491                 count = ncp->nc_lockstatus;
 492
 493                 if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 0) {
 494                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 495                                               count, count + 1)) {
 496                                 /*
 497                                  * The vp associated with a locked ncp must
 498                                  * be held to prevent it from being recycled.
 499                                  *
 500                                  * WARNING!  If VRECLAIMED is set the vnode
 501                                  * could already be in the middle of a recycle.
 502                                  * Callers must use cache_vref() or
 503                                  * cache_vget() on the locked ncp to
 504                                  * validate the vp or set the cache entry
 505                                  * to unresolved.
 506                                  *
 507                                  * NOTE! vhold() is allowed if we hold a
 508                                  *       lock on the ncp (which we do).
 509                                  */
 510                                 ncp->nc_locktd = td;
 511                                 if (ncp->nc_vp)
 512                                         vhold(ncp->nc_vp);
 513                                 break;
 514                         }
 515                         /* cmpset failed */
 516                         continue;
 517                 }
 518                 if (ncp->nc_locktd == td) {
 519                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 520                                               count, count + 1)) {
 521                                 break;
 522                         }
 523                         /* cmpset failed */
 524                         continue;
 525                 }
 526                 return(EWOULDBLOCK);
 527         }
 528         return(0);
 529 }
 530
 531 /*
 532  * The shared lock works similarly to the exclusive lock except
 533  * nc_locktd is left NULL and we need an interlock (VHOLD) to
 534  * prevent vhold() races, since the moment our cmpset_int succeeds
 535  * another cpu can come in and get its own shared lock.
 536  *
 537  * A critical section is needed to prevent interruption during the
 538  * VHOLD interlock.
 539  */
 540 static
 541 int
 542 _cache_lock_shared_nonblock(struct namecache *ncp)
 543 {
 544         u_int count;
 545
 546         for (;;) {
 547                 count = ncp->nc_lockstatus;
 548
 549                 if ((count & ~NC_SHLOCK_REQ) == 0) {
 550                         crit_enter();
 551                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 552                                       count,
 553                                       (count + 1) | NC_SHLOCK_FLAG |
 554                                                     NC_SHLOCK_VHOLD)) {
 555                                 /*
 556                                  * The vp associated with a locked ncp must
 557                                  * be held to prevent it from being recycled.
 558                                  *
 559                                  * WARNING!  If VRECLAIMED is set the vnode
 560                                  * could already be in the middle of a recycle.
 561                                  * Callers must use cache_vref() or
 562                                  * cache_vget() on the locked ncp to
 563                                  * validate the vp or set the cache entry
 564                                  * to unresolved.
 565                                  *
 566                                  * NOTE! vhold() is allowed if we hold a
 567                                  *       lock on the ncp (which we do).
 568                                  */
 569                                 if (ncp->nc_vp)
 570                                         vhold(ncp->nc_vp);
 571                                 atomic_clear_int(&ncp->nc_lockstatus,
 572                                                  NC_SHLOCK_VHOLD);
 573                                 crit_exit();
 574                                 break;
 575                         }
 576                         /* cmpset failed */
 577                         crit_exit();
 578                         continue;
 579                 }
 580
 581                 /*
 582                  * If already held shared we can just bump the count, but
 583                  * only allow this if nobody is trying to get the lock
 584                  * exclusively.
 585                  *
 586                  * VHOLD is a bit of a hack.  Even though we successfully
 587                  * added another shared ref, the cpu that got the first
 588                  * shared ref might not yet have held the vnode.
 589                  */
 590                 if ((count & (NC_EXLOCK_REQ|NC_SHLOCK_FLAG)) ==
 591                     NC_SHLOCK_FLAG) {
 592                         KKASSERT((count & ~(NC_EXLOCK_REQ |
 593                                             NC_SHLOCK_REQ |
 594                                             NC_SHLOCK_FLAG)) > 0);
 595                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
 596                                               count, count + 1)) {
 597                                 while (ncp->nc_lockstatus & NC_SHLOCK_VHOLD)
 598                                         cpu_pause();
 599                                 break;
 600                         }
 601                         continue;
 602                 }
 603                 return(EWOULDBLOCK);
 604         }
 605         return(0);
 606 }
 607
  608 /*
  609  * Helper function
       *
       * Release one level of a lock on ncp, whether it is held exclusively
       * by the calling thread or held shared (this is what the assertions
       * below require).  A recursive/additional holder just decrements the
       * count.  The final release clears the lock word, wakes waiters
       * (exclusive waiters on &ncp->nc_locktd if NC_EXLOCK_REQ is set,
       * otherwise shared waiters on ncp if NC_SHLOCK_REQ is set), and then
       * drops the vnode hold for ncp->nc_vp, if any.
  610  *
  611  * NOTE: nc_refs can be 0 (degenerate case during _cache_drop).
  612  *
  613  *       nc_locktd must be NULLed out prior to nc_lockstatus getting cleared.
  614  */
  615 static
  616 void
  617 _cache_unlock(struct namecache *ncp)
  618 {
  619         thread_t td __debugvar = curthread;
  620         u_int count;
  621         u_int ncount;
  622         struct vnode *dropvp;
  623
              /* Must be locked, and either shared or owned by this thread. */
  624         KKASSERT(ncp->nc_refs >= 0);
  625         KKASSERT((ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) > 0);
  626         KKASSERT((ncp->nc_lockstatus & NC_SHLOCK_FLAG) || ncp->nc_locktd == td);
  627
  628         count = ncp->nc_lockstatus;
  629         cpu_ccfence();
  630
  631         /*
  632          * Clear nc_locktd prior to the atomic op (excl lock only)
               *
               * NC_SHLOCK_FLAG is not masked out of this compare, so the
               * test can only be true for the last release of an exclusive
               * lock.
  633          */
  634         if ((count & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1)
  635                 ncp->nc_locktd = NULL;
  636         dropvp = NULL;
  637
  638         for (;;) {
                      /* Last holder (exclusive or shared): count is 1. */
  639                 if ((count &
  640                      ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ|NC_SHLOCK_FLAG)) == 1) {
  641                         dropvp = ncp->nc_vp;
                              /*
                               * With an exclusive request pending, keep only
                               * NC_SHLOCK_REQ in the new lock word; otherwise
                               * clear everything.
                               * NOTE(review): presumably the retained
                               * NC_SHLOCK_REQ lets shared waiters be woken by
                               * a later unlock -- confirm.
                               */
  642                         if (count & NC_EXLOCK_REQ)
  643                                 ncount = count & NC_SHLOCK_REQ; /* cnt->0 */
  644                         else
  645                                 ncount = 0;
  646
  647                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  648                                               count, ncount)) {
                                      /* exclusive waiters take priority over shared */
  649                                 if (count & NC_EXLOCK_REQ)
  650                                         wakeup(&ncp->nc_locktd);
  651                                 else if (count & NC_SHLOCK_REQ)
  652                                         wakeup(ncp);
  653                                 break;
  654                         }
                              /* cmpset failed; forget the vp and re-sample below */
  655                         dropvp = NULL;
  656                 } else {
                              /* Other holds remain: just decrement the count. */
  657                         KKASSERT((count & NC_SHLOCK_VHOLD) == 0);
  658                         KKASSERT((count & ~(NC_EXLOCK_REQ |
  659                                             NC_SHLOCK_REQ |
  660                                             NC_SHLOCK_FLAG)) > 1);
  661                         if (atomic_cmpset_int(&ncp->nc_lockstatus,
  662                                               count, count - 1)) {
  663                                 break;
  664                         }
  665                 }
                      /* Lost a race on the lock word: take a fresh snapshot and retry. */
  666                 count = ncp->nc_lockstatus;
  667                 cpu_ccfence();
  668         }
  669
  670         /*
  671          * Don't actually drop the vp until we successfully clean out
  672          * the lock, otherwise we may race another shared lock.
  673          */
  674         if (dropvp)
  675                 vdrop(dropvp);
  676 }
 677
 678 static
 679 int
 680 _cache_lockstatus(struct namecache *ncp)
 681 {
 682         if (ncp->nc_locktd == curthread)
 683                 return(LK_EXCLUSIVE);
 684         if (ncp->nc_lockstatus & NC_SHLOCK_FLAG)
 685                 return(LK_SHARED);
 686         return(-1);
 687 }
 688
 689 /*
 690  * cache_hold() and cache_drop() prevent the premature deletion of a
 691  * namecache entry but do not prevent operations (such as zapping) on
 692  * that namecache entry.
 693  *
 694  * This routine may only be called from outside this source module if
 695  * nc_refs is already at least 1.
 696  *
 697  * This is a rare case where callers are allowed to hold a spinlock,
 698  * so we can't ourselves.
 699  */
 700 static __inline
 701 struct namecache *
 702 _cache_hold(struct namecache *ncp)
 703 {
 704         atomic_add_int(&ncp->nc_refs, 1);
 705         return(ncp);
 706 }
 707
 708 /*
 709  * Drop a cache entry, taking care to deal with races.
 710  *
 711  * For potential 1->0 transitions we must hold the ncp lock to safely
 712  * test its flags.  An unresolved entry with no children must be zapped
 713  * to avoid leaks.
 714  *
 715  * The call to cache_zap() itself will handle all remaining races and
 716  * will decrement the ncp's refs regardless.  If we are resolved or
 717  * have children nc_refs can safely be dropped to 0 without having to
 718  * zap the entry.
 719  *
 720  * NOTE: cache_zap() will re-check nc_refs and nc_list in a MPSAFE fashion.
 721  *
 722  * NOTE: cache_zap() may return a non-NULL referenced parent which must
 723  *       be dropped in a loop.
 724  */
 725 static __inline
 726 void
 727 _cache_drop(struct namecache *ncp)
 728 {
 729         int refs;
 730
 731         while (ncp) {
 732                 KKASSERT(ncp->nc_refs > 0);
 733                 refs = ncp->nc_refs;
 734
 735                 if (refs == 1) {
 736                         if (_cache_lock_nonblock(ncp) == 0) {
 737                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
 738                                 if ((ncp->nc_flag & NCF_UNRESOLVED) &&
 739                                     TAILQ_EMPTY(&ncp->nc_list)) {
 740                                         ncp = cache_zap(ncp, 1);
 741                                         continue;
 742                                 }
 743                                 if (atomic_cmpset_int(&ncp->nc_refs, 1, 0)) {
 744                                         _cache_unlock(ncp);
 745                                         break;
 746                                 }
 747                                 _cache_unlock(ncp);
 748                         }
 749                 } else {
 750                         if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1))
 751                                 break;
 752                 }
 753                 cpu_pause();
 754         }
 755 }
 756
 757 /*
 758  * Link a new namecache entry to its parent and to the hash table.  Be
 759  * careful to avoid races if vhold() blocks in the future.
 760  *
 761  * Both ncp and par must be referenced and locked.
 762  *
 763  * NOTE: The hash table spinlock is likely held during this call, we
 764  *       can't do anything fancy.
 765  */
 766 static void
 767 _cache_link_parent(struct namecache *ncp, struct namecache *par,
 768                    struct nchash_head *nchpp)
 769 {
 770         KKASSERT(ncp->nc_parent == NULL);
 771         ncp->nc_parent = par;
 772         ncp->nc_head = nchpp;
 773
 774         /*
 775          * Set inheritance flags.  Note that the parent flags may be
 776          * stale due to getattr potentially not having been run yet
 777          * (it gets run during nlookup()'s).
 778          */
 779         ncp->nc_flag &= ~(NCF_SF_PNOCACHE | NCF_UF_PCACHE);
 780         if (par->nc_flag & (NCF_SF_NOCACHE | NCF_SF_PNOCACHE))
 781                 ncp->nc_flag |= NCF_SF_PNOCACHE;
 782         if (par->nc_flag & (NCF_UF_CACHE | NCF_UF_PCACHE))
 783                 ncp->nc_flag |= NCF_UF_PCACHE;
 784
 785         LIST_INSERT_HEAD(&nchpp->list, ncp, nc_hash);
 786
 787         if (TAILQ_EMPTY(&par->nc_list)) {
 788                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 789                 /*
 790                  * Any vp associated with an ncp which has children must
 791                  * be held to prevent it from being recycled.
 792                  */
 793                 if (par->nc_vp)
 794                         vhold(par->nc_vp);
 795         } else {
 796                 TAILQ_INSERT_HEAD(&par->nc_list, ncp, nc_entry);
 797         }
 798 }
 799
 800 /*
 801  * Remove the parent and hash associations from a namecache structure.
 802  * If this is the last child of the parent the cache_drop(par) will
 803  * attempt to recursively zap the parent.
 804  *
 805  * ncp must be locked.  This routine will acquire a temporary lock on
 806  * the parent as wlel as the appropriate hash chain.
 807  */
 808 static void
 809 _cache_unlink_parent(struct namecache *ncp)
 810 {
 811         struct namecache *par;
 812         struct vnode *dropvp;
 813
 814         if ((par = ncp->nc_parent) != NULL) {
 815                 KKASSERT(ncp->nc_parent == par);
 816                 _cache_hold(par);
 817                 _cache_lock(par);
 818                 spin_lock(&ncp->nc_head->spin);
 819                 LIST_REMOVE(ncp, nc_hash);
 820                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
 821                 dropvp = NULL;
 822                 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
 823                         dropvp = par->nc_vp;
 824                 spin_unlock(&ncp->nc_head->spin);
 825                 ncp->nc_parent = NULL;
 826                 ncp->nc_head = NULL;
 827                 _cache_unlock(par);
 828                 _cache_drop(par);
 829
 830                 /*
 831                  * We can only safely vdrop with no spinlocks held.
 832                  */
 833                 if (dropvp)
 834                         vdrop(dropvp);
 835         }
 836 }
 837
 838 /*
 839  * Allocate a new namecache structure.  Most of the code does not require
 840  * zero-termination of the string but it makes vop_compat_ncreate() easier.
 841  */
 842 static struct namecache *
 843 cache_alloc(int nlen)
 844 {
 845         struct namecache *ncp;
 846
 847         ncp = kmalloc(sizeof(*ncp), M_VFSCACHE, M_WAITOK|M_ZERO);
 848         if (nlen)
 849                 ncp->nc_name = kmalloc(nlen + 1, M_VFSCACHE, M_WAITOK);
 850         ncp->nc_nlen = nlen;
 851         ncp->nc_flag = NCF_UNRESOLVED;
 852         ncp->nc_error = ENOTCONN;       /* needs to be resolved */
 853         ncp->nc_refs = 1;
 854
 855         TAILQ_INIT(&ncp->nc_list);
 856         _cache_lock(ncp);
 857         return(ncp);
 858 }
 859
 860 /*
 861  * Can only be called for the case where the ncp has never been
 862  * associated with anything (so no spinlocks are needed).
 863  */
 864 static void
 865 _cache_free(struct namecache *ncp)
 866 {
 867         KKASSERT(ncp->nc_refs == 1 && ncp->nc_lockstatus == 1);
 868         if (ncp->nc_name)
 869                 kfree(ncp->nc_name, M_VFSCACHE);
 870         kfree(ncp, M_VFSCACHE);
 871 }
 872
 873 /*
 874  * [re]initialize a nchandle.
 875  */
 876 void
 877 cache_zero(struct nchandle *nch)
 878 {
 879         nch->ncp = NULL;
 880         nch->mount = NULL;
 881 }
 882
 883 /*
 884  * Ref and deref a namecache structure.
 885  *
 886  * The caller must specify a stable ncp pointer, typically meaning the
 887  * ncp is already referenced but this can also occur indirectly through
 888  * e.g. holding a lock on a direct child.
 889  *
 890  * WARNING: Caller may hold an unrelated read spinlock, which means we can't
 891  *          use read spinlocks here.
 892  *
 893  * MPSAFE if nch is
 894  */
 895 struct nchandle *
 896 cache_hold(struct nchandle *nch)
 897 {
 898         _cache_hold(nch->ncp);
 899         atomic_add_int(&nch->mount->mnt_refs, 1);
 900         return(nch);
 901 }
 902
 903 /*
 904  * Create a copy of a namecache handle for an already-referenced
 905  * entry.
 906  *
 907  * MPSAFE if nch is
 908  */
 909 void
 910 cache_copy(struct nchandle *nch, struct nchandle *target)
 911 {
 912         *target = *nch;
 913         if (target->ncp)
 914                 _cache_hold(target->ncp);
 915         atomic_add_int(&nch->mount->mnt_refs, 1);
 916 }
 917
 918 /*
 919  * MPSAFE if nch is
 920  */
 921 void
 922 cache_changemount(struct nchandle *nch, struct mount *mp)
 923 {
 924         atomic_add_int(&nch->mount->mnt_refs, -1);
 925         nch->mount = mp;
 926         atomic_add_int(&nch->mount->mnt_refs, 1);
 927 }
 928
 929 void
 930 cache_drop(struct nchandle *nch)
 931 {
 932         atomic_add_int(&nch->mount->mnt_refs, -1);
 933         _cache_drop(nch->ncp);
 934         nch->ncp = NULL;
 935         nch->mount = NULL;
 936 }
 937
 938 int
 939 cache_lockstatus(struct nchandle *nch)
 940 {
 941         return(_cache_lockstatus(nch->ncp));
 942 }
 943
 944 void
 945 cache_lock(struct nchandle *nch)
 946 {
 947         _cache_lock(nch->ncp);
 948 }
 949
 950 void
 951 cache_lock_maybe_shared(struct nchandle *nch, int excl)
 952 {
 953         struct namecache *ncp = nch->ncp;
 954
 955         if (ncp_shared_lock_disable || excl ||
 956             (ncp->nc_flag & NCF_UNRESOLVED)) {
 957                 _cache_lock(ncp);
 958         } else {
 959                 _cache_lock_shared(ncp);
 960                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
 961                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
 962                                 _cache_unlock(ncp);
 963                                 _cache_lock(ncp);
 964                         }
 965                 } else {
 966                         _cache_unlock(ncp);
 967                         _cache_lock(ncp);
 968                 }
 969         }
 970 }
 971
 972 /*
 973  * Relock nch1 given an unlocked nch1 and a locked nch2.  The caller
 974  * is responsible for checking both for validity on return as they
 975  * may have become invalid.
 976  *
 977  * We have to deal with potential deadlocks here, just ping pong
 978  * the lock until we get it (we will always block somewhere when
 979  * looping so this is not cpu-intensive).
 980  *
 981  * which = 0    nch1 not locked, nch2 is locked
 982  * which = 1    nch1 is locked, nch2 is not locked
 983  */
 984 void
 985 cache_relock(struct nchandle *nch1, struct ucred *cred1,
 986              struct nchandle *nch2, struct ucred *cred2)
 987 {
 988         int which;
 989
 990         which = 0;
 991
 992         for (;;) {
 993                 if (which == 0) {
 994                         if (cache_lock_nonblock(nch1) == 0) {
 995                                 cache_resolve(nch1, cred1);
 996                                 break;
 997                         }
 998                         cache_unlock(nch2);
 999                         cache_lock(nch1);
1000                         cache_resolve(nch1, cred1);
1001                         which = 1;
1002                 } else {
1003                         if (cache_lock_nonblock(nch2) == 0) {
1004                                 cache_resolve(nch2, cred2);
1005                                 break;
1006                         }
1007                         cache_unlock(nch1);
1008                         cache_lock(nch2);
1009                         cache_resolve(nch2, cred2);
1010                         which = 0;
1011                 }
1012         }
1013 }
1014
1015 int
1016 cache_lock_nonblock(struct nchandle *nch)
1017 {
1018         return(_cache_lock_nonblock(nch->ncp));
1019 }
1020
1021 void
1022 cache_unlock(struct nchandle *nch)
1023 {
1024         _cache_unlock(nch->ncp);
1025 }
1026
1027 /*
1028  * ref-and-lock, unlock-and-deref functions.
1029  *
1030  * This function is primarily used by nlookup.  Even though cache_lock
1031  * holds the vnode, it is possible that the vnode may have already
1032  * initiated a recyclement.
1033  *
1034  * We want cache_get() to return a definitively usable vnode or a
1035  * definitively unresolved ncp.
1036  */
1037 static
1038 struct namecache *
1039 _cache_get(struct namecache *ncp)
1040 {
1041         _cache_hold(ncp);
1042         _cache_lock(ncp);
1043         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1044                 _cache_setunresolved(ncp);
1045         return(ncp);
1046 }
1047
1048 /*
1049  * Attempt to obtain a shared lock on the ncp.  A shared lock will only
1050  * be obtained if the ncp is resolved and the vnode (if not ENOENT) is
1051  * valid.  Otherwise an exclusive lock will be acquired instead.
1052  */
1053 static
1054 struct namecache *
1055 _cache_get_maybe_shared(struct namecache *ncp, int excl)
1056 {
1057         if (ncp_shared_lock_disable || excl ||
1058             (ncp->nc_flag & NCF_UNRESOLVED)) {
1059                 return(_cache_get(ncp));
1060         }
1061         _cache_hold(ncp);
1062         _cache_lock_shared(ncp);
1063         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1064                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED)) {
1065                         _cache_unlock(ncp);
1066                         ncp = _cache_get(ncp);
1067                         _cache_drop(ncp);
1068                 }
1069         } else {
1070                 _cache_unlock(ncp);
1071                 ncp = _cache_get(ncp);
1072                 _cache_drop(ncp);
1073         }
1074         return(ncp);
1075 }
1076
1077 /*
1078  * This is a special form of _cache_lock() which only succeeds if
1079  * it can get a pristine, non-recursive lock.  The caller must have
1080  * already ref'd the ncp.
1081  *
1082  * On success the ncp will be locked, on failure it will not.  The
1083  * ref count does not change either way.
1084  *
1085  * We want _cache_lock_special() (on success) to return a definitively
1086  * usable vnode or a definitively unresolved ncp.
1087  */
1088 static int
1089 _cache_lock_special(struct namecache *ncp)
1090 {
1091         if (_cache_lock_nonblock(ncp) == 0) {
1092                 if ((ncp->nc_lockstatus &
1093                      ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == 1) {
1094                         if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
1095                                 _cache_setunresolved(ncp);
1096                         return(0);
1097                 }
1098                 _cache_unlock(ncp);
1099         }
1100         return(EWOULDBLOCK);
1101 }
1102
1103 static int
1104 _cache_lock_shared_special(struct namecache *ncp)
1105 {
1106         if (_cache_lock_shared_nonblock(ncp) == 0) {
1107                 if ((ncp->nc_lockstatus &
1108                      ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ)) == (NC_SHLOCK_FLAG | 1)) {
1109                         if (ncp->nc_vp == NULL ||
1110                             (ncp->nc_vp->v_flag & VRECLAIMED) == 0) {
1111                                 return(0);
1112                         }
1113                 }
1114                 _cache_unlock(ncp);
1115         }
1116         return(EWOULDBLOCK);
1117 }
1118
1119
1120 /*
1121  * NOTE: The same nchandle can be passed for both arguments.
1122  */
1123 void
1124 cache_get(struct nchandle *nch, struct nchandle *target)
1125 {
1126         KKASSERT(nch->ncp->nc_refs > 0);
1127         target->mount = nch->mount;
1128         target->ncp = _cache_get(nch->ncp);
1129         atomic_add_int(&target->mount->mnt_refs, 1);
1130 }
1131
1132 void
1133 cache_get_maybe_shared(struct nchandle *nch, struct nchandle *target, int excl)
1134 {
1135         KKASSERT(nch->ncp->nc_refs > 0);
1136         target->mount = nch->mount;
1137         target->ncp = _cache_get_maybe_shared(nch->ncp, excl);
1138         atomic_add_int(&target->mount->mnt_refs, 1);
1139 }
1140
/*
 * Unlock-and-deref: release the lock on ncp, then drop one reference.
 */
static __inline
void
_cache_put(struct namecache *ncp)
{
	_cache_unlock(ncp);

	_cache_drop(ncp);
}
1151
1152 /*
1153  *
1154  */
1155 void
1156 cache_put(struct nchandle *nch)
1157 {
1158         atomic_add_int(&nch->mount->mnt_refs, -1);
1159         _cache_put(nch->ncp);
1160         nch->ncp = NULL;
1161         nch->mount = NULL;
1162 }
1163
1164 /*
1165  * Resolve an unresolved ncp by associating a vnode with it.  If the
1166  * vnode is NULL, a negative cache entry is created.
1167  *
1168  * The ncp should be locked on entry and will remain locked on return.
1169  */
1170 static
1171 void
1172 _cache_setvp(struct mount *mp, struct namecache *ncp, struct vnode *vp)
1173 {
1174         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
1175         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1176
1177         if (vp != NULL) {
1178                 /*
1179                  * Any vp associated with an ncp which has children must
1180                  * be held.  Any vp associated with a locked ncp must be held.
1181                  */
1182                 if (!TAILQ_EMPTY(&ncp->nc_list))
1183                         vhold(vp);
1184                 spin_lock(&vp->v_spin);
1185                 ncp->nc_vp = vp;
1186                 TAILQ_INSERT_HEAD(&vp->v_namecache, ncp, nc_vnode);
1187                 spin_unlock(&vp->v_spin);
1188                 if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
1189                         vhold(vp);
1190
1191                 /*
1192                  * Set auxiliary flags
1193                  */
1194                 switch(vp->v_type) {
1195                 case VDIR:
1196                         ncp->nc_flag |= NCF_ISDIR;
1197                         break;
1198                 case VLNK:
1199                         ncp->nc_flag |= NCF_ISSYMLINK;
1200                         /* XXX cache the contents of the symlink */
1201                         break;
1202                 default:
1203                         break;
1204                 }
1205                 atomic_add_int(&numcache, 1);
1206                 ncp->nc_error = 0;
1207                 /* XXX: this is a hack to work-around the lack of a real pfs vfs
1208                  * implementation*/
1209                 if (mp != NULL)
1210                         if (strncmp(mp->mnt_stat.f_fstypename, "null", 5) == 0)
1211                                 vp->v_pfsmp = mp;
1212         } else {
1213                 /*
1214                  * When creating a negative cache hit we set the
1215                  * namecache_gen.  A later resolve will clean out the
1216                  * negative cache hit if the mount point's namecache_gen
1217                  * has changed.  Used by devfs, could also be used by
1218                  * other remote FSs.
1219                  */
1220                 ncp->nc_vp = NULL;
1221                 spin_lock(&ncspin);
1222                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
1223                 ++numneg;
1224                 spin_unlock(&ncspin);
1225                 ncp->nc_error = ENOENT;
1226                 if (mp)
1227                         VFS_NCPGEN_SET(mp, ncp);
1228         }
1229         ncp->nc_flag &= ~(NCF_UNRESOLVED | NCF_DEFEREDZAP);
1230 }
1231
1232 /*
1233  *
1234  */
1235 void
1236 cache_setvp(struct nchandle *nch, struct vnode *vp)
1237 {
1238         _cache_setvp(nch->mount, nch->ncp, vp);
1239 }
1240
1241 /*
1242  *
1243  */
1244 void
1245 cache_settimeout(struct nchandle *nch, int nticks)
1246 {
1247         struct namecache *ncp = nch->ncp;
1248
1249         if ((ncp->nc_timeout = ticks + nticks) == 0)
1250                 ncp->nc_timeout = 1;
1251 }
1252
1253 /*
1254  * Disassociate the vnode or negative-cache association and mark a
1255  * namecache entry as unresolved again.  Note that the ncp is still
1256  * left in the hash table and still linked to its parent.
1257  *
1258  * The ncp should be locked and refd on entry and will remain locked and refd
1259  * on return.
1260  *
1261  * This routine is normally never called on a directory containing children.
1262  * However, NFS often does just that in its rename() code as a cop-out to
1263  * avoid complex namespace operations.  This disconnects a directory vnode
1264  * from its namecache and can cause the OLDAPI and NEWAPI to get out of
1265  * sync.
1266  *
1267  */
1268 static
1269 void
1270 _cache_setunresolved(struct namecache *ncp)
1271 {
1272         struct vnode *vp;
1273
1274         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1275                 ncp->nc_flag |= NCF_UNRESOLVED;
1276                 ncp->nc_timeout = 0;
1277                 ncp->nc_error = ENOTCONN;
1278                 if ((vp = ncp->nc_vp) != NULL) {
1279                         atomic_add_int(&numcache, -1);
1280                         spin_lock(&vp->v_spin);
1281                         ncp->nc_vp = NULL;
1282                         TAILQ_REMOVE(&vp->v_namecache, ncp, nc_vnode);
1283                         spin_unlock(&vp->v_spin);
1284
1285                         /*
1286                          * Any vp associated with an ncp with children is
1287                          * held by that ncp.  Any vp associated with a locked
1288                          * ncp is held by that ncp.  These conditions must be
1289                          * undone when the vp is cleared out from the ncp.
1290                          */
1291                         if (!TAILQ_EMPTY(&ncp->nc_list))
1292                                 vdrop(vp);
1293                         if (ncp->nc_lockstatus & ~(NC_EXLOCK_REQ|NC_SHLOCK_REQ))
1294                                 vdrop(vp);
1295                 } else {
1296                         spin_lock(&ncspin);
1297                         TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
1298                         --numneg;
1299                         spin_unlock(&ncspin);
1300                 }
1301                 ncp->nc_flag &= ~(NCF_WHITEOUT|NCF_ISDIR|NCF_ISSYMLINK);
1302         }
1303 }
1304
1305 /*
1306  * The cache_nresolve() code calls this function to automatically
1307  * set a resolved cache element to unresolved if it has timed out
1308  * or if it is a negative cache hit and the mount point namecache_gen
1309  * has changed.
1310  */
1311 static __inline int
1312 _cache_auto_unresolve_test(struct mount *mp, struct namecache *ncp)
1313 {
1314         /*
1315          * Try to zap entries that have timed out.  We have
1316          * to be careful here because locked leafs may depend
1317          * on the vnode remaining intact in a parent, so only
1318          * do this under very specific conditions.
1319          */
1320         if (ncp->nc_timeout && (int)(ncp->nc_timeout - ticks) < 0 &&
1321             TAILQ_EMPTY(&ncp->nc_list)) {
1322                 return 1;
1323         }
1324
1325         /*
1326          * If a resolved negative cache hit is invalid due to
1327          * the mount's namecache generation being bumped, zap it.
1328          */
1329         if (ncp->nc_vp == NULL && VFS_NCPGEN_TEST(mp, ncp)) {
1330                 return 1;
1331         }
1332
1333         /*
1334          * Otherwise we are good
1335          */
1336         return 0;
1337 }
1338
1339 static __inline void
1340 _cache_auto_unresolve(struct mount *mp, struct namecache *ncp)
1341 {
1342         /*
1343          * Already in an unresolved state, nothing to do.
1344          */
1345         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
1346                 if (_cache_auto_unresolve_test(mp, ncp))
1347                         _cache_setunresolved(ncp);
1348         }
1349 }
1350
1351 /*
1352  *
1353  */
1354 void
1355 cache_setunresolved(struct nchandle *nch)
1356 {
1357         _cache_setunresolved(nch->ncp);
1358 }
1359
1360 /*
1361  * Determine if we can clear NCF_ISMOUNTPT by scanning the mountlist
1362  * looking for matches.  This flag tells the lookup code when it must
1363  * check for a mount linkage and also prevents the directories in question
1364  * from being deleted or renamed.
1365  */
1366 static
1367 int
1368 cache_clrmountpt_callback(struct mount *mp, void *data)
1369 {
1370         struct nchandle *nch = data;
1371
1372         if (mp->mnt_ncmounton.ncp == nch->ncp)
1373                 return(1);
1374         if (mp->mnt_ncmountpt.ncp == nch->ncp)
1375                 return(1);
1376         return(0);
1377 }
1378
1379 /*
1380  *
1381  */
1382 void
1383 cache_clrmountpt(struct nchandle *nch)
1384 {
1385         int count;
1386
1387         count = mountlist_scan(cache_clrmountpt_callback, nch,
1388                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
1389         if (count == 0)
1390                 nch->ncp->nc_flag &= ~NCF_ISMOUNTPT;
1391 }
1392
1393 /*
1394  * Invalidate portions of the namecache topology given a starting entry.
1395  * The passed ncp is set to an unresolved state and:
1396  *
 1397  * The passed ncp must be referenced and locked.  The routine may unlock
1398  * and relock ncp several times, and will recheck the children and loop
1399  * to catch races.  When done the passed ncp will be returned with the
1400  * reference and lock intact.
1401  *
1402  * CINV_DESTROY         - Set a flag in the passed ncp entry indicating
1403  *                        that the physical underlying nodes have been
1404  *                        destroyed... as in deleted.  For example, when
1405  *                        a directory is removed.  This will cause record
1406  *                        lookups on the name to no longer be able to find
1407  *                        the record and tells the resolver to return failure
 1408  *                        rather than trying to resolve through the parent.
1409  *
1410  *                        The topology itself, including ncp->nc_name,
1411  *                        remains intact.
1412  *
1413  *                        This only applies to the passed ncp, if CINV_CHILDREN
1414  *                        is specified the children are not flagged.
1415  *
1416  * CINV_CHILDREN        - Set all children (recursively) to an unresolved
1417  *                        state as well.
1418  *
1419  *                        Note that this will also have the side effect of
1420  *                        cleaning out any unreferenced nodes in the topology
1421  *                        from the leaves up as the recursion backs out.
1422  *
1423  * Note that the topology for any referenced nodes remains intact, but
1424  * the nodes will be marked as having been destroyed and will be set
1425  * to an unresolved state.
1426  *
1427  * It is possible for cache_inval() to race a cache_resolve(), meaning that
1428  * the namecache entry may not actually be invalidated on return if it was
 1429  * revalidated while recursing down into its children.  This code guarantees
 1430  * that the node(s) will go through an invalidation cycle, but does not
 1431  * guarantee that they will remain in an invalidated state.
1432  *
1433  * Returns non-zero if a revalidation was detected during the invalidation
1434  * recursion, zero otherwise.  Note that since only the original ncp is
1435  * locked the revalidation ultimately can only indicate that the original ncp
 1436  * *MIGHT* now have been reresolved.
1437  *
1438  * DEEP RECURSION HANDLING - If a recursive invalidation recurses deeply we
1439  * have to avoid blowing out the kernel stack.  We do this by saving the
1440  * deep namecache node and aborting the recursion, then re-recursing at that
1441  * node using a depth-first algorithm in order to allow multiple deep
1442  * recursions to chain through each other, then we restart the invalidation
1443  * from scratch.
1444  */
1445
 1446 struct cinvtrack {
 1447         struct namecache *resume_ncp;	/* node where an aborted deep recursion must be resumed (held by _cache_inval_internal), or NULL */
 1448         int depth;			/* current nesting depth of _cache_inval_internal() */
 1449 };
 1450
 1451 static int _cache_inval_internal(struct namecache *, int, struct cinvtrack *);
1452
1453 static
1454 int
1455 _cache_inval(struct namecache *ncp, int flags)
1456 {
1457         struct cinvtrack track;
1458         struct namecache *ncp2;
1459         int r;
1460
1461         track.depth = 0;
1462         track.resume_ncp = NULL;
1463
1464         for (;;) {
1465                 r = _cache_inval_internal(ncp, flags, &track);
1466                 if (track.resume_ncp == NULL)
1467                         break;
1468                 kprintf("Warning: deep namecache recursion at %s\n",
1469                         ncp->nc_name);
1470                 _cache_unlock(ncp);
1471                 while ((ncp2 = track.resume_ncp) != NULL) {
1472                         track.resume_ncp = NULL;
1473                         _cache_lock(ncp2);
1474                         _cache_inval_internal(ncp2, flags & ~CINV_DESTROY,
1475                                              &track);
1476                         _cache_put(ncp2);
1477                 }
1478                 _cache_lock(ncp);
1479         }
1480         return(r);
1481 }
1482
1483 int
1484 cache_inval(struct nchandle *nch, int flags)
1485 {
1486         return(_cache_inval(nch->ncp, flags));
1487 }
1488
1489 /*
1490  * Helper for _cache_inval().  The passed ncp is refd and locked and
1491  * remains that way on return, but may be unlocked/relocked multiple
1492  * times by the routine.
1493  */
1494 static int
1495 _cache_inval_internal(struct namecache *ncp, int flags, struct cinvtrack *track)
1496 {
1497         struct namecache *kid;
1498         struct namecache *nextkid;
1499         int rcnt = 0;
1500
1501         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
1502
1503         _cache_setunresolved(ncp);
1504         if (flags & CINV_DESTROY)
1505                 ncp->nc_flag |= NCF_DESTROYED;
1506         if ((flags & CINV_CHILDREN) &&
1507             (kid = TAILQ_FIRST(&ncp->nc_list)) != NULL
1508         ) {
1509                 _cache_hold(kid);
1510                 if (++track->depth > MAX_RECURSION_DEPTH) {
1511                         track->resume_ncp = ncp;
1512                         _cache_hold(ncp);
1513                         ++rcnt;
1514                 }
1515                 _cache_unlock(ncp);
1516                 while (kid) {
1517                         if (track->resume_ncp) {
1518                                 _cache_drop(kid);
1519                                 break;
1520                         }
1521                         if ((nextkid = TAILQ_NEXT(kid, nc_entry)) != NULL)
1522                                 _cache_hold(nextkid);
1523                         if ((kid->nc_flag & NCF_UNRESOLVED) == 0 ||
1524                             TAILQ_FIRST(&kid->nc_list)
1525                         ) {
1526                                 _cache_lock(kid);
1527                                 rcnt += _cache_inval_internal(kid, flags & ~CINV_DESTROY, track);
1528                                 _cache_unlock(kid);
1529                         }
1530                         _cache_drop(kid);
1531                         kid = nextkid;
1532                 }
1533                 --track->depth;
1534                 _cache_lock(ncp);
1535         }
1536
1537         /*
1538          * Someone could have gotten in there while ncp was unlocked,
1539          * retry if so.
1540          */
1541         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
1542                 ++rcnt;
1543         return (rcnt);
1544 }
1545
1546 /*
1547  * Invalidate a vnode's namecache associations.  To avoid races against
1548  * the resolver we do not invalidate a node which we previously invalidated
1549  * but which was then re-resolved while we were in the invalidation loop.
1550  *
1551  * Returns non-zero if any namecache entries remain after the invalidation
1552  * loop completed.
1553  *
1554  * NOTE: Unlike the namecache topology which guarentees that ncp's will not
1555  *       be ripped out of the topology while held, the vnode's v_namecache
1556  *       list has no such restriction.  NCP's can be ripped out of the list
1557  *       at virtually any time if not locked, even if held.
1558  *
1559  *       In addition, the v_namecache list itself must be locked via
1560  *       the vnode's spinlock.
1561  */
1562 int
1563 cache_inval_vp(struct vnode *vp, int flags)
1564 {
1565         struct namecache *ncp;
1566         struct namecache *next;
1567
1568 restart:
1569         spin_lock(&vp->v_spin);
1570         ncp = TAILQ_FIRST(&vp->v_namecache);
1571         if (ncp)
1572                 _cache_hold(ncp);
1573         while (ncp) {
1574                 /* loop entered with ncp held and vp spin-locked */
1575                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1576                         _cache_hold(next);
1577                 spin_unlock(&vp->v_spin);
1578                 _cache_lock(ncp);
1579                 if (ncp->nc_vp != vp) {
1580                         kprintf("Warning: cache_inval_vp: race-A detected on "
1581                                 "%s\n", ncp->nc_name);
1582                         _cache_put(ncp);
1583                         if (next)
1584                                 _cache_drop(next);
1585                         goto restart;
1586                 }
1587                 _cache_inval(ncp, flags);
1588                 _cache_put(ncp);                /* also releases reference */
1589                 ncp = next;
1590                 spin_lock(&vp->v_spin);
1591                 if (ncp && ncp->nc_vp != vp) {
1592                         spin_unlock(&vp->v_spin);
1593                         kprintf("Warning: cache_inval_vp: race-B detected on "
1594                                 "%s\n", ncp->nc_name);
1595                         _cache_drop(ncp);
1596                         goto restart;
1597                 }
1598         }
1599         spin_unlock(&vp->v_spin);
1600         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1601 }
1602
1603 /*
1604  * This routine is used instead of the normal cache_inval_vp() when we
1605  * are trying to recycle otherwise good vnodes.
1606  *
1607  * Return 0 on success, non-zero if not all namecache records could be
1608  * disassociated from the vnode (for various reasons).
1609  */
1610 int
1611 cache_inval_vp_nonblock(struct vnode *vp)
1612 {
1613         struct namecache *ncp;
1614         struct namecache *next;
1615
1616         spin_lock(&vp->v_spin);
1617         ncp = TAILQ_FIRST(&vp->v_namecache);
1618         if (ncp)
1619                 _cache_hold(ncp);
1620         while (ncp) {
1621                 /* loop entered with ncp held */
1622                 if ((next = TAILQ_NEXT(ncp, nc_vnode)) != NULL)
1623                         _cache_hold(next);
1624                 spin_unlock(&vp->v_spin);
1625                 if (_cache_lock_nonblock(ncp)) {
1626                         _cache_drop(ncp);
1627                         if (next)
1628                                 _cache_drop(next);
1629                         goto done;
1630                 }
1631                 if (ncp->nc_vp != vp) {
1632                         kprintf("Warning: cache_inval_vp: race-A detected on "
1633                                 "%s\n", ncp->nc_name);
1634                         _cache_put(ncp);
1635                         if (next)
1636                                 _cache_drop(next);
1637                         goto done;
1638                 }
1639                 _cache_inval(ncp, 0);
1640                 _cache_put(ncp);                /* also releases reference */
1641                 ncp = next;
1642                 spin_lock(&vp->v_spin);
1643                 if (ncp && ncp->nc_vp != vp) {
1644                         spin_unlock(&vp->v_spin);
1645                         kprintf("Warning: cache_inval_vp: race-B detected on "
1646                                 "%s\n", ncp->nc_name);
1647                         _cache_drop(ncp);
1648                         goto done;
1649                 }
1650         }
1651         spin_unlock(&vp->v_spin);
1652 done:
1653         return(TAILQ_FIRST(&vp->v_namecache) != NULL);
1654 }
1655
1656 /*
1657  * The source ncp has been renamed to the target ncp.  Both fncp and tncp
1658  * must be locked.  The target ncp is destroyed (as a normal rename-over
1659  * would destroy the target file or directory).
1660  *
1661  * Because there may be references to the source ncp we cannot copy its
1662  * contents to the target.  Instead the source ncp is relinked as the target
1663  * and the target ncp is removed from the namecache topology.
1664  */
1665 void
1666 cache_rename(struct nchandle *fnch, struct nchandle *tnch)
1667 {
1668         struct namecache *fncp = fnch->ncp;
1669         struct namecache *tncp = tnch->ncp;
1670         struct namecache *tncp_par;
1671         struct nchash_head *nchpp;
1672         u_int32_t hash;
1673         char *oname;
1674         char *nname;
1675
1676         if (tncp->nc_nlen) {
1677                 nname = kmalloc(tncp->nc_nlen + 1, M_VFSCACHE, M_WAITOK);
1678                 bcopy(tncp->nc_name, nname, tncp->nc_nlen);
1679                 nname[tncp->nc_nlen] = 0;
1680         } else {
1681                 nname = NULL;
1682         }
1683
1684         /*
1685          * Rename fncp (unlink)
1686          */
1687         _cache_unlink_parent(fncp);
1688         oname = fncp->nc_name;
1689         fncp->nc_name = nname;
1690         fncp->nc_nlen = tncp->nc_nlen;
1691         if (oname)
1692                 kfree(oname, M_VFSCACHE);
1693
1694         tncp_par = tncp->nc_parent;
1695         _cache_hold(tncp_par);
1696         _cache_lock(tncp_par);
1697
1698         /*
1699          * Rename fncp (relink)
1700          */
1701         hash = fnv_32_buf(fncp->nc_name, fncp->nc_nlen, FNV1_32_INIT);
1702         hash = fnv_32_buf(&tncp_par, sizeof(tncp_par), hash);
1703         nchpp = NCHHASH(hash);
1704
1705         spin_lock(&nchpp->spin);
1706         _cache_link_parent(fncp, tncp_par, nchpp);
1707         spin_unlock(&nchpp->spin);
1708
1709         _cache_put(tncp_par);
1710
1711         /*
1712          * Get rid of the overwritten tncp (unlink)
1713          */
1714         _cache_unlink(tncp);
1715 }
1716
1717 /*
1718  * Perform actions consistent with unlinking a file.  The passed-in ncp
1719  * must be locked.
1720  *
1721  * The ncp is marked DESTROYED so it no longer shows up in searches,
1722  * and will be physically deleted when the vnode goes away.
1723  *
1724  * If the related vnode has no refs then we cycle it through vget()/vput()
1725  * to (possibly if we don't have a ref race) trigger a deactivation,
1726  * allowing the VFS to trivially detect and recycle the deleted vnode
1727  * via VOP_INACTIVE().
1728  *
1729  * NOTE: _cache_rename() will automatically call _cache_unlink() on the
1730  *       target ncp.
1731  */
1732 void
1733 cache_unlink(struct nchandle *nch)
1734 {
1735         _cache_unlink(nch->ncp);
1736 }
1737
1738 static void
1739 _cache_unlink(struct namecache *ncp)
1740 {
1741         struct vnode *vp;
1742
1743         /*
1744          * Causes lookups to fail and allows another ncp with the same
1745          * name to be created under ncp->nc_parent.
1746          */
1747         ncp->nc_flag |= NCF_DESTROYED;
1748
1749         /*
1750          * Attempt to trigger a deactivation.
1751          */
1752         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
1753             (vp = ncp->nc_vp) != NULL &&
1754             !sysref_isactive(&vp->v_sysref)) {
1755                 if (vget(vp, LK_SHARED) == 0)
1756                         vput(vp);
1757         }
1758 }
1759
1760 /*
1761  * vget the vnode associated with the namecache entry.  Resolve the namecache
1762  * entry if necessary.  The passed ncp must be referenced and locked.
1763  *
1764  * lk_type may be LK_SHARED, LK_EXCLUSIVE.  A ref'd, possibly locked
1765  * (depending on the passed lk_type) will be returned in *vpp with an error
1766  * of 0, or NULL will be returned in *vpp with a non-0 error code.  The
1767  * most typical error is ENOENT, meaning that the ncp represents a negative
1768  * cache hit and there is no vnode to retrieve, but other errors can occur
1769  * too.
1770  *
1771  * The vget() can race a reclaim.  If this occurs we re-resolve the
1772  * namecache entry.
1773  *
1774  * There are numerous places in the kernel where vget() is called on a
1775  * vnode while one or more of its namecache entries is locked.  Releasing
1776  * a vnode never deadlocks against locked namecache entries (the vnode
1777  * will not get recycled while referenced ncp's exist).  This means we
1778  * can safely acquire the vnode.  In fact, we MUST NOT release the ncp
1779  * lock when acquiring the vp lock or we might cause a deadlock.
1780  *
1781  * NOTE: The passed-in ncp must be locked exclusively if it is initially
1782  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
1783  *       relocked exclusively before being re-resolved.
1784  */
1785 int
1786 cache_vget(struct nchandle *nch, struct ucred *cred,
1787            int lk_type, struct vnode **vpp)
1788 {
1789         struct namecache *ncp;
1790         struct vnode *vp;
1791         int error;
1792
1793         ncp = nch->ncp;
1794 again:
1795         vp = NULL;
1796         if (ncp->nc_flag & NCF_UNRESOLVED)
1797                 error = cache_resolve(nch, cred);
1798         else
1799                 error = 0;
1800
1801         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1802                 error = vget(vp, lk_type);
1803                 if (error) {
1804                         /*
1805                          * VRECLAIM race
1806                          */
1807                         if (error == ENOENT) {
1808                                 kprintf("Warning: vnode reclaim race detected "
1809                                         "in cache_vget on %p (%s)\n",
1810                                         vp, ncp->nc_name);
1811                                 _cache_unlock(ncp);
1812                                 _cache_lock(ncp);
1813                                 _cache_setunresolved(ncp);
1814                                 goto again;
1815                         }
1816
1817                         /*
1818                          * Not a reclaim race, some other error.
1819                          */
1820                         KKASSERT(ncp->nc_vp == vp);
1821                         vp = NULL;
1822                 } else {
1823                         KKASSERT(ncp->nc_vp == vp);
1824                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1825                 }
1826         }
1827         if (error == 0 && vp == NULL)
1828                 error = ENOENT;
1829         *vpp = vp;
1830         return(error);
1831 }
1832
1833 /*
1834  * Similar to cache_vget() but only acquires a ref on the vnode.
1835  *
1836  * NOTE: The passed-in ncp must be locked exclusively if it is initially
1837  *       unresolved.  If a reclaim race occurs the passed-in ncp will be
1838  *       relocked exclusively before being re-resolved.
1839  */
1840 int
1841 cache_vref(struct nchandle *nch, struct ucred *cred, struct vnode **vpp)
1842 {
1843         struct namecache *ncp;
1844         struct vnode *vp;
1845         int error;
1846
1847         ncp = nch->ncp;
1848 again:
1849         vp = NULL;
1850         if (ncp->nc_flag & NCF_UNRESOLVED)
1851                 error = cache_resolve(nch, cred);
1852         else
1853                 error = 0;
1854
1855         if (error == 0 && (vp = ncp->nc_vp) != NULL) {
1856                 error = vget(vp, LK_SHARED);
1857                 if (error) {
1858                         /*
1859                          * VRECLAIM race
1860                          */
1861                         if (error == ENOENT) {
1862                                 kprintf("Warning: vnode reclaim race detected "
1863                                         "in cache_vget on %p (%s)\n",
1864                                         vp, ncp->nc_name);
1865                                 _cache_unlock(ncp);
1866                                 _cache_lock(ncp);
1867                                 _cache_setunresolved(ncp);
1868                                 goto again;
1869                         }
1870
1871                         /*
1872                          * Not a reclaim race, some other error.
1873                          */
1874                         KKASSERT(ncp->nc_vp == vp);
1875                         vp = NULL;
1876                 } else {
1877                         KKASSERT(ncp->nc_vp == vp);
1878                         KKASSERT((vp->v_flag & VRECLAIMED) == 0);
1879                         /* caller does not want a lock */
1880                         vn_unlock(vp);
1881                 }
1882         }
1883         if (error == 0 && vp == NULL)
1884                 error = ENOENT;
1885         *vpp = vp;
1886         return(error);
1887 }
1888
1889 /*
1890  * Return a referenced vnode representing the parent directory of
1891  * ncp.
1892  *
1893  * Because the caller has locked the ncp it should not be possible for
1894  * the parent ncp to go away.  However, the parent can unresolve its
1895  * dvp at any time so we must be able to acquire a lock on the parent
1896  * to safely access nc_vp.
1897  *
1898  * We have to leave par unlocked when vget()ing dvp to avoid a deadlock,
1899  * so use vhold()/vdrop() while holding the lock to prevent dvp from
1900  * getting destroyed.
1901  *
1902  * NOTE: vhold() is allowed when dvp has 0 refs if we hold a
1903  *       lock on the ncp in question..
1904  */
1905 static struct vnode *
1906 cache_dvpref(struct namecache *ncp)
1907 {
1908         struct namecache *par;
1909         struct vnode *dvp;
1910
1911         dvp = NULL;
1912         if ((par = ncp->nc_parent) != NULL) {
1913                 _cache_hold(par);
1914                 _cache_lock(par);
1915                 if ((par->nc_flag & NCF_UNRESOLVED) == 0) {
1916                         if ((dvp = par->nc_vp) != NULL)
1917                                 vhold(dvp);
1918                 }
1919                 _cache_unlock(par);
1920                 if (dvp) {
1921                         if (vget(dvp, LK_SHARED) == 0) {
1922                                 vn_unlock(dvp);
1923                                 vdrop(dvp);
1924                                 /* return refd, unlocked dvp */
1925                         } else {
1926                                 vdrop(dvp);
1927                                 dvp = NULL;
1928                         }
1929                 }
1930                 _cache_drop(par);
1931         }
1932         return(dvp);
1933 }
1934
1935 /*
1936  * Convert a directory vnode to a namecache record without any other
1937  * knowledge of the topology.  This ONLY works with directory vnodes and
1938  * is ONLY used by the NFS server.  dvp must be refd but unlocked, and the
1939  * returned ncp (if not NULL) will be held and unlocked.
1940  *
1941  * If 'makeit' is 0 and dvp has no existing namecache record, NULL is returned.
1942  * If 'makeit' is 1 we attempt to track-down and create the namecache topology
1943  * for dvp.  This will fail only if the directory has been deleted out from
1944  * under the caller.
1945  *
1946  * Callers must always check for a NULL return no matter the value of 'makeit'.
1947  *
 * To avoid overflowing the kernel stack each recursive call increments
 * the makeit variable, and once it exceeds 20 further recursion is replaced
 * by the iterative cache_fromdvp_try() fallback.
1950  */
1951
/*
 * Forward declarations for the static helpers used by cache_fromdvp();
 * both are defined further down in this file.
 */
static int cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
                                  struct vnode *dvp, char *fakename);
static int cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
                                  struct vnode **saved_dvp);
1956
/*
 * cache_fromdvp: find (makeit == 0) or find-or-construct (makeit != 0) a
 * namecache entry for the directory vnode dvp and return it in *nch.
 *
 * nch->mount is loaded from dvp->v_mount.  Returns 0 when nch->ncp ends up
 * non-NULL (in which case it has been held), EINVAL otherwise.
 *
 * A non-zero makeit doubles as the recursion depth: the function calls
 * itself on the parent directory with makeit + 1.
 */
int
cache_fromdvp(struct vnode *dvp, struct ucred *cred, int makeit,
              struct nchandle *nch)
{
        struct vnode *saved_dvp;
        struct vnode *pvp;
        char *fakename;
        int error;

        nch->ncp = NULL;
        nch->mount = dvp->v_mount;
        saved_dvp = NULL;
        fakename = NULL;

        /*
         * Handle the makeit == 0 degenerate case: just report whatever is
         * first on dvp's namecache list (if anything), without trying to
         * build topology.
         */
        if (makeit == 0) {
                spin_lock(&dvp->v_spin);
                nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
                if (nch->ncp)
                        cache_hold(nch);
                spin_unlock(&dvp->v_spin);
        }

        /*
         * Loop until resolution, inside code will break out on error.
         * makeit is never modified inside the loop, so the loop only
         * terminates via break.
         */
        while (makeit) {
                /*
                 * Break out if we successfully acquire a working ncp.
                 * The hold is taken while the spinlock is still held.
                 */
                spin_lock(&dvp->v_spin);
                nch->ncp = TAILQ_FIRST(&dvp->v_namecache);
                if (nch->ncp) {
                        cache_hold(nch);
                        spin_unlock(&dvp->v_spin);
                        break;
                }
                spin_unlock(&dvp->v_spin);

                /*
                 * If dvp is the root of its filesystem it should already
                 * have a namecache pointer associated with it as a side
                 * effect of the mount, but it may have been disassociated.
                 *
                 * nch->ncp is only used as a temporary here: it is put
                 * again right after the resolve and is either cleared
                 * (error) or reloaded at the top of the loop (continue).
                 */
                if (dvp->v_flag & VROOT) {
                        nch->ncp = _cache_get(nch->mount->mnt_ncmountpt.ncp);
                        error = cache_resolve_mp(nch->mount);
                        _cache_put(nch->ncp);
                        if (ncvp_debug) {
                                kprintf("cache_fromdvp: resolve root of mount %p error %d",
                                        dvp->v_mount, error);
                        }
                        if (error) {
                                if (ncvp_debug)
                                        kprintf(" failed\n");
                                nch->ncp = NULL;
                                break;
                        }
                        if (ncvp_debug)
                                kprintf(" succeeded\n");
                        continue;
                }

                /*
                 * If we are recursed too deeply resort to an O(n^2)
                 * algorithm to resolve the namecache topology.  The
                 * resolved pvp is left referenced in saved_dvp to
                 * prevent the tree from being destroyed while we loop.
                 */
                if (makeit > 20) {
                        error = cache_fromdvp_try(dvp, cred, &saved_dvp);
                        if (error) {
                                kprintf("lookupdotdot(longpath) failed %d "
                                       "dvp %p\n", error, dvp);
                                nch->ncp = NULL;
                                break;
                        }
                        continue;
                }

                /*
                 * Get the parent directory and resolve its ncp.  Any
                 * fakename left over from a previous iteration is freed
                 * before the lookup can hand back a new one.
                 */
                if (fakename) {
                        kfree(fakename, M_TEMP);
                        fakename = NULL;
                }
                error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
                                          &fakename);
                if (error) {
                        /* nch->ncp is still NULL from the check above */
                        kprintf("lookupdotdot failed %d dvp %p\n", error, dvp);
                        break;
                }
                vn_unlock(pvp);

                /*
                 * Reuse makeit as a recursion depth counter.  On success
                 * nch will be fully referenced.  The recursive call's
                 * return value is not used; success is judged from
                 * nch->ncp instead.
                 */
                cache_fromdvp(pvp, cred, makeit + 1, nch);
                vrele(pvp);
                if (nch->ncp == NULL)
                        break;

                /*
                 * Do an inefficient scan of pvp (embodied by ncp) to look
                 * for dvp.  This will create a namecache record for dvp on
                 * success.  We loop up to recheck on success.
                 *
                 * ncp and dvp are both held but not locked.
                 *
                 * pvp has already been vrele()'d; it is only printed as a
                 * pointer value below, never dereferenced.
                 */
                error = cache_inefficient_scan(nch, cred, dvp, fakename);
                if (error) {
                        kprintf("cache_fromdvp: scan %p (%s) failed on dvp=%p\n",
                                pvp, nch->ncp->nc_name, dvp);
                        cache_drop(nch);
                        /* nch was NULLed out, reload mount */
                        nch->mount = dvp->v_mount;
                        break;
                }
                if (ncvp_debug) {
                        kprintf("cache_fromdvp: scan %p (%s) succeeded\n",
                                pvp, nch->ncp->nc_name);
                }
                /*
                 * Drop the parent's ncp; the next pass picks up dvp's own
                 * record from its v_namecache list.
                 */
                cache_drop(nch);
                /* nch was NULLed out, reload mount */
                nch->mount = dvp->v_mount;
        }

        /*
         * If nch->ncp is non-NULL it will have been held already.
         * Release the temporary name buffer and the vnode reference left
         * behind by cache_fromdvp_try(), if any.
         */
        if (fakename)
                kfree(fakename, M_TEMP);
        if (saved_dvp)
                vrele(saved_dvp);
        if (nch->ncp)
                return (0);
        return (EINVAL);
}
2099
2100 /*
2101  * Go up the chain of parent directories until we find something
2102  * we can resolve into the namecache.  This is very inefficient.
2103  */
2104 static
2105 int
2106 cache_fromdvp_try(struct vnode *dvp, struct ucred *cred,
2107                   struct vnode **saved_dvp)
2108 {
2109         struct nchandle nch;
2110         struct vnode *pvp;
2111         int error;
2112         static time_t last_fromdvp_report;
2113         char *fakename;
2114
2115         /*
2116          * Loop getting the parent directory vnode until we get something we
2117          * can resolve in the namecache.
2118          */
2119         vref(dvp);
2120         nch.mount = dvp->v_mount;
2121         nch.ncp = NULL;
2122         fakename = NULL;
2123
2124         for (;;) {
2125                 if (fakename) {
2126                         kfree(fakename, M_TEMP);
2127                         fakename = NULL;
2128                 }
2129                 error = vop_nlookupdotdot(*dvp->v_ops, dvp, &pvp, cred,
2130                                           &fakename);
2131                 if (error) {
2132                         vrele(dvp);
2133                         break;
2134                 }
2135                 vn_unlock(pvp);
2136                 spin_lock(&pvp->v_spin);
2137                 if ((nch.ncp = TAILQ_FIRST(&pvp->v_namecache)) != NULL) {
2138                         _cache_hold(nch.ncp);
2139                         spin_unlock(&pvp->v_spin);
2140                         vrele(pvp);
2141                         break;
2142                 }
2143                 spin_unlock(&pvp->v_spin);
2144                 if (pvp->v_flag & VROOT) {
2145                         nch.ncp = _cache_get(pvp->v_mount->mnt_ncmountpt.ncp);
2146                         error = cache_resolve_mp(nch.mount);
2147                         _cache_unlock(nch.ncp);
2148                         vrele(pvp);
2149                         if (error) {
2150                                 _cache_drop(nch.ncp);
2151                                 nch.ncp = NULL;
2152                                 vrele(dvp);
2153                         }
2154                         break;
2155                 }
2156                 vrele(dvp);
2157                 dvp = pvp;
2158         }
2159         if (error == 0) {
2160                 if (last_fromdvp_report != time_second) {
2161                         last_fromdvp_report = time_second;
2162                         kprintf("Warning: extremely inefficient path "
2163                                 "resolution on %s\n",
2164                                 nch.ncp->nc_name);
2165                 }
2166                 error = cache_inefficient_scan(&nch, cred, dvp, fakename);
2167
2168                 /*
2169                  * Hopefully dvp now has a namecache record associated with
2170                  * it.  Leave it referenced to prevent the kernel from
2171                  * recycling the vnode.  Otherwise extremely long directory
2172                  * paths could result in endless recycling.
2173                  */
2174                 if (*saved_dvp)
2175                     vrele(*saved_dvp);
2176                 *saved_dvp = dvp;
2177                 _cache_drop(nch.ncp);
2178         }
2179         if (fakename)
2180                 kfree(fakename, M_TEMP);
2181         return (error);
2182 }
2183
/*
 * Do an inefficient scan of the directory represented by ncp looking for
 * the directory vnode dvp.  ncp must be held but not locked on entry and
 * will be held on return.  dvp must be refd but not locked on entry and
 * will remain refd on return.
 *
 * Why do this at all?  Well, due to its stateless nature the NFS server
 * converts file handles directly to vnodes without necessarily going through
 * the namecache ops that would otherwise create the namecache topology
 * leading to the vnode.  We could either (1) Change the namecache algorithms
 * to allow disconnected namecache records that are re-merged
 * opportunistically, or (2) Make the NFS server backtrack and scan to
 * recover a connected namecache topology in order to then be able to issue
 * new API lookups.
 *
 * It turns out that (1) is a huge mess.  It takes a nice clean set of
 * namecache algorithms and introduces a lot of complication in every subsystem
 * that calls into the namecache to deal with the re-merge case, especially
 * since we are using the namecache to placehold negative lookups and the
 * vnode might not be immediately assigned. (2) is certainly far less
 * efficient than (1), but since we are only talking about directories here
 * (which are likely to remain cached), the case does not actually run all
 * that often and has the supreme advantage of not polluting the namecache
 * algorithms.
 *
 * If a fakename is supplied just construct a namecache entry using the
 * fake name.
 *
 * Returns 0 on success.  Errors from VOP_GETATTR() and cache_vref() are
 * returned as-is, ENOENT is returned when no matching directory entry was
 * found, and the located ncp's nc_error is returned when that ncp ends up
 * without a vnode.
 */
static int
cache_inefficient_scan(struct nchandle *nch, struct ucred *cred,
                       struct vnode *dvp, char *fakename)
{
        struct nlcomponent nlc;
        struct nchandle rncp;
        struct dirent *den;
        struct vnode *pvp;
        struct vattr vat;
        struct iovec iov;
        struct uio uio;
        int blksize;
        int eofflag;
        int bytes;
        char *rbuf;
        int error;

        /*
         * Fetch dvp's attributes; va_fileid is what directory entries are
         * matched against and va_blocksize sizes the readdir buffer.
         */
        vat.va_blocksize = 0;
        if ((error = VOP_GETATTR(dvp, &vat)) != 0)
                return (error);

        /*
         * Obtain a referenced vnode (pvp) for the directory to be scanned.
         * nch is only locked for the duration of the cache_vref() call.
         */
        cache_lock(nch);
        error = cache_vref(nch, cred, &pvp);
        cache_unlock(nch);
        if (error)
                return (error);
        if (ncvp_debug) {
                kprintf("inefficient_scan: directory iosize %ld "
                        "vattr fileid = %lld\n",
                        vat.va_blocksize,
                        (long long)vat.va_fileid);
        }

        /*
         * Use the supplied fakename if not NULL.  Fake names are typically
         * not in the actual filesystem hierarchy.  This is used by HAMMER
         * to glue @@timestamp recursions together.
         *
         * This path skips the directory read entirely; rbuf is never
         * allocated, which is why the jump lands past the kfree().
         */
        if (fakename) {
                nlc.nlc_nameptr = fakename;
                nlc.nlc_namelen = strlen(fakename);
                rncp = cache_nlookup(nch, &nlc);
                goto done;
        }

        /* fall back to DEV_BSIZE if the attributes gave no block size */
        if ((blksize = vat.va_blocksize) == 0)
                blksize = DEV_BSIZE;
        rbuf = kmalloc(blksize, M_TEMP, M_WAITOK);
        rncp.ncp = NULL;

        eofflag = 0;
        uio.uio_offset = 0;
again:
        /*
         * Re-arm the uio for one buffer's worth of directory data.
         * uio_offset is not reset here, so each pass continues from where
         * the previous VOP_READDIR() left it.
         */
        iov.iov_base = rbuf;
        iov.iov_len = blksize;
        uio.uio_iov = &iov;
        uio.uio_iovcnt = 1;
        uio.uio_resid = blksize;
        uio.uio_segflg = UIO_SYSSPACE;
        uio.uio_rw = UIO_READ;
        uio.uio_td = curthread;

        if (ncvp_debug >= 2)
                kprintf("cache_inefficient_scan: readdir @ %08x\n", (int)uio.uio_offset);
        error = VOP_READDIR(pvp, &uio, cred, &eofflag, NULL, NULL);
        if (error == 0) {
                den = (struct dirent *)rbuf;
                bytes = blksize - uio.uio_resid;

                /*
                 * Walk the returned entries looking for a non-whiteout
                 * entry whose inode number matches dvp's file id.
                 */
                while (bytes > 0) {
                        if (ncvp_debug >= 2) {
                                kprintf("cache_inefficient_scan: %*.*s\n",
                                        den->d_namlen, den->d_namlen,
                                        den->d_name);
                        }
                        if (den->d_type != DT_WHT &&
                            den->d_ino == vat.va_fileid) {
                                if (ncvp_debug) {
                                        kprintf("cache_inefficient_scan: "
                                               "MATCHED inode %lld path %s/%*.*s\n",
                                               (long long)vat.va_fileid,
                                               nch->ncp->nc_name,
                                               den->d_namlen, den->d_namlen,
                                               den->d_name);
                                }
                                nlc.nlc_nameptr = den->d_name;
                                nlc.nlc_namelen = den->d_namlen;
                                rncp = cache_nlookup(nch, &nlc);
                                KKASSERT(rncp.ncp != NULL);
                                break;
                        }
                        bytes -= _DIRENT_DIRSIZ(den);
                        den = _DIRENT_NEXT(den);
                }

                /*
                 * Read another buffer if nothing matched, EOF was not
                 * signalled, and this pass actually returned some data.
                 */
                if (rncp.ncp == NULL && eofflag == 0 && uio.uio_resid != blksize)
                        goto again;
        }
        /*
         * NOTE(review): when VOP_READDIR() fails we arrive here with
         * rncp.ncp still NULL, so the readdir error is replaced by ENOENT
         * below.  Confirm whether that is intended.
         */
        kfree(rbuf, M_TEMP);
done:
        vrele(pvp);
        if (rncp.ncp) {
                /*
                 * Associate dvp with the entry unless the entry is already
                 * resolved, in which case it is left as-is.
                 */
                if (rncp.ncp->nc_flag & NCF_UNRESOLVED) {
                        _cache_setvp(rncp.mount, rncp.ncp, dvp);
                        if (ncvp_debug >= 2) {
                                kprintf("cache_inefficient_scan: setvp %s/%s = %p\n",
                                        nch->ncp->nc_name, rncp.ncp->nc_name, dvp);
                        }
                } else {
                        if (ncvp_debug >= 2) {
                                kprintf("cache_inefficient_scan: setvp %s/%s already set %p/%p\n",
                                        nch->ncp->nc_name, rncp.ncp->nc_name, dvp,
                                        rncp.ncp->nc_vp);
                        }
                }
                /* an entry without a vnode reports its recorded error */
                if (rncp.ncp->nc_vp == NULL)
                        error = rncp.ncp->nc_error;
                /*
                 * Release rncp after a successful nlookup.  rncp was fully
                 * referenced.
                 */
                cache_put(&rncp);
        } else {
                kprintf("cache_inefficient_scan: dvp %p NOT FOUND in %s\n",
                        dvp, nch->ncp->nc_name);
                error = ENOENT;
        }
        return (error);
}
2338
2339 /*
2340  * Zap a namecache entry.  The ncp is unconditionally set to an unresolved
2341  * state, which disassociates it from its vnode or ncneglist.
2342  *
2343  * Then, if there are no additional references to the ncp and no children,
2344  * the ncp is removed from the topology and destroyed.
2345  *
2346  * References and/or children may exist if the ncp is in the middle of the
2347  * topology, preventing the ncp from being destroyed.
2348  *
2349  * This function must be called with the ncp held and locked and will unlock
2350  * and drop it during zapping.
2351  *
2352  * If nonblock is non-zero and the parent ncp cannot be locked we give up.
2353  * This case can occur in the cache_drop() path.
2354  *
2355  * This function may returned a held (but NOT locked) parent node which the
2356  * caller must drop.  We do this so _cache_drop() can loop, to avoid
2357  * blowing out the kernel stack.
2358  *
2359  * WARNING!  For MPSAFE operation this routine must acquire up to three
2360  *           spin locks to be able to safely test nc_refs.  Lock order is
2361  *           very important.
2362  *
2363  *           hash spinlock if on hash list
2364  *           parent spinlock if child of parent
2365  *           (the ncp is unresolved so there is no vnode association)
2366  */
2367 static struct namecache *
2368 cache_zap(struct namecache *ncp, int nonblock)
2369 {
2370         struct namecache *par;
2371         struct vnode *dropvp;
2372         int refs;
2373
2374         /*
2375          * Disassociate the vnode or negative cache ref and set NCF_UNRESOLVED.
2376          */
2377         _cache_setunresolved(ncp);
2378
2379         /*
2380          * Try to scrap the entry and possibly tail-recurse on its parent.
2381          * We only scrap unref'd (other then our ref) unresolved entries,
2382          * we do not scrap 'live' entries.
2383          *
2384          * Note that once the spinlocks are acquired if nc_refs == 1 no
2385          * other references are possible.  If it isn't, however, we have
2386          * to decrement but also be sure to avoid a 1->0 transition.
2387          */
2388         KKASSERT(ncp->nc_flag & NCF_UNRESOLVED);
2389         KKASSERT(ncp->nc_refs > 0);
2390
2391         /*
2392          * Acquire locks.  Note that the parent can't go away while we hold
2393          * a child locked.
2394          */
2395         if ((par = ncp->nc_parent) != NULL) {
2396                 if (nonblock) {
2397                         for (;;) {
2398                                 if (_cache_lock_nonblock(par) == 0)
2399                                         break;
2400                                 refs = ncp->nc_refs;
2401                                 ncp->nc_flag |= NCF_DEFEREDZAP;
2402                                 ++numdefered;   /* MP race ok */
2403                                 if (atomic_cmpset_int(&ncp->nc_refs,
2404                                                       refs, refs - 1)) {
2405                                         _cache_unlock(ncp);
2406                                         return(NULL);
2407                                 }
2408                                 cpu_pause();
2409                         }
2410                         _cache_hold(par);
2411                 } else {
2412                         _cache_hold(par);
2413                         _cache_lock(par);
2414                 }
2415                 spin_lock(&ncp->nc_head->spin);
2416         }
2417
2418         /*
2419          * If someone other then us has a ref or we have children
2420          * we cannot zap the entry.  The 1->0 transition and any
2421          * further list operation is protected by the spinlocks
2422          * we have acquired but other transitions are not.
2423          */
2424         for (;;) {
2425                 refs = ncp->nc_refs;
2426                 if (refs == 1 && TAILQ_EMPTY(&ncp->nc_list))
2427                         break;
2428                 if (atomic_cmpset_int(&ncp->nc_refs, refs, refs - 1)) {
2429                         if (par) {
2430                                 spin_unlock(&ncp->nc_head->spin);
2431                                 _cache_put(par);
2432                         }
2433                         _cache_unlock(ncp);
2434                         return(NULL);
2435                 }
2436                 cpu_pause();
2437         }
2438
2439         /*
2440          * We are the only ref and with the spinlocks held no further
2441          * refs can be acquired by others.
2442          *
2443          * Remove us from the hash list and parent list.  We have to
2444          * drop a ref on the parent's vp if the parent's list becomes
2445          * empty.
2446          */
2447         dropvp = NULL;
2448         if (par) {
2449                 struct nchash_head *nchpp = ncp->nc_head;
2450
2451                 KKASSERT(nchpp != NULL);
2452                 LIST_REMOVE(ncp, nc_hash);
2453                 TAILQ_REMOVE(&par->nc_list, ncp, nc_entry);
2454                 if (par->nc_vp && TAILQ_EMPTY(&par->nc_list))
2455                         dropvp = par->nc_vp;
2456                 ncp->nc_head = NULL;
2457                 ncp->nc_parent = NULL;
2458                 spin_unlock(&nchpp->spin);
2459                 _cache_unlock(par);
2460         } else {
2461                 KKASSERT(ncp->nc_head == NULL);
2462         }
2463
2464         /*
2465          * ncp should not have picked up any refs.  Physically
2466          * destroy the ncp.
2467          */
2468         KKASSERT(ncp->nc_refs == 1);
2469         /* _cache_unlock(ncp) not required */
2470         ncp->nc_refs = -1;      /* safety */
2471         if (ncp->nc_name)
2472                 kfree(ncp->nc_name, M_VFSCACHE);
2473         kfree(ncp, M_VFSCACHE);
2474
2475         /*
2476          * Delayed drop (we had to release our spinlocks)
2477          *
2478          * The refed parent (if not  NULL) must be dropped.  The
2479          * caller is responsible for looping.
2480          */
2481         if (dropvp)
2482                 vdrop(dropvp);
2483         return(par);
2484 }
2485
2486 /*
2487  * Clean up dangling negative cache and defered-drop entries in the
2488  * namecache.
2489  */
2490 typedef enum { CHI_LOW, CHI_HIGH } cache_hs_t;
2491
2492 static cache_hs_t neg_cache_hysteresis_state = CHI_LOW;
2493 static cache_hs_t pos_cache_hysteresis_state = CHI_LOW;
2494
2495 void
2496 cache_hysteresis(void)
2497 {
2498         int poslimit;
2499
2500         /*
2501          * Don't cache too many negative hits.  We use hysteresis to reduce
2502          * the impact on the critical path.
2503          */
2504         switch(neg_cache_hysteresis_state) {
2505         case CHI_LOW:
2506                 if (numneg > MINNEG && numneg * ncnegfactor > numcache) {
2507                         _cache_cleanneg(10);
2508                         neg_cache_hysteresis_state = CHI_HIGH;
2509                 }
2510                 break;
2511         case CHI_HIGH:
2512                 if (numneg > MINNEG * 9 / 10 &&
2513                     numneg * ncnegfactor * 9 / 10 > numcache
2514                 ) {
2515                         _cache_cleanneg(10);
2516                 } else {
2517                         neg_cache_hysteresis_state = CHI_LOW;
2518                 }
2519                 break;
2520         }
2521
2522         /*
2523          * Don't cache too many positive hits.  We use hysteresis to reduce
2524          * the impact on the critical path.
2525          *
2526          * Excessive positive hits can accumulate due to large numbers of
2527          * hardlinks (the vnode cache will not prevent hl ncps from growing
2528          * into infinity).
2529          */
2530         if ((poslimit = ncposlimit) == 0)
2531                 poslimit = desiredvnodes * 2;
2532
2533         switch(pos_cache_hysteresis_state) {
2534         case CHI_LOW:
2535                 if (numcache > poslimit && numcache > MINPOS) {
2536                         _cache_cleanpos(10);
2537                         pos_cache_hysteresis_state = CHI_HIGH;
2538                 }
2539                 break;
2540         case CHI_HIGH:
2541                 if (numcache > poslimit * 5 / 6 && numcache > MINPOS) {
2542                         _cache_cleanpos(10);
2543                 } else {
2544                         pos_cache_hysteresis_state = CHI_LOW;
2545                 }
2546                 break;
2547         }
2548
2549         /*
2550          * Clean out dangling defered-zap ncps which could not
2551          * be cleanly dropped if too many build up.  Note
2552          * that numdefered is not an exact number as such ncps
2553          * can be reused and the counter is not handled in a MP
2554          * safe manner by design.
2555          */
2556         if (numdefered * ncnegfactor > numcache) {
2557                 _cache_cleandefered();
2558         }
2559 }
2560
2561 /*
2562  * NEW NAMECACHE LOOKUP API
2563  *
2564  * Lookup an entry in the namecache.  The passed par_nch must be referenced
2565  * and unlocked.  A referenced and locked nchandle with a non-NULL nch.ncp
2566  * is ALWAYS returned, eve if the supplied component is illegal.
2567  *
2568  * The resulting namecache entry should be returned to the system with
2569  * cache_put() or cache_unlock() + cache_drop().
2570  *
2571  * namecache locks are recursive but care must be taken to avoid lock order
2572  * reversals (hence why the passed par_nch must be unlocked).  Locking
2573  * rules are to order for parent traversals, not for child traversals.
2574  *
2575  * Nobody else will be able to manipulate the associated namespace (e.g.
2576  * create, delete, rename, rename-target) until the caller unlocks the
2577  * entry.
2578  *
2579  * The returned entry will be in one of three states:  positive hit (non-null
2580  * vnode), negative hit (null vnode), or unresolved (NCF_UNRESOLVED is set).
2581  * Unresolved entries must be resolved through the filesystem to associate the
2582  * vnode and/or determine whether a positive or negative hit has occured.
2583  *
2584  * It is not necessary to lock a directory in order to lock namespace under
2585  * that directory.  In fact, it is explicitly not allowed to do that.  A
2586  * directory is typically only locked when being created, renamed, or
2587  * destroyed.
2588  *
2589  * The directory (par) may be unresolved, in which case any returned child
2590  * will likely also be marked unresolved.  Likely but not guarenteed.  Since
2591  * the filesystem lookup requires a resolved directory vnode the caller is
2592  * responsible for resolving the namecache chain top-down.  This API
2593  * specifically allows whole chains to be created in an unresolved state.
2594  */
2595 struct nchandle
2596 cache_nlookup(struct nchandle *par_nch, struct nlcomponent *nlc)
2597 {
2598         struct nchandle nch;
2599         struct namecache *ncp;
2600         struct namecache *new_ncp;
2601         struct nchash_head *nchpp;
2602         struct mount *mp;
2603         u_int32_t hash;
2604         globaldata_t gd;
2605         int par_locked;
2606
2607         numcalls++;
2608         gd = mycpu;
2609         mp = par_nch->mount;
2610         par_locked = 0;
2611
2612         /*
2613          * This is a good time to call it, no ncp's are locked by
2614          * the caller or us.
2615          */
2616         cache_hysteresis();
2617
2618         /*
2619          * Try to locate an existing entry
2620          */
2621         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2622         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2623         new_ncp = NULL;
2624         nchpp = NCHHASH(hash);
2625 restart:
2626         spin_lock(&nchpp->spin);
2627         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2628                 numchecks++;
2629
2630                 /*
2631                  * Break out if we find a matching entry.  Note that
2632                  * UNRESOLVED entries may match, but DESTROYED entries
2633                  * do not.
2634                  */
2635                 if (ncp->nc_parent == par_nch->ncp &&
2636                     ncp->nc_nlen == nlc->nlc_namelen &&
2637                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2638                     (ncp->nc_flag & NCF_DESTROYED) == 0
2639                 ) {
2640                         _cache_hold(ncp);
2641                         spin_unlock(&nchpp->spin);
2642                         if (par_locked) {
2643                                 _cache_unlock(par_nch->ncp);
2644                                 par_locked = 0;
2645                         }
2646                         if (_cache_lock_special(ncp) == 0) {
2647                                 _cache_auto_unresolve(mp, ncp);
2648                                 if (new_ncp)
2649                                         _cache_free(new_ncp);
2650                                 goto found;
2651                         }
2652                         _cache_get(ncp);
2653                         _cache_put(ncp);
2654                         _cache_drop(ncp);
2655                         goto restart;
2656                 }
2657         }
2658
2659         /*
2660          * We failed to locate an entry, create a new entry and add it to
2661          * the cache.  The parent ncp must also be locked so we
2662          * can link into it.
2663          *
2664          * We have to relookup after possibly blocking in kmalloc or
2665          * when locking par_nch.
2666          *
2667          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2668          *       mount case, in which case nc_name will be NULL.
2669          */
2670         if (new_ncp == NULL) {
2671                 spin_unlock(&nchpp->spin);
2672                 new_ncp = cache_alloc(nlc->nlc_namelen);
2673                 if (nlc->nlc_namelen) {
2674                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
2675                               nlc->nlc_namelen);
2676                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
2677                 }
2678                 goto restart;
2679         }
2680         if (par_locked == 0) {
2681                 spin_unlock(&nchpp->spin);
2682                 _cache_lock(par_nch->ncp);
2683                 par_locked = 1;
2684                 goto restart;
2685         }
2686
2687         /*
2688          * WARNING!  We still hold the spinlock.  We have to set the hash
2689          *           table entry atomically.
2690          */
2691         ncp = new_ncp;
2692         _cache_link_parent(ncp, par_nch->ncp, nchpp);
2693         spin_unlock(&nchpp->spin);
2694         _cache_unlock(par_nch->ncp);
2695         /* par_locked = 0 - not used */
2696 found:
2697         /*
2698          * stats and namecache size management
2699          */
2700         if (ncp->nc_flag & NCF_UNRESOLVED)
2701                 ++gd->gd_nchstats->ncs_miss;
2702         else if (ncp->nc_vp)
2703                 ++gd->gd_nchstats->ncs_goodhits;
2704         else
2705                 ++gd->gd_nchstats->ncs_neghits;
2706         nch.mount = mp;
2707         nch.ncp = ncp;
2708         atomic_add_int(&nch.mount->mnt_refs, 1);
2709         return(nch);
2710 }
2711
2712 /*
2713  * Attempt to lookup a namecache entry and return with a shared namecache
2714  * lock.
2715  */
2716 int
2717 cache_nlookup_maybe_shared(struct nchandle *par_nch, struct nlcomponent *nlc,
2718                            int excl, struct nchandle *res_nch)
2719 {
2720         struct namecache *ncp;
2721         struct nchash_head *nchpp;
2722         struct mount *mp;
2723         u_int32_t hash;
2724         globaldata_t gd;
2725
2726         /*
2727          * If exclusive requested or shared namecache locks are disabled,
2728          * return failure.
2729          */
2730         if (ncp_shared_lock_disable || excl)
2731                 return(EWOULDBLOCK);
2732
2733         numcalls++;
2734         gd = mycpu;
2735         mp = par_nch->mount;
2736
2737         /*
2738          * This is a good time to call it, no ncp's are locked by
2739          * the caller or us.
2740          */
2741         cache_hysteresis();
2742
2743         /*
2744          * Try to locate an existing entry
2745          */
2746         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2747         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2748         nchpp = NCHHASH(hash);
2749
2750         spin_lock(&nchpp->spin);
2751
2752         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2753                 numchecks++;
2754
2755                 /*
2756                  * Break out if we find a matching entry.  Note that
2757                  * UNRESOLVED entries may match, but DESTROYED entries
2758                  * do not.
2759                  */
2760                 if (ncp->nc_parent == par_nch->ncp &&
2761                     ncp->nc_nlen == nlc->nlc_namelen &&
2762                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2763                     (ncp->nc_flag & NCF_DESTROYED) == 0
2764                 ) {
2765                         _cache_hold(ncp);
2766                         spin_unlock(&nchpp->spin);
2767                         if (_cache_lock_shared_special(ncp) == 0) {
2768                                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0 &&
2769                                     (ncp->nc_flag & NCF_DESTROYED) == 0 &&
2770                                     _cache_auto_unresolve_test(mp, ncp) == 0) {
2771                                         goto found;
2772                                 }
2773                                 _cache_unlock(ncp);
2774                         }
2775                         _cache_drop(ncp);
2776                         spin_lock(&nchpp->spin);
2777                         break;
2778                 }
2779         }
2780
2781         /*
2782          * Failure
2783          */
2784         spin_unlock(&nchpp->spin);
2785         return(EWOULDBLOCK);
2786
2787         /*
2788          * Success
2789          *
2790          * Note that nc_error might be non-zero (e.g ENOENT).
2791          */
2792 found:
2793         res_nch->mount = mp;
2794         res_nch->ncp = ncp;
2795         ++gd->gd_nchstats->ncs_goodhits;
2796         atomic_add_int(&res_nch->mount->mnt_refs, 1);
2797
2798         KKASSERT(ncp->nc_error != EWOULDBLOCK);
2799         return(ncp->nc_error);
2800 }
2801
2802 /*
2803  * This is a non-blocking verison of cache_nlookup() used by
2804  * nfs_readdirplusrpc_uio().  It can fail for any reason and
2805  * will return nch.ncp == NULL in that case.
2806  */
2807 struct nchandle
2808 cache_nlookup_nonblock(struct nchandle *par_nch, struct nlcomponent *nlc)
2809 {
2810         struct nchandle nch;
2811         struct namecache *ncp;
2812         struct namecache *new_ncp;
2813         struct nchash_head *nchpp;
2814         struct mount *mp;
2815         u_int32_t hash;
2816         globaldata_t gd;
2817         int par_locked;
2818
2819         numcalls++;
2820         gd = mycpu;
2821         mp = par_nch->mount;
2822         par_locked = 0;
2823
2824         /*
2825          * Try to locate an existing entry
2826          */
2827         hash = fnv_32_buf(nlc->nlc_nameptr, nlc->nlc_namelen, FNV1_32_INIT);
2828         hash = fnv_32_buf(&par_nch->ncp, sizeof(par_nch->ncp), hash);
2829         new_ncp = NULL;
2830         nchpp = NCHHASH(hash);
2831 restart:
2832         spin_lock(&nchpp->spin);
2833         LIST_FOREACH(ncp, &nchpp->list, nc_hash) {
2834                 numchecks++;
2835
2836                 /*
2837                  * Break out if we find a matching entry.  Note that
2838                  * UNRESOLVED entries may match, but DESTROYED entries
2839                  * do not.
2840                  */
2841                 if (ncp->nc_parent == par_nch->ncp &&
2842                     ncp->nc_nlen == nlc->nlc_namelen &&
2843                     bcmp(ncp->nc_name, nlc->nlc_nameptr, ncp->nc_nlen) == 0 &&
2844                     (ncp->nc_flag & NCF_DESTROYED) == 0
2845                 ) {
2846                         _cache_hold(ncp);
2847                         spin_unlock(&nchpp->spin);
2848                         if (par_locked) {
2849                                 _cache_unlock(par_nch->ncp);
2850                                 par_locked = 0;
2851                         }
2852                         if (_cache_lock_special(ncp) == 0) {
2853                                 _cache_auto_unresolve(mp, ncp);
2854                                 if (new_ncp) {
2855                                         _cache_free(new_ncp);
2856                                         new_ncp = NULL;
2857                                 }
2858                                 goto found;
2859                         }
2860                         _cache_drop(ncp);
2861                         goto failed;
2862                 }
2863         }
2864
2865         /*
2866          * We failed to locate an entry, create a new entry and add it to
2867          * the cache.  The parent ncp must also be locked so we
2868          * can link into it.
2869          *
2870          * We have to relookup after possibly blocking in kmalloc or
2871          * when locking par_nch.
2872          *
2873          * NOTE: nlc_namelen can be 0 and nlc_nameptr NULL as a special
2874          *       mount case, in which case nc_name will be NULL.
2875          */
2876         if (new_ncp == NULL) {
2877                 spin_unlock(&nchpp->spin);
2878                 new_ncp = cache_alloc(nlc->nlc_namelen);
2879                 if (nlc->nlc_namelen) {
2880                         bcopy(nlc->nlc_nameptr, new_ncp->nc_name,
2881                               nlc->nlc_namelen);
2882                         new_ncp->nc_name[nlc->nlc_namelen] = 0;
2883                 }
2884                 goto restart;
2885         }
2886         if (par_locked == 0) {
2887                 spin_unlock(&nchpp->spin);
2888                 if (_cache_lock_nonblock(par_nch->ncp) == 0) {
2889                         par_locked = 1;
2890                         goto restart;
2891                 }
2892                 goto failed;
2893         }
2894
2895         /*
2896          * WARNING!  We still hold the spinlock.  We have to set the hash
2897          *           table entry atomically.
2898          */
2899         ncp = new_ncp;
2900         _cache_link_parent(ncp, par_nch->ncp, nchpp);
2901         spin_unlock(&nchpp->spin);
2902         _cache_unlock(par_nch->ncp);
2903         /* par_locked = 0 - not used */
2904 found:
2905         /*
2906          * stats and namecache size management
2907          */
2908         if (ncp->nc_flag & NCF_UNRESOLVED)
2909                 ++gd->gd_nchstats->ncs_miss;
2910         else if (ncp->nc_vp)
2911                 ++gd->gd_nchstats->ncs_goodhits;
2912         else
2913                 ++gd->gd_nchstats->ncs_neghits;
2914         nch.mount = mp;
2915         nch.ncp = ncp;
2916         atomic_add_int(&nch.mount->mnt_refs, 1);
2917         return(nch);
2918 failed:
2919         if (new_ncp) {
2920                 _cache_free(new_ncp);
2921                 new_ncp = NULL;
2922         }
2923         nch.mount = NULL;
2924         nch.ncp = NULL;
2925         return(nch);
2926 }
2927
2928 /*
2929  * The namecache entry is marked as being used as a mount point.
2930  * Locate the mount if it is visible to the caller.  The DragonFly
2931  * mount system allows arbitrary loops in the topology and disentangles
2932  * those loops by matching against (mp, ncp) rather than just (ncp).
2933  * This means any given ncp can dive any number of mounts, depending
2934  * on the relative mount (e.g. nullfs) the caller is at in the topology.
2935  *
2936  * We use a very simple frontend cache to reduce SMP conflicts,
2937  * which we have to do because the mountlist scan needs an exclusive
2938  * lock around its ripout info list.  Not to mention that there might
2939  * be a lot of mounts.
2940  */
/*
 * Scan state handed to cache_findmount_callback() through
 * mountlist_scan().
 */
struct findmount_info {
        struct mount *result;           /* out: matching mount or NULL */
        struct mount *nch_mount;        /* in: mount of the nch searched for */
        struct namecache *nch_ncp;      /* in: ncp of the nch searched for */
};
2946
2947 static
2948 struct ncmount_cache *
2949 ncmount_cache_lookup(struct mount *mp, struct namecache *ncp)
2950 {
2951         int hash;
2952
2953         hash = ((int)(intptr_t)mp / sizeof(*mp)) ^
2954                ((int)(intptr_t)ncp / sizeof(*ncp));
2955         hash = (hash & 0x7FFFFFFF) % NCMOUNT_NUMCACHE;
2956         return (&ncmount_cache[hash]);
2957 }
2958
2959 static
2960 int
2961 cache_findmount_callback(struct mount *mp, void *data)
2962 {
2963         struct findmount_info *info = data;
2964
2965         /*
2966          * Check the mount's mounted-on point against the passed nch.
2967          */
2968         if (mp->mnt_ncmounton.mount == info->nch_mount &&
2969             mp->mnt_ncmounton.ncp == info->nch_ncp
2970         ) {
2971             info->result = mp;
2972             atomic_add_int(&mp->mnt_refs, 1);
2973             return(-1);
2974         }
2975         return(0);
2976 }
2977
2978 struct mount *
2979 cache_findmount(struct nchandle *nch)
2980 {
2981         struct findmount_info info;
2982         struct ncmount_cache *ncc;
2983         struct mount *mp;
2984
2985         /*
2986          * Fast
2987          */
2988         if (ncmount_cache_enable == 0) {
2989                 ncc = NULL;
2990                 goto skip;
2991         }
2992         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
2993         if (ncc->ncp == nch->ncp) {
2994                 spin_lock_shared(&ncc->spin);
2995                 if (ncc->isneg == 0 &&
2996                     ncc->ncp == nch->ncp && (mp = ncc->mp) != NULL) {
2997                         if (mp->mnt_ncmounton.mount == nch->mount &&
2998                             mp->mnt_ncmounton.ncp == nch->ncp) {
2999                                 /*
3000                                  * Cache hit (positive)
3001                                  */
3002                                 atomic_add_int(&mp->mnt_refs, 1);
3003                                 spin_unlock_shared(&ncc->spin);
3004                                 ++ncmount_cache_hit;
3005                                 return(mp);
3006                         }
3007                         /* else cache miss */
3008                 }
3009                 if (ncc->isneg &&
3010                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3011                         /*
3012                          * Cache hit (negative)
3013                          */
3014                         spin_unlock_shared(&ncc->spin);
3015                         ++ncmount_cache_hit;
3016                         return(NULL);
3017                 }
3018                 spin_unlock_shared(&ncc->spin);
3019         }
3020 skip:
3021
3022         /*
3023          * Slow
3024          */
3025         info.result = NULL;
3026         info.nch_mount = nch->mount;
3027         info.nch_ncp = nch->ncp;
3028         mountlist_scan(cache_findmount_callback, &info,
3029                                MNTSCAN_FORWARD|MNTSCAN_NOBUSY);
3030
3031         /*
3032          * Cache the result.
3033          *
3034          * Negative lookups: We cache the originating {ncp,mp}. (mp) is
3035          *                   only used for pointer comparisons and is not
3036          *                   referenced (otherwise there would be dangling
3037          *                   refs).
3038          *
3039          * Positive lookups: We cache the originating {ncp} and the target
3040          *                   (mp).  (mp) is referenced.
3041          *
3042          * Indeterminant:    If the match is undergoing an unmount we do
3043          *                   not cache it to avoid racing cache_unmounting(),
3044          *                   but still return the match.
3045          */
3046         if (ncc) {
3047                 spin_lock(&ncc->spin);
3048                 if (info.result == NULL) {
3049                         if (ncc->isneg == 0 && ncc->mp)
3050                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
3051                         ncc->ncp = nch->ncp;
3052                         ncc->mp = nch->mount;
3053                         ncc->isneg = 1;
3054                         spin_unlock(&ncc->spin);
3055                         ++ncmount_cache_overwrite;
3056                 } else if ((info.result->mnt_kern_flag & MNTK_UNMOUNT) == 0) {
3057                         if (ncc->isneg == 0 && ncc->mp)
3058                                 atomic_add_int(&ncc->mp->mnt_refs, -1);
3059                         atomic_add_int(&info.result->mnt_refs, 1);
3060                         ncc->ncp = nch->ncp;
3061                         ncc->mp = info.result;
3062                         ncc->isneg = 0;
3063                         spin_unlock(&ncc->spin);
3064                         ++ncmount_cache_overwrite;
3065                 } else {
3066                         spin_unlock(&ncc->spin);
3067                 }
3068                 ++ncmount_cache_miss;
3069         }
3070         return(info.result);
3071 }
3072
3073 void
3074 cache_dropmount(struct mount *mp)
3075 {
3076         atomic_add_int(&mp->mnt_refs, -1);
3077 }
3078
3079 void
3080 cache_ismounting(struct mount *mp)
3081 {
3082         struct nchandle *nch = &mp->mnt_ncmounton;
3083         struct ncmount_cache *ncc;
3084
3085         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3086         if (ncc->isneg &&
3087             ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3088                 spin_lock(&ncc->spin);
3089                 if (ncc->isneg &&
3090                     ncc->ncp == nch->ncp && ncc->mp == nch->mount) {
3091                         ncc->ncp = NULL;
3092                         ncc->mp = NULL;
3093                 }
3094                 spin_unlock(&ncc->spin);
3095         }
3096 }
3097
3098 void
3099 cache_unmounting(struct mount *mp)
3100 {
3101         struct nchandle *nch = &mp->mnt_ncmounton;
3102         struct ncmount_cache *ncc;
3103
3104         ncc = ncmount_cache_lookup(nch->mount, nch->ncp);
3105         if (ncc->isneg == 0 &&
3106             ncc->ncp == nch->ncp && ncc->mp == mp) {
3107                 spin_lock(&ncc->spin);
3108                 if (ncc->isneg == 0 &&
3109                     ncc->ncp == nch->ncp && ncc->mp == mp) {
3110                         atomic_add_int(&mp->mnt_refs, -1);
3111                         ncc->ncp = NULL;
3112                         ncc->mp = NULL;
3113                 }
3114                 spin_unlock(&ncc->spin);
3115         }
3116 }
3117
3118 /*
3119  * Resolve an unresolved namecache entry, generally by looking it up.
3120  * The passed ncp must be locked and refd.
3121  *
3122  * Theoretically since a vnode cannot be recycled while held, and since
3123  * the nc_parent chain holds its vnode as long as children exist, the
3124  * direct parent of the cache entry we are trying to resolve should
3125  * have a valid vnode.  If not then generate an error that we can
3126  * determine is related to a resolver bug.
3127  *
3128  * However, if a vnode was in the middle of a recyclement when the NCP
3129  * got locked, ncp->nc_vp might point to a vnode that is about to become
3130  * invalid.  cache_resolve() handles this case by unresolving the entry
3131  * and then re-resolving it.
3132  *
3133  * Note that successful resolution does not necessarily return an error
3134  * code of 0.  If the ncp resolves to a negative cache hit then ENOENT
3135  * will be returned.
3136  */
/*
 * Arguments and results, as visible in the body below:
 *
 *   nch  - nch->ncp is the entry to resolve; the caller must hold it
 *          exclusively locked (asserted).  nch->mount is the mount the
 *          entry is resolved under.
 *   cred - passed through unchanged to VOP_NRESOLVE().
 *
 * Returns ncp->nc_error once the entry is resolved, EINVAL if the entry
 * is flagged NCF_DESTROYED, ENOENT if its parent is flagged NCF_DESTROYED,
 * EXDEV if the parent chain is broken, or the error recorded on an
 * ancestor that could not be resolved.
 */
3137 int
3138 cache_resolve(struct nchandle *nch, struct ucred *cred)
3139 {
3140         struct namecache *par_tmp;
3141         struct namecache *par;
3142         struct namecache *ncp;
3143         struct nchandle nctmp;
3144         struct mount *mp;
3145         struct vnode *dvp;
3146         int error;
3147
3148         ncp = nch->ncp;
3149         mp = nch->mount;
3150         KKASSERT(_cache_lockstatus(ncp) == LK_EXCLUSIVE);
             /* Re-entered from the bottom when VOP_NRESOLVE() yields EAGAIN. */
3151 restart:
3152         /*
3153          * If the ncp is already resolved we have nothing to do.  However,
3154          * we do want to guarantee that a usable vnode is returned when
3155          * a vnode is present, so make sure it hasn't been reclaimed.
3156          */
3157         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3158                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3159                         _cache_setunresolved(ncp);
3160                 if ((ncp->nc_flag & NCF_UNRESOLVED) == 0)
3161                         return (ncp->nc_error);
3162         }
3163
3164         /*
3165          * If the ncp was destroyed it will never resolve again.  This
3166          * can basically only happen when someone is chdir'd into an
3167          * empty directory which is then rmdir'd.  We want to catch this
3168          * here and not dive the VFS because the VFS might actually
3169          * have a way to re-resolve the disconnected ncp, which will
3170          * result in inconsistencies in the cdir/nch for proc->p_fd.
3171          */
3172         if (ncp->nc_flag & NCF_DESTROYED) {
3173                 kprintf("Warning: cache_resolve: ncp '%s' was unlinked\n",
3174                         ncp->nc_name);
3175                 return(EINVAL);
3176         }
3177
3178         /*
3179          * Mount points need special handling because the parent does not
3180          * belong to the same filesystem as the ncp.
3181          */
3182         if (ncp == mp->mnt_ncmountpt.ncp)
3183                 return (cache_resolve_mp(mp));
3184
3185         /*
3186          * We expect an unbroken chain of ncps to at least the mount point,
3187          * and even all the way to root (but this code doesn't have to go
3188          * past the mount point).
3189          */
3190         if (ncp->nc_parent == NULL) {
3191                 kprintf("EXDEV case 1 %p %*.*s\n", ncp,
3192                         ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3193                 ncp->nc_error = EXDEV;
3194                 return(ncp->nc_error);
3195         }
3196
3197         /*
3198          * The vp's of the parent directories in the chain are held via vhold()
3199          * due to the existence of the child, and should not disappear.
3200          * However, there are cases where they can disappear:
3201          *
3202          *      - due to filesystem I/O errors.
3203          *      - due to NFS being stupid about tracking the namespace and
3204          *        destroys the namespace for entire directories quite often.
3205          *      - due to forced unmounts.
3206          *      - due to an rmdir (parent will be marked DESTROYED)
3207          *
3208          * When this occurs we have to track the chain backwards and resolve
3209          * it, looping until the resolver catches up to the current node.  We
3210          * could recurse here but we might run ourselves out of kernel stack
3211          * so we do it in a more painful manner.  This situation really should
3212          * not occur all that often, or if it does not have to go back too
3213          * many nodes to resolve the ncp.
3214          */
3215         while ((dvp = cache_dvpref(ncp)) == NULL) {
3216                 /*
3217                  * This case can occur if a process is CD'd into a
3218                  * directory which is then rmdir'd.  If the parent is marked
3219                  * destroyed there is no point trying to resolve it.
3220                  */
3221                 if (ncp->nc_parent->nc_flag & NCF_DESTROYED)
3222                         return(ENOENT);
                     /*
                      * Walk upward from the immediate parent.  Each step
                      * holds and locks the next ancestor before releasing
                      * the current one, and the walk stops at the first
                      * entry whose own parent has a vnode (or at the top
                      * of the chain, which is the EXDEV case below).
                      */
3223                 par = ncp->nc_parent;
3224                 _cache_hold(par);
3225                 _cache_lock(par);
3226                 while ((par_tmp = par->nc_parent) != NULL &&
3227                        par_tmp->nc_vp == NULL) {
3228                         _cache_hold(par_tmp);
3229                         _cache_lock(par_tmp);
3230                         _cache_put(par);
3231                         par = par_tmp;
3232                 }
3233                 if (par->nc_parent == NULL) {
3234                         kprintf("EXDEV case 2 %*.*s\n",
3235                                 par->nc_nlen, par->nc_nlen, par->nc_name);
3236                         _cache_put(par);
3237                         return (EXDEV);
3238                 }
3239                 kprintf("[diagnostic] cache_resolve: had to recurse on %*.*s\n",
3240                         par->nc_nlen, par->nc_nlen, par->nc_name);
3241                 /*
3242                  * The parent is not set in stone, ref and lock it to prevent
3243                  * it from disappearing.  Also note that due to renames it
3244                  * is possible for our ncp to move and for par to no longer
3245                  * be one of its parents.  We resolve it anyway, the loop
3246                  * will handle any moves.
3247                  */
3248                 _cache_get(par);        /* additional hold/lock */
3249                 _cache_put(par);        /* from earlier hold/lock */
3250                 if (par == nch->mount->mnt_ncmountpt.ncp) {
3251                         cache_resolve_mp(nch->mount);
3252                 } else if ((dvp = cache_dvpref(par)) == NULL) {
3253                         kprintf("[diagnostic] cache_resolve: raced on %*.*s\n", par->nc_nlen, par->nc_nlen, par->nc_name);
3254                         _cache_put(par);
                             /* 'continue' re-runs cache_dvpref(ncp) in the loop test. */
3255                         continue;
3256                 } else {
3257                         if (par->nc_flag & NCF_UNRESOLVED) {
3258                                 nctmp.mount = mp;
3259                                 nctmp.ncp = par;
3260                                 par->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3261                         }
                             /* presumably pairs with the reference from cache_dvpref() */
3262                         vrele(dvp);
3263                 }
                     /*
                      * Any ancestor error other than EAGAIN aborts the whole
                      * resolution.  EAGAIN is only logged; the ancestor is
                      * released and the outer loop tries again.
                      */
3264                 if ((error = par->nc_error) != 0) {
3265                         if (par->nc_error != EAGAIN) {
3266                                 kprintf("EXDEV case 3 %*.*s error %d\n",
3267                                     par->nc_nlen, par->nc_nlen, par->nc_name,
3268                                     par->nc_error);
3269                                 _cache_put(par);
3270                                 return(error);
3271                         }
3272                         kprintf("[diagnostic] cache_resolve: EAGAIN par %p %*.*s\n",
3273                                 par, par->nc_nlen, par->nc_nlen, par->nc_name);
3274                 }
3275                 _cache_put(par);
3276                 /* loop */
3277         }
3278
3279         /*
3280          * Call VOP_NRESOLVE() to get the vp, then scan for any disconnected
3281          * ncp's and reattach them.  If this occurs the original ncp is marked
3282          * EAGAIN to force a relookup.
3283          *
3284          * NOTE: in order to call VOP_NRESOLVE(), the parent of the passed
3285          * ncp must already be resolved.
3286          */
             /*
              * The loop above only falls through once cache_dvpref(ncp)
              * returned non-NULL, so the else arm here is a defensive
              * fallback.
              */
3287         if (dvp) {
3288                 nctmp.mount = mp;
3289                 nctmp.ncp = ncp;
3290                 ncp->nc_error = VOP_NRESOLVE(&nctmp, dvp, cred);
3291                 vrele(dvp);
3292         } else {
3293                 ncp->nc_error = EPERM;
3294         }
3295         if (ncp->nc_error == EAGAIN) {
3296                 kprintf("[diagnostic] cache_resolve: EAGAIN ncp %p %*.*s\n",
3297                         ncp, ncp->nc_nlen, ncp->nc_nlen, ncp->nc_name);
3298                 goto restart;
3299         }
3300         return(ncp->nc_error);
3301 }
3302
3303 /*
3304  * Resolve the ncp associated with a mount point.  Such ncp's almost always
3305  * remain resolved and this routine is rarely called.  NFS MPs tends to force
3306  * re-resolution more often due to its mac-truck-smash-the-namecache
3307  * method of tracking namespace changes.
3308  *
3309  * The semantics for this call is that the passed ncp must be locked on
3310  * entry and will be locked on return.  However, if we actually have to
3311  * resolve the mount point we temporarily unlock the entry in order to
3312  * avoid race-to-root deadlocks due to e.g. dead NFS mounts.  Because of
3313  * the unlock we have to recheck the flags after we relock.
3314  */
3315 static int
3316 cache_resolve_mp(struct mount *mp)
3317 {
3318         struct namecache *ncp = mp->mnt_ncmountpt.ncp;
3319         struct vnode *vp;
3320         int error;
3321
3322         KKASSERT(mp != NULL);
3323
3324         /*
3325          * If the ncp is already resolved we have nothing to do.  However,
3326          * we do want to guarentee that a usable vnode is returned when
3327          * a vnode is present, so make sure it hasn't been reclaimed.
3328          */
3329         if ((ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3330                 if (ncp->nc_vp && (ncp->nc_vp->v_flag & VRECLAIMED))
3331                         _cache_setunresolved(ncp);
3332         }
3333
3334         if (ncp->nc_flag & NCF_UNRESOLVED) {
3335                 _cache_unlock(ncp);
3336                 while (vfs_busy(mp, 0))
3337                         ;
3338                 error = VFS_ROOT(mp, &vp);
3339                 _cache_lock(ncp);
3340
3341                 /*
3342                  * recheck the ncp state after relocking.
3343                  */
3344                 if (ncp->nc_flag & NCF_UNRESOLVED) {
3345                         ncp->nc_error = error;
3346                         if (error == 0) {
3347                                 _cache_setvp(mp, ncp, vp);
3348                                 vput(vp);
3349                         } else {
3350                                 kprintf("[diagnostic] cache_resolve_mp: failed"
3351                                         " to resolve mount %p err=%d ncp=%p\n",
3352                                         mp, error, ncp);
3353                                 _cache_setvp(mp, ncp, NULL);
3354                         }
3355                 } else if (error == 0) {
3356                         vput(vp);
3357                 }
3358                 vfs_unbusy(mp);
3359         }
3360         return(ncp->nc_error);
3361 }
3362
3363 /*
3364  * Clean out negative cache entries when too many have accumulated.
3365  */
3366 static void
3367 _cache_cleanneg(int count)
3368 {
3369         struct namecache *ncp;
3370
3371         /*
3372          * Attempt to clean out the specified number of negative cache
3373          * entries.
3374          */
3375         while (count) {
3376                 spin_lock(&ncspin);
3377                 ncp = TAILQ_FIRST(&ncneglist);
3378                 if (ncp == NULL) {
3379                         spin_unlock(&ncspin);
3380                         break;
3381                 }
3382                 TAILQ_REMOVE(&ncneglist, ncp, nc_vnode);
3383                 TAILQ_INSERT_TAIL(&ncneglist, ncp, nc_vnode);
3384                 _cache_hold(ncp);
3385                 spin_unlock(&ncspin);
3386
3387                 /*
3388                  * This can race, so we must re-check that the ncp
3389                  * is on the ncneglist after successfully locking it.
3390                  */
3391                 if (_cache_lock_special(ncp) == 0) {
3392                         if (ncp->nc_vp == NULL &&
3393                             (ncp->nc_flag & NCF_UNRESOLVED) == 0) {
3394                                 ncp = cache_zap(ncp, 1);
3395                                 if (ncp)
3396                                         _cache_drop(ncp);
3397                         } else {
3398                                 kprintf("cache_cleanneg: race avoided\n");
3399                                 _cache_unlock(ncp);
3400                         }
3401                 } else {
3402                         _cache_drop(ncp);
3403                 }
3404                 --count;
3405         }
3406 }
3407
3408 /*
3409  * Clean out positive cache entries when too many have accumulated.
3410  */
3411 static void
3412 _cache_cleanpos(int count)
3413 {
3414         static volatile int rover;
3415         struct nchash_head *nchpp;
3416         struct namecache *ncp;
3417         int rover_copy;
3418
3419         /*
3420          * Attempt to clean out the specified number of negative cache
3421          * entries.
3422          */
3423         while (count) {
3424                 rover_copy = ++rover;   /* MPSAFEENOUGH */
3425                 cpu_ccfence();
3426                 nchpp = NCHHASH(rover_copy);
3427
3428                 spin_lock(&nchpp->spin);
3429                 ncp = LIST_FIRST(&nchpp->list);
3430                 if (ncp)
3431                         _cache_hold(ncp);
3432                 spin_unlock(&nchpp->spin);
3433
3434                 if (ncp) {
3435                         if (_cache_lock_special(ncp) == 0) {
3436                                 ncp = cache_zap(ncp, 1);
3437                                 if (ncp)
3438                                         _cache_drop(ncp);
3439                         } else {
3440                                 _cache_drop(ncp);
3441                         }
3442                 }
3443                 --count;
3444         }
3445 }
3446
3447 /*
3448  * This is a kitchen sink function to clean out ncps which we
3449  * tried to zap from cache_drop() but failed because we were
3450  * unable to acquire the parent lock.
3451  *
3452  * Such entries can also be removed via cache_inval_vp(), such
3453  * as when unmounting.
3454  */
3455 static void
3456 _cache_cleandefered(void)
3457 {
3458         struct nchash_head *nchpp;
3459         struct namecache *ncp;
3460         struct namecache dummy;
3461         int i;
3462
3463         numdefered = 0;
3464         bzero(&dummy, sizeof(dummy));
3465         dummy.nc_flag = NCF_DESTROYED;
3466
3467         for (i = 0; i <= nchash; ++i) {
3468                 nchpp = &nchashtbl[i];
3469
3470                 spin_lock(&nchpp->spin);
3471                 LIST_INSERT_HEAD(&nchpp->list, &dummy, nc_hash);
3472                 ncp = &dummy;
3473                 while ((ncp = LIST_NEXT(ncp, nc_hash)) != NULL) {
3474                         if ((ncp->nc_flag & NCF_DEFEREDZAP) == 0)
3475                                 continue;
3476                         LIST_REMOVE(&dummy, nc_hash);
3477                         LIST_INSERT_AFTER(ncp, &dummy, nc_hash);
3478                         _cache_hold(ncp);
3479                         spin_unlock(&nchpp->spin);
3480                         if (_cache_lock_nonblock(ncp) == 0) {
3481                                 ncp->nc_flag &= ~NCF_DEFEREDZAP;
3482                                 _cache_unlock(ncp);
3483                         }
3484                         _cache_drop(ncp);
3485                         spin_lock(&nchpp->spin);
3486                         ncp = &dummy;
3487                 }
3488                 LIST_REMOVE(&dummy, nc_hash);
3489                 spin_unlock(&nchpp->spin);
3490         }
3491 }
3492
3493 /*
3494  * Name cache initialization, from vfsinit() when we are booting
3495  */
3496 void
3497 nchinit(void)
3498 {
3499         int i;
3500         globaldata_t gd;
3501
3502         /* initialise per-cpu namecache effectiveness statistics. */
3503         for (i = 0; i < ncpus; ++i) {
3504                 gd = globaldata_find(i);
3505                 gd->gd_nchstats = &nchstats[i];
3506         }
3507         TAILQ_INIT(&ncneglist);
3508         spin_init(&ncspin);
3509         nchashtbl = hashinit_ext(desiredvnodes / 2,
3510                                  sizeof(struct nchash_head),
3511                                  M_VFSCACHE, &nchash);
3512         for (i = 0; i <= (int)nchash; ++i) {
3513                 LIST_INIT(&nchashtbl[i].list);
3514                 spin_init(&nchashtbl[i].spin);
3515         }
3516         for (i = 0; i < NCMOUNT_NUMCACHE; ++i)
3517                 spin_init(&ncmount_cache[i].spin);
3518         nclockwarn = 5 * hz;
3519 }
3520
3521 /*
3522  * Called from start_init() to bootstrap the root filesystem.  Returns
3523  * a referenced, unlocked namecache record.
3524  */
3525 void
3526 cache_allocroot(struct nchandle *nch, struct mount *mp, struct vnode *vp)
3527 {
3528         nch->ncp = cache_alloc(0);
3529         nch->mount = mp;
3530         atomic_add_int(&mp->mnt_refs, 1);
3531         if (vp)
3532                 _cache_setvp(nch->mount, nch->ncp, vp);
3533 }
3534
3535 /*
3536  * vfs_cache_setroot()
3537  *
3538  *      Create an association between the root of our namecache and
3539  *      the root vnode.  This routine may be called several times during
3540  *      booting.
3541  *
3542  *      If the caller intends to save the returned namecache pointer somewhere
3543  *      it must cache_hold() it.
3544  */
3545 void
3546 vfs_cache_setroot(struct vnode *nvp, struct nchandle *nch)
3547 {
3548         struct vnode *ovp;
3549         struct nchandle onch;
3550
3551         ovp = rootvnode;
3552         onch = rootnch;
3553         rootvnode = nvp;
3554         if (nch)
3555                 rootnch = *nch;
3556         else
3557                 cache_zero(&rootnch);
3558         if (ovp)
3559                 vrele(ovp);
3560         if (onch.ncp)
3561                 cache_drop(&onch);
3562 }
3563
3564 /*
3565  * XXX OLD API COMPAT FUNCTION.  This really messes up the new namecache
3566  * topology and is being removed as quickly as possible.  The new VOP_N*()
3567  * API calls are required to make specific adjustments using the supplied
3568  * ncp pointers rather then just bogusly purging random vnodes.
3569  *
3570  * Invalidate all namecache entries to a particular vnode as well as
3571  * any direct children of that vnode in the namecache.  This is a
3572  * 'catch all' purge used by filesystems that do not know any better.
3573  *
3574  * Note that the linkage between the vnode and its namecache entries will
3575  * be removed, but the namecache entries themselves might stay put due to
3576  * active references from elsewhere in the system or due to the existance of
3577  * the children.   The namecache topology is left intact even if we do not
3578  * know what the vnode association is.  Such entries will be marked
3579  * NCF_UNRESOLVED.
3580  */
3581 void
3582 cache_purge(struct vnode *vp)
3583 {
3584         cache_inval_vp(vp, CINV_DESTROY | CINV_CHILDREN);
3585 }
3586
3587 /*
3588  * Flush all entries referencing a particular filesystem.
3589  *
3590  * Since we need to check it anyway, we will flush all the invalid
3591  * entries at the same time.
3592  */
3593 #if 0
3594
/*
 * NOTE: this function sits inside an "#if 0 ... #endif" region and is
 * not compiled.  As written it walks every hash chain from
 * nchashtbl[nchash] down to nchashtbl[0] and zaps each entry whose
 * nc_mount matches mp, dropping the temporary reference on entries that
 * do not match.  The "XXX" tokens following the spin_lock_wr() and
 * spin_unlock_wr() calls are not valid C, so the body needs rework before
 * it could be re-enabled.
 */
3595 void
3596 cache_purgevfs(struct mount *mp)
3597 {
3598         struct nchash_head *nchpp;
3599         struct namecache *ncp, *nnp;
3600
3601         /*
3602          * Scan hash tables for applicable entries.
3603          */
3604         for (nchpp = &nchashtbl[nchash]; nchpp >= nchashtbl; nchpp--) {
3605                 spin_lock_wr(&nchpp->spin); XXX
3606                 ncp = LIST_FIRST(&nchpp->list);
3607                 if (ncp)
3608                         _cache_hold(ncp);
3609                 while (ncp) {
                             /* hold the successor before ncp can go away */
3610                         nnp = LIST_NEXT(ncp, nc_hash);
3611                         if (nnp)
3612                                 _cache_hold(nnp);
3613                         if (ncp->nc_mount == mp) {
3614                                 _cache_lock(ncp);
3615                                 ncp = cache_zap(ncp, 0);
3616                                 if (ncp)
3617                                         _cache_drop(ncp);
3618                         } else {
3619                                 _cache_drop(ncp);
3620                         }
3621                         ncp = nnp;
3622                 }
3623                 spin_unlock_wr(&nchpp->spin); XXX
3624         }
3625 }
3626
3627 #endif
3628
3629 static int disablecwd;
3630 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
3631     "Disable getcwd");
3632
3633 static u_long numcwdcalls;
3634 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdcalls, CTLFLAG_RD, &numcwdcalls, 0,
3635     "Number of current directory resolution calls");
3636 static u_long numcwdfailnf;
3637 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailnf, CTLFLAG_RD, &numcwdfailnf, 0,
3638     "Number of current directory failures due to lack of file");
3639 static u_long numcwdfailsz;
3640 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfailsz, CTLFLAG_RD, &numcwdfailsz, 0,
3641     "Number of current directory failures due to large result");
3642 static u_long numcwdfound;
3643 SYSCTL_ULONG(_vfs_cache, OID_AUTO, numcwdfound, CTLFLAG_RD, &numcwdfound, 0,
3644     "Number of current directory resolution successes");
3645
3646 /*
3647  * MPALMOSTSAFE
3648  */
3649 int
3650 sys___getcwd(struct __getcwd_args *uap)
3651 {
3652         u_int buflen;
3653         int error;
3654         char *buf;
3655         char *bp;
3656
3657         if (disablecwd)
3658                 return (ENODEV);
3659
3660         buflen = uap->buflen;
3661         if (buflen == 0)
3662                 return (EINVAL);
3663         if (buflen > MAXPATHLEN)
3664                 buflen = MAXPATHLEN;
3665
3666         buf = kmalloc(buflen, M_TEMP, M_WAITOK);
3667         bp = kern_getcwd(buf, buflen, &error);
3668         if (error == 0)
3669                 error = copyout(bp, uap->buf, strlen(bp) + 1);
3670         kfree(buf, M_TEMP);
3671         return (error);
3672 }
3673
3674 char *
3675 kern_getcwd(char *buf, size_t buflen, int *error)
3676 {
3677         struct proc *p = curproc;
3678         char *bp;
3679         int i, slash_prefixed;
3680         struct filedesc *fdp;
3681         struct nchandle nch;
3682         struct namecache *ncp;
3683
3684         numcwdcalls++;
3685         bp = buf;
3686         bp += buflen - 1;
3687         *bp = '\0';
3688         fdp = p->p_fd;
3689         slash_prefixed = 0;
3690
3691         nch = fdp->fd_ncdir;
3692         ncp = nch.ncp;
3693         if (ncp)
3694                 _cache_hold(ncp);
3695
3696         while (ncp && (ncp != fdp->fd_nrdir.ncp ||
3697                nch.mount != fdp->fd_nrdir.mount)
3698         ) {
3699                 /*
3700                  * While traversing upwards if we encounter the root
3701                  * of the current mount we have to skip to the mount point
3702                  * in the underlying filesystem.
3703                  */
3704                 if (ncp == nch.mount->mnt_ncmountpt.ncp) {
3705                         nch = nch.mount->mnt_ncmounton;
3706                         _cache_drop(ncp);
3707                         ncp = nch.ncp;
3708                         if (ncp)
3709                                 _cache_hold(ncp);
3710                         continue;
3711                 }
3712
3713                 /*
3714                  * Prepend the path segment
3715                  */
3716                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
3717                         if (bp == buf) {
3718                                 numcwdfailsz++;
3719                                 *error = ERANGE;
3720                                 bp = NULL;
3721                                 goto done;
3722                         }
3723                         *--bp = ncp->nc_name[i];
3724                 }
3725                 if (bp == buf) {
3726                         numcwdfailsz++;
3727                         *error = ERANGE;
3728                         bp = NULL;
3729                         goto done;
3730                 }
3731                 *--bp = '/';
3732                 slash_prefixed = 1;
3733
3734                 /*
3735                  * Go up a directory.  This isn't a mount point so we don't
3736                  * have to check again.
3737                  */
3738                 while ((nch.ncp = ncp->nc_parent) != NULL) {
3739                         if (ncp_shared_lock_disable)
3740                                 _cache_lock(ncp);
3741                         else
3742                                 _cache_lock_shared(ncp);
3743                         if (nch.ncp != ncp->nc_parent) {
3744                                 _cache_unlock(ncp);
3745                                 continue;
3746                         }
3747                         _cache_hold(nch.ncp);
3748                         _cache_unlock(ncp);
3749                         break;
3750                 }
3751                 _cache_drop(ncp);
3752                 ncp = nch.ncp;
3753         }
3754         if (ncp == NULL) {
3755                 numcwdfailnf++;
3756                 *error = ENOENT;
3757                 bp = NULL;
3758                 goto done;
3759         }
3760         if (!slash_prefixed) {
3761                 if (bp == buf) {
3762                         numcwdfailsz++;
3763                         *error = ERANGE;
3764                         bp = NULL;
3765                         goto done;
3766                 }
3767                 *--bp = '/';
3768         }
3769         numcwdfound++;
3770         *error = 0;
3771 done:
3772         if (ncp)
3773                 _cache_drop(ncp);
3774         return (bp);
3775 }
3776
3777 /*
3778  * Thus begins the fullpath magic.
3779  *
3780  * The passed nchp is referenced but not locked.
3781  */
3782 static int disablefullpath;
3783 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
3784     &disablefullpath, 0,
3785     "Disable fullpath lookups");
3786
3787 static u_int numfullpathcalls;
3788 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathcalls, CTLFLAG_RD,
3789     &numfullpathcalls, 0,
3790     "Number of full path resolutions in progress");
3791 static u_int numfullpathfailnf;
3792 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailnf, CTLFLAG_RD,
3793     &numfullpathfailnf, 0,
3794     "Number of full path resolution failures due to lack of file");
3795 static u_int numfullpathfailsz;
3796 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfailsz, CTLFLAG_RD,
3797     &numfullpathfailsz, 0,
3798     "Number of full path resolution failures due to insufficient memory");
3799 static u_int numfullpathfound;
3800 SYSCTL_UINT(_vfs_cache, OID_AUTO, numfullpathfound, CTLFLAG_RD,
3801     &numfullpathfound, 0,
3802     "Number of full path resolution successes");
3803
3804 int
3805 cache_fullpath(struct proc *p, struct nchandle *nchp, struct nchandle *nchbase,
3806                char **retbuf, char **freebuf, int guess)
3807 {
3808         struct nchandle fd_nrdir;
3809         struct nchandle nch;
3810         struct namecache *ncp;
3811         struct mount *mp, *new_mp;
3812         char *bp, *buf;
3813         int slash_prefixed;
3814         int error = 0;
3815         int i;
3816
3817         atomic_add_int(&numfullpathcalls, -1);
3818
3819         *retbuf = NULL;
3820         *freebuf = NULL;
3821
3822         buf = kmalloc(MAXPATHLEN, M_TEMP, M_WAITOK);
3823         bp = buf + MAXPATHLEN - 1;
3824         *bp = '\0';
3825         if (nchbase)
3826                 fd_nrdir = *nchbase;
3827         else if (p != NULL)
3828                 fd_nrdir = p->p_fd->fd_nrdir;
3829         else
3830                 fd_nrdir = rootnch;
3831         slash_prefixed = 0;
3832         nch = *nchp;
3833         ncp = nch.ncp;
3834         if (ncp)
3835                 _cache_hold(ncp);
3836         mp = nch.mount;
3837
3838         while (ncp && (ncp != fd_nrdir.ncp || mp != fd_nrdir.mount)) {
3839                 new_mp = NULL;
3840
3841                 /*
3842                  * If we are asked to guess the upwards path, we do so whenever
3843                  * we encounter an ncp marked as a mountpoint. We try to find
3844                  * the actual mountpoint by finding the mountpoint with this
3845                  * ncp.
3846                  */
3847                 if (guess && (ncp->nc_flag & NCF_ISMOUNTPT)) {
3848                         new_mp = mount_get_by_nc(ncp);
3849                 }
3850                 /*
3851                  * While traversing upwards if we encounter the root
3852                  * of the current mount we have to skip to the mount point.
3853                  */
3854                 if (ncp == mp->mnt_ncmountpt.ncp) {
3855                         new_mp = mp;
3856                 }
3857                 if (new_mp) {
3858                         nch = new_mp->mnt_ncmounton;
3859                         _cache_drop(ncp);
3860                         ncp = nch.ncp;
3861                         if (ncp)
3862                                 _cache_hold(ncp);
3863                         mp = nch.mount;
3864                         continue;
3865                 }
3866
3867                 /*
3868                  * Prepend the path segment
3869                  */
3870                 for (i = ncp->nc_nlen - 1; i >= 0; i--) {
3871                         if (bp == buf) {
3872                                 numfullpathfailsz++;
3873                                 kfree(buf, M_TEMP);
3874                                 error = ENOMEM;
3875                                 goto done;
3876                         }
3877                         *--bp = ncp->nc_name[i];
3878                 }
3879                 if (bp == buf) {
3880                         numfullpathfailsz++;
3881                         kfree(buf, M_TEMP);
3882                         error = ENOMEM;
3883                         goto done;
3884                 }
3885                 *--bp = '/';
3886                 slash_prefixed = 1;
3887
3888                 /*
3889                  * Go up a directory.  This isn't a mount point so we don't
3890                  * have to check again.
3891                  *
3892                  * We can only safely access nc_parent with ncp held locked.
3893                  */
3894                 while ((nch.ncp = ncp->nc_parent) != NULL) {
3895                         _cache_lock(ncp);
3896                         if (nch.ncp != ncp->nc_parent) {
3897                                 _cache_unlock(ncp);
3898                                 continue;
3899                         }
3900                         _cache_hold(nch.ncp);
3901                         _cache_unlock(ncp);
3902                         break;
3903                 }
3904                 _cache_drop(ncp);
3905                 ncp = nch.ncp;
3906         }
3907         if (ncp == NULL) {
3908                 numfullpathfailnf++;
3909                 kfree(buf, M_TEMP);
3910                 error = ENOENT;
3911                 goto done;
3912         }
3913
3914         if (!slash_prefixed) {
3915                 if (bp == buf) {
3916                         numfullpathfailsz++;
3917                         kfree(buf, M_TEMP);
3918                         error = ENOMEM;
3919                         goto done;
3920                 }
3921                 *--bp = '/';
3922         }
3923         numfullpathfound++;
3924         *retbuf = bp;
3925         *freebuf = buf;
3926         error = 0;
3927 done:
3928         if (ncp)
3929                 _cache_drop(ncp);
3930         return(error);
3931 }
3932
3933 int
3934 vn_fullpath(struct proc *p, struct vnode *vn, char **retbuf, char **freebuf,
3935     int guess)
3936 {
3937         struct namecache *ncp;
3938         struct nchandle nch;
3939         int error;
3940
3941         *freebuf = NULL;
3942         atomic_add_int(&numfullpathcalls, 1);
3943         if (disablefullpath)
3944                 return (ENODEV);
3945
3946         if (p == NULL)
3947                 return (EINVAL);
3948
3949         /* vn is NULL, client wants us to use p->p_textvp */
3950         if (vn == NULL) {
3951                 if ((vn = p->p_textvp) == NULL)
3952                         return (EINVAL);
3953         }
3954         spin_lock(&vn->v_spin);
3955         TAILQ_FOREACH(ncp, &vn->v_namecache, nc_vnode) {
3956                 if (ncp->nc_nlen)
3957                         break;
3958         }
3959         if (ncp == NULL) {
3960                 spin_unlock(&vn->v_spin);
3961                 return (EINVAL);
3962         }
3963         _cache_hold(ncp);
3964         spin_unlock(&vn->v_spin);
3965
3966         atomic_add_int(&numfullpathcalls, -1);
3967         nch.ncp = ncp;
3968         nch.mount = vn->v_mount;
3969         error = cache_fullpath(p, &nch, NULL, retbuf, freebuf, guess);
3970         _cache_drop(ncp);
3971         return (error);
3972 }